Initial commit: Character Details MCP Server
This commit is contained in:
343
src/character_details/fetcher.py
Normal file
343
src/character_details/fetcher.py
Normal file
@@ -0,0 +1,343 @@
|
||||
"""
|
||||
Fetches fictional character data from Fandom wikis and Wikipedia.
|
||||
|
||||
Strategy:
|
||||
1. Try the franchise-specific Fandom wiki via its MediaWiki API (richer data)
|
||||
2. Fall back to / supplement with Wikipedia
|
||||
3. Parse sections into structured CharacterData fields
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import datetime
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .models import CharacterData
|
||||
|
||||
# Wikipedia requires a descriptive user-agent with contact info
# (per the Wikimedia User-Agent policy); sent with every outbound request,
# including the Fandom ones, via the shared AsyncClient.
HEADERS = {
    "User-Agent": (
        "character-details-mcp/1.0 "
        "(https://github.com/example/character-details; contact@example.com)"
    )
}
|
||||
|
||||
# Map franchise keywords -> Fandom community subdomain
# (e.g. "finalfantasy" -> https://finalfantasy.fandom.com).
# Keys are matched case-insensitively after stripping whitespace; several
# aliases/abbreviations map to the same wiki.
FRANCHISE_WIKIS: dict[str, str] = {
    "final fantasy vii": "finalfantasy",
    "final fantasy 7": "finalfantasy",
    "ff7": "finalfantasy",
    "ffvii": "finalfantasy",
    "final fantasy": "finalfantasy",
    "super mario": "mario",
    "mario": "mario",
    "little witch academia": "little-witch-academia",
    "lwa": "little-witch-academia",
}
|
||||
|
||||
# Section title keywords -> model field
|
||||
APPEARANCE_KW = {"appearance", "design", "outfit", "clothing", "physical"}
|
||||
PERSONALITY_KW = {"personality", "traits", "behavior", "attitude", "nature"}
|
||||
BACKGROUND_KW = {"background", "history", "biography", "story", "backstory", "past"}
|
||||
ABILITIES_KW = {"abilities", "powers", "skills", "magic", "combat", "techniques", "weapons"}
|
||||
RELATIONSHIPS_KW = {"relationships", "family", "friends", "allies", "enemies", "romance"}
|
||||
QUOTES_KW = {"quotes", "sayings", "dialogue"}
|
||||
|
||||
|
||||
def _strip_html(html: str) -> str:
|
||||
return BeautifulSoup(html, "html.parser").get_text(separator=" ", strip=True)
|
||||
|
||||
|
||||
def _classify_section(title: str) -> str | None:
|
||||
# Split into words for accurate matching (avoids "characteristics" matching "character")
|
||||
words = set(re.split(r"\W+", title.lower()))
|
||||
if words & APPEARANCE_KW:
|
||||
return "appearance"
|
||||
if words & PERSONALITY_KW:
|
||||
return "personality"
|
||||
if words & BACKGROUND_KW:
|
||||
return "background"
|
||||
if words & ABILITIES_KW:
|
||||
return "abilities"
|
||||
if words & RELATIONSHIPS_KW:
|
||||
return "relationships"
|
||||
if words & QUOTES_KW:
|
||||
return "quotes"
|
||||
return None
|
||||
|
||||
|
||||
def _find_wiki(franchise: str) -> str | None:
    """Return the Fandom subdomain for *franchise*, or None if unmapped.

    Lookup is case-insensitive and ignores surrounding whitespace.
    """
    key = franchise.strip().lower()
    return FRANCHISE_WIKIS.get(key)
|
||||
|
||||
|
||||
async def fetch_character(name: str, franchise: str) -> CharacterData:
    """Fetch character from Fandom and/or Wikipedia, return structured data."""
    collected: dict[str, str] = {}
    source_urls: list[str] = []

    async with httpx.AsyncClient(timeout=15.0, follow_redirects=True, headers=HEADERS) as client:
        # 1. Franchise-specific Fandom wiki first (MediaWiki API) —
        #    richest source of character-specific sections.
        subdomain = _find_wiki(franchise)
        if subdomain:
            fandom = await _fetch_fandom(client, name, subdomain)
            if fandom:
                collected.update(fandom["sections"])
                source_urls.append(fandom["url"])

        # 2. Wikipedia — its description always replaces Fandom's
        #    (cleaner, no infobox cruft); other sections only fill gaps.
        wiki_data = await _fetch_wikipedia(client, name, franchise)
        if wiki_data:
            for key, text in wiki_data["sections"].items():
                if key == "description" or key not in collected:
                    collected[key] = text
            source_urls.append(wiki_data["url"])

    return _build_character(name, franchise, collected, source_urls)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fandom (MediaWiki API — avoids Cloudflare-blocked /api/v1/)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
async def _fetch_fandom(client: httpx.AsyncClient, name: str, wiki: str) -> dict | None:
    """Search the *wiki* Fandom community for *name* and pull useful sections.

    Returns ``{"sections": {key -> text}, "url": article_url}`` where keys are
    "description" (article lead, capped at 1500 chars) plus lowercased section
    titles that classify into a CharacterData field (capped at 3000 chars).
    Returns None on any network/API failure or when no article is found.
    """
    base = f"https://{wiki}.fandom.com"
    api = f"{base}/api.php"

    # Search for the article
    try:
        resp = await client.get(
            api,
            params={"action": "query", "list": "search", "srsearch": name,
                    "srlimit": 5, "format": "json"},
        )
    except httpx.HTTPError:
        return None

    if resp.status_code != 200:
        return None

    # Guard the decode: Fandom can serve a non-JSON (e.g. Cloudflare HTML)
    # body with a 200 status; json() would raise JSONDecodeError.
    try:
        results = resp.json().get("query", {}).get("search", [])
    except ValueError:
        return None
    if not results:
        return None

    # Pick best match (exact name preferred)
    article = _best_search_match(name, results)
    pageid = article["pageid"]
    page_title = article["title"]

    # Get section list
    try:
        resp = await client.get(
            api,
            params={"action": "parse", "pageid": pageid, "prop": "sections", "format": "json"},
        )
    except httpx.HTTPError:
        return None

    if resp.status_code != 200:
        return None

    try:
        all_sections = resp.json().get("parse", {}).get("sections", [])
    except ValueError:
        return None

    # Always fetch lead (section 0) as description
    sections: dict[str, str] = {}
    lead_text = await _fetch_fandom_section(client, api, pageid, 0)
    if lead_text:
        sections["description"] = lead_text[:1500]

    # Fetch every section (at any depth) whose title classifies into a
    # field of interest; others are skipped to save round-trips.
    for sec in all_sections:
        title = _strip_html(sec.get("line", ""))
        field = _classify_section(title)
        if field is None:
            continue
        index = sec.get("index", "")
        if not index:
            continue
        text = await _fetch_fandom_section(client, api, pageid, index)
        if text:
            key = title.lower()
            sections[key] = text[:3000]

    return {
        "sections": sections,
        "url": f"{base}/wiki/{page_title.replace(' ', '_')}",
    }
|
||||
|
||||
|
||||
async def _fetch_fandom_section(
    client: httpx.AsyncClient, api: str, pageid: int, section: int | str
) -> str | None:
    """Fetch one article section via the MediaWiki parse API as clean text.

    Returns the cleaned plain text, or None on network/API failure, empty
    content, or when the cleaned text is too short (<= 20 chars) to be useful.
    """
    try:
        resp = await client.get(
            api,
            params={"action": "parse", "pageid": pageid, "section": section,
                    "prop": "text", "format": "json"},
        )
    except httpx.HTTPError:
        return None

    if resp.status_code != 200:
        return None

    # Guard the decode: Fandom can serve a non-JSON body with a 200 status;
    # json() would raise JSONDecodeError.
    try:
        html = resp.json().get("parse", {}).get("text", {}).get("*", "")
    except ValueError:
        return None
    if not html:
        return None

    # Strip noisy elements: infoboxes, tables, ToC, headings, references, Fandom widgets
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup.select(
        "table, aside, h1, h2, h3, h4, h5, h6, "
        ".navbox, .toc, #toc, .reference, sup, "
        ".mw-editsection, .portable-infobox, .infobox, "
        ".error, .cite-error, .mw-references-wrap, "
        # Fandom "Quick Answers" / "AI Answers" widgets
        ".trfc161, section[class^='trfc'], "
        ".fandom-community-question-answer, .qa-placeholder"
    ):
        tag.decompose()

    text = soup.get_text(separator=" ", strip=True)
    # Collapse excess whitespace
    text = re.sub(r"\s{2,}", " ", text).strip()
    # Strip stray date stamps Fandom sometimes injects at the top (e.g. "February 1, 2011 ")
    text = re.sub(r"^\w+ \d{1,2},? \d{4}\s+", "", text).strip()
    return text if len(text) > 20 else None
|
||||
|
||||
|
||||
def _best_search_match(name: str, results: list[dict]) -> dict:
|
||||
name_lower = name.lower()
|
||||
for item in results:
|
||||
if item["title"].lower() == name_lower:
|
||||
return item
|
||||
return results[0]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Wikipedia
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
async def _fetch_wikipedia(client: httpx.AsyncClient, name: str, franchise: str) -> dict | None:
    """Search Wikipedia for the character; return description + article URL.

    Returns ``{"sections": {...}, "url": article_url}``; "sections" contains
    at most a "description" key (intro extract, capped at 1500 chars), and
    only when the matched article title shares a word with the character
    name — otherwise the hit is usually the franchise article, not the
    character, and its intro would be misleading. Returns None on network/API
    failure or when the search yields nothing.
    """
    search_query = f"{name} {franchise} character"

    try:
        resp = await client.get(
            "https://en.wikipedia.org/w/api.php",
            params={"action": "query", "list": "search", "srsearch": search_query,
                    "srlimit": 3, "format": "json"},
        )
    except httpx.HTTPError:
        return None

    if resp.status_code != 200:
        return None

    # Guard the decode: a non-JSON error body would make json() raise
    # JSONDecodeError; treat it like any other fetch failure.
    try:
        results = resp.json().get("query", {}).get("search", [])
    except ValueError:
        return None
    if not results:
        return None

    # Pick the result whose title best overlaps with the character name
    name_words = set(name.lower().split())
    best = max(results, key=lambda r: len(name_words & set(r["title"].lower().split())))
    title = best["title"]
    title_words = set(title.lower().split())
    article_is_about_character = bool(name_words & title_words)
    sections: dict[str, str] = {}

    # Extracts API — clean plain-text intro, no infobox cruft.
    # Only use as description if the Wikipedia article is actually about the
    # character (not about the franchise, which happens when no dedicated
    # character article exists).
    if article_is_about_character:
        try:
            resp = await client.get(
                "https://en.wikipedia.org/w/api.php",
                params={"action": "query", "titles": title, "prop": "extracts",
                        "exintro": True, "explaintext": True, "format": "json"},
            )
            if resp.status_code == 200:
                pages = resp.json().get("query", {}).get("pages", {})
                extract = next(iter(pages.values()), {}).get("extract", "").strip()
                if extract:
                    sections["description"] = re.sub(r"\s{2,}", " ", extract)[:1500]
        except (httpx.HTTPError, ValueError):
            # Best-effort: a failed extract just means no description;
            # we still return the article URL as a source.
            pass

    return {
        "sections": sections,
        "url": f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}",
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Build CharacterData from raw sections
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _build_character(
    name: str,
    franchise: str,
    sections: dict[str, str],
    sources: list[str],
) -> CharacterData:
    """Assemble a CharacterData from raw section texts.

    Sections are classified by title: prose fields are concatenated,
    list fields are split into items, and anything unclassified lands in
    extra_sections (truncated to 500 chars, at most 8 entries).
    """
    prose: dict[str, list[str]] = {
        "appearance": [],
        "personality": [],
        "background": [],
    }
    items: dict[str, list[str]] = {
        "abilities": [],
        "relationships": [],
        "quotes": [],
    }
    extra: dict[str, str] = {}

    description = sections.get("description", "")

    for title, text in sections.items():
        if title == "description":
            continue  # handled separately above
        field = _classify_section(title)
        if field in prose:
            prose[field].append(text)
        elif field in items:
            items[field].extend(_extract_list_items(text))
        else:
            extra[title] = text[:500]

    return CharacterData(
        name=name,
        franchise=franchise,
        description=description.strip(),
        appearance="\n\n".join(prose["appearance"]).strip(),
        personality="\n\n".join(prose["personality"]).strip(),
        background="\n\n".join(prose["background"]).strip(),
        abilities=items["abilities"][:20],
        relationships=items["relationships"][:15],
        notable_quotes=items["quotes"][:10],
        extra_sections=dict(list(extra.items())[:8]),
        sources=sources,
        cached_at=datetime.now(),
    )
|
||||
|
||||
|
||||
def _extract_list_items(text: str) -> list[str]:
|
||||
"""Extract bullet items or split prose into sentences if no bullet structure."""
|
||||
lines = [l.strip() for l in text.splitlines() if l.strip()]
|
||||
|
||||
# Check if content is bullet-structured
|
||||
bullet_lines = [l.lstrip("-•*–").strip() for l in lines if re.match(r"^[-•*–]", l)]
|
||||
if len(bullet_lines) >= 2:
|
||||
return [l for l in bullet_lines if len(l) > 5]
|
||||
|
||||
# Otherwise return short, sentence-split chunks (max 300 chars each)
|
||||
sentences = re.split(r"(?<=[.!?])\s+", " ".join(lines))
|
||||
items = [s.strip() for s in sentences if len(s.strip()) > 10]
|
||||
return items[:15] # cap to avoid bloat
|
||||
Reference in New Issue
Block a user