Initial commit: Character Details MCP Server
This commit is contained in:
343
src/character_details/fetcher.py
Normal file
343
src/character_details/fetcher.py
Normal file
@@ -0,0 +1,343 @@
|
||||
"""
|
||||
Fetches fictional character data from Fandom wikis and Wikipedia.
|
||||
|
||||
Strategy:
|
||||
1. Try the franchise-specific Fandom wiki via its MediaWiki API (richer data)
|
||||
2. Fall back to / supplement with Wikipedia
|
||||
3. Parse sections into structured CharacterData fields
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import datetime
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .models import CharacterData
|
||||
|
||||
# Wikipedia requires a descriptive user-agent with contact info
# (per the Wikimedia User-Agent policy); sent with every outbound request,
# including the Fandom ones, via the shared AsyncClient.
HEADERS = {
    "User-Agent": (
        "character-details-mcp/1.0 "
        "(https://github.com/example/character-details; contact@example.com)"
    )
}
|
||||
|
||||
# Map franchise keywords -> Fandom community subdomain
# (e.g. "finalfantasy" -> https://finalfantasy.fandom.com).
# Keys are matched case-insensitively after stripping whitespace; several
# aliases/abbreviations map to the same wiki.
FRANCHISE_WIKIS: dict[str, str] = {
    "final fantasy vii": "finalfantasy",
    "final fantasy 7": "finalfantasy",
    "ff7": "finalfantasy",
    "ffvii": "finalfantasy",
    "final fantasy": "finalfantasy",
    "super mario": "mario",
    "mario": "mario",
    "little witch academia": "little-witch-academia",
    "lwa": "little-witch-academia",
}
|
||||
|
||||
# Section title keywords -> model field
|
||||
APPEARANCE_KW = {"appearance", "design", "outfit", "clothing", "physical"}
|
||||
PERSONALITY_KW = {"personality", "traits", "behavior", "attitude", "nature"}
|
||||
BACKGROUND_KW = {"background", "history", "biography", "story", "backstory", "past"}
|
||||
ABILITIES_KW = {"abilities", "powers", "skills", "magic", "combat", "techniques", "weapons"}
|
||||
RELATIONSHIPS_KW = {"relationships", "family", "friends", "allies", "enemies", "romance"}
|
||||
QUOTES_KW = {"quotes", "sayings", "dialogue"}
|
||||
|
||||
|
||||
def _strip_html(html: str) -> str:
|
||||
return BeautifulSoup(html, "html.parser").get_text(separator=" ", strip=True)
|
||||
|
||||
|
||||
def _classify_section(title: str) -> str | None:
|
||||
# Split into words for accurate matching (avoids "characteristics" matching "character")
|
||||
words = set(re.split(r"\W+", title.lower()))
|
||||
if words & APPEARANCE_KW:
|
||||
return "appearance"
|
||||
if words & PERSONALITY_KW:
|
||||
return "personality"
|
||||
if words & BACKGROUND_KW:
|
||||
return "background"
|
||||
if words & ABILITIES_KW:
|
||||
return "abilities"
|
||||
if words & RELATIONSHIPS_KW:
|
||||
return "relationships"
|
||||
if words & QUOTES_KW:
|
||||
return "quotes"
|
||||
return None
|
||||
|
||||
|
||||
def _find_wiki(franchise: str) -> str | None:
    """Return the Fandom subdomain for *franchise*, or None if unmapped.

    Lookup is case-insensitive and ignores surrounding whitespace.
    """
    key = franchise.strip().lower()
    return FRANCHISE_WIKIS.get(key)
|
||||
|
||||
|
||||
async def fetch_character(name: str, franchise: str) -> CharacterData:
    """Fetch character from Fandom and/or Wikipedia, return structured data."""
    collected: dict[str, str] = {}
    source_urls: list[str] = []

    async with httpx.AsyncClient(timeout=15.0, follow_redirects=True, headers=HEADERS) as client:
        # 1. Franchise-specific Fandom wiki first (MediaWiki API) —
        #    richest source of character-specific sections.
        subdomain = _find_wiki(franchise)
        if subdomain:
            fandom = await _fetch_fandom(client, name, subdomain)
            if fandom:
                collected.update(fandom["sections"])
                source_urls.append(fandom["url"])

        # 2. Wikipedia — its description always replaces Fandom's
        #    (cleaner, no infobox cruft); other sections only fill gaps.
        wiki_data = await _fetch_wikipedia(client, name, franchise)
        if wiki_data:
            for key, text in wiki_data["sections"].items():
                if key == "description" or key not in collected:
                    collected[key] = text
            source_urls.append(wiki_data["url"])

    return _build_character(name, franchise, collected, source_urls)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fandom (MediaWiki API — avoids Cloudflare-blocked /api/v1/)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
async def _fetch_fandom(client: httpx.AsyncClient, name: str, wiki: str) -> dict | None:
    """Search the *wiki* Fandom community for *name* and pull useful sections.

    Returns ``{"sections": {key -> text}, "url": article_url}`` where keys are
    "description" (article lead, capped at 1500 chars) plus lowercased section
    titles that classify into a CharacterData field (capped at 3000 chars).
    Returns None on any network/API failure or when no article is found.
    """
    base = f"https://{wiki}.fandom.com"
    api = f"{base}/api.php"

    # Search for the article
    try:
        resp = await client.get(
            api,
            params={"action": "query", "list": "search", "srsearch": name,
                    "srlimit": 5, "format": "json"},
        )
    except httpx.HTTPError:
        return None

    if resp.status_code != 200:
        return None

    # Guard the decode: Fandom can serve a non-JSON (e.g. Cloudflare HTML)
    # body with a 200 status; json() would raise JSONDecodeError.
    try:
        results = resp.json().get("query", {}).get("search", [])
    except ValueError:
        return None
    if not results:
        return None

    # Pick best match (exact name preferred)
    article = _best_search_match(name, results)
    pageid = article["pageid"]
    page_title = article["title"]

    # Get section list
    try:
        resp = await client.get(
            api,
            params={"action": "parse", "pageid": pageid, "prop": "sections", "format": "json"},
        )
    except httpx.HTTPError:
        return None

    if resp.status_code != 200:
        return None

    try:
        all_sections = resp.json().get("parse", {}).get("sections", [])
    except ValueError:
        return None

    # Always fetch lead (section 0) as description
    sections: dict[str, str] = {}
    lead_text = await _fetch_fandom_section(client, api, pageid, 0)
    if lead_text:
        sections["description"] = lead_text[:1500]

    # Fetch every section (at any depth) whose title classifies into a
    # field of interest; others are skipped to save round-trips.
    for sec in all_sections:
        title = _strip_html(sec.get("line", ""))
        field = _classify_section(title)
        if field is None:
            continue
        index = sec.get("index", "")
        if not index:
            continue
        text = await _fetch_fandom_section(client, api, pageid, index)
        if text:
            key = title.lower()
            sections[key] = text[:3000]

    return {
        "sections": sections,
        "url": f"{base}/wiki/{page_title.replace(' ', '_')}",
    }
|
||||
|
||||
|
||||
async def _fetch_fandom_section(
    client: httpx.AsyncClient, api: str, pageid: int, section: int | str
) -> str | None:
    """Fetch one article section via the MediaWiki parse API as clean text.

    Returns the cleaned plain text, or None on network/API failure, empty
    content, or when the cleaned text is too short (<= 20 chars) to be useful.
    """
    try:
        resp = await client.get(
            api,
            params={"action": "parse", "pageid": pageid, "section": section,
                    "prop": "text", "format": "json"},
        )
    except httpx.HTTPError:
        return None

    if resp.status_code != 200:
        return None

    # Guard the decode: Fandom can serve a non-JSON body with a 200 status;
    # json() would raise JSONDecodeError.
    try:
        html = resp.json().get("parse", {}).get("text", {}).get("*", "")
    except ValueError:
        return None
    if not html:
        return None

    # Strip noisy elements: infoboxes, tables, ToC, headings, references, Fandom widgets
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup.select(
        "table, aside, h1, h2, h3, h4, h5, h6, "
        ".navbox, .toc, #toc, .reference, sup, "
        ".mw-editsection, .portable-infobox, .infobox, "
        ".error, .cite-error, .mw-references-wrap, "
        # Fandom "Quick Answers" / "AI Answers" widgets
        ".trfc161, section[class^='trfc'], "
        ".fandom-community-question-answer, .qa-placeholder"
    ):
        tag.decompose()

    text = soup.get_text(separator=" ", strip=True)
    # Collapse excess whitespace
    text = re.sub(r"\s{2,}", " ", text).strip()
    # Strip stray date stamps Fandom sometimes injects at the top (e.g. "February 1, 2011 ")
    text = re.sub(r"^\w+ \d{1,2},? \d{4}\s+", "", text).strip()
    return text if len(text) > 20 else None
|
||||
|
||||
|
||||
def _best_search_match(name: str, results: list[dict]) -> dict:
|
||||
name_lower = name.lower()
|
||||
for item in results:
|
||||
if item["title"].lower() == name_lower:
|
||||
return item
|
||||
return results[0]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Wikipedia
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
async def _fetch_wikipedia(client: httpx.AsyncClient, name: str, franchise: str) -> dict | None:
    """Search Wikipedia for the character; return description + article URL.

    Returns ``{"sections": {...}, "url": article_url}``; "sections" contains
    at most a "description" key (intro extract, capped at 1500 chars), and
    only when the matched article title shares a word with the character
    name — otherwise the hit is usually the franchise article, not the
    character, and its intro would be misleading. Returns None on network/API
    failure or when the search yields nothing.
    """
    search_query = f"{name} {franchise} character"

    try:
        resp = await client.get(
            "https://en.wikipedia.org/w/api.php",
            params={"action": "query", "list": "search", "srsearch": search_query,
                    "srlimit": 3, "format": "json"},
        )
    except httpx.HTTPError:
        return None

    if resp.status_code != 200:
        return None

    # Guard the decode: a non-JSON error body would make json() raise
    # JSONDecodeError; treat it like any other fetch failure.
    try:
        results = resp.json().get("query", {}).get("search", [])
    except ValueError:
        return None
    if not results:
        return None

    # Pick the result whose title best overlaps with the character name
    name_words = set(name.lower().split())
    best = max(results, key=lambda r: len(name_words & set(r["title"].lower().split())))
    title = best["title"]
    title_words = set(title.lower().split())
    article_is_about_character = bool(name_words & title_words)
    sections: dict[str, str] = {}

    # Extracts API — clean plain-text intro, no infobox cruft.
    # Only use as description if the Wikipedia article is actually about the
    # character (not about the franchise, which happens when no dedicated
    # character article exists).
    if article_is_about_character:
        try:
            resp = await client.get(
                "https://en.wikipedia.org/w/api.php",
                params={"action": "query", "titles": title, "prop": "extracts",
                        "exintro": True, "explaintext": True, "format": "json"},
            )
            if resp.status_code == 200:
                pages = resp.json().get("query", {}).get("pages", {})
                extract = next(iter(pages.values()), {}).get("extract", "").strip()
                if extract:
                    sections["description"] = re.sub(r"\s{2,}", " ", extract)[:1500]
        except (httpx.HTTPError, ValueError):
            # Best-effort: a failed extract just means no description;
            # we still return the article URL as a source.
            pass

    return {
        "sections": sections,
        "url": f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}",
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Build CharacterData from raw sections
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _build_character(
    name: str,
    franchise: str,
    sections: dict[str, str],
    sources: list[str],
) -> CharacterData:
    """Assemble a CharacterData from raw section texts.

    Sections are classified by title: prose fields are concatenated,
    list fields are split into items, and anything unclassified lands in
    extra_sections (truncated to 500 chars, at most 8 entries).
    """
    prose: dict[str, list[str]] = {
        "appearance": [],
        "personality": [],
        "background": [],
    }
    items: dict[str, list[str]] = {
        "abilities": [],
        "relationships": [],
        "quotes": [],
    }
    extra: dict[str, str] = {}

    description = sections.get("description", "")

    for title, text in sections.items():
        if title == "description":
            continue  # handled separately above
        field = _classify_section(title)
        if field in prose:
            prose[field].append(text)
        elif field in items:
            items[field].extend(_extract_list_items(text))
        else:
            extra[title] = text[:500]

    return CharacterData(
        name=name,
        franchise=franchise,
        description=description.strip(),
        appearance="\n\n".join(prose["appearance"]).strip(),
        personality="\n\n".join(prose["personality"]).strip(),
        background="\n\n".join(prose["background"]).strip(),
        abilities=items["abilities"][:20],
        relationships=items["relationships"][:15],
        notable_quotes=items["quotes"][:10],
        extra_sections=dict(list(extra.items())[:8]),
        sources=sources,
        cached_at=datetime.now(),
    )
|
||||
|
||||
|
||||
def _extract_list_items(text: str) -> list[str]:
|
||||
"""Extract bullet items or split prose into sentences if no bullet structure."""
|
||||
lines = [l.strip() for l in text.splitlines() if l.strip()]
|
||||
|
||||
# Check if content is bullet-structured
|
||||
bullet_lines = [l.lstrip("-•*–").strip() for l in lines if re.match(r"^[-•*–]", l)]
|
||||
if len(bullet_lines) >= 2:
|
||||
return [l for l in bullet_lines if len(l) > 5]
|
||||
|
||||
# Otherwise return short, sentence-split chunks (max 300 chars each)
|
||||
sentences = re.split(r"(?<=[.!?])\s+", " ".join(lines))
|
||||
items = [s.strip() for s in sentences if len(s.strip()) > 10]
|
||||
return items[:15] # cap to avoid bloat
|
||||
Reference in New Issue
Block a user