Files
character-mcp/src/character_details/fetcher.py
Aodhan Collins d4d6788d26 Add 13 new Fandom wiki sources and integration tests
Expand franchise wiki mappings to cover Uma Musume, Fire Emblem,
Senran Kagura, Vocaloid, Dragon Ball, League of Legends, Street Fighter,
Sonic, Spy x Family, Zelda, The Witcher, Metroid, and Pokemon. Also
expand Final Fantasy aliases to cover all numbered titles I–XVI with
both arabic and roman numeral variants.

Adds parametrized integration tests that verify each wiki endpoint
returns valid CharacterData with a description and Fandom source URL.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 21:41:53 +00:00

431 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Fetches fictional character data from Fandom wikis and Wikipedia.
Strategy:
1. Try the franchise-specific Fandom wiki via its MediaWiki API (richer data)
2. Fall back to / supplement with Wikipedia
3. Parse sections into structured CharacterData fields
"""
import re
from datetime import datetime
import httpx
from bs4 import BeautifulSoup
from .models import CharacterData
# Wikipedia requires a descriptive user-agent with contact info
# (Wikimedia User-Agent policy). These headers are attached to the shared
# AsyncClient in fetch_character, so they are sent on every outbound
# request — Fandom as well as Wikipedia.
HEADERS = {
    "User-Agent": (
        "character-details-mcp/1.0 "
        "(https://github.com/example/character-details; contact@example.com)"
    )
}
# Map franchise keywords -> Fandom community subdomain.
#
# Final Fantasy I-XVI each take four aliases ("final fantasy vii",
# "final fantasy 7", "ffvii", "ff7").  Those 64 entries are generated from
# the roman-numeral table below instead of being hand-maintained, so adding
# a future numbered title is a one-token change.
_FF_ROMAN = (
    "i", "ii", "iii", "iv", "v", "vi", "vii", "viii",
    "ix", "x", "xi", "xii", "xiii", "xiv", "xv", "xvi",
)

FRANCHISE_WIKIS: dict[str, str] = {
    "final fantasy": "finalfantasy",
    # Generated aliases for the numbered Final Fantasy titles I-XVI.
    **{
        alias: "finalfantasy"
        for number, roman in enumerate(_FF_ROMAN, start=1)
        for alias in (
            f"final fantasy {roman}",
            f"final fantasy {number}",
            f"ff{roman}",
            f"ff{number}",
        )
    },
    "super mario": "mario",
    "mario": "mario",
    "little witch academia": "little-witch-academia",
    "lwa": "little-witch-academia",
    "uma musume": "umamusume",
    "umamusume": "umamusume",
    "uma musume pretty derby": "umamusume",
    "fire emblem": "fireemblem",
    "senran kagura": "senrankagura",
    "vocaloid": "vocaloid",
    "dragon ball": "dragonball",
    "dragon ball z": "dragonball",
    "dbz": "dragonball",
    "dragon ball super": "dragonball",
    "dbs": "dragonball",
    "league of legends": "leagueoflegends",
    "lol": "leagueoflegends",
    "street fighter": "streetfighter",
    "sonic": "sonic",
    "sonic the hedgehog": "sonic",
    "spy x family": "spy-x-family",
    "spy family": "spy-x-family",
    "spyxfamily": "spy-x-family",
    "zelda": "zelda",
    "the legend of zelda": "zelda",
    "legend of zelda": "zelda",
    "witcher": "witcher",
    "the witcher": "witcher",
    "metroid": "metroid",
    "pokemon": "pokemon",
    "pokémon": "pokemon",
}
# Section title keywords -> model field.  A wiki section heading is split
# into words by _classify_section and routed to the first set below that it
# intersects (the sets are checked in the order listed here); headings that
# match none of them end up in CharacterData.extra_sections via
# _build_character.
APPEARANCE_KW = {"appearance", "design", "outfit", "clothing", "physical"}
PERSONALITY_KW = {"personality", "traits", "behavior", "attitude", "nature"}
BACKGROUND_KW = {"background", "history", "biography", "story", "backstory", "past"}
ABILITIES_KW = {"abilities", "powers", "skills", "magic", "combat", "techniques", "weapons"}
RELATIONSHIPS_KW = {"relationships", "family", "friends", "allies", "enemies", "romance"}
QUOTES_KW = {"quotes", "sayings", "dialogue"}
def _strip_html(html: str) -> str:
    """Return the plain text of *html* with all markup removed."""
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text(separator=" ", strip=True)
def _classify_section(title: str) -> str | None:
    """Map a section heading to a CharacterData field name, or None.

    Matching is done on whole words so that e.g. "characteristics" does not
    accidentally match a shorter keyword by substring.
    """
    tokens = set(re.split(r"\W+", title.lower()))
    # Checked in priority order; first intersecting set wins.
    for keywords, field in (
        (APPEARANCE_KW, "appearance"),
        (PERSONALITY_KW, "personality"),
        (BACKGROUND_KW, "background"),
        (ABILITIES_KW, "abilities"),
        (RELATIONSHIPS_KW, "relationships"),
        (QUOTES_KW, "quotes"),
    ):
        if tokens & keywords:
            return field
    return None
def _find_wiki(franchise: str) -> str | None:
    """Return the Fandom subdomain for *franchise*, or None if unmapped.

    Lookup is case-insensitive and ignores surrounding whitespace.
    """
    key = franchise.strip().lower()
    return FRANCHISE_WIKIS.get(key)
async def fetch_character(name: str, franchise: str) -> CharacterData:
    """Fetch character from Fandom and/or Wikipedia, return structured data."""
    collected: dict[str, str] = {}
    source_urls: list[str] = []
    async with httpx.AsyncClient(timeout=15.0, follow_redirects=True, headers=HEADERS) as client:
        # 1. Franchise-specific Fandom wiki first (MediaWiki API) — the
        #    richest source of character-specific sections.
        if (wiki := _find_wiki(franchise)) and (fandom := await _fetch_fandom(client, name, wiki)):
            collected.update(fandom["sections"])
            source_urls.append(fandom["url"])
        # 2. Wikipedia: its description always replaces Fandom's (the Fandom
        #    lead tends to be infobox-polluted); other sections only fill
        #    gaps Fandom left.
        if wikipedia := await _fetch_wikipedia(client, name, franchise):
            for key, text in wikipedia["sections"].items():
                if key == "description" or key not in collected:
                    collected[key] = text
            source_urls.append(wikipedia["url"])
    return _build_character(name, franchise, collected, source_urls)
# ---------------------------------------------------------------------------
# Fandom (MediaWiki API — avoids Cloudflare-blocked /api/v1/)
# ---------------------------------------------------------------------------
async def _fetch_fandom(client: httpx.AsyncClient, name: str, wiki: str) -> dict | None:
    """Search the *wiki* Fandom community for *name* and scrape its sections.

    Returns {"sections": {key -> text}, "url": article URL}, where keys are
    "description" (the article lead, capped at 1500 chars) plus the
    lowercased heading of every section _classify_section recognizes
    (each capped at 3000 chars).  Returns None when either API request
    errors or returns non-200, or when the search finds nothing.
    """
    base = f"https://{wiki}.fandom.com"
    # Classic MediaWiki api.php endpoint — Fandom's /api/v1/ is
    # Cloudflare-blocked (see section header above).
    api = f"{base}/api.php"
    # Search for the article
    try:
        resp = await client.get(
            api,
            params={"action": "query", "list": "search", "srsearch": name,
                    "srlimit": 5, "format": "json"},
        )
    except httpx.HTTPError:
        return None
    if resp.status_code != 200:
        return None
    results = resp.json().get("query", {}).get("search", [])
    if not results:
        return None
    # Pick best match (exact name preferred)
    article = _best_search_match(name, results)
    pageid = article["pageid"]
    page_title = article["title"]
    # Get section list
    try:
        resp = await client.get(
            api,
            params={"action": "parse", "pageid": pageid, "prop": "sections", "format": "json"},
        )
    except httpx.HTTPError:
        return None
    if resp.status_code != 200:
        return None
    all_sections = resp.json().get("parse", {}).get("sections", [])
    # Always fetch lead (section 0) as description
    sections: dict[str, str] = {}
    lead_text = await _fetch_fandom_section(client, api, pageid, 0)
    if lead_text:
        sections["description"] = lead_text[:1500]
    # Fetch every section (any nesting level — nothing filters on toclevel)
    # whose heading classifies into a field of interest; one request per
    # section.
    for sec in all_sections:
        title = _strip_html(sec.get("line", ""))
        field = _classify_section(title)
        if field is None:
            continue
        index = sec.get("index", "")
        if not index:
            continue
        text = await _fetch_fandom_section(client, api, pageid, index)
        if text:
            # Keyed by the lowercased heading rather than the classified
            # field, so two distinct headings never overwrite each other;
            # _build_character re-classifies these keys later.
            key = title.lower()
            sections[key] = text[:3000]
    return {
        "sections": sections,
        "url": f"{base}/wiki/{page_title.replace(' ', '_')}",
    }
async def _fetch_fandom_section(
    client: httpx.AsyncClient, api: str, pageid: int, section: int | str
) -> str | None:
    """Fetch one parsed article section from Fandom as cleaned plain text.

    Returns None on any HTTP failure, when the API returns no HTML, or when
    fewer than ~20 characters of text survive the cleanup.
    """
    request_params = {"action": "parse", "pageid": pageid, "section": section,
                     "prop": "text", "format": "json"}
    try:
        response = await client.get(api, params=request_params)
    except httpx.HTTPError:
        return None
    if response.status_code != 200:
        return None
    raw_html = response.json().get("parse", {}).get("text", {}).get("*", "")
    if not raw_html:
        return None
    # Remove noisy elements: infoboxes, tables, ToC, headings, references,
    # and Fandom's "Quick Answers" / "AI Answers" widgets.
    noise_selector = (
        "table, aside, h1, h2, h3, h4, h5, h6, "
        ".navbox, .toc, #toc, .reference, sup, "
        ".mw-editsection, .portable-infobox, .infobox, "
        ".error, .cite-error, .mw-references-wrap, "
        ".trfc161, section[class^='trfc'], "
        ".fandom-community-question-answer, .qa-placeholder"
    )
    soup = BeautifulSoup(raw_html, "html.parser")
    for noisy_tag in soup.select(noise_selector):
        noisy_tag.decompose()
    cleaned = soup.get_text(separator=" ", strip=True)
    # Collapse runs of whitespace into single spaces.
    cleaned = re.sub(r"\s{2,}", " ", cleaned).strip()
    # Drop a stray leading date stamp Fandom sometimes injects
    # (e.g. "February 1, 2011 ").
    cleaned = re.sub(r"^\w+ \d{1,2},? \d{4}\s+", "", cleaned).strip()
    return cleaned if len(cleaned) > 20 else None
def _best_search_match(name: str, results: list[dict]) -> dict:
name_lower = name.lower()
for item in results:
if item["title"].lower() == name_lower:
return item
return results[0]
# ---------------------------------------------------------------------------
# Wikipedia
# ---------------------------------------------------------------------------
async def _fetch_wikipedia(client: httpx.AsyncClient, name: str, franchise: str) -> dict | None:
    """Search English Wikipedia for the character.

    Returns {"sections": {...}, "url": article URL} or None when the search
    request fails or finds nothing.  "sections" carries at most a
    "description" entry, and only when the chosen article's title shares at
    least one word with the character name.
    """
    query = f"{name} {franchise} character"
    try:
        search_resp = await client.get(
            "https://en.wikipedia.org/w/api.php",
            params={"action": "query", "list": "search", "srsearch": query,
                    "srlimit": 3, "format": "json"},
        )
    except httpx.HTTPError:
        return None
    if search_resp.status_code != 200:
        return None
    hits = search_resp.json().get("query", {}).get("search", [])
    if not hits:
        return None
    # Prefer the hit whose title shares the most words with the character name.
    name_words = set(name.lower().split())

    def _title_overlap(hit: dict) -> int:
        return len(name_words & set(hit["title"].lower().split()))

    top_hit = max(hits, key=_title_overlap)
    title = top_hit["title"]
    # Zero word overlap suggests we landed on a franchise article rather
    # than a dedicated character article.
    article_is_about_character = bool(name_words & set(title.lower().split()))
    sections: dict[str, str] = {}
    # The Extracts API yields a clean plain-text intro (no infobox cruft);
    # only trust it as the description when the article is about the character.
    if article_is_about_character:
        try:
            extract_resp = await client.get(
                "https://en.wikipedia.org/w/api.php",
                params={"action": "query", "titles": title, "prop": "extracts",
                        "exintro": True, "explaintext": True, "format": "json"},
            )
            if extract_resp.status_code == 200:
                pages = extract_resp.json().get("query", {}).get("pages", {})
                intro = next(iter(pages.values()), {}).get("extract", "").strip()
                if intro:
                    sections["description"] = re.sub(r"\s{2,}", " ", intro)[:1500]
        except httpx.HTTPError:
            pass  # best-effort: fall through with whatever we have
    return {
        "sections": sections,
        "url": f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}",
    }
# ---------------------------------------------------------------------------
# Build CharacterData from raw sections
# ---------------------------------------------------------------------------
def _build_character(
    name: str,
    franchise: str,
    sections: dict[str, str],
    sources: list[str],
) -> CharacterData:
    """Assemble a CharacterData from raw section texts.

    Prose fields (appearance / personality / background) concatenate every
    matching section with blank lines; list fields (abilities /
    relationships / quotes) are bullet- or sentence-split and capped; any
    unclassified section lands in extra_sections, truncated to 500 chars
    with at most 8 entries kept.
    """
    prose: dict[str, list[str]] = {
        "appearance": [],
        "personality": [],
        "background": [],
    }
    listed: dict[str, list[str]] = {
        "abilities": [],
        "relationships": [],
        "quotes": [],
    }
    extras: dict[str, str] = {}
    for heading, body in sections.items():
        if heading == "description":
            continue  # pulled out separately below
        bucket = _classify_section(heading)
        if bucket in prose:
            prose[bucket].append(body)
        elif bucket in listed:
            listed[bucket].extend(_extract_list_items(body))
        else:
            extras[heading] = body[:500]
    return CharacterData(
        name=name,
        franchise=franchise,
        description=sections.get("description", "").strip(),
        appearance="\n\n".join(prose["appearance"]).strip(),
        personality="\n\n".join(prose["personality"]).strip(),
        background="\n\n".join(prose["background"]).strip(),
        abilities=listed["abilities"][:20],
        relationships=listed["relationships"][:15],
        notable_quotes=listed["quotes"][:10],
        extra_sections=dict(list(extras.items())[:8]),
        sources=sources,
        # NOTE(review): naive local timestamp, matching the original;
        # confirm whether the model expects UTC before changing.
        cached_at=datetime.now(),
    )
def _extract_list_items(text: str) -> list[str]:
"""Extract bullet items or split prose into sentences if no bullet structure."""
lines = [l.strip() for l in text.splitlines() if l.strip()]
# Check if content is bullet-structured
bullet_lines = [l.lstrip("-•*").strip() for l in lines if re.match(r"^[-•*]", l)]
if len(bullet_lines) >= 2:
return [l for l in bullet_lines if len(l) > 5]
# Otherwise return short, sentence-split chunks (max 300 chars each)
sentences = re.split(r"(?<=[.!?])\s+", " ".join(lines))
items = [s.strip() for s in sentences if len(s.strip()) > 10]
return items[:15] # cap to avoid bloat