""" Fetches fictional character data from Fandom wikis and Wikipedia. Strategy: 1. Try the franchise-specific Fandom wiki via its MediaWiki API (richer data) 2. Fall back to / supplement with Wikipedia 3. Parse sections into structured CharacterData fields """ import re from datetime import datetime import httpx from bs4 import BeautifulSoup from .models import CharacterData # Wikipedia requires a descriptive user-agent with contact info HEADERS = { "User-Agent": ( "character-details-mcp/1.0 " "(https://github.com/example/character-details; contact@example.com)" ) } # Map franchise keywords -> Fandom community subdomain FRANCHISE_WIKIS: dict[str, str] = { "final fantasy": "finalfantasy", "final fantasy i": "finalfantasy", "final fantasy 1": "finalfantasy", "ffi": "finalfantasy", "ff1": "finalfantasy", "final fantasy ii": "finalfantasy", "final fantasy 2": "finalfantasy", "ffii": "finalfantasy", "ff2": "finalfantasy", "final fantasy iii": "finalfantasy", "final fantasy 3": "finalfantasy", "ffiii": "finalfantasy", "ff3": "finalfantasy", "final fantasy iv": "finalfantasy", "final fantasy 4": "finalfantasy", "ffiv": "finalfantasy", "ff4": "finalfantasy", "final fantasy v": "finalfantasy", "final fantasy 5": "finalfantasy", "ffv": "finalfantasy", "ff5": "finalfantasy", "final fantasy vi": "finalfantasy", "final fantasy 6": "finalfantasy", "ffvi": "finalfantasy", "ff6": "finalfantasy", "final fantasy vii": "finalfantasy", "final fantasy 7": "finalfantasy", "ffvii": "finalfantasy", "ff7": "finalfantasy", "final fantasy viii": "finalfantasy", "final fantasy 8": "finalfantasy", "ffviii": "finalfantasy", "ff8": "finalfantasy", "final fantasy ix": "finalfantasy", "final fantasy 9": "finalfantasy", "ffix": "finalfantasy", "ff9": "finalfantasy", "final fantasy x": "finalfantasy", "final fantasy 10": "finalfantasy", "ffx": "finalfantasy", "ff10": "finalfantasy", "final fantasy xi": "finalfantasy", "final fantasy 11": "finalfantasy", "ffxi": "finalfantasy", "ff11": "finalfantasy", "final fantasy xii": "finalfantasy", "final fantasy 12": "finalfantasy", "ffxii": "finalfantasy", "ff12": "finalfantasy", "final fantasy xiii": "finalfantasy", "final fantasy 13": "finalfantasy", "ffxiii": "finalfantasy", "ff13": "finalfantasy", "final fantasy xiv": "finalfantasy", "final fantasy 14": "finalfantasy", "ffxiv": "finalfantasy", "ff14": "finalfantasy", "final fantasy xv": "finalfantasy", "final fantasy 15": "finalfantasy", "ffxv": "finalfantasy", "ff15": "finalfantasy", "final fantasy xvi": "finalfantasy", "final fantasy 16": "finalfantasy", "ffxvi": "finalfantasy", "ff16": "finalfantasy", "super mario": "mario", "mario": "mario", "little witch academia": "little-witch-academia", "lwa": "little-witch-academia", "uma musume": "umamusume", "umamusume": "umamusume", "uma musume pretty derby": "umamusume", "fire emblem": "fireemblem", "senran kagura": "senrankagura", "vocaloid": "vocaloid", "dragon ball": "dragonball", "dragon ball z": "dragonball", "dbz": "dragonball", "dragon ball super": "dragonball", "dbs": "dragonball", "league of legends": "leagueoflegends", "lol": "leagueoflegends", "street fighter": "streetfighter", "sonic": "sonic", "sonic the hedgehog": "sonic", "spy x family": "spy-x-family", "spy family": "spy-x-family", "spyxfamily": "spy-x-family", "zelda": "zelda", "the legend of zelda": "zelda", "legend of zelda": "zelda", "witcher": "witcher", "the witcher": "witcher", "metroid": "metroid", "pokemon": "pokemon", "pokémon": "pokemon", } # Section title keywords -> model field APPEARANCE_KW = {"appearance", "design", "outfit", "clothing", "physical"} PERSONALITY_KW = {"personality", "traits", "behavior", "attitude", "nature"} BACKGROUND_KW = {"background", "history", "biography", "story", "backstory", "past"} ABILITIES_KW = {"abilities", "powers", "skills", "magic", "combat", "techniques", "weapons"} RELATIONSHIPS_KW = {"relationships", "family", "friends", "allies", "enemies", "romance"} QUOTES_KW = {"quotes", "sayings", "dialogue"} def _strip_html(html: str) -> str: return BeautifulSoup(html, "html.parser").get_text(separator=" ", strip=True) def _classify_section(title: str) -> str | None: # Split into words for accurate matching (avoids "characteristics" matching "character") words = set(re.split(r"\W+", title.lower())) if words & APPEARANCE_KW: return "appearance" if words & PERSONALITY_KW: return "personality" if words & BACKGROUND_KW: return "background" if words & ABILITIES_KW: return "abilities" if words & RELATIONSHIPS_KW: return "relationships" if words & QUOTES_KW: return "quotes" return None def _find_wiki(franchise: str) -> str | None: return FRANCHISE_WIKIS.get(franchise.lower().strip()) async def fetch_character(name: str, franchise: str) -> CharacterData: """Fetch character from Fandom and/or Wikipedia, return structured data.""" sections: dict[str, str] = {} sources: list[str] = [] async with httpx.AsyncClient(timeout=15.0, follow_redirects=True, headers=HEADERS) as client: # 1. Try Fandom wiki (MediaWiki API) — rich character-specific sections wiki = _find_wiki(franchise) if wiki: fandom = await _fetch_fandom(client, name, wiki) if fandom: sections.update(fandom["sections"]) sources.append(fandom["url"]) # 2. Wikipedia — always prefer its description (cleaner, no infobox cruft) # and supplement any sections Fandom didn't provide wiki_data = await _fetch_wikipedia(client, name, franchise) if wiki_data: for k, v in wiki_data["sections"].items(): # Description: Wikipedia always wins (Fandom lead is infobox-polluted) if k == "description" or k not in sections: sections[k] = v sources.append(wiki_data["url"]) return _build_character(name, franchise, sections, sources) # --------------------------------------------------------------------------- # Fandom (MediaWiki API — avoids Cloudflare-blocked /api/v1/) # --------------------------------------------------------------------------- async def _fetch_fandom(client: httpx.AsyncClient, name: str, wiki: str) -> dict | None: base = f"https://{wiki}.fandom.com" api = f"{base}/api.php" # Search for the article try: resp = await client.get( api, params={"action": "query", "list": "search", "srsearch": name, "srlimit": 5, "format": "json"}, ) except httpx.HTTPError: return None if resp.status_code != 200: return None results = resp.json().get("query", {}).get("search", []) if not results: return None # Pick best match (exact name preferred) article = _best_search_match(name, results) pageid = article["pageid"] page_title = article["title"] # Get section list try: resp = await client.get( api, params={"action": "parse", "pageid": pageid, "prop": "sections", "format": "json"}, ) except httpx.HTTPError: return None if resp.status_code != 200: return None all_sections = resp.json().get("parse", {}).get("sections", []) # Always fetch lead (section 0) as description sections: dict[str, str] = {} lead_text = await _fetch_fandom_section(client, api, pageid, 0) if lead_text: sections["description"] = lead_text[:1500] # Fetch sections that match our fields of interest # Only fetch top-level sections (toclevel == 1) we care about for sec in all_sections: title = _strip_html(sec.get("line", "")) field = _classify_section(title) if field is None: continue index = sec.get("index", "") if not index: continue text = await _fetch_fandom_section(client, api, pageid, index) if text: key = title.lower() sections[key] = text[:3000] return { "sections": sections, "url": f"{base}/wiki/{page_title.replace(' ', '_')}", } async def _fetch_fandom_section( client: httpx.AsyncClient, api: str, pageid: int, section: int | str ) -> str | None: try: resp = await client.get( api, params={"action": "parse", "pageid": pageid, "section": section, "prop": "text", "format": "json"}, ) except httpx.HTTPError: return None if resp.status_code != 200: return None html = resp.json().get("parse", {}).get("text", {}).get("*", "") if not html: return None # Strip noisy elements: infoboxes, tables, ToC, headings, references, Fandom widgets soup = BeautifulSoup(html, "html.parser") for tag in soup.select( "table, aside, h1, h2, h3, h4, h5, h6, " ".navbox, .toc, #toc, .reference, sup, " ".mw-editsection, .portable-infobox, .infobox, " ".error, .cite-error, .mw-references-wrap, " # Fandom "Quick Answers" / "AI Answers" widgets ".trfc161, section[class^='trfc'], " ".fandom-community-question-answer, .qa-placeholder" ): tag.decompose() text = soup.get_text(separator=" ", strip=True) # Collapse excess whitespace text = re.sub(r"\s{2,}", " ", text).strip() # Strip stray date stamps Fandom sometimes injects at the top (e.g. "February 1, 2011 ") text = re.sub(r"^\w+ \d{1,2},? \d{4}\s+", "", text).strip() return text if len(text) > 20 else None def _best_search_match(name: str, results: list[dict]) -> dict: name_lower = name.lower() for item in results: if item["title"].lower() == name_lower: return item return results[0] # --------------------------------------------------------------------------- # Wikipedia # --------------------------------------------------------------------------- async def _fetch_wikipedia(client: httpx.AsyncClient, name: str, franchise: str) -> dict | None: search_query = f"{name} {franchise} character" try: resp = await client.get( "https://en.wikipedia.org/w/api.php", params={"action": "query", "list": "search", "srsearch": search_query, "srlimit": 3, "format": "json"}, ) except httpx.HTTPError: return None if resp.status_code != 200: return None results = resp.json().get("query", {}).get("search", []) if not results: return None # Pick the result whose title best overlaps with the character name name_words = set(name.lower().split()) best = max(results, key=lambda r: len(name_words & set(r["title"].lower().split()))) title = best["title"] title_words = set(title.lower().split()) article_is_about_character = bool(name_words & title_words) sections: dict[str, str] = {} # Extracts API — clean plain-text intro, no infobox cruft # Only use as description if the Wikipedia article is actually about the character # (not about the franchise, which happens when no dedicated character article exists) if article_is_about_character: try: resp = await client.get( "https://en.wikipedia.org/w/api.php", params={"action": "query", "titles": title, "prop": "extracts", "exintro": True, "explaintext": True, "format": "json"}, ) if resp.status_code == 200: pages = resp.json().get("query", {}).get("pages", {}) extract = next(iter(pages.values()), {}).get("extract", "").strip() if extract: sections["description"] = re.sub(r"\s{2,}", " ", extract)[:1500] except httpx.HTTPError: pass return { "sections": sections, "url": f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}", } # --------------------------------------------------------------------------- # Build CharacterData from raw sections # --------------------------------------------------------------------------- def _build_character( name: str, franchise: str, sections: dict[str, str], sources: list[str], ) -> CharacterData: appearance_parts: list[str] = [] personality_parts: list[str] = [] background_parts: list[str] = [] abilities: list[str] = [] relationships: list[str] = [] quotes: list[str] = [] extra: dict[str, str] = {} description = sections.get("description", "") for title, text in sections.items(): if title == "description": continue field = _classify_section(title) if field == "appearance": appearance_parts.append(text) elif field == "personality": personality_parts.append(text) elif field == "background": background_parts.append(text) elif field == "abilities": abilities.extend(_extract_list_items(text)) elif field == "relationships": relationships.extend(_extract_list_items(text)) elif field == "quotes": quotes.extend(_extract_list_items(text)) else: extra[title] = text[:500] return CharacterData( name=name, franchise=franchise, description=description.strip(), appearance="\n\n".join(appearance_parts).strip(), personality="\n\n".join(personality_parts).strip(), background="\n\n".join(background_parts).strip(), abilities=abilities[:20], relationships=relationships[:15], notable_quotes=quotes[:10], extra_sections={k: v for k, v in list(extra.items())[:8]}, sources=sources, cached_at=datetime.now(), ) def _extract_list_items(text: str) -> list[str]: """Extract bullet items or split prose into sentences if no bullet structure.""" lines = [l.strip() for l in text.splitlines() if l.strip()] # Check if content is bullet-structured bullet_lines = [l.lstrip("-•*–").strip() for l in lines if re.match(r"^[-•*–]", l)] if len(bullet_lines) >= 2: return [l for l in bullet_lines if len(l) > 5] # Otherwise return short, sentence-split chunks (max 300 chars each) sentences = re.split(r"(?<=[.!?])\s+", " ".join(lines)) items = [s.strip() for s in sentences if len(s.strip()) > 10] return items[:15] # cap to avoid bloat