# Expand franchise wiki mappings to cover Uma Musume, Fire Emblem, Senran
# Kagura, Vocaloid, Dragon Ball, League of Legends, Street Fighter, Sonic,
# Spy x Family, Zelda, The Witcher, Metroid, and Pokemon. Final Fantasy
# aliases cover all numbered titles I-XVI in both arabic and roman numeral
# variants.
"""
|
||
Fetches fictional character data from Fandom wikis and Wikipedia.
|
||
|
||
Strategy:
|
||
1. Try the franchise-specific Fandom wiki via its MediaWiki API (richer data)
|
||
2. Fall back to / supplement with Wikipedia
|
||
3. Parse sections into structured CharacterData fields
|
||
"""
|
||
|
||
import re
from datetime import datetime

import httpx
from bs4 import BeautifulSoup

from .models import CharacterData
# Wikipedia's API etiquette asks clients to send a descriptive User-Agent
# that includes contact information; Fandom accepts the same header.
HEADERS: dict[str, str] = {
    "User-Agent": (
        "character-details-mcp/1.0 "
        "(https://github.com/example/character-details; contact@example.com)"
    )
}
# Map franchise keywords -> Fandom community subdomain.
# Looked up only via .get() in _find_wiki, so insertion order is irrelevant.
FRANCHISE_WIKIS: dict[str, str] = {
    "final fantasy": "finalfantasy",
    "super mario": "mario",
    "mario": "mario",
    "little witch academia": "little-witch-academia",
    "lwa": "little-witch-academia",
    "uma musume": "umamusume",
    "umamusume": "umamusume",
    "uma musume pretty derby": "umamusume",
    "fire emblem": "fireemblem",
    "senran kagura": "senrankagura",
    "vocaloid": "vocaloid",
    "dragon ball": "dragonball",
    "dragon ball z": "dragonball",
    "dbz": "dragonball",
    "dragon ball super": "dragonball",
    "dbs": "dragonball",
    "league of legends": "leagueoflegends",
    "lol": "leagueoflegends",
    "street fighter": "streetfighter",
    "sonic": "sonic",
    "sonic the hedgehog": "sonic",
    "spy x family": "spy-x-family",
    "spy family": "spy-x-family",
    "spyxfamily": "spy-x-family",
    "zelda": "zelda",
    "the legend of zelda": "zelda",
    "legend of zelda": "zelda",
    "witcher": "witcher",
    "the witcher": "witcher",
    "metroid": "metroid",
    "pokemon": "pokemon",
    "pokémon": "pokemon",
}

# Numbered Final Fantasy titles I-XVI: register the long form and the "ff"
# abbreviation, each in both roman- and arabic-numeral spellings
# ("final fantasy vii", "final fantasy 7", "ffvii", "ff7"), all pointing at
# the single shared community wiki.
_FF_ROMAN = (
    "i", "ii", "iii", "iv", "v", "vi", "vii", "viii",
    "ix", "x", "xi", "xii", "xiii", "xiv", "xv", "xvi",
)
for _num, _roman in enumerate(_FF_ROMAN, start=1):
    for _suffix in (_roman, str(_num)):
        FRANCHISE_WIKIS[f"final fantasy {_suffix}"] = "finalfantasy"
        FRANCHISE_WIKIS[f"ff{_suffix}"] = "finalfantasy"
# Section title keywords -> model field
|
||
APPEARANCE_KW = {"appearance", "design", "outfit", "clothing", "physical"}
|
||
PERSONALITY_KW = {"personality", "traits", "behavior", "attitude", "nature"}
|
||
BACKGROUND_KW = {"background", "history", "biography", "story", "backstory", "past"}
|
||
ABILITIES_KW = {"abilities", "powers", "skills", "magic", "combat", "techniques", "weapons"}
|
||
RELATIONSHIPS_KW = {"relationships", "family", "friends", "allies", "enemies", "romance"}
|
||
QUOTES_KW = {"quotes", "sayings", "dialogue"}
|
||
|
||
|
||
def _strip_html(html: str) -> str:
|
||
return BeautifulSoup(html, "html.parser").get_text(separator=" ", strip=True)
|
||
|
||
|
||
def _classify_section(title: str) -> str | None:
|
||
# Split into words for accurate matching (avoids "characteristics" matching "character")
|
||
words = set(re.split(r"\W+", title.lower()))
|
||
if words & APPEARANCE_KW:
|
||
return "appearance"
|
||
if words & PERSONALITY_KW:
|
||
return "personality"
|
||
if words & BACKGROUND_KW:
|
||
return "background"
|
||
if words & ABILITIES_KW:
|
||
return "abilities"
|
||
if words & RELATIONSHIPS_KW:
|
||
return "relationships"
|
||
if words & QUOTES_KW:
|
||
return "quotes"
|
||
return None
|
||
|
||
|
||
def _find_wiki(franchise: str) -> str | None:
    """Return the Fandom subdomain for a franchise name, if we know one."""
    key = franchise.strip().lower()
    return FRANCHISE_WIKIS.get(key)
async def fetch_character(name: str, franchise: str) -> CharacterData:
    """Fetch character from Fandom and/or Wikipedia, return structured data."""
    merged: dict[str, str] = {}
    source_urls: list[str] = []

    async with httpx.AsyncClient(
        timeout=15.0, follow_redirects=True, headers=HEADERS
    ) as client:
        # 1. Franchise-specific Fandom wiki first (MediaWiki API): richest
        #    character-specific sections.
        subdomain = _find_wiki(franchise)
        fandom = await _fetch_fandom(client, name, subdomain) if subdomain else None
        if fandom:
            merged.update(fandom["sections"])
            source_urls.append(fandom["url"])

        # 2. Wikipedia: its intro always replaces the Fandom description
        #    (the Fandom lead tends to carry infobox noise); any other
        #    section only fills gaps Fandom left.
        wp = await _fetch_wikipedia(client, name, franchise)
        if wp:
            for key, text in wp["sections"].items():
                if key == "description" or key not in merged:
                    merged[key] = text
            source_urls.append(wp["url"])

    return _build_character(name, franchise, merged, source_urls)
# ---------------------------------------------------------------------------
# Fandom (MediaWiki API — avoids Cloudflare-blocked /api/v1/)
# ---------------------------------------------------------------------------

async def _fetch_fandom(client: httpx.AsyncClient, name: str, wiki: str) -> dict | None:
    """Scrape one Fandom community for *name*.

    Returns {"sections": {heading: text}, "url": article_url}, or None when
    the wiki is unreachable or has no matching article.
    """
    base = f"https://{wiki}.fandom.com"
    api = f"{base}/api.php"

    # Locate the article via full-text search.
    try:
        resp = await client.get(
            api,
            params={"action": "query", "list": "search", "srsearch": name,
                    "srlimit": 5, "format": "json"},
        )
    except httpx.HTTPError:
        return None
    if resp.status_code != 200:
        return None

    hits = resp.json().get("query", {}).get("search", [])
    if not hits:
        return None

    # An exact title match beats search ranking.
    article = _best_search_match(name, hits)
    pageid = article["pageid"]
    page_title = article["title"]

    # Enumerate the article's sections so only the useful ones are fetched.
    try:
        resp = await client.get(
            api,
            params={"action": "parse", "pageid": pageid, "prop": "sections", "format": "json"},
        )
    except httpx.HTTPError:
        return None
    if resp.status_code != 200:
        return None

    all_sections = resp.json().get("parse", {}).get("sections", [])

    sections: dict[str, str] = {}
    # Section 0 is the lead paragraph — always grab it as the description.
    lead_text = await _fetch_fandom_section(client, api, pageid, 0)
    if lead_text:
        sections["description"] = lead_text[:1500]

    # Fetch every section (at any nesting level) whose heading classifies
    # into a field we model, keyed by its lower-cased heading.
    for sec in all_sections:
        heading = _strip_html(sec.get("line", ""))
        if _classify_section(heading) is None:
            continue
        index = sec.get("index", "")
        if not index:
            continue
        body = await _fetch_fandom_section(client, api, pageid, index)
        if body:
            sections[heading.lower()] = body[:3000]

    return {
        "sections": sections,
        "url": f"{base}/wiki/{page_title.replace(' ', '_')}",
    }
async def _fetch_fandom_section(
    client: httpx.AsyncClient, api: str, pageid: int, section: int | str
) -> str | None:
    """Fetch one wiki section and reduce it to clean plain text.

    Returns None on network/API failure or when too little real text remains
    after stripping page chrome.
    """
    try:
        resp = await client.get(
            api,
            params={"action": "parse", "pageid": pageid, "section": section,
                    "prop": "text", "format": "json"},
        )
    except httpx.HTTPError:
        return None
    if resp.status_code != 200:
        return None

    html = resp.json().get("parse", {}).get("text", {}).get("*", "")
    if not html:
        return None

    soup = BeautifulSoup(html, "html.parser")
    # Drop everything that would pollute the prose: infoboxes, tables, ToC,
    # headings, citation markers, and Fandom's injected widgets (the trfc*
    # selectors target the "Quick Answers" / "AI Answers" boxes).
    noise_selector = (
        "table, aside, h1, h2, h3, h4, h5, h6, "
        ".navbox, .toc, #toc, .reference, sup, "
        ".mw-editsection, .portable-infobox, .infobox, "
        ".error, .cite-error, .mw-references-wrap, "
        ".trfc161, section[class^='trfc'], "
        ".fandom-community-question-answer, .qa-placeholder"
    )
    for node in soup.select(noise_selector):
        node.decompose()

    cleaned = soup.get_text(separator=" ", strip=True)
    # Collapse runs of whitespace left behind by the removed elements.
    cleaned = re.sub(r"\s{2,}", " ", cleaned).strip()
    # Fandom sometimes injects a bare date stamp at the top (e.g.
    # "February 1, 2011 ") — strip it.
    cleaned = re.sub(r"^\w+ \d{1,2},? \d{4}\s+", "", cleaned).strip()
    # Anything this short is leftover markup noise, not real content.
    return cleaned if len(cleaned) > 20 else None
def _best_search_match(name: str, results: list[dict]) -> dict:
|
||
name_lower = name.lower()
|
||
for item in results:
|
||
if item["title"].lower() == name_lower:
|
||
return item
|
||
return results[0]
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Wikipedia
# ---------------------------------------------------------------------------

async def _fetch_wikipedia(client: httpx.AsyncClient, name: str, franchise: str) -> dict | None:
    """Pull a clean intro paragraph for the character from Wikipedia.

    Returns {"sections": {...}, "url": ...}; "sections" may be empty when the
    best article is about the franchise rather than the character. Returns
    None only when the search itself fails.
    """
    query = f"{name} {franchise} character"

    try:
        resp = await client.get(
            "https://en.wikipedia.org/w/api.php",
            params={"action": "query", "list": "search", "srsearch": query,
                    "srlimit": 3, "format": "json"},
        )
    except httpx.HTTPError:
        return None
    if resp.status_code != 200:
        return None

    hits = resp.json().get("query", {}).get("search", [])
    if not hits:
        return None

    # Choose the hit whose title shares the most words with the character name.
    name_words = set(name.lower().split())
    best = max(hits, key=lambda hit: len(name_words & set(hit["title"].lower().split())))
    title = best["title"]

    sections: dict[str, str] = {}
    # Only trust the intro as a character description when the article title
    # actually mentions the character — otherwise the best hit is probably a
    # franchise-level page (no dedicated character article exists).
    if name_words & set(title.lower().split()):
        try:
            # Extracts API — clean plain-text intro, no infobox cruft.
            resp = await client.get(
                "https://en.wikipedia.org/w/api.php",
                params={"action": "query", "titles": title, "prop": "extracts",
                        "exintro": True, "explaintext": True, "format": "json"},
            )
            if resp.status_code == 200:
                pages = resp.json().get("query", {}).get("pages", {})
                extract = next(iter(pages.values()), {}).get("extract", "").strip()
                if extract:
                    sections["description"] = re.sub(r"\s{2,}", " ", extract)[:1500]
        except httpx.HTTPError:
            # Best-effort: the URL is still a useful source without the intro.
            pass

    return {
        "sections": sections,
        "url": f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}",
    }
# ---------------------------------------------------------------------------
# Build CharacterData from raw sections
# ---------------------------------------------------------------------------

def _build_character(
    name: str,
    franchise: str,
    sections: dict[str, str],
    sources: list[str],
) -> CharacterData:
    """Fold raw {heading: text} sections into a CharacterData instance."""
    # Prose fields accumulate whole paragraphs; list fields accumulate items.
    prose: dict[str, list[str]] = {
        "appearance": [], "personality": [], "background": [],
    }
    listed: dict[str, list[str]] = {
        "abilities": [], "relationships": [], "quotes": [],
    }
    extra: dict[str, str] = {}

    for heading, text in sections.items():
        if heading == "description":
            continue  # handled directly in the constructor call below
        field = _classify_section(heading)
        if field in prose:
            prose[field].append(text)
        elif field in listed:
            listed[field].extend(_extract_list_items(text))
        else:
            # Unclassified sections are kept (truncated) as extras.
            extra[heading] = text[:500]

    return CharacterData(
        name=name,
        franchise=franchise,
        description=sections.get("description", "").strip(),
        appearance="\n\n".join(prose["appearance"]).strip(),
        personality="\n\n".join(prose["personality"]).strip(),
        background="\n\n".join(prose["background"]).strip(),
        abilities=listed["abilities"][:20],
        relationships=listed["relationships"][:15],
        notable_quotes=listed["quotes"][:10],
        extra_sections=dict(list(extra.items())[:8]),
        sources=sources,
        cached_at=datetime.now(),
    )
def _extract_list_items(text: str) -> list[str]:
|
||
"""Extract bullet items or split prose into sentences if no bullet structure."""
|
||
lines = [l.strip() for l in text.splitlines() if l.strip()]
|
||
|
||
# Check if content is bullet-structured
|
||
bullet_lines = [l.lstrip("-•*–").strip() for l in lines if re.match(r"^[-•*–]", l)]
|
||
if len(bullet_lines) >= 2:
|
||
return [l for l in bullet_lines if len(l) > 5]
|
||
|
||
# Otherwise return short, sentence-split chunks (max 300 chars each)
|
||
sentences = re.split(r"(?<=[.!?])\s+", " ".join(lines))
|
||
items = [s.strip() for s in sentences if len(s.strip()) > 10]
|
||
return items[:15] # cap to avoid bloat
|