Files
character-mcp/src/character_details/fetcher.py
Aodhan Collins d4d6788d26 Add 13 new Fandom wiki sources and integration tests
Expand franchise wiki mappings to cover Uma Musume, Fire Emblem,
Senran Kagura, Vocaloid, Dragon Ball, League of Legends, Street Fighter,
Sonic, Spy x Family, Zelda, The Witcher, Metroid, and Pokemon. Also
expand Final Fantasy aliases to cover all numbered titles I–XVI with
both arabic and roman numeral variants.

Adds parametrized integration tests that verify each wiki endpoint
returns valid CharacterData with a description and Fandom source URL.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 21:41:53 +00:00

431 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Fetches fictional character data from Fandom wikis and Wikipedia.
Strategy:
1. Try the franchise-specific Fandom wiki via its MediaWiki API (richer data)
2. Fall back to / supplement with Wikipedia
3. Parse sections into structured CharacterData fields
"""
import re
from datetime import datetime
import httpx
from bs4 import BeautifulSoup
from .models import CharacterData
# Wikipedia requires a descriptive user-agent with contact info
# (Wikimedia User-Agent policy). These headers are attached to the shared
# AsyncClient in fetch_character, so they are sent on every outbound
# request — Fandom as well as Wikipedia.
HEADERS = {
    "User-Agent": (
        "character-details-mcp/1.0 "
        "(https://github.com/example/character-details; contact@example.com)"
    )
}
# Map franchise keywords -> Fandom community subdomain.
#
# Final Fantasy I-XVI each take four aliases ("final fantasy vii",
# "final fantasy 7", "ffvii", "ff7").  Those 64 entries are generated from
# the roman-numeral table below instead of being hand-maintained, so adding
# a future numbered title is a one-token change.
_FF_ROMAN = (
    "i", "ii", "iii", "iv", "v", "vi", "vii", "viii",
    "ix", "x", "xi", "xii", "xiii", "xiv", "xv", "xvi",
)

FRANCHISE_WIKIS: dict[str, str] = {
    "final fantasy": "finalfantasy",
    # Generated aliases for the numbered Final Fantasy titles I-XVI.
    **{
        alias: "finalfantasy"
        for number, roman in enumerate(_FF_ROMAN, start=1)
        for alias in (
            f"final fantasy {roman}",
            f"final fantasy {number}",
            f"ff{roman}",
            f"ff{number}",
        )
    },
    "super mario": "mario",
    "mario": "mario",
    "little witch academia": "little-witch-academia",
    "lwa": "little-witch-academia",
    "uma musume": "umamusume",
    "umamusume": "umamusume",
    "uma musume pretty derby": "umamusume",
    "fire emblem": "fireemblem",
    "senran kagura": "senrankagura",
    "vocaloid": "vocaloid",
    "dragon ball": "dragonball",
    "dragon ball z": "dragonball",
    "dbz": "dragonball",
    "dragon ball super": "dragonball",
    "dbs": "dragonball",
    "league of legends": "leagueoflegends",
    "lol": "leagueoflegends",
    "street fighter": "streetfighter",
    "sonic": "sonic",
    "sonic the hedgehog": "sonic",
    "spy x family": "spy-x-family",
    "spy family": "spy-x-family",
    "spyxfamily": "spy-x-family",
    "zelda": "zelda",
    "the legend of zelda": "zelda",
    "legend of zelda": "zelda",
    "witcher": "witcher",
    "the witcher": "witcher",
    "metroid": "metroid",
    "pokemon": "pokemon",
    "pokémon": "pokemon",
}
# Section title keywords -> model field.  A wiki section heading is split
# into words by _classify_section and routed to the first set below that it
# intersects (the sets are checked in the order listed here); headings that
# match none of them end up in CharacterData.extra_sections via
# _build_character.
APPEARANCE_KW = {"appearance", "design", "outfit", "clothing", "physical"}
PERSONALITY_KW = {"personality", "traits", "behavior", "attitude", "nature"}
BACKGROUND_KW = {"background", "history", "biography", "story", "backstory", "past"}
ABILITIES_KW = {"abilities", "powers", "skills", "magic", "combat", "techniques", "weapons"}
RELATIONSHIPS_KW = {"relationships", "family", "friends", "allies", "enemies", "romance"}
QUOTES_KW = {"quotes", "sayings", "dialogue"}
def _strip_html(html: str) -> str:
    """Return the plain text of *html* with all markup removed."""
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text(separator=" ", strip=True)
def _classify_section(title: str) -> str | None:
    """Map a section heading to a CharacterData field name, or None.

    Matching is done on whole words so that e.g. "characteristics" does not
    accidentally match a shorter keyword by substring.
    """
    tokens = set(re.split(r"\W+", title.lower()))
    # Checked in priority order; first intersecting set wins.
    for keywords, field in (
        (APPEARANCE_KW, "appearance"),
        (PERSONALITY_KW, "personality"),
        (BACKGROUND_KW, "background"),
        (ABILITIES_KW, "abilities"),
        (RELATIONSHIPS_KW, "relationships"),
        (QUOTES_KW, "quotes"),
    ):
        if tokens & keywords:
            return field
    return None
def _find_wiki(franchise: str) -> str | None:
    """Return the Fandom subdomain for *franchise*, or None if unmapped.

    Lookup is case-insensitive and ignores surrounding whitespace.
    """
    key = franchise.strip().lower()
    return FRANCHISE_WIKIS.get(key)
async def fetch_character(name: str, franchise: str) -> CharacterData:
    """Fetch character from Fandom and/or Wikipedia, return structured data."""
    collected: dict[str, str] = {}
    source_urls: list[str] = []
    async with httpx.AsyncClient(timeout=15.0, follow_redirects=True, headers=HEADERS) as client:
        # 1. Franchise-specific Fandom wiki first (MediaWiki API) — the
        #    richest source of character-specific sections.
        if (wiki := _find_wiki(franchise)) and (fandom := await _fetch_fandom(client, name, wiki)):
            collected.update(fandom["sections"])
            source_urls.append(fandom["url"])
        # 2. Wikipedia: its description always replaces Fandom's (the Fandom
        #    lead tends to be infobox-polluted); other sections only fill
        #    gaps Fandom left.
        if wikipedia := await _fetch_wikipedia(client, name, franchise):
            for key, text in wikipedia["sections"].items():
                if key == "description" or key not in collected:
                    collected[key] = text
            source_urls.append(wikipedia["url"])
    return _build_character(name, franchise, collected, source_urls)
# ---------------------------------------------------------------------------
# Fandom (MediaWiki API — avoids Cloudflare-blocked /api/v1/)
# ---------------------------------------------------------------------------
async def _fetch_fandom(client: httpx.AsyncClient, name: str, wiki: str) -> dict | None:
    """Search the *wiki* Fandom community for *name* and scrape its sections.

    Returns {"sections": {key -> text}, "url": article URL}, where keys are
    "description" (the article lead, capped at 1500 chars) plus the
    lowercased heading of every section _classify_section recognizes
    (each capped at 3000 chars).  Returns None when either API request
    errors or returns non-200, or when the search finds nothing.
    """
    base = f"https://{wiki}.fandom.com"
    # Classic MediaWiki api.php endpoint — Fandom's /api/v1/ is
    # Cloudflare-blocked (see section header above).
    api = f"{base}/api.php"
    # Search for the article
    try:
        resp = await client.get(
            api,
            params={"action": "query", "list": "search", "srsearch": name,
                    "srlimit": 5, "format": "json"},
        )
    except httpx.HTTPError:
        return None
    if resp.status_code != 200:
        return None
    results = resp.json().get("query", {}).get("search", [])
    if not results:
        return None
    # Pick best match (exact name preferred)
    article = _best_search_match(name, results)
    pageid = article["pageid"]
    page_title = article["title"]
    # Get section list
    try:
        resp = await client.get(
            api,
            params={"action": "parse", "pageid": pageid, "prop": "sections", "format": "json"},
        )
    except httpx.HTTPError:
        return None
    if resp.status_code != 200:
        return None
    all_sections = resp.json().get("parse", {}).get("sections", [])
    # Always fetch lead (section 0) as description
    sections: dict[str, str] = {}
    lead_text = await _fetch_fandom_section(client, api, pageid, 0)
    if lead_text:
        sections["description"] = lead_text[:1500]
    # Fetch every section (any nesting level — nothing filters on toclevel)
    # whose heading classifies into a field of interest; one request per
    # section.
    for sec in all_sections:
        title = _strip_html(sec.get("line", ""))
        field = _classify_section(title)
        if field is None:
            continue
        index = sec.get("index", "")
        if not index:
            continue
        text = await _fetch_fandom_section(client, api, pageid, index)
        if text:
            # Keyed by the lowercased heading rather than the classified
            # field, so two distinct headings never overwrite each other;
            # _build_character re-classifies these keys later.
            key = title.lower()
            sections[key] = text[:3000]
    return {
        "sections": sections,
        "url": f"{base}/wiki/{page_title.replace(' ', '_')}",
    }
async def _fetch_fandom_section(
    client: httpx.AsyncClient, api: str, pageid: int, section: int | str
) -> str | None:
    """Fetch one parsed article section from Fandom as cleaned plain text.

    Returns None on any HTTP failure, when the API returns no HTML, or when
    fewer than ~20 characters of text survive the cleanup.
    """
    request_params = {"action": "parse", "pageid": pageid, "section": section,
                     "prop": "text", "format": "json"}
    try:
        response = await client.get(api, params=request_params)
    except httpx.HTTPError:
        return None
    if response.status_code != 200:
        return None
    raw_html = response.json().get("parse", {}).get("text", {}).get("*", "")
    if not raw_html:
        return None
    # Remove noisy elements: infoboxes, tables, ToC, headings, references,
    # and Fandom's "Quick Answers" / "AI Answers" widgets.
    noise_selector = (
        "table, aside, h1, h2, h3, h4, h5, h6, "
        ".navbox, .toc, #toc, .reference, sup, "
        ".mw-editsection, .portable-infobox, .infobox, "
        ".error, .cite-error, .mw-references-wrap, "
        ".trfc161, section[class^='trfc'], "
        ".fandom-community-question-answer, .qa-placeholder"
    )
    soup = BeautifulSoup(raw_html, "html.parser")
    for noisy_tag in soup.select(noise_selector):
        noisy_tag.decompose()
    cleaned = soup.get_text(separator=" ", strip=True)
    # Collapse runs of whitespace into single spaces.
    cleaned = re.sub(r"\s{2,}", " ", cleaned).strip()
    # Drop a stray leading date stamp Fandom sometimes injects
    # (e.g. "February 1, 2011 ").
    cleaned = re.sub(r"^\w+ \d{1,2},? \d{4}\s+", "", cleaned).strip()
    return cleaned if len(cleaned) > 20 else None
def _best_search_match(name: str, results: list[dict]) -> dict:
name_lower = name.lower()
for item in results:
if item["title"].lower() == name_lower:
return item
return results[0]
# ---------------------------------------------------------------------------
# Wikipedia
# ---------------------------------------------------------------------------
async def _fetch_wikipedia(client: httpx.AsyncClient, name: str, franchise: str) -> dict | None:
    """Search English Wikipedia for the character.

    Returns {"sections": {...}, "url": article URL} or None when the search
    request fails or finds nothing.  "sections" carries at most a
    "description" entry, and only when the chosen article's title shares at
    least one word with the character name.
    """
    query = f"{name} {franchise} character"
    try:
        search_resp = await client.get(
            "https://en.wikipedia.org/w/api.php",
            params={"action": "query", "list": "search", "srsearch": query,
                    "srlimit": 3, "format": "json"},
        )
    except httpx.HTTPError:
        return None
    if search_resp.status_code != 200:
        return None
    hits = search_resp.json().get("query", {}).get("search", [])
    if not hits:
        return None
    # Prefer the hit whose title shares the most words with the character name.
    name_words = set(name.lower().split())

    def _title_overlap(hit: dict) -> int:
        return len(name_words & set(hit["title"].lower().split()))

    top_hit = max(hits, key=_title_overlap)
    title = top_hit["title"]
    # Zero word overlap suggests we landed on a franchise article rather
    # than a dedicated character article.
    article_is_about_character = bool(name_words & set(title.lower().split()))
    sections: dict[str, str] = {}
    # The Extracts API yields a clean plain-text intro (no infobox cruft);
    # only trust it as the description when the article is about the character.
    if article_is_about_character:
        try:
            extract_resp = await client.get(
                "https://en.wikipedia.org/w/api.php",
                params={"action": "query", "titles": title, "prop": "extracts",
                        "exintro": True, "explaintext": True, "format": "json"},
            )
            if extract_resp.status_code == 200:
                pages = extract_resp.json().get("query", {}).get("pages", {})
                intro = next(iter(pages.values()), {}).get("extract", "").strip()
                if intro:
                    sections["description"] = re.sub(r"\s{2,}", " ", intro)[:1500]
        except httpx.HTTPError:
            pass  # best-effort: fall through with whatever we have
    return {
        "sections": sections,
        "url": f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}",
    }
# ---------------------------------------------------------------------------
# Build CharacterData from raw sections
# ---------------------------------------------------------------------------
def _build_character(
    name: str,
    franchise: str,
    sections: dict[str, str],
    sources: list[str],
) -> CharacterData:
    """Assemble a CharacterData from raw section texts.

    Prose fields (appearance / personality / background) concatenate every
    matching section with blank lines; list fields (abilities /
    relationships / quotes) are bullet- or sentence-split and capped; any
    unclassified section lands in extra_sections, truncated to 500 chars
    with at most 8 entries kept.
    """
    prose: dict[str, list[str]] = {
        "appearance": [],
        "personality": [],
        "background": [],
    }
    listed: dict[str, list[str]] = {
        "abilities": [],
        "relationships": [],
        "quotes": [],
    }
    extras: dict[str, str] = {}
    for heading, body in sections.items():
        if heading == "description":
            continue  # pulled out separately below
        bucket = _classify_section(heading)
        if bucket in prose:
            prose[bucket].append(body)
        elif bucket in listed:
            listed[bucket].extend(_extract_list_items(body))
        else:
            extras[heading] = body[:500]
    return CharacterData(
        name=name,
        franchise=franchise,
        description=sections.get("description", "").strip(),
        appearance="\n\n".join(prose["appearance"]).strip(),
        personality="\n\n".join(prose["personality"]).strip(),
        background="\n\n".join(prose["background"]).strip(),
        abilities=listed["abilities"][:20],
        relationships=listed["relationships"][:15],
        notable_quotes=listed["quotes"][:10],
        extra_sections=dict(list(extras.items())[:8]),
        sources=sources,
        # NOTE(review): naive local timestamp, matching the original;
        # confirm whether the model expects UTC before changing.
        cached_at=datetime.now(),
    )
def _extract_list_items(text: str) -> list[str]:
"""Extract bullet items or split prose into sentences if no bullet structure."""
lines = [l.strip() for l in text.splitlines() if l.strip()]
# Check if content is bullet-structured
bullet_lines = [l.lstrip("-•*").strip() for l in lines if re.match(r"^[-•*]", l)]
if len(bullet_lines) >= 2:
return [l for l in bullet_lines if len(l) > 5]
# Otherwise return short, sentence-split chunks (max 300 chars each)
sentences = re.split(r"(?<=[.!?])\s+", " ".join(lines))
items = [s.strip() for s in sentences if len(s.strip()) > 10]
return items[:15] # cap to avoid bloat