Initial commit: Character Details MCP Server

This commit is contained in:
Aodhan Collins
2026-03-06 19:23:05 +00:00
commit f330fe8eb3
13 changed files with 1028 additions and 0 deletions

View File

@@ -0,0 +1,343 @@
"""
Fetches fictional character data from Fandom wikis and Wikipedia.
Strategy:
1. Try the franchise-specific Fandom wiki via its MediaWiki API (richer data)
2. Fall back to / supplement with Wikipedia
3. Parse sections into structured CharacterData fields
"""
import re
from datetime import datetime
import httpx
from bs4 import BeautifulSoup
from .models import CharacterData
# Wikipedia requires a descriptive user-agent with contact info
HEADERS = {
    "User-Agent": (
        "character-details-mcp/1.0 "
        "(https://github.com/example/character-details; contact@example.com)"
    )
}
# Map franchise keywords -> Fandom community subdomain
# (e.g. "finalfantasy" -> https://finalfantasy.fandom.com).
# Keys are matched against the lowercased, stripped franchise string
# in _find_wiki, so multiple aliases map to the same wiki.
FRANCHISE_WIKIS: dict[str, str] = {
    "final fantasy vii": "finalfantasy",
    "final fantasy 7": "finalfantasy",
    "ff7": "finalfantasy",
    "ffvii": "finalfantasy",
    "final fantasy": "finalfantasy",
    "super mario": "mario",
    "mario": "mario",
    "little witch academia": "little-witch-academia",
    "lwa": "little-witch-academia",
}
# Section title keywords -> model field.
# _classify_section tokenizes a heading into whole words and checks these
# sets in the order they are listed below (first match wins).
APPEARANCE_KW = {"appearance", "design", "outfit", "clothing", "physical"}
PERSONALITY_KW = {"personality", "traits", "behavior", "attitude", "nature"}
BACKGROUND_KW = {"background", "history", "biography", "story", "backstory", "past"}
ABILITIES_KW = {"abilities", "powers", "skills", "magic", "combat", "techniques", "weapons"}
RELATIONSHIPS_KW = {"relationships", "family", "friends", "allies", "enemies", "romance"}
QUOTES_KW = {"quotes", "sayings", "dialogue"}
def _strip_html(html: str) -> str:
    """Return the plain text of an HTML fragment, with tag text space-separated."""
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text(separator=" ", strip=True)
def _classify_section(title: str) -> str | None:
    """Map a section heading to a CharacterData field name, or None if unknown.

    The heading is tokenized into whole words before matching, so e.g.
    "Characteristics" does not accidentally match a "character" keyword.
    Keyword sets are checked in a fixed priority order; first hit wins.
    """
    tokens = set(re.split(r"\W+", title.lower()))
    dispatch = (
        (APPEARANCE_KW, "appearance"),
        (PERSONALITY_KW, "personality"),
        (BACKGROUND_KW, "background"),
        (ABILITIES_KW, "abilities"),
        (RELATIONSHIPS_KW, "relationships"),
        (QUOTES_KW, "quotes"),
    )
    for keywords, field in dispatch:
        if not tokens.isdisjoint(keywords):
            return field
    return None
def _find_wiki(franchise: str) -> str | None:
    """Look up the Fandom subdomain for a franchise (case/whitespace-insensitive)."""
    key = franchise.lower().strip()
    return FRANCHISE_WIKIS.get(key)
async def fetch_character(name: str, franchise: str) -> CharacterData:
    """Fetch character from Fandom and/or Wikipedia, return structured data."""
    collected: dict[str, str] = {}
    source_urls: list[str] = []
    async with httpx.AsyncClient(timeout=15.0, follow_redirects=True, headers=HEADERS) as client:
        # 1. Franchise-specific Fandom wiki (MediaWiki API) — richest
        #    character-specific sections, when a known wiki exists.
        subdomain = _find_wiki(franchise)
        if subdomain:
            fandom = await _fetch_fandom(client, name, subdomain)
            if fandom:
                collected.update(fandom["sections"])
                source_urls.append(fandom["url"])
        # 2. Wikipedia: its description always replaces the Fandom lead
        #    (cleaner, no infobox cruft); other sections only fill gaps
        #    Fandom left empty.
        wikipedia = await _fetch_wikipedia(client, name, franchise)
        if wikipedia:
            for section_key, text in wikipedia["sections"].items():
                if section_key == "description" or section_key not in collected:
                    collected[section_key] = text
            source_urls.append(wikipedia["url"])
    return _build_character(name, franchise, collected, source_urls)
# ---------------------------------------------------------------------------
# Fandom (MediaWiki API — avoids Cloudflare-blocked /api/v1/)
# ---------------------------------------------------------------------------
async def _fetch_fandom(client: httpx.AsyncClient, name: str, wiki: str) -> dict | None:
    """Fetch character sections from a Fandom wiki via its MediaWiki api.php.

    Returns {"sections": {key: text}, "url": article_url}, or None when the
    search or API calls fail or find nothing.  Network errors are swallowed
    so the caller can fall back to Wikipedia.
    """
    base = f"https://{wiki}.fandom.com"
    api = f"{base}/api.php"
    # Search for the article
    try:
        resp = await client.get(
            api,
            params={"action": "query", "list": "search", "srsearch": name,
                    "srlimit": 5, "format": "json"},
        )
    except httpx.HTTPError:
        return None
    if resp.status_code != 200:
        return None
    results = resp.json().get("query", {}).get("search", [])
    if not results:
        return None
    # Pick best match (exact name preferred)
    article = _best_search_match(name, results)
    pageid = article["pageid"]
    page_title = article["title"]
    # Get section list
    try:
        resp = await client.get(
            api,
            params={"action": "parse", "pageid": pageid, "prop": "sections", "format": "json"},
        )
    except httpx.HTTPError:
        return None
    if resp.status_code != 200:
        return None
    all_sections = resp.json().get("parse", {}).get("sections", [])
    # Always fetch lead (section 0) as description
    sections: dict[str, str] = {}
    lead_text = await _fetch_fandom_section(client, api, pageid, 0)
    if lead_text:
        sections["description"] = lead_text[:1500]
    # Fetch every section whose heading classifies to a field of interest.
    # NOTE(review): no toclevel filter is applied, so matching sub-sections
    # are fetched as well as top-level ones — confirm this is intended.
    for sec in all_sections:
        # "line" holds the heading markup; strip tags to get the plain title.
        title = _strip_html(sec.get("line", ""))
        field = _classify_section(title)
        if field is None:
            continue
        index = sec.get("index", "")
        if not index:
            continue
        text = await _fetch_fandom_section(client, api, pageid, index)
        if text:
            # Keyed by lowercased heading so _build_character can re-classify it.
            key = title.lower()
            sections[key] = text[:3000]
    return {
        "sections": sections,
        "url": f"{base}/wiki/{page_title.replace(' ', '_')}",
    }
async def _fetch_fandom_section(
    client: httpx.AsyncClient, api: str, pageid: int, section: int | str
) -> str | None:
    """Fetch one article section via action=parse and return cleaned plain text.

    Returns None on network/HTTP failure, on empty HTML, or when the cleaned
    text is too short (<= 20 chars) to be useful.
    """
    params = {"action": "parse", "pageid": pageid, "section": section,
              "prop": "text", "format": "json"}
    try:
        response = await client.get(api, params=params)
    except httpx.HTTPError:
        return None
    if response.status_code != 200:
        return None
    raw_html = response.json().get("parse", {}).get("text", {}).get("*", "")
    if not raw_html:
        return None
    # Remove noisy elements — infoboxes, tables, ToC, headings, references,
    # and Fandom-injected widgets — before extracting text.
    parsed = BeautifulSoup(raw_html, "html.parser")
    noise_selector = (
        "table, aside, h1, h2, h3, h4, h5, h6, "
        ".navbox, .toc, #toc, .reference, sup, "
        ".mw-editsection, .portable-infobox, .infobox, "
        ".error, .cite-error, .mw-references-wrap, "
        # Fandom "Quick Answers" / "AI Answers" widgets
        ".trfc161, section[class^='trfc'], "
        ".fandom-community-question-answer, .qa-placeholder"
    )
    for junk in parsed.select(noise_selector):
        junk.decompose()
    cleaned = parsed.get_text(separator=" ", strip=True)
    # Collapse runs of whitespace into single spaces.
    cleaned = re.sub(r"\s{2,}", " ", cleaned).strip()
    # Drop stray date stamps Fandom sometimes injects at the top
    # (e.g. "February 1, 2011 ").
    cleaned = re.sub(r"^\w+ \d{1,2},? \d{4}\s+", "", cleaned).strip()
    return cleaned if len(cleaned) > 20 else None
def _best_search_match(name: str, results: list[dict]) -> dict:
name_lower = name.lower()
for item in results:
if item["title"].lower() == name_lower:
return item
return results[0]
# ---------------------------------------------------------------------------
# Wikipedia
# ---------------------------------------------------------------------------
async def _fetch_wikipedia(client: httpx.AsyncClient, name: str, franchise: str) -> dict | None:
    """Search Wikipedia for the character and return its intro as a description.

    Returns {"sections": {...}, "url": article_url}; "sections" may be empty
    when the best search hit looks like a franchise article rather than a
    character article.  Returns None when the search fails or finds nothing.
    """
    search_query = f"{name} {franchise} character"
    try:
        resp = await client.get(
            "https://en.wikipedia.org/w/api.php",
            params={"action": "query", "list": "search", "srsearch": search_query,
                    "srlimit": 3, "format": "json"},
        )
    except httpx.HTTPError:
        return None
    if resp.status_code != 200:
        return None
    results = resp.json().get("query", {}).get("search", [])
    if not results:
        return None
    # Pick the result whose title best overlaps with the character name
    name_words = set(name.lower().split())
    best = max(results, key=lambda r: len(name_words & set(r["title"].lower().split())))
    title = best["title"]
    title_words = set(title.lower().split())
    # Heuristic: any word shared between name and title => character article.
    article_is_about_character = bool(name_words & title_words)
    sections: dict[str, str] = {}
    # Extracts API — clean plain-text intro, no infobox cruft
    # Only use as description if the Wikipedia article is actually about the character
    # (not about the franchise, which happens when no dedicated character article exists)
    if article_is_about_character:
        try:
            resp = await client.get(
                "https://en.wikipedia.org/w/api.php",
                params={"action": "query", "titles": title, "prop": "extracts",
                        "exintro": True, "explaintext": True, "format": "json"},
            )
            if resp.status_code == 200:
                pages = resp.json().get("query", {}).get("pages", {})
                extract = next(iter(pages.values()), {}).get("extract", "").strip()
                if extract:
                    # Collapse whitespace and cap length to keep payloads small.
                    sections["description"] = re.sub(r"\s{2,}", " ", extract)[:1500]
        except httpx.HTTPError:
            pass  # best-effort: still return the article URL without a description
    return {
        "sections": sections,
        "url": f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}",
    }
# ---------------------------------------------------------------------------
# Build CharacterData from raw sections
# ---------------------------------------------------------------------------
def _build_character(
    name: str,
    franchise: str,
    sections: dict[str, str],
    sources: list[str],
) -> CharacterData:
    """Assemble a CharacterData from raw {section-title: text} pairs.

    Each non-description section is re-classified by its title: prose fields
    (appearance/personality/background) are concatenated, list-like fields
    (abilities/relationships/quotes) are split into items and capped, and
    anything unrecognized lands in extra_sections, truncated.
    """
    prose: dict[str, list[str]] = {
        "appearance": [],
        "personality": [],
        "background": [],
    }
    listy: dict[str, list[str]] = {
        "abilities": [],
        "relationships": [],
        "quotes": [],
    }
    extra: dict[str, str] = {}
    for title, text in sections.items():
        if title == "description":
            continue
        field = _classify_section(title)
        if field in prose:
            prose[field].append(text)
        elif field in listy:
            listy[field].extend(_extract_list_items(text))
        else:
            extra[title] = text[:500]
    return CharacterData(
        name=name,
        franchise=franchise,
        description=sections.get("description", "").strip(),
        appearance="\n\n".join(prose["appearance"]).strip(),
        personality="\n\n".join(prose["personality"]).strip(),
        background="\n\n".join(prose["background"]).strip(),
        abilities=listy["abilities"][:20],
        relationships=listy["relationships"][:15],
        notable_quotes=listy["quotes"][:10],
        extra_sections=dict(list(extra.items())[:8]),
        sources=sources,
        cached_at=datetime.now(),
    )
def _extract_list_items(text: str) -> list[str]:
"""Extract bullet items or split prose into sentences if no bullet structure."""
lines = [l.strip() for l in text.splitlines() if l.strip()]
# Check if content is bullet-structured
bullet_lines = [l.lstrip("-•*").strip() for l in lines if re.match(r"^[-•*]", l)]
if len(bullet_lines) >= 2:
return [l for l in bullet_lines if len(l) > 5]
# Otherwise return short, sentence-split chunks (max 300 chars each)
sentences = re.split(r"(?<=[.!?])\s+", " ".join(lines))
items = [s.strip() for s in sentences if len(s.strip()) > 10]
return items[:15] # cap to avoid bloat