From 8e4a160c000f88ec63ee5346f0ab73b57ae39127 Mon Sep 17 00:00:00 2001 From: Aodhan Collins Date: Fri, 20 Mar 2026 01:28:38 +0000 Subject: [PATCH] Add get_character_from_url tool for direct URL-based fetching Allows users to pass a specific URL to fetch character data from. Fandom URLs are auto-detected and processed with the structured MediaWiki API pipeline; non-Fandom URLs use a generic HTML scrape. Co-Authored-By: Claude Opus 4.6 --- USER_GUIDE.md | 22 +++++ src/character_details/fetcher.py | 161 +++++++++++++++++++++++++++++++ src/character_details/server.py | 25 ++++- 3 files changed, 207 insertions(+), 1 deletion(-) diff --git a/USER_GUIDE.md b/USER_GUIDE.md index 3e5b10b..36a8998 100644 --- a/USER_GUIDE.md +++ b/USER_GUIDE.md @@ -115,6 +115,28 @@ Deletes a specific character from the cache. --- +### `get_character_from_url` + +Fetches character data from a specific URL you provide. Fandom wiki URLs are detected automatically and processed with the same structured extraction as `get_character`, giving richer results. Non-Fandom URLs are scraped generically. + +**Parameters:** +| Name | Type | Description | +|---|---|---| +| `url` | string | Full URL to the character page | +| `name` | string | Character name (optional — inferred from the page title for Fandom URLs) | +| `franchise` | string | Franchise name (optional) | + +**Example prompt:** +> Use get_character_from_url to fetch data from https://finalfantasy.fandom.com/wiki/Tifa_Lockhart + +**When to use instead of `get_character`:** +- The character's wiki page isn't found by the automatic search +- You want data from a specific page (e.g. a particular version or alternate wiki) +- The franchise isn't in the supported list and you have a direct Fandom URL +- You have a non-Fandom source you want to pull from + +--- + ### `generate_image_prompt` Builds a comma-separated tag list for image generation tools (Stable Diffusion, Midjourney, DALL-E, etc.). 
diff --git a/src/character_details/fetcher.py b/src/character_details/fetcher.py
index f4858e8..f6d345b 100644
--- a/src/character_details/fetcher.py
+++ b/src/character_details/fetcher.py
@@ -415,6 +415,167 @@ def _build_character(
     )
+
+# ---------------------------------------------------------------------------
+# URL-based fetching
+# ---------------------------------------------------------------------------
+
+_FANDOM_URL_RE = re.compile(
+    r"^https?://(?P<wiki>[a-z0-9-]+)\.fandom\.com/wiki/(?P<title>.+?)(?:#.*)?$",
+    re.IGNORECASE,
+)
+
+
+async def fetch_character_from_url(url: str, name: str = "", franchise: str = "") -> CharacterData:
+    """Fetch character data from a user-supplied URL.
+
+    If the URL points to a Fandom wiki page, the MediaWiki API is used for
+    structured section extraction (same as the franchise-based path).
+    Otherwise, a plain HTML scrape is performed.
+    """
+    sections: dict[str, str] = {}
+    sources: list[str] = [url]
+
+    m = _FANDOM_URL_RE.match(url.strip())
+
+    async with httpx.AsyncClient(timeout=15.0, follow_redirects=True, headers=HEADERS) as client:
+        if m:
+            wiki = m.group("wiki")
+            page_title = m.group("title").replace("_", " ")
+            fandom = await _fetch_fandom_by_title(client, page_title, wiki)
+            if fandom:
+                sections.update(fandom["sections"])
+                # Use the canonical URL returned by the API
+                sources = [fandom["url"]]
+                if not name:
+                    name = page_title
+        else:
+            # Generic HTML scrape for non-Fandom URLs
+            scraped = await _scrape_generic(client, url)
+            if scraped:
+                sections.update(scraped)
+
+    if not name:
+        name = "Unknown"
+    if not franchise:
+        franchise = "Unknown"
+
+    return _build_character(name, franchise, sections, sources)
+
+
+async def _fetch_fandom_by_title(
+    client: httpx.AsyncClient, title: str, wiki: str
+) -> dict | None:
+    """Fetch a Fandom page by exact title (no search step needed)."""
+    base = f"https://{wiki}.fandom.com"
+    api = f"{base}/api.php"
+
+    # Resolve title to pageid
+    try:
+        resp = await client.get(
+            api,
+            
params={"action": "query", "titles": title, "format": "json"}, + ) + except httpx.HTTPError: + return None + + if resp.status_code != 200: + return None + + pages = resp.json().get("query", {}).get("pages", {}) + page = next(iter(pages.values()), {}) + pageid = page.get("pageid") + if not pageid: + return None + + canonical_title = page.get("title", title) + + # Get section list + try: + resp = await client.get( + api, + params={"action": "parse", "pageid": pageid, "prop": "sections", "format": "json"}, + ) + except httpx.HTTPError: + return None + + if resp.status_code != 200: + return None + + all_sections = resp.json().get("parse", {}).get("sections", []) + + sections: dict[str, str] = {} + lead_text = await _fetch_fandom_section(client, api, pageid, 0) + if lead_text: + sections["description"] = lead_text[:1500] + + for sec in all_sections: + sec_title = _strip_html(sec.get("line", "")) + field = _classify_section(sec_title) + if field is None: + continue + index = sec.get("index", "") + if not index: + continue + text = await _fetch_fandom_section(client, api, pageid, index) + if text: + sections[sec_title.lower()] = text[:3000] + + return { + "sections": sections, + "url": f"{base}/wiki/{canonical_title.replace(' ', '_')}", + } + + +async def _scrape_generic(client: httpx.AsyncClient, url: str) -> dict[str, str] | None: + """Best-effort HTML scrape for non-Fandom pages.""" + try: + resp = await client.get(url) + except httpx.HTTPError: + return None + + if resp.status_code != 200: + return None + + soup = BeautifulSoup(resp.text, "html.parser") + + # Remove navigation, headers, footers, scripts, styles + for tag in soup.select("nav, header, footer, script, style, aside, .sidebar, .navbox"): + tag.decompose() + + sections: dict[str, str] = {} + + # Try to grab the first few paragraphs as a description + paragraphs = soup.find_all("p") + desc_parts = [] + for p in paragraphs[:10]: + text = p.get_text(separator=" ", strip=True) + if len(text) > 30: + 
desc_parts.append(text) + if sum(len(t) for t in desc_parts) > 1500: + break + if desc_parts: + sections["description"] = " ".join(desc_parts)[:1500] + + # Try to extract headed sections + for heading in soup.find_all(["h2", "h3"]): + title = heading.get_text(strip=True) + field = _classify_section(title) + if field is None: + continue + # Collect text until the next heading + content_parts = [] + for sibling in heading.find_next_siblings(): + if sibling.name in {"h2", "h3", "h1"}: + break + text = sibling.get_text(separator=" ", strip=True) + if text: + content_parts.append(text) + if content_parts: + sections[title.lower()] = " ".join(content_parts)[:3000] + + return sections if sections else None + + def _extract_list_items(text: str) -> list[str]: """Extract bullet items or split prose into sentences if no bullet structure.""" lines = [l.strip() for l in text.splitlines() if l.strip()] diff --git a/src/character_details/server.py b/src/character_details/server.py index d7ef7fb..05bf595 100644 --- a/src/character_details/server.py +++ b/src/character_details/server.py @@ -1,7 +1,7 @@ from mcp.server.fastmcp import FastMCP from .cache import delete_cached, get_cached, list_cached, save_cache -from .fetcher import fetch_character +from .fetcher import fetch_character, fetch_character_from_url from .models import CharacterData mcp = FastMCP("character-details") @@ -123,6 +123,29 @@ async def remove_character(name: str, franchise: str) -> str: return f"{name} ({franchise}) was not in the cache." +@mcp.tool() +async def get_character_from_url( + url: str, + name: str = "", + franchise: str = "", +) -> str: + """ + Fetch character details from a specific URL. + + If the URL points to a Fandom wiki page, structured section extraction + is used automatically for richer results. Otherwise a generic HTML + scrape is performed. + + Args: + url: Full URL to the character page (e.g. 
"https://finalfantasy.fandom.com/wiki/Tifa_Lockhart") + name: Character name (optional — inferred from the page title for Fandom URLs) + franchise: Franchise or series (optional) + """ + character = await fetch_character_from_url(url, name, franchise) + save_cache(character) + return _format_character(character) + + @mcp.tool() async def generate_image_prompt( name: str,