Add get_character_from_url tool for direct URL-based fetching

Allows users to pass a specific URL to fetch character data from.
Fandom URLs are auto-detected and processed with the structured
MediaWiki API pipeline; non-Fandom URLs use a generic HTML scrape.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Aodhan Collins
2026-03-20 01:28:38 +00:00
parent d4d6788d26
commit 8e4a160c00
3 changed files with 207 additions and 1 deletions

View File

@@ -115,6 +115,28 @@ Deletes a specific character from the cache.
---
### `get_character_from_url`
Fetches character data from a specific URL you provide. Fandom wiki URLs are detected automatically and processed with the same structured extraction as `get_character`, giving richer results. Non-Fandom URLs are scraped generically.
**Parameters:**
| Name | Type | Description |
|---|---|---|
| `url` | string | Full URL to the character page |
| `name` | string | Character name (optional — inferred from the page title for Fandom URLs) |
| `franchise` | string | Franchise name (optional) |
**Example prompt:**
> Use get_character_from_url to fetch data from https://finalfantasy.fandom.com/wiki/Tifa_Lockhart
**When to use instead of `get_character`:**
- The character's wiki page isn't found by the automatic search
- You want data from a specific page (e.g. a particular version or alternate wiki)
- The franchise isn't in the supported list and you have a direct Fandom URL
- You have a non-Fandom source you want to pull from
---
### `generate_image_prompt`
Builds a comma-separated tag list for image generation tools (Stable Diffusion, Midjourney, DALL-E, etc.).

View File

@@ -415,6 +415,167 @@ def _build_character(
)
# ---------------------------------------------------------------------------
# URL-based fetching
# ---------------------------------------------------------------------------
_FANDOM_URL_RE = re.compile(
r"^https?://(?P<wiki>[a-z0-9-]+)\.fandom\.com/wiki/(?P<title>.+?)(?:#.*)?$",
re.IGNORECASE,
)
async def fetch_character_from_url(url: str, name: str = "", franchise: str = "") -> CharacterData:
    """Fetch character data from a user-supplied URL.

    If the URL points to a Fandom wiki page, the MediaWiki API is used for
    structured section extraction (same as the franchise-based path).
    Otherwise, a plain HTML scrape is performed.

    Args:
        url: Full URL to the character page.
        name: Character name; inferred from the page title for Fandom URLs.
        franchise: Franchise name; "Unknown" when omitted.

    Returns:
        A CharacterData built from whatever sections could be extracted
        (possibly empty if every fetch fails).
    """
    from urllib.parse import unquote  # stdlib; imported locally to stay self-contained

    sections: dict[str, str] = {}
    sources: list[str] = [url]
    m = _FANDOM_URL_RE.match(url.strip())
    async with httpx.AsyncClient(timeout=15.0, follow_redirects=True, headers=HEADERS) as client:
        if m:
            wiki = m.group("wiki")
            # Fandom titles are often percent-encoded (e.g. apostrophes);
            # decode before talking to the MediaWiki API.
            page_title = unquote(m.group("title")).replace("_", " ")
            fandom = await _fetch_fandom_by_title(client, page_title, wiki)
            if fandom:
                sections.update(fandom["sections"])
                # Use the canonical URL returned by the API
                sources = [fandom["url"]]
            else:
                # API lookup failed — fall back to scraping the page HTML
                # rather than returning an empty character.
                scraped = await _scrape_generic(client, url)
                if scraped:
                    sections.update(scraped)
            if not name:
                name = page_title
        else:
            # Generic HTML scrape for non-Fandom URLs
            scraped = await _scrape_generic(client, url)
            if scraped:
                sections.update(scraped)
    if not name:
        name = "Unknown"
    if not franchise:
        franchise = "Unknown"
    return _build_character(name, franchise, sections, sources)
async def _fetch_fandom_by_title(
    client: httpx.AsyncClient, title: str, wiki: str
) -> dict | None:
    """Fetch a Fandom page by exact title (no search step needed).

    Args:
        client: Shared async HTTP client used for all API calls.
        title: Page title with spaces (not underscores).
        wiki: Fandom subdomain, e.g. "finalfantasy".

    Returns:
        A dict with "sections" (heading -> text) and "url" (canonical page
        URL), or None if the page cannot be resolved or a request fails.
    """
    base = f"https://{wiki}.fandom.com"
    api = f"{base}/api.php"
    # Resolve title to pageid. "redirects" tells the MediaWiki API to follow
    # wiki redirects, so an alias title resolves to the target article
    # instead of the redirect stub.
    try:
        resp = await client.get(
            api,
            params={"action": "query", "titles": title, "redirects": 1, "format": "json"},
        )
    except httpx.HTTPError:
        return None
    if resp.status_code != 200:
        return None
    pages = resp.json().get("query", {}).get("pages", {})
    page = next(iter(pages.values()), {})
    pageid = page.get("pageid")
    if not pageid:
        # Missing pages come back keyed "-1" with no "pageid".
        return None
    canonical_title = page.get("title", title)
    # Get the section list so only relevant sections are fetched.
    try:
        resp = await client.get(
            api,
            params={"action": "parse", "pageid": pageid, "prop": "sections", "format": "json"},
        )
    except httpx.HTTPError:
        return None
    if resp.status_code != 200:
        return None
    all_sections = resp.json().get("parse", {}).get("sections", [])
    sections: dict[str, str] = {}
    # Section index 0 is the lead text (everything before the first heading).
    lead_text = await _fetch_fandom_section(client, api, pageid, 0)
    if lead_text:
        sections["description"] = lead_text[:1500]
    for sec in all_sections:
        sec_title = _strip_html(sec.get("line", ""))
        # _classify_section acts as a relevance filter; unrecognized
        # headings are skipped to keep the payload small.
        field = _classify_section(sec_title)
        if field is None:
            continue
        index = sec.get("index", "")
        if not index:
            continue
        text = await _fetch_fandom_section(client, api, pageid, index)
        if text:
            sections[sec_title.lower()] = text[:3000]
    return {
        "sections": sections,
        "url": f"{base}/wiki/{canonical_title.replace(' ', '_')}",
    }
async def _scrape_generic(client: httpx.AsyncClient, url: str) -> dict[str, str] | None:
    """Best-effort HTML scrape for non-Fandom pages.

    Pulls a description from the leading paragraphs and collects the text of
    any recognized headed section. Returns None on fetch failure or when
    nothing useful could be extracted.
    """
    try:
        resp = await client.get(url)
    except httpx.HTTPError:
        return None
    if resp.status_code != 200:
        return None

    doc = BeautifulSoup(resp.text, "html.parser")
    # Drop page chrome that would pollute the extracted text.
    for noise in doc.select("nav, header, footer, script, style, aside, .sidebar, .navbox"):
        noise.decompose()

    sections: dict[str, str] = {}

    # Description: join the first substantial paragraphs, up to ~1500 chars.
    collected: list[str] = []
    total = 0
    for para in doc.find_all("p")[:10]:
        chunk = para.get_text(separator=" ", strip=True)
        if len(chunk) > 30:
            collected.append(chunk)
            total += len(chunk)
            if total > 1500:
                break
    if collected:
        sections["description"] = " ".join(collected)[:1500]

    # Headed sections: gather text between a relevant heading and the next one.
    for heading in doc.find_all(["h2", "h3"]):
        title = heading.get_text(strip=True)
        if _classify_section(title) is None:
            continue
        body_parts: list[str] = []
        for node in heading.find_next_siblings():
            if node.name in {"h2", "h3", "h1"}:
                break
            chunk = node.get_text(separator=" ", strip=True)
            if chunk:
                body_parts.append(chunk)
        if body_parts:
            sections[title.lower()] = " ".join(body_parts)[:3000]

    return sections or None
def _extract_list_items(text: str) -> list[str]:
"""Extract bullet items or split prose into sentences if no bullet structure."""
lines = [l.strip() for l in text.splitlines() if l.strip()]

View File

@@ -1,7 +1,7 @@
from mcp.server.fastmcp import FastMCP
from .cache import delete_cached, get_cached, list_cached, save_cache
from .fetcher import fetch_character
from .fetcher import fetch_character, fetch_character_from_url
from .models import CharacterData
mcp = FastMCP("character-details")
@@ -123,6 +123,29 @@ async def remove_character(name: str, franchise: str) -> str:
return f"{name} ({franchise}) was not in the cache."
@mcp.tool()
async def get_character_from_url(
    url: str,
    name: str = "",
    franchise: str = "",
) -> str:
    """
    Fetch character details from a specific URL.
    If the URL points to a Fandom wiki page, structured section extraction
    is used automatically for richer results. Otherwise a generic HTML
    scrape is performed.
    Args:
        url: Full URL to the character page (e.g. "https://finalfantasy.fandom.com/wiki/Tifa_Lockhart")
        name: Character name (optional — inferred from the page title for Fandom URLs)
        franchise: Franchise or series (optional)
    """
    # Fetch (Fandom-aware), persist to the cache, then render for the model.
    fetched = await fetch_character_from_url(url, name, franchise)
    save_cache(fetched)
    return _format_character(fetched)
@mcp.tool()
async def generate_image_prompt(
name: str,