Add get_character_from_url tool for direct URL-based fetching
Allows users to pass a specific URL to fetch character data from. Fandom URLs are auto-detected and processed with the structured MediaWiki API pipeline; non-Fandom URLs use a generic HTML scrape. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -115,6 +115,28 @@ Deletes a specific character from the cache.
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
### `get_character_from_url`

Fetches character data from a specific URL you provide. Fandom wiki URLs are detected automatically and processed with the same structured extraction as `get_character`, giving richer results. Non-Fandom URLs are scraped generically.

**Parameters:**

| Name | Type | Description |
|---|---|---|
| `url` | string | Full URL to the character page |
| `name` | string | Character name (optional — inferred from the page title for Fandom URLs) |
| `franchise` | string | Franchise name (optional) |

**Example prompt:**

> Use get_character_from_url to fetch data from https://finalfantasy.fandom.com/wiki/Tifa_Lockhart

**When to use instead of `get_character`:**

- The character's wiki page isn't found by the automatic search
- You want data from a specific page (e.g. a particular version or alternate wiki)
- The franchise isn't in the supported list and you have a direct Fandom URL
- You have a non-Fandom source you want to pull from

---
### `generate_image_prompt`
|
### `generate_image_prompt`
|
||||||
|
|
||||||
Builds a comma-separated tag list for image generation tools (Stable Diffusion, Midjourney, DALL-E, etc.).
|
Builds a comma-separated tag list for image generation tools (Stable Diffusion, Midjourney, DALL-E, etc.).
|
||||||
|
|||||||
@@ -415,6 +415,167 @@ def _build_character(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# URL-based fetching
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_FANDOM_URL_RE = re.compile(
|
||||||
|
r"^https?://(?P<wiki>[a-z0-9-]+)\.fandom\.com/wiki/(?P<title>.+?)(?:#.*)?$",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_character_from_url(url: str, name: str = "", franchise: str = "") -> CharacterData:
    """Fetch character data from a user-supplied URL.

    If the URL points to a Fandom wiki page, the MediaWiki API is used for
    structured section extraction (same as the franchise-based path).
    Otherwise, a plain HTML scrape is performed.

    Args:
        url: Full URL to the character page.
        name: Character name; if empty, inferred from the page title for
            Fandom URLs, else falls back to "Unknown".
        franchise: Franchise name; falls back to "Unknown" when omitted.
    """
    from urllib.parse import unquote  # stdlib; imported locally to keep this patch self-contained

    sections: dict[str, str] = {}
    sources: list[str] = [url]

    m = _FANDOM_URL_RE.match(url.strip())

    async with httpx.AsyncClient(timeout=15.0, follow_redirects=True, headers=HEADERS) as client:
        if m:
            wiki = m.group("wiki")
            # Browser-copied URLs percent-encode characters such as
            # apostrophes ("Cloud%27s_..."); decode before querying the
            # MediaWiki API, which expects the plain page title.
            page_title = unquote(m.group("title")).replace("_", " ")
            fandom = await _fetch_fandom_by_title(client, page_title, wiki)
            if fandom:
                sections.update(fandom["sections"])
                # Use the canonical URL returned by the API
                sources = [fandom["url"]]
                if not name:
                    name = page_title
        else:
            # Generic HTML scrape for non-Fandom URLs
            scraped = await _scrape_generic(client, url)
            if scraped:
                sections.update(scraped)

    if not name:
        name = "Unknown"
    if not franchise:
        franchise = "Unknown"

    return _build_character(name, franchise, sections, sources)
|
||||||
|
|
||||||
|
|
||||||
|
async def _fetch_fandom_by_title(
    client: httpx.AsyncClient, title: str, wiki: str
) -> dict | None:
    """Fetch a Fandom page by exact title (no search step needed).

    Returns a dict with "sections" (classified section texts keyed by
    lower-cased section title, plus "description" for the lead) and "url"
    (canonical page URL), or None when the page cannot be resolved.
    """
    base = f"https://{wiki}.fandom.com"
    api = f"{base}/api.php"

    # Resolve title to pageid.  "redirects" makes the API follow wiki
    # redirects, so a redirect stub resolves to the real article instead
    # of a pageid whose parse output is empty.
    try:
        resp = await client.get(
            api,
            params={"action": "query", "titles": title, "redirects": 1, "format": "json"},
        )
    except httpx.HTTPError:
        return None

    if resp.status_code != 200:
        return None

    pages = resp.json().get("query", {}).get("pages", {})
    page = next(iter(pages.values()), {})
    pageid = page.get("pageid")
    if not pageid:
        # Missing pages are returned with a negative key and no pageid.
        return None

    canonical_title = page.get("title", title)

    # Get section list
    try:
        resp = await client.get(
            api,
            params={"action": "parse", "pageid": pageid, "prop": "sections", "format": "json"},
        )
    except httpx.HTTPError:
        return None

    if resp.status_code != 200:
        return None

    all_sections = resp.json().get("parse", {}).get("sections", [])

    sections: dict[str, str] = {}
    # Section index 0 is the lead text before the first heading.
    lead_text = await _fetch_fandom_section(client, api, pageid, 0)
    if lead_text:
        sections["description"] = lead_text[:1500]

    for sec in all_sections:
        sec_title = _strip_html(sec.get("line", ""))
        # _classify_section acts as a relevance filter; unclassified
        # sections (trivia, gallery, ...) are skipped.
        if _classify_section(sec_title) is None:
            continue
        index = sec.get("index", "")
        if not index:
            continue
        text = await _fetch_fandom_section(client, api, pageid, index)
        if text:
            sections[sec_title.lower()] = text[:3000]

    return {
        "sections": sections,
        "url": f"{base}/wiki/{canonical_title.replace(' ', '_')}",
    }
|
||||||
|
|
||||||
|
|
||||||
|
async def _scrape_generic(client: httpx.AsyncClient, url: str) -> dict[str, str] | None:
    """Best-effort HTML scrape for non-Fandom pages.

    Pulls a lead "description" from the first substantial paragraphs and
    collects any <h2>/<h3> sections whose heading classifies as relevant.
    Returns None when nothing usable was extracted or the fetch failed.
    """
    try:
        resp = await client.get(url)
    except httpx.HTTPError:
        return None
    if resp.status_code != 200:
        return None

    soup = BeautifulSoup(resp.text, "html.parser")

    # Drop page chrome that would pollute the extracted text.
    for junk in soup.select("nav, header, footer, script, style, aside, .sidebar, .navbox"):
        junk.decompose()

    result: dict[str, str] = {}

    # Lead description: first few substantial paragraphs, capped ~1500 chars.
    collected: list[str] = []
    total_len = 0
    for para in soup.find_all("p")[:10]:
        chunk = para.get_text(separator=" ", strip=True)
        if len(chunk) > 30:  # skip tiny boilerplate paragraphs
            collected.append(chunk)
            total_len += len(chunk)
            if total_len > 1500:
                break
    if collected:
        result["description"] = " ".join(collected)[:1500]

    # Headed sections: gather text between a relevant heading and the next one.
    for heading in soup.find_all(["h2", "h3"]):
        heading_text = heading.get_text(strip=True)
        if _classify_section(heading_text) is None:
            continue
        body_parts: list[str] = []
        for sib in heading.find_next_siblings():
            if sib.name in {"h2", "h3", "h1"}:
                break
            sib_text = sib.get_text(separator=" ", strip=True)
            if sib_text:
                body_parts.append(sib_text)
        if body_parts:
            result[heading_text.lower()] = " ".join(body_parts)[:3000]

    return result if result else None
|
||||||
|
|
||||||
|
|
||||||
def _extract_list_items(text: str) -> list[str]:
|
def _extract_list_items(text: str) -> list[str]:
|
||||||
"""Extract bullet items or split prose into sentences if no bullet structure."""
|
"""Extract bullet items or split prose into sentences if no bullet structure."""
|
||||||
lines = [l.strip() for l in text.splitlines() if l.strip()]
|
lines = [l.strip() for l in text.splitlines() if l.strip()]
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
from mcp.server.fastmcp import FastMCP
|
from mcp.server.fastmcp import FastMCP
|
||||||
|
|
||||||
from .cache import delete_cached, get_cached, list_cached, save_cache
|
from .cache import delete_cached, get_cached, list_cached, save_cache
|
||||||
from .fetcher import fetch_character
|
from .fetcher import fetch_character, fetch_character_from_url
|
||||||
from .models import CharacterData
|
from .models import CharacterData
|
||||||
|
|
||||||
mcp = FastMCP("character-details")
|
mcp = FastMCP("character-details")
|
||||||
@@ -123,6 +123,29 @@ async def remove_character(name: str, franchise: str) -> str:
|
|||||||
return f"{name} ({franchise}) was not in the cache."
|
return f"{name} ({franchise}) was not in the cache."
|
||||||
|
|
||||||
|
|
||||||
|
@mcp.tool()
async def get_character_from_url(
    url: str,
    name: str = "",
    franchise: str = "",
) -> str:
    """
    Fetch character details from a specific URL.

    If the URL points to a Fandom wiki page, structured section extraction
    is used automatically for richer results. Otherwise a generic HTML
    scrape is performed.

    Args:
        url: Full URL to the character page (e.g. "https://finalfantasy.fandom.com/wiki/Tifa_Lockhart")
        name: Character name (optional — inferred from the page title for Fandom URLs)
        franchise: Franchise or series (optional)
    """
    # Fetch, persist to the cache, then render for the client.
    fetched = await fetch_character_from_url(url, name, franchise)
    save_cache(fetched)
    return _format_character(fetched)
|
||||||
|
|
||||||
|
|
||||||
@mcp.tool()
|
@mcp.tool()
|
||||||
async def generate_image_prompt(
|
async def generate_image_prompt(
|
||||||
name: str,
|
name: str,
|
||||||
|
|||||||
Reference in New Issue
Block a user