From 8e4a160c000f88ec63ee5346f0ab73b57ae39127 Mon Sep 17 00:00:00 2001 From: Aodhan Collins Date: Fri, 20 Mar 2026 01:28:38 +0000 Subject: [PATCH] Add get_character_from_url tool for direct URL-based fetching Allows users to pass a specific URL to fetch character data from. Fandom URLs are auto-detected and processed with the structured MediaWiki API pipeline; non-Fandom URLs use a generic HTML scrape. Co-Authored-By: Claude Opus 4.6 --- USER_GUIDE.md | 22 +++++ src/character_details/fetcher.py | 161 +++++++++++++++++++++++++++++++ src/character_details/server.py | 25 ++++- 3 files changed, 207 insertions(+), 1 deletion(-) diff --git a/USER_GUIDE.md b/USER_GUIDE.md index 3e5b10b..36a8998 100644 --- a/USER_GUIDE.md +++ b/USER_GUIDE.md @@ -115,6 +115,28 @@ Deletes a specific character from the cache. --- +### `get_character_from_url` + +Fetches character data from a specific URL you provide. Fandom wiki URLs are detected automatically and processed with the same structured extraction as `get_character`, giving richer results. Non-Fandom URLs are scraped generically. + +**Parameters:** +| Name | Type | Description | +|---|---|---| +| `url` | string | Full URL to the character page | +| `name` | string | Character name (optional — inferred from the page title for Fandom URLs) | +| `franchise` | string | Franchise name (optional) | + +**Example prompt:** +> Use get_character_from_url to fetch data from https://finalfantasy.fandom.com/wiki/Tifa_Lockhart + +**When to use instead of `get_character`:** +- The character's wiki page isn't found by the automatic search +- You want data from a specific page (e.g. a particular version or alternate wiki) +- The franchise isn't in the supported list and you have a direct Fandom URL +- You have a non-Fandom source you want to pull from + +--- + ### `generate_image_prompt` Builds a comma-separated tag list for image generation tools (Stable Diffusion, Midjourney, DALL-E, etc.). 
diff --git a/src/character_details/fetcher.py b/src/character_details/fetcher.py
index f4858e8..f6d345b 100644
--- a/src/character_details/fetcher.py
+++ b/src/character_details/fetcher.py
@@ -415,6 +415,167 @@ def _build_character(
     )
+
+# ---------------------------------------------------------------------------
+# URL-based fetching
+# ---------------------------------------------------------------------------
+
+_FANDOM_URL_RE = re.compile(
+    r"^https?://(?P<wiki>[a-z0-9-]+)\.fandom\.com/wiki/(?P<title>.+?)(?:#.*)?$",
+    re.IGNORECASE,
+)
+
+
+async def fetch_character_from_url(url: str, name: str = "", franchise: str = "") -> CharacterData:
+    """Fetch character data from a user-supplied URL.
+
+    If the URL points to a Fandom wiki page, the MediaWiki API is used for
+    structured section extraction (same as the franchise-based path).
+    Otherwise, a plain HTML scrape is performed.
+    """
+    sections: dict[str, str] = {}
+    sources: list[str] = [url]
+
+    m = _FANDOM_URL_RE.match(url.strip())
+
+    async with httpx.AsyncClient(timeout=15.0, follow_redirects=True, headers=HEADERS) as client:
+        if m:
+            wiki = m.group("wiki")
+            page_title = m.group("title").replace("_", " ")
+            fandom = await _fetch_fandom_by_title(client, page_title, wiki)
+            if fandom:
+                sections.update(fandom["sections"])
+                # Use the canonical URL returned by the API
+                sources = [fandom["url"]]
+                if not name:
+                    name = page_title
+        else:
+            # Generic HTML scrape for non-Fandom URLs
+            scraped = await _scrape_generic(client, url)
+            if scraped:
+                sections.update(scraped)
+
+    if not name:
+        name = "Unknown"
+    if not franchise:
+        franchise = "Unknown"
+
+    return _build_character(name, franchise, sections, sources)
+
+
+async def _fetch_fandom_by_title(
+    client: httpx.AsyncClient, title: str, wiki: str
+) -> dict | None:
+    """Fetch a Fandom page by exact title (no search step needed)."""
+    base = f"https://{wiki}.fandom.com"
+    api = f"{base}/api.php"
+
+    # Resolve title to pageid
+    try:
+        resp = await client.get(
+            api,
+            
params={"action": "query", "titles": title, "format": "json"}, + ) + except httpx.HTTPError: + return None + + if resp.status_code != 200: + return None + + pages = resp.json().get("query", {}).get("pages", {}) + page = next(iter(pages.values()), {}) + pageid = page.get("pageid") + if not pageid: + return None + + canonical_title = page.get("title", title) + + # Get section list + try: + resp = await client.get( + api, + params={"action": "parse", "pageid": pageid, "prop": "sections", "format": "json"}, + ) + except httpx.HTTPError: + return None + + if resp.status_code != 200: + return None + + all_sections = resp.json().get("parse", {}).get("sections", []) + + sections: dict[str, str] = {} + lead_text = await _fetch_fandom_section(client, api, pageid, 0) + if lead_text: + sections["description"] = lead_text[:1500] + + for sec in all_sections: + sec_title = _strip_html(sec.get("line", "")) + field = _classify_section(sec_title) + if field is None: + continue + index = sec.get("index", "") + if not index: + continue + text = await _fetch_fandom_section(client, api, pageid, index) + if text: + sections[sec_title.lower()] = text[:3000] + + return { + "sections": sections, + "url": f"{base}/wiki/{canonical_title.replace(' ', '_')}", + } + + +async def _scrape_generic(client: httpx.AsyncClient, url: str) -> dict[str, str] | None: + """Best-effort HTML scrape for non-Fandom pages.""" + try: + resp = await client.get(url) + except httpx.HTTPError: + return None + + if resp.status_code != 200: + return None + + soup = BeautifulSoup(resp.text, "html.parser") + + # Remove navigation, headers, footers, scripts, styles + for tag in soup.select("nav, header, footer, script, style, aside, .sidebar, .navbox"): + tag.decompose() + + sections: dict[str, str] = {} + + # Try to grab the first few paragraphs as a description + paragraphs = soup.find_all("p") + desc_parts = [] + for p in paragraphs[:10]: + text = p.get_text(separator=" ", strip=True) + if len(text) > 30: + 
desc_parts.append(text) + if sum(len(t) for t in desc_parts) > 1500: + break + if desc_parts: + sections["description"] = " ".join(desc_parts)[:1500] + + # Try to extract headed sections + for heading in soup.find_all(["h2", "h3"]): + title = heading.get_text(strip=True) + field = _classify_section(title) + if field is None: + continue + # Collect text until the next heading + content_parts = [] + for sibling in heading.find_next_siblings(): + if sibling.name in {"h2", "h3", "h1"}: + break + text = sibling.get_text(separator=" ", strip=True) + if text: + content_parts.append(text) + if content_parts: + sections[title.lower()] = " ".join(content_parts)[:3000] + + return sections if sections else None + + def _extract_list_items(text: str) -> list[str]: """Extract bullet items or split prose into sentences if no bullet structure.""" lines = [l.strip() for l in text.splitlines() if l.strip()] diff --git a/src/character_details/server.py b/src/character_details/server.py index d7ef7fb..05bf595 100644 --- a/src/character_details/server.py +++ b/src/character_details/server.py @@ -1,7 +1,7 @@ from mcp.server.fastmcp import FastMCP from .cache import delete_cached, get_cached, list_cached, save_cache -from .fetcher import fetch_character +from .fetcher import fetch_character, fetch_character_from_url from .models import CharacterData mcp = FastMCP("character-details") @@ -123,6 +123,29 @@ async def remove_character(name: str, franchise: str) -> str: return f"{name} ({franchise}) was not in the cache." +@mcp.tool() +async def get_character_from_url( + url: str, + name: str = "", + franchise: str = "", +) -> str: + """ + Fetch character details from a specific URL. + + If the URL points to a Fandom wiki page, structured section extraction + is used automatically for richer results. Otherwise a generic HTML + scrape is performed. + + Args: + url: Full URL to the character page (e.g. 
"https://finalfantasy.fandom.com/wiki/Tifa_Lockhart") + name: Character name (optional — inferred from the page title for Fandom URLs) + franchise: Franchise or series (optional) + """ + character = await fetch_character_from_url(url, name, franchise) + save_cache(character) + return _format_character(character) + + @mcp.tool() async def generate_image_prompt( name: str,