Initial commit

2025-07-07 13:39:46 +01:00
commit cfa2eff6ef
69 changed files with 70452 additions and 0 deletions
--- a/scripts/recipes_to_csv_v2.py
+++ b/scripts/recipes_to_csv_v2.py
@@ -0,0 +1,279 @@
+#!/usr/bin/env python3
+"""Generic recipe parser for crafting disciplines -> <Craft>_v2.csv
+
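+Usage:
+    python recipes_to_csv_v2.py <CRAFT>
+    python recipes_to_csv_v2.py --all
+
+Where <CRAFT> matches the name (without extension) of a .txt file in the
+datasets directory, for example `Woodworking`, `Smithing`, `Goldsmithing`,
+`Alchemy`, etc.  With `--all`, or when <CRAFT> is omitted, every .txt file in
+the datasets directory is processed.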
+
+The script produces a CSV `<Craft>_v2.csv` inside the datasets directory
+having the following columns (identical to the Woodworking v2 spec):
+
+    category, level, subcrafts, name, crystal, key_item, ingredients, hq_yields
+
+See scripts/README.md for details of each column.
+"""
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+import pathlib
+import re
+from typing import List, Optional, Tuple
+
+# Parent of the directory that contains this script (two levels up from the
+# resolved path of this file).
+PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
+# Directory the <Craft>.txt inputs are read from and the <Craft>_v2.csv
+# outputs are written to.
+DATASETS_DIR = PROJECT_ROOT / "datasets"
+
+# ---------------------------------------------------------------------------
+# Regex helpers (compiled at runtime where craft-dependent)
+# ---------------------------------------------------------------------------
+
+# Category header: a name made of letters, apostrophes and spaces followed by
+# a parenthesised numeric range, e.g. "Name (1-10)".  Group 1 is the name.
+RE_CATEGORY = re.compile(r"^([A-Za-z' ]+) \([0-9]+-[0-9]+\)$")
+# "Sub Craft(s): ..." line.  Group 1 is the raw text after the prefix.
+RE_SUBCRAFTS = re.compile(r"Sub Craft\(s\): (.+)")
+# "Key Item: ..." line.  Group 1 is the key item text.
+RE_KEY_ITEM = re.compile(r"Key Item: (.+)")
+# "NQ: ..." line.  Group 1 is the text after the prefix.
+# NOTE: parse() detects these lines with str.startswith("NQ:") rather than
+# with this pattern.
+RE_NQ = re.compile(r"NQ:\s*(.+)")
+# "HQ<digit>: ..." line.  Group 1 is the single tier digit, group 2 the text
+# after the prefix.
+RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
+# Quantity delimiter is strictly 'xN' (e.g., "Lumber x6").
+# Patterns like "+1" denote HQ variant and should be preserved in the name.
+# Group 1 is the item name (non-greedy), group 2 the quantity digits; the
+# "x" is matched case-insensitively and must end the string.
+RE_ITEM_QTY = re.compile(r"(.+?)\s*x(\d+)$", re.IGNORECASE)
+
+
+# ---------------------------------------------------------------------------
+# Helper functions / dataclasses
+# ---------------------------------------------------------------------------
+
+def norm(s: str) -> str:
+    """Trim *s* and collapse every internal run of whitespace to one space."""
+    # str.split() with no separator discards leading/trailing whitespace and
+    # splits on whitespace runs, so re-joining yields the normalised form.
+    words = s.split()
+    return " ".join(words)
+
+
+def split_item_qty(text: str) -> Tuple[str, int]:
+    """Separate a trailing "xN" quantity from an item name.
+
+    "Foo x3" → ("Foo", 3).  When no "xN" suffix is present the whole
+    whitespace-normalised text is the name and the quantity is 1.
+    """
+    cleaned = norm(text)
+    hit = RE_ITEM_QTY.match(cleaned)
+    if hit is None:
+        # No explicit quantity suffix.
+        return cleaned, 1
+    item_name, digits = hit.group(1), hit.group(2)
+    return item_name, int(digits)
+
+
+class Recipe:
+    """A single parsed recipe; its attributes are the eight CSV columns.
+
+    ``__slots__`` fixes the attribute set, and ``row()`` serialises the
+    attributes in the same order as the slots are declared.
+    """
+
+    __slots__ = (
+        "category", "level", "subcrafts", "name",
+        "crystal", "key_item", "ingredients", "hq_yields",
+    )
+
+    def __init__(self, category: str, level: int):
+        # Values supplied by the caller.
+        self.category = category
+        self.level = level
+
+        # Scalar fields, empty until the parser fills them in.
+        self.name: str = ""
+        self.crystal: str = ""
+        self.key_item: Optional[str] = None
+
+        # Lists of (name, number) pairs.
+        self.subcrafts: List[Tuple[str, int]] = []
+        self.ingredients: List[Tuple[str, int]] = []
+
+        # Three positional slots; None means nothing was recorded for it.
+        self.hq_yields: List[Optional[Tuple[str, int]]] = [None] * 3
+
+    def row(self) -> List[str]:
+        """Return the recipe as a list of CSV cell strings.
+
+        List-valued fields are JSON-encoded (non-ASCII characters are kept
+        as-is); a missing key item becomes the empty string.
+        """
+
+        def as_json(value) -> str:
+            return json.dumps(value, ensure_ascii=False)
+
+        key_item_cell = self.key_item if self.key_item else ""
+        cells = [self.category, str(self.level), as_json(self.subcrafts)]
+        cells += [self.name, self.crystal, key_item_cell]
+        cells += [as_json(self.ingredients), as_json(self.hq_yields)]
+        return cells
+
+
+# ---------------------------------------------------------------------------
+# Core parse routine
+# ---------------------------------------------------------------------------
+
+def parse(txt_path: pathlib.Path, craft_name: str) -> List[Recipe]:
+    """Parse a crafting text file into Recipe objects.
+
+    Strategy, as implemented below:
+
+    1. Lines are scanned top to bottom.  A category header (``RE_CATEGORY``)
+       sets the category given to every recipe created *after* it, whether
+       the header appears before the first recipe or between two recipes.
+    2. A line starting with "NQ:" opens a new recipe.  The text after the
+       prefix, minus any trailing "xN" quantity, becomes the recipe name.
+       "NQ:" is the only boundary between recipes.
+    3. Every line up to the next "NQ:" line (or end of file) belongs to that
+       recipe and is classified in this order: blank or "-Icon.gif" line
+       (skipped), category header, "Main Craft: <craft_name> - (<level>)",
+       "Sub Craft(s):", "Key Item:", "HQn:" yield, otherwise an ingredient.
+    4. The first ingredient whose name contains "Crystal" supplies the
+       crystal type (the first word of that name); every ingredient whose
+       name contains "Crystal" is then removed from the ingredient list.
+       With no such ingredient the crystal is "Unknown".
+
+    NOTE(review): a previous version of this docstring described the
+    "Main Craft:" metadata and ingredients as *preceding* the "NQ:" line.
+    The code attaches the lines *following* "NQ:" to that recipe -- confirm
+    this matches the dataset layout.
+
+    Args:
+        txt_path: Path of the text file to read.  It is decoded as UTF-8 and
+            undecodable bytes are dropped.
+        craft_name: Craft name expected in "Main Craft:" lines; it is matched
+            literally.
+
+    Returns:
+        The parsed recipes in file order.  A recipe whose block has no
+        matching "Main Craft:" line keeps level 0.
+
+    Raises:
+        Whatever ``Path.read_text`` raises, e.g. ``OSError`` when the file
+        cannot be read.
+    """
+
+    lines = txt_path.read_text(encoding="utf-8", errors="ignore").splitlines()
+    n = len(lines)
+    i = 0
+
+    current_category: str = ""
+    recipes: List[Recipe] = []
+
+    # Craft-specific regex (compiled once per run)
+    RE_MAIN = re.compile(rf"Main Craft: {re.escape(craft_name)} - \(([^)]*)\)")
+    # One "Name - (level)" entry of a "Sub Craft(s):" list.
+    re_subcraft_entry = re.compile(r"([A-Za-z]+) - \((\d+)\)")
+
+    while i < n:
+        line = lines[i].strip()
+
+        # 1) Category header seen before the first recipe
+        if (m_cat := RE_CATEGORY.match(line)):
+            current_category = m_cat.group(1)
+            i += 1
+            continue
+
+        # 2) Anything that is not the start of a recipe is ignored here
+        if not line.startswith("NQ:"):
+            i += 1
+            continue
+
+        # -------------------------------
+        # New recipe initialised
+        # -------------------------------
+        rec = Recipe(current_category, level=0)
+        rec.name, _ = split_item_qty(line[len("NQ:"):].strip())
+
+        # Collect block until next NQ or EOF
+        block_lines: List[str] = []
+        i += 1
+        while i < n and not lines[i].lstrip().startswith("NQ:"):
+            block_lines.append(lines[i])
+            i += 1
+
+        # ------------------------------------
+        # Parse metadata & ingredients in block
+        # ------------------------------------
+        for raw in block_lines:
+            look = raw.strip()
+
+            # Skip blank lines and icon decorator lines early
+            if not look or look.endswith("-Icon.gif"):
+                continue
+
+            # Category header inside a block.  Once the first "NQ:" has been
+            # seen, every later line lands in some recipe's block, so headers
+            # must be recognised here; otherwise they would be stored as
+            # ingredients and later recipes would keep a stale category.
+            # The header applies to recipes created from now on, not to the
+            # recipe currently being filled.
+            if (m_cat := RE_CATEGORY.match(look)):
+                current_category = m_cat.group(1)
+                continue
+
+            # Main Craft – level
+            if (m_main := RE_MAIN.search(look)):
+                # Handle ranges like "115~120" or "115-120" by taking the
+                # leading number; anything without one falls back to 0.
+                m_range = re.match(r"(\d+)", m_main.group(1))
+                rec.level = int(m_range.group(1)) if m_range else 0
+                continue
+
+            # Sub crafts
+            if (m_sc := RE_SUBCRAFTS.match(look)):
+                for part in m_sc.group(1).split(","):
+                    if (m_entry := re_subcraft_entry.match(part.strip())):
+                        rec.subcrafts.append(
+                            (m_entry.group(1), int(m_entry.group(2)))
+                        )
+                continue
+
+            # Key item
+            if (m_ki := RE_KEY_ITEM.match(look)):
+                rec.key_item = m_ki.group(1)
+                continue
+
+            # HQ lines (any line starting with "HQ" is consumed here, even
+            # when it does not match the "HQn:" pattern)
+            if look.startswith("HQ"):
+                if (m_hq := RE_HQ.match(look)):
+                    idx = int(m_hq.group(1)) - 1
+                    # Only tiers that have a slot are stored.  Unchecked,
+                    # "HQ0:" would write to index -1 (the last slot) and
+                    # "HQ4:" or higher would raise IndexError.
+                    if 0 <= idx < len(rec.hq_yields):
+                        rec.hq_yields[idx] = split_item_qty(m_hq.group(2))
+                continue
+
+            # Otherwise treat as ingredient
+            rec.ingredients.append(split_item_qty(look))
+
+        # Determine crystal & clean ingredient list
+        crystal_item = next(
+            (name for name, _ in rec.ingredients if "Crystal" in name),
+            None,
+        )
+        if crystal_item is None:
+            rec.crystal = "Unknown"
+        else:
+            rec.crystal = crystal_item.split()[0]
+            rec.ingredients = [
+                pair for pair in rec.ingredients if "Crystal" not in pair[0]
+            ]
+
+        recipes.append(rec)
+
+    return recipes
+
+
+# ---------------------------------------------------------------------------
+# Entrypoint
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+    """Command-line entry point: convert craft text files to CSV.
+
+    Invocations:
+        python recipes_to_csv_v2.py Woodworking   # one craft
+        python recipes_to_csv_v2.py --all         # every *.txt in datasets/
+        python recipes_to_csv_v2.py               # same as --all
+
+    A craft whose .txt file is missing is reported with a warning and
+    skipped.  The run aborts via SystemExit when all crafts were requested
+    but the datasets directory contains no .txt file.
+    """
+    cli = argparse.ArgumentParser(description="Parse <Craft>.txt into CSV.")
+    cli.add_argument("craft", nargs="?", help="Craft name, e.g. Woodworking, Smithing")
+    cli.add_argument("--all", action="store_true", help="Process every .txt file in datasets/")
+    args = cli.parse_args()
+
+    # Determine which crafts to process
+    single_craft_requested = bool(args.craft) and not args.all
+    if single_craft_requested:
+        crafts = [args.craft.strip()]
+    else:
+        crafts = [txt.stem for txt in DATASETS_DIR.glob("*.txt")]
+        if not crafts:
+            raise SystemExit(f"No .txt files found in {DATASETS_DIR}")
+
+    header = (
+        "category",
+        "level",
+        "subcrafts",
+        "name",
+        "crystal",
+        "key_item",
+        "ingredients",
+        "hq_yields",
+    )
+
+    for craft in crafts:
+        source = DATASETS_DIR / f"{craft}.txt"
+        if not source.exists():
+            print(f"[WARN] Dataset file not found: {source}")
+            continue
+
+        target = DATASETS_DIR / f"{craft}_v2.csv"
+        recipes = parse(source, craft)
+
+        target.parent.mkdir(parents=True, exist_ok=True)
+        with target.open("w", newline="", encoding="utf-8") as handle:
+            out = csv.writer(handle)
+            out.writerow(list(header))
+            out.writerows(recipe.row() for recipe in recipes)
+
+        # Report the output path relative to the project root.
+        shown = target.relative_to(PROJECT_ROOT)
+        print(f"Wrote {len(recipes)} recipes -> {shown}")
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()