#!/usr/bin/env python3
"""Generic recipe parser for crafting disciplines -> CRAFT_v2.csv

Usage:
    python recipes_to_csv_v2.py CRAFT

Where CRAFT matches the name of the CRAFT.txt file in the datasets
directory, for example `Woodworking`, `Smithing`, `Goldsmithing`,
`Alchemy`, etc.

The script produces a CSV `CRAFT_v2.csv` inside the datasets directory
having the following columns (identical to the Woodworking v2 spec):

    category, level, subcrafts, name, crystal, key_item, ingredients, hq_yields

See scripts/README.md for details of each column.
"""
from __future__ import annotations

import argparse
import csv
import json
import pathlib
import re
from typing import List, Optional, Tuple

PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
DATASETS_DIR = PROJECT_ROOT / "datasets"

# Column order of the generated CSV; also the attribute names of ``Recipe``.
CSV_COLUMNS = (
    "category",
    "level",
    "subcrafts",
    "name",
    "crystal",
    "key_item",
    "ingredients",
    "hq_yields",
)

# ---------------------------------------------------------------------------
# Regex helpers (compiled at runtime where craft-dependent)
# ---------------------------------------------------------------------------
# Category header, e.g. "Amateur (1-10)". The parentheses are literal.
RE_CATEGORY = re.compile(r"^([A-Za-z' ]+) \([0-9]+-[0-9]+\)$")
# e.g. "Sub Craft(s): Smithing - (5), Goldsmithing - (3)"
RE_SUBCRAFTS = re.compile(r"Sub Craft\(s\): (.+)")
# One comma-separated entry of the sub-craft list, e.g. "Smithing - (5)".
RE_SUBCRAFT_PART = re.compile(r"([A-Za-z]+) - \((\d+)\)")
RE_KEY_ITEM = re.compile(r"Key Item: (.+)")
RE_NQ = re.compile(r"NQ:\s*(.+)")
RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
# Quantity delimiter is strictly 'xN' (e.g., "Lumber x6").
# Patterns like "+1" denote HQ variant and should be preserved in the name.
RE_ITEM_QTY = re.compile(r"(.+?)\s*x(\d+)$", re.IGNORECASE)


# ---------------------------------------------------------------------------
# Helper functions / dataclasses
# ---------------------------------------------------------------------------

def norm(s: str) -> str:
    """Return *s* stripped, with every whitespace run collapsed to one space."""
    return re.sub(r"\s+", " ", s.strip())


def split_item_qty(text: str) -> Tuple[str, int]:
    """Split "Foo x3" into ("Foo", 3).

    The text is whitespace-normalised first. When there is no trailing
    ``xN`` suffix the whole text is the name and the quantity defaults to 1.
    """
    text = norm(text)
    m = RE_ITEM_QTY.match(text)
    if m:
        return m.group(1), int(m.group(2))
    return text, 1


class Recipe:
    """One parsed recipe; ``row()`` renders it as a CSV row."""

    __slots__ = CSV_COLUMNS

    def __init__(self, category: str, level: int):
        self.category = category
        self.level = level
        # (craft name, level) pairs
        self.subcrafts: List[Tuple[str, int]] = []
        self.name: str = ""
        self.crystal: str = ""
        self.key_item: Optional[str] = None
        # (item name, quantity) pairs
        self.ingredients: List[Tuple[str, int]] = []
        # One slot per HQ tier (HQ1..HQ3); None when the tier is absent.
        self.hq_yields: List[Optional[Tuple[str, int]]] = [None, None, None]

    def row(self) -> List[str]:
        """Return the recipe as a list of strings in ``CSV_COLUMNS`` order.

        List-valued fields are JSON-encoded; a missing key item becomes "".
        """
        return [
            self.category,
            str(self.level),
            json.dumps(self.subcrafts, ensure_ascii=False),
            self.name,
            self.crystal,
            self.key_item or "",
            json.dumps(self.ingredients, ensure_ascii=False),
            json.dumps(self.hq_yields, ensure_ascii=False),
        ]


# ---------------------------------------------------------------------------
# Core parse routine
# ---------------------------------------------------------------------------

def parse(txt_path: pathlib.Path, craft_name: str) -> List[Recipe]:
    """Parse a crafting text file into Recipe objects.

    Parsing strategy, as implemented:

    1. Lines are scanned until an "NQ:" line is found. Category headers
       (e.g. "Amateur (1-10)") seen on the way set the category for the
       recipes that follow.
    2. The "NQ:" line opens a recipe and supplies the product name.
    3. Every following line up to the next "NQ:" line (or end of file)
       belongs to that recipe and is classified as one of:
       icon decorator (ignored), category header (applies to *later*
       recipes), "Main Craft:" (level), "Sub Craft(s):", "Key Item:",
       "HQn:" yield, or - failing all of those - an ingredient.
    4. The first ingredient containing the word "Crystal" determines the
       crystal type; all crystal entries are then removed from the
       ingredient list. Without one, the crystal is "Unknown".

    "NQ:" is therefore the only boundary between recipes, which lets the
    parser cope with several "Main Craft:" lines in one block (the last
    one wins).

    Args:
        txt_path: Path of the text file to read (decoded as UTF-8;
            undecodable bytes are dropped).
        craft_name: Craft name expected after "Main Craft: ".

    Returns:
        The recipes in file order.
    """
    lines = txt_path.read_text(encoding="utf-8", errors="ignore").splitlines()
    n = len(lines)
    i = 0
    current_category: str = ""
    recipes: List[Recipe] = []

    # Craft-specific regex (compiled once per run); captures the text
    # between the literal parentheses, e.g. "12" or "115~120".
    re_main = re.compile(
        rf"Main Craft: {re.escape(craft_name)} - \(([^)]*)\)"
    )

    while i < n:
        line = lines[i].strip()

        # 1) Category header
        if (m_cat := RE_CATEGORY.match(line)):
            current_category = m_cat.group(1)
            i += 1
            continue

        # 2) Start of a recipe - line beginning with "NQ:"
        if not line.startswith("NQ:"):
            i += 1
            continue

        rec = Recipe(current_category, level=0)
        rec.name, _ = split_item_qty(line[len("NQ:"):])
        i += 1

        # 3) Consume the block up to the next "NQ:" line or EOF
        while i < n and not lines[i].lstrip().startswith("NQ:"):
            look = lines[i].strip()
            i += 1

            if not look:
                continue

            # Skip icon decorator lines early
            if look.endswith("-Icon.gif"):
                continue

            # A category header inside a block is not an ingredient; it
            # labels the recipes that start after it.
            if (m_cat := RE_CATEGORY.match(look)):
                current_category = m_cat.group(1)
                continue

            # Main Craft - level. Ranges like "115~120" or "115-120" are
            # reduced to their lower bound by taking the leading digits.
            if (m_main := re_main.search(look)):
                m_level = re.match(r"(\d+)", m_main.group(1))
                rec.level = int(m_level.group(1)) if m_level else 0
                continue

            # Sub crafts
            if (m_sc := RE_SUBCRAFTS.match(look)):
                for part in m_sc.group(1).split(","):
                    if (m_part := RE_SUBCRAFT_PART.match(part.strip())):
                        rec.subcrafts.append(
                            (m_part.group(1), int(m_part.group(2)))
                        )
                continue

            # Key item
            if (m_ki := RE_KEY_ITEM.match(look)):
                rec.key_item = m_ki.group(1)
                continue

            # HQ lines; malformed ones and tiers outside HQ1..HQ3 are
            # ignored rather than indexing out of range.
            if look.startswith("HQ"):
                if (m_hq := RE_HQ.match(look)):
                    idx = int(m_hq.group(1)) - 1
                    if 0 <= idx < len(rec.hq_yields):
                        rec.hq_yields[idx] = split_item_qty(m_hq.group(2))
                continue

            # Otherwise treat as ingredient
            rec.ingredients.append(split_item_qty(look))

        # 4) Determine crystal & clean ingredient list
        crystal_names = [
            name for name, _ in rec.ingredients if "Crystal" in name
        ]
        if crystal_names:
            # "Wind Crystal" -> "Wind"
            rec.crystal = crystal_names[0].split()[0]
            rec.ingredients = [
                pair for pair in rec.ingredients if "Crystal" not in pair[0]
            ]
        else:
            rec.crystal = "Unknown"

        recipes.append(rec)

    return recipes


# ---------------------------------------------------------------------------
# Entrypoint
# ---------------------------------------------------------------------------

def main() -> None:
    """CLI entrypoint.

    Usage examples:

        # Process a single craft
        python recipes_to_csv_v2.py Woodworking

        # Process all *.txt files in the datasets directory
        python recipes_to_csv_v2.py --all

        # Omit positional arg - defaults to --all
        python recipes_to_csv_v2.py
    """
    argp = argparse.ArgumentParser(description="Parse .txt into CSV.")
    argp.add_argument(
        "craft", nargs="?", help="Craft name, e.g. Woodworking, Smithing"
    )
    argp.add_argument(
        "--all",
        action="store_true",
        help="Process every .txt file in datasets/",
    )
    args = argp.parse_args()

    # Determine which crafts to process
    if args.all or not args.craft:
        # Sorted so that runs process (and report) files in a stable order.
        crafts = sorted(p.stem for p in DATASETS_DIR.glob("*.txt"))
        if not crafts:
            raise SystemExit(f"No .txt files found in {DATASETS_DIR}")
    else:
        crafts = [args.craft.strip()]

    for craft in crafts:
        txt_path = DATASETS_DIR / f"{craft}.txt"
        if not txt_path.exists():
            print(f"[WARN] Dataset file not found: {txt_path}")
            continue

        csv_path = DATASETS_DIR / f"{craft}_v2.csv"
        recipes = parse(txt_path, craft)

        csv_path.parent.mkdir(parents=True, exist_ok=True)
        with csv_path.open("w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(CSV_COLUMNS)
            writer.writerows(r.row() for r in recipes)

        rel = csv_path.relative_to(PROJECT_ROOT)
        print(f"Wrote {len(recipes)} recipes -> {rel}")


if __name__ == "__main__":
    main()