#!/usr/bin/env python3
"""Improved parser for Woodworking recipes -> Woodworking_v2.csv

This version follows the spec provided by the user:

    Category: category name (without level range)
    Level: recipe level (integer)
    Sub-Crafts: JSON list of [name, level]
    Name: product NQ name
    Crystal: crystal element (Earth, Wind, etc.)
    Key Item: key item name or null
    Ingredients: JSON list of [name, quantity]
    HQ Yields: JSON list ordered HQ1..HQ3 of [name, quantity] (nulls if N/A)
"""
from __future__ import annotations

import csv
import json
import pathlib
import re
from typing import Dict, List, Optional, Tuple

PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
TXT_PATH = PROJECT_ROOT / "datasets/Woodworking.txt"
CSV_PATH = PROJECT_ROOT / "datasets/Woodworking_v2.csv"

RE_CATEGORY = re.compile(r"^([A-Za-z' ]+) \([0-9]+-[0-9]+\)$")
RE_MAIN = re.compile(r"Main Craft: Woodworking - \((\d+)\)")
RE_SUBCRAFTS = re.compile(r"Sub Craft\(s\): (.+)")
RE_SUBCRAFT_PART = re.compile(r"([A-Za-z]+) - \((\d+)\)")
RE_KEY_ITEM = re.compile(r"Key Item: (.+)")
RE_NQ = re.compile(r"NQ:\s*(.+)")
RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
RE_ITEM_QTY = re.compile(r"(.+?)\s*(?:x|\+)(\d+)$")


def norm(s: str) -> str:
    """Return *s* stripped, with every whitespace run collapsed to one space."""
    return re.sub(r"\s+", " ", s.strip())


def split_item_qty(text: str) -> Tuple[str, int]:
    """Split an item line into ``(name, quantity)``.

    The text is normalised with :func:`norm` first. A trailing ``x<digits>``
    or ``+<digits>`` suffix is read as the quantity; without such a suffix the
    whole text is the name and the quantity is 1.
    """
    text = norm(text)
    m = RE_ITEM_QTY.match(text)
    if m:
        return m.group(1), int(m.group(2))
    return text, 1


class Recipe:
    """One parsed recipe; :meth:`row` renders it as a CSV row."""

    __slots__ = (
        "category",
        "level",
        "subcrafts",
        "name",
        "crystal",
        "key_item",
        "ingredients",
        "hq_yields",
    )

    def __init__(self, category: str, level: int):
        self.category = category
        self.level = level
        self.subcrafts: List[Tuple[str, int]] = []
        self.name: str = ""
        self.crystal: str = ""
        self.key_item: Optional[str] = None
        self.ingredients: List[Tuple[str, int]] = []
        # index 0,1,2 for HQ1..3; (name, qty) or None
        self.hq_yields: List[Optional[Tuple[str, int]]] = [None, None, None]

    def row(self) -> List[str]:
        """Return the recipe as a list of strings in CSV column order.

        List-valued fields are JSON-encoded (tuples become JSON arrays, a
        missing HQ tier becomes ``null``); a missing key item is written as
        an empty string.
        """
        return [
            self.category,
            str(self.level),
            json.dumps(self.subcrafts, ensure_ascii=False),
            self.name,
            self.crystal,
            self.key_item or "",
            json.dumps(self.ingredients, ensure_ascii=False),
            json.dumps(self.hq_yields, ensure_ascii=False),
        ]


# ---------------------------------------------------------------------------


def _parse_subcrafts(text: str) -> List[Tuple[str, int]]:
    """Parse the comma-separated payload of a ``Sub Craft(s):`` line."""
    subcrafts: List[Tuple[str, int]] = []
    for part in text.split(","):
        m_sc = RE_SUBCRAFT_PART.match(part.strip())
        if m_sc:
            subcrafts.append((m_sc.group(1), int(m_sc.group(2))))
    return subcrafts


def _extract_crystal(
    ingredients: List[Tuple[str, int]],
) -> Tuple[str, List[Tuple[str, int]]]:
    """Return ``(crystal, remaining_ingredients)``.

    The crystal is the first word of the first ingredient whose name contains
    ``"Crystal"``; every ingredient containing ``"Crystal"`` is then dropped
    from the list. Without such an ingredient the crystal is ``"Unknown"`` and
    the list is returned unchanged.
    """
    for name, _qty in ingredients:
        if "Crystal" in name:
            crystal = name.split()[0]  # Earth Crystal -> Earth
            rest = [pair for pair in ingredients if "Crystal" not in pair[0]]
            return crystal, rest
    return "Unknown", list(ingredients)


def _starts_new_section(line: str) -> bool:
    """Return True if *line* is a ``Main Craft`` line or a category header."""
    return bool(RE_MAIN.search(line) or RE_CATEGORY.match(line))


def _parse_recipe(
    lines: List[str], i: int, category: str, level: int
) -> Tuple[Recipe, int]:
    """Parse one recipe body starting at ``lines[i]``.

    ``i`` is the index of the line following the ``Main Craft`` line. Returns
    the recipe and the index of the first line that was not consumed.
    """
    n = len(lines)
    rec = Recipe(category, level)

    # Optional Sub Craft(s) / Key Item lines (blank lines are skipped); the
    # first line of any other kind ends the metadata section.
    while i < n:
        look = lines[i].strip()
        if not look:
            pass
        elif m_sub := RE_SUBCRAFTS.match(look):
            rec.subcrafts.extend(_parse_subcrafts(m_sub.group(1)))
        elif m_key := RE_KEY_ITEM.match(look):
            rec.key_item = m_key.group(1)
        else:
            break
        i += 1

    # Ingredients run until the NQ line. Also stop at the next recipe or
    # category header so that a recipe lacking its NQ line cannot swallow
    # the rest of the file as ingredients.
    while i < n:
        ingr_line = lines[i].strip()
        if ingr_line.startswith("NQ:") or _starts_new_section(ingr_line):
            break
        if ingr_line and not ingr_line.endswith("-Icon.gif"):
            rec.ingredients.append(split_item_qty(ingr_line))
        i += 1

    rec.crystal, rec.ingredients = _extract_crystal(rec.ingredients)

    # NQ line: consumed even when it carries no product text. A section
    # header is left in place for the caller to process.
    if i < n and lines[i].lstrip().startswith("NQ:"):
        m_nq = RE_NQ.match(lines[i].strip())
        if m_nq:
            rec.name, _ = split_item_qty(m_nq.group(1))
        i += 1

    # Skip blank lines before HQ entries
    while i < n and not lines[i].strip():
        i += 1

    # HQ lines
    while i < n and lines[i].lstrip().startswith("HQ"):
        m_hq = RE_HQ.match(lines[i].strip())
        if m_hq:
            tier = int(m_hq.group(1))
            # Ignore tiers outside HQ1..HQ3: index -1 would overwrite the
            # HQ3 slot and larger indices would raise IndexError.
            if 1 <= tier <= len(rec.hq_yields):
                rec.hq_yields[tier - 1] = split_item_qty(m_hq.group(2))
        i += 1

    return rec, i


def parse(path: Optional[pathlib.Path] = None) -> List[Recipe]:
    """Parse the recipe text file into a list of :class:`Recipe` objects.

    Args:
        path: File to read; defaults to ``TXT_PATH``. It is read as UTF-8
            with undecodable bytes ignored.

    Returns:
        Recipes in file order. Each carries the most recent category header
        seen before it (``""`` if none). A recipe without an ``NQ:`` line is
        still returned, with an empty name.
    """
    source = TXT_PATH if path is None else pathlib.Path(path)
    lines = source.read_text(encoding="utf-8", errors="ignore").splitlines()
    n = len(lines)
    i = 0
    current_category = ""
    recipes: List[Recipe] = []

    while i < n:
        line = lines[i].strip()

        # Category header
        if m_cat := RE_CATEGORY.match(line):
            current_category = m_cat.group(1)
            i += 1
            continue

        # Main Craft start
        if m_main := RE_MAIN.search(line):
            rec, i = _parse_recipe(
                lines, i + 1, current_category, int(m_main.group(1))
            )
            recipes.append(rec)
            continue

        i += 1

    return recipes


# ---------------------------------------------------------------------------


def main() -> None:
    """Parse ``TXT_PATH`` and write all recipes to ``CSV_PATH``."""
    recs = parse()
    CSV_PATH.parent.mkdir(parents=True, exist_ok=True)
    with CSV_PATH.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow([
            "category",
            "level",
            "subcrafts",
            "name",
            "crystal",
            "key_item",
            "ingredients",
            "hq_yields",
        ])
        writer.writerows(r.row() for r in recs)
    print(f"Wrote {len(recs)} recipes -> {CSV_PATH.relative_to(PROJECT_ROOT)}")


if __name__ == "__main__":
    main()