#!/usr/bin/env python3
"""Parse the semi-structured datasets/Woodworking.txt file and export it as a CSV.

The output CSV will be written to datasets/Woodworking.csv with the following
columns:

    category      – Text group header e.g. Amateur (1-10)
    level         – Woodworking skill level for the recipe (integer)
    product_name  – Produced item (without quantity suffix)
    nq_yield      – Quantity produced on a normal quality synth (may be empty)
    hq1_yield     – Quantity produced on HQ1 synth (may be empty)
    hq2_yield     – Quantity produced on HQ2 synth (may be empty)
    hq3_yield     – Quantity produced on HQ3 synth (may be empty)
    crystal       – Crystal used for the synth (Wind Crystal, Earth Crystal, etc.)
    ingredients   – Semi-colon-separated list of remaining ingredients
                    (excluding the crystal)

Recipe blocks that have no usable ``NQ:`` line are skipped and reported with a
logging warning; they never absorb lines belonging to the following block.

Run the script from the project root:

    python3 scripts/woodworking_to_csv.py
"""

from __future__ import annotations

import csv
import logging
import pathlib
import re
from typing import List, Optional

logger = logging.getLogger(__name__)

PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
TXT_PATH = PROJECT_ROOT / "datasets/Woodworking.txt"
CSV_PATH = PROJECT_ROOT / "datasets/Woodworking.csv"

# Column order of the exported CSV; also the attribute names of ``Recipe``.
CSV_COLUMNS = (
    "category",
    "level",
    "product_name",
    "nq_yield",
    "hq1_yield",
    "hq2_yield",
    "hq3_yield",
    "crystal",
    "ingredients",
)

# --- Regex patterns ---------------------------------------------------------
RE_CATEGORY = re.compile(r"^[A-Za-z' ]+ \([0-9]+-[0-9]+\)$")
RE_MAIN_CRAFT = re.compile(r"Main Craft: Woodworking - \((\d+)\)")
RE_NQ = re.compile(r"NQ:\s*(.+)")
RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
RE_ITEM_QTY = re.compile(r"(.+?)\s+x(\d+)$")


def normalise_whitespace(text: str) -> str:
    """Collapse internal runs of whitespace and trim."""
    return re.sub(r"\s+", " ", text.strip())


class Recipe:
    """One parsed recipe; every yield is kept as a string so it may be empty."""

    # The slots are exactly the CSV columns, in the same order.
    __slots__ = CSV_COLUMNS

    def __init__(self, category: str, level: int):
        self.category: str = category
        self.level: int = level
        self.product_name: str = ""
        self.nq_yield: str = ""
        self.hq1_yield: str = ""
        self.hq2_yield: str = ""
        self.hq3_yield: str = ""
        self.crystal: str = ""
        self.ingredients: List[str] = []

    # ------------------------------------------------------------------
    # Helper methods
    # ------------------------------------------------------------------
    def _set_product(self, text: str, nq: bool = False) -> None:
        """Store the product name from *text*; with ``nq=True`` also the NQ yield."""
        name, qty = self._split_item_qty(text)
        self.product_name = name
        if nq:
            self.nq_yield = qty

    def _add_hq(self, idx: int, text: str) -> None:
        """Record the quantity found in *text* as the yield of HQ tier *idx*.

        Tiers other than 1, 2 and 3 are ignored.
        """
        _name, qty = self._split_item_qty(text)
        if idx in (1, 2, 3):
            setattr(self, f"hq{idx}_yield", qty)

    @staticmethod
    def _split_item_qty(text: str) -> tuple[str, str]:
        """Split ``"Item xN"`` into ``("Item", "N")``.

        Text without a trailing ``xN`` suffix is returned whitespace-normalised
        with an empty quantity.
        """
        text = normalise_whitespace(text)
        m = RE_ITEM_QTY.match(text)
        if m:
            return m.group(1), m.group(2)
        return text, ""

    def to_row(self) -> List[str]:
        """Return the recipe as a list of CSV cells in ``CSV_COLUMNS`` order."""
        return [
            self.category,
            str(self.level),
            self.product_name,
            self.nq_yield,
            self.hq1_yield,
            self.hq2_yield,
            self.hq3_yield,
            self.crystal,
            "; ".join(self.ingredients),
        ]


# ----------------------------------------------------------------------------
# Parsing logic
# ----------------------------------------------------------------------------
def _starts_new_block(line: str) -> bool:
    """Return True if the stripped *line* opens a recipe block or a category."""
    return bool(RE_MAIN_CRAFT.search(line) or RE_CATEGORY.match(line))


def _parse_recipe(
    lines: List[str], start: int, category: str, level: int
) -> tuple[Optional[Recipe], int]:
    """Parse the recipe block whose body begins at ``lines[start]``.

    Returns ``(recipe, next_index)`` where ``next_index`` is the first line the
    caller still has to look at. ``recipe`` is ``None`` when the block has no
    usable ``NQ:`` line.
    """
    rec = Recipe(category, level)
    n = len(lines)
    i = start

    # Ingredients run up to the NQ line. Stopping at the next header as well
    # keeps a block with a missing NQ line from swallowing the following recipe.
    while i < n:
        line = lines[i].strip()
        if line.startswith("NQ:") or _starts_new_block(line):
            break
        if line:
            rec.ingredients.append(normalise_whitespace(line))
        i += 1

    if i >= n:
        return None, i
    line = lines[i].strip()
    if not line.startswith("NQ:"):
        # Stopped on a header: leave it in place for the caller to process.
        return None, i
    m_nq = RE_NQ.match(line)
    if m_nq is None:
        # "NQ:" with nothing after it – drop the line together with the block.
        return None, i + 1

    # Extract crystal (first ingredient if it contains "Crystal")
    if rec.ingredients and "Crystal" in rec.ingredients[0]:
        rec.crystal = rec.ingredients.pop(0)

    rec._set_product(m_nq.group(1), nq=True)
    i += 1

    # Collect HQ lines (0–3 lines), tolerating blank lines before each of them
    # just like the ingredient section does.
    while i < n:
        j = i
        while j < n and not lines[j].strip():
            j += 1
        if j >= n or not lines[j].lstrip().startswith("HQ"):
            break
        m_hq = RE_HQ.match(lines[j].strip())
        if m_hq:
            rec._add_hq(int(m_hq.group(1)), m_hq.group(2))
        i = j + 1

    return rec, i


def parse_file(txt_path: pathlib.Path) -> List[Recipe]:
    """Parse *txt_path* and return its recipes in file order.

    A line matching ``RE_CATEGORY`` sets the category of the recipes that
    follow it; a line matching ``RE_MAIN_CRAFT`` starts a recipe block.
    Malformed blocks are skipped with a warning. Any other line is ignored.
    """
    recipes: List[Recipe] = []
    current_category = ""

    # NOTE: errors="ignore" silently drops bytes that are not valid UTF-8.
    lines = txt_path.read_text(encoding="utf-8", errors="ignore").splitlines()
    i = 0
    n = len(lines)

    while i < n:
        line = lines[i].strip()

        # Update category headers e.g. "Amateur (1-10)"
        if RE_CATEGORY.match(line):
            current_category = line
            i += 1
            continue

        m_main = RE_MAIN_CRAFT.search(line)
        if m_main is None:
            i += 1
            continue

        # The helper always returns an index > i, so the loop makes progress.
        rec, next_i = _parse_recipe(
            lines, i + 1, current_category, int(m_main.group(1))
        )
        if rec is None:
            logger.warning(
                "Skipping malformed recipe block at line %d (no usable NQ line)",
                i + 1,
            )
        else:
            recipes.append(rec)
        i = next_i

    return recipes


# ----------------------------------------------------------------------------
# CSV writer
# ----------------------------------------------------------------------------
def write_csv(recipes: List[Recipe], csv_path: pathlib.Path) -> None:
    """Write *recipes* to *csv_path* with a header row, creating parent dirs."""
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    with csv_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(CSV_COLUMNS)
        writer.writerows(r.to_row() for r in recipes)


def main() -> None:
    """Convert ``TXT_PATH`` to ``CSV_PATH`` and print a one-line summary."""
    if not TXT_PATH.exists():
        raise SystemExit(f"Input file not found: {TXT_PATH}")

    recs = parse_file(TXT_PATH)
    write_csv(recs, CSV_PATH)
    print(f"Parsed {len(recs)} recipes -> {CSV_PATH.relative_to(PROJECT_ROOT)}")


if __name__ == "__main__":
    main()