Initial commit

2025-07-07 13:39:46 +01:00
commit cfa2eff6ef
69 changed files with 70452 additions and 0 deletions
--- a/scripts/woodworking_to_csv.py
+++ b/scripts/woodworking_to_csv.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+"""Parse the semi-structured datasets/Woodworking.txt file and export it as a CSV.
+
+The output CSV will be written to datasets/Woodworking.csv with the following columns:
+    category           – Group header text, e.g. "Amateur (1-10)"
+    level              – Woodworking skill level for the recipe (integer)
+    product_name       – Produced item (without quantity suffix)
+    nq_yield           – Quantity produced on a normal quality synth (may be empty)
+    hq1_yield          – Quantity produced on HQ1 synth (may be empty)
+    hq2_yield          – Quantity produced on HQ2 synth (may be empty)
+    hq3_yield          – Quantity produced on HQ3 synth (may be empty)
+    crystal            – Crystal used for the synth (Wind Crystal, Earth Crystal, etc.)
+    ingredients        – Semicolon-separated ("; ") list of the remaining ingredients (excluding the crystal)
+
+Run the script from the project root:
+    python3 scripts/woodworking_to_csv.py
+"""
+from __future__ import annotations
+
+import csv
+import pathlib
+import re
+from typing import List, Optional
+
+# Paths are anchored at the directory one level above the folder that holds
+# this script (parents[1] of the resolved script path), not at the current
+# working directory.
+PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
+TXT_PATH = PROJECT_ROOT / "datasets/Woodworking.txt"  # input handed to parse_file
+CSV_PATH = PROJECT_ROOT / "datasets/Woodworking.csv"  # output handed to write_csv
+
+# --- Regex patterns ---------------------------------------------------------
+# Whole-line category header: letters, apostrophes and spaces followed by a
+# parenthesised numeric range, e.g. "Amateur (1-10)".
+RE_CATEGORY = re.compile(r"^[A-Za-z' ]+ \([0-9]+-[0-9]+\)$")
+# Recipe header; group 1 is the number inside the parentheses.
+RE_MAIN_CRAFT = re.compile(r"Main Craft: Woodworking - \((\d+)\)")
+# "NQ:" result line; group 1 is everything after the label.
+RE_NQ = re.compile(r"NQ:\s*(.+)")
+# "HQ<digit>:" result line; group 1 is the digit, group 2 the rest of the line.
+RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
+# Item text ending in " x<digits>"; group 1 is the name, group 2 the quantity.
+RE_ITEM_QTY = re.compile(r"(.+?)\s+x(\d+)$")
+
+
+def normalise_whitespace(text: str) -> str:
+    """Collapse internal runs of whitespace and trim."""
+    return re.sub(r"\s+", " ", text.strip())
+
+
+class Recipe:
+    __slots__ = (
+        "category",
+        "level",
+        "product_name",
+        "nq_yield",
+        "hq1_yield",
+        "hq2_yield",
+        "hq3_yield",
+        "crystal",
+        "ingredients",
+    )
+
+    def __init__(self, category: str, level: int):
+        self.category: str = category
+        self.level: int = level
+        self.product_name: str = ""
+        self.nq_yield: str = ""
+        self.hq1_yield: str = ""
+        self.hq2_yield: str = ""
+        self.hq3_yield: str = ""
+        self.crystal: str = ""
+        self.ingredients: List[str] = []
+
+    # ------------------------------------------------------------------
+    # Helper methods
+    # ------------------------------------------------------------------
+    def _set_product(self, text: str, nq: bool = False) -> None:
+        name, qty = self._split_item_qty(text)
+        self.product_name = name
+        if nq:
+            self.nq_yield = qty
+
+    def _add_hq(self, idx: int, text: str) -> None:
+        _name, qty = self._split_item_qty(text)
+        if idx == 1:
+            self.hq1_yield = qty
+        elif idx == 2:
+            self.hq2_yield = qty
+        elif idx == 3:
+            self.hq3_yield = qty
+
+    @staticmethod
+    def _split_item_qty(text: str) -> tuple[str, str]:
+        text = normalise_whitespace(text)
+        m = RE_ITEM_QTY.match(text)
+        if m:
+            return m.group(1), m.group(2)
+        return text, ""
+
+    def to_row(self) -> List[str]:
+        return [
+            self.category,
+            str(self.level),
+            self.product_name,
+            self.nq_yield,
+            self.hq1_yield,
+            self.hq2_yield,
+            self.hq3_yield,
+            self.crystal,
+            "; ".join(self.ingredients),
+        ]
+
+
+# ----------------------------------------------------------------------------
+# Parsing logic
+# ----------------------------------------------------------------------------
+
+def parse_file(txt_path: pathlib.Path) -> List[Recipe]:
+    recipes: List[Recipe] = []
+    current_category = ""
+    lines = txt_path.read_text(encoding="utf-8", errors="ignore").splitlines()
+    i = 0
+    n = len(lines)
+
+    while i < n:
+        line = lines[i].strip()
+
+        # Update category headers e.g. "Amateur (1-10)"
+        if RE_CATEGORY.match(line):
+            current_category = line
+            i += 1
+            continue
+
+        # Detect start of a recipe block
+        m_main = RE_MAIN_CRAFT.search(line)
+        if m_main:
+            level = int(m_main.group(1))
+            rec = Recipe(current_category, level)
+            i += 1
+
+            # Collect ingredients until we hit the NQ line
+            while i < n and not lines[i].lstrip().startswith("NQ:"):
+                ing_line = lines[i].strip()
+                if ing_line:
+                    rec.ingredients.append(normalise_whitespace(ing_line))
+                i += 1
+
+            # Extract crystal (first ingredient if it contains "Crystal")
+            if rec.ingredients and "Crystal" in rec.ingredients[0]:
+                rec.crystal = rec.ingredients.pop(0)
+
+            # Now we should be at the NQ line
+            if i < n and (m_nq := RE_NQ.match(lines[i].strip())):
+                rec._set_product(m_nq.group(1), nq=True)
+                i += 1
+            else:
+                # Malformed entry – skip ahead
+                i += 1
+                continue
+
+            # Collect HQ lines (0–3 lines)
+            while i < n and lines[i].lstrip().startswith("HQ"):
+                m_hq = RE_HQ.match(lines[i].strip())
+                if m_hq:
+                    idx = int(m_hq.group(1))
+                    rec._add_hq(idx, m_hq.group(2))
+                i += 1
+
+            recipes.append(rec)
+            continue  # skip to next line without increment to avoid double increment
+
+        # Fallback increment
+        i += 1
+
+    return recipes
+
+
+# ----------------------------------------------------------------------------
+# CSV writer
+# ----------------------------------------------------------------------------
+
+def write_csv(recipes: List[Recipe], csv_path: pathlib.Path) -> None:
+    csv_path.parent.mkdir(parents=True, exist_ok=True)
+    with csv_path.open("w", newline="", encoding="utf-8") as f:
+        writer = csv.writer(f)
+        writer.writerow(
+            [
+                "category",
+                "level",
+                "product_name",
+                "nq_yield",
+                "hq1_yield",
+                "hq2_yield",
+                "hq3_yield",
+                "crystal",
+                "ingredients",
+            ]
+        )
+        for r in recipes:
+            writer.writerow(r.to_row())
+
+
+if __name__ == "__main__":
+    if not TXT_PATH.exists():
+        raise SystemExit(f"Input file not found: {TXT_PATH}")
+
+    recs = parse_file(TXT_PATH)
+    write_csv(recs, CSV_PATH)
+    print(f"Parsed {len(recs)} recipes -> {CSV_PATH.relative_to(PROJECT_ROOT)}")