# Mog-Squire/scripts/woodworking_to_csv.py

#!/usr/bin/env python3
"""Parse the semi-structured datasets/Woodworking.txt file and export it as a CSV.

The output CSV will be written to datasets/Woodworking.csv with the following columns:
    category           – Text group header e.g. Amateur (1-10)
    level              – Woodworking skill level for the recipe (integer)
    product_name       – Produced item (without quantity suffix)
    nq_yield           – Quantity produced on a normal quality synth (may be empty)
    hq1_yield          – Quantity produced on HQ1 synth (may be empty)
    hq2_yield          – Quantity produced on HQ2 synth (may be empty)
    hq3_yield          – Quantity produced on HQ3 synth (may be empty)
    crystal            – Crystal used for the synth (Wind Crystal, Earth Crystal, etc.)
    ingredients        – Semicolon-separated list of remaining ingredients (excluding the crystal)

Run the script from the project root:
    python3 scripts/woodworking_to_csv.py
"""
from __future__ import annotations

import csv
import pathlib
import re
from typing import List, Optional

# Repository root. The module docstring has this script living in scripts/,
# one directory below the root, so parents[1] of the resolved script path is
# the root itself.
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
# Input text file and output CSV, both under <root>/datasets/.
TXT_PATH = PROJECT_ROOT / "datasets/Woodworking.txt"
CSV_PATH = PROJECT_ROOT / "datasets/Woodworking.csv"

# --- Regex patterns ---------------------------------------------------------
# Category header occupying a whole line, e.g. "Amateur (1-10)".
RE_CATEGORY = re.compile(r"^[A-Za-z' ]+ \([0-9]+-[0-9]+\)$")
# Opening line of a recipe block; group 1 is the numeric skill level.
RE_MAIN_CRAFT = re.compile(r"Main Craft: Woodworking - \((\d+)\)")
# Normal-quality result line; group 1 is the text following "NQ:".
RE_NQ = re.compile(r"NQ:\s*(.+)")
# High-quality result line; group 1 is the single tier digit, group 2 the
# text following "HQ<digit>:".
RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
# Item text ending in a quantity suffix " x<digits>"; group 1 is the item
# name, group 2 the quantity digits.
RE_ITEM_QTY = re.compile(r"(.+?)\s+x(\d+)$")


def normalise_whitespace(text: str) -> str:
    """Return *text* trimmed, with each whitespace run squeezed to one space."""
    # Splitting on whitespace discards leading/trailing blanks and treats any
    # run of whitespace as a single separator; re-joining restores one space.
    words = text.split()
    return " ".join(words)


class Recipe:
    """A single parsed recipe, i.e. one row of the exported CSV.

    ``category`` and ``level`` are supplied at construction time.  All other
    fields start out empty and are filled in afterwards.  Yields are kept as
    strings and stay ``""`` when no quantity was given.
    """

    __slots__ = (
        "category",
        "level",
        "product_name",
        "nq_yield",
        "hq1_yield",
        "hq2_yield",
        "hq3_yield",
        "crystal",
        "ingredients",
    )

    def __init__(self, category: str, level: int):
        self.category: str = category
        self.level: int = level
        # Everything below is blank until the parser fills it in.
        self.product_name: str = ""
        self.nq_yield = self.hq1_yield = self.hq2_yield = self.hq3_yield = ""
        self.crystal: str = ""
        self.ingredients: List[str] = []

    # ------------------------------------------------------------------
    # Helper methods
    # ------------------------------------------------------------------
    def _set_product(self, text: str, nq: bool = False) -> None:
        """Store the product name from *text*; also store the NQ yield if *nq*."""
        parsed = self._split_item_qty(text)
        self.product_name = parsed[0]
        if not nq:
            return
        self.nq_yield = parsed[1]

    def _add_hq(self, idx: int, text: str) -> None:
        """Record the quantity found in *text* as the yield of HQ tier *idx*.

        Only tiers 1, 2 and 3 are stored; any other *idx* is ignored.  The
        item name contained in *text* is discarded.
        """
        quantity = self._split_item_qty(text)[1]
        slot = {1: "hq1_yield", 2: "hq2_yield", 3: "hq3_yield"}.get(idx)
        if slot is not None:
            setattr(self, slot, quantity)

    @staticmethod
    def _split_item_qty(text: str) -> tuple[str, str]:
        """Split ``"Name xN"`` into ``("Name", "N")``.

        Whitespace is normalised first.  Text without a trailing ``xN``
        suffix is returned unchanged together with an empty quantity.
        """
        cleaned = normalise_whitespace(text)
        found = RE_ITEM_QTY.match(cleaned)
        if found is None:
            return cleaned, ""
        item, count = found.groups()
        return item, count

    def to_row(self) -> List[str]:
        """Return the recipe as a list of CSV cells, in ``__slots__`` order.

        ``level`` is converted to a string and the ingredients are joined
        with ``"; "``.
        """
        row = [self.category, str(self.level), self.product_name]
        row += [self.nq_yield, self.hq1_yield, self.hq2_yield, self.hq3_yield]
        row.append(self.crystal)
        row.append("; ".join(self.ingredients))
        return row


# ----------------------------------------------------------------------------
# Parsing logic
# ----------------------------------------------------------------------------

def parse_file(txt_path: pathlib.Path) -> List[Recipe]:
    """Parse the recipe text file at *txt_path* into ``Recipe`` objects.

    The file is scanned line by line.  A line matching ``RE_CATEGORY``
    becomes the category of the recipes that follow it.  A line matching
    ``RE_MAIN_CRAFT`` opens a recipe block, which is read in three steps:

    1. Every non-blank line up to the ``NQ:`` line is an ingredient.  The
       first one is moved to ``crystal`` when it contains ``"Crystal"``.
    2. The ``NQ:`` line gives the product name and normal-quality yield.
    3. The ``HQ...`` lines directly after it give the HQ1-HQ3 yields.

    A block without a usable ``NQ:`` line is dropped.  Ingredient collection
    also stops at the next category header or ``Main Craft`` line, so a block
    that is missing its ``NQ:`` line cannot absorb the recipe that follows.

    Args:
        txt_path: Path of the text file.  It is decoded as UTF-8 and
            undecodable bytes are ignored.

    Returns:
        The successfully parsed recipes, in file order.
    """
    lines = txt_path.read_text(encoding="utf-8", errors="ignore").splitlines()
    n = len(lines)
    recipes: List[Recipe] = []
    current_category = ""
    i = 0

    while i < n:
        line = lines[i].strip()

        # Update category headers e.g. "Amateur (1-10)"
        if RE_CATEGORY.match(line):
            current_category = line
            i += 1
            continue

        # Anything that does not open a recipe block is skipped.
        m_main = RE_MAIN_CRAFT.search(line)
        if not m_main:
            i += 1
            continue

        rec = Recipe(current_category, int(m_main.group(1)))
        i += 1

        # Collect ingredients until the NQ line.  Running into the next
        # block (or a category header) first means this entry has no NQ line.
        hit_boundary = False
        while i < n:
            ing_line = lines[i].strip()
            if ing_line.startswith("NQ:"):
                break
            if RE_MAIN_CRAFT.search(ing_line) or RE_CATEGORY.match(ing_line):
                hit_boundary = True
                break
            if ing_line:
                rec.ingredients.append(normalise_whitespace(ing_line))
            i += 1

        if hit_boundary:
            # Malformed entry: drop it, but do NOT advance, so the outer loop
            # handles the boundary line as a header / new recipe start.
            continue

        # Either we are on the NQ line or we ran off the end of the file.
        m_nq = RE_NQ.match(lines[i].strip()) if i < n else None
        if m_nq is None:
            # "NQ:" with no product text, or end of file: skip the entry.
            i += 1
            continue

        # Extract crystal (first ingredient if it contains "Crystal")
        if rec.ingredients and "Crystal" in rec.ingredients[0]:
            rec.crystal = rec.ingredients.pop(0)

        rec._set_product(m_nq.group(1), nq=True)
        i += 1

        # Collect the HQ lines that immediately follow (0-3 expected).  Lines
        # starting with "HQ" that do not match RE_HQ are consumed and ignored.
        while i < n and lines[i].lstrip().startswith("HQ"):
            m_hq = RE_HQ.match(lines[i].strip())
            if m_hq:
                rec._add_hq(int(m_hq.group(1)), m_hq.group(2))
            i += 1

        # i already points at the first line after this block.
        recipes.append(rec)

    return recipes


# ----------------------------------------------------------------------------
# CSV writer
# ----------------------------------------------------------------------------

def write_csv(recipes: List[Recipe], csv_path: pathlib.Path) -> None:
    """Write *recipes* to *csv_path* as a UTF-8 CSV file, header row first.

    Missing parent directories are created and an existing file is
    overwritten.  Each data row is whatever the recipe's ``to_row()`` returns.
    """
    header = (
        "category",
        "level",
        "product_name",
        "nq_yield",
        "hq1_yield",
        "hq2_yield",
        "hq3_yield",
        "crystal",
        "ingredients",
    )
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    # newline="" leaves line-ending handling to the csv module.
    with open(csv_path, "w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(recipe.to_row() for recipe in recipes)


if __name__ == "__main__":
    # Script entry point: convert TXT_PATH into CSV_PATH and report the count.
    if TXT_PATH.exists():
        recs = parse_file(TXT_PATH)
        write_csv(recs, CSV_PATH)
        shown_path = CSV_PATH.relative_to(PROJECT_ROOT)
        print(f"Parsed {len(recs)} recipes -> {shown_path}")
    else:
        # Nothing to convert: exit with the message as the error text.
        raise SystemExit(f"Input file not found: {TXT_PATH}")