Mog-Squire/scripts/woodworking_to_csv_v2.py

#!/usr/bin/env python3
"""Improved parser for Woodworking recipes -> Woodworking_v2.csv

This version follows the spec provided by the user:

Category: category name (without level range)
Level: recipe level (integer)
Sub-Crafts: JSON list of [name, level]
Name: product NQ name
Crystal: crystal element (Earth, Wind, etc.)
Key Item: key item name (written as an empty string when there is none)
Ingredients: JSON list of [name, quantity]
HQ Yields: JSON list ordered HQ1..HQ3 of [name, quantity] (nulls if N/A)
"""
from __future__ import annotations

import csv
import json
import pathlib
import re
from typing import Dict, List, Optional, Tuple

# Directory one level above the folder that contains this script.
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
# Input text dump and output CSV, both located under <PROJECT_ROOT>/datasets.
TXT_PATH = PROJECT_ROOT / "datasets/Woodworking.txt"
CSV_PATH = PROJECT_ROOT / "datasets/Woodworking_v2.csv"

# Category header, e.g. "Some Name (1-10)"; group 1 is the name without the range.
RE_CATEGORY = re.compile(r"^([A-Za-z' ]+) \([0-9]+-[0-9]+\)$")
# Start of a recipe; group 1 is the Woodworking level.
RE_MAIN = re.compile(r"Main Craft: Woodworking - \((\d+)\)")
# Optional sub-craft line; group 1 is the raw text after the label.
RE_SUBCRAFTS = re.compile(r"Sub Craft\(s\): (.+)")
# Optional key-item line; group 1 is the key item name.
RE_KEY_ITEM = re.compile(r"Key Item: (.+)")
# Normal-quality result line; group 1 is the product text.
RE_NQ = re.compile(r"NQ:\s*(.+)")
# High-quality result line; group 1 is the tier digit, group 2 the product text.
RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
# Trailing "x<N>" or "+<N>" suffix; group 1 is the item name, group 2 the number.
# NOTE(review): "+<N>" is treated as a quantity here — confirm the input never
# uses a "+1"-style suffix as part of an item's actual name.
RE_ITEM_QTY = re.compile(r"(.+?)\s*(?:x|\+)(\d+)$")


def norm(s: str) -> str:
    """Return *s* trimmed, with every whitespace run collapsed to one space."""
    # split() with no separator discards leading/trailing whitespace and
    # treats any run of whitespace as a single delimiter.
    words = s.split()
    return " ".join(words)


def split_item_qty(text: str) -> Tuple[str, int]:
    """Split an item string into ``(name, quantity)``.

    The text is whitespace-normalised first.  When it ends in a suffix
    recognised by ``RE_ITEM_QTY`` the number becomes the quantity and the
    text before it becomes the name; otherwise the whole normalised text is
    the name and the quantity is 1.
    """
    cleaned = norm(text)
    if (hit := RE_ITEM_QTY.match(cleaned)) is None:
        return cleaned, 1
    return hit.group(1), int(hit.group(2))


class Recipe:
    """Mutable record for one parsed recipe.

    Created with only a category and level; the remaining fields start
    empty and are filled in by the parser.  ``row()`` flattens the record
    into the list of strings written to the CSV.
    """

    __slots__ = (
        "category",
        "level",
        "subcrafts",
        "name",
        "crystal",
        "key_item",
        "ingredients",
        "hq_yields",
    )

    def __init__(self, category: str, level: int):
        # Values known at construction time.
        self.category = category
        self.level = level

        # Scalar fields, filled in later.
        self.name: str = ""
        self.crystal: str = ""
        self.key_item: Optional[str] = None

        # List fields of (name, number) pairs.
        self.subcrafts: List[Tuple[str, int]] = []
        self.ingredients: List[Tuple[str, int]] = []
        # Fixed three slots, position 0..2 <-> HQ1..HQ3; None marks "no entry".
        self.hq_yields: List[Optional[Tuple[str, int]]] = [None] * 3

    def row(self) -> List[str]:
        """Return the record as a list of eight strings, one per CSV column.

        List-valued fields are JSON-encoded (non-ASCII characters are kept
        as-is); a missing key item is emitted as an empty string.
        """

        def as_json(value) -> str:
            return json.dumps(value, ensure_ascii=False)

        key_item_cell = self.key_item if self.key_item else ""
        return [
            self.category,
            str(self.level),
            as_json(self.subcrafts),
            self.name,
            self.crystal,
            key_item_cell,
            as_json(self.ingredients),
            as_json(self.hq_yields),
        ]


# ---------------------------------------------------------------------------

# One "<Craft> - (<level>)" entry inside a "Sub Craft(s):" line.
_RE_SUBCRAFT_PART = re.compile(r"([A-Za-z]+) - \((\d+)\)")


def _parse_subcrafts(text: str) -> List[Tuple[str, int]]:
    """Convert the comma-separated text of a sub-craft line to (name, level) pairs."""
    pairs: List[Tuple[str, int]] = []
    for part in text.split(","):
        if m_sc := _RE_SUBCRAFT_PART.match(part.strip()):
            pairs.append((m_sc.group(1), int(m_sc.group(2))))
    return pairs


def parse() -> List[Recipe]:
    """Parse ``TXT_PATH`` into a list of :class:`Recipe` objects.

    The file is scanned line by line.  A category header updates the current
    category; each "Main Craft" line starts a recipe, which is then read in
    this order:

    1. optional "Sub Craft(s):" / "Key Item:" lines (blank lines skipped),
    2. ingredient lines up to the "NQ:" line,
    3. the "NQ:" line giving the product name,
    4. consecutive "HQ<n>:" lines giving the high-quality yields.

    The first ingredient whose name contains "Crystal" supplies the recipe's
    crystal (its first word); every ingredient containing "Crystal" is then
    dropped from the ingredient list.  If none is found the crystal is
    "Unknown".

    Returns:
        The recipes in file order.

    Raises:
        OSError: if ``TXT_PATH`` cannot be read (e.g. it does not exist).
    """
    # Undecodable bytes are dropped rather than aborting the whole run.
    lines = TXT_PATH.read_text(encoding="utf-8", errors="ignore").splitlines()
    n = len(lines)
    i = 0
    current_category = ""
    recipes: List[Recipe] = []

    while i < n:
        line = lines[i].strip()

        # Category header
        if m_cat := RE_CATEGORY.match(line):
            current_category = m_cat.group(1)
            i += 1
            continue

        # Anything that is not the start of a recipe is skipped.
        m_main = RE_MAIN.search(line)
        if not m_main:
            i += 1
            continue

        rec = Recipe(current_category, int(m_main.group(1)))
        i += 1

        # Optional metadata lines; the first line that is neither blank nor
        # metadata ends this phase and is left for the ingredient loop.
        while i < n:
            look = lines[i].strip()
            if not look:
                i += 1
            elif m_sub := RE_SUBCRAFTS.match(look):
                rec.subcrafts.extend(_parse_subcrafts(m_sub.group(1)))
                i += 1
            elif m_key := RE_KEY_ITEM.match(look):
                rec.key_item = m_key.group(1)
                i += 1
            else:
                break

        # Ingredients run up to the NQ line.  Also stop at the next recipe
        # header so a recipe with a missing NQ line cannot swallow the
        # recipes that follow it.
        while i < n:
            ingr_line = lines[i].strip()
            if ingr_line.startswith("NQ:") or RE_MAIN.search(ingr_line):
                break
            if ingr_line and not ingr_line.endswith("-Icon.gif"):
                rec.ingredients.append(split_item_qty(ingr_line))
            i += 1

        # Crystal: taken from the first matching ingredient, then all
        # crystal entries are removed from the ingredient list.
        crystal_name = next(
            (name for name, _ in rec.ingredients if "Crystal" in name), None
        )
        if crystal_name is not None:
            rec.crystal = crystal_name.split()[0]  # Earth Crystal -> Earth
            rec.ingredients = [
                pair for pair in rec.ingredients if "Crystal" not in pair[0]
            ]
        else:
            rec.crystal = "Unknown"

        # NQ line: consumed even when it carries no product text, but a
        # non-NQ line (next recipe header) is left for the outer loop.
        if i < n and lines[i].lstrip().startswith("NQ:"):
            if m_nq := RE_NQ.match(lines[i].strip()):
                rec.name, _ = split_item_qty(m_nq.group(1))
            i += 1

        # Skip blank lines before HQ entries
        while i < n and not lines[i].strip():
            i += 1

        # HQ lines
        while i < n and lines[i].lstrip().startswith("HQ"):
            if m_hq := RE_HQ.match(lines[i].strip()):
                tier = int(m_hq.group(1))
                # RE_HQ accepts any digit; only tiers that have a slot are
                # stored.  Without this check "HQ0" would overwrite the last
                # slot (index -1) and "HQ4"+ would raise IndexError.
                if 1 <= tier <= len(rec.hq_yields):
                    rec.hq_yields[tier - 1] = split_item_qty(m_hq.group(2))
            i += 1

        recipes.append(rec)

    return recipes


# ---------------------------------------------------------------------------

def main() -> None:
    """Parse the recipe text file and write the result to ``CSV_PATH``.

    The output directory is created if needed, a header row is written
    first, then one row per recipe.  A one-line summary is printed at the end.
    """
    header = [
        "category",
        "level",
        "subcrafts",
        "name",
        "crystal",
        "key_item",
        "ingredients",
        "hq_yields",
    ]

    recs = parse()
    CSV_PATH.parent.mkdir(parents=True, exist_ok=True)

    # newline="" lets the csv module control line endings itself.
    with CSV_PATH.open("w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(rec.row() for rec in recs)

    print(f"Wrote {len(recs)} recipes -> {CSV_PATH.relative_to(PROJECT_ROOT)}")


# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()