Mog-Squire/scripts/recipes_to_csv_v2.py

#!/usr/bin/env python3
"""Generic recipe parser for crafting disciplines -> <Craft>_v2.csv

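Usage:
    python recipes_to_csv_v2.py <CRAFT>
    python recipes_to_csv_v2.py --all

Where <CRAFT> matches the name of the .txt file in the datasets directory,
for example `Woodworking`, `Smithing`, `Goldsmithing`, `Alchemy`, etc.
With `--all`, or when <CRAFT> is omitted, every .txt file in the datasets
directory is processed.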

The script produces a CSV `<Craft>_v2.csv` inside the datasets directory
having the following columns (identical to the Woodworking v2 spec):

    category, level, subcrafts, name, crystal, key_item, ingredients, hq_yields

See scripts/README.md for details of each column.
"""
from __future__ import annotations

import argparse
import csv
import json
import pathlib
import re
from typing import List, Optional, Tuple

# Parent of the directory that contains this (resolved) script file.
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
# Directory holding the input <Craft>.txt files and the generated CSV files.
DATASETS_DIR = PROJECT_ROOT / "datasets"

# ---------------------------------------------------------------------------
# Regex helpers (compiled at runtime where craft-dependent)
# ---------------------------------------------------------------------------

# Category header: letters/apostrophes/spaces followed by " (<int>-<int>)",
# matching the whole line. Group 1 is the category name.
RE_CATEGORY = re.compile(r"^([A-Za-z' ]+) \([0-9]+-[0-9]+\)$")
# "Sub Craft(s): ..." line. Group 1 is everything after the label.
RE_SUBCRAFTS = re.compile(r"Sub Craft\(s\): (.+)")
# "Key Item: ..." line. Group 1 is the key item text.
RE_KEY_ITEM = re.compile(r"Key Item: (.+)")
# "NQ: ..." line. Group 1 is the text after the label.
# NOTE(review): parse() as shown detects NQ lines with str.startswith and does
# not reference this pattern — confirm whether it is still needed.
RE_NQ = re.compile(r"NQ:\s*(.+)")
# "HQ<digit>: ..." line. Group 1 is the single-digit tier, group 2 the yield text.
RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
# Quantity delimiter is strictly 'xN' (e.g., "Lumber x6").
# Patterns like "+1" denote HQ variant and should be preserved in the name.
# The whitespace before 'x' is optional (\s*) and, because of IGNORECASE, an
# upper-case 'X' is accepted as well. Group 1 is the name, group 2 the count.
RE_ITEM_QTY = re.compile(r"(.+?)\s*x(\d+)$", re.IGNORECASE)


# ---------------------------------------------------------------------------
# Helper functions / dataclasses
# ---------------------------------------------------------------------------

def norm(s: str) -> str:
    """Return *s* with surrounding whitespace removed and every internal
    whitespace run collapsed to a single space."""
    # str.split() with no separator drops leading/trailing whitespace and
    # splits on whitespace runs, so re-joining with one space normalises it.
    words = s.split()
    return " ".join(words)


def split_item_qty(text: str) -> Tuple[str, int]:
    """Separate a trailing ``xN`` quantity from an item description.

    The text is whitespace-normalised first. When it ends in a quantity
    suffix recognised by ``RE_ITEM_QTY`` (e.g. "Foo x3"), the name part and
    the integer count are returned; otherwise the normalised text is
    returned unchanged with a quantity of 1.
    """
    cleaned = norm(text)
    found = RE_ITEM_QTY.match(cleaned)
    if found is None:
        # No quantity suffix: the whole text is the item name.
        return cleaned, 1
    item, count = found.groups()
    return item, int(count)


class Recipe:
    """Mutable record for one parsed recipe; ``row()`` renders it as a CSV row."""

    __slots__ = (
        "category",
        "level",
        "subcrafts",
        "name",
        "crystal",
        "key_item",
        "ingredients",
        "hq_yields",
    )

    def __init__(self, category: str, level: int):
        """Create a recipe with the given category and level; every other
        field starts out empty and is filled in by the parser."""
        self.category = category
        self.level = level
        self.name: str = ""
        self.crystal: str = ""
        self.key_item: Optional[str] = None
        # (craft name, level) pairs
        self.subcrafts: List[Tuple[str, int]] = list()
        # (item name, quantity) pairs
        self.ingredients: List[Tuple[str, int]] = list()
        # Three slots, each None until assigned an (item name, quantity) pair
        self.hq_yields: List[Optional[Tuple[str, int]]] = [None] * 3

    def row(self) -> List[str]:
        """Return the recipe as a list of strings in CSV column order.

        List-valued fields are serialised as JSON (non-ASCII characters are
        kept as-is); a missing key item becomes the empty string.
        """

        def as_json(value) -> str:
            return json.dumps(value, ensure_ascii=False)

        key_item = self.key_item if self.key_item else ""
        cells = [self.category, str(self.level), as_json(self.subcrafts)]
        cells += [self.name, self.crystal, key_item]
        cells += [as_json(self.ingredients), as_json(self.hq_yields)]
        return cells


# ---------------------------------------------------------------------------
# Core parse routine
# ---------------------------------------------------------------------------

def parse(txt_path: pathlib.Path, craft_name: str) -> List[Recipe]:
    """Parse a crafting text file into Recipe objects.

    Layout handled by the parser:

    1. A category header (see ``RE_CATEGORY``) sets the category applied to
       every recipe that follows, until the next category header.
    2. A line starting with "NQ:" opens a recipe and supplies its name (any
       trailing "xN" quantity is dropped from the name).
    3. The lines after it, up to the next "NQ:" line, the next category
       header, or end of file, form that recipe's block:

       * blank lines and lines ending in "-Icon.gif" are skipped;
       * "Main Craft: <craft_name> - (<level>)" sets the level; only the
         leading integer is used, so a range such as "115~120" yields 115,
         and text without a leading integer yields 0;
       * "Sub Craft(s): ..." adds comma-separated "<Craft> - (<level>)" pairs;
       * "Key Item: ..." sets the key item;
       * "HQ1:" .. "HQ3:" fill the matching HQ yield slot; any other line
         starting with "HQ" is ignored;
       * every remaining line is recorded as an ingredient ("Name xN").

    4. The first ingredient whose name contains "Crystal" determines the
       crystal (its first word); all ingredients containing "Crystal" are
       then removed from the list. With no such ingredient the crystal is
       "Unknown".

    Args:
        txt_path: Path of the text file to read (decoded as UTF-8; bytes that
            cannot be decoded are dropped).
        craft_name: Craft name expected on "Main Craft:" lines.

    Returns:
        The recipes in the order their "NQ:" lines appear in the file.
    """

    lines = txt_path.read_text(encoding="utf-8", errors="ignore").splitlines()
    n = len(lines)
    i = 0

    current_category: str = ""
    recipes: List[Recipe] = []

    # Loop-invariant patterns, compiled once per call.
    re_main = re.compile(rf"Main Craft: {re.escape(craft_name)} - \(([^)]*)\)")
    re_subcraft = re.compile(r"([A-Za-z]+) - \((\d+)\)")
    re_leading_int = re.compile(r"(\d+)")

    while i < n:
        line = lines[i].strip()
        i += 1

        # 1) Category header
        if (m_cat := RE_CATEGORY.match(line)):
            current_category = m_cat.group(1)
            continue

        # 2) Anything that is not the start of a recipe is ignored here
        if not line.startswith("NQ:"):
            continue

        rec = Recipe(current_category, level=0)
        rec.name, _ = split_item_qty(line[len("NQ:"):].strip())

        # Collect the recipe's block. It ends at the next "NQ:" line *or* at a
        # category header: the header must be left for the outer loop so that
        # the category of later recipes is updated and the header text is not
        # mistaken for an ingredient of this recipe.
        block_lines: List[str] = []
        while i < n:
            candidate = lines[i].strip()
            if candidate.startswith("NQ:") or RE_CATEGORY.match(candidate):
                break
            block_lines.append(candidate)
            i += 1

        for look in block_lines:
            # Skip blanks and icon decorator lines early
            if not look or look.endswith("-Icon.gif"):
                continue

            # Main Craft – level (lower bound when a range is given)
            if (m_main := re_main.search(look)):
                m_level = re_leading_int.match(m_main.group(1))
                rec.level = int(m_level.group(1)) if m_level else 0
                continue

            # Sub crafts; entries that do not look like "<Craft> - (<n>)" are dropped
            if (m_sc := RE_SUBCRAFTS.match(look)):
                for part in m_sc.group(1).split(","):
                    if (m_part := re_subcraft.match(part.strip())):
                        rec.subcrafts.append((m_part.group(1), int(m_part.group(2))))
                continue

            # Key item
            if (m_ki := RE_KEY_ITEM.match(look)):
                rec.key_item = m_ki.group(1)
                continue

            # HQ lines
            if look.startswith("HQ"):
                if (m_hq := RE_HQ.match(look)):
                    idx = int(m_hq.group(1)) - 1
                    # Only HQ1..HQ3 have a slot. Without this guard "HQ0"
                    # would write to index -1 (the HQ3 slot) and "HQ4".."HQ9"
                    # would raise IndexError.
                    if 0 <= idx < len(rec.hq_yields):
                        rec.hq_yields[idx] = split_item_qty(m_hq.group(2))
                continue

            # Otherwise treat as ingredient
            rec.ingredients.append(split_item_qty(look))

        # Determine crystal & clean ingredient list
        crystal_item = next(
            (name for name, _ in rec.ingredients if "Crystal" in name), None
        )
        if crystal_item is None:
            rec.crystal = "Unknown"
        else:
            rec.crystal = crystal_item.split()[0]
            rec.ingredients = [p for p in rec.ingredients if "Crystal" not in p[0]]

        recipes.append(rec)

    return recipes


# ---------------------------------------------------------------------------
# Entrypoint
# ---------------------------------------------------------------------------

def main() -> None:
    """Command-line entry point: convert dataset text files to CSV.

    With a positional craft name (and no ``--all``) only ``<craft>.txt`` is
    converted. With ``--all``, or when no craft name is given, every ``*.txt``
    file in the datasets directory is converted; the run aborts if there are
    none. A craft whose text file is missing is reported and skipped.

    Examples:
        python recipes_to_csv_v2.py Woodworking
        python recipes_to_csv_v2.py --all
        python recipes_to_csv_v2.py
    """
    parser = argparse.ArgumentParser(description="Parse <Craft>.txt into CSV.")
    parser.add_argument("craft", nargs="?", help="Craft name, e.g. Woodworking, Smithing")
    parser.add_argument("--all", action="store_true", help="Process every .txt file in datasets/")
    options = parser.parse_args()

    # Work out the list of crafts to convert
    if options.craft and not options.all:
        crafts = [options.craft.strip()]
    else:
        crafts = [entry.stem for entry in DATASETS_DIR.glob("*.txt")]
        if not crafts:
            raise SystemExit(f"No .txt files found in {DATASETS_DIR}")

    header = [
        "category",
        "level",
        "subcrafts",
        "name",
        "crystal",
        "key_item",
        "ingredients",
        "hq_yields",
    ]

    for craft in crafts:
        source = DATASETS_DIR / f"{craft}.txt"
        if not source.exists():
            print(f"[WARN] Dataset file not found: {source}")
            continue

        target = DATASETS_DIR / f"{craft}_v2.csv"
        recipes = parse(source, craft)

        target.parent.mkdir(parents=True, exist_ok=True)
        with target.open("w", newline="", encoding="utf-8") as handle:
            out = csv.writer(handle)
            out.writerow(header)
            out.writerows(recipe.row() for recipe in recipes)

        shown = target.relative_to(PROJECT_ROOT)
        print(f"Wrote {len(recipes)} recipes -> {shown}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()