Files
Mog-Squire/scripts/recipes_to_csv_v2.py
2025-07-07 13:39:46 +01:00

280 lines
9.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Generic recipe parser for crafting disciplines -> <Craft>_v2.csv
Usage:
python recipes_to_csv_v2.py <CRAFT>
Where <CRAFT> matches the name of the .txt file in the datasets directory,
for example `Woodworking`, `Smithing`, `Goldsmithing`, `Alchemy`, etc.
The script produces a CSV `<Craft>_v2.csv` inside the datasets directory
having the following columns (identical to the Woodworking v2 spec):
category, level, subcrafts, name, crystal, key_item, ingredients, hq_yields
See scripts/README.md for details of each column.
"""
from __future__ import annotations
import argparse
import csv
import json
import pathlib
import re
from typing import List, Optional, Tuple
# Directory two levels above this file (the parent of the directory that
# contains this script); the datasets folder is looked up relative to it.
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
# Folder holding the input <Craft>.txt files and the generated <Craft>_v2.csv.
DATASETS_DIR = PROJECT_ROOT / "datasets"
# ---------------------------------------------------------------------------
# Regex helpers (the craft-dependent "Main Craft" pattern is compiled inside
# parse(), because it embeds the craft name)
# ---------------------------------------------------------------------------
# Whole-line category header such as "Amateur (1-10)"; group 1 is the name
# (letters, apostrophes and spaces only), the numeric range is not captured.
RE_CATEGORY = re.compile(r"^([A-Za-z' ]+) \([0-9]+-[0-9]+\)$")
# "Sub Craft(s): <list>"; group 1 is the raw list text after the label.
RE_SUBCRAFTS = re.compile(r"Sub Craft\(s\): (.+)")
# "Key Item: <name>"; group 1 is the key item name.
RE_KEY_ITEM = re.compile(r"Key Item: (.+)")
# "NQ: <product>"; group 1 is the product text.
# NOTE(review): not referenced by the code visible in this file (parse() uses
# str.startswith("NQ:") instead) - confirm whether it is still needed.
RE_NQ = re.compile(r"NQ:\s*(.+)")
# "HQ<digit>: <product>"; group 1 is the single-digit tier, group 2 the product.
RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
# Quantity delimiter is a trailing 'xN' (e.g., "Lumber x6"); the match is
# case-insensitive and the whitespace before the 'x' is optional.
# Patterns like "+1" denote HQ variant and should be preserved in the name.
RE_ITEM_QTY = re.compile(r"(.+?)\s*x(\d+)$", re.IGNORECASE)
# ---------------------------------------------------------------------------
# Helper functions / dataclasses
# ---------------------------------------------------------------------------
def norm(s: str) -> str:
    """Return *s* trimmed, with every internal whitespace run collapsed to one space."""
    trimmed = s.strip()
    collapsed = re.sub(r"\s+", " ", trimmed)
    return collapsed
def split_item_qty(text: str) -> Tuple[str, int]:
    """Separate an item description into its name and quantity.

    The text is whitespace-normalised first.  A trailing ``xN`` suffix (as
    recognised by ``RE_ITEM_QTY``) is split off and returned as the integer
    quantity, e.g. "Foo x3" -> ("Foo", 3).  Without such a suffix the whole
    normalised text is the name and the quantity is 1.
    """
    cleaned = norm(text)
    match = RE_ITEM_QTY.match(cleaned)
    if match is None:
        # No explicit quantity suffix: a single item.
        return cleaned, 1
    return match.group(1), int(match.group(2))
class Recipe:
    """A single crafting recipe; one instance becomes one CSV row.

    Attributes are restricted to the names listed in ``__slots__``.  Only the
    category and level are supplied at construction time; the remaining
    fields start empty and are filled in by the parser.
    """

    __slots__ = (
        "category",
        "level",
        "subcrafts",
        "name",
        "crystal",
        "key_item",
        "ingredients",
        "hq_yields",
    )

    def __init__(self, category: str, level: int):
        # Values known when the recipe is created.
        self.category = category
        self.level = level
        # Scalar fields, empty until parsed.
        self.name: str = ""
        self.crystal: str = ""
        self.key_item: Optional[str] = None
        # (name, value) pairs: sub-craft/level and ingredient/quantity.
        self.subcrafts: List[Tuple[str, int]] = []
        self.ingredients: List[Tuple[str, int]] = []
        # One slot per HQ tier (HQ1..HQ3); None means "no yield recorded".
        self.hq_yields: List[Optional[Tuple[str, int]]] = [None] * 3

    def row(self) -> List[str]:
        """Return the recipe as a list of CSV cell strings.

        Column order: category, level, subcrafts, name, crystal, key_item,
        ingredients, hq_yields.  The three list-valued columns are encoded as
        JSON (non-ASCII characters kept as-is); a missing key item becomes an
        empty string.
        """
        subcrafts_json, ingredients_json, hq_json = (
            json.dumps(value, ensure_ascii=False)
            for value in (self.subcrafts, self.ingredients, self.hq_yields)
        )
        key_item_cell = self.key_item if self.key_item else ""
        return [
            self.category,
            str(self.level),
            subcrafts_json,
            self.name,
            self.crystal,
            key_item_cell,
            ingredients_json,
            hq_json,
        ]
# ---------------------------------------------------------------------------
# Core parse routine
# ---------------------------------------------------------------------------
def _assign_crystal(rec: Recipe) -> None:
    """Set ``rec.crystal`` from the ingredient list and drop crystal entries.

    The first ingredient whose name contains "Crystal" supplies the crystal
    type (the first word of that name).  When one is found, every ingredient
    whose name contains "Crystal" is removed from the list; otherwise the
    crystal is recorded as "Unknown" and the ingredients are left untouched.
    """
    crystal_name = next(
        (name for name, _ in rec.ingredients if "Crystal" in name), None
    )
    if crystal_name is None:
        rec.crystal = "Unknown"
        return
    rec.crystal = crystal_name.split()[0]
    rec.ingredients = [
        item for item in rec.ingredients if "Crystal" not in item[0]
    ]


def parse(txt_path: pathlib.Path, craft_name: str) -> List[Recipe]:
    """Parse a crafting text file into Recipe objects.

    The file is scanned once, line by line (each line is stripped first):

    1. A category header such as "Amateur (1-10)" sets the category given to
       every recipe started *afterwards*.  Headers are recognised anywhere in
       the file, including between two recipes, and are never stored as
       ingredients.
    2. A line starting with "NQ:" is the boundary between recipes: it closes
       the recipe currently being built (if any) and opens a new one whose
       name is the NQ product (any "xN" quantity suffix is discarded).
    3. Every other non-blank line up to the next "NQ:" line is attached to
       the open recipe:
         * lines ending in "-Icon.gif" are skipped;
         * "Main Craft: <craft_name> - (<level>)" sets the level, using the
           leading number of the bracketed text (so "115~120" gives 115) or
           0 when it has no leading digits;
         * "Sub Craft(s): ..." adds (craft, level) pairs;
         * "Key Item: ..." sets the key item;
         * "HQ1:" .. "HQ3:" fill the matching HQ yield slot; any other line
           starting with "HQ" is ignored;
         * anything else is stored as an ingredient (name, quantity).
    4. When a recipe is closed, its crystal is derived from the ingredients
       (see ``_assign_crystal``).

    Lines that appear before the first "NQ:" line are ignored, apart from
    category headers.

    NOTE(review): the lines *following* an "NQ:" line are attributed to that
    recipe.  If the dataset lists "Main Craft:" and the ingredients *before*
    the "NQ:" line they belong to, metadata ends up on the previous recipe;
    confirm against a real dataset file.

    Args:
        txt_path: Path of the UTF-8 text file to read; undecodable bytes are
            dropped.
        craft_name: Craft whose "Main Craft:" lines carry the recipe level;
            matched literally and case-sensitively.

    Returns:
        The recipes in file order.

    Raises:
        OSError: If the file cannot be read.
    """
    lines = txt_path.read_text(encoding="utf-8", errors="ignore").splitlines()

    # Craft-specific regex (compiled once per run).
    re_main = re.compile(rf"Main Craft: {re.escape(craft_name)} - \(([^)]*)\)")
    # One "Name - (level)" entry of a sub-craft list; hoisted out of the loop.
    re_subcraft = re.compile(r"([A-Za-z]+) - \((\d+)\)")

    recipes: List[Recipe] = []
    current_category = ""
    rec: Optional[Recipe] = None  # recipe currently being filled, if any

    for raw in lines:
        look = raw.strip()
        if not look:
            continue

        # Category headers must be honoured everywhere.  Previously they were
        # only seen before the first recipe; later ones were swallowed into
        # the preceding recipe's ingredient list.
        m_cat = RE_CATEGORY.match(look)
        if m_cat:
            current_category = m_cat.group(1)
            continue

        # "NQ:" closes the open recipe and starts the next one.
        if look.startswith("NQ:"):
            if rec is not None:
                _assign_crystal(rec)
                recipes.append(rec)
            rec = Recipe(current_category, level=0)
            rec.name, _ = split_item_qty(look[len("NQ:"):].strip())
            continue

        # Nothing to attach to until the first recipe has been opened.
        if rec is None:
            continue

        # Skip icon decorator lines early.
        if look.endswith("-Icon.gif"):
            continue

        # Main craft level; ranges like "115~120" keep the lower bound.
        m_main = re_main.search(look)
        if m_main:
            m_level = re.match(r"(\d+)", m_main.group(1))
            rec.level = int(m_level.group(1)) if m_level else 0
            continue

        # Sub crafts: comma-separated "Name - (level)" entries.
        m_sc = RE_SUBCRAFTS.match(look)
        if m_sc:
            for part in m_sc.group(1).split(","):
                m_part = re_subcraft.match(part.strip())
                if m_part:
                    rec.subcrafts.append((m_part.group(1), int(m_part.group(2))))
            continue

        # Key item.
        m_ki = RE_KEY_ITEM.match(look)
        if m_ki:
            rec.key_item = m_ki.group(1)
            continue

        # HQ lines.
        if look.startswith("HQ"):
            m_hq = RE_HQ.match(look)
            if m_hq:
                slot = int(m_hq.group(1)) - 1
                # Only HQ1..HQ3 have a slot.  Without this check "HQ0:" would
                # wrap around and overwrite HQ3, and "HQ4:" and above would
                # raise IndexError.
                if 0 <= slot < len(rec.hq_yields):
                    rec.hq_yields[slot] = split_item_qty(m_hq.group(2))
            continue

        # Otherwise treat as ingredient.
        rec.ingredients.append(split_item_qty(look))

    # Close the recipe that was still open at end of file.
    if rec is not None:
        _assign_crystal(rec)
        recipes.append(rec)

    return recipes
# ---------------------------------------------------------------------------
# Entrypoint
# ---------------------------------------------------------------------------
def main() -> None:
    """Command-line entry point: convert craft text files to CSV.

    With a craft name, only ``<craft>.txt`` in the datasets directory is
    converted.  With ``--all``, or when no craft name is given, every
    ``*.txt`` file found there is converted; the program exits with an error
    message if there are none.  A craft whose text file does not exist is
    reported with a warning and skipped.

    Examples:
        python recipes_to_csv_v2.py Woodworking
        python recipes_to_csv_v2.py --all
        python recipes_to_csv_v2.py
    """
    parser = argparse.ArgumentParser(description="Parse <Craft>.txt into CSV.")
    parser.add_argument("craft", nargs="?", help="Craft name, e.g. Woodworking, Smithing")
    parser.add_argument("--all", action="store_true", help="Process every .txt file in datasets/")
    options = parser.parse_args()

    # Work out the list of crafts to convert.
    if options.all or not options.craft:
        craft_names = [txt_file.stem for txt_file in DATASETS_DIR.glob("*.txt")]
        if not craft_names:
            raise SystemExit(f"No .txt files found in {DATASETS_DIR}")
    else:
        craft_names = [options.craft.strip()]

    header = [
        "category",
        "level",
        "subcrafts",
        "name",
        "crystal",
        "key_item",
        "ingredients",
        "hq_yields",
    ]

    for craft_name in craft_names:
        source = DATASETS_DIR / f"{craft_name}.txt"
        if not source.exists():
            print(f"[WARN] Dataset file not found: {source}")
            continue

        target = DATASETS_DIR / f"{craft_name}_v2.csv"
        parsed = parse(source, craft_name)

        target.parent.mkdir(parents=True, exist_ok=True)
        with target.open("w", newline="", encoding="utf-8") as handle:
            out = csv.writer(handle)
            out.writerow(header)
            out.writerows(recipe.row() for recipe in parsed)

        shown_path = target.relative_to(PROJECT_ROOT)
        print(f"Wrote {len(parsed)} recipes -> {shown_path}")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()