Files
Mog-Squire/scripts/woodworking_to_csv_v2.py
2025-07-07 13:39:46 +01:00

211 lines
6.6 KiB
Python

#!/usr/bin/env python3
"""Improved parser for Woodworking recipes -> Woodworking_v2.csv
This version follows the spec provided by the user:
Category: category name (without level range)
Level: recipe level (integer)
Sub-Crafts: JSON list of [name, level]
Name: product NQ name
Crystal: crystal element (Earth, Wind, etc.)
Key Item: key item name, or an empty field when the recipe needs none
Ingredients: JSON list of [name, quantity]
HQ Yields: JSON list ordered HQ1..HQ3 of [name, quantity] (nulls if N/A)
"""
from __future__ import annotations
import csv
import json
import pathlib
import re
from typing import Dict, List, Optional, Tuple
# Directory two levels above this file's resolved path (the parent of the
# directory that contains this script).
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
# Input text file that is parsed, and the CSV file that is written.
TXT_PATH = PROJECT_ROOT / "datasets/Woodworking.txt"
CSV_PATH = PROJECT_ROOT / "datasets/Woodworking_v2.csv"
# Whole-line category header: letters/apostrophes/spaces followed by " (N-M)".
# Group 1 is the text before the parenthesised range.
RE_CATEGORY = re.compile(r"^([A-Za-z' ]+) \([0-9]+-[0-9]+\)$")
# "Main Craft: Woodworking - (N)"; group 1 is the digits N.
RE_MAIN = re.compile(r"Main Craft: Woodworking - \((\d+)\)")
# "Sub Craft(s): ..."; group 1 is everything after the label.
RE_SUBCRAFTS = re.compile(r"Sub Craft\(s\): (.+)")
# "Key Item: ..."; group 1 is everything after the label.
RE_KEY_ITEM = re.compile(r"Key Item: (.+)")
# "NQ: ..."; group 1 is the text after the label and any whitespace.
RE_NQ = re.compile(r"NQ:\s*(.+)")
# "HQ<digit>: ..."; group 1 is the single digit, group 2 the text after it.
RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
# Text ending in "x<digits>" or "+<digits>"; group 1 is the (lazily matched)
# text before the suffix, group 2 the digits.
# NOTE(review): a trailing "+1" is captured as a quantity, so "Foo +1" yields
# group 1 "Foo" and group 2 "1" - confirm this is intended for "+1" item names.
RE_ITEM_QTY = re.compile(r"(.+?)\s*(?:x|\+)(\d+)$")
def norm(s: str) -> str:
    """Return *s* with each whitespace run collapsed to one space and the ends trimmed."""
    # Collapsing first turns any leading/trailing run into a single space,
    # which the final strip() then removes.
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()
def split_item_qty(text: str) -> Tuple[str, int]:
    """Split an item description into ``(name, quantity)``.

    The text is whitespace-normalised with ``norm`` and then matched against
    ``RE_ITEM_QTY``. When the pattern matches, its two groups supply the name
    and the integer quantity; otherwise the whole normalised text is the name
    and the quantity is 1.
    """
    cleaned = norm(text)
    found = RE_ITEM_QTY.match(cleaned)
    if found is None:
        return cleaned, 1
    return found.group(1), int(found.group(2))
class Recipe:
    """Mutable record holding the fields collected for one recipe."""

    __slots__ = (
        "category", "level", "subcrafts", "name",
        "crystal", "key_item", "ingredients", "hq_yields",
    )

    def __init__(self, category: str, level: int):
        """Create a recipe with the given category and level; all else starts empty."""
        self.category = category
        self.level = level
        self.name: str = ""
        self.crystal: str = ""
        self.key_item: Optional[str] = None
        # (name, level) pairs
        self.subcrafts: List[Tuple[str, int]] = []
        # (name, quantity) pairs
        self.ingredients: List[Tuple[str, int]] = []
        # Three slots, positions 0..2 standing for HQ1..HQ3; each is a
        # (name, quantity) pair or None.
        self.hq_yields: List[Optional[Tuple[str, int]]] = [None] * 3

    def row(self) -> List[str]:
        """Return the recipe as a list of eight strings, one per CSV column.

        The list-valued fields are serialised as JSON text; a missing key item
        becomes an empty string.
        """

        def as_json(value) -> str:
            return json.dumps(value, ensure_ascii=False)

        key_item_text = self.key_item if self.key_item else ""
        return [
            self.category,
            str(self.level),
            as_json(self.subcrafts),
            self.name,
            self.crystal,
            key_item_text,
            as_json(self.ingredients),
            as_json(self.hq_yields),
        ]
# ---------------------------------------------------------------------------
def parse() -> List[Recipe]:
    """Parse ``TXT_PATH`` into a list of :class:`Recipe` objects.

    The file is scanned line by line. A line matching ``RE_CATEGORY`` sets the
    category used for the recipes that follow. A line matching ``RE_MAIN``
    starts a recipe, which is then read in this order:

    1. optional "Sub Craft(s): ..." and "Key Item: ..." lines (blank lines in
       between are skipped);
    2. ingredient lines, up to the "NQ:" line (blank lines and lines ending in
       "-Icon.gif" are ignored);
    3. the "NQ:" line, which supplies the recipe name;
    4. any "HQ<n>:" lines, stored in ``hq_yields[n - 1]``.

    The first ingredient whose name contains "Crystal" supplies the crystal
    (its first word); every ingredient containing "Crystal" is then removed
    from the ingredient list. Without one, the crystal is "Unknown".

    Returns:
        The recipes in the order they appear in the file.

    Raises:
        OSError: if ``TXT_PATH`` cannot be read.
    """
    lines = TXT_PATH.read_text(encoding="utf-8", errors="ignore").splitlines()
    n = len(lines)
    # Compiled once here rather than looked up again for every sub-craft entry.
    re_subcraft = re.compile(r"([A-Za-z]+) - \((\d+)\)")
    recipes: List[Recipe] = []
    current_category = ""
    i = 0
    while i < n:
        line = lines[i].strip()

        # Category header
        if m_cat := RE_CATEGORY.match(line):
            current_category = m_cat.group(1)
            i += 1
            continue

        # Anything that is not a "Main Craft" line is skipped.
        m_main = RE_MAIN.search(line)
        if not m_main:
            i += 1
            continue

        rec = Recipe(current_category, int(m_main.group(1)))
        i += 1

        # Optional Sub Craft(s) / Key Item lines; the first other non-blank
        # line ends the metadata section and is left for the next stage.
        while i < n:
            look = lines[i].strip()
            if not look:
                pass
            elif m_sub := RE_SUBCRAFTS.match(look):
                for part in m_sub.group(1).split(","):
                    if m_sc := re_subcraft.match(part.strip()):
                        rec.subcrafts.append((m_sc.group(1), int(m_sc.group(2))))
            elif m_key := RE_KEY_ITEM.match(look):
                rec.key_item = m_key.group(1)
            else:
                break
            i += 1

        # Ingredients run up to the NQ line. Also stop at the next
        # "Main Craft" line so that a recipe with a missing NQ line cannot
        # swallow the recipes that follow it.
        while i < n:
            ingr_line = lines[i].strip()
            if ingr_line.startswith("NQ:") or RE_MAIN.search(ingr_line):
                break
            if ingr_line and not ingr_line.endswith("-Icon.gif"):
                rec.ingredients.append(split_item_qty(ingr_line))
            i += 1

        # Determine crystal (must contain 'Crystal'), e.g. "Earth Crystal" -> "Earth"
        crystal_name = next(
            (name for name, _ in rec.ingredients if "Crystal" in name), None
        )
        if crystal_name is None:
            rec.crystal = "Unknown"
        else:
            rec.crystal = crystal_name.split()[0]
            # Remove crystal from ingredient list
            rec.ingredients = [
                pair for pair in rec.ingredients if "Crystal" not in pair[0]
            ]

        # NQ line. It is consumed even when it carries no text; a line that is
        # not an NQ line (the next "Main Craft") is left for the outer loop.
        if i < n and lines[i].lstrip().startswith("NQ:"):
            if m_nq := RE_NQ.match(lines[i].strip()):
                rec.name, _ = split_item_qty(m_nq.group(1))
            i += 1

        # Skip blank lines before HQ entries
        while i < n and not lines[i].strip():
            i += 1

        # HQ lines
        while i < n and lines[i].lstrip().startswith("HQ"):
            if m_hq := RE_HQ.match(lines[i].strip()):
                idx = int(m_hq.group(1)) - 1
                # Only HQ1..HQ3 have a slot. Without this check "HQ0" would
                # overwrite the last slot (index -1) and "HQ4"+ would raise
                # IndexError.
                if 0 <= idx < len(rec.hq_yields):
                    rec.hq_yields[idx] = split_item_qty(m_hq.group(2))
            i += 1

        recipes.append(rec)
    return recipes
# ---------------------------------------------------------------------------
def main() -> None:
    """Parse the recipes and write them, with a header row, to ``CSV_PATH``."""
    header = [
        "category",
        "level",
        "subcrafts",
        "name",
        "crystal",
        "key_item",
        "ingredients",
        "hq_yields",
    ]
    recs = parse()
    # Make sure the output directory exists before opening the file.
    CSV_PATH.parent.mkdir(parents=True, exist_ok=True)
    with CSV_PATH.open("w", newline="", encoding="utf-8") as out:
        writer = csv.writer(out)
        writer.writerow(header)
        writer.writerows(rec.row() for rec in recs)
    print(f"Wrote {len(recs)} recipes -> {CSV_PATH.relative_to(PROJECT_ROOT)}")
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()