Files
Mog-Squire/scripts/woodworking_to_csv.py
2025-07-07 13:39:46 +01:00

202 lines
6.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Parse the semi-structured datasets/Woodworking.txt file and export it as a CSV.

The output CSV is written to datasets/Woodworking.csv with the following columns:

    category      Text group header, e.g. "Amateur (1-10)"
    level         Woodworking skill level for the recipe (integer)
    product_name  Produced item (without quantity suffix)
    nq_yield      Quantity produced on a normal-quality synth (may be empty)
    hq1_yield     Quantity produced on an HQ1 synth (may be empty)
    hq2_yield     Quantity produced on an HQ2 synth (may be empty)
    hq3_yield     Quantity produced on an HQ3 synth (may be empty)
    crystal       Crystal used for the synth (Wind Crystal, Earth Crystal, etc.)
    ingredients   Semicolon-separated list of the remaining ingredients (excluding the crystal)

Run the script from the project root:

    python3 scripts/woodworking_to_csv.py
"""
from __future__ import annotations
import csv
import pathlib
import re
from typing import List, Optional
# Grandparent directory of this file, i.e. the directory that contains the
# folder this script lives in; input and output paths are resolved against it.
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
TXT_PATH = PROJECT_ROOT / "datasets/Woodworking.txt"
CSV_PATH = PROJECT_ROOT / "datasets/Woodworking.csv"
# --- Regex patterns ---------------------------------------------------------
# Whole-line category header: letters, apostrophes and spaces followed by a
# parenthesised numeric range, e.g. "Amateur (1-10)".
RE_CATEGORY = re.compile(r"^[A-Za-z' ]+ \([0-9]+-[0-9]+\)$")
# Marks the start of a recipe block; group 1 is the number in parentheses.
RE_MAIN_CRAFT = re.compile(r"Main Craft: Woodworking - \((\d+)\)")
# "NQ:" line; group 1 is everything after the label (at least one character).
RE_NQ = re.compile(r"NQ:\s*(.+)")
# "HQ<digit>:" line; group 1 is the tier digit, group 2 the text after the label.
RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
# Item text ending in a " x<digits>" suffix; group 1 is the name (non-greedy),
# group 2 the digits.
RE_ITEM_QTY = re.compile(r"(.+?)\s+x(\d+)$")
def normalise_whitespace(text: str) -> str:
    """Return *text* trimmed, with each run of whitespace replaced by one space."""
    trimmed = text.strip()
    return re.sub(r"\s+", " ", trimmed)
class Recipe:
    """A single parsed recipe, i.e. one row of the output CSV.

    ``category`` and ``level`` are supplied at construction time; every other
    field starts empty and is filled in by the parser through the helper
    methods below.  Yields are kept as text, with ``""`` meaning "not given".
    """

    __slots__ = (
        "category",
        "level",
        "product_name",
        "nq_yield",
        "hq1_yield",
        "hq2_yield",
        "hq3_yield",
        "crystal",
        "ingredients",
    )

    def __init__(self, category: str, level: int):
        self.category: str = category
        self.level: int = level
        self.product_name: str = ""
        # All yields default to the empty string until a matching line is seen.
        self.nq_yield = self.hq1_yield = self.hq2_yield = self.hq3_yield = ""
        self.crystal: str = ""
        self.ingredients: List[str] = []

    # ------------------------------------------------------------------
    # Helper methods
    # ------------------------------------------------------------------
    def _set_product(self, text: str, nq: bool = False) -> None:
        """Store the product name from *text*; also store its quantity as the
        NQ yield when *nq* is true."""
        self.product_name, amount = self._split_item_qty(text)
        if nq:
            self.nq_yield = amount

    def _add_hq(self, idx: int, text: str) -> None:
        """Record the quantity found in *text* as the yield for HQ tier *idx*.

        Tiers other than 1, 2 and 3 are ignored.
        """
        _unused_name, amount = self._split_item_qty(text)
        slot = {1: "hq1_yield", 2: "hq2_yield", 3: "hq3_yield"}.get(idx)
        if slot is not None:
            setattr(self, slot, amount)

    @staticmethod
    def _split_item_qty(text: str) -> tuple[str, str]:
        """Split ``"<name> x<digits>"`` into ``(name, digits)``.

        Whitespace is normalised first.  When there is no quantity suffix the
        normalised text is returned with an empty quantity.
        """
        cleaned = normalise_whitespace(text)
        found = RE_ITEM_QTY.match(cleaned)
        if found is None:
            return cleaned, ""
        return found.group(1), found.group(2)

    def to_row(self) -> List[str]:
        """Return the recipe as a list of strings in CSV column order.

        Ingredients are joined with ``"; "``.
        """
        leading = (
            self.category,
            str(self.level),
            self.product_name,
            self.nq_yield,
            self.hq1_yield,
            self.hq2_yield,
            self.hq3_yield,
            self.crystal,
        )
        return [*leading, "; ".join(self.ingredients)]
# ----------------------------------------------------------------------------
# Parsing logic
# ----------------------------------------------------------------------------
def parse_file(txt_path: pathlib.Path) -> List[Recipe]:
    """Parse the recipe text file at *txt_path* into ``Recipe`` objects.

    The file is scanned line by line, each line stripped of surrounding
    whitespace:

    * a line matching ``RE_CATEGORY`` becomes the category of every recipe
      that follows it;
    * a line matching ``RE_MAIN_CRAFT`` opens a recipe block.  The non-blank
      lines after it, up to the ``NQ:`` line, are its ingredients; if the
      first ingredient contains "Crystal" it is stored as the crystal instead;
    * the ``NQ:`` line supplies the product name and normal-quality yield;
    * the ``HQ<n>:`` lines directly after it supply the HQ1-HQ3 yields.

    A block without a usable ``NQ:`` line is dropped.  Ingredient collection
    stops at the next recipe or category header, so a block that is missing
    its ``NQ:`` line cannot absorb the headers of the recipes that follow it.

    Args:
        txt_path: Path of the text file to read.  It is decoded as UTF-8 and
            undecodable bytes are ignored.

    Returns:
        The parsed recipes, in file order.
    """
    recipes: List[Recipe] = []
    current_category = ""
    raw_text = txt_path.read_text(encoding="utf-8", errors="ignore")
    # Strip once up front; every check below works on the stripped form.
    lines = [raw.strip() for raw in raw_text.splitlines()]
    n = len(lines)
    i = 0
    while i < n:
        line = lines[i]

        # Category header, e.g. "Amateur (1-10)".
        if RE_CATEGORY.match(line):
            current_category = line
            i += 1
            continue

        m_main = RE_MAIN_CRAFT.search(line)
        if m_main is None:
            # Not part of any structure we recognise; move on.
            i += 1
            continue

        # Start of a recipe block.
        rec = Recipe(current_category, int(m_main.group(1)))
        i += 1

        # Collect ingredients until the NQ line.  Also stop at the next
        # recipe/category header: reaching one means this block has no NQ
        # line, and that header must be left for the outer loop to handle.
        hit_next_header = False
        while i < n and not lines[i].startswith("NQ:"):
            candidate = lines[i]
            if RE_MAIN_CRAFT.search(candidate) or RE_CATEGORY.match(candidate):
                hit_next_header = True
                break
            if candidate:
                rec.ingredients.append(normalise_whitespace(candidate))
            i += 1

        if hit_next_header:
            # Malformed block: drop it without consuming the header line.
            continue

        m_nq = RE_NQ.match(lines[i]) if i < n else None
        if m_nq is None:
            # Malformed block (end of file, or an "NQ:" line with no text):
            # step past the current line and drop the recipe.
            i += 1
            continue

        # The crystal is the first ingredient when it contains "Crystal".
        if rec.ingredients and "Crystal" in rec.ingredients[0]:
            rec.crystal = rec.ingredients.pop(0)

        rec._set_product(m_nq.group(1), nq=True)
        i += 1

        # Consume the run of HQ lines that follows (possibly none).  Lines
        # that start with "HQ" but do not match RE_HQ are skipped.
        while i < n and lines[i].startswith("HQ"):
            m_hq = RE_HQ.match(lines[i])
            if m_hq:
                rec._add_hq(int(m_hq.group(1)), m_hq.group(2))
            i += 1

        recipes.append(rec)
        # `i` already points at the first unconsumed line; no extra increment.
    return recipes
# ----------------------------------------------------------------------------
# CSV writer
# ----------------------------------------------------------------------------
def write_csv(recipes: List[Recipe], csv_path: pathlib.Path) -> None:
    """Write *recipes* to *csv_path* as a UTF-8 CSV file with a header row.

    The parent directory of *csv_path* is created if it does not exist, and
    an existing file at that path is overwritten.  Each data row is whatever
    the recipe's ``to_row()`` returns.
    """
    header = [
        "category",
        "level",
        "product_name",
        "nq_yield",
        "hq1_yield",
        "hq2_yield",
        "hq3_yield",
        "crystal",
        "ingredients",
    ]
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    with csv_path.open("w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(recipe.to_row() for recipe in recipes)
if __name__ == "__main__":
    # Script entry point: convert the text dataset to CSV, or exit with an
    # error message when the input file is missing.
    if TXT_PATH.exists():
        recs = parse_file(TXT_PATH)
        write_csv(recs, CSV_PATH)
        print(f"Parsed {len(recs)} recipes -> {CSV_PATH.relative_to(PROJECT_ROOT)}")
    else:
        raise SystemExit(f"Input file not found: {TXT_PATH}")