202 lines
6.5 KiB
Python
202 lines
6.5 KiB
Python
#!/usr/bin/env python3
|
||
"""Parse the semi-structured datasets/Woodworking.txt file and export it as a CSV.
|
||
|
||
The output CSV will be written to datasets/Woodworking.csv with the following columns:
    category      – Text group header, e.g. Amateur (1-10)
    level         – Woodworking skill level for the recipe (integer)
    product_name  – Produced item (without quantity suffix)
    nq_yield      – Quantity produced on a normal-quality (NQ) synth (may be empty)
    hq1_yield     – Quantity produced on an HQ1 synth (may be empty)
    hq2_yield     – Quantity produced on an HQ2 synth (may be empty)
    hq3_yield     – Quantity produced on an HQ3 synth (may be empty)
    crystal       – Crystal used for the synth (Wind Crystal, Earth Crystal, etc.)
    ingredients   – Semicolon-separated list of the remaining ingredients (excluding the crystal)

Run the script from the project root:
    python3 scripts/woodworking_to_csv.py
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import csv
|
||
import pathlib
|
||
import re
|
||
from typing import List, Optional
|
||
|
||
# Repository root. The script is run as scripts/woodworking_to_csv.py, so the
# root is one level above the directory that contains this file: parents[1].
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]

# Semi-structured recipe dump that serves as the parser's input.
TXT_PATH = PROJECT_ROOT / "datasets/Woodworking.txt"

# Destination of the generated CSV export.
CSV_PATH = PROJECT_ROOT / "datasets/Woodworking.csv"
|
||
|
||
# --- Regex patterns ---------------------------------------------------------
|
||
# A whole-line category header such as "Amateur (1-10)": words, then a
# parenthesised numeric range.
RE_CATEGORY = re.compile(r"^[A-Za-z' ]+ \([0-9]+-[0-9]+\)$")

# First line of a recipe block; group 1 is the skill level in the parentheses.
RE_MAIN_CRAFT = re.compile(r"Main Craft: Woodworking - \((\d+)\)")

# Normal-quality result line; group 1 is the text following "NQ:".
RE_NQ = re.compile(r"NQ:\s*(.+)")

# High-quality result line; group 1 is the tier digit, group 2 the result text.
RE_HQ = re.compile(r"HQ(\d):\s*(.+)")

# Item text ending in an " xN" quantity suffix; group 1 is the item name,
# group 2 the quantity digits.
RE_ITEM_QTY = re.compile(r"(.+?)\s+x(\d+)$")
|
||
|
||
|
||
def normalise_whitespace(text: str) -> str:
    """Return *text* trimmed, with each internal whitespace run squeezed to one space."""
    trimmed = text.strip()
    squeezed = re.sub(r"\s+", " ", trimmed)
    return squeezed
|
||
|
||
|
||
class Recipe:
    """One synthesis recipe, filled in field by field while the text is parsed.

    The attributes correspond one-to-one to the CSV columns emitted by
    ``to_row``.  Yields are kept as strings; an empty string means the
    quantity was not given.
    """

    __slots__ = (
        "category",
        "level",
        "product_name",
        "nq_yield",
        "hq1_yield",
        "hq2_yield",
        "hq3_yield",
        "crystal",
        "ingredients",
    )

    def __init__(self, category: str, level: int):
        """Create an empty recipe belonging to *category* at skill *level*."""
        self.category = category
        self.level = level
        # Everything below starts blank and is populated later.
        self.product_name = self.crystal = ""
        self.nq_yield = self.hq1_yield = self.hq2_yield = self.hq3_yield = ""
        self.ingredients: List[str] = []

    # ------------------------------------------------------------------
    # Helper methods
    # ------------------------------------------------------------------
    def _set_product(self, text: str, nq: bool = False) -> None:
        """Store the product name found in *text*.

        When *nq* is true the quantity suffix of *text* (possibly empty) is
        also recorded as the normal-quality yield.
        """
        self.product_name, quantity = self._split_item_qty(text)
        if nq:
            self.nq_yield = quantity

    def _add_hq(self, idx: int, text: str) -> None:
        """Record the quantity suffix of *text* as the yield of HQ tier *idx*.

        Tiers other than 1, 2 and 3 are ignored.
        """
        _, quantity = self._split_item_qty(text)
        target = {1: "hq1_yield", 2: "hq2_yield", 3: "hq3_yield"}.get(idx)
        if target is not None:
            setattr(self, target, quantity)

    @staticmethod
    def _split_item_qty(text: str) -> tuple[str, str]:
        """Split ``"Item xN"`` into ``("Item", "N")``.

        Whitespace is normalised first.  Text without a trailing ``xN``
        suffix is returned whole, paired with an empty quantity.
        """
        cleaned = normalise_whitespace(text)
        found = RE_ITEM_QTY.match(cleaned)
        if found is None:
            return cleaned, ""
        return found.group(1), found.group(2)

    def to_row(self) -> List[str]:
        """Return the recipe as a list of CSV cell strings, in column order."""
        yields = (self.nq_yield, self.hq1_yield, self.hq2_yield, self.hq3_yield)
        return [
            self.category,
            str(self.level),
            self.product_name,
            *yields,
            self.crystal,
            "; ".join(self.ingredients),
        ]
|
||
|
||
|
||
# ----------------------------------------------------------------------------
|
||
# Parsing logic
|
||
# ----------------------------------------------------------------------------
|
||
|
||
def parse_file(txt_path: pathlib.Path) -> List[Recipe]:
    """Parse the semi-structured recipe text at *txt_path* into ``Recipe`` objects.

    The file is scanned line by line:

    * A line matching ``RE_CATEGORY`` (e.g. ``Amateur (1-10)``) becomes the
      category of every recipe that follows it.
    * A line containing ``Main Craft: Woodworking - (N)`` opens a recipe of
      skill level ``N``.
    * The non-blank lines after it are ingredients, up to the ``NQ:`` line.
      If the first ingredient contains ``"Crystal"`` it is moved to the
      recipe's ``crystal`` field.
    * The ``NQ:`` line supplies the product name and normal-quality yield.
    * ``HQ1:``/``HQ2:``/``HQ3:`` lines directly after it supply the HQ yields.

    A recipe block that reaches the next recipe header, the next category
    header or the end of the file without a usable ``NQ:`` line is treated
    as malformed and dropped.  The line that ended it is then parsed
    normally, so the following recipe and category are not lost.

    Args:
        txt_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        The successfully parsed recipes, in file order.
    """
    recipes: List[Recipe] = []
    current_category = ""
    # errors="ignore": undecodable bytes are dropped instead of aborting the run.
    lines = txt_path.read_text(encoding="utf-8", errors="ignore").splitlines()
    i = 0
    n = len(lines)

    while i < n:
        line = lines[i].strip()

        # Update category headers e.g. "Amateur (1-10)"
        if RE_CATEGORY.match(line):
            current_category = line
            i += 1
            continue

        # Anything that does not open a recipe block is skipped.
        m_main = RE_MAIN_CRAFT.search(line)
        if not m_main:
            i += 1
            continue

        rec = Recipe(current_category, int(m_main.group(1)))
        i += 1

        # Collect ingredients until the NQ line.  Also stop at the next
        # category or recipe header: without this, a block lacking an NQ line
        # would absorb the following recipe's header as an ingredient and
        # claim that recipe's NQ/HQ lines as its own.
        while i < n:
            ing_line = lines[i].strip()
            if ing_line.startswith("NQ:"):
                break
            if RE_CATEGORY.match(ing_line) or RE_MAIN_CRAFT.search(ing_line):
                break
            if ing_line:
                rec.ingredients.append(normalise_whitespace(ing_line))
            i += 1

        # Extract crystal (first ingredient if it contains "Crystal")
        if rec.ingredients and "Crystal" in rec.ingredients[0]:
            rec.crystal = rec.ingredients.pop(0)

        # We are now at the NQ line, at a block boundary, or at end of file.
        stop_line = lines[i].strip() if i < n else ""
        m_nq = RE_NQ.match(stop_line)
        if not m_nq:
            # Malformed entry: drop it.  Only a bare "NQ:" line is consumed
            # here; a header line is left in place for the outer loop.
            if stop_line.startswith("NQ:"):
                i += 1
            continue

        rec._set_product(m_nq.group(1), nq=True)
        i += 1

        # Collect HQ lines (0-3 lines) that directly follow the NQ line.
        while i < n and lines[i].lstrip().startswith("HQ"):
            m_hq = RE_HQ.match(lines[i].strip())
            if m_hq:
                rec._add_hq(int(m_hq.group(1)), m_hq.group(2))
            i += 1

        recipes.append(rec)
        # i already points at the first unconsumed line; no extra increment.

    return recipes
|
||
|
||
|
||
# ----------------------------------------------------------------------------
|
||
# CSV writer
|
||
# ----------------------------------------------------------------------------
|
||
|
||
def write_csv(recipes: List[Recipe], csv_path: pathlib.Path) -> None:
    """Write *recipes* to *csv_path* as a UTF-8 CSV file with a header row.

    Missing parent directories of *csv_path* are created, and an existing
    file at that path is overwritten.  Each data row is whatever the
    recipe's ``to_row()`` returns.
    """
    header = (
        "category",
        "level",
        "product_name",
        "nq_yield",
        "hq1_yield",
        "hq2_yield",
        "hq3_yield",
        "crystal",
        "ingredients",
    )

    csv_path.parent.mkdir(parents=True, exist_ok=True)
    # newline="" leaves line-ending handling to the csv module.
    with csv_path.open("w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(recipe.to_row() for recipe in recipes)
|
||
|
||
|
||
if __name__ == "__main__":
    # Exit with a one-line message when the dataset is absent, instead of
    # letting the read fail with a traceback.
    if not TXT_PATH.exists():
        raise SystemExit(f"Input file not found: {TXT_PATH}")

    parsed = parse_file(TXT_PATH)
    write_csv(parsed, CSV_PATH)

    # Report the output location relative to the project root.
    shown_path = CSV_PATH.relative_to(PROJECT_ROOT)
    print(f"Parsed {len(parsed)} recipes -> {shown_path}")
|