Initial commit

This commit is contained in:
Aodhan
2025-07-07 13:39:46 +01:00
commit cfa2eff6ef
69 changed files with 70452 additions and 0 deletions

View File

@@ -0,0 +1,201 @@
#!/usr/bin/env python3
"""Parse the semi-structured datasets/Woodworking.txt file and export it as a CSV.
The output CSV will be written to datasets/Woodworking.csv with the following columns:
category Text group header e.g. Amateur (1-10)
level Woodworking skill level for the recipe (integer)
product_name Produced item (without quantity suffix)
nq_yield Quantity produced on a normal quality synth (may be empty)
hq1_yield Quantity produced on HQ1 synth (may be empty)
hq2_yield Quantity produced on HQ2 synth (may be empty)
hq3_yield Quantity produced on HQ3 synth (may be empty)
crystal Crystal used for the synth (Wind Crystal, Earth Crystal, etc.)
ingredients Semi-colon-separated list of remaining ingredients (excluding the crystal)
Run the script from the project root:
python3 scripts/woodworking_to_csv.py
"""
from __future__ import annotations
import csv
import pathlib
import re
from typing import List, Optional
# Directory one level above the folder that contains this script
# (parents[0] is the script's own folder, parents[1] is its parent).
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
# Input text file and the CSV generated from it, both under <root>/datasets.
TXT_PATH = PROJECT_ROOT / "datasets/Woodworking.txt"
CSV_PATH = PROJECT_ROOT / "datasets/Woodworking.csv"
# --- Regex patterns ---------------------------------------------------------
# Whole-line category header: letters/apostrophes/spaces, then " (N-M)",
# e.g. "Amateur (1-10)".
RE_CATEGORY = re.compile(r"^[A-Za-z' ]+ \([0-9]+-[0-9]+\)$")
# Recipe header; group 1 captures the digits inside the parentheses (the level).
RE_MAIN_CRAFT = re.compile(r"Main Craft: Woodworking - \((\d+)\)")
# "NQ: <text>"; group 1 is everything after the label (at least one character).
RE_NQ = re.compile(r"NQ:\s*(.+)")
# "HQ<d>: <text>"; group 1 is the single tier digit, group 2 the remaining text.
RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
# "<name> x<digits>" at end of string; group 1 is the name, group 2 the digits.
RE_ITEM_QTY = re.compile(r"(.+?)\s+x(\d+)$")
def normalise_whitespace(text: str) -> str:
    """Return *text* trimmed, with every whitespace run replaced by one space."""
    # Splitting on whitespace discards leading/trailing runs and collapses
    # interior ones; re-joining with a single space yields the normal form.
    words = text.split()
    return " ".join(words)
class Recipe:
    """A single parsed recipe; its attributes mirror the CSV columns.

    ``category`` and ``level`` are supplied at construction. Every other
    text attribute starts as an empty string and ``ingredients`` starts as
    an empty list; they are filled in afterwards by the helper methods or
    by direct assignment.
    """

    __slots__ = (
        "category",
        "level",
        "product_name",
        "nq_yield",
        "hq1_yield",
        "hq2_yield",
        "hq3_yield",
        "crystal",
        "ingredients",
    )

    def __init__(self, category: str, level: int):
        self.category = category
        self.level = level
        # All remaining text fields default to "" until the parser sets them.
        for blank_field in (
            "product_name",
            "nq_yield",
            "hq1_yield",
            "hq2_yield",
            "hq3_yield",
            "crystal",
        ):
            setattr(self, blank_field, "")
        self.ingredients = []

    # ------------------------------------------------------------------
    # Helper methods
    # ------------------------------------------------------------------
    def _set_product(self, text: str, nq: bool = False) -> None:
        """Store the product name from *text*; also store its quantity as the NQ yield when *nq* is true."""
        item, amount = self._split_item_qty(text)
        self.product_name = item
        if not nq:
            return
        self.nq_yield = amount

    def _add_hq(self, idx: int, text: str) -> None:
        """Record the quantity found in *text* as the yield of HQ tier *idx* (1, 2 or 3).

        Any other tier number is ignored.
        """
        amount = self._split_item_qty(text)[1]
        tier_fields = {1: "hq1_yield", 2: "hq2_yield", 3: "hq3_yield"}
        target = tier_fields.get(idx)
        if target is not None:
            setattr(self, target, amount)

    @staticmethod
    def _split_item_qty(text: str) -> tuple[str, str]:
        """Split ``"<name> x<digits>"`` into ``(name, digits)``.

        The text is whitespace-normalised first. When there is no trailing
        ``x<digits>`` suffix the whole normalised text is returned as the
        name and the quantity is an empty string.
        """
        cleaned = normalise_whitespace(text)
        found = RE_ITEM_QTY.match(cleaned)
        if found is None:
            return cleaned, ""
        return found.group(1), found.group(2)

    def to_row(self) -> List[str]:
        """Return the recipe as a list of strings in CSV column order."""
        row = [self.category, str(self.level)]
        row.extend(
            getattr(self, column)
            for column in (
                "product_name",
                "nq_yield",
                "hq1_yield",
                "hq2_yield",
                "hq3_yield",
                "crystal",
            )
        )
        # Ingredients are flattened into one "; "-separated cell.
        row.append("; ".join(self.ingredients))
        return row
# ----------------------------------------------------------------------------
# Parsing logic
# ----------------------------------------------------------------------------
def parse_file(txt_path: pathlib.Path) -> List[Recipe]:
    """Parse the semi-structured recipe text file into ``Recipe`` objects.

    The file is scanned line by line:

    * a line matching ``RE_CATEGORY`` becomes the category of all recipes
      that follow it;
    * a line matching ``RE_MAIN_CRAFT`` opens a recipe block. The non-blank
      lines after it, up to the ``NQ:`` line, are the ingredients (the first
      one is moved to ``crystal`` when it contains "Crystal"); the ``NQ:``
      line gives the product, and any directly following ``HQ<n>:`` lines
      give the HQ yields.

    A block that reaches the next recipe header, the next category header,
    or the end of the file before an ``NQ:`` line is treated as malformed
    and dropped. The header that ended it is left for the main loop, so the
    following recipe is still parsed correctly instead of being merged into
    the broken one.

    Args:
        txt_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        The recipes in file order.
    """
    recipes: List[Recipe] = []
    current_category = ""
    # NOTE: errors="ignore" silently drops any bytes that are not valid UTF-8.
    lines = txt_path.read_text(encoding="utf-8", errors="ignore").splitlines()
    i = 0
    n = len(lines)
    while i < n:
        line = lines[i].strip()

        # Update category headers, e.g. "Amateur (1-10)".
        if RE_CATEGORY.match(line):
            current_category = line
            i += 1
            continue

        # Anything that is not the start of a recipe block is skipped.
        m_main = RE_MAIN_CRAFT.search(line)
        if not m_main:
            i += 1
            continue

        rec = Recipe(current_category, int(m_main.group(1)))
        i += 1

        # Collect ingredients until the NQ line. Also stop at the next
        # recipe/category header so that a block with a missing NQ line
        # cannot swallow the blocks that follow it.
        while i < n:
            ing_line = lines[i].strip()
            if ing_line.startswith("NQ:"):
                break
            if RE_CATEGORY.match(ing_line) or RE_MAIN_CRAFT.search(ing_line):
                break
            if ing_line:
                rec.ingredients.append(normalise_whitespace(ing_line))
            i += 1

        # Extract crystal (first ingredient if it contains "Crystal").
        if rec.ingredients and "Crystal" in rec.ingredients[0]:
            rec.crystal = rec.ingredients.pop(0)

        # Ran off the end of the file without finding an NQ line.
        if i >= n:
            break

        m_nq = RE_NQ.match(lines[i].strip())
        if m_nq is None:
            # Malformed entry - skip ahead. An "NQ:" line with no product is
            # consumed; a header line is left in place for the main loop.
            if lines[i].lstrip().startswith("NQ:"):
                i += 1
            continue
        rec._set_product(m_nq.group(1), nq=True)
        i += 1

        # Collect HQ lines (0-3 lines).
        while i < n and lines[i].lstrip().startswith("HQ"):
            m_hq = RE_HQ.match(lines[i].strip())
            if m_hq:
                rec._add_hq(int(m_hq.group(1)), m_hq.group(2))
            i += 1

        # i already points at the first line after this block.
        recipes.append(rec)
    return recipes
# ----------------------------------------------------------------------------
# CSV writer
# ----------------------------------------------------------------------------
def write_csv(recipes: List[Recipe], csv_path: pathlib.Path) -> None:
    """Write *recipes* to *csv_path* as CSV: one header row, then one row per recipe.

    Missing parent directories of *csv_path* are created first, and an
    existing file at that path is overwritten. Each data row is whatever the
    recipe's ``to_row()`` returns.
    """
    header = [
        "category",
        "level",
        "product_name",
        "nq_yield",
        "hq1_yield",
        "hq2_yield",
        "hq3_yield",
        "crystal",
        "ingredients",
    ]
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    # newline="" lets the csv module control line endings itself.
    with csv_path.open("w", newline="", encoding="utf-8") as out_file:
        out = csv.writer(out_file)
        out.writerow(header)
        out.writerows(recipe.to_row() for recipe in recipes)
if __name__ == "__main__":
    # Script entry point: convert the text dataset to CSV, or exit with a
    # message when the input file is missing.
    if TXT_PATH.exists():
        recs = parse_file(TXT_PATH)
        write_csv(recs, CSV_PATH)
        # Report the output location relative to the project root.
        shown_path = CSV_PATH.relative_to(PROJECT_ROOT)
        print(f"Parsed {len(recs)} recipes -> {shown_path}")
    else:
        raise SystemExit(f"Input file not found: {TXT_PATH}")