Initial commit

This commit is contained in:
Aodhan
2025-07-07 13:39:46 +01:00
commit cfa2eff6ef
69 changed files with 70452 additions and 0 deletions

View File

@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""Generic recipe parser for crafting disciplines -> <Craft>_v2.csv
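Usage:
    python recipes_to_csv_v2.py <CRAFT>
    python recipes_to_csv_v2.py --all
Where <CRAFT> matches the name (without the extension) of a .txt file in the
datasets directory, for example `Woodworking`, `Smithing`, `Goldsmithing`,
`Alchemy`, etc. With `--all`, or when no craft is given, every .txt file in
the datasets directory is processed.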
The script produces a CSV `<Craft>_v2.csv` inside the datasets directory
having the following columns (identical to the Woodworking v2 spec):
category, level, subcrafts, name, crystal, key_item, ingredients, hq_yields
See scripts/README.md for details of each column.
"""
from __future__ import annotations
import argparse
import csv
import json
import pathlib
import re
from typing import List, Optional, Tuple
# Project root: the parent of the directory that contains this script
# (parents[0] is the script's own directory, parents[1] is one level above it).
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
# Directory that main() reads the <Craft>.txt inputs from and writes the
# <Craft>_v2.csv outputs to.
DATASETS_DIR = PROJECT_ROOT / "datasets"
# ---------------------------------------------------------------------------
# Regex helpers (compiled at runtime where craft-dependent)
# ---------------------------------------------------------------------------
# Category header occupying a whole line: a name made of letters, apostrophes
# and spaces, followed by a numeric range in parentheses ("<name> (<lo>-<hi>)").
# Group 1 is the name without the range.
RE_CATEGORY = re.compile(r"^([A-Za-z' ]+) \([0-9]+-[0-9]+\)$")
# "Sub Craft(s): ..." -- group 1 is everything after the label.
RE_SUBCRAFTS = re.compile(r"Sub Craft\(s\): (.+)")
# "Key Item: ..." -- group 1 is everything after the label.
RE_KEY_ITEM = re.compile(r"Key Item: (.+)")
# "NQ: ..." -- group 1 is the text after the label, leading whitespace skipped.
# NOTE(review): not referenced by the code visible in this part of the file.
RE_NQ = re.compile(r"NQ:\s*(.+)")
# "HQ<digit>: ..." -- group 1 is the single tier digit, group 2 the text after
# the label.
RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
# Quantity delimiter is strictly 'xN' (e.g., "Lumber x6").
# Patterns like "+1" denote HQ variant and should be preserved in the name.
# Group 1 is the (lazily matched) item name, group 2 the quantity digits. The
# whitespace before 'x' is optional and, because of IGNORECASE, 'X' is accepted too.
RE_ITEM_QTY = re.compile(r"(.+?)\s*x(\d+)$", re.IGNORECASE)
# ---------------------------------------------------------------------------
# Helper functions / dataclasses
# ---------------------------------------------------------------------------
def norm(s: str) -> str:
    """Trim *s* and collapse every internal whitespace run to one space."""
    # After strip() there is no leading/trailing whitespace, so splitting on
    # whitespace runs never yields empty edge fields.
    words = re.split(r"\s+", s.strip())
    return " ".join(words)
def split_item_qty(text: str) -> Tuple[str, int]:
    """Separate a trailing "xN" quantity from an item description.

    The text is whitespace-normalised first. "Foo x3" gives ("Foo", 3); text
    without a trailing quantity is returned whole with a quantity of 1.
    """
    cleaned = norm(text)
    found = RE_ITEM_QTY.match(cleaned)
    if found is None:
        # No "xN" suffix: the whole text is the name.
        return cleaned, 1
    return found.group(1), int(found.group(2))
class Recipe:
    """Container for one parsed recipe; one instance becomes one CSV row.

    Attributes are restricted by ``__slots__``. ``category`` and ``level``
    come from the constructor; everything else starts empty and is filled in
    by the parser.
    """

    __slots__ = (
        "category", "level", "subcrafts", "name",
        "crystal", "key_item", "ingredients", "hq_yields",
    )

    def __init__(self, category: str, level: int):
        self.category = category
        self.level = level
        self.name: str = ""
        self.crystal: str = ""
        self.key_item: Optional[str] = None
        # (craft name, level) pairs.
        self.subcrafts: List[Tuple[str, int]] = []
        # (item name, quantity) pairs.
        self.ingredients: List[Tuple[str, int]] = []
        # One slot per HQ tier (HQ1..HQ3); None where no yield was recorded.
        self.hq_yields: List[Optional[Tuple[str, int]]] = [None] * 3

    def row(self) -> List[str]:
        """Return the recipe as a list of CSV cell strings.

        The order is: category, level, subcrafts, name, crystal, key_item,
        ingredients, hq_yields. The three list-valued fields are JSON-encoded
        (non-ASCII characters kept as-is); a missing key item becomes "".
        """
        def as_json(value) -> str:
            return json.dumps(value, ensure_ascii=False)

        key_item_cell = self.key_item if self.key_item else ""
        cells = [self.category, str(self.level), as_json(self.subcrafts)]
        cells += [self.name, self.crystal, key_item_cell]
        cells += [as_json(self.ingredients), as_json(self.hq_yields)]
        return cells
# ---------------------------------------------------------------------------
# Core parse routine
# ---------------------------------------------------------------------------
def parse(txt_path: pathlib.Path, craft_name: str) -> List[Recipe]:
    """Parse a crafting text file into Recipe objects.

    How the text is interpreted (every line is stripped first):

    1. A category header (a line matching ``RE_CATEGORY``) sets the category
       given to every recipe started after it, wherever in the file the
       header appears. It is never treated as an ingredient.
    2. A line starting with "NQ:" starts a new recipe. The text after the
       label, minus any trailing "xN" quantity, becomes the recipe name.
       Other lines before the first "NQ:" are ignored.
    3. Every following line up to the next "NQ:" (or end of file) is attached
       to that recipe:
         * blank lines and lines ending in "-Icon.gif" are skipped;
         * "Main Craft: <craft_name> - (<level>)" sets the level to the
           leading integer of <level> (so "115~120" gives 115), or 0 when
           there is none; if several such lines occur, the last one wins;
         * "Sub Craft(s): A - (n), B - (m)" adds (craft, level) pairs;
         * "Key Item: ..." sets the key item;
         * "HQ1:" .. "HQ3:" lines fill the matching HQ yield slot; any other
           line starting with "HQ" is dropped;
         * anything else is an ingredient, split into (name, quantity).
    4. The first ingredient whose name contains "Crystal" supplies the
       crystal type (its first word), and every ingredient containing
       "Crystal" is then removed from the list. Without one the crystal is
       "Unknown".

    NOTE(review): an earlier description of this routine said metadata and
    ingredient lines *precede* their "NQ:" line. The code has always attached
    the lines that *follow* an "NQ:" line to that recipe, and that is kept
    here -- confirm against the actual dataset layout.

    Args:
        txt_path: Text file to read (decoded as UTF-8, undecodable bytes
            are dropped).
        craft_name: Craft expected on "Main Craft:" lines; lines naming a
            different craft do not match the level pattern and fall through
            to the later rules.

    Returns:
        The recipes in file order.
    """
    lines = txt_path.read_text(encoding="utf-8", errors="ignore").splitlines()

    # Patterns compiled once per call rather than once per line.
    re_main = re.compile(rf"Main Craft: {re.escape(craft_name)} - \(([^)]*)\)")
    re_subcraft_part = re.compile(r"([A-Za-z]+) - \((\d+)\)")
    re_leading_int = re.compile(r"(\d+)")

    recipes: List[Recipe] = []
    current_category = ""
    rec: Optional[Recipe] = None  # recipe currently collecting lines

    for raw in lines:
        look = raw.strip()
        if not look:
            continue

        # Category headers are checked on every line, not only before the
        # first recipe; otherwise later headers would be swallowed as
        # ingredients and all recipes would share the first category.
        if (m_cat := RE_CATEGORY.match(look)):
            current_category = m_cat.group(1)
            continue

        # "NQ:" is the boundary between recipes.
        if look.startswith("NQ:"):
            rec = Recipe(current_category, level=0)
            rec.name, _ = split_item_qty(look[len("NQ:"):].strip())
            recipes.append(rec)
            continue

        # Nothing to attach to until the first "NQ:" has been seen.
        if rec is None:
            continue

        # Icon decorator lines carry no data.
        if look.endswith("-Icon.gif"):
            continue

        if (m_main := re_main.search(look)):
            # Ranges such as "115~120" or "115-120" reduce to the lower bound.
            m_level = re_leading_int.match(m_main.group(1))
            rec.level = int(m_level.group(1)) if m_level else 0
            continue

        if (m_sc := RE_SUBCRAFTS.match(look)):
            for part in m_sc.group(1).split(","):
                if (m_part := re_subcraft_part.match(part.strip())):
                    rec.subcrafts.append((m_part.group(1), int(m_part.group(2))))
            continue

        if (m_ki := RE_KEY_ITEM.match(look)):
            rec.key_item = m_ki.group(1)
            continue

        if look.startswith("HQ"):
            if (m_hq := RE_HQ.match(look)):
                idx = int(m_hq.group(1)) - 1
                # Only HQ1..HQ3 have a slot. Without this guard "HQ0" would
                # wrap around to the last slot and "HQ4"+ would raise
                # IndexError.
                if 0 <= idx < len(rec.hq_yields):
                    rec.hq_yields[idx] = split_item_qty(m_hq.group(2))
            continue

        rec.ingredients.append(split_item_qty(look))

    # Derive the crystal and drop crystal entries from the ingredient lists.
    for recipe in recipes:
        crystal_item = next(
            (name for name, _ in recipe.ingredients if "Crystal" in name),
            None,
        )
        if crystal_item is None:
            recipe.crystal = "Unknown"
            continue
        recipe.crystal = crystal_item.split()[0]
        recipe.ingredients = [
            item for item in recipe.ingredients if "Crystal" not in item[0]
        ]

    return recipes
# ---------------------------------------------------------------------------
# Entrypoint
# ---------------------------------------------------------------------------
def main() -> None:
    """Command-line entry point: convert dataset .txt files to CSV.

    Usage examples:
        # Process a single craft
        python recipes_to_csv_v2.py Woodworking
        # Process all *.txt files in the datasets directory
        python recipes_to_csv_v2.py --all
        # With no craft argument the behaviour is the same as --all
        python recipes_to_csv_v2.py

    For each craft, ``<craft>.txt`` in the datasets directory is parsed and
    ``<craft>_v2.csv`` is written next to it. A missing input file produces a
    warning and is skipped. Exits via SystemExit when all crafts are requested
    but the datasets directory holds no .txt files.
    """
    cli = argparse.ArgumentParser(description="Parse <Craft>.txt into CSV.")
    cli.add_argument("craft", nargs="?", help="Craft name, e.g. Woodworking, Smithing")
    cli.add_argument("--all", action="store_true", help="Process every .txt file in datasets/")
    opts = cli.parse_args()

    # A single named craft is used only when --all is absent.
    if opts.craft and not opts.all:
        crafts = [opts.craft.strip()]
    else:
        crafts = [entry.stem for entry in DATASETS_DIR.glob("*.txt")]
        if not crafts:
            raise SystemExit(f"No .txt files found in {DATASETS_DIR}")

    header = [
        "category",
        "level",
        "subcrafts",
        "name",
        "crystal",
        "key_item",
        "ingredients",
        "hq_yields",
    ]

    for craft in crafts:
        source = DATASETS_DIR / f"{craft}.txt"
        if not source.exists():
            print(f"[WARN] Dataset file not found: {source}")
            continue

        target = DATASETS_DIR / f"{craft}_v2.csv"
        recipes = parse(source, craft)
        target.parent.mkdir(parents=True, exist_ok=True)
        with target.open("w", newline="", encoding="utf-8") as handle:
            out = csv.writer(handle)
            out.writerow(header)
            out.writerows(recipe.row() for recipe in recipes)

        shown = target.relative_to(PROJECT_ROOT)
        print(f"Wrote {len(recipes)} recipes -> {shown}")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()