Initial commit

This commit is contained in:
Aodhan
2025-07-07 13:39:46 +01:00
commit cfa2eff6ef
69 changed files with 70452 additions and 0 deletions

88
scripts/README.md Normal file
View File

@@ -0,0 +1,88 @@
# Recipe ETL Scripts
This directory contains helper scripts for extracting Woodworking recipe data
from the raw **datasets/Woodworking.txt** file and loading it into the project
PostgreSQL database.
## File overview
| File | Purpose |
|------|---------|
| **woodworking_to_csv.py** | Legacy first-pass parser → `datasets/Woodworking.csv`. |
| **woodworking_to_csv_v2.py** | Improved parser that matches the spec (category, level, sub-crafts, ingredients, HQ yields, etc.) → `datasets/Woodworking_v2.csv`. |
| **recipes_to_csv_v2.py** | Generic parser. `python recipes_to_csv_v2.py <Craft>` processes one craft; use `python recipes_to_csv_v2.py --all` **or simply omit the argument** to parse every `.txt` file under `datasets/`, producing `datasets/<Craft>_v2.csv` for each. |
| **load_woodworking_to_db.py** | Loader for the legacy CSV (kept for reference). |
| **load_woodworking_v2_to_db.py** | Drops & recreates **recipes_woodworking** table and bulk-loads `Woodworking_v2.csv`. |
| **load_recipes_v2_to_db.py** | Generic loader. `python load_recipes_v2_to_db.py <Craft>` loads one craft; omit the argument to load **all** generated CSVs into their respective `recipes_<craft>` tables. |
| **requirements.txt** | Minimal Python dependencies for the scripts. |
| **venv/** | Local virtual-environment created by the setup steps below. |
## Prerequisites
* Python ≥ 3.9
* PostgreSQL instance reachable with credentials in `db.conf` at project root:
```ini
PSQL_HOST=…
PSQL_PORT=…
PSQL_USER=…
PSQL_PASSWORD=…
PSQL_DBNAME=…
```
## Quick start (Woodworking example)
```bash
# 1. From project root
cd scripts
# 2. Create & activate virtualenv (only once)
python3 -m venv venv
source venv/bin/activate
# 3. Install dependencies
pip install -r requirements.txt
# 4. Generate CSVs for **all** crafts
python recipes_to_csv_v2.py --all # or simply `python recipes_to_csv_v2.py`
# 5. Load all crafts into the DB (drops/recreates each table)
python load_recipes_v2_to_db.py
```
To work with a **single craft**, specify its name instead:
```bash
python recipes_to_csv_v2.py Smithing # generate Smithing_v2.csv
python load_recipes_v2_to_db.py Smithing # load only Smithing recipes
```
The parser and loader will output, e.g.:
```
Wrote 480 recipes -> datasets/Woodworking_v2.csv
Loaded recipes into new recipes_woodworking table.
```
## CSV schema (v2)
Column | Notes
------ | -----
`category` | Craft rank without level range (e.g. "Amateur")
`level` | Recipe level integer
`subcrafts` | JSON list `[["Smithing",2],["Alchemy",7]]`
`name` | NQ product name
`crystal` | Element used (Wind, Earth, etc.)
`key_item` | Required key item (blank if none)
`ingredients` | JSON list `[["Arrowwood Log",1]]`
`hq_yields` | JSON list HQ1-HQ3 e.g. `[["Arrowwood Lumber",6],["Arrowwood Lumber",9],["Arrowwood Lumber",12]]`
## Parsing rules
* Item quantities are detected only when the suffix uses an “x” (e.g. `Lumber x6`).
* Strings such as `Bronze Leggings +1` are treated as the **full item name**; the `+1/+2/+3` suffix is preserved.
## Developing / debugging
* Edit the parsers as needed, then rerun them to regenerate CSV.
* Feel free to add new scripts here; remember to update **requirements.txt** & this README.

View File

@@ -0,0 +1,167 @@
#!/usr/bin/env python3
"""Load <Craft>_v2.csv into PostgreSQL.
Usage:
python load_recipes_v2_to_db.py <CRAFT>
The script drop-creates table `recipes_<craft>` (lowercased) with the generic
v2 schema, then bulk-loads from the CSV produced by recipes_to_csv_v2.py.
"""
from __future__ import annotations
import argparse
import asyncio
import csv
import json
import pathlib
import re
from typing import Dict
import asyncpg
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
CONF_PATH = PROJECT_ROOT / "db.conf"
DATASETS_DIR = PROJECT_ROOT / "datasets"
RE_KEY = re.compile(r"^([A-Z0-9_]+)=(.*)$")
# ---------------------------------------------------------------------------
# Category mapping
# ---------------------------------------------------------------------------
# (name, inclusive lower bound, inclusive upper bound).  Ranges deliberately
# overlap; the first matching entry wins.
CATEGORY_RANGES = [
    ("Amateur", 1, 10),
    ("Recruit", 8, 20),
    ("Initiate", 18, 30),
    ("Novice", 28, 40),
    ("Apprentice", 38, 50),
    ("Journeyman", 48, 60),
    ("Craftsman", 58, 70),
    ("Artisan", 68, 80),
    ("Adept", 78, 90),
    ("Veteran", 88, 100),
    ("Expert", 98, 110),
    ("Authority", 111, 120),
]


def category_for_level(level: int) -> str:
    """Map a recipe level to its category name; "Unknown" if out of range."""
    matches = (name for name, lo, hi in CATEGORY_RANGES if lo <= level <= hi)
    return next(matches, "Unknown")
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def parse_db_conf(path: pathlib.Path) -> Dict[str, str]:
    """Read KEY=VALUE pairs from *path*, ignoring blanks and # comments.

    Surrounding single/double quotes are stripped from values.
    """
    conf: Dict[str, str] = {}
    for raw in path.read_text().splitlines():
        stripped = raw.strip()
        if not stripped or stripped.startswith('#'):
            continue
        m = re.match(r"^([A-Z0-9_]+)=(.*)$", stripped)
        if m is None:
            continue
        conf[m.group(1)] = m.group(2).strip().strip("'\"")
    return conf
async def recreate_table(conn: asyncpg.Connection, craft: str):
    """Drop and recreate the ``recipes_<craft>`` table with the v2 schema.

    The craft name is interpolated directly into DDL, so it is restricted to
    a plain SQL identifier first; this blocks SQL injection / malformed
    statements from an arbitrary CLI argument.

    Raises:
        ValueError: if *craft* is not a valid identifier.
    """
    if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", craft):
        raise ValueError(f"Invalid craft name: {craft!r}")
    table = f"recipes_{craft.lower()}"
    await conn.execute(f"DROP TABLE IF EXISTS {table};")
    await conn.execute(
        f"""
        CREATE TABLE {table} (
            id SERIAL PRIMARY KEY,
            category TEXT NOT NULL,
            level INT NOT NULL,
            subcrafts JSONB,
            name TEXT NOT NULL,
            crystal TEXT NOT NULL,
            key_item TEXT,
            ingredients JSONB,
            hq_yields JSONB
        );
        """
    )
async def insert_csv(conn: asyncpg.Connection, craft: str, csv_path: pathlib.Path):
    """Bulk-load one craft CSV into its recipes_<craft> table via COPY.

    The category column is recomputed from the level; JSON columns are
    round-tripped through json to normalise their text form.
    """
    table = f"recipes_{craft.lower()}"
    with csv_path.open(encoding="utf-8") as f:
        records = [
            (
                category_for_level(int(row["level"])),
                int(row["level"]),
                json.dumps(json.loads(row["subcrafts"] or "[]")),
                row["name"],
                row["crystal"],
                row["key_item"] or None,
                json.dumps(json.loads(row["ingredients"] or "[]")),
                json.dumps(json.loads(row["hq_yields"] or "[]")),
            )
            for row in csv.DictReader(f)
        ]
    await conn.copy_records_to_table(
        table,
        records=records,
        columns=[
            "category", "level", "subcrafts", "name",
            "crystal", "key_item", "ingredients", "hq_yields",
        ],
    )
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
async def process_craft(conn: asyncpg.Connection, craft: str):
    """Drop/recreate and bulk-load the table for a single craft.

    Skips (with a message) when the expected <Craft>_v2.csv is absent.
    """
    source = DATASETS_DIR / f"{craft}_v2.csv"
    if source.exists():
        await recreate_table(conn, craft)
        await insert_csv(conn, craft, source)
        print(f"Loaded {craft} -> recipes_{craft.lower()} table.")
    else:
        print(f"CSV not found for {craft}, skipping.")
async def main_async(craft: str | None):
    """Connect once, then load a single craft or every generated *_v2.csv."""
    conf = parse_db_conf(CONF_PATH)
    conn = await asyncpg.connect(
        host=conf["PSQL_HOST"],
        port=int(conf["PSQL_PORT"]),
        user=conf["PSQL_USER"],
        password=conf["PSQL_PASSWORD"],
        database=conf["PSQL_DBNAME"],
    )
    try:
        if not craft:
            # No craft given: scan the datasets dir for every generated CSV.
            for path in DATASETS_DIR.glob("*_v2.csv"):
                await process_craft(conn, path.stem.replace("_v2", ""))
        else:
            await process_craft(conn, craft)
    finally:
        await conn.close()
def main() -> None:
    """CLI entry point: parse the optional craft argument and run the loader."""
    parser = argparse.ArgumentParser(description="Load <Craft>_v2.csv into DB")
    parser.add_argument(
        "craft", nargs="?", help="Craft name; if omitted, load all *_v2.csv files"
    )
    ns = parser.parse_args()
    asyncio.run(main_async(ns.craft.strip() if ns.craft else None))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,146 @@
#!/usr/bin/env python3
"""Create recipes_woodworking table and load data from datasets/Woodworking.csv.
Usage:
python3 scripts/load_woodworking_to_db.py
The script reads database connection details from db.conf located at the project root.
It is idempotent: the table is created only if it doesn't already exist, then
new rows are inserted (it truncates beforehand to avoid duplicates).
"""
from __future__ import annotations
import asyncio
import csv
import pathlib
import re
from typing import Any, Dict, List, Optional
import asyncpg
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
CONF_PATH = PROJECT_ROOT / "db.conf"
CSV_PATH = PROJECT_ROOT / "datasets/Woodworking.csv"
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def parse_db_conf(path: pathlib.Path) -> Dict[str, str]:
    """Parse simple KEY=VALUE lines into a dict.

    Blank lines and # comments are skipped and surrounding quotes are
    stripped from values.  Raises RuntimeError when any required PSQL_*
    key is absent.
    """
    key_re = re.compile(r"^([A-Z0-9_]+)=(.*)$")
    conf: Dict[str, str] = {}
    for raw in path.read_text().splitlines():
        stripped = raw.strip()
        if not stripped or stripped.startswith("#"):
            continue
        if (match := key_re.match(stripped)):
            # Remove surrounding quotes if present
            conf[match.group(1)] = match.group(2).strip().strip("'\"")
    required = {"PSQL_HOST", "PSQL_PORT", "PSQL_USER", "PSQL_PASSWORD", "PSQL_DBNAME"}
    missing = required - conf.keys()
    if missing:
        raise RuntimeError(f"Missing keys in db.conf: {', '.join(sorted(missing))}")
    return conf
async def create_table(conn: asyncpg.Connection) -> None:
    """Create the recipes_woodworking table if it does not exist yet.

    Schema matches the legacy (v1) CSV: plain-text ingredients column and
    separate integer yield columns per quality tier.
    """
    await conn.execute(
        """
        CREATE TABLE IF NOT EXISTS recipes_woodworking (
            id SERIAL PRIMARY KEY,
            category TEXT NOT NULL,
            level INT NOT NULL,
            product_name TEXT NOT NULL,
            nq_yield INT,
            hq1_yield INT,
            hq2_yield INT,
            hq3_yield INT,
            crystal TEXT,
            ingredients TEXT
        );
        """
    )
async def truncate_table(conn: asyncpg.Connection) -> None:
    """Remove all existing rows so reloading does not create duplicates."""
    await conn.execute("TRUNCATE TABLE recipes_woodworking;")
async def insert_rows(conn: asyncpg.Connection, rows: List[Dict[str, str]]) -> None:
    """Bulk insert via the COPY protocol for speed.

    Blank strings become NULLs; numeric yield columns are converted to int.
    """
    records = [
        (
            row["category"],
            int(row["level"]),
            row["product_name"],
            _to_int_or_none(row["nq_yield"]),
            _to_int_or_none(row["hq1_yield"]),
            _to_int_or_none(row["hq2_yield"]),
            _to_int_or_none(row["hq3_yield"]),
            row["crystal"] or None,
            row["ingredients"] or None,
        )
        for row in rows
    ]
    await conn.copy_records_to_table(
        "recipes_woodworking",
        records=records,
        columns=[
            "category", "level", "product_name", "nq_yield", "hq1_yield",
            "hq2_yield", "hq3_yield", "crystal", "ingredients",
        ],
    )
def _to_int_or_none(s: str) -> Optional[int]:
    """Convert *s* to int, mapping blank/whitespace-only strings to None."""
    stripped = s.strip()
    if not stripped:
        return None
    return int(stripped)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
async def main() -> None:
    """Load datasets/Woodworking.csv into the recipes_woodworking table."""
    if not CSV_PATH.exists():
        raise SystemExit("CSV file not found. Run woodworking_to_csv.py first.")
    conf = parse_db_conf(CONF_PATH)
    conn = await asyncpg.connect(
        host=conf["PSQL_HOST"],
        port=int(conf["PSQL_PORT"]),
        user=conf["PSQL_USER"],
        password=conf["PSQL_PASSWORD"],
        database=conf["PSQL_DBNAME"],
    )
    try:
        await create_table(conn)
        await truncate_table(conn)
        with CSV_PATH.open(newline="", encoding="utf-8") as handle:
            rows = list(csv.DictReader(handle))
        await insert_rows(conn, rows)
        print(f"Inserted {len(rows)} rows into recipes_woodworking.")
    finally:
        await conn.close()


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""Load datasets/Woodworking_v2.csv into PostgreSQL (recipes_woodworking table).
Drops the old table if present and creates a new one matching the v2 schema.
"""
from __future__ import annotations
import asyncio
import csv
import json
import pathlib
import re
from typing import Any, Dict, List, Optional
import asyncpg
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
CONF_PATH = PROJECT_ROOT / "db.conf"
CSV_PATH = PROJECT_ROOT / "datasets/Woodworking_v2.csv"
RE_KEY = re.compile(r"^([A-Z0-9_]+)=(.*)$")
def parse_db_conf(path: pathlib.Path) -> Dict[str, str]:
    """Read KEY=VALUE pairs from *path*; blanks, # comments and quotes are
    handled the same way as the other loader scripts."""
    conf: Dict[str, str] = {}
    for raw in path.read_text().splitlines():
        entry = raw.strip()
        if not entry or entry.startswith("#"):
            continue
        if (m := re.match(r"^([A-Z0-9_]+)=(.*)$", entry)):
            conf[m.group(1)] = m.group(2).strip().strip("'\"")
    return conf
async def recreate_table(conn: asyncpg.Connection):
    """Drop any existing recipes_woodworking table and recreate it with the
    v2 schema (JSONB columns for subcrafts/ingredients/HQ yields)."""
    await conn.execute("DROP TABLE IF EXISTS recipes_woodworking;")
    await conn.execute(
        """
        CREATE TABLE recipes_woodworking (
            id SERIAL PRIMARY KEY,
            category TEXT NOT NULL,
            level INT NOT NULL,
            subcrafts JSONB,
            name TEXT NOT NULL,
            crystal TEXT NOT NULL,
            key_item TEXT,
            ingredients JSONB,
            hq_yields JSONB
        );
        """
    )
# (name, inclusive low, inclusive high); ranges deliberately overlap, and the
# first matching entry wins.
CATEGORY_RANGES = [
    ("Amateur", 1, 10),
    ("Recruit", 8, 20),
    ("Initiate", 18, 30),
    ("Novice", 28, 40),
    ("Apprentice", 38, 50),
    ("Journeyman", 48, 60),
    ("Craftsman", 58, 70),
    ("Artisan", 68, 80),
    ("Adept", 78, 90),
    ("Veteran", 88, 100),
    ("Expert", 98, 110),
    ("Authority", 111, 120),
]


def category_for_level(level: int) -> str:
    """Return the category name that includes the given level.

    If multiple ranges overlap, the first match in CATEGORY_RANGES is
    returned; levels outside every range map to "Unknown".
    """
    return next(
        (name for name, lo, hi in CATEGORY_RANGES if lo <= level <= hi),
        "Unknown",
    )
async def insert_csv(conn: asyncpg.Connection):
    """Bulk-load Woodworking_v2.csv into recipes_woodworking via COPY.

    The stored category is recomputed from the level rather than taken from
    the CSV; JSON columns are round-tripped to normalise their text form.
    """
    with CSV_PATH.open(encoding="utf-8") as f:
        records = [
            (
                category_for_level(int(row["level"])),
                int(row["level"]),
                json.dumps(json.loads(row["subcrafts"] or "[]")),  # jsonb text
                row["name"],
                row["crystal"],
                row["key_item"] or None,
                json.dumps(json.loads(row["ingredients"] or "[]")),  # jsonb text
                json.dumps(json.loads(row["hq_yields"] or "[]")),  # jsonb text
            )
            for row in csv.DictReader(f)
        ]
    await conn.copy_records_to_table(
        "recipes_woodworking",
        records=records,
        columns=[
            "category", "level", "subcrafts", "name",
            "crystal", "key_item", "ingredients", "hq_yields",
        ],
    )
async def main():
    """Recreate the v2 table and load the Woodworking_v2.csv into it."""
    if not CSV_PATH.exists():
        raise SystemExit("CSV v2 not found; run parser first.")
    settings = parse_db_conf(CONF_PATH)
    conn = await asyncpg.connect(
        host=settings["PSQL_HOST"],
        port=int(settings["PSQL_PORT"]),
        user=settings["PSQL_USER"],
        password=settings["PSQL_PASSWORD"],
        database=settings["PSQL_DBNAME"],
    )
    try:
        await recreate_table(conn)
        await insert_csv(conn)
        print("Loaded recipes into new recipes_woodworking table.")
    finally:
        await conn.close()


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,146 @@
"""Populate the spells table using scroll information from usable_items.
Assumptions
-----------
1. `usable_items` table has (at least) the following columns:
- item_name (text)
- description (text)
- type_description (text) where scrolls have the value `SCROLL`
2. The spell name can be derived from the item name by stripping common prefixes like
"Scroll of ", "Scroll: ", etc. This heuristic can be adjusted if necessary.
3. Job / level information is embedded in the description in patterns like
"RDM Lv. 1", "WHM Lv.75", etc. Multiple jobs may appear in one description.
4. The database URL is provided via the `DATABASE_URL` environment variable, e.g.
postgresql+psycopg2://user:password@host/dbname
Usage
-----
$ export DATABASE_URL=postgresql+psycopg2://...
$ python scripts/populate_spells_from_scrolls.py
The script will insert new rows or update existing rows in the `spells` table.
"""
from __future__ import annotations
import os
import re
from typing import Dict, List
from sqlalchemy import MetaData, Table, select, create_engine, update, insert
from sqlalchemy.engine import Engine
from sqlalchemy.orm import Session
# Job abbreviations recognised in scroll descriptions; these double as the
# spells table column names.
JOBS: List[str] = [
    "run", "whm", "blm", "rdm", "pld", "drk", "brd", "nin", "smn", "cor", "sch", "geo",
]
# Regex to capture patterns like "RDM Lv. 1" or "RDM Lv.1" (space optional)
JOB_LV_PATTERN = re.compile(r"([A-Z]{3})\s*Lv\.?\s*(\d+)")


def _derive_spell_name(scroll_name: str) -> str:
    """Convert a scroll item name to a spell name.

    Strips the first matching prefix among "Scroll of ", "Scroll: " and
    "Scroll "; otherwise returns the trimmed name unchanged.

    Examples
    --------
    >>> _derive_spell_name("Scroll of Fire")
    'Fire'
    >>> _derive_spell_name("Scroll: Cure IV")
    'Cure IV'
    """
    for prefix in ("Scroll of ", "Scroll: ", "Scroll "):
        if scroll_name.startswith(prefix):
            return scroll_name[len(prefix):].strip()
    return scroll_name.strip()


def _parse_job_levels(description: str) -> Dict[str, int]:
    """Extract {job: level} pairs from a description string.

    Three-letter abbreviations not listed in JOBS are ignored; later
    occurrences of the same job overwrite earlier ones.
    """
    return {
        job.lower(): int(level)
        for job, level in JOB_LV_PATTERN.findall(description)
        if job.lower() in JOBS
    }
from pathlib import Path


def _get_engine() -> Engine:
    """Return SQLAlchemy engine using DATABASE_URL or db.conf."""
    url = os.getenv("DATABASE_URL")
    if url:
        return create_engine(url)
    # Fall back to building a URL from db.conf at the project root.
    conf_path = Path(__file__).resolve().parents[1] / "db.conf"
    if not conf_path.exists():
        raise RuntimeError("DATABASE_URL env var not set and db.conf not found")
    cfg: Dict[str, str] = {}
    with conf_path.open() as fh:
        for raw in fh:
            entry = raw.strip()
            if not entry or entry.startswith("#") or "=" not in entry:
                continue
            key, _, value = entry.partition("=")
            cfg[key.strip()] = value.strip().strip("'\"")  # remove quotes if any
    try:
        url = (
            f"postgresql+psycopg2://{cfg['PSQL_USER']}:{cfg['PSQL_PASSWORD']}@"
            f"{cfg['PSQL_HOST']}:{cfg.get('PSQL_PORT', '5432')}/{cfg['PSQL_DBNAME']}"
        )
    except KeyError as e:
        raise RuntimeError(f"Missing key in db.conf: {e}")
    return create_engine(url)
def main() -> None:
    """Upsert spell rows derived from SCROLL-type usable items.

    For each scroll, derive the spell name and job/level pairs, then either
    update the existing spells row or insert a new one.
    """
    engine = _get_engine()
    meta = MetaData()
    usable_items = Table("usable_items", meta, autoload_with=engine)
    spells = Table("spells", meta, autoload_with=engine)
    with Session(engine) as session:
        # Fetch scroll items.
        # NOTE(review): the module docstring says the column is `item_name`,
        # but this selects `name` -- confirm against the actual schema.
        scroll_rows = session.execute(
            select(
                usable_items.c.name,
                usable_items.c.description
            ).where(usable_items.c.type_description == "SCROLL")
        ).all()
        for name, description in scroll_rows:
            spell_name = _derive_spell_name(name)
            job_levels = _parse_job_levels(description or "")
            existing = session.execute(
                select(spells.c.name).where(spells.c.name == spell_name)
            ).first()
            if existing:
                # BUGFIX: update().values() with an empty mapping raises in
                # SQLAlchemy, so skip scrolls whose description yielded no
                # job/level pairs instead of crashing mid-run.
                if job_levels:
                    session.execute(
                        update(spells)
                        .where(spells.c.name == spell_name)
                        .values(**job_levels)
                    )
            else:
                # Insert with explicit NULLs for jobs not mentioned.
                values = {job: None for job in JOBS}
                values.update(job_levels)
                values["name"] = spell_name
                session.execute(insert(spells).values(**values))
        session.commit()
    print(f"Processed {len(scroll_rows)} scrolls. Spells table updated.")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""Generic recipe parser for crafting disciplines -> <Craft>_v2.csv
Usage:
python recipes_to_csv_v2.py <CRAFT>
Where <CRAFT> matches the name of the .txt file in the datasets directory,
for example `Woodworking`, `Smithing`, `Goldsmithing`, `Alchemy`, etc.
The script produces a CSV `<Craft>_v2.csv` inside the datasets directory
having the following columns (identical to the Woodworking v2 spec):
category, level, subcrafts, name, crystal, key_item, ingredients, hq_yields
See scripts/README.md for details of each column.
"""
from __future__ import annotations
import argparse
import csv
import json
import pathlib
import re
from typing import List, Optional, Tuple
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
DATASETS_DIR = PROJECT_ROOT / "datasets"
# ---------------------------------------------------------------------------
# Regex helpers (compiled at runtime where craft-dependent)
# ---------------------------------------------------------------------------
RE_CATEGORY = re.compile(r"^([A-Za-z' ]+) \([0-9]+-[0-9]+\)$")
RE_SUBCRAFTS = re.compile(r"Sub Craft\(s\): (.+)")
RE_KEY_ITEM = re.compile(r"Key Item: (.+)")
RE_NQ = re.compile(r"NQ:\s*(.+)")
RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
# Quantity delimiter is strictly 'xN' (e.g., "Lumber x6").
# Patterns like "+1" denote HQ variant and should be preserved in the name.
RE_ITEM_QTY = re.compile(r"(.+?)\s*x(\d+)$", re.IGNORECASE)
# ---------------------------------------------------------------------------
# Helper functions / dataclasses
# ---------------------------------------------------------------------------
def norm(s: str) -> str:
    """Collapse every run of whitespace to a single space and trim the ends."""
    return " ".join(s.split())
def split_item_qty(text: str) -> Tuple[str, int]:
    """Split "Foo x3" → ("Foo", 3). Quantity defaults to 1.

    Only an "xN"/"XN" suffix is a quantity; "+N" stays part of the name.
    """
    cleaned = re.sub(r"\s+", " ", text.strip())
    m = re.match(r"(.+?)\s*x(\d+)$", cleaned, re.IGNORECASE)
    if m is None:
        return cleaned, 1
    return m.group(1), int(m.group(2))
class Recipe:
    """Mutable accumulator for one parsed recipe, rendered via row()."""

    __slots__ = (
        "category",
        "level",
        "subcrafts",
        "name",
        "crystal",
        "key_item",
        "ingredients",
        "hq_yields",
    )

    def __init__(self, category: str, level: int):
        self.category = category                       # e.g. "Amateur"
        self.level = level                             # main-craft level
        self.subcrafts: List[Tuple[str, int]] = []     # [(craft, level), ...]
        self.name: str = ""                            # NQ product name
        self.crystal: str = ""                         # crystal element
        self.key_item: Optional[str] = None            # required key item
        self.ingredients: List[Tuple[str, int]] = []   # [(item, qty), ...]
        # HQ1..HQ3 yields; None where the dataset lists no HQ result.
        self.hq_yields: List[Optional[Tuple[str, int]]] = [None, None, None]

    def row(self) -> List[str]:
        """Serialise to the CSV column order used by the writer."""
        sub_json = json.dumps(self.subcrafts, ensure_ascii=False)
        ing_json = json.dumps(self.ingredients, ensure_ascii=False)
        hq_json = json.dumps(self.hq_yields, ensure_ascii=False)
        return [
            self.category,
            str(self.level),
            sub_json,
            self.name,
            self.crystal,
            self.key_item or "",
            ing_json,
            hq_json,
        ]
# ---------------------------------------------------------------------------
# Core parse routine
# ---------------------------------------------------------------------------
def parse(txt_path: pathlib.Path, craft_name: str) -> List[Recipe]:
    """Parse a crafting text file into Recipe objects.

    Parameters
    ----------
    txt_path : path to the raw ``datasets/<Craft>.txt`` file.
    craft_name : craft whose "Main Craft:" lines carry the recipe level.

    The parsing strategy is now:
    1. "Main Craft:" marks the *metadata* for the upcoming recipe level, optional
       sub-crafts, and key item.
    2. Ingredient lines follow until an "NQ:" line is reached.  The first recipe
       ingredient that contains the word "Crystal" determines the crystal type
       and is removed from the ingredients list.
    3. An "NQ:" line finalises the recipe: we capture the product name and then
       look ahead for up to three "HQx:" lines that describe HQ yields.

    Crucially, *"NQ:" now acts as the definitive boundary between recipes*.
    This allows the parser to cope with datasets where multiple successive
    "Main Craft:" lines appear without an intervening "NQ:".

    NOTE(review): each recipe's metadata/ingredients are collected from the
    block *after* its "NQ:" line; this assumes the dataset lists metadata
    following the product line -- confirm against the raw .txt layout,
    otherwise fields would be shifted by one recipe.
    """
    lines = txt_path.read_text(encoding="utf-8", errors="ignore").splitlines()
    n = len(lines)
    i = 0
    current_category: str = ""
    recipes: List[Recipe] = []
    # Craft-specific regex (compiled once per run)
    RE_MAIN = re.compile(rf"Main Craft: {re.escape(craft_name)} - \(([^)]*)\)")
    while i < n:
        line = lines[i].strip()
        # 1) Category header
        if (m_cat := RE_CATEGORY.match(line)):
            current_category = m_cat.group(1)
            i += 1
            continue
        # 2) Start of a recipe line beginning with "NQ:"
        if not line.startswith("NQ:"):
            i += 1
            continue
        # -------------------------------
        # New recipe initialised
        # -------------------------------
        rec = Recipe(current_category, level=0)
        rec.name, _ = split_item_qty(line[len("NQ:"):].strip())
        # Collect block until next NQ or EOF
        block_lines: List[str] = []
        i += 1
        while i < n and not lines[i].lstrip().startswith("NQ:"):
            block_lines.append(lines[i])
            i += 1
        # ------------------------------------
        # Parse metadata & ingredients in block
        # ------------------------------------
        for raw in block_lines:
            look = raw.strip()
            if not look:
                continue
            # Skip icon decorator lines early
            if look.endswith("-Icon.gif"):
                continue
            # Main Craft level
            if (m_main := RE_MAIN.search(look)):
                level_raw = m_main.group(1)
                # Handle ranges like "115~120" or "115-120" by taking the lower bound
                m_range = re.match(r"(\d+)", level_raw)
                if m_range:
                    rec.level = int(m_range.group(1))
                else:
                    rec.level = 0
                continue
            # Sub crafts
            if (m_sc := RE_SUBCRAFTS.match(look)):
                for part in m_sc.group(1).split(','):
                    part = part.strip()
                    if m := re.match(r"([A-Za-z]+) - \((\d+)\)", part):
                        rec.subcrafts.append((m.group(1), int(m.group(2))))
                continue
            # Key item
            if (m_ki := RE_KEY_ITEM.match(look)):
                rec.key_item = m_ki.group(1)
                continue
            # HQ lines
            if look.startswith("HQ"):
                if (m_hq := RE_HQ.match(look)):
                    idx = int(m_hq.group(1)) - 1
                    name, qty = split_item_qty(m_hq.group(2))
                    rec.hq_yields[idx] = (name, qty)
                continue
            # Otherwise treat as ingredient
            name, qty = split_item_qty(look)
            rec.ingredients.append((name, qty))
        # Determine crystal & clean ingredient list
        for name, qty in rec.ingredients:
            if "Crystal" in name:
                rec.crystal = name.split()[0]  # "Wind Crystal" -> "Wind"
                break
        if rec.crystal:
            rec.ingredients = [p for p in rec.ingredients if "Crystal" not in p[0]]
        else:
            rec.crystal = "Unknown"
        recipes.append(rec)
    return recipes
# ---------------------------------------------------------------------------
# Entrypoint
# ---------------------------------------------------------------------------
def main() -> None:
    """CLI entrypoint.

    Usage examples:
        python recipes_to_csv_v2.py Woodworking   # one craft
        python recipes_to_csv_v2.py --all         # every .txt in datasets/
        python recipes_to_csv_v2.py               # same as --all
    """
    argp = argparse.ArgumentParser(description="Parse <Craft>.txt into CSV.")
    argp.add_argument("craft", nargs="?", help="Craft name, e.g. Woodworking, Smithing")
    argp.add_argument("--all", action="store_true", help="Process every .txt file in datasets/")
    ns = argp.parse_args()
    # Determine which crafts to process (omitting the arg implies --all).
    if ns.craft and not ns.all:
        crafts = [ns.craft.strip()]
    else:
        crafts = [p.stem for p in DATASETS_DIR.glob("*.txt")]
        if not crafts:
            raise SystemExit(f"No .txt files found in {DATASETS_DIR}")
    header = [
        "category", "level", "subcrafts", "name",
        "crystal", "key_item", "ingredients", "hq_yields",
    ]
    for craft in crafts:
        txt_path = DATASETS_DIR / f"{craft}.txt"
        if not txt_path.exists():
            print(f"[WARN] Dataset file not found: {txt_path}")
            continue
        recipes = parse(txt_path, craft)
        csv_path = DATASETS_DIR / f"{craft}_v2.csv"
        csv_path.parent.mkdir(parents=True, exist_ok=True)
        with csv_path.open("w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(header)
            writer.writerows(r.row() for r in recipes)
        rel = csv_path.relative_to(PROJECT_ROOT)
        print(f"Wrote {len(recipes)} recipes -> {rel}")


if __name__ == "__main__":
    main()

2
scripts/requirements.txt Normal file
View File

@@ -0,0 +1,2 @@
asyncpg==0.29.0
python-dotenv==1.0.1

View File

@@ -0,0 +1,201 @@
#!/usr/bin/env python3
"""Parse the semi-structured datasets/Woodworking.txt file and export it as a CSV.
The output CSV will be written to datasets/Woodworking.csv with the following columns:
category Text group header e.g. Amateur (1-10)
level Woodworking skill level for the recipe (integer)
product_name Produced item (without quantity suffix)
nq_yield Quantity produced on a normal quality synth (may be empty)
hq1_yield Quantity produced on HQ1 synth (may be empty)
hq2_yield Quantity produced on HQ2 synth (may be empty)
hq3_yield Quantity produced on HQ3 synth (may be empty)
crystal Crystal used for the synth (Wind Crystal, Earth Crystal, etc.)
ingredients Semi-colon-separated list of remaining ingredients (excluding the crystal)
Run the script from the project root:
python3 scripts/woodworking_to_csv.py
"""
from __future__ import annotations
import csv
import pathlib
import re
from typing import List, Optional
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
TXT_PATH = PROJECT_ROOT / "datasets/Woodworking.txt"
CSV_PATH = PROJECT_ROOT / "datasets/Woodworking.csv"
# --- Regex patterns ---------------------------------------------------------
RE_CATEGORY = re.compile(r"^[A-Za-z' ]+ \([0-9]+-[0-9]+\)$")
RE_MAIN_CRAFT = re.compile(r"Main Craft: Woodworking - \((\d+)\)")
RE_NQ = re.compile(r"NQ:\s*(.+)")
RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
RE_ITEM_QTY = re.compile(r"(.+?)\s+x(\d+)$")
def normalise_whitespace(text: str) -> str:
    """Collapse internal runs of whitespace and trim."""
    return " ".join(text.split())
class Recipe:
    """One parsed Woodworking recipe, accumulated field by field.

    Yields are kept as strings ("" when absent) to round-trip cleanly
    through the CSV.
    """

    __slots__ = (
        "category",
        "level",
        "product_name",
        "nq_yield",
        "hq1_yield",
        "hq2_yield",
        "hq3_yield",
        "crystal",
        "ingredients",
    )

    def __init__(self, category: str, level: int):
        self.category: str = category
        self.level: int = level
        self.product_name: str = ""
        self.nq_yield: str = ""
        self.hq1_yield: str = ""
        self.hq2_yield: str = ""
        self.hq3_yield: str = ""
        self.crystal: str = ""
        self.ingredients: List[str] = []

    # ------------------------------------------------------------------
    # Helper methods
    # ------------------------------------------------------------------
    def _set_product(self, text: str, nq: bool = False) -> None:
        """Record the product name (and the NQ yield when *nq* is true)."""
        name, qty = self._split_item_qty(text)
        self.product_name = name
        if nq:
            self.nq_yield = qty

    def _add_hq(self, idx: int, text: str) -> None:
        """Store the HQ yield quantity for tier *idx* (1-3)."""
        _unused, qty = self._split_item_qty(text)
        slot = {1: "hq1_yield", 2: "hq2_yield", 3: "hq3_yield"}.get(idx)
        if slot is not None:
            setattr(self, slot, qty)

    @staticmethod
    def _split_item_qty(text: str) -> tuple[str, str]:
        """Split "Foo x3" into ("Foo", "3"); quantity is "" when absent."""
        cleaned = re.sub(r"\s+", " ", text.strip())
        m = re.match(r"(.+?)\s+x(\d+)$", cleaned)
        if m is None:
            return cleaned, ""
        return m.group(1), m.group(2)

    def to_row(self) -> List[str]:
        """Serialise to the legacy CSV column order."""
        return [
            self.category,
            str(self.level),
            self.product_name,
            self.nq_yield,
            self.hq1_yield,
            self.hq2_yield,
            self.hq3_yield,
            self.crystal,
            "; ".join(self.ingredients),
        ]
# ----------------------------------------------------------------------------
# Parsing logic
# ----------------------------------------------------------------------------
def parse_file(txt_path: pathlib.Path) -> List[Recipe]:
    """Parse datasets/Woodworking.txt into a list of Recipe objects.

    Walks the file line by line, tracking the current category header and,
    for each "Main Craft:" block, collecting its ingredients, the NQ product
    line and up to three HQ yield lines.
    """
    recipes: List[Recipe] = []
    current_category = ""
    lines = txt_path.read_text(encoding="utf-8", errors="ignore").splitlines()
    i = 0
    n = len(lines)
    while i < n:
        line = lines[i].strip()
        # Update category headers e.g. "Amateur (1-10)"
        if RE_CATEGORY.match(line):
            current_category = line
            i += 1
            continue
        # Detect start of a recipe block
        m_main = RE_MAIN_CRAFT.search(line)
        if m_main:
            level = int(m_main.group(1))
            rec = Recipe(current_category, level)
            i += 1
            # Collect ingredients until we hit the NQ line
            while i < n and not lines[i].lstrip().startswith("NQ:"):
                ing_line = lines[i].strip()
                if ing_line:
                    rec.ingredients.append(normalise_whitespace(ing_line))
                i += 1
            # Extract crystal (first ingredient if it contains "Crystal")
            if rec.ingredients and "Crystal" in rec.ingredients[0]:
                rec.crystal = rec.ingredients.pop(0)
            # Now we should be at the NQ line
            if i < n and (m_nq := RE_NQ.match(lines[i].strip())):
                rec._set_product(m_nq.group(1), nq=True)
                i += 1
            else:
                # Malformed entry -- skip ahead without recording the recipe
                i += 1
                continue
            # Collect HQ lines (0-3 lines)
            while i < n and lines[i].lstrip().startswith("HQ"):
                m_hq = RE_HQ.match(lines[i].strip())
                if m_hq:
                    idx = int(m_hq.group(1))
                    rec._add_hq(idx, m_hq.group(2))
                i += 1
            recipes.append(rec)
            continue  # skip to next line without increment to avoid double increment
        # Fallback increment
        i += 1
    return recipes
# ----------------------------------------------------------------------------
# CSV writer
# ----------------------------------------------------------------------------
def write_csv(recipes: List[Recipe], csv_path: pathlib.Path) -> None:
    """Write parsed recipes to *csv_path*, creating parent dirs as needed."""
    header = [
        "category", "level", "product_name", "nq_yield", "hq1_yield",
        "hq2_yield", "hq3_yield", "crystal", "ingredients",
    ]
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    with csv_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(r.to_row() for r in recipes)
if __name__ == "__main__":
    # Parse the raw dataset and emit the legacy CSV, reporting the count.
    if not TXT_PATH.exists():
        raise SystemExit(f"Input file not found: {TXT_PATH}")
    recs = parse_file(TXT_PATH)
    write_csv(recs, CSV_PATH)
    print(f"Parsed {len(recs)} recipes -> {CSV_PATH.relative_to(PROJECT_ROOT)}")

View File

@@ -0,0 +1,210 @@
#!/usr/bin/env python3
"""Improved parser for Woodworking recipes -> Woodworking_v2.csv
This version follows the spec provided by the user:
Category: category name (without level range)
Level: recipe level (integer)
Sub-Crafts: JSON list of [name, level]
Name: product NQ name
Crystal: crystal element (Earth, Wind, etc.)
Key Item: key item name or null
Ingredients: JSON list of [name, quantity]
HQ Yields: JSON list ordered HQ1..HQ3 of [name, quantity] (nulls if N/A)
"""
from __future__ import annotations
import csv
import json
import pathlib
import re
from typing import Dict, List, Optional, Tuple
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
TXT_PATH = PROJECT_ROOT / "datasets/Woodworking.txt"
CSV_PATH = PROJECT_ROOT / "datasets/Woodworking_v2.csv"
RE_CATEGORY = re.compile(r"^([A-Za-z' ]+) \([0-9]+-[0-9]+\)$")
RE_MAIN = re.compile(r"Main Craft: Woodworking - \((\d+)\)")
RE_SUBCRAFTS = re.compile(r"Sub Craft\(s\): (.+)")
RE_KEY_ITEM = re.compile(r"Key Item: (.+)")
RE_NQ = re.compile(r"NQ:\s*(.+)")
RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
RE_ITEM_QTY = re.compile(r"(.+?)\s*(?:x|\+)(\d+)$")
def norm(s: str) -> str:
    """Collapse whitespace runs to single spaces and trim the ends."""
    return " ".join(s.split())
def split_item_qty(text: str) -> Tuple[str, int]:
    """Split "Foo x3" into ("Foo", 3); the quantity defaults to 1.

    BUGFIX: only an "xN" suffix denotes a quantity.  The previous pattern
    also consumed "+N", so "Bronze Leggings +1" lost its HQ suffix; per
    scripts/README.md (and recipes_to_csv_v2.py) "+1/+2/+3" is part of the
    item name and must be preserved.
    """
    cleaned = re.sub(r"\s+", " ", text.strip())
    m = re.match(r"(.+?)\s*x(\d+)$", cleaned)
    if m:
        return m.group(1), int(m.group(2))
    return cleaned, 1
class Recipe:
    """Accumulates one parsed recipe and renders it as a v2 CSV row."""

    __slots__ = (
        "category",
        "level",
        "subcrafts",
        "name",
        "crystal",
        "key_item",
        "ingredients",
        "hq_yields",
    )

    def __init__(self, category: str, level: int):
        self.category = category                      # e.g. "Amateur"
        self.level = level                            # recipe level
        self.subcrafts: List[Tuple[str, int]] = []    # [(craft, level), ...]
        self.name: str = ""                           # NQ product name
        self.crystal: str = ""                        # crystal element
        self.key_item: Optional[str] = None           # key item, if any
        self.ingredients: List[Tuple[str, int]] = []  # [(item, qty), ...]
        # index 0,1,2 for HQ1..3; (name, qty) or None
        self.hq_yields: List[Optional[Tuple[str, int]]] = [None, None, None]

    def row(self) -> List[str]:
        """Columns: category, level, subcrafts, name, crystal, key_item,
        ingredients, hq_yields (JSON-encoded where appropriate)."""
        def as_json(obj) -> str:
            return json.dumps(obj, ensure_ascii=False)

        return [
            self.category,
            str(self.level),
            as_json(self.subcrafts),
            self.name,
            self.crystal,
            self.key_item or "",
            as_json(self.ingredients),
            as_json(self.hq_yields),
        ]
# ---------------------------------------------------------------------------
def parse() -> List[Recipe]:
    """Parse TXT_PATH into Recipe objects.

    For each "Main Craft: Woodworking - (N)" block: read the optional
    Sub Craft(s)/Key Item metadata, then ingredients up to the "NQ:" line,
    then the NQ product name and up to three "HQx:" yield lines.
    """
    lines = TXT_PATH.read_text(encoding="utf-8", errors="ignore").splitlines()
    n = len(lines)
    i = 0
    current_category = ""
    recipes: List[Recipe] = []
    while i < n:
        line = lines[i].strip()
        # Category header
        m_cat = RE_CATEGORY.match(line)
        if m_cat:
            current_category = m_cat.group(1)
            i += 1
            continue
        # Main Craft start
        m_main = RE_MAIN.search(line)
        if m_main:
            level = int(m_main.group(1))
            rec = Recipe(current_category, level)
            i += 1
            # look ahead for optional Sub Craft(s) & Key Item lines
            while i < n:
                look = lines[i].strip()
                if not look:
                    i += 1
                    continue
                if RE_SUBCRAFTS.match(look):
                    sub_text = RE_SUBCRAFTS.match(look).group(1)
                    # Split by commas
                    for part in sub_text.split(','):
                        part = part.strip()
                        m_sc = re.match(r"([A-Za-z]+) - \((\d+)\)", part)
                        if m_sc:
                            rec.subcrafts.append((m_sc.group(1), int(m_sc.group(2))))
                    i += 1
                    continue
                if RE_KEY_ITEM.match(look):
                    rec.key_item = RE_KEY_ITEM.match(look).group(1)
                    i += 1
                    continue
                # If line starts with NQ: we stop metadata collection
                if look.startswith("NQ:") or look.startswith("HQ"):
                    break
                # Ingredient lines normally start with crystal or item names and are indented
                if not look.startswith("NQ:"):
                    # Ingredient collection will happen in separate loop
                    break
                i += 1
            # Collect ingredients until NQ line encountered
            while i < n and not lines[i].lstrip().startswith("NQ:"):
                ingr_line = lines[i].strip()
                if ingr_line and not ingr_line.endswith("-Icon.gif"):
                    name, qty = split_item_qty(ingr_line)
                    rec.ingredients.append((name, qty))
                i += 1
            # Determine crystal (must contain 'Crystal')
            for name, qty in rec.ingredients:
                if "Crystal" in name:
                    rec.crystal = name.split()[0]  # Earth Crystal -> Earth
                    break
            if rec.crystal:
                # Remove crystal from ingredient list
                rec.ingredients = [pair for pair in rec.ingredients if "Crystal" not in pair[0]]
            else:
                rec.crystal = "Unknown"
            # NQ line
            if i < n and (m_nq := RE_NQ.match(lines[i].strip())):
                rec.name, _ = split_item_qty(m_nq.group(1))
                i += 1
            else:
                i += 1
            # Skip blank lines before HQ entries
            while i < n and not lines[i].strip():
                i += 1
            # HQ lines
            while i < n and lines[i].lstrip().startswith("HQ"):
                m_hq = RE_HQ.match(lines[i].strip())
                if m_hq:
                    idx = int(m_hq.group(1)) - 1
                    name, qty = split_item_qty(m_hq.group(2))
                    rec.hq_yields[idx] = (name, qty)
                i += 1
            recipes.append(rec)
            continue
        i += 1
    return recipes
# ---------------------------------------------------------------------------
def main() -> None:
    """Parse the Woodworking dataset and write Woodworking_v2.csv."""
    recipes = parse()
    CSV_PATH.parent.mkdir(parents=True, exist_ok=True)
    header = [
        "category", "level", "subcrafts", "name",
        "crystal", "key_item", "ingredients", "hq_yields",
    ]
    with CSV_PATH.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(r.row() for r in recipes)
    print(f"Wrote {len(recipes)} recipes -> {CSV_PATH.relative_to(PROJECT_ROOT)}")


if __name__ == "__main__":
    main()