Initial commit

This commit is contained in:
Aodhan
2025-07-07 13:39:46 +01:00
commit cfa2eff6ef
69 changed files with 70452 additions and 0 deletions

88
scripts/README.md Normal file
View File

@@ -0,0 +1,88 @@
# Recipe ETL Scripts
This directory contains helper scripts for extracting Woodworking recipe data
from the raw **datasets/Woodworking.txt** file and loading it into the project
PostgreSQL database.
## File overview
| File | Purpose |
|------|---------|
| **woodworking_to_csv.py** | Legacy first-pass parser → `datasets/Woodworking.csv`. |
| **woodworking_to_csv_v2.py** | Improved parser that matches the spec (category, level, sub-crafts, ingredients, HQ yields, etc.) → `datasets/Woodworking_v2.csv`. |
| **recipes_to_csv_v2.py** | Generic parser. `python recipes_to_csv_v2.py <Craft>` processes one craft; use `python recipes_to_csv_v2.py --all` **or simply omit the argument** to parse every `.txt` file under `datasets/`, producing `datasets/<Craft>_v2.csv` for each. |
| **load_woodworking_to_db.py** | Loader for the legacy CSV (kept for reference). |
| **load_woodworking_v2_to_db.py** | Drops & recreates **recipes_woodworking** table and bulk-loads `Woodworking_v2.csv`. |
| **load_recipes_v2_to_db.py** | Generic loader. `python load_recipes_v2_to_db.py <Craft>` loads one craft; omit the argument to load **all** generated CSVs into their respective `recipes_<craft>` tables. |
| **requirements.txt** | Minimal Python dependencies for the scripts. |
| **venv/** | Local virtual-environment created by the setup steps below. |
## Prerequisites
* Python ≥ 3.9
* PostgreSQL instance reachable with credentials in `db.conf` at project root:
```ini
PSQL_HOST=…
PSQL_PORT=…
PSQL_USER=…
PSQL_PASSWORD=…
PSQL_DBNAME=…
```
## Quick start (Woodworking example)
```bash
# 1. From project root
cd scripts
# 2. Create & activate virtualenv (only once)
python3 -m venv venv
source venv/bin/activate
# 3. Install dependencies
pip install -r requirements.txt
# 4. Generate CSVs for **all** crafts
python recipes_to_csv_v2.py --all # or simply `python recipes_to_csv_v2.py`
# 5. Load all crafts into the DB (drops/recreates each table)
python load_recipes_v2_to_db.py
```
To work with a **single craft**, specify its name instead:
```bash
python recipes_to_csv_v2.py Smithing # generate Smithing_v2.csv
python load_recipes_v2_to_db.py Smithing # load only Smithing recipes
```
The parser and loader will output, e.g.:
```
Wrote 480 recipes -> datasets/Woodworking_v2.csv
Loaded recipes into new recipes_woodworking table.
```
## CSV schema (v2)
Column | Notes
------ | -----
`category` | Craft rank without level range (e.g. "Amateur")
`level` | Recipe level integer
`subcrafts` | JSON list `[["Smithing",2],["Alchemy",7]]`
`name` | NQ product name
`crystal` | Element used (Wind, Earth, etc.)
`key_item` | Required key item (blank if none)
`ingredients` | JSON list `[["Arrowwood Log",1]]`
`hq_yields` | JSON list HQ1-HQ3 e.g. `[["Arrowwood Lumber",6],["Arrowwood Lumber",9],["Arrowwood Lumber",12]]`
## Parsing rules
* Item quantities are detected only when the suffix uses an “x” (e.g. `Lumber x6`).
* Strings such as `Bronze Leggings +1` are treated as the **full item name**; the `+1/+2/+3` suffix is preserved.
## Developing / debugging
* Edit the parsers as needed, then rerun them to regenerate CSV.
* Feel free to add new scripts here; remember to update **requirements.txt** & this README.

View File

@@ -0,0 +1,167 @@
#!/usr/bin/env python3
"""Load <Craft>_v2.csv into PostgreSQL.
Usage:
python load_recipes_v2_to_db.py <CRAFT>
The script drop-creates table `recipes_<craft>` (lowercased) with the generic
v2 schema, then bulk-loads from the CSV produced by recipes_to_csv_v2.py.
"""
from __future__ import annotations
import argparse
import asyncio
import csv
import json
import pathlib
import re
from typing import Dict
import asyncpg
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
CONF_PATH = PROJECT_ROOT / "db.conf"
DATASETS_DIR = PROJECT_ROOT / "datasets"
RE_KEY = re.compile(r"^([A-Z0-9_]+)=(.*)$")
# ---------------------------------------------------------------------------
# Category mapping
# ---------------------------------------------------------------------------
# (name, inclusive lower bound, inclusive upper bound).  Ranges deliberately
# overlap; the first matching entry wins.
CATEGORY_RANGES = [
    ("Amateur", 1, 10),
    ("Recruit", 8, 20),
    ("Initiate", 18, 30),
    ("Novice", 28, 40),
    ("Apprentice", 38, 50),
    ("Journeyman", 48, 60),
    ("Craftsman", 58, 70),
    ("Artisan", 68, 80),
    ("Adept", 78, 90),
    ("Veteran", 88, 100),
    ("Expert", 98, 110),
    ("Authority", 111, 120),
]


def category_for_level(level: int) -> str:
    """Map a recipe level to its category name; "Unknown" if out of range."""
    matches = (name for name, lo, hi in CATEGORY_RANGES if lo <= level <= hi)
    return next(matches, "Unknown")
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def parse_db_conf(path: pathlib.Path) -> Dict[str, str]:
    """Read KEY=VALUE pairs from *path*, ignoring blanks and # comments.

    Surrounding single/double quotes are stripped from values.
    """
    conf: Dict[str, str] = {}
    for raw in path.read_text().splitlines():
        stripped = raw.strip()
        if not stripped or stripped.startswith('#'):
            continue
        m = re.match(r"^([A-Z0-9_]+)=(.*)$", stripped)
        if m is None:
            continue
        conf[m.group(1)] = m.group(2).strip().strip("'\"")
    return conf
async def recreate_table(conn: asyncpg.Connection, craft: str):
    """Drop and recreate the ``recipes_<craft>`` table with the v2 schema.

    The craft name is interpolated directly into DDL, so it is restricted to
    a plain SQL identifier first; this blocks SQL injection / malformed
    statements from an arbitrary CLI argument.

    Raises:
        ValueError: if *craft* is not a valid identifier.
    """
    if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", craft):
        raise ValueError(f"Invalid craft name: {craft!r}")
    table = f"recipes_{craft.lower()}"
    await conn.execute(f"DROP TABLE IF EXISTS {table};")
    await conn.execute(
        f"""
        CREATE TABLE {table} (
            id SERIAL PRIMARY KEY,
            category TEXT NOT NULL,
            level INT NOT NULL,
            subcrafts JSONB,
            name TEXT NOT NULL,
            crystal TEXT NOT NULL,
            key_item TEXT,
            ingredients JSONB,
            hq_yields JSONB
        );
        """
    )
async def insert_csv(conn: asyncpg.Connection, craft: str, csv_path: pathlib.Path):
    """Bulk-load one craft CSV into its recipes_<craft> table via COPY.

    The category column is recomputed from the level; JSON columns are
    round-tripped through json to normalise their text form.
    """
    table = f"recipes_{craft.lower()}"
    with csv_path.open(encoding="utf-8") as f:
        records = [
            (
                category_for_level(int(row["level"])),
                int(row["level"]),
                json.dumps(json.loads(row["subcrafts"] or "[]")),
                row["name"],
                row["crystal"],
                row["key_item"] or None,
                json.dumps(json.loads(row["ingredients"] or "[]")),
                json.dumps(json.loads(row["hq_yields"] or "[]")),
            )
            for row in csv.DictReader(f)
        ]
    await conn.copy_records_to_table(
        table,
        records=records,
        columns=[
            "category", "level", "subcrafts", "name",
            "crystal", "key_item", "ingredients", "hq_yields",
        ],
    )
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
async def process_craft(conn: asyncpg.Connection, craft: str):
    """Drop/recreate and bulk-load the table for a single craft.

    Skips (with a message) when the expected <Craft>_v2.csv is absent.
    """
    source = DATASETS_DIR / f"{craft}_v2.csv"
    if source.exists():
        await recreate_table(conn, craft)
        await insert_csv(conn, craft, source)
        print(f"Loaded {craft} -> recipes_{craft.lower()} table.")
    else:
        print(f"CSV not found for {craft}, skipping.")
async def main_async(craft: str | None):
    """Connect once, then load a single craft or every generated *_v2.csv."""
    conf = parse_db_conf(CONF_PATH)
    conn = await asyncpg.connect(
        host=conf["PSQL_HOST"],
        port=int(conf["PSQL_PORT"]),
        user=conf["PSQL_USER"],
        password=conf["PSQL_PASSWORD"],
        database=conf["PSQL_DBNAME"],
    )
    try:
        if not craft:
            # No craft given: scan the datasets dir for every generated CSV.
            for path in DATASETS_DIR.glob("*_v2.csv"):
                await process_craft(conn, path.stem.replace("_v2", ""))
        else:
            await process_craft(conn, craft)
    finally:
        await conn.close()
def main() -> None:
    """CLI entry point: parse the optional craft argument and run the loader."""
    parser = argparse.ArgumentParser(description="Load <Craft>_v2.csv into DB")
    parser.add_argument(
        "craft", nargs="?", help="Craft name; if omitted, load all *_v2.csv files"
    )
    ns = parser.parse_args()
    asyncio.run(main_async(ns.craft.strip() if ns.craft else None))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,146 @@
#!/usr/bin/env python3
"""Create recipes_woodworking table and load data from datasets/Woodworking.csv.
Usage:
python3 scripts/load_woodworking_to_db.py
The script reads database connection details from db.conf located at the project root.
It is idempotent: the table is created only if it doesn't already exist, then
new rows are inserted (it truncates beforehand to avoid duplicates).
"""
from __future__ import annotations
import asyncio
import csv
import pathlib
import re
from typing import Any, Dict, List, Optional
import asyncpg
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
CONF_PATH = PROJECT_ROOT / "db.conf"
CSV_PATH = PROJECT_ROOT / "datasets/Woodworking.csv"
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def parse_db_conf(path: pathlib.Path) -> Dict[str, str]:
    """Parse simple KEY=VALUE lines into a dict.

    Blank lines and # comments are skipped and surrounding quotes are
    stripped from values.  Raises RuntimeError when any required PSQL_*
    key is absent.
    """
    key_re = re.compile(r"^([A-Z0-9_]+)=(.*)$")
    conf: Dict[str, str] = {}
    for raw in path.read_text().splitlines():
        stripped = raw.strip()
        if not stripped or stripped.startswith("#"):
            continue
        if (match := key_re.match(stripped)):
            # Remove surrounding quotes if present
            conf[match.group(1)] = match.group(2).strip().strip("'\"")
    required = {"PSQL_HOST", "PSQL_PORT", "PSQL_USER", "PSQL_PASSWORD", "PSQL_DBNAME"}
    missing = required - conf.keys()
    if missing:
        raise RuntimeError(f"Missing keys in db.conf: {', '.join(sorted(missing))}")
    return conf
async def create_table(conn: asyncpg.Connection) -> None:
    """Create the recipes_woodworking table if it does not exist yet.

    Schema matches the legacy (v1) CSV: plain-text ingredients column and
    separate integer yield columns per quality tier.
    """
    await conn.execute(
        """
        CREATE TABLE IF NOT EXISTS recipes_woodworking (
            id SERIAL PRIMARY KEY,
            category TEXT NOT NULL,
            level INT NOT NULL,
            product_name TEXT NOT NULL,
            nq_yield INT,
            hq1_yield INT,
            hq2_yield INT,
            hq3_yield INT,
            crystal TEXT,
            ingredients TEXT
        );
        """
    )
async def truncate_table(conn: asyncpg.Connection) -> None:
    """Remove all existing rows so reloading does not create duplicates."""
    await conn.execute("TRUNCATE TABLE recipes_woodworking;")
async def insert_rows(conn: asyncpg.Connection, rows: List[Dict[str, str]]) -> None:
    """Bulk insert via the COPY protocol for speed.

    Blank strings become NULLs; numeric yield columns are converted to int.
    """
    records = [
        (
            row["category"],
            int(row["level"]),
            row["product_name"],
            _to_int_or_none(row["nq_yield"]),
            _to_int_or_none(row["hq1_yield"]),
            _to_int_or_none(row["hq2_yield"]),
            _to_int_or_none(row["hq3_yield"]),
            row["crystal"] or None,
            row["ingredients"] or None,
        )
        for row in rows
    ]
    await conn.copy_records_to_table(
        "recipes_woodworking",
        records=records,
        columns=[
            "category", "level", "product_name", "nq_yield", "hq1_yield",
            "hq2_yield", "hq3_yield", "crystal", "ingredients",
        ],
    )
def _to_int_or_none(s: str) -> Optional[int]:
    """Convert *s* to int, mapping blank/whitespace-only strings to None."""
    stripped = s.strip()
    if not stripped:
        return None
    return int(stripped)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
async def main() -> None:
    """Load datasets/Woodworking.csv into the recipes_woodworking table."""
    if not CSV_PATH.exists():
        raise SystemExit("CSV file not found. Run woodworking_to_csv.py first.")
    conf = parse_db_conf(CONF_PATH)
    conn = await asyncpg.connect(
        host=conf["PSQL_HOST"],
        port=int(conf["PSQL_PORT"]),
        user=conf["PSQL_USER"],
        password=conf["PSQL_PASSWORD"],
        database=conf["PSQL_DBNAME"],
    )
    try:
        await create_table(conn)
        await truncate_table(conn)
        with CSV_PATH.open(newline="", encoding="utf-8") as handle:
            rows = list(csv.DictReader(handle))
        await insert_rows(conn, rows)
        print(f"Inserted {len(rows)} rows into recipes_woodworking.")
    finally:
        await conn.close()


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""Load datasets/Woodworking_v2.csv into PostgreSQL (recipes_woodworking table).
Drops the old table if present and creates a new one matching the v2 schema.
"""
from __future__ import annotations
import asyncio
import csv
import json
import pathlib
import re
from typing import Any, Dict, List, Optional
import asyncpg
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
CONF_PATH = PROJECT_ROOT / "db.conf"
CSV_PATH = PROJECT_ROOT / "datasets/Woodworking_v2.csv"
RE_KEY = re.compile(r"^([A-Z0-9_]+)=(.*)$")
def parse_db_conf(path: pathlib.Path) -> Dict[str, str]:
    """Read KEY=VALUE pairs from *path*; blanks, # comments and quotes are
    handled the same way as the other loader scripts."""
    conf: Dict[str, str] = {}
    for raw in path.read_text().splitlines():
        entry = raw.strip()
        if not entry or entry.startswith("#"):
            continue
        if (m := re.match(r"^([A-Z0-9_]+)=(.*)$", entry)):
            conf[m.group(1)] = m.group(2).strip().strip("'\"")
    return conf
async def recreate_table(conn: asyncpg.Connection):
    """Drop any existing recipes_woodworking table and recreate it with the
    v2 schema (JSONB columns for subcrafts/ingredients/HQ yields)."""
    await conn.execute("DROP TABLE IF EXISTS recipes_woodworking;")
    await conn.execute(
        """
        CREATE TABLE recipes_woodworking (
            id SERIAL PRIMARY KEY,
            category TEXT NOT NULL,
            level INT NOT NULL,
            subcrafts JSONB,
            name TEXT NOT NULL,
            crystal TEXT NOT NULL,
            key_item TEXT,
            ingredients JSONB,
            hq_yields JSONB
        );
        """
    )
# (name, inclusive low, inclusive high); ranges deliberately overlap, and the
# first matching entry wins.
CATEGORY_RANGES = [
    ("Amateur", 1, 10),
    ("Recruit", 8, 20),
    ("Initiate", 18, 30),
    ("Novice", 28, 40),
    ("Apprentice", 38, 50),
    ("Journeyman", 48, 60),
    ("Craftsman", 58, 70),
    ("Artisan", 68, 80),
    ("Adept", 78, 90),
    ("Veteran", 88, 100),
    ("Expert", 98, 110),
    ("Authority", 111, 120),
]


def category_for_level(level: int) -> str:
    """Return the category name that includes the given level.

    If multiple ranges overlap, the first match in CATEGORY_RANGES is
    returned; levels outside every range map to "Unknown".
    """
    return next(
        (name for name, lo, hi in CATEGORY_RANGES if lo <= level <= hi),
        "Unknown",
    )
async def insert_csv(conn: asyncpg.Connection):
    """Bulk-load Woodworking_v2.csv into recipes_woodworking via COPY.

    The stored category is recomputed from the level rather than taken from
    the CSV; JSON columns are round-tripped to normalise their text form.
    """
    with CSV_PATH.open(encoding="utf-8") as f:
        records = [
            (
                category_for_level(int(row["level"])),
                int(row["level"]),
                json.dumps(json.loads(row["subcrafts"] or "[]")),  # jsonb text
                row["name"],
                row["crystal"],
                row["key_item"] or None,
                json.dumps(json.loads(row["ingredients"] or "[]")),  # jsonb text
                json.dumps(json.loads(row["hq_yields"] or "[]")),  # jsonb text
            )
            for row in csv.DictReader(f)
        ]
    await conn.copy_records_to_table(
        "recipes_woodworking",
        records=records,
        columns=[
            "category", "level", "subcrafts", "name",
            "crystal", "key_item", "ingredients", "hq_yields",
        ],
    )
async def main():
    """Recreate the v2 table and load the Woodworking_v2.csv into it."""
    if not CSV_PATH.exists():
        raise SystemExit("CSV v2 not found; run parser first.")
    settings = parse_db_conf(CONF_PATH)
    conn = await asyncpg.connect(
        host=settings["PSQL_HOST"],
        port=int(settings["PSQL_PORT"]),
        user=settings["PSQL_USER"],
        password=settings["PSQL_PASSWORD"],
        database=settings["PSQL_DBNAME"],
    )
    try:
        await recreate_table(conn)
        await insert_csv(conn)
        print("Loaded recipes into new recipes_woodworking table.")
    finally:
        await conn.close()


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,146 @@
"""Populate the spells table using scroll information from usable_items.
Assumptions
-----------
1. `usable_items` table has (at least) the following columns:
- item_name (text)
- description (text)
- type_description (text) where scrolls have the value `SCROLL`
2. The spell name can be derived from the item name by stripping common prefixes like
"Scroll of ", "Scroll: ", etc. This heuristic can be adjusted if necessary.
3. Job / level information is embedded in the description in patterns like
"RDM Lv. 1", "WHM Lv.75", etc. Multiple jobs may appear in one description.
4. The database URL is provided via the `DATABASE_URL` environment variable, e.g.
postgresql+psycopg2://user:password@host/dbname
Usage
-----
$ export DATABASE_URL=postgresql+psycopg2://...
$ python scripts/populate_spells_from_scrolls.py
The script will insert new rows or update existing rows in the `spells` table.
"""
from __future__ import annotations
import os
import re
from typing import Dict, List
from sqlalchemy import MetaData, Table, select, create_engine, update, insert
from sqlalchemy.engine import Engine
from sqlalchemy.orm import Session
# Job abbreviations recognised in scroll descriptions; these double as the
# spells table column names.
JOBS: List[str] = [
    "run", "whm", "blm", "rdm", "pld", "drk", "brd", "nin", "smn", "cor", "sch", "geo",
]
# Regex to capture patterns like "RDM Lv. 1" or "RDM Lv.1" (space optional)
JOB_LV_PATTERN = re.compile(r"([A-Z]{3})\s*Lv\.?\s*(\d+)")


def _derive_spell_name(scroll_name: str) -> str:
    """Convert a scroll item name to a spell name.

    Strips the first matching prefix among "Scroll of ", "Scroll: " and
    "Scroll "; otherwise returns the trimmed name unchanged.

    Examples
    --------
    >>> _derive_spell_name("Scroll of Fire")
    'Fire'
    >>> _derive_spell_name("Scroll: Cure IV")
    'Cure IV'
    """
    for prefix in ("Scroll of ", "Scroll: ", "Scroll "):
        if scroll_name.startswith(prefix):
            return scroll_name[len(prefix):].strip()
    return scroll_name.strip()


def _parse_job_levels(description: str) -> Dict[str, int]:
    """Extract {job: level} pairs from a description string.

    Three-letter abbreviations not listed in JOBS are ignored; later
    occurrences of the same job overwrite earlier ones.
    """
    return {
        job.lower(): int(level)
        for job, level in JOB_LV_PATTERN.findall(description)
        if job.lower() in JOBS
    }
from pathlib import Path


def _get_engine() -> Engine:
    """Return SQLAlchemy engine using DATABASE_URL or db.conf."""
    url = os.getenv("DATABASE_URL")
    if url:
        return create_engine(url)
    # Fall back to building a URL from db.conf at the project root.
    conf_path = Path(__file__).resolve().parents[1] / "db.conf"
    if not conf_path.exists():
        raise RuntimeError("DATABASE_URL env var not set and db.conf not found")
    cfg: Dict[str, str] = {}
    with conf_path.open() as fh:
        for raw in fh:
            entry = raw.strip()
            if not entry or entry.startswith("#") or "=" not in entry:
                continue
            key, _, value = entry.partition("=")
            cfg[key.strip()] = value.strip().strip("'\"")  # remove quotes if any
    try:
        url = (
            f"postgresql+psycopg2://{cfg['PSQL_USER']}:{cfg['PSQL_PASSWORD']}@"
            f"{cfg['PSQL_HOST']}:{cfg.get('PSQL_PORT', '5432')}/{cfg['PSQL_DBNAME']}"
        )
    except KeyError as e:
        raise RuntimeError(f"Missing key in db.conf: {e}")
    return create_engine(url)
def main() -> None:
    """Upsert spell rows derived from SCROLL-type usable items.

    For each scroll, derive the spell name and job/level pairs, then either
    update the existing spells row or insert a new one.
    """
    engine = _get_engine()
    meta = MetaData()
    usable_items = Table("usable_items", meta, autoload_with=engine)
    spells = Table("spells", meta, autoload_with=engine)
    with Session(engine) as session:
        # Fetch scroll items.
        # NOTE(review): the module docstring says the column is `item_name`,
        # but this selects `name` -- confirm against the actual schema.
        scroll_rows = session.execute(
            select(
                usable_items.c.name,
                usable_items.c.description
            ).where(usable_items.c.type_description == "SCROLL")
        ).all()
        for name, description in scroll_rows:
            spell_name = _derive_spell_name(name)
            job_levels = _parse_job_levels(description or "")
            existing = session.execute(
                select(spells.c.name).where(spells.c.name == spell_name)
            ).first()
            if existing:
                # BUGFIX: update().values() with an empty mapping raises in
                # SQLAlchemy, so skip scrolls whose description yielded no
                # job/level pairs instead of crashing mid-run.
                if job_levels:
                    session.execute(
                        update(spells)
                        .where(spells.c.name == spell_name)
                        .values(**job_levels)
                    )
            else:
                # Insert with explicit NULLs for jobs not mentioned.
                values = {job: None for job in JOBS}
                values.update(job_levels)
                values["name"] = spell_name
                session.execute(insert(spells).values(**values))
        session.commit()
    print(f"Processed {len(scroll_rows)} scrolls. Spells table updated.")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""Generic recipe parser for crafting disciplines -> <Craft>_v2.csv
Usage:
python recipes_to_csv_v2.py <CRAFT>
Where <CRAFT> matches the name of the .txt file in the datasets directory,
for example `Woodworking`, `Smithing`, `Goldsmithing`, `Alchemy`, etc.
The script produces a CSV `<Craft>_v2.csv` inside the datasets directory
having the following columns (identical to the Woodworking v2 spec):
category, level, subcrafts, name, crystal, key_item, ingredients, hq_yields
See scripts/README.md for details of each column.
"""
from __future__ import annotations
import argparse
import csv
import json
import pathlib
import re
from typing import List, Optional, Tuple
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
DATASETS_DIR = PROJECT_ROOT / "datasets"
# ---------------------------------------------------------------------------
# Regex helpers (compiled at runtime where craft-dependent)
# ---------------------------------------------------------------------------
RE_CATEGORY = re.compile(r"^([A-Za-z' ]+) \([0-9]+-[0-9]+\)$")
RE_SUBCRAFTS = re.compile(r"Sub Craft\(s\): (.+)")
RE_KEY_ITEM = re.compile(r"Key Item: (.+)")
RE_NQ = re.compile(r"NQ:\s*(.+)")
RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
# Quantity delimiter is strictly 'xN' (e.g., "Lumber x6").
# Patterns like "+1" denote HQ variant and should be preserved in the name.
RE_ITEM_QTY = re.compile(r"(.+?)\s*x(\d+)$", re.IGNORECASE)
# ---------------------------------------------------------------------------
# Helper functions / dataclasses
# ---------------------------------------------------------------------------
def norm(s: str) -> str:
    """Collapse every run of whitespace to a single space and trim the ends."""
    return " ".join(s.split())
def split_item_qty(text: str) -> Tuple[str, int]:
    """Split "Foo x3" → ("Foo", 3). Quantity defaults to 1.

    Only an "xN"/"XN" suffix is a quantity; "+N" stays part of the name.
    """
    cleaned = re.sub(r"\s+", " ", text.strip())
    m = re.match(r"(.+?)\s*x(\d+)$", cleaned, re.IGNORECASE)
    if m is None:
        return cleaned, 1
    return m.group(1), int(m.group(2))
class Recipe:
    """Mutable accumulator for one parsed recipe, rendered via row()."""

    __slots__ = (
        "category",
        "level",
        "subcrafts",
        "name",
        "crystal",
        "key_item",
        "ingredients",
        "hq_yields",
    )

    def __init__(self, category: str, level: int):
        self.category = category                       # e.g. "Amateur"
        self.level = level                             # main-craft level
        self.subcrafts: List[Tuple[str, int]] = []     # [(craft, level), ...]
        self.name: str = ""                            # NQ product name
        self.crystal: str = ""                         # crystal element
        self.key_item: Optional[str] = None            # required key item
        self.ingredients: List[Tuple[str, int]] = []   # [(item, qty), ...]
        # HQ1..HQ3 yields; None where the dataset lists no HQ result.
        self.hq_yields: List[Optional[Tuple[str, int]]] = [None, None, None]

    def row(self) -> List[str]:
        """Serialise to the CSV column order used by the writer."""
        sub_json = json.dumps(self.subcrafts, ensure_ascii=False)
        ing_json = json.dumps(self.ingredients, ensure_ascii=False)
        hq_json = json.dumps(self.hq_yields, ensure_ascii=False)
        return [
            self.category,
            str(self.level),
            sub_json,
            self.name,
            self.crystal,
            self.key_item or "",
            ing_json,
            hq_json,
        ]
# ---------------------------------------------------------------------------
# Core parse routine
# ---------------------------------------------------------------------------
def parse(txt_path: pathlib.Path, craft_name: str) -> List[Recipe]:
    """Parse a crafting text file into Recipe objects.

    Parameters
    ----------
    txt_path : path to the raw ``datasets/<Craft>.txt`` file.
    craft_name : craft whose "Main Craft:" lines carry the recipe level.

    The parsing strategy is now:
    1. "Main Craft:" marks the *metadata* for the upcoming recipe level, optional
       sub-crafts, and key item.
    2. Ingredient lines follow until an "NQ:" line is reached.  The first recipe
       ingredient that contains the word "Crystal" determines the crystal type
       and is removed from the ingredients list.
    3. An "NQ:" line finalises the recipe: we capture the product name and then
       look ahead for up to three "HQx:" lines that describe HQ yields.

    Crucially, *"NQ:" now acts as the definitive boundary between recipes*.
    This allows the parser to cope with datasets where multiple successive
    "Main Craft:" lines appear without an intervening "NQ:".

    NOTE(review): each recipe's metadata/ingredients are collected from the
    block *after* its "NQ:" line; this assumes the dataset lists metadata
    following the product line -- confirm against the raw .txt layout,
    otherwise fields would be shifted by one recipe.
    """
    lines = txt_path.read_text(encoding="utf-8", errors="ignore").splitlines()
    n = len(lines)
    i = 0
    current_category: str = ""
    recipes: List[Recipe] = []
    # Craft-specific regex (compiled once per run)
    RE_MAIN = re.compile(rf"Main Craft: {re.escape(craft_name)} - \(([^)]*)\)")
    while i < n:
        line = lines[i].strip()
        # 1) Category header
        if (m_cat := RE_CATEGORY.match(line)):
            current_category = m_cat.group(1)
            i += 1
            continue
        # 2) Start of a recipe line beginning with "NQ:"
        if not line.startswith("NQ:"):
            i += 1
            continue
        # -------------------------------
        # New recipe initialised
        # -------------------------------
        rec = Recipe(current_category, level=0)
        rec.name, _ = split_item_qty(line[len("NQ:"):].strip())
        # Collect block until next NQ or EOF
        block_lines: List[str] = []
        i += 1
        while i < n and not lines[i].lstrip().startswith("NQ:"):
            block_lines.append(lines[i])
            i += 1
        # ------------------------------------
        # Parse metadata & ingredients in block
        # ------------------------------------
        for raw in block_lines:
            look = raw.strip()
            if not look:
                continue
            # Skip icon decorator lines early
            if look.endswith("-Icon.gif"):
                continue
            # Main Craft level
            if (m_main := RE_MAIN.search(look)):
                level_raw = m_main.group(1)
                # Handle ranges like "115~120" or "115-120" by taking the lower bound
                m_range = re.match(r"(\d+)", level_raw)
                if m_range:
                    rec.level = int(m_range.group(1))
                else:
                    rec.level = 0
                continue
            # Sub crafts
            if (m_sc := RE_SUBCRAFTS.match(look)):
                for part in m_sc.group(1).split(','):
                    part = part.strip()
                    if m := re.match(r"([A-Za-z]+) - \((\d+)\)", part):
                        rec.subcrafts.append((m.group(1), int(m.group(2))))
                continue
            # Key item
            if (m_ki := RE_KEY_ITEM.match(look)):
                rec.key_item = m_ki.group(1)
                continue
            # HQ lines
            if look.startswith("HQ"):
                if (m_hq := RE_HQ.match(look)):
                    idx = int(m_hq.group(1)) - 1
                    name, qty = split_item_qty(m_hq.group(2))
                    rec.hq_yields[idx] = (name, qty)
                continue
            # Otherwise treat as ingredient
            name, qty = split_item_qty(look)
            rec.ingredients.append((name, qty))
        # Determine crystal & clean ingredient list
        for name, qty in rec.ingredients:
            if "Crystal" in name:
                rec.crystal = name.split()[0]  # "Wind Crystal" -> "Wind"
                break
        if rec.crystal:
            rec.ingredients = [p for p in rec.ingredients if "Crystal" not in p[0]]
        else:
            rec.crystal = "Unknown"
        recipes.append(rec)
    return recipes
# ---------------------------------------------------------------------------
# Entrypoint
# ---------------------------------------------------------------------------
def main() -> None:
    """CLI entrypoint.

    Usage examples:
        python recipes_to_csv_v2.py Woodworking   # one craft
        python recipes_to_csv_v2.py --all         # every .txt in datasets/
        python recipes_to_csv_v2.py               # same as --all
    """
    argp = argparse.ArgumentParser(description="Parse <Craft>.txt into CSV.")
    argp.add_argument("craft", nargs="?", help="Craft name, e.g. Woodworking, Smithing")
    argp.add_argument("--all", action="store_true", help="Process every .txt file in datasets/")
    ns = argp.parse_args()
    # Determine which crafts to process (omitting the arg implies --all).
    if ns.craft and not ns.all:
        crafts = [ns.craft.strip()]
    else:
        crafts = [p.stem for p in DATASETS_DIR.glob("*.txt")]
        if not crafts:
            raise SystemExit(f"No .txt files found in {DATASETS_DIR}")
    header = [
        "category", "level", "subcrafts", "name",
        "crystal", "key_item", "ingredients", "hq_yields",
    ]
    for craft in crafts:
        txt_path = DATASETS_DIR / f"{craft}.txt"
        if not txt_path.exists():
            print(f"[WARN] Dataset file not found: {txt_path}")
            continue
        recipes = parse(txt_path, craft)
        csv_path = DATASETS_DIR / f"{craft}_v2.csv"
        csv_path.parent.mkdir(parents=True, exist_ok=True)
        with csv_path.open("w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(header)
            writer.writerows(r.row() for r in recipes)
        rel = csv_path.relative_to(PROJECT_ROOT)
        print(f"Wrote {len(recipes)} recipes -> {rel}")


if __name__ == "__main__":
    main()

2
scripts/requirements.txt Normal file
View File

@@ -0,0 +1,2 @@
asyncpg==0.29.0
python-dotenv==1.0.1

View File

@@ -0,0 +1,201 @@
#!/usr/bin/env python3
"""Parse the semi-structured datasets/Woodworking.txt file and export it as a CSV.
The output CSV will be written to datasets/Woodworking.csv with the following columns:
category Text group header e.g. Amateur (1-10)
level Woodworking skill level for the recipe (integer)
product_name Produced item (without quantity suffix)
nq_yield Quantity produced on a normal quality synth (may be empty)
hq1_yield Quantity produced on HQ1 synth (may be empty)
hq2_yield Quantity produced on HQ2 synth (may be empty)
hq3_yield Quantity produced on HQ3 synth (may be empty)
crystal Crystal used for the synth (Wind Crystal, Earth Crystal, etc.)
ingredients Semi-colon-separated list of remaining ingredients (excluding the crystal)
Run the script from the project root:
python3 scripts/woodworking_to_csv.py
"""
from __future__ import annotations
import csv
import pathlib
import re
from typing import List, Optional
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
TXT_PATH = PROJECT_ROOT / "datasets/Woodworking.txt"
CSV_PATH = PROJECT_ROOT / "datasets/Woodworking.csv"
# --- Regex patterns ---------------------------------------------------------
RE_CATEGORY = re.compile(r"^[A-Za-z' ]+ \([0-9]+-[0-9]+\)$")
RE_MAIN_CRAFT = re.compile(r"Main Craft: Woodworking - \((\d+)\)")
RE_NQ = re.compile(r"NQ:\s*(.+)")
RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
RE_ITEM_QTY = re.compile(r"(.+?)\s+x(\d+)$")
def normalise_whitespace(text: str) -> str:
    """Collapse internal runs of whitespace and trim."""
    return " ".join(text.split())
class Recipe:
    """One parsed Woodworking recipe, accumulated field by field.

    Yields are kept as strings ("" when absent) to round-trip cleanly
    through the CSV.
    """

    __slots__ = (
        "category",
        "level",
        "product_name",
        "nq_yield",
        "hq1_yield",
        "hq2_yield",
        "hq3_yield",
        "crystal",
        "ingredients",
    )

    def __init__(self, category: str, level: int):
        self.category: str = category
        self.level: int = level
        self.product_name: str = ""
        self.nq_yield: str = ""
        self.hq1_yield: str = ""
        self.hq2_yield: str = ""
        self.hq3_yield: str = ""
        self.crystal: str = ""
        self.ingredients: List[str] = []

    # ------------------------------------------------------------------
    # Helper methods
    # ------------------------------------------------------------------
    def _set_product(self, text: str, nq: bool = False) -> None:
        """Record the product name (and the NQ yield when *nq* is true)."""
        name, qty = self._split_item_qty(text)
        self.product_name = name
        if nq:
            self.nq_yield = qty

    def _add_hq(self, idx: int, text: str) -> None:
        """Store the HQ yield quantity for tier *idx* (1-3)."""
        _unused, qty = self._split_item_qty(text)
        slot = {1: "hq1_yield", 2: "hq2_yield", 3: "hq3_yield"}.get(idx)
        if slot is not None:
            setattr(self, slot, qty)

    @staticmethod
    def _split_item_qty(text: str) -> tuple[str, str]:
        """Split "Foo x3" into ("Foo", "3"); quantity is "" when absent."""
        cleaned = re.sub(r"\s+", " ", text.strip())
        m = re.match(r"(.+?)\s+x(\d+)$", cleaned)
        if m is None:
            return cleaned, ""
        return m.group(1), m.group(2)

    def to_row(self) -> List[str]:
        """Serialise to the legacy CSV column order."""
        return [
            self.category,
            str(self.level),
            self.product_name,
            self.nq_yield,
            self.hq1_yield,
            self.hq2_yield,
            self.hq3_yield,
            self.crystal,
            "; ".join(self.ingredients),
        ]
# ----------------------------------------------------------------------------
# Parsing logic
# ----------------------------------------------------------------------------
def parse_file(txt_path: pathlib.Path) -> List[Recipe]:
    """Parse datasets/Woodworking.txt into a list of Recipe objects.

    Walks the file line by line, tracking the current category header and,
    for each "Main Craft:" block, collecting its ingredients, the NQ product
    line and up to three HQ yield lines.
    """
    recipes: List[Recipe] = []
    current_category = ""
    lines = txt_path.read_text(encoding="utf-8", errors="ignore").splitlines()
    i = 0
    n = len(lines)
    while i < n:
        line = lines[i].strip()
        # Update category headers e.g. "Amateur (1-10)"
        if RE_CATEGORY.match(line):
            current_category = line
            i += 1
            continue
        # Detect start of a recipe block
        m_main = RE_MAIN_CRAFT.search(line)
        if m_main:
            level = int(m_main.group(1))
            rec = Recipe(current_category, level)
            i += 1
            # Collect ingredients until we hit the NQ line
            while i < n and not lines[i].lstrip().startswith("NQ:"):
                ing_line = lines[i].strip()
                if ing_line:
                    rec.ingredients.append(normalise_whitespace(ing_line))
                i += 1
            # Extract crystal (first ingredient if it contains "Crystal")
            if rec.ingredients and "Crystal" in rec.ingredients[0]:
                rec.crystal = rec.ingredients.pop(0)
            # Now we should be at the NQ line
            if i < n and (m_nq := RE_NQ.match(lines[i].strip())):
                rec._set_product(m_nq.group(1), nq=True)
                i += 1
            else:
                # Malformed entry -- skip ahead without recording the recipe
                i += 1
                continue
            # Collect HQ lines (0-3 lines)
            while i < n and lines[i].lstrip().startswith("HQ"):
                m_hq = RE_HQ.match(lines[i].strip())
                if m_hq:
                    idx = int(m_hq.group(1))
                    rec._add_hq(idx, m_hq.group(2))
                i += 1
            recipes.append(rec)
            continue  # skip to next line without increment to avoid double increment
        # Fallback increment
        i += 1
    return recipes
# ----------------------------------------------------------------------------
# CSV writer
# ----------------------------------------------------------------------------
def write_csv(recipes: List[Recipe], csv_path: pathlib.Path) -> None:
    """Write parsed recipes to *csv_path*, creating parent dirs as needed."""
    header = [
        "category", "level", "product_name", "nq_yield", "hq1_yield",
        "hq2_yield", "hq3_yield", "crystal", "ingredients",
    ]
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    with csv_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(r.to_row() for r in recipes)
if __name__ == "__main__":
    # Parse the raw dataset and emit the legacy CSV, reporting the count.
    if not TXT_PATH.exists():
        raise SystemExit(f"Input file not found: {TXT_PATH}")
    recs = parse_file(TXT_PATH)
    write_csv(recs, CSV_PATH)
    print(f"Parsed {len(recs)} recipes -> {CSV_PATH.relative_to(PROJECT_ROOT)}")

View File

@@ -0,0 +1,210 @@
#!/usr/bin/env python3
"""Improved parser for Woodworking recipes -> Woodworking_v2.csv
This version follows the spec provided by the user:
Category: category name (without level range)
Level: recipe level (integer)
Sub-Crafts: JSON list of [name, level]
Name: product NQ name
Crystal: crystal element (Earth, Wind, etc.)
Key Item: key item name or null
Ingredients: JSON list of [name, quantity]
HQ Yields: JSON list ordered HQ1..HQ3 of [name, quantity] (nulls if N/A)
"""
from __future__ import annotations
import csv
import json
import pathlib
import re
from typing import Dict, List, Optional, Tuple
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
TXT_PATH = PROJECT_ROOT / "datasets/Woodworking.txt"
CSV_PATH = PROJECT_ROOT / "datasets/Woodworking_v2.csv"
RE_CATEGORY = re.compile(r"^([A-Za-z' ]+) \([0-9]+-[0-9]+\)$")
RE_MAIN = re.compile(r"Main Craft: Woodworking - \((\d+)\)")
RE_SUBCRAFTS = re.compile(r"Sub Craft\(s\): (.+)")
RE_KEY_ITEM = re.compile(r"Key Item: (.+)")
RE_NQ = re.compile(r"NQ:\s*(.+)")
RE_HQ = re.compile(r"HQ(\d):\s*(.+)")
RE_ITEM_QTY = re.compile(r"(.+?)\s*(?:x|\+)(\d+)$")
def norm(s: str) -> str:
    """Collapse whitespace runs to single spaces and trim the ends."""
    return " ".join(s.split())
def split_item_qty(text: str) -> Tuple[str, int]:
    """Split "Foo x3" into ("Foo", 3); the quantity defaults to 1.

    BUGFIX: only an "xN" suffix denotes a quantity.  The previous pattern
    also consumed "+N", so "Bronze Leggings +1" lost its HQ suffix; per
    scripts/README.md (and recipes_to_csv_v2.py) "+1/+2/+3" is part of the
    item name and must be preserved.
    """
    cleaned = re.sub(r"\s+", " ", text.strip())
    m = re.match(r"(.+?)\s*x(\d+)$", cleaned)
    if m:
        return m.group(1), int(m.group(2))
    return cleaned, 1
class Recipe:
    """Accumulates one parsed recipe and renders it as a v2 CSV row."""

    __slots__ = (
        "category",
        "level",
        "subcrafts",
        "name",
        "crystal",
        "key_item",
        "ingredients",
        "hq_yields",
    )

    def __init__(self, category: str, level: int):
        self.category = category                      # e.g. "Amateur"
        self.level = level                            # recipe level
        self.subcrafts: List[Tuple[str, int]] = []    # [(craft, level), ...]
        self.name: str = ""                           # NQ product name
        self.crystal: str = ""                        # crystal element
        self.key_item: Optional[str] = None           # key item, if any
        self.ingredients: List[Tuple[str, int]] = []  # [(item, qty), ...]
        # index 0,1,2 for HQ1..3; (name, qty) or None
        self.hq_yields: List[Optional[Tuple[str, int]]] = [None, None, None]

    def row(self) -> List[str]:
        """Columns: category, level, subcrafts, name, crystal, key_item,
        ingredients, hq_yields (JSON-encoded where appropriate)."""
        def as_json(obj) -> str:
            return json.dumps(obj, ensure_ascii=False)

        return [
            self.category,
            str(self.level),
            as_json(self.subcrafts),
            self.name,
            self.crystal,
            self.key_item or "",
            as_json(self.ingredients),
            as_json(self.hq_yields),
        ]
# ---------------------------------------------------------------------------
def parse() -> List[Recipe]:
    """Parse TXT_PATH into Recipe objects.

    For each "Main Craft: Woodworking - (N)" block: read the optional
    Sub Craft(s)/Key Item metadata, then ingredients up to the "NQ:" line,
    then the NQ product name and up to three "HQx:" yield lines.
    """
    lines = TXT_PATH.read_text(encoding="utf-8", errors="ignore").splitlines()
    n = len(lines)
    i = 0
    current_category = ""
    recipes: List[Recipe] = []
    while i < n:
        line = lines[i].strip()
        # Category header
        m_cat = RE_CATEGORY.match(line)
        if m_cat:
            current_category = m_cat.group(1)
            i += 1
            continue
        # Main Craft start
        m_main = RE_MAIN.search(line)
        if m_main:
            level = int(m_main.group(1))
            rec = Recipe(current_category, level)
            i += 1
            # look ahead for optional Sub Craft(s) & Key Item lines
            while i < n:
                look = lines[i].strip()
                if not look:
                    i += 1
                    continue
                if RE_SUBCRAFTS.match(look):
                    sub_text = RE_SUBCRAFTS.match(look).group(1)
                    # Split by commas
                    for part in sub_text.split(','):
                        part = part.strip()
                        m_sc = re.match(r"([A-Za-z]+) - \((\d+)\)", part)
                        if m_sc:
                            rec.subcrafts.append((m_sc.group(1), int(m_sc.group(2))))
                    i += 1
                    continue
                if RE_KEY_ITEM.match(look):
                    rec.key_item = RE_KEY_ITEM.match(look).group(1)
                    i += 1
                    continue
                # If line starts with NQ: we stop metadata collection
                if look.startswith("NQ:") or look.startswith("HQ"):
                    break
                # Ingredient lines normally start with crystal or item names and are indented
                if not look.startswith("NQ:"):
                    # Ingredient collection will happen in separate loop
                    break
                i += 1
            # Collect ingredients until NQ line encountered
            while i < n and not lines[i].lstrip().startswith("NQ:"):
                ingr_line = lines[i].strip()
                if ingr_line and not ingr_line.endswith("-Icon.gif"):
                    name, qty = split_item_qty(ingr_line)
                    rec.ingredients.append((name, qty))
                i += 1
            # Determine crystal (must contain 'Crystal')
            for name, qty in rec.ingredients:
                if "Crystal" in name:
                    rec.crystal = name.split()[0]  # Earth Crystal -> Earth
                    break
            if rec.crystal:
                # Remove crystal from ingredient list
                rec.ingredients = [pair for pair in rec.ingredients if "Crystal" not in pair[0]]
            else:
                rec.crystal = "Unknown"
            # NQ line
            if i < n and (m_nq := RE_NQ.match(lines[i].strip())):
                rec.name, _ = split_item_qty(m_nq.group(1))
                i += 1
            else:
                i += 1
            # Skip blank lines before HQ entries
            while i < n and not lines[i].strip():
                i += 1
            # HQ lines
            while i < n and lines[i].lstrip().startswith("HQ"):
                m_hq = RE_HQ.match(lines[i].strip())
                if m_hq:
                    idx = int(m_hq.group(1)) - 1
                    name, qty = split_item_qty(m_hq.group(2))
                    rec.hq_yields[idx] = (name, qty)
                i += 1
            recipes.append(rec)
            continue
        i += 1
    return recipes
# ---------------------------------------------------------------------------
def main() -> None:
    """Parse the Woodworking dataset and write Woodworking_v2.csv."""
    recipes = parse()
    CSV_PATH.parent.mkdir(parents=True, exist_ok=True)
    header = [
        "category", "level", "subcrafts", "name",
        "crystal", "key_item", "ingredients", "hq_yields",
    ]
    with CSV_PATH.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(r.row() for r in recipes)
    print(f"Wrote {len(recipes)} recipes -> {CSV_PATH.relative_to(PROJECT_ROOT)}")


if __name__ == "__main__":
    main()