# llm-gateway/packages/fine-tuner/scripts/train_blog_v6.py

#!/usr/bin/env python3
"""
fo-blog-v6 Training Script
===========================
Fixes all root causes of fo-blog-v5 mode collapse:

ROOT CAUSES OF v5 FAILURE:
  1. System-prompt mismatch: trained with 1-line prompt, inference sends 2000-char prompt
  2. Wrong training data: 15 JSON stub examples instead of real blog articles
  3. Over-training: 15 epochs × 15 examples = catastrophic overfitting
  4. Too aggressive LoRA: r=32, α=64 → memorized training set

FIXES IN v6:
  1. Uses EXACT system_prompt from LLM gateway DB (same prompt used to generate articles)
  2. Loads 280 real blog articles from llm_gateway.learning_corpus (avg 7182 chars)
  3. 3 epochs only (was 15) — standard SFT
  4. Conservative LoRA: r=16, α=32, dropout=0.1
  5. Lower learning rate: 5e-5 (was 2e-4)
  6. Q4_K_M quantization (was Q5_K_M — same quality, 15% smaller)

Usage:
  cd /Users/renefichtmueller/Desktop/Claude Code/llm-gateway/packages/fine-tuner
  python3 scripts/train_blog_v6.py

Requires:
  - SSH access to Erik (82.165.222.127) with port forwarding
  - Ollama running on Mac Studio (:11434)
  - HuggingFace model cache (Qwen/Qwen2.5-7B-Instruct)
  - pip: transformers trl peft torch datasets psycopg2-binary pyyaml
"""

import json
import os
import subprocess
import sys
import time
from pathlib import Path

import yaml

# Package root: the parent of the directory that contains this script.
REPO_ROOT = Path(__file__).parent.parent
# Training hyper-parameters, llama.cpp tool paths and output directories.
CONFIG_FILE = REPO_ROOT.joinpath("config", "fo-blog-v6.yaml")
# JSONL cache of the corpus fetched from the gateway DB (one example per line).
DATA_CACHE = REPO_ROOT.joinpath("data", "fo-blog-v6-training.jsonl")
# Name used for output sub-directories, GGUF file names and the Ollama model.
MODEL_NAME = "fo-blog-v6"
# Local end of the SSH port-forward to the Postgres instance on "erik".
TUNNEL_PORT = 15432
# Optional extra corpus; load_mega_prose() skips it when the file is absent.
MEGA_DATASET = Path(
    "/Users/renefichtmueller/Desktop/BlogLLM-v5-Mega-Training/mega-training-dataset.jsonl"
)

# ──────────────────────────────────────────────
# 1. Load config
# ──────────────────────────────────────────────
# The YAML config is parsed once, at import time; every stage below reads it.
with open(CONFIG_FILE) as config_stream:
    cfg = yaml.safe_load(config_stream)

# Shorthand handles for the config sections used throughout the script.
training_cfg, llama_cfg = cfg["training"], cfg["llama_cpp"]
sft_cfg = training_cfg["sft"]  # SFT hyper-parameters nested under "training"
output_cfg = cfg["output"]


# ──────────────────────────────────────────────
# 2. Fetch training data from Erik via SSH tunnel
# ──────────────────────────────────────────────
def _wait_for_tunnel(tunnel: "subprocess.Popen", port: int, timeout: float = 15.0) -> None:
    """Block until 127.0.0.1:``port`` accepts TCP connections.

    Args:
        tunnel: The running ``ssh -N -L ...`` process.
        port: Local port that the tunnel is expected to listen on.
        timeout: Maximum number of seconds to wait.

    Raises:
        RuntimeError: If the ssh process exits before the port opens, or the
            port is still closed after ``timeout`` seconds.
    """
    import socket  # local import: only needed while the tunnel comes up

    deadline = time.monotonic() + timeout
    while True:
        # Probe the port first so an already-listening forward is accepted
        # immediately, regardless of how long ssh itself needs.
        try:
            with socket.create_connection(("127.0.0.1", port), timeout=1):
                return
        except OSError:
            pass
        if tunnel.poll() is not None:
            raise RuntimeError(
                f"SSH tunnel to erik exited early (exit code {tunnel.returncode})"
            )
        if time.monotonic() >= deadline:
            raise RuntimeError(
                f"SSH tunnel did not open local port {port} within {timeout:.0f}s"
            )
        time.sleep(0.2)


def fetch_training_data() -> list[dict]:
    """Return the approved fo-blog-v1 corpus, from the cache or the gateway DB.

    If ``DATA_CACHE`` exists it is parsed (JSONL, blank lines ignored) and
    returned as-is. Otherwise an SSH port-forward to the host alias ``erik``
    is opened, approved ``fo-blog-v1`` rows whose output is longer than 500
    characters are read from ``learning_corpus`` (best ``quality_score``
    first), de-duplicated by the first 100 characters of the output, written
    to ``DATA_CACHE`` and returned.

    Environment:
        LLM_GATEWAY_DB_PASSWORD: Optional override for the database password.

    Returns:
        A list of dicts with the keys ``system_prompt``, ``input_text``,
        ``output_text`` and ``quality_score``.

    Raises:
        RuntimeError: If the SSH tunnel cannot be established.
        psycopg2.Error: If connecting to or querying the database fails.
    """
    if DATA_CACHE.exists():
        print(f"  Using cached training data: {DATA_CACHE}")
        # The cache is written with ensure_ascii=False, so read it as UTF-8
        # instead of relying on the locale's default encoding.
        with open(DATA_CACHE, encoding="utf-8") as f:
            data = [json.loads(line) for line in f if line.strip()]
        print(f"  Loaded {len(data)} examples from cache")
        return data

    print("  Opening SSH tunnel to Erik (82.165.222.127:5432)...")
    tunnel = subprocess.Popen(
        ["ssh", "-N", "-L", f"{TUNNEL_PORT}:127.0.0.1:5432", "erik"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    try:
        # Poll instead of sleeping a fixed time: returns as soon as the port
        # is usable and reports a dead ssh process explicitly.
        _wait_for_tunnel(tunnel, TUNNEL_PORT)

        import psycopg2  # type: ignore

        # NOTE(security): the fallback keeps the credential that was already
        # hard-coded here so existing setups keep working; prefer setting
        # LLM_GATEWAY_DB_PASSWORD and rotating the committed value.
        conn = psycopg2.connect(
            host="127.0.0.1",
            port=TUNNEL_PORT,
            user="llm",
            password=os.environ.get("LLM_GATEWAY_DB_PASSWORD", "llm_secure_2026"),
            dbname="llm_gateway",
            connect_timeout=10,
        )
        try:
            with conn.cursor() as cur:
                cur.execute("""
                    SELECT system_prompt, input_text, output_text, quality_score
                    FROM learning_corpus
                    WHERE task_type = 'fo-blog-v1'
                      AND status = 'approved'
                      AND output_text IS NOT NULL
                      AND LENGTH(output_text) > 500
                    ORDER BY quality_score DESC, created_at DESC
                """)
                rows = cur.fetchall()
        finally:
            # Close the connection even when the query fails.
            conn.close()

        print(f"  Fetched {len(rows)} examples from Erik DB")

        # Deduplicate by output prefix (avoid near-duplicates). Rows arrive
        # best-score first, so the highest-rated variant is the one kept.
        seen = set()
        data = []
        for sys_prompt, inp, out, score in rows:
            key = out[:100].strip()
            if key in seen:
                continue
            seen.add(key)
            data.append({
                "system_prompt": sys_prompt or "",
                "input_text": inp or "",
                "output_text": out,
                "quality_score": float(score) if score is not None else 9.0,
            })

        print(f"  After dedup: {len(data)} unique examples")

        if data:
            # Write to a temp file and rename so an interrupted run never
            # leaves a truncated cache that later runs would trust.
            DATA_CACHE.parent.mkdir(parents=True, exist_ok=True)
            tmp_path = DATA_CACHE.with_name(DATA_CACHE.name + ".tmp")
            with open(tmp_path, "w", encoding="utf-8") as f:
                f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in data)
            tmp_path.replace(DATA_CACHE)
            print(f"  Cached to {DATA_CACHE}")
        else:
            # An empty cache file would be reused silently on every later run.
            print("  WARNING: query returned no usable rows; not writing an empty cache")
        return data

    finally:
        tunnel.terminate()
        tunnel.wait()


# ──────────────────────────────────────────────
# 2b. Load prose-only examples from mega dataset
# ──────────────────────────────────────────────
def load_mega_prose(path: "Path | None" = None) -> list[dict]:
    """Load the prose-only examples from the v5 mega dataset (JSONL).

    An example is kept when its ``output_text`` is a non-empty string that
    contains no markdown header line (``#`` .. ``######`` followed by a space
    or tab at the start of any line, or a leading ``#``) and at most five
    ``- `` bullet lines. Records without a usable ``output_text`` are dropped,
    because ``main`` indexes that key directly.

    Args:
        path: JSONL file to read. Defaults to ``MEGA_DATASET``.

    Returns:
        The kept example dicts, in file order; an empty list when the file
        does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    import re  # local import: not part of the module-level imports

    source = MEGA_DATASET if path is None else Path(path)
    if not source.exists():
        print(f"  Mega dataset not found at {source} — skipping")
        return []

    with open(source, encoding="utf-8") as f:
        mega = [json.loads(line) for line in f if line.strip()]

    # Line-anchored so that "C# " or "port # 5" inside a sentence is not
    # mistaken for a header, while headers anywhere in the text are caught.
    header_re = re.compile(r"^#{1,6}[ \t]", re.MULTILINE)

    def is_prose(out: object) -> bool:
        if not isinstance(out, str) or not out.strip():
            return False  # missing / null / empty output is unusable
        if out.startswith("#") or header_re.search(out):
            return False
        # allow a few bullets but not bullet-heavy
        return out.count("\n- ") <= 5

    prose = [
        ex for ex in mega
        if isinstance(ex, dict) and is_prose(ex.get("output_text"))
    ]
    print(f"  Mega dataset: {len(mega)} total → {len(prose)} prose-only kept")
    return prose


# ──────────────────────────────────────────────
# 3. Format for SFT training (ChatML)
# ──────────────────────────────────────────────
def format_for_training(examples: list[dict]) -> list[dict]:
    """Wrap each corpus example in the chat ``messages`` layout used for SFT.

    Every input dict must provide ``system_prompt``, ``input_text`` and
    ``output_text``; they become the system, user and assistant turns (in
    that order) of one ``{"messages": [...]}`` record per example.
    """
    # (role, source key) pairs in the order the turns appear in a conversation.
    turn_layout = (
        ("system", "system_prompt"),
        ("user", "input_text"),
        ("assistant", "output_text"),
    )
    return [
        {
            "messages": [
                {"role": role, "content": example[field]}
                for role, field in turn_layout
            ]
        }
        for example in examples
    ]


# ──────────────────────────────────────────────
# 4. Train
# ──────────────────────────────────────────────
def train(data: list[dict]) -> Path:
    """Fine-tune the base model with a LoRA adapter and save the merged weights.

    Steps: load tokenizer and base model named by ``cfg["models"]["qwen_7b_hf"]``,
    attach a LoRA adapter configured from ``training_cfg``, run ``SFTTrainer``
    on ``data`` (converted with ``format_for_training``), then merge the
    adapter into the base model and save model + tokenizer.

    Args:
        data: Corpus examples with ``system_prompt``, ``input_text`` and
            ``output_text`` keys.

    Returns:
        Directory containing the merged model and tokenizer.
    """
    import torch  # type: ignore
    from datasets import Dataset  # type: ignore
    from peft import LoraConfig, TaskType, get_peft_model  # type: ignore
    from transformers import AutoModelForCausalLM, AutoTokenizer  # type: ignore
    from trl import SFTConfig, SFTTrainer  # type: ignore

    base_model_id = cfg["models"]["qwen_7b_hf"]
    target_device = training_cfg["device"]
    on_mps = target_device == "mps"

    print(f"\n  Loading base model: {base_model_id}")
    tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
    # float16 weights on MPS, bfloat16 on every other device.
    weights_dtype = torch.float16 if on_mps else torch.bfloat16
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        torch_dtype=weights_dtype,
        device_map=target_device,
        trust_remote_code=True,
    )

    # Conservative adapter settings (all taken from the config) to limit overfitting.
    adapter_settings = LoraConfig(
        r=training_cfg["lora_r"],
        lora_alpha=training_cfg["lora_alpha"],
        lora_dropout=training_cfg["lora_dropout"],
        target_modules=training_cfg["target_modules"],
        task_type=TaskType.CAUSAL_LM,
        bias="none",
    )
    peft_model = get_peft_model(base_model, adapter_settings)
    peft_model.print_trainable_parameters()

    # Chat-formatted training set.
    train_set = Dataset.from_list(format_for_training(data))
    print(f"\n  Dataset: {len(train_set)} examples")

    # Checkpoints produced during training land here.
    checkpoint_dir = REPO_ROOT / output_cfg["adapters_dir"] / MODEL_NAME
    checkpoint_dir.mkdir(parents=True, exist_ok=True)

    trainer_args = SFTConfig(
        output_dir=str(checkpoint_dir),
        num_train_epochs=sft_cfg["num_epochs"],
        per_device_train_batch_size=sft_cfg["batch_size"],
        gradient_accumulation_steps=sft_cfg["gradient_accumulation"],
        learning_rate=sft_cfg["learning_rate"],
        warmup_ratio=sft_cfg["warmup_ratio"],
        logging_steps=10,
        save_steps=50,
        save_total_limit=2,
        max_length=training_cfg["max_seq_length"],
        fp16=not on_mps,  # fp16 mixed precision everywhere except MPS
        bf16=False,
        dataloader_pin_memory=False,  # Required for MPS
        report_to="none",
    )

    sft_trainer = SFTTrainer(
        model=peft_model,
        args=trainer_args,
        train_dataset=train_set,
        processing_class=tokenizer,
    )

    print(f"\n  Training fo-blog-v6: {sft_cfg['num_epochs']} epochs, "
          f"lr={sft_cfg['learning_rate']}, LoRA r={training_cfg['lora_r']}")
    sft_trainer.train()

    # Fold the adapter into the base weights and save the result.
    print("\n  Merging LoRA adapter into base model...")
    merged_path = REPO_ROOT / output_cfg["models_dir"] / MODEL_NAME / "merged"
    merged_path.mkdir(parents=True, exist_ok=True)
    standalone_model = peft_model.merge_and_unload()
    standalone_model.save_pretrained(str(merged_path))
    tokenizer.save_pretrained(str(merged_path))
    print(f"  Merged model saved to: {merged_path}")
    return merged_path


# ──────────────────────────────────────────────
# 5. Convert to GGUF + quantize
# ──────────────────────────────────────────────
def convert_to_gguf(merged_dir: Path) -> Path:
    """Turn a merged HF model directory into a quantized GGUF file.

    Runs the llama.cpp convert script (f16 output) followed by the quantize
    binary, both taken from ``llama_cfg``, then deletes the intermediate f16
    file. Either subprocess failing raises ``subprocess.CalledProcessError``.

    Args:
        merged_dir: Directory holding the merged HF model.

    Returns:
        Path of the quantized ``<MODEL_NAME>.gguf`` file.
    """
    target_dir = REPO_ROOT / output_cfg["models_dir"] / MODEL_NAME
    f16_path = target_dir / f"{MODEL_NAME}-f16.gguf"
    quantized_path = target_dir / f"{MODEL_NAME}.gguf"
    quant_type = llama_cfg["default_quantization"]

    print("\n  Converting to GGUF (fp16)...")
    convert_cmd = [
        llama_cfg["python_bin"],
        llama_cfg["convert_script"],
        str(merged_dir),
        "--outfile", str(f16_path),
        "--outtype", "f16",
    ]
    subprocess.run(convert_cmd, check=True)

    print(f"  Quantizing to {quant_type}...")
    quantize_cmd = [
        llama_cfg["quantize_binary"],
        str(f16_path),
        str(quantized_path),
        quant_type,
    ]
    subprocess.run(quantize_cmd, check=True)

    # The f16 intermediate is large and no longer needed once quantized.
    f16_path.unlink(missing_ok=True)
    size_gb = quantized_path.stat().st_size / 1e9
    print(f"  GGUF: {quantized_path} ({size_gb:.2f} GB)")
    return quantized_path


# ──────────────────────────────────────────────
# 6. Register in Ollama via Modelfile
# ──────────────────────────────────────────────
# Ollama Modelfile, rendered by register_in_ollama() with str.format().
# `{gguf_path}` is the only placeholder; every quadrupled brace collapses to
# the literal `{{ ... }}` delimiters of the prompt template after formatting.
# The TEMPLATE section wraps system/user/assistant turns in
# <|im_start|>ROLE ... <|im_end|> markers; the PARAMETER lines set the default
# sampling options for the registered model.
MODELFILE_TEMPLATE = """FROM {gguf_path}
TEMPLATE \"\"\"{{{{- if .Messages }}}}
{{{{- if or .System .Tools }}}}<|im_start|>system
{{{{- if .System }}}}
{{{{ .System }}}}
{{{{- end }}}}<|im_end|>
{{{{ end }}}}
{{{{- range $i, $_ := .Messages }}}}
{{{{- $last := eq (len (slice $.Messages $i)) 1 -}}}}
{{{{- if eq .Role "user" }}}}<|im_start|>user
{{{{ .Content }}}}<|im_end|>
{{{{ else if eq .Role "assistant" }}}}<|im_start|>assistant
{{{{ if .Content }}}}{{{{ .Content }}}}{{{{- end }}}}{{{{ if not $last }}}}<|im_end|>
{{{{ end }}}}
{{{{- end }}}}
{{{{- if and (ne .Role "assistant") $last }}}}<|im_start|>assistant
{{{{ end }}}}
{{{{- end }}}}
{{{{- else }}}}
{{{{- if .System }}}}<|im_start|>system
{{{{ .System }}}}<|im_end|>
{{{{ end }}}}{{{{ if .Prompt }}}}<|im_start|>user
{{{{ .Prompt }}}}<|im_end|>
{{{{ end }}}}<|im_start|>assistant
{{{{ end }}}}{{{{ .Response }}}}{{{{ if .Response }}}}<|im_end|>{{{{ end }}}}\"\"\"
PARAMETER num_predict 4096
PARAMETER temperature 0.72
PARAMETER top_k 40
PARAMETER top_p 0.9
PARAMETER repeat_penalty 1.1
"""


def register_in_ollama(gguf_path: Path) -> None:
    """Write a Modelfile next to the GGUF and register the model with Ollama.

    Renders ``MODELFILE_TEMPLATE`` with the absolute GGUF path, runs
    ``ollama create``, and exits the process with status 1 if that command
    fails. Afterwards ``ollama list`` is consulted purely as a sanity check;
    a missing entry only produces a warning.

    Args:
        gguf_path: Quantized GGUF file to register.
    """
    modelfile = gguf_path.parent / "Modelfile"
    modelfile.write_text(MODELFILE_TEMPLATE.format(gguf_path=gguf_path.resolve()))
    print(f"\n  Modelfile: {modelfile}")

    print("  Registering fo-blog-v6 in Ollama...")
    create_cmd = ["ollama", "create", MODEL_NAME, "-f", str(modelfile)]
    created = subprocess.run(create_cmd, capture_output=True, text=True)
    if created.returncode != 0:
        print(f"  ERROR: {created.stderr}")
        sys.exit(1)
    print("  ✅ fo-blog-v6 registered in Ollama")

    # Sanity check: the new model name should show up in the listing.
    listing = subprocess.run(["ollama", "list"], capture_output=True, text=True)
    verdict = (
        "  ✅ Verified: fo-blog-v6 visible in `ollama list`"
        if MODEL_NAME in listing.stdout
        else "  ⚠ Warning: fo-blog-v6 not found in `ollama list` output"
    )
    print(verdict)


# ──────────────────────────────────────────────
# MAIN
# ──────────────────────────────────────────────
def main() -> None:
    """Run the full pipeline: gather data, train, convert to GGUF, register.

    The DB corpus and the filtered mega-dataset prose are concatenated (DB
    first) and de-duplicated by the first 80 characters of ``output_text``
    before training.

    Raises:
        SystemExit: With status 1 when no training examples are available, or
            when Ollama registration fails.
    """
    print("=" * 60)
    print("  fo-blog-v6 Training Pipeline")
    print("=" * 60)
    print(f"  Config: {CONFIG_FILE}")
    print(f"  Epochs: {sft_cfg['num_epochs']} (was 15 in v5)")
    print(f"  LR:     {sft_cfg['learning_rate']} (was 2e-4 in v5)")
    print(f"  LoRA r: {training_cfg['lora_r']} (was 32 in v5)")
    print()

    print("[1/4] Fetching training data...")
    db_data = fetch_training_data()
    mega_data = load_mega_prose()

    # Merge: DB examples first (highest quality), then filtered mega prose.
    # Deduplicate by output prefix across both sources; because DB rows come
    # first, a DB example always wins over a mega-dataset duplicate.
    # NOTE(review): the prefix here is 80 chars while fetch_training_data()
    # de-duplicates DB rows on 100 chars; confirm the difference is intended.
    seen: set[str] = set()
    data: list[dict] = []
    for ex in db_data + mega_data:
        key = ex["output_text"][:80].strip()
        if key not in seen:
            seen.add(key)
            data.append(ex)

    # Without this guard the average below divides by zero and, worse, a
    # training run would be started on an empty dataset.
    if not data:
        print("  ERROR: no training examples available (DB/cache and mega dataset are both empty)")
        sys.exit(1)

    avg_len = sum(len(ex["output_text"]) for ex in data) // len(data)
    print(f"  Combined: {len(data)} unique examples ({len(db_data)} DB + {len(mega_data)} mega-prose)")
    print(f"  Avg output: {avg_len} chars")

    print("\n[2/4] Training LoRA adapter...")
    merged_dir = train(data)

    print("\n[3/4] Converting to GGUF...")
    gguf_path = convert_to_gguf(merged_dir)

    print("\n[4/4] Registering in Ollama...")
    register_in_ollama(gguf_path)

    print("\n" + "=" * 60)
    print("  fo-blog-v6 training complete!")
    print(f"  GGUF: {gguf_path}")
    print()
    print("  Next steps:")
    print("  1. Update ecosystem.config.js: OLLAMA_LLM_MODEL=fo-blog-v6")
    print("  2. Test: ollama run fo-blog-v6 'Write a blog intro about 400G'")
    print("  3. If output is clean: pm2 restart ecosystem.config.js --update-env")
    print("=" * 60)


# Start the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()