#!/usr/bin/env python3
"""
fo-blog-v6 Training Script
===========================
Fixes all root causes of fo-blog-v5 mode collapse:

ROOT CAUSES OF v5 FAILURE:
  1. System-prompt mismatch: trained with 1-line prompt, inference sends 2000-char prompt
  2. Wrong training data: 15 JSON stub examples instead of real blog articles
  3. Over-training: 15 epochs × 15 examples = catastrophic overfitting
  4. Too aggressive LoRA: r=32, α=64 → memorized training set

FIXES IN v6:
  1. Uses EXACT system_prompt from LLM gateway DB (same prompt used to generate articles)
  2. Loads 280 real blog articles from llm_gateway.learning_corpus (avg 7182 chars)
  3. 3 epochs only (was 15) — standard SFT
  4. Conservative LoRA: r=16, α=32, dropout=0.1
  5. Lower learning rate: 5e-5 (was 2e-4)
  6. Q4_K_M quantization (was Q5_K_M — same quality, 15% smaller)

Note: the hyper-parameters listed above are read from the YAML config at
runtime; the numbers here describe the intended v6 configuration.

Usage:
  cd "/Users/renefichtmueller/Desktop/Claude Code/llm-gateway/packages/fine-tuner"
  python3 scripts/train_blog_v6.py

Requires:
  - SSH access to Erik (82.165.222.127) with port forwarding
  - Ollama running on Mac Studio (:11434)
  - HuggingFace model cache (Qwen/Qwen2.5-7B-Instruct)
  - pip: transformers trl peft torch datasets psycopg2-binary pyyaml
"""

import json
import os
import subprocess
import sys
import time
from pathlib import Path

import yaml

# Repository layout: this script lives one directory below the package root.
REPO_ROOT = Path(__file__).parent.parent
CONFIG_FILE = REPO_ROOT / "config" / "fo-blog-v6.yaml"
DATA_CACHE = REPO_ROOT / "data" / "fo-blog-v6-training.jsonl"
MODEL_NAME = "fo-blog-v6"
TUNNEL_PORT = 15432  # Local port for SSH tunnel to Erik's PG
MEGA_DATASET = Path("/Users/renefichtmueller/Desktop/BlogLLM-v5-Mega-Training/mega-training-dataset.jsonl")

# ──────────────────────────────────────────────
# 1. Load config
# ──────────────────────────────────────────────
# Read as UTF-8 explicitly so parsing does not depend on the machine's locale.
with open(CONFIG_FILE, encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# Sub-sections of the config that the pipeline steps below read from.
training_cfg = cfg["training"]
llama_cfg = cfg["llama_cpp"]
sft_cfg = training_cfg["sft"]
output_cfg = cfg["output"]
# ──────────────────────────────────────────────
# 2. Fetch training data from Erik via SSH tunnel
# ──────────────────────────────────────────────

def _wait_for_tunnel(tunnel: subprocess.Popen, port: int, timeout: float = 15.0) -> None:
    """Block until the local end of the SSH tunnel accepts TCP connections.

    Raises:
        RuntimeError: if the ssh process exits before the port opens, or the
            port is still closed after ``timeout`` seconds.
    """
    import socket

    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        # ssh exiting this early means the tunnel can never come up.
        if tunnel.poll() is not None:
            raise RuntimeError(
                f"SSH tunnel exited early with code {tunnel.returncode}"
            )
        try:
            with socket.create_connection(("127.0.0.1", port), timeout=1):
                return
        except OSError:
            time.sleep(0.25)
    raise RuntimeError(
        f"SSH tunnel on local port {port} not ready after {timeout:.0f}s"
    )


def fetch_training_data() -> list[dict]:
    """Fetch fo-blog-v1 corpus from LLM gateway DB on Erik via SSH tunnel.

    If ``DATA_CACHE`` exists it is read and returned without touching the
    network. Otherwise an ``ssh -L`` tunnel is opened, approved rows with more
    than 500 characters of output are selected from ``learning_corpus``,
    near-duplicates (same first 100 output characters) are dropped, and the
    result is written to ``DATA_CACHE`` as JSON Lines.

    The DB password can be overridden with the ``LLM_GATEWAY_DB_PASSWORD``
    environment variable; the previous hard-coded value remains the default.

    Returns:
        A list of dicts with keys ``system_prompt``, ``input_text``,
        ``output_text`` and ``quality_score``.

    Raises:
        RuntimeError: if the SSH tunnel cannot be established.
    """
    if DATA_CACHE.exists():
        print(f" Using cached training data: {DATA_CACHE}")
        # The cache is written with ensure_ascii=False, so it must be read as UTF-8.
        with open(DATA_CACHE, encoding="utf-8") as f:
            data = [json.loads(line) for line in f if line.strip()]
        print(f" Loaded {len(data)} examples from cache")
        return data

    print(" Opening SSH tunnel to Erik (82.165.222.127:5432)...")
    tunnel = subprocess.Popen(
        ["ssh", "-N", "-L", f"{TUNNEL_PORT}:127.0.0.1:5432", "erik"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    try:
        # Poll instead of sleeping a fixed time: faster when the tunnel is
        # quick, and a clear error when it never comes up.
        _wait_for_tunnel(tunnel, TUNNEL_PORT)

        import psycopg2  # type: ignore

        conn = psycopg2.connect(
            host="127.0.0.1",
            port=TUNNEL_PORT,
            user="llm",
            # NOTE(review): a credential is committed in source; prefer setting
            # LLM_GATEWAY_DB_PASSWORD and rotating this default.
            password=os.environ.get("LLM_GATEWAY_DB_PASSWORD", "llm_secure_2026"),
            dbname="llm_gateway",
            connect_timeout=10,
        )
        try:
            cur = conn.cursor()
            cur.execute("""
                SELECT system_prompt, input_text, output_text, quality_score
                FROM learning_corpus
                WHERE task_type = 'fo-blog-v1'
                  AND status = 'approved'
                  AND output_text IS NOT NULL
                  AND LENGTH(output_text) > 500
                ORDER BY quality_score DESC, created_at DESC
            """)
            rows = cur.fetchall()
        finally:
            # Close even when the query fails so the connection is not leaked.
            conn.close()
        print(f" Fetched {len(rows)} examples from Erik DB")

        # Deduplicate by output prefix (avoid near-duplicates)
        seen = set()
        data = []
        for sys_prompt, inp, out, score in rows:
            key = out[:100].strip()
            if key in seen:
                continue
            seen.add(key)
            data.append({
                "system_prompt": sys_prompt or "",
                "input_text": inp or "",
                "output_text": out,
                "quality_score": float(score) if score is not None else 9.0,
            })
        print(f" After dedup: {len(data)} unique examples")

        # Cache to disk. Write to a temp file and rename so an interrupted run
        # never leaves a truncated cache that later runs would trust.
        DATA_CACHE.parent.mkdir(parents=True, exist_ok=True)
        tmp_cache = DATA_CACHE.with_name(DATA_CACHE.name + ".tmp")
        with open(tmp_cache, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in data)
        tmp_cache.replace(DATA_CACHE)
        print(f" Cached to {DATA_CACHE}")
        return data
    finally:
        tunnel.terminate()
        tunnel.wait()
# ──────────────────────────────────────────────
# 2b. Load prose-only examples from mega dataset
# ──────────────────────────────────────────────

def load_mega_prose(path: "Path | None" = None) -> list[dict]:
    """
    Extract prose-only examples from the v5 mega dataset.

    Filters out any example with markdown headers (##, #) or heavy bullet lists.
    Records that are not JSON objects, or that have no non-empty string
    ``output_text``, are dropped as well: they carry no training target and
    would fail later when the output text is read.

    Args:
        path: JSON Lines file to read. Defaults to ``MEGA_DATASET``.

    Returns:
        The records that passed the filter, in file order. An empty list when
        the file does not exist.
    """
    dataset_path = MEGA_DATASET if path is None else path
    if not dataset_path.exists():
        print(f" Mega dataset not found at {dataset_path} — skipping")
        return []

    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with open(dataset_path, encoding="utf-8") as f:
        mega = [json.loads(line) for line in f if line.strip()]

    def is_prose(out: str) -> bool:
        first = out[:500]
        # Any "# " in the opening 500 chars, or a leading "#", counts as a header.
        if "# " in first or first.startswith("#"):
            return False
        if "\n## " in out or "\n### " in out:
            return False
        if out.count("\n- ") > 5:  # allow a few bullets but not bullet-heavy
            return False
        return True

    def has_output(ex: object) -> bool:
        if not isinstance(ex, dict):
            return False
        out = ex.get("output_text")
        return isinstance(out, str) and bool(out.strip())

    prose = [ex for ex in mega if has_output(ex) and is_prose(ex["output_text"])]
    print(f" Mega dataset: {len(mega)} total → {len(prose)} prose-only kept")
    return prose


# ──────────────────────────────────────────────
# 3. Format for SFT training (ChatML)
# ──────────────────────────────────────────────

def format_for_training(examples: list[dict]) -> list[dict]:
    """Convert corpus examples to ChatML messages format for SFTTrainer.

    Each example must provide ``system_prompt``, ``input_text`` and
    ``output_text``; they become the system, user and assistant turns of a
    single ``{"messages": [...]}`` record. Other keys are ignored.

    Raises:
        KeyError: if an example lacks one of the three required keys.
    """
    return [
        {
            "messages": [
                {"role": "system", "content": ex["system_prompt"]},
                {"role": "user", "content": ex["input_text"]},
                {"role": "assistant", "content": ex["output_text"]},
            ]
        }
        for ex in examples
    ]
# ──────────────────────────────────────────────
# 4. Train
# ──────────────────────────────────────────────

def train(data: list[dict]) -> Path:
    """Run LoRA SFT training and return path to merged model.

    Loads the base model named by ``cfg["models"]["qwen_7b_hf"]``, wraps it in
    a LoRA adapter configured from ``training_cfg``, trains with TRL's
    ``SFTTrainer`` on ``data`` (converted by ``format_for_training``), then
    merges the adapter into the base weights and saves model plus tokenizer.

    Args:
        data: Examples with ``system_prompt``, ``input_text`` and
            ``output_text`` keys.

    Returns:
        Directory containing the merged Hugging Face model.

    Raises:
        ValueError: if ``data`` is empty.
    """
    # Fail before the multi-GB model load rather than deep inside the trainer.
    if not data:
        raise ValueError("train() received no training examples")

    import torch  # type: ignore
    from datasets import Dataset  # type: ignore
    from peft import LoraConfig, TaskType, get_peft_model  # type: ignore
    from transformers import AutoModelForCausalLM, AutoTokenizer  # type: ignore
    from trl import SFTConfig, SFTTrainer  # type: ignore

    model_id = cfg["models"]["qwen_7b_hf"]
    device = training_cfg["device"]
    on_mps = device == "mps"

    print(f"\n Loading base model: {model_id}")
    # NOTE(review): trust_remote_code executes code shipped with the model
    # repo — confirm it is still required for this checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16 if on_mps else torch.bfloat16,
        device_map=device,
        trust_remote_code=True,
    )

    # LoRA config — conservative to prevent overfitting
    lora_config = LoraConfig(
        r=training_cfg["lora_r"],
        lora_alpha=training_cfg["lora_alpha"],
        lora_dropout=training_cfg["lora_dropout"],
        target_modules=training_cfg["target_modules"],
        task_type=TaskType.CAUSAL_LM,
        bias="none",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # Dataset
    formatted = format_for_training(data)
    dataset = Dataset.from_list(formatted)
    print(f"\n Dataset: {len(dataset)} examples")

    # Output dir
    adapter_dir = REPO_ROOT / output_cfg["adapters_dir"] / MODEL_NAME
    adapter_dir.mkdir(parents=True, exist_ok=True)

    # SFT config
    sft_config = SFTConfig(
        output_dir=str(adapter_dir),
        num_train_epochs=sft_cfg["num_epochs"],
        per_device_train_batch_size=sft_cfg["batch_size"],
        gradient_accumulation_steps=sft_cfg["gradient_accumulation"],
        learning_rate=sft_cfg["learning_rate"],
        warmup_ratio=sft_cfg["warmup_ratio"],
        logging_steps=10,
        save_steps=50,
        save_total_limit=2,
        max_length=training_cfg["max_seq_length"],
        # Keep mixed precision in the dtype the weights were loaded in: off
        # MPS the model is bfloat16, so use bf16 AMP. Previously fp16 AMP was
        # paired with bfloat16 weights.
        fp16=False,
        bf16=not on_mps,
        dataloader_pin_memory=False,  # Required for MPS
        report_to="none",
    )

    trainer = SFTTrainer(
        model=model,
        args=sft_config,
        train_dataset=dataset,
        processing_class=tokenizer,
    )

    print(f"\n Training fo-blog-v6: {sft_cfg['num_epochs']} epochs, "
          f"lr={sft_cfg['learning_rate']}, LoRA r={training_cfg['lora_r']}")
    trainer.train()

    # Merge LoRA into base model
    print("\n Merging LoRA adapter into base model...")
    merged_dir = REPO_ROOT / output_cfg["models_dir"] / MODEL_NAME / "merged"
    merged_dir.mkdir(parents=True, exist_ok=True)

    merged_model = model.merge_and_unload()
    merged_model.save_pretrained(str(merged_dir))
    tokenizer.save_pretrained(str(merged_dir))
    print(f" Merged model saved to: {merged_dir}")
    return merged_dir


# ──────────────────────────────────────────────
# 5. Convert to GGUF + quantize
# ──────────────────────────────────────────────

def convert_to_gguf(merged_dir: Path) -> Path:
    """Convert merged HF model to quantized GGUF for Ollama.

    Runs the llama.cpp convert script to produce an f16 GGUF, quantizes it
    with the configured quantize binary and type, then deletes the f16
    intermediate.

    Args:
        merged_dir: Directory holding the merged Hugging Face model.

    Returns:
        Path of the quantized ``.gguf`` file.

    Raises:
        subprocess.CalledProcessError: if conversion or quantization fails.
    """
    gguf_dir = REPO_ROOT / output_cfg["models_dir"] / MODEL_NAME
    # Do not rely on train() having created this directory as a side effect.
    gguf_dir.mkdir(parents=True, exist_ok=True)
    gguf_fp16 = gguf_dir / f"{MODEL_NAME}-f16.gguf"
    gguf_quantized = gguf_dir / f"{MODEL_NAME}.gguf"
    quantization = llama_cfg["default_quantization"]

    print("\n Converting to GGUF (fp16)...")
    subprocess.run(
        [
            llama_cfg["python_bin"],
            llama_cfg["convert_script"],
            str(merged_dir),
            "--outfile", str(gguf_fp16),
            "--outtype", "f16",
        ],
        check=True,
    )

    print(f" Quantizing to {quantization}...")
    subprocess.run(
        [
            llama_cfg["quantize_binary"],
            str(gguf_fp16),
            str(gguf_quantized),
            quantization,
        ],
        check=True,
    )

    # Cleanup fp16 (large). Only reached on success, so a failed quantization
    # leaves the intermediate in place.
    gguf_fp16.unlink(missing_ok=True)

    print(f" GGUF: {gguf_quantized} ({gguf_quantized.stat().st_size / 1e9:.2f} GB)")
    return gguf_quantized
# ──────────────────────────────────────────────
# 6. Register in Ollama via Modelfile
# ──────────────────────────────────────────────

# Ollama Modelfile rendered with str.format(): ``{gguf_path}`` is the only
# substitution; every Go-template brace pair is doubled (``{{{{`` -> ``{{``).
# Line breaks are significant: ChatML expects a newline after each
# ``<|im_start|>role`` tag and after each ``<|im_end|>``.
MODELFILE_TEMPLATE = """FROM {gguf_path}
TEMPLATE \"\"\"{{{{- if .Messages }}}}
{{{{- if or .System .Tools }}}}<|im_start|>system
{{{{- if .System }}}}
{{{{ .System }}}}
{{{{- end }}}}<|im_end|>
{{{{ end }}}}
{{{{- range $i, $_ := .Messages }}}}
{{{{- $last := eq (len (slice $.Messages $i)) 1 -}}}}
{{{{- if eq .Role "user" }}}}<|im_start|>user
{{{{ .Content }}}}<|im_end|>
{{{{ else if eq .Role "assistant" }}}}<|im_start|>assistant
{{{{ if .Content }}}}{{{{ .Content }}}}{{{{- end }}}}{{{{ if not $last }}}}<|im_end|>
{{{{ end }}}}
{{{{- end }}}}
{{{{- if and (ne .Role "assistant") $last }}}}<|im_start|>assistant
{{{{ end }}}}
{{{{- end }}}}
{{{{- else }}}}
{{{{- if .System }}}}<|im_start|>system
{{{{ .System }}}}<|im_end|>
{{{{ end }}}}{{{{ if .Prompt }}}}<|im_start|>user
{{{{ .Prompt }}}}<|im_end|>
{{{{ end }}}}<|im_start|>assistant
{{{{ end }}}}{{{{ .Response }}}}{{{{ if .Response }}}}<|im_end|>{{{{ end }}}}\"\"\"
PARAMETER num_predict 4096
PARAMETER temperature 0.72
PARAMETER top_k 40
PARAMETER top_p 0.9
PARAMETER repeat_penalty 1.1
"""


def register_in_ollama(gguf_path: Path) -> None:
    """Create Modelfile and register fo-blog-v6 in Ollama.

    Writes ``Modelfile`` next to ``gguf_path`` (rendered from
    ``MODELFILE_TEMPLATE`` with the resolved GGUF path), runs
    ``ollama create MODEL_NAME -f Modelfile`` and then checks that the model
    name appears in ``ollama list``.

    Exits the process with status 1 if the ``ollama`` executable is missing
    or ``ollama create`` fails. A model missing from ``ollama list`` only
    produces a warning.
    """
    modelfile_path = gguf_path.parent / "Modelfile"
    modelfile_content = MODELFILE_TEMPLATE.format(gguf_path=gguf_path.resolve())
    # Explicit UTF-8 so the file content does not depend on the locale.
    modelfile_path.write_text(modelfile_content, encoding="utf-8")
    print(f"\n Modelfile: {modelfile_path}")

    print(" Registering fo-blog-v6 in Ollama...")
    try:
        result = subprocess.run(
            ["ollama", "create", MODEL_NAME, "-f", str(modelfile_path)],
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        # Report a missing binary the same way as a failed create, not as a traceback.
        print(" ERROR: `ollama` executable not found on PATH")
        sys.exit(1)
    if result.returncode != 0:
        print(f" ERROR: {result.stderr}")
        sys.exit(1)
    print(" ✅ fo-blog-v6 registered in Ollama")

    # Verify
    listing = subprocess.run(
        ["ollama", "list"],
        capture_output=True,
        text=True,
    )
    if MODEL_NAME in listing.stdout:
        print(" ✅ Verified: fo-blog-v6 visible in `ollama list`")
    else:
        print(" ⚠ Warning: fo-blog-v6 not found in `ollama list` output")
# ──────────────────────────────────────────────
# MAIN
# ──────────────────────────────────────────────

def main() -> None:
    """Run the full pipeline: fetch data, train, convert to GGUF, register.

    Combines the DB corpus with the filtered mega-dataset prose, drops
    examples whose first 80 output characters were already seen, then runs
    ``train``, ``convert_to_gguf`` and ``register_in_ollama`` in order.

    Exits with status 1 if no usable training example is available.
    """
    print("=" * 60)
    print(" fo-blog-v6 Training Pipeline")
    print("=" * 60)
    print(f" Config: {CONFIG_FILE}")
    print(f" Epochs: {sft_cfg['num_epochs']} (was 15 in v5)")
    print(f" LR: {sft_cfg['learning_rate']} (was 2e-4 in v5)")
    print(f" LoRA r: {training_cfg['lora_r']} (was 32 in v5)")
    print()

    print("[1/4] Fetching training data...")
    db_data = fetch_training_data()
    mega_data = load_mega_prose()

    # Merge: DB examples first (highest quality), then filtered mega prose
    # Deduplicate by output prefix across both sources
    seen: set[str] = set()
    data: list[dict] = []
    for ex in db_data + mega_data:
        output = ex.get("output_text")
        # A record without target text cannot be trained on; skip it instead
        # of failing with KeyError.
        if not output:
            continue
        key = output[:80].strip()
        if key not in seen:
            seen.add(key)
            data.append(ex)

    # Guard the average below (division by zero) and stop before the
    # expensive model load when there is nothing to train on.
    if not data:
        print(" ERROR: no training examples available from DB or mega dataset")
        sys.exit(1)

    avg_len = sum(len(ex["output_text"]) for ex in data) // len(data)
    print(f" Combined: {len(data)} unique examples ({len(db_data)} DB + {len(mega_data)} mega-prose)")
    print(f" Avg output: {avg_len} chars")

    print("\n[2/4] Training LoRA adapter...")
    merged_dir = train(data)

    print("\n[3/4] Converting to GGUF...")
    gguf_path = convert_to_gguf(merged_dir)

    print("\n[4/4] Registering in Ollama...")
    register_in_ollama(gguf_path)

    print("\n" + "=" * 60)
    print(" fo-blog-v6 training complete!")
    print(f" GGUF: {gguf_path}")
    print()
    print(" Next steps:")
    print(" 1. Update ecosystem.config.js: OLLAMA_LLM_MODEL=fo-blog-v6")
    print(" 2. Test: ollama run fo-blog-v6 'Write a blog intro about 400G'")
    print(" 3. If output is clean: pm2 restart ecosystem.config.js --update-env")
    print("=" * 60)


if __name__ == "__main__":
    main()