Rene Fichtmueller 2ca77d0aee feat: Phase 2F — Multi-Agent Integration (ADRs + Client Fallback + Tests)
- ADR-0001: Multi-Agent Coworking Architecture with LLM Gateway Orchestrator
- ADR-0002: Tier Assignment Strategy for Model Selection (cost-first escalation)
- ADR-0003: Confidence Gate Thresholds & Learning Cycle Intervals (6h/12h/24h cycles)
- ADR-0004: External Provider Fallback Chain Ordering (Cerebras → Groq → Mistral)
- Enhanced client SDK: Offline Ollama fallback, health checks, exponential backoff retry
- Integration tests: claude-code-integration.test.ts (14 test cases)
- PHASE_2F_DEPLOYMENT.md: Pre-deployment checklist, automated deploy, rollback plan
- Post-deployment verification procedures for health, client fallback, metrics
2026-04-19 21:39:44 +02:00

418 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
fo-blog-v6 Training Script
===========================
Fixes all root causes of fo-blog-v5 mode collapse:
ROOT CAUSES OF v5 FAILURE:
1. System-prompt mismatch: trained with 1-line prompt, inference sends 2000-char prompt
2. Wrong training data: 15 JSON stub examples instead of real blog articles
3. Over-training: 15 epochs × 15 examples = catastrophic overfitting
4. Too aggressive LoRA: r=32, α=64 → memorized training set
FIXES IN v6:
1. Uses EXACT system_prompt from LLM gateway DB (same prompt used to generate articles)
2. Loads 280 real blog articles from llm_gateway.learning_corpus (avg 7182 chars)
3. 3 epochs only (was 15) — standard SFT
4. Conservative LoRA: r=16, α=32, dropout=0.1
5. Lower learning rate: 5e-5 (was 2e-4)
6. Q4_K_M quantization (was Q5_K_M — same quality, 15% smaller)
Usage:
cd "/Users/renefichtmueller/Desktop/Claude Code/llm-gateway/packages/fine-tuner"
python3 scripts/train_blog_v6.py
Requires:
- SSH access to Erik (82.165.222.127) with port forwarding
- Ollama running on Mac Studio (:11434)
- HuggingFace model cache (Qwen/Qwen2.5-7B-Instruct)
- pip: transformers trl peft torch datasets psycopg2-binary pyyaml
"""
import json
import os
import subprocess
import sys
import time
from pathlib import Path
import yaml
# Paths are anchored at the package root (the parent of the directory that
# holds this script) so the script works regardless of the current directory.
REPO_ROOT = Path(__file__).parent.parent
CONFIG_FILE = REPO_ROOT / "config" / "fo-blog-v6.yaml"
DATA_CACHE = REPO_ROOT / "data" / "fo-blog-v6-training.jsonl"
MODEL_NAME = "fo-blog-v6"
TUNNEL_PORT = 15432  # Local port for SSH tunnel to Erik's PG
MEGA_DATASET = Path("/Users/renefichtmueller/Desktop/BlogLLM-v5-Mega-Training/mega-training-dataset.jsonl")

# ──────────────────────────────────────────────
# 1. Load config
# ──────────────────────────────────────────────
# The YAML is parsed once at import time; the names below are shortcuts into
# it that the later pipeline stages read directly.
# Explicit UTF-8 so parsing does not depend on the locale's default encoding.
with open(CONFIG_FILE, encoding="utf-8") as f:
    cfg = yaml.safe_load(f)
training_cfg = cfg["training"]
llama_cfg = cfg["llama_cpp"]
sft_cfg = training_cfg["sft"]
output_cfg = cfg["output"]
# ──────────────────────────────────────────────
# 2. Fetch training data from Erik via SSH tunnel
# ──────────────────────────────────────────────
def _wait_for_tunnel(tunnel: subprocess.Popen, port: int, timeout: float = 15.0) -> None:
    """Block until the SSH-forwarded local *port* accepts TCP connections.

    Raises:
        RuntimeError: if the ssh process exits before the port is reachable,
            or the port is still unreachable after *timeout* seconds.
    """
    import socket  # local import: only needed on the cache-miss path

    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if tunnel.poll() is not None:
            raise RuntimeError(
                f"SSH tunnel exited early with code {tunnel.returncode}"
            )
        try:
            with socket.create_connection(("127.0.0.1", port), timeout=1):
                return
        except OSError:
            time.sleep(0.25)
    raise RuntimeError(f"SSH tunnel on port {port} not ready after {timeout:.0f}s")


def fetch_training_data() -> list[dict]:
    """Fetch fo-blog-v1 corpus from LLM gateway DB on Erik via SSH tunnel.

    If DATA_CACHE exists it is read and returned without touching the network.
    Otherwise an ``ssh -N -L`` tunnel is opened, approved rows are selected
    from ``learning_corpus``, de-duplicated on the first 100 characters of the
    output, written to DATA_CACHE as JSONL and returned.

    Returns:
        A list of dicts with keys ``system_prompt``, ``input_text``,
        ``output_text`` and ``quality_score``.

    Raises:
        RuntimeError: if the SSH tunnel cannot be established.
    """
    if DATA_CACHE.exists():
        print(f" Using cached training data: {DATA_CACHE}")
        with open(DATA_CACHE, encoding="utf-8") as f:
            data = [json.loads(line) for line in f if line.strip()]
        print(f" Loaded {len(data)} examples from cache")
        return data

    # Imported before spawning ssh so a missing driver fails without leaving
    # a tunnel process to clean up.
    import psycopg2  # type: ignore

    print(" Opening SSH tunnel to Erik (82.165.222.127:5432)...")
    tunnel = subprocess.Popen(
        ["ssh", "-N", "-L", f"{TUNNEL_PORT}:127.0.0.1:5432", "erik"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    try:
        # Poll instead of a fixed sleep: returns as soon as the port is up and
        # reports a dead ssh process instead of a confusing DB connect error.
        _wait_for_tunnel(tunnel, TUNNEL_PORT)

        conn = psycopg2.connect(
            host="127.0.0.1",
            port=TUNNEL_PORT,
            user="llm",
            # NOTE(review): the literal fallback keeps existing setups working,
            # but the credential should live only in the environment.
            password=os.environ.get("LLM_GATEWAY_DB_PASSWORD", "llm_secure_2026"),
            dbname="llm_gateway",
            connect_timeout=10,
        )
        try:
            with conn.cursor() as cur:
                cur.execute("""
                    SELECT system_prompt, input_text, output_text, quality_score
                    FROM learning_corpus
                    WHERE task_type = 'fo-blog-v1'
                    AND status = 'approved'
                    AND output_text IS NOT NULL
                    AND LENGTH(output_text) > 500
                    ORDER BY quality_score DESC, created_at DESC
                """)
                rows = cur.fetchall()
        finally:
            # Close even when the query raises, so the connection never leaks.
            conn.close()
        print(f" Fetched {len(rows)} examples from Erik DB")

        # Deduplicate by output prefix (avoid near-duplicates). Rows arrive
        # best-score-first, so the highest-scored copy of a duplicate is kept.
        seen: set[str] = set()
        data = []
        for sys_prompt, inp, out, score in rows:
            key = out[:100].strip()
            if key in seen:
                continue
            seen.add(key)
            data.append({
                "system_prompt": sys_prompt or "",
                "input_text": inp or "",
                "output_text": out,
                "quality_score": float(score) if score is not None else 9.0,
            })
        print(f" After dedup: {len(data)} unique examples")

        # Cache to disk. Write to a sibling temp file and rename, so an
        # interrupted run cannot leave a truncated cache that later runs trust.
        DATA_CACHE.parent.mkdir(parents=True, exist_ok=True)
        tmp_cache = DATA_CACHE.with_name(DATA_CACHE.name + ".tmp")
        with open(tmp_cache, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in data)
        tmp_cache.replace(DATA_CACHE)
        print(f" Cached to {DATA_CACHE}")
        return data
    finally:
        tunnel.terminate()
        try:
            tunnel.wait(timeout=10)
        except subprocess.TimeoutExpired:
            # ssh ignored SIGTERM; do not hang the pipeline on it.
            tunnel.kill()
            tunnel.wait()
# ──────────────────────────────────────────────
# 2b. Load prose-only examples from mega dataset
# ──────────────────────────────────────────────
def load_mega_prose(dataset_path: "Path | None" = None) -> list[dict]:
    """
    Extract prose-only examples from the v5 mega dataset.

    Filters out any example with markdown headers (##, #) or heavy bullet lists.
    These were crawled from Fearghas/RIR/NANOG sources and are domain-relevant.

    Records whose ``output_text`` is missing, null, not a string, or blank are
    dropped as well: they carry no training signal, and downstream code indexes
    ``ex["output_text"]`` directly.

    Args:
        dataset_path: JSONL file to read. Defaults to MEGA_DATASET.

    Returns:
        The kept records, unchanged, in file order. An empty list if the file
        does not exist.
    """
    source = MEGA_DATASET if dataset_path is None else Path(dataset_path)
    if not source.exists():
        print(f" Mega dataset not found at {source} — skipping")
        return []
    with open(source, encoding="utf-8") as f:
        mega = [json.loads(line) for line in f if line.strip()]

    def is_prose(out: object) -> bool:
        if not isinstance(out, str) or not out.strip():
            return False
        first = out[:500]
        # Any "# " near the top, or a leading "#", is treated as a header.
        if "# " in first or first.startswith("#"):
            return False
        if "\n## " in out or "\n### " in out:
            return False
        # allow a few bullets but not bullet-heavy
        return out.count("\n- ") <= 5

    prose = [
        ex for ex in mega
        if isinstance(ex, dict) and is_prose(ex.get("output_text"))
    ]
    print(f" Mega dataset: {len(mega)} total → {len(prose)} prose-only kept")
    return prose
# ──────────────────────────────────────────────
# 3. Format for SFT training (ChatML)
# ──────────────────────────────────────────────
def format_for_training(examples: list[dict]) -> list[dict]:
    """Wrap each corpus record as a three-turn chat conversation.

    Every input record must provide ``system_prompt``, ``input_text`` and
    ``output_text``; they become the system, user and assistant turns, in that
    order, under a single ``messages`` key. Other keys are ignored.

    Returns:
        One ``{"messages": [...]}`` dict per input record, in input order.
    """
    # (chat role, source field) pairs, in the order the turns are emitted.
    turn_sources = (
        ("system", "system_prompt"),
        ("user", "input_text"),
        ("assistant", "output_text"),
    )
    return [
        {
            "messages": [
                {"role": role, "content": record[field]}
                for role, field in turn_sources
            ]
        }
        for record in examples
    ]
# ──────────────────────────────────────────────
# 4. Train
# ──────────────────────────────────────────────
def train(data: list[dict]) -> Path:
    """Run LoRA SFT training and return path to merged model.

    Loads the base model named by ``cfg["models"]["qwen_7b_hf"]``, attaches a
    LoRA adapter configured from ``training_cfg``, trains with TRL's SFTTrainer
    using ``sft_cfg``, then merges the adapter into the base weights and saves
    model and tokenizer.

    Args:
        data: Corpus records with ``system_prompt``, ``input_text`` and
            ``output_text`` keys.

    Returns:
        Directory containing the merged model and tokenizer.

    Raises:
        ValueError: if *data* is empty.
    """
    # Fail before the heavy imports and model load below.
    if not data:
        raise ValueError("train() received no training examples")

    import torch  # type: ignore
    from datasets import Dataset  # type: ignore
    from peft import LoraConfig, TaskType, get_peft_model  # type: ignore
    from transformers import AutoModelForCausalLM, AutoTokenizer  # type: ignore
    from trl import SFTConfig, SFTTrainer  # type: ignore

    # Format first: a record with a missing key raises here, not after the
    # base model has been loaded.
    formatted = format_for_training(data)

    model_id = cfg["models"]["qwen_7b_hf"]
    device = training_cfg["device"]
    print(f"\n Loading base model: {model_id}")
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16 if device == "mps" else torch.bfloat16,
        device_map=device,
        trust_remote_code=True,
    )

    # LoRA config — conservative to prevent overfitting
    lora_config = LoraConfig(
        r=training_cfg["lora_r"],
        lora_alpha=training_cfg["lora_alpha"],
        lora_dropout=training_cfg["lora_dropout"],
        target_modules=training_cfg["target_modules"],
        task_type=TaskType.CAUSAL_LM,
        bias="none",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # Dataset
    dataset = Dataset.from_list(formatted)
    print(f"\n Dataset: {len(dataset)} examples")

    # Output dir
    adapter_dir = REPO_ROOT / output_cfg["adapters_dir"] / MODEL_NAME
    adapter_dir.mkdir(parents=True, exist_ok=True)

    # SFT config
    # NOTE(review): off MPS the weights are loaded as bfloat16 but mixed
    # precision is requested as fp16 (bf16=False). Left as-is; confirm whether
    # bf16=True is intended for non-MPS devices.
    sft_config = SFTConfig(
        output_dir=str(adapter_dir),
        num_train_epochs=sft_cfg["num_epochs"],
        per_device_train_batch_size=sft_cfg["batch_size"],
        gradient_accumulation_steps=sft_cfg["gradient_accumulation"],
        learning_rate=sft_cfg["learning_rate"],
        warmup_ratio=sft_cfg["warmup_ratio"],
        logging_steps=10,
        save_steps=50,
        save_total_limit=2,
        max_length=training_cfg["max_seq_length"],
        fp16=(device != "mps"),
        bf16=False,
        dataloader_pin_memory=False,  # Required for MPS
        report_to="none",
    )
    trainer = SFTTrainer(
        model=model,
        args=sft_config,
        train_dataset=dataset,
        processing_class=tokenizer,
    )
    print(f"\n Training fo-blog-v6: {sft_cfg['num_epochs']} epochs, "
          f"lr={sft_cfg['learning_rate']}, LoRA r={training_cfg['lora_r']}")
    trainer.train()

    # Merge LoRA into base model
    print("\n Merging LoRA adapter into base model...")
    merged_dir = REPO_ROOT / output_cfg["models_dir"] / MODEL_NAME / "merged"
    merged_dir.mkdir(parents=True, exist_ok=True)
    merged_model = model.merge_and_unload()
    merged_model.save_pretrained(str(merged_dir))
    tokenizer.save_pretrained(str(merged_dir))
    print(f" Merged model saved to: {merged_dir}")
    return merged_dir
# ──────────────────────────────────────────────
# 5. Convert to GGUF + quantize
# ──────────────────────────────────────────────
def convert_to_gguf(merged_dir: Path) -> Path:
    """Turn the merged HF checkpoint into a quantized GGUF file.

    Two external commands, both taken from ``llama_cfg``, run in sequence: the
    conversion script writes an f16 GGUF, then the quantize binary produces
    the final file at the configured default quantization. The intermediate
    f16 file is deleted once quantization has succeeded.

    Args:
        merged_dir: Directory holding the merged model.

    Returns:
        Path to the quantized ``<MODEL_NAME>.gguf``.

    Raises:
        subprocess.CalledProcessError: if either command exits non-zero.
    """
    out_dir = REPO_ROOT / output_cfg["models_dir"] / MODEL_NAME
    f16_path = out_dir / f"{MODEL_NAME}-f16.gguf"
    final_path = out_dir / f"{MODEL_NAME}.gguf"
    quant_type = llama_cfg["default_quantization"]

    print("\n Converting to GGUF (fp16)...")
    convert_cmd = [
        llama_cfg["python_bin"],
        llama_cfg["convert_script"],
        str(merged_dir),
        "--outfile", str(f16_path),
        "--outtype", "f16",
    ]
    subprocess.run(convert_cmd, check=True)

    print(f" Quantizing to {quant_type}...")
    quantize_cmd = [
        llama_cfg["quantize_binary"],
        str(f16_path),
        str(final_path),
        quant_type,
    ]
    subprocess.run(quantize_cmd, check=True)

    # Cleanup fp16 (large)
    f16_path.unlink(missing_ok=True)

    size_gb = final_path.stat().st_size / 1e9
    print(f" GGUF: {final_path} ({size_gb:.2f} GB)")
    return final_path
# ──────────────────────────────────────────────
# 6. Register in Ollama via Modelfile
# ──────────────────────────────────────────────
# Ollama Modelfile text, rendered with str.format() in register_in_ollama().
# - {gguf_path} is the only substitution field.
# - Quadrupled braces ({{{{ / }}}}) come out of str.format() as the doubled
#   braces ({{ / }}) used by the template actions inside TEMPLATE.
# - The escaped \"\"\" sequences become the literal triple quotes that
#   delimit the TEMPLATE value in the written Modelfile.
# - The template emits <|im_start|>role ... <|im_end|> turns for system, user
#   and assistant messages; the PARAMETER lines set generation defaults.
# Whitespace and line breaks inside the string are part of the rendered
# prompt format, so edit with care.
MODELFILE_TEMPLATE = """FROM {gguf_path}
TEMPLATE \"\"\"{{{{- if .Messages }}}}
{{{{- if or .System .Tools }}}}<|im_start|>system
{{{{- if .System }}}}
{{{{ .System }}}}
{{{{- end }}}}<|im_end|>
{{{{ end }}}}
{{{{- range $i, $_ := .Messages }}}}
{{{{- $last := eq (len (slice $.Messages $i)) 1 -}}}}
{{{{- if eq .Role "user" }}}}<|im_start|>user
{{{{ .Content }}}}<|im_end|>
{{{{ else if eq .Role "assistant" }}}}<|im_start|>assistant
{{{{ if .Content }}}}{{{{ .Content }}}}{{{{- end }}}}{{{{ if not $last }}}}<|im_end|>
{{{{ end }}}}
{{{{- end }}}}
{{{{- if and (ne .Role "assistant") $last }}}}<|im_start|>assistant
{{{{ end }}}}
{{{{- end }}}}
{{{{- else }}}}
{{{{- if .System }}}}<|im_start|>system
{{{{ .System }}}}<|im_end|>
{{{{ end }}}}{{{{ if .Prompt }}}}<|im_start|>user
{{{{ .Prompt }}}}<|im_end|>
{{{{ end }}}}<|im_start|>assistant
{{{{ end }}}}{{{{ .Response }}}}{{{{ if .Response }}}}<|im_end|>{{{{ end }}}}\"\"\"
PARAMETER num_predict 4096
PARAMETER temperature 0.72
PARAMETER top_k 40
PARAMETER top_p 0.9
PARAMETER repeat_penalty 1.1
"""
def register_in_ollama(gguf_path: Path) -> None:
    """Write a Modelfile beside the GGUF and import it with ``ollama create``.

    The Modelfile is rendered from MODELFILE_TEMPLATE with the resolved GGUF
    path. If ``ollama create`` exits non-zero, its stderr is printed and the
    process exits with status 1. Afterwards ``ollama list`` is checked for
    MODEL_NAME; a miss only prints a warning.

    Args:
        gguf_path: Quantized GGUF file to register.
    """
    modelfile_path = gguf_path.parent / "Modelfile"
    rendered = MODELFILE_TEMPLATE.format(gguf_path=gguf_path.resolve())
    modelfile_path.write_text(rendered)
    print(f"\n Modelfile: {modelfile_path}")

    print(" Registering fo-blog-v6 in Ollama...")
    create_cmd = ["ollama", "create", MODEL_NAME, "-f", str(modelfile_path)]
    created = subprocess.run(create_cmd, capture_output=True, text=True)
    if created.returncode != 0:
        print(f" ERROR: {created.stderr}")
        sys.exit(1)
    print(" ✅ fo-blog-v6 registered in Ollama")

    # Verify
    listing = subprocess.run(["ollama", "list"], capture_output=True, text=True)
    if MODEL_NAME not in listing.stdout:
        print(" ⚠ Warning: fo-blog-v6 not found in `ollama list` output")
    else:
        print(" ✅ Verified: fo-blog-v6 visible in `ollama list`")
# ──────────────────────────────────────────────
# MAIN
# ──────────────────────────────────────────────
def main() -> None:
    """Run the full pipeline: fetch data, train, convert to GGUF, register.

    DB examples and filtered mega-dataset prose are concatenated (DB first)
    and de-duplicated on the first 80 characters of ``output_text``. Records
    without usable output text are skipped. If nothing remains, an error is
    printed and the process exits with status 1 before any training starts.
    """
    banner = "=" * 60
    print(banner)
    print(" fo-blog-v6 Training Pipeline")
    print(banner)
    print(f" Config: {CONFIG_FILE}")
    print(f" Epochs: {sft_cfg['num_epochs']} (was 15 in v5)")
    print(f" LR: {sft_cfg['learning_rate']} (was 2e-4 in v5)")
    print(f" LoRA r: {training_cfg['lora_r']} (was 32 in v5)")
    print()

    print("[1/4] Fetching training data...")
    db_data = fetch_training_data()
    mega_data = load_mega_prose()

    # Merge: DB examples first (highest quality), then filtered mega prose
    # Deduplicate by output prefix across both sources
    seen: set[str] = set()
    data: list[dict] = []
    for ex in db_data + mega_data:
        text = ex.get("output_text")
        # A record with missing/blank output cannot be trained on and would
        # otherwise raise KeyError/TypeError here.
        if not isinstance(text, str) or not text.strip():
            continue
        key = text[:80].strip()
        if key not in seen:
            seen.add(key)
            data.append(ex)

    # Guard the average below (division by zero) and avoid loading the model
    # for an empty dataset.
    if not data:
        print(" ERROR: no training examples available — aborting")
        sys.exit(1)

    avg_len = sum(len(ex["output_text"]) for ex in data) // len(data)
    print(f" Combined: {len(data)} unique examples ({len(db_data)} DB + {len(mega_data)} mega-prose)")
    print(f" Avg output: {avg_len} chars")

    print("\n[2/4] Training LoRA adapter...")
    merged_dir = train(data)

    print("\n[3/4] Converting to GGUF...")
    gguf_path = convert_to_gguf(merged_dir)

    print("\n[4/4] Registering in Ollama...")
    register_in_ollama(gguf_path)

    print("\n" + banner)
    print(" fo-blog-v6 training complete!")
    print(f" GGUF: {gguf_path}")
    print()
    print(" Next steps:")
    print(" 1. Update ecosystem.config.js: OLLAMA_LLM_MODEL=fo-blog-v6")
    print(" 2. Test: ollama run fo-blog-v6 'Write a blog intro about 400G'")
    print(" 3. If output is clean: pm2 restart ecosystem.config.js --update-env")
    print(banner)


if __name__ == "__main__":
    main()