- ADR-0001: Multi-Agent Coworking Architecture with LLM Gateway Orchestrator - ADR-0002: Tier Assignment Strategy for Model Selection (cost-first escalation) - ADR-0003: Confidence Gate Thresholds & Learning Cycle Intervals (6h/12h/24h cycles) - ADR-0004: External Provider Fallback Chain Ordering (Cerebras → Groq → Mistral) - Enhanced client SDK: Offline Ollama fallback, health checks, exponential backoff retry - Integration tests: claude-code-integration.test.ts (14 test cases) - PHASE_2F_DEPLOYMENT.md: Pre-deployment checklist, automated deploy, rollback plan - Post-deployment verification procedures for health, client fallback, metrics
418 lines
16 KiB
Python
418 lines
16 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
fo-blog-v6 Training Script
|
||
===========================
|
||
Fixes all root causes of fo-blog-v5 mode collapse:
|
||
|
||
ROOT CAUSES OF v5 FAILURE:
|
||
1. System-prompt mismatch: trained with 1-line prompt, inference sends 2000-char prompt
|
||
2. Wrong training data: 15 JSON stub examples instead of real blog articles
|
||
3. Over-training: 15 epochs × 15 examples = catastrophic overfitting
|
||
4. Too aggressive LoRA: r=32, α=64 → memorized training set
|
||
|
||
FIXES IN v6:
|
||
1. Uses EXACT system_prompt from LLM gateway DB (same prompt used to generate articles)
|
||
2. Loads 280 real blog articles from llm_gateway.learning_corpus (avg 7182 chars)
|
||
3. 3 epochs only (was 15) — standard SFT
|
||
4. Conservative LoRA: r=16, α=32, dropout=0.1
|
||
5. Lower learning rate: 5e-5 (was 2e-4)
|
||
6. Q4_K_M quantization (was Q5_K_M — same quality, 15% smaller)
|
||
|
||
Usage:
|
||
    cd "/Users/renefichtmueller/Desktop/Claude Code/llm-gateway/packages/fine-tuner"
|
||
python3 scripts/train_blog_v6.py
|
||
|
||
Requires:
|
||
- SSH access to Erik (82.165.222.127) with port forwarding
|
||
- Ollama running on Mac Studio (:11434)
|
||
- HuggingFace model cache (Qwen/Qwen2.5-7B-Instruct)
|
||
- pip: transformers trl peft torch datasets psycopg2-binary pyyaml
|
||
"""
|
||
|
||
import json
|
||
import os
|
||
import subprocess
|
||
import sys
|
||
import time
|
||
from pathlib import Path
|
||
|
||
import yaml
|
||
|
||
# Package root: this script lives one directory below it.
REPO_ROOT = Path(__file__).parent.parent

# Training configuration (YAML) and the on-disk cache of fetched DB examples.
CONFIG_FILE = REPO_ROOT.joinpath("config", "fo-blog-v6.yaml")
DATA_CACHE = REPO_ROOT.joinpath("data", "fo-blog-v6-training.jsonl")

# Name used for output directories, the GGUF file and the Ollama model.
MODEL_NAME = "fo-blog-v6"

# Local port for the SSH tunnel to Erik's PostgreSQL.
TUNNEL_PORT = 15432

# Optional extra dataset; it is skipped when the file does not exist.
MEGA_DATASET = Path(
    "/Users/renefichtmueller/Desktop/BlogLLM-v5-Mega-Training/mega-training-dataset.jsonl"
)
|
||
|
||
# ──────────────────────────────────────────────
|
||
# 1. Load config
|
||
# ──────────────────────────────────────────────
|
||
# Parse the YAML config once at import time; the sections below are shared
# by every pipeline stage in this module.
with CONFIG_FILE.open() as f:
    cfg = yaml.safe_load(f)

# Top-level sections.
training_cfg = cfg["training"]
llama_cfg = cfg["llama_cpp"]
output_cfg = cfg["output"]

# SFT hyper-parameters are nested under the training section.
sft_cfg = training_cfg["sft"]
|
||
|
||
|
||
# ──────────────────────────────────────────────
|
||
# 2. Fetch training data from Erik via SSH tunnel
|
||
# ──────────────────────────────────────────────
|
||
def fetch_training_data() -> list[dict]:
    """Fetch the fo-blog-v1 corpus from the LLM gateway DB on Erik via SSH tunnel.

    If ``DATA_CACHE`` already exists, its JSONL content is returned and no
    network access happens. Otherwise an ``ssh -N -L`` tunnel to host alias
    ``erik`` is opened, approved ``fo-blog-v1`` rows with more than 500
    characters of output are selected, near-duplicates (same first 100
    characters of output) are dropped, and the result is cached to disk.

    Returns:
        A list of dicts with keys ``system_prompt``, ``input_text``,
        ``output_text`` and ``quality_score``.

    Raises:
        RuntimeError: if the ssh tunnel process exits before the DB
            connection is attempted.
    """
    if DATA_CACHE.exists():
        print(f" Using cached training data: {DATA_CACHE}")
        # The cache is written with ensure_ascii=False, so read it back as
        # UTF-8 explicitly instead of relying on the locale default.
        with open(DATA_CACHE, encoding="utf-8") as f:
            data = [json.loads(line) for line in f if line.strip()]
        print(f" Loaded {len(data)} examples from cache")
        return data

    print(" Opening SSH tunnel to Erik (82.165.222.127:5432)...")
    tunnel = subprocess.Popen(
        ["ssh", "-N", "-L", f"{TUNNEL_PORT}:127.0.0.1:5432", "erik"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    try:
        time.sleep(3)  # Wait for tunnel to establish

        # ssh's stderr is discarded, so surface an early exit here rather
        # than as an opaque "connection refused" from the DB driver.
        if tunnel.poll() is not None:
            raise RuntimeError(
                f"SSH tunnel to 'erik' exited early (code {tunnel.returncode}); "
                "check that `ssh erik` works non-interactively"
            )

        import psycopg2  # type: ignore

        conn = psycopg2.connect(
            host="127.0.0.1",
            port=TUNNEL_PORT,
            user="llm",
            # NOTE(review): the literal fallback keeps existing setups working,
            # but a credential committed to source should be rotated and the
            # fallback removed once LLM_GATEWAY_DB_PASSWORD is set everywhere.
            password=os.environ.get("LLM_GATEWAY_DB_PASSWORD", "llm_secure_2026"),
            dbname="llm_gateway",
            connect_timeout=10,
        )
        try:
            cur = conn.cursor()
            cur.execute("""
                SELECT system_prompt, input_text, output_text, quality_score
                FROM learning_corpus
                WHERE task_type = 'fo-blog-v1'
                AND status = 'approved'
                AND output_text IS NOT NULL
                AND LENGTH(output_text) > 500
                ORDER BY quality_score DESC, created_at DESC
            """)
            rows = cur.fetchall()
        finally:
            # Close the connection even when the query fails.
            conn.close()

        print(f" Fetched {len(rows)} examples from Erik DB")

        # Deduplicate by output prefix (avoid near-duplicates). Rows arrive
        # ordered by quality, so the first occurrence kept is the best one.
        seen = set()
        data = []
        for sys_prompt, inp, out, score in rows:
            key = out[:100].strip()
            if key in seen:
                continue
            seen.add(key)
            data.append({
                "system_prompt": sys_prompt or "",
                "input_text": inp or "",
                "output_text": out,
                "quality_score": float(score) if score is not None else 9.0,
            })

        print(f" After dedup: {len(data)} unique examples")

        # An empty cache file would be picked up by every later run and
        # permanently hide the DB, so only cache a non-empty result.
        if not data:
            print(" No examples returned from DB — not writing cache")
            return data

        # Cache to disk
        DATA_CACHE.parent.mkdir(parents=True, exist_ok=True)
        with open(DATA_CACHE, "w", encoding="utf-8") as f:
            for ex in data:
                f.write(json.dumps(ex, ensure_ascii=False) + "\n")
        print(f" Cached to {DATA_CACHE}")
        return data

    finally:
        tunnel.terminate()
        tunnel.wait()
|
||
|
||
|
||
# ──────────────────────────────────────────────
|
||
# 2b. Load prose-only examples from mega dataset
|
||
# ──────────────────────────────────────────────
|
||
def load_mega_prose() -> list[dict]:
    """Extract prose-only examples from the v5 mega dataset.

    Filters out any example with markdown headers (##, #) or heavy bullet
    lists, and any example whose ``output_text`` is missing, null or blank.
    These were crawled from Fearghas/RIR/NANOG sources and are domain-relevant.

    Returns:
        The kept example dicts as read from the JSONL file, or an empty list
        when ``MEGA_DATASET`` does not exist.
    """
    if not MEGA_DATASET.exists():
        print(f" Mega dataset not found at {MEGA_DATASET} — skipping")
        return []

    with open(MEGA_DATASET, encoding="utf-8") as f:
        mega = [json.loads(line) for line in f if line.strip()]

    def is_prose(out: object) -> bool:
        # Downstream code indexes ex["output_text"] directly, so an example
        # without usable output text must not pass the filter.
        if not isinstance(out, str) or not out.strip():
            return False
        first = out[:500]
        if "# " in first or first.startswith("#"):
            return False
        if out.count("\n## ") > 0 or out.count("\n### ") > 0:
            return False
        if out.count("\n- ") > 5:  # allow a few bullets but not bullet-heavy
            return False
        return True

    # NOTE(review): only output_text is validated here; the ChatML formatter
    # also reads system_prompt and input_text — confirm the mega dataset
    # always carries those keys.
    prose = [ex for ex in mega if is_prose(ex.get("output_text"))]
    print(f" Mega dataset: {len(mega)} total → {len(prose)} prose-only kept")
    return prose
|
||
|
||
|
||
# ──────────────────────────────────────────────
|
||
# 3. Format for SFT training (ChatML)
|
||
# ──────────────────────────────────────────────
|
||
def format_for_training(examples: list[dict]) -> list[dict]:
    """Wrap each corpus example in a ChatML-style ``messages`` record.

    Every example must provide ``system_prompt``, ``input_text`` and
    ``output_text``; they become the system, user and assistant turns
    (in that order) of a single conversation.
    """
    role_to_field = (
        ("system", "system_prompt"),
        ("user", "input_text"),
        ("assistant", "output_text"),
    )
    return [
        {
            "messages": [
                {"role": role, "content": example[field]}
                for role, field in role_to_field
            ]
        }
        for example in examples
    ]
|
||
|
||
|
||
# ──────────────────────────────────────────────
|
||
# 4. Train
|
||
# ──────────────────────────────────────────────
|
||
def train(data: list[dict]) -> Path:
    """Fine-tune the base model with a LoRA adapter and merge the result.

    Args:
        data: Corpus examples; they are converted with
            ``format_for_training`` before being handed to the trainer.

    Returns:
        Directory holding the merged model weights and the tokenizer.
    """
    # Heavy ML dependencies are imported lazily so the module can be loaded
    # without them.
    import torch  # type: ignore
    from datasets import Dataset  # type: ignore
    from peft import LoraConfig, TaskType, get_peft_model  # type: ignore
    from transformers import AutoModelForCausalLM, AutoTokenizer  # type: ignore
    from trl import SFTConfig, SFTTrainer  # type: ignore

    base_model_id = cfg["models"]["qwen_7b_hf"]
    device = training_cfg["device"]
    on_mps = device == "mps"

    print(f"\n Loading base model: {base_model_id}")
    tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
    # NOTE(review): off MPS the weights load as bfloat16 while the trainer
    # below is asked for fp16 mixed precision — confirm this combination is
    # intended for non-MPS devices.
    weights_dtype = torch.float16 if on_mps else torch.bfloat16
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        torch_dtype=weights_dtype,
        device_map=device,
        trust_remote_code=True,
    )

    # LoRA config — conservative to prevent overfitting
    adapter_spec = LoraConfig(
        r=training_cfg["lora_r"],
        lora_alpha=training_cfg["lora_alpha"],
        lora_dropout=training_cfg["lora_dropout"],
        target_modules=training_cfg["target_modules"],
        task_type=TaskType.CAUSAL_LM,
        bias="none",
    )
    peft_model = get_peft_model(base_model, adapter_spec)
    peft_model.print_trainable_parameters()

    # Dataset
    train_set = Dataset.from_list(format_for_training(data))
    print(f"\n Dataset: {len(train_set)} examples")

    # Output dir for adapter checkpoints
    adapter_dir = REPO_ROOT / output_cfg["adapters_dir"] / MODEL_NAME
    adapter_dir.mkdir(parents=True, exist_ok=True)

    # SFT config
    trainer_args = SFTConfig(
        output_dir=str(adapter_dir),
        num_train_epochs=sft_cfg["num_epochs"],
        per_device_train_batch_size=sft_cfg["batch_size"],
        gradient_accumulation_steps=sft_cfg["gradient_accumulation"],
        learning_rate=sft_cfg["learning_rate"],
        warmup_ratio=sft_cfg["warmup_ratio"],
        logging_steps=10,
        save_steps=50,
        save_total_limit=2,
        max_length=training_cfg["max_seq_length"],
        fp16=not on_mps,
        bf16=False,
        dataloader_pin_memory=False,  # Required for MPS
        report_to="none",
    )

    trainer = SFTTrainer(
        model=peft_model,
        args=trainer_args,
        train_dataset=train_set,
        processing_class=tokenizer,
    )

    print(f"\n Training fo-blog-v6: {sft_cfg['num_epochs']} epochs, "
          f"lr={sft_cfg['learning_rate']}, LoRA r={training_cfg['lora_r']}")
    trainer.train()

    # Merge LoRA into base model so the GGUF converter sees plain weights
    print("\n Merging LoRA adapter into base model...")
    merged_dir = REPO_ROOT / output_cfg["models_dir"] / MODEL_NAME / "merged"
    merged_dir.mkdir(parents=True, exist_ok=True)
    merged_target = str(merged_dir)
    peft_model.merge_and_unload().save_pretrained(merged_target)
    tokenizer.save_pretrained(merged_target)
    print(f" Merged model saved to: {merged_dir}")
    return merged_dir
|
||
|
||
|
||
# ──────────────────────────────────────────────
|
||
# 5. Convert to GGUF + quantize
|
||
# ──────────────────────────────────────────────
|
||
def convert_to_gguf(merged_dir: Path) -> Path:
    """Turn the merged HF checkpoint into a quantized GGUF file for Ollama.

    Runs the configured llama.cpp convert script to produce an f16 GGUF,
    quantizes it with the configured quantize binary, then removes the
    f16 intermediate.

    Args:
        merged_dir: Directory containing the merged HF model.

    Returns:
        Path of the quantized GGUF file.

    Raises:
        subprocess.CalledProcessError: if either external tool exits non-zero.
    """
    out_dir = REPO_ROOT / output_cfg["models_dir"] / MODEL_NAME
    f16_path = out_dir / f"{MODEL_NAME}-f16.gguf"
    quantized_path = out_dir / f"{MODEL_NAME}.gguf"
    quantization = llama_cfg["default_quantization"]

    print("\n Converting to GGUF (fp16)...")
    convert_cmd = [
        llama_cfg["python_bin"],
        llama_cfg["convert_script"],
        str(merged_dir),
        "--outfile", str(f16_path),
        "--outtype", "f16",
    ]
    subprocess.run(convert_cmd, check=True)

    print(f" Quantizing to {quantization}...")
    quantize_cmd = [
        llama_cfg["quantize_binary"],
        str(f16_path),
        str(quantized_path),
        quantization,
    ]
    subprocess.run(quantize_cmd, check=True)

    # Cleanup fp16 (large)
    f16_path.unlink(missing_ok=True)
    size_gb = quantized_path.stat().st_size / 1e9
    print(f" GGUF: {quantized_path} ({size_gb:.2f} GB)")
    return quantized_path
|
||
|
||
|
||
# ──────────────────────────────────────────────
|
||
# 6. Register in Ollama via Modelfile
|
||
# ──────────────────────────────────────────────
|
||
MODELFILE_TEMPLATE = """FROM {gguf_path}
|
||
TEMPLATE \"\"\"{{{{- if .Messages }}}}
|
||
{{{{- if or .System .Tools }}}}<|im_start|>system
|
||
{{{{- if .System }}}}
|
||
{{{{ .System }}}}
|
||
{{{{- end }}}}<|im_end|>
|
||
{{{{ end }}}}
|
||
{{{{- range $i, $_ := .Messages }}}}
|
||
{{{{- $last := eq (len (slice $.Messages $i)) 1 -}}}}
|
||
{{{{- if eq .Role "user" }}}}<|im_start|>user
|
||
{{{{ .Content }}}}<|im_end|>
|
||
{{{{ else if eq .Role "assistant" }}}}<|im_start|>assistant
|
||
{{{{ if .Content }}}}{{{{ .Content }}}}{{{{- end }}}}{{{{ if not $last }}}}<|im_end|>
|
||
{{{{ end }}}}
|
||
{{{{- end }}}}
|
||
{{{{- if and (ne .Role "assistant") $last }}}}<|im_start|>assistant
|
||
{{{{ end }}}}
|
||
{{{{- end }}}}
|
||
{{{{- else }}}}
|
||
{{{{- if .System }}}}<|im_start|>system
|
||
{{{{ .System }}}}<|im_end|>
|
||
{{{{ end }}}}{{{{ if .Prompt }}}}<|im_start|>user
|
||
{{{{ .Prompt }}}}<|im_end|>
|
||
{{{{ end }}}}<|im_start|>assistant
|
||
{{{{ end }}}}{{{{ .Response }}}}{{{{ if .Response }}}}<|im_end|>{{{{ end }}}}\"\"\"
|
||
PARAMETER num_predict 4096
|
||
PARAMETER temperature 0.72
|
||
PARAMETER top_k 40
|
||
PARAMETER top_p 0.9
|
||
PARAMETER repeat_penalty 1.1
|
||
"""
|
||
|
||
|
||
def register_in_ollama(gguf_path: Path) -> None:
    """Write a Modelfile next to the GGUF and register the model in Ollama.

    Exits the process with status 1 when ``ollama create`` fails. Afterwards
    ``ollama list`` is checked for the model name; a miss only prints a
    warning.

    Args:
        gguf_path: Quantized GGUF file; the Modelfile is written into its
            parent directory and references its resolved absolute path.
    """
    modelfile_path = gguf_path.parent / "Modelfile"
    modelfile_path.write_text(
        MODELFILE_TEMPLATE.format(gguf_path=gguf_path.resolve())
    )
    print(f"\n Modelfile: {modelfile_path}")

    print(" Registering fo-blog-v6 in Ollama...")
    create_cmd = ["ollama", "create", MODEL_NAME, "-f", str(modelfile_path)]
    created = subprocess.run(create_cmd, capture_output=True, text=True)
    if created.returncode != 0:
        print(f" ERROR: {created.stderr}")
        sys.exit(1)
    print(" ✅ fo-blog-v6 registered in Ollama")

    # Verify
    listing = subprocess.run(["ollama", "list"], capture_output=True, text=True)
    if MODEL_NAME not in listing.stdout:
        print(" ⚠ Warning: fo-blog-v6 not found in `ollama list` output")
    else:
        print(" ✅ Verified: fo-blog-v6 visible in `ollama list`")
|
||
|
||
|
||
# ──────────────────────────────────────────────
|
||
# MAIN
|
||
# ──────────────────────────────────────────────
|
||
def main() -> None:
    """Run the full fo-blog-v6 pipeline: fetch, train, convert, register.

    Steps:
        1. Load DB examples and filtered mega-dataset prose, merging them
           with DB examples first and dropping later examples whose first
           80 characters of output were already seen.
        2. Train the LoRA adapter and merge it into the base model.
        3. Convert the merged model to a quantized GGUF.
        4. Register the GGUF in Ollama.

    Exits with status 1 when no usable training example is available.
    """
    print("=" * 60)
    print(" fo-blog-v6 Training Pipeline")
    print("=" * 60)
    print(f" Config: {CONFIG_FILE}")
    print(f" Epochs: {sft_cfg['num_epochs']} (was 15 in v5)")
    print(f" LR: {sft_cfg['learning_rate']} (was 2e-4 in v5)")
    print(f" LoRA r: {training_cfg['lora_r']} (was 32 in v5)")
    print()

    print("[1/4] Fetching training data...")
    db_data = fetch_training_data()
    mega_data = load_mega_prose()

    # Merge: DB examples first (highest quality), then filtered mega prose
    # Deduplicate by output prefix across both sources
    seen: set[str] = set()
    data: list[dict] = []
    for ex in db_data + mega_data:
        output_text = ex.get("output_text")
        # An example without output text cannot be trained on and would
        # otherwise raise KeyError/TypeError on the slice below.
        if not isinstance(output_text, str) or not output_text:
            continue
        key = output_text[:80].strip()
        if key not in seen:
            seen.add(key)
            data.append(ex)

    # Without this guard the average below divides by zero, hiding the real
    # problem (no data) behind a ZeroDivisionError.
    if not data:
        print(" ERROR: no training examples available (DB and mega dataset both empty)")
        sys.exit(1)

    avg_len = sum(len(ex["output_text"]) for ex in data) // len(data)
    print(f" Combined: {len(data)} unique examples ({len(db_data)} DB + {len(mega_data)} mega-prose)")
    print(f" Avg output: {avg_len} chars")

    print("\n[2/4] Training LoRA adapter...")
    merged_dir = train(data)

    print("\n[3/4] Converting to GGUF...")
    gguf_path = convert_to_gguf(merged_dir)

    print("\n[4/4] Registering in Ollama...")
    register_in_ollama(gguf_path)

    print("\n" + "=" * 60)
    print(" fo-blog-v6 training complete!")
    print(f" GGUF: {gguf_path}")
    print()
    print(" Next steps:")
    print(" 1. Update ecosystem.config.js: OLLAMA_LLM_MODEL=fo-blog-v6")
    print(" 2. Test: ollama run fo-blog-v6 'Write a blog intro about 400G'")
    print(" 3. If output is clean: pm2 restart ecosystem.config.js --update-env")
    print("=" * 60)


if __name__ == "__main__":
    main()
|