Full v8 training pipeline for the optical networking blog model:
- train_blog_v8.py: SFT (LoRA r=64, 5 epochs) + DPO (2 epochs) on Qwen2.5-14B-Instruct.
  Fixed for trl 1.2.x: SFTConfig instead of TrainingArguments, processing_class= instead of tokenizer=, eval_strategy= instead of the deprecated evaluation_strategy=
- consolidate_v8_dataset.py: weighted merge of all data sources (820 effective SFT / 235 DPO)
- crawl_v8_sources.py: APNIC/RIPE Labs/potaroo/Cloudflare crawler with balanced div extraction
- process_v6_blogs.py: converts 101 real v6 TIP blog outputs into SFT + DPO pairs
- label_v7_quality.py: Claude-judged quality labels → v8 quality DPO pairs
- parse_real_posts.py: parses blog.fichtmueller.org Ghost CMS HTML → gold SFT records
- run_v8_pipeline.sh: autopilot (consolidate → SFT → DPO → GGUF → Ollama)
- blog-v8-training.yaml: training config reference
Dataset breakdown: 19 real posts ×3 + 196 v7-gen + 28 v6blogs ×2 + 135 external ×1.5
410 lines
14 KiB
Python
410 lines
14 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
train_blog_v8.py — fo-blog-v8 Training (Qwen2.5-14B, MPS LoRA)

Phase 1: SFT (5 epochs, LoRA r=64, from merged v8 dataset)
Phase 2: DPO (2 epochs, from SFT adapter)

Usage:
    python3 scripts/train_blog_v8.py --phase sft
    python3 scripts/train_blog_v8.py --phase dpo
    python3 scripts/train_blog_v8.py --phase both     # SFT then DPO sequentially
    python3 scripts/train_blog_v8.py --phase convert  # merge adapter → GGUF → Ollama

Hardware: Apple Silicon M4 Max (48GB), MPS backend
Estimated: SFT ~10-14h, DPO ~3-5h (run overnight)
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import os
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
# ─── Paths ────────────────────────────────────────────────────────────────────
# Project root: the directory two levels above this script file.
FINE_TUNER_DIR = Path(__file__).parent.parent

# Merged training data lives outside the repository, under the user's home.
DATA_DIR = Path.home().joinpath("transceiver-training-data")
SFT_DATA = DATA_DIR.joinpath("v8-sft-merged.jsonl")
DPO_DATA = DATA_DIR.joinpath("v8-dpo-merged.jsonl")

# Output locations for the two LoRA adapters and the merged full model.
SFT_ADAPTER = FINE_TUNER_DIR.joinpath("adapters", "fo-blog-v8", "adapter")
DPO_ADAPTER = FINE_TUNER_DIR.joinpath("adapters", "fo-blog-v8-dpo", "adapter")
MERGED_DIR = FINE_TUNER_DIR.joinpath("models", "fo-blog-v8", "merged")

# Hugging Face model id used as the base for every phase.
BASE_MODEL = "Qwen/Qwen2.5-14B-Instruct"
|
||
|
||
SYSTEM_PROMPT = """You are an expert technical writer specializing in optical networking, transceiver technology, and network infrastructure.
|
||
|
||
STRICT CONSTRAINTS — Follow exactly, no exceptions:
|
||
- LENGTH: 700–1000 words. Count carefully. Stop at 1000 words maximum.
|
||
- STRUCTURE (mandatory, in this order):
|
||
1. HOOK paragraph — 2–3 sentences stating the problem this post addresses
|
||
2. Technical sections — 3–4 H2 sections covering the topic in depth
|
||
3. PRACTICAL TAKEAWAYS — exactly 3 bullet points, actionable
|
||
- TOPIC DISCIPLINE: Write ONLY about the exact topic requested. Zero drift.
|
||
- NO REPETITION: Every sentence must add new information. No restating.
|
||
- VOICE: Confident, direct. No hedging phrases like "it's worth noting".
|
||
- AUDIENCE: Network engineers and IT professionals. Assume technical fluency.
|
||
- FORMAT: Markdown. Use ## for section headers. Use **bold** for key terms.
|
||
|
||
Do not summarize what you are about to write. Start with the hook directly."""
|
||
|
||
|
||
def build_chatml(system: str, user: str, assistant: str) -> str:
    """Render one system/user/assistant exchange as a single ChatML string.

    Each turn is wrapped in ``<|im_start|>{role}`` ... ``<|im_end|>`` markers.
    Turns are separated by a newline; nothing follows the final ``<|im_end|>``.

    Args:
        system: Content of the system turn.
        user: Content of the user turn.
        assistant: Content of the assistant turn (the training target).

    Returns:
        The concatenated ChatML text for the three turns.
    """
    turns = (("system", system), ("user", user), ("assistant", assistant))
    return "\n".join(
        f"<|im_start|>{role}\n{content}<|im_end|>" for role, content in turns
    )
|
||
|
||
|
||
def load_sft_dataset(tokenizer, max_seq_length: int = 4096):
    """Load the merged SFT JSONL file as a ``datasets.Dataset`` of ChatML text.

    Each non-empty line of ``SFT_DATA`` is parsed as a JSON object. Records
    need non-empty ``input_text`` and ``output_text``; ``system_prompt`` is
    optional and falls back to ``SYSTEM_PROMPT`` when absent or null. Accepted
    records become ``{"text": <ChatML string>}`` rows.

    Lines that are not valid JSON objects, or that lack input/output text, are
    skipped. The number of skipped lines is printed so that data problems are
    visible instead of silently shrinking the training set.

    Args:
        tokenizer: Accepted for interface compatibility; not used here
            (no tokenization happens in this function).
        max_seq_length: Accepted for interface compatibility; not used here.

    Returns:
        A ``datasets.Dataset`` with a single ``text`` column.

    Raises:
        FileNotFoundError: If ``SFT_DATA`` does not exist.
    """
    from datasets import Dataset

    if not SFT_DATA.exists():
        raise FileNotFoundError(
            f"SFT data not found: {SFT_DATA}\n"
            "Run: python3 scripts/consolidate_v8_dataset.py"
        )

    records = []
    malformed = 0   # lines that are not a JSON object
    incomplete = 0  # JSON objects missing input_text or output_text
    with open(SFT_DATA, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                malformed += 1
                continue
            # A bare list/number/string is valid JSON but not a record.
            if not isinstance(item, dict):
                malformed += 1
                continue

            # Treat an explicit null like a missing key; otherwise the literal
            # text "None" would be rendered into the system turn.
            system = item.get("system_prompt")
            if system is None:
                system = SYSTEM_PROMPT
            user = item.get("input_text", "")
            assistant = item.get("output_text", "")
            if user and assistant:
                records.append({"text": build_chatml(system, user, assistant)})
            else:
                incomplete += 1

    print(f"Loaded {len(records)} SFT examples from {SFT_DATA.name}")
    if malformed or incomplete:
        print(
            f"WARNING: skipped {malformed} malformed and "
            f"{incomplete} incomplete line(s) in {SFT_DATA.name}"
        )
    return Dataset.from_list(records)
|
||
|
||
|
||
def load_dpo_dataset():
    """Load the merged DPO JSONL file as a ``datasets.Dataset`` of preference pairs.

    Each non-empty line of ``DPO_DATA`` is parsed as a JSON object that must
    contain the keys ``prompt``, ``chosen`` and ``rejected``; only those three
    keys are kept. Lines that are not valid JSON objects or that miss one of
    the keys are skipped, and the number of skipped lines is printed so that
    data problems do not go unnoticed.

    Returns:
        A ``datasets.Dataset`` with ``prompt``, ``chosen`` and ``rejected``
        columns.

    Raises:
        FileNotFoundError: If ``DPO_DATA`` does not exist.
    """
    from datasets import Dataset

    if not DPO_DATA.exists():
        raise FileNotFoundError(
            f"DPO data not found: {DPO_DATA}\n"
            "Run: python3 scripts/consolidate_v8_dataset.py"
        )

    required_keys = ("prompt", "chosen", "rejected")
    records = []
    skipped = 0
    with open(DPO_DATA, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue
            # Valid JSON that is not an object, or an object missing a key,
            # cannot form a preference pair.
            if not isinstance(item, dict) or any(k not in item for k in required_keys):
                skipped += 1
                continue
            records.append({key: item[key] for key in required_keys})

    print(f"Loaded {len(records)} DPO pairs from {DPO_DATA.name}")
    if skipped:
        print(f"WARNING: skipped {skipped} malformed or incomplete line(s) in {DPO_DATA.name}")
    return Dataset.from_list(records)
|
||
|
||
|
||
def run_sft() -> None:
    """Phase 1: supervised fine-tuning of ``BASE_MODEL`` with a LoRA adapter.

    Loads the tokenizer and the merged SFT dataset, loads the base model in
    bfloat16 on MPS (CPU if MPS is unavailable), trains a rank-64 LoRA adapter
    with TRL's ``SFTTrainer``, and saves adapter and tokenizer to
    ``SFT_ADAPTER``.

    Raises:
        FileNotFoundError: Propagated from ``load_sft_dataset`` when the
            merged SFT file is missing.
        ValueError: If the SFT dataset contains no usable examples.
    """
    import torch
    from peft import LoraConfig, TaskType
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from trl import SFTConfig, SFTTrainer

    max_length = 4096
    num_epochs = 5
    device = "mps" if torch.backends.mps.is_available() else "cpu"

    print(f"=== fo-blog-v8 SFT: {BASE_MODEL} → LoRA r=64 ===")
    print(f"Device: {device.upper()}")

    # ── Tokenizer ──
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    # Only fall back to EOS when the tokenizer has no pad token of its own.
    # Aliasing pad to EOS unconditionally makes padding indistinguishable from
    # the end-of-turn token for any component that masks by pad id.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # ── Dataset ──
    dataset = load_sft_dataset(tokenizer, max_seq_length=max_length)
    # Fail before the multi-GB model load rather than deep inside the trainer.
    if len(dataset) == 0:
        raise ValueError(f"No usable SFT examples found in {SFT_DATA}")

    # ── Model ──
    print(f"Loading base model: {BASE_MODEL}")
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        dtype=torch.bfloat16,  # bf16 for M4 Max (transformers 5.x: dtype= not torch_dtype=)
        device_map=device,
        trust_remote_code=True,
    )
    # The KV cache is incompatible with gradient checkpointing during training.
    model.config.use_cache = False

    # ── LoRA Config ──
    lora_config = LoraConfig(
        r=64,
        lora_alpha=128,
        lora_dropout=0.05,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )

    # ── Training Config (trl 1.x: SFTConfig carries both TrainingArguments + SFT params) ──
    SFT_ADAPTER.mkdir(parents=True, exist_ok=True)
    training_args = SFTConfig(
        output_dir=str(SFT_ADAPTER),
        num_train_epochs=num_epochs,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        learning_rate=1.2e-4,
        warmup_ratio=0.05,
        lr_scheduler_type="cosine",
        bf16=True,
        fp16=False,
        optim="adamw_torch",
        weight_decay=0.01,
        max_grad_norm=1.0,
        logging_steps=10,
        save_steps=100,
        save_total_limit=2,
        eval_strategy="no",
        dataloader_num_workers=0,
        remove_unused_columns=False,
        gradient_checkpointing=True,
        report_to="none",
        # SFT-specific (moved from SFTTrainer in trl 1.x)
        dataset_text_field="text",
        # `max_seq_length` was renamed to `max_length` in TRL; the old name is
        # no longer accepted by current SFTConfig.
        max_length=max_length,
        packing=False,
    )

    # ── Trainer ──
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        peft_config=lora_config,
        processing_class=tokenizer,
        args=training_args,
    )

    print(f"Starting SFT training: {len(dataset)} examples, {num_epochs} epochs...")
    trainer.train()

    print(f"Saving SFT adapter → {SFT_ADAPTER}")
    trainer.save_model(str(SFT_ADAPTER))
    tokenizer.save_pretrained(str(SFT_ADAPTER))
    print("SFT Phase COMPLETE.")
|
||
|
||
|
||
def run_dpo() -> None:
    """Phase 2: Direct Preference Optimization on top of the SFT adapter.

    Loads the tokenizer saved with the SFT adapter, the merged DPO dataset,
    and the base model with the SFT LoRA adapter attached as a *trainable*
    adapter, then runs TRL's ``DPOTrainer`` and saves adapter and tokenizer
    to ``DPO_ADAPTER``.

    Raises:
        FileNotFoundError: If no saved SFT adapter is present in
            ``SFT_ADAPTER``, or (propagated from ``load_dpo_dataset``) if the
            merged DPO file is missing.
    """
    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from trl import DPOConfig, DPOTrainer

    print("=== fo-blog-v8 DPO: SFT adapter → DPO ===")

    # The SFT phase creates the directory before training starts, so the
    # directory alone does not prove an adapter was saved; check for the
    # adapter config file that PEFT writes on save.
    if not (SFT_ADAPTER / "adapter_config.json").is_file():
        raise FileNotFoundError(
            f"SFT adapter not found at {SFT_ADAPTER}\n"
            "Run: python3 scripts/train_blog_v8.py --phase sft"
        )

    # ── Tokenizer ──
    tokenizer = AutoTokenizer.from_pretrained(str(SFT_ADAPTER), trust_remote_code=True)
    # Only fall back to EOS when the tokenizer has no pad token of its own.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # ── Dataset ──
    dataset = load_dpo_dataset()

    # ── Model (base + SFT adapter) ──
    print("Loading model + SFT adapter...")
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        dtype=torch.bfloat16,
        device_map=device,
        trust_remote_code=True,
    )
    # is_trainable=True is required: by default PEFT loads adapters frozen
    # (inference mode), which would leave DPO with no trainable parameters.
    model = PeftModel.from_pretrained(base_model, str(SFT_ADAPTER), is_trainable=True)

    # ── DPO Config ──
    DPO_ADAPTER.mkdir(parents=True, exist_ok=True)
    dpo_config = DPOConfig(
        output_dir=str(DPO_ADAPTER),
        num_train_epochs=2,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        learning_rate=5e-5,
        warmup_ratio=0.05,
        lr_scheduler_type="cosine",
        bf16=True,
        fp16=False,
        optim="adamw_torch",
        max_grad_norm=1.0,
        logging_steps=5,
        save_steps=50,
        save_total_limit=2,
        eval_strategy="no",
        dataloader_num_workers=0,
        gradient_checkpointing=True,
        report_to="none",
        # DPO-specific
        beta=0.1,
        loss_type="sigmoid",
        # NOTE(review): confirm `max_prompt_length` is still accepted by the
        # installed TRL version's DPOConfig.
        max_prompt_length=512,
        max_length=4096,
    )

    # ── Trainer ──
    # NOTE(review): with a PEFT model and ref_model=None, TRL is expected to
    # compute reference log-probs with the adapter disabled, i.e. against the
    # base model rather than the SFT policy — confirm this is intended.
    trainer = DPOTrainer(
        model=model,
        ref_model=None,  # use implicit reference via peft
        args=dpo_config,
        train_dataset=dataset,
        processing_class=tokenizer,
    )

    print(f"Starting DPO training: {len(dataset)} pairs, 2 epochs...")
    trainer.train()

    print(f"Saving DPO adapter → {DPO_ADAPTER}")
    trainer.save_model(str(DPO_ADAPTER))
    tokenizer.save_pretrained(str(DPO_ADAPTER))
    print("DPO Phase COMPLETE.")
|
||
|
||
|
||
def run_merge_and_convert() -> None:
    """Merge adapter → full model, convert to GGUF, register in Ollama.

    Steps, each skipped when its output already exists:
      1. Merge the DPO adapter (or the SFT adapter if no DPO adapter was
         saved) into ``BASE_MODEL`` on CPU and save it to ``MERGED_DIR``.
      2. Copy missing tokenizer files from the local Hugging Face cache.
      3. Convert the merged model to an f16 GGUF, quantize it to Q4_K_M and
         delete the f16 intermediate.
      4. Write an Ollama Modelfile and register the model as ``fo-blog-v8``.

    If no saved adapter is found, a message is printed and the function
    returns without doing anything.

    Raises:
        subprocess.CalledProcessError: If GGUF conversion, quantization or
            ``ollama create`` exits with a non-zero status.
    """
    import shutil
    import subprocess

    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Prefer DPO adapter, fall back to SFT. Both training phases create their
    # output directory before training starts, so a directory may exist
    # without a saved adapter; require the adapter config file PEFT writes.
    adapter_path = next(
        (p for p in (DPO_ADAPTER, SFT_ADAPTER) if (p / "adapter_config.json").is_file()),
        None,
    )
    if adapter_path is None:
        print("No adapter found. Run --phase sft first.")
        return

    print("=== fo-blog-v8 Merge + GGUF ===")
    print(f"Adapter: {adapter_path}")

    # ── Merge ──
    MERGED_DIR.mkdir(parents=True, exist_ok=True)
    # A merged checkpoint is either a single model.safetensors or a set of
    # shards described by model.safetensors.index.json; accept both so a
    # sharded save is not re-merged on every run.
    single_file = MERGED_DIR / "model.safetensors"
    shard_index = MERGED_DIR / "model.safetensors.index.json"
    merged_bytes = sum(p.stat().st_size for p in MERGED_DIR.glob("*.safetensors"))
    already_merged = (
        (single_file.exists() or shard_index.exists())
        and merged_bytes > 10_000_000_000
    )
    if already_merged:
        print(f" Already merged ({merged_bytes/1e9:.1f} GB) — skip merge")
    else:
        print(" Loading base model on CPU for merge (avoids MPS OOM)...")
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL, dtype=torch.float16,
            device_map="cpu", trust_remote_code=True,
        )
        print(" Loading adapter...")
        model = PeftModel.from_pretrained(model, str(adapter_path))
        print(" Merging...")
        model = model.merge_and_unload()
        print(f" Saving merged model → {MERGED_DIR}")
        model.save_pretrained(str(MERGED_DIR), safe_serialization=True)
        tokenizer.save_pretrained(str(MERGED_DIR))
        del model  # release the merged weights before spawning the converter
        print(" Merge done.")

    # ── Copy tokenizer files from HF cache if needed ──
    hf_cache = Path.home() / ".cache/huggingface/hub"
    snaps = list(hf_cache.glob("models--Qwen--Qwen2.5-14B-Instruct/snapshots/*/tokenizer.json"))
    if snaps:
        snap_dir = snaps[0].parent
        for fname in ["tokenizer.json", "tokenizer_config.json", "vocab.json", "merges.txt"]:
            if (snap_dir / fname).exists() and not (MERGED_DIR / fname).exists():
                shutil.copy2(snap_dir / fname, MERGED_DIR / fname)

    # ── GGUF Conversion ──
    gguf_dir = FINE_TUNER_DIR / "models" / "fo-blog-v8"
    gguf_f16 = gguf_dir / "fo-blog-v8-f16.gguf"
    gguf_q4 = gguf_dir / "fo-blog-v8.gguf"
    convert_script = "/opt/homebrew/Cellar/llama.cpp/8680/bin/convert_hf_to_gguf.py"
    quantize_bin = "/opt/homebrew/bin/llama-quantize"
    python_bin = "/opt/homebrew/bin/python3.13"

    if not gguf_q4.exists():
        # The f16 file is only an intermediate for quantization, so it is
        # produced (or reused) only when the quantized file is still missing.
        if not gguf_f16.exists():
            print(" Converting to GGUF f16...")
            subprocess.run(
                [python_bin, convert_script, str(MERGED_DIR),
                 "--outfile", str(gguf_f16), "--outtype", "f16"],
                check=True,
            )
        else:
            print(f" F16 GGUF exists ({gguf_f16.stat().st_size/1e9:.1f} GB) — skip")

        print(" Quantizing to Q4_K_M...")
        subprocess.run(
            [quantize_bin, str(gguf_f16), str(gguf_q4), "Q4_K_M"],
            check=True,
        )
        gguf_f16.unlink(missing_ok=True)
    print(f" Q4_K_M GGUF: {gguf_q4} ({gguf_q4.stat().st_size/1e9:.1f} GB)")

    # ── Ollama Registration ──
    modelfile_path = gguf_dir / "Modelfile-v8"
    modelfile_content = f"""FROM {gguf_q4.resolve()}

SYSTEM \"\"\"{SYSTEM_PROMPT}\"\"\"

PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER top_k 40
PARAMETER repeat_penalty 1.15
PARAMETER num_predict 1500
"""
    # The system prompt contains non-ASCII punctuation; write UTF-8 explicitly
    # instead of relying on the platform default encoding.
    modelfile_path.write_text(modelfile_content, encoding="utf-8")
    print(" Registering in Ollama as fo-blog-v8...")
    subprocess.run(["ollama", "create", "fo-blog-v8", "-f", str(modelfile_path)], check=True)

    # Confirm the model name shows up in `ollama list`.
    result = subprocess.run(["ollama", "list"], capture_output=True, text=True)
    registered = "fo-blog-v8" in result.stdout
    print(f" Ollama registration: {'✓ SUCCESS' if registered else '✗ FAILED'}")
    print(f"\nDONE: {gguf_q4}")
|
||
|
||
|
||
def main() -> None:
    """Parse ``--phase`` from the command line and run the matching step(s).

    ``sft`` and ``dpo`` run a single training phase, ``both`` runs SFT and
    then DPO, and ``convert`` runs the merge/GGUF/Ollama step. The default
    phase is ``sft``.
    """
    # Ordered steps to execute for each accepted --phase value.
    steps_by_phase = {
        "sft": (run_sft,),
        "dpo": (run_dpo,),
        "both": (run_sft, run_dpo),
        "convert": (run_merge_and_convert,),
    }

    parser = argparse.ArgumentParser(description="Train fo-blog-v8 (Qwen2.5-14B LoRA)")
    parser.add_argument(
        "--phase",
        choices=list(steps_by_phase),
        default="sft",
        help="Training phase to run (default: sft)",
    )
    selected = parser.parse_args().phase

    for step in steps_by_phase[selected]:
        step()


if __name__ == "__main__":
    main()
|