# llm-gateway/packages/fine-tuner/config/blog-v8-training.yaml

# ═══════════════════════════════════════════════════════════════════════════════
# blog-v8-training.yaml — fo-blog-v8 Training Configuration
#
# Base:    Qwen/Qwen2.5-14B-Instruct  (2× the parameter count of v7's 7B)
# Target:  700-1000w blog posts, optical networking + BGP + infra
#
# Key improvements over v7:
#   - 14B params → better instruction following at higher complexity
#   - LoRA r=64 (was r=32) → more expressive adapter
#   - Weighted datasets: human posts × 3.0, external rewritten × 1.5
#   - More epochs (5 SFT, 2 DPO) → deeper style absorption
#   - max_seq_length=4096 → handles longer real posts
#   - DPO from real v7 quality labels (good/bad scored posts)
# ═══════════════════════════════════════════════════════════════════════════════

# Checkpoint that both the SFT and DPO adapters are trained on top of.
base_model: 'Qwen/Qwen2.5-14B-Instruct'

# ─── Dataset Sources (merged by consolidate_v8_dataset.py) ────────────────────
datasets:
  # NOTE(review): the SFT sources (a block sequence) and the `dpo:` key used to
  # sit at the same level directly under `datasets:`. A node cannot be both a
  # sequence and a mapping, so the file failed to parse. The two groups are now
  # sibling keys: `datasets.sft` and `datasets.dpo`. Confirm that
  # consolidate_v8_dataset.py (and any other reader) uses these key paths.
  #
  # Paths start with `~`; YAML does not expand it, so the reader is presumably
  # expected to call expanduser() — verify against the loader.

  # SFT sources. `weight` is the per-source multiplier described in the file
  # header (human posts × 3.0, external rewritten × 1.5).
  sft:
    # Tier 1: Rene's actual blog posts — Gold Standard
    - path: "~/transceiver-training-data/v8-real-posts-sft.jsonl"
      weight: 3.0
      description: "19 real blog posts from blog.fichtmueller.org (human written)"

    # Tier 2: v7 generated blogs (Claude-written, 197 topics, validated)
    - path: "~/transceiver-training-data/v7-generated-sft.jsonl"
      weight: 1.0
      description: "Claude-generated optical networking blogs (v7, 197 topics)"

    # Tier 2: RIPE / APNIC NAS data
    - path: "~/transceiver-training-data/v7-ripe-apnic-sft.jsonl"
      weight: 1.0
      description: "RIPE/APNIC BGP and routing content (v7 ingested)"

    # Tier 3: External crawled + rewritten content
    - path: "~/transceiver-training-data/v8-external-sft.jsonl"
      weight: 1.5
      description: "APNIC Blog / RIPE Labs / potaroo.net / Cloudflare (Claude rewritten)"

  # DPO preferences: chosen/rejected pairs for preference learning
  dpo:
    - path: "~/transceiver-training-data/v7-dpo-pairs.jsonl"
      description: "v7 DPO pairs (5 rejection strategies)"
    - path: "~/transceiver-training-data/v8-quality-dpo.jsonl"
      description: "v8 real quality labels (good/bad from v7 generated posts)"
      optional: true  # generated by label_v7_quality.py if run

# ─── SFT Phase ────────────────────────────────────────────────────────────────
sft:
  # Destinations for the trained LoRA adapter and for the merged model.
  output_dir: 'adapters/fo-blog-v8/adapter'
  merged_dir: 'models/fo-blog-v8/merged'

  # Adapter shape
  lora:
    r: 64            # rank; v7 used 32
    alpha: 128       # held at 2 × r for stable training
    dropout: 0.05
    # Module names the adapter is attached to.
    target_modules:
      - q_proj
      - k_proj
      - v_proj
      - o_proj
      - gate_proj
      - up_proj
      - down_proj

  # Optimisation settings for the supervised pass
  training:
    num_train_epochs: 5              # v7 ran 4; one more for the 14B base
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 8   # 1 × 8 → effective batch of 8
    learning_rate: 1.2e-4            # v7 used 1.5e-4; lowered a little for 14B stability
    warmup_ratio: 0.05
    lr_scheduler_type: 'cosine'
    max_seq_length: 4096             # v7 used 2048; room for longer real posts
    fp16: false
    bf16: true                       # bf16 is available on the M4 Max
    optim: 'adamw_torch'
    weight_decay: 0.01
    max_grad_norm: 1.0
    logging_steps: 10
    save_steps: 100
    evaluation_strategy: 'no'        # quoted: must stay the string "no", not a boolean
    dataloader_num_workers: 0        # no worker processes when running on MPS
    remove_unused_columns: false
    gradient_checkpointing: true     # reduces RAM use with the 14B base

  # Prompt formatting: Qwen2.5 uses ChatML; samples are read from the `text` field.
  chat_template: 'chatml'
  dataset_text_field: 'text'

# ─── DPO Phase ────────────────────────────────────────────────────────────────
dpo:
  # Preference-tuning pass. It resumes from the adapter written by the SFT
  # phase (same path as sft.output_dir) and writes a separate DPO adapter.
  input_adapter: "adapters/fo-blog-v8/adapter"   # start from SFT
  output_dir: "adapters/fo-blog-v8-dpo/adapter"

  training:
    num_train_epochs: 2      # was 1 — more DPO for 14B
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 8
    # Written with an explicit decimal point on purpose: YAML 1.1 loaders
    # (PyYAML among them) only treat exponent notation as a float when the
    # mantissa contains a ".". A bare `5e-5` is loaded as the string "5e-5".
    learning_rate: 5.0e-5
    warmup_ratio: 0.05
    lr_scheduler_type: "cosine"
    # NOTE(review): dpo_params.max_length below carries the same 4096 limit;
    # confirm which of the two keys the DPO trainer actually reads.
    max_seq_length: 4096
    bf16: true
    optim: "adamw_torch"
    logging_steps: 5
    save_steps: 50
    dataloader_num_workers: 0
    gradient_checkpointing: true

  dpo_params:
    beta: 0.1                # KL penalty (standard)
    loss_type: "sigmoid"     # standard DPO loss
    max_prompt_length: 512   # prompt budget inside max_length
    max_length: 4096         # matches the SFT max_seq_length

# ─── GGUF Conversion ──────────────────────────────────────────────────────────
gguf:
  # Export settings: convert the merged model to GGUF, quantize it, and
  # register it with Ollama under `ollama_model_name`.
  output_name: "fo-blog-v8.gguf"
  quantization: "Q4_K_M"
  ollama_model_name: "fo-blog-v8"
  # Use Homebrew's stable `opt/<formula>` symlink instead of the versioned
  # Cellar directory (previously .../Cellar/llama.cpp/8680/...), which stops
  # existing as soon as `brew upgrade llama.cpp` installs a newer build.
  convert_script: "/opt/homebrew/opt/llama.cpp/bin/convert_hf_to_gguf.py"
  quantize_bin: "/opt/homebrew/bin/llama-quantize"

  # Ollama Modelfile system prompt (literal block: line breaks are preserved)
  modelfile_system: |
    You are an expert technical writer specializing in optical networking and transceiver technology.

    STRICT CONSTRAINTS:
    - LENGTH: 700-1000 words ONLY. Stop at 1000 words maximum.
    - STRUCTURE: 1) Hook paragraph, 2) Technical sections (## headers), 3) Exactly 3 takeaways
    - TOPIC DISCIPLINE: Write ONLY about the exact topic requested. Zero drift.
    - NO REPETITION: Every sentence adds new information.
    - VOICE: Confident and direct. No hedging phrases.
    - AUDIENCE: Network engineers and IT professionals.

  # Sampling defaults for the Modelfile.
  modelfile_params:
    temperature: 0.7
    top_p: 0.9
    top_k: 40
    repeat_penalty: 1.15
    num_predict: 1500   # generation cap in tokens; the 1000-word limit is set in the prompt

# ─── Hardware ──────────────────────────────────────────────────────────────────
hardware:
  # Target machine: Apple Silicon M4 Max, training on the MPS device.
  device: 'mps'
  ram_gb: 48
  python: '/opt/homebrew/bin/python3.13'
  # Memory budget (author's estimates):
  #   - 14B weights at 16-bit precision ≈ 28GB, plus ~4GB of LoRA overhead
  #   - expected training peak ≈ 36-40GB, inside the 48GB listed above
  # Merge step: load with device_map="cpu" so save_pretrained does not run
  # into an MPS out-of-memory error.

# ─── Expected Timeline ─────────────────────────────────────────────────────────
# SFT:    ~8-12 hours (5 epochs, 14B, MPS)
# DPO:    ~2-4 hours  (2 epochs, 14B)
# Merge:  ~30 min     (CPU)
# GGUF:   ~15 min
# Total:  ~11-17 hours (sum of the phases above; run overnight)