# llm-gateway/packages/fine-tuner/config/blog-v7-training.yaml

# ═══════════════════════════════════════════════════════════════════════════════
# blog-v7-training.yaml — fo-blog-v7 Training Configuration
#
# Key improvements over v6:
#   1. Anchored system prompt with STRICT length (700-1000w) and structure rules
#   2. 350+ training examples (250 generated + ~30 RIPE/APNIC/NOG ingested
#      + 24 real blog posts + up to 50 supplemental)
#   3. Full 700-1000w articles as output_text (not keyword stubs)
#   4. DPO phase 2 with negative examples (v6 failure modes)
#   5. Diverse topics: transceivers + BGP + IPv6 + RIPE/NOG + data center
#
# v6 problem analysis:
#   - Training data avg 1152w but no word-count constraint in system prompt
#   - Model produces 5000w+ (inherits Qwen base behavior, no stopping signal)
#   - Topic drift: "400G request" → generic SFP+ content
#   - Missing intro paragraph
#   - Repeated sections verbatim
#
# v7 fix strategy:
#   - System prompt hard-encodes: 700-1000 words, hook+body+takeaways structure
#   - Input_text explicitly states word limit
#   - DPO training on (good, bad) pairs to reinforce constraint adherence
# ═══════════════════════════════════════════════════════════════════════════════

# Job identity. The same "fo-blog-v7" name is hard-coded again in
# sft.output_dir, dpo.base_adapter/output_dir and gguf.ollama_model — keep
# them in sync when bumping the version.
job_name: "fo-blog-v7"
description: "BlogLLM v7 — anchored constraints, 350+ diverse examples, DPO phase"

# ─── Data sources ──────────────────────────────────────────────────────────────

# Training data locations and expected dataset sizes.
# NOTE: a quoted "~/..." is just a string to YAML; the consuming script has to
# expand it to the home directory itself (presumably via expanduser — confirm).
data:
  # Phase 1 SFT data (combine all v7 sources)
  sft_files:
    - "~/transceiver-training-data/v7-generated-sft.jsonl"      # 250 Claude-generated
    - "~/transceiver-training-data/v7-ripe-apnic-sft.jsonl"     # ~30 RIPE/APNIC/NOG
    - "~/transceiver-training-data/blog-fichtmueller-posts.jsonl" # 24 real posts (upgrade SP)

  # Phase 2 DPO data
  # (presumably written by scripts/generate_dpo_pairs.py after the SFT phase —
  # see the training commands at the end of this file)
  dpo_file: "~/transceiver-training-data/v7-dpo-pairs.jsonl"   # ~200 chosen/rejected pairs

  # Pre-existing high-quality data (optional inclusion)
  supplemental_files:
    - path: "~/transceiver-training-data/master-training-dataset.jsonl"
      # NOTE(review): lower bound only — examples longer than the 1000-word
      # target still pass this filter. Consider an upper bound if the filter
      # syntax supports one, given v6's over-length outputs.
      filter: "word_count >= 700"   # Only include longer examples
      max_samples: 50               # Limit to best 50

  # Dataset stats (updated after generate_v7_data.py completes)
  # SFT estimate ≈ 250 + ~30 + 24 from sft_files, plus up to 50 supplemental.
  estimated_total_sft: 350
  estimated_total_dpo: 200
  # Quoted so it stays a string rather than being read as an expression/number.
  target_word_count_range: "700-1000"

# ─── Model ────────────────────────────────────────────────────────────────────

# Base checkpoint for the v7 fine-tune.
model:
  base: "Qwen/Qwen2.5-7B-Instruct"   # Same as v6 — proven on Mac Studio
  model_type: "qwen2.5"
  # NOTE(review): "main" looks like a branch ref, i.e. a moving target; pin a
  # commit hash here if runs must be reproducible against identical weights.
  revision: "main"

# ─── Phase 1: SFT (Supervised Fine-Tuning) ────────────────────────────────────

# Phase 1 hyperparameters: LoRA supervised fine-tuning on the "mps" device.
# Step arithmetic in the comments below assumes the ~350-example estimate in
# data.estimated_total_sft.
sft:
  device: "mps"                   # Apple Silicon MPS (Mac Studio M4 Max)
  max_seq_length: 2048            # 700-1000w target ≈ 900-1300 tokens + prompt
  lora_r: 32                      # Doubled from v6 (16→32) for stronger signal
  lora_alpha: 64                  # 2x r
  lora_dropout: 0.05
  # LoRA is attached to the attention projections (q/k/v/o) and the MLP
  # projections (gate/up/down).
  target_modules:
    - "q_proj"
    - "k_proj"
    - "v_proj"
    - "o_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"
  num_epochs: 4                   # +1 vs v6 (larger adapter capacity)
  batch_size: 1                   # MPS limitation
  gradient_accumulation: 8        # Effective batch = 8
  learning_rate: 1.5e-4           # Slightly lower than v6 (2e-4) for stability
  warmup_ratio: 0.1
  weight_decay: 0.01
  lr_scheduler: "cosine"
  # With ~315 training examples (350 minus the 10% eval split) and an effective
  # batch of 8, one epoch is ~40 optimizer steps and the whole run ~160, so
  # save/eval every 50 steps fires about three times (assuming steps are
  # counted per optimizer update — confirm in the trainer).
  logging_steps: 10
  save_steps: 50
  eval_steps: 50
  eval_split: 0.1                 # 10% for eval

  # MPS-specific
  gradient_checkpointing: true
  gradient_checkpointing_kwargs:
    use_reentrant: false
  # Both mixed-precision flags are off; presumably training then runs in full
  # fp32 — confirm in scripts/train_blog_v7.py.
  bf16: false
  fp16: false
  # KV cache disabled for training (presumably because it is not usable
  # together with gradient checkpointing — confirm).
  use_cache: false

  # Output
  # dpo.base_adapter points at the "adapter" subdirectory of this path.
  output_dir: "adapters/fo-blog-v7"

# ─── Phase 2: DPO (Direct Preference Optimization) ────────────────────────────

# Phase 2 hyperparameters: DPO preference tuning, starting from the Phase 1
# SFT adapter.
dpo:
  enabled: true
  base_adapter: "adapters/fo-blog-v7/adapter"  # Use SFT adapter as starting point
  output_dir: "adapters/fo-blog-v7-dpo"

  num_epochs: 1
  batch_size: 1
  gradient_accumulation: 4        # Effective batch = 4
  # DPO beta sets how strongly the policy is anchored to the reference model:
  # lower = weaker anchor, so the policy may move further toward the preferred
  # outputs; higher = stays closer to the reference.
  beta: 0.1
  learning_rate: 5.0e-5           # 3x lower than the SFT rate (1.5e-4)
  max_seq_length: 2048
  # Leaves up to 1536 tokens for the completion, if max_seq_length is the
  # prompt+completion total (TODO confirm against the trainer).
  max_prompt_length: 512

  # DPO-specific
  # NOTE(review): the alternative loss name "spo" could not be verified; check
  # the trainer's supported loss_type values before switching.
  loss_type: "sigmoid"            # Standard DPO loss (vs e.g. ipo)
  reference_free: false           # Use SFT adapter as reference model

# ─── Evaluation prompts ────────────────────────────────────────────────────────

# Post-training evaluation: fixed smoke-test prompts plus the structural
# quality checks that target the v6 failure modes (length, missing intro,
# topic drift, repeated sections).
evaluation:
  n_eval_samples: 35              # 10% of ~350
  # Prompts are literal block scalars ("|-": newlines kept, no trailing
  # newline), so the parsed strings are identical to the former
  # double-quoted "\n" form.
  # NOTE(review): only the first prompt contains "Stay strictly on-topic." —
  # confirm whether the other two should match the training input template.
  eval_prompts:
    - input: |-
        Write a blog post on the following topic:

        **Topic:** QSFP-DD vs OSFP: Which 400G Form Factor Wins in 2026

        **Target audience:** IT managers and operators who evaluate and buy transceivers

        Remember: 700–1000 words, hook + technical sections + 3 takeaways. Stay strictly on-topic. Start writing now.
      check: "word_count_700_1000"

    - input: |-
        Write a blog post on the following topic:

        **Topic:** BGP Route Leaks: Detection, Impact, and Prevention in 2026

        **Target audience:** network engineers and NOC operators

        Remember: 700–1000 words, hook + technical sections + 3 takeaways. Start writing now.
      check: "word_count_700_1000"

    - input: |-
        Write a blog post on the following topic:

        **Topic:** RPKI Route Origin Validation: A Practical Deployment Guide

        **Target audience:** network engineers and architects who design and operate optical infrastructure

        Remember: 700–1000 words, hook + technical sections + 3 takeaways. Start writing now.
      check: "word_count_700_1000"

  # Each entry is a single-key mapping: check name → expected value.
  quality_checks:
    - word_count_in_range: [700, 1000]
    - has_intro_paragraph: true         # Non-header first paragraph
    - has_headers: true                 # At least 2x ##
    - has_takeaways: true               # Bullet points at end
    - no_topic_drift: true              # Topic mentioned in first 100 words
    - no_repeated_sections: true        # No paragraph appears twice

# ─── Post-training: GGUF conversion ──────────────────────────────────────────

# Post-training export: quantized GGUF file plus the Ollama Modelfile used to
# register it.
gguf:
  quantization: "Q4_K_M"
  output_name: "fo-blog-v7.gguf"
  ollama_model: "fo-blog-v7"
  # Modelfile text. "{gguf_path}" is a placeholder — presumably filled in by
  # scripts/merge_and_convert.py (confirm). Everything inside the block scalar
  # is emitted verbatim, so YAML comments must stay outside it.
  # The SYSTEM prompt restates the v7 length/structure constraints, and
  # num_predict 1500 caps generation above the 900-1300 token estimate for a
  # 700-1000-word post (see sft.max_seq_length).
  modelfile_template: |
    FROM {gguf_path}

    SYSTEM """You are an expert technical writer specializing in optical networking and transceiver technology.

    STRICT CONSTRAINTS:
    - LENGTH: 700-1000 words ONLY. Stop at 1000 words maximum.
    - STRUCTURE: 1) Hook paragraph, 2) Technical sections (## headers), 3) Exactly 3 takeaways
    - TOPIC DISCIPLINE: Write ONLY about the exact topic requested.
    - VOICE: Confident and direct. No hedging.
    - AUDIENCE: Network engineers and IT professionals."""

    PARAMETER temperature 0.7
    PARAMETER top_p 0.9
    PARAMETER top_k 40
    PARAMETER repeat_penalty 1.15
    PARAMETER num_predict 1500

# ─── Training commands ────────────────────────────────────────────────────────
#
# Step 0: Generate training data (run BEFORE training)
#   cd packages/fine-tuner
#   python3 scripts/generate_v7_data.py &               # Background: 250 blogs
#   python3 scripts/ingest_ripe_apnic.py \
#     --nas-path /Volumes/KnowledgeLake/tashi-crawler/2026-03-06  # RIPE/NOG data
#   wait                                                # Let the background generator finish before Step 1
#
# Step 1: SFT Phase
#   python3 scripts/train_blog_v7.py --phase sft
#
# Step 2: Wait for training, then generate DPO pairs
#   python3 scripts/generate_dpo_pairs.py
#
# Step 3: DPO Phase
#   python3 scripts/train_blog_v7.py --phase dpo
#
# Step 4: Convert to GGUF + register in Ollama
#   python3 scripts/merge_and_convert.py --version v7
#
# Step 5: Test + deploy to Erik
#   curl -X POST http://localhost:11434/api/generate \
#     -d '{"model":"fo-blog-v7","prompt":"Write 700-1000w blog about QSFP-DD..."}'
#
# Estimated training time on Mac Studio M4 Max 48GB:
#   SFT 7B, 350 examples, 4 epochs:  ~25-35 min
#   DPO, 200 pairs, 1 epoch:          ~10-15 min
# ═══════════════════════════════════════════════════════════════════════════════