# llm-gateway/packages/fine-tuner/config/fo-blog-v6.yaml

# Database connection string; the ${DATABASE_URL} placeholder is set from an
# environment variable at runtime.
database_url: "${DATABASE_URL}"
# URL of the LLM gateway.
gateway_url: 'https://llm-gateway.context-x.org'
# URL of the Ollama server (localhost, port 11434).
ollama_url: 'http://localhost:11434'

models:
  # fo-blog uses the 7B model: fast inference on the Mac Studio.
  qwen_7b_hf: 'Qwen/Qwen2.5-7B-Instruct'

training:
  # Apple Silicon MPS.
  device: 'mps'
  # Reduced from 2560: articles average about 7k characters, which fits in
  # 2k tokens.
  max_seq_length: 2048
  # Halved from 32: less aggressive, prevents memorization.
  lora_r: 16
  # Halved from 64 so it stays proportional to lora_r.
  lora_alpha: 32
  # Increased from 0.05 to add regularization.
  lora_dropout: 0.1
  # Names of the modules listed as LoRA targets.
  target_modules:
    - 'q_proj'
    - 'k_proj'
    - 'v_proj'
    - 'o_proj'
    - 'gate_proj'
    - 'up_proj'
    - 'down_proj'

  sft:
    # Down from 15: 3 epochs is standard for SFT and prevents overfitting.
    num_epochs: 3
    batch_size: 1
    gradient_accumulation: 8
    # Down from 2e-4: gentler updates preserve the base model.
    # Keep the decimal point and the signed exponent: YAML 1.1 loaders only
    # resolve this form as a float (a bare "5e-5" would load as a string).
    learning_rate: 5.0e-5
    warmup_ratio: 0.05

output:
  # Both directory values are relative paths; the base they are resolved
  # against is decided by the consumer of this file.
  adapters_dir: 'adapters'
  models_dir: 'models'
  # Name used for this fine-tune run's model.
  model_name: 'fo-blog-v6'

llama_cpp:
  # Resolved through Homebrew's version-independent opt/ symlink rather than a
  # versioned Cellar/<version>/ directory, so the path keeps working after
  # `brew upgrade llama.cpp` and always matches the installed llama-quantize.
  convert_script: '/opt/homebrew/opt/llama.cpp/bin/convert_hf_to_gguf.py'
  quantize_binary: '/opt/homebrew/bin/llama-quantize'
  # Interpreter path configured alongside the conversion script.
  python_bin: '/opt/homebrew/bin/python3.13'
  # Smaller than Q5_K_M, still high quality.
  default_quantization: 'Q4_K_M'