llm-gateway/packages/fine-tuner/scripts/run_v8_pipeline.sh
Rene Fichtmueller c3ab87b167 feat: add fo-blog-v8 training pipeline (Qwen2.5-14B, SFT+DPO)
Full v8 training pipeline for the optical networking blog model:
- train_blog_v8.py: SFT (LoRA r=64, 5 epochs) + DPO (2 epochs) on Qwen2.5-14B-Instruct
  Fixed for trl 1.2.x: SFTConfig instead of TrainingArguments, processing_class= instead
  of tokenizer=, eval_strategy= instead of deprecated evaluation_strategy=
- consolidate_v8_dataset.py: weighted merge of all data sources (820 effective SFT / 235 DPO)
- crawl_v8_sources.py: APNIC/RIPE Labs/potaroo/Cloudflare crawler with balanced div extraction
- process_v6_blogs.py: converts 101 real v6 TIP blog outputs into SFT + DPO pairs
- label_v7_quality.py: Claude-judged quality labels → v8 quality DPO pairs
- parse_real_posts.py: parses blog.fichtmueller.org Ghost CMS HTML → gold SFT records
- run_v8_pipeline.sh: autopilot (consolidate → SFT → DPO → GGUF → Ollama)
- blog-v8-training.yaml: training config reference

Dataset breakdown: 19 real posts ×3 + 196 v7-gen + 28 v6blogs ×2 + 135 external ×1.5
2026-04-19 11:44:09 +02:00

179 lines
8.8 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# ═══════════════════════════════════════════════════════════════════════════════
# run_v8_pipeline.sh — fo-blog-v8 Autopilot Pipeline
#
# Qwen2.5-14B, LoRA r=64, 5 epochs SFT + 2 epochs DPO
#
# Expects the following data files to be ready:
# ~/transceiver-training-data/v8-real-posts-sft.jsonl (19 real posts)
# ~/transceiver-training-data/v7-generated-sft.jsonl (v7 generated, ≥100)
# ~/transceiver-training-data/v8-v6blogs-sft.jsonl (v6 tip blogs good)
# ~/transceiver-training-data/v8-external-sft.jsonl (crawled external)
# ~/transceiver-training-data/v7-dpo-pairs.jsonl (v7 DPO)
# ~/transceiver-training-data/v8-v6blogs-dpo.jsonl (real v6 failures)
#
# Usage:
# bash scripts/run_v8_pipeline.sh # full auto
# bash scripts/run_v8_pipeline.sh --wait-crawl # wait for crawler first
# bash scripts/run_v8_pipeline.sh --phase-from merge # skip training
# bash scripts/run_v8_pipeline.sh --phase-from dpo # skip SFT, do DPO + merge
# ═══════════════════════════════════════════════════════════════════════════════
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Package root = parent of the directory that contains this script.
FINE_TUNER_DIR=$(
  cd "$(dirname "$0")/.." && pwd
)
SCRIPTS="${FINE_TUNER_DIR}/scripts"

# Interpreter used for every Python step of the pipeline.
PYTHON="/opt/homebrew/bin/python3.13"

# Location of the input/merged .jsonl training files.
DATA_DIR="${HOME}/transceiver-training-data"

# Per-phase log files are written to LOG_DIR and tagged with TIMESTAMP.
LOG_DIR="/tmp/v8-pipeline"
TIMESTAMP="$(date '+%Y%m%d-%H%M%S')"
mkdir -p "${LOG_DIR}"
# ─── Colors ───────────────────────────────────────────────────────────────────
# ANSI colour codes, stored with a literal "\033"; the echo -e calls below
# turn them into real escape sequences.
NC='\033[0m'
BOLD='\033[1m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'

# Current wall-clock time as HH:MM:SS, used as the prefix of every log line.
_log_clock() {
  date +%H:%M:%S
}

# log MESSAGE... — informational line with a green timestamp.
log() {
  echo -e "${GREEN}[$(_log_clock)]${NC} $*"
}

# warn MESSAGE... — yellow timestamp followed by a warning sign.
warn() {
  echo -e "${YELLOW}[$(_log_clock)] ⚠${NC} $*"
}

# err MESSAGE... — red timestamp followed by a cross mark.
err() {
  echo -e "${RED}[$(_log_clock)] ✗${NC} $*"
}

# step TITLE... — bold green section banner preceded by an empty line.
step() {
  echo -e "\n${BOLD}${GREEN}══ $* ══${NC}"
}
# ─── Args ─────────────────────────────────────────────────────────────────────
WAIT_CRAWL=false
PHASE_FROM="consolidate" # consolidate | sft | dpo | merge

#######################################
# Parse the command-line options of the pipeline.
#   --wait-crawl           wait for the background data generators first
#   --phase-from PHASE     phase to start from (also: --phase-from=PHASE)
# Globals:   WAIT_CRAWL (written), PHASE_FROM (written)
# Arguments: the script's positional parameters
# Outputs:   diagnostics on stderr
# Returns:   0 on success; exits with status 2 on a missing or invalid
#            --phase-from value
#######################################
parse_args() {
  # Consume the argument list with while/shift so that the value following
  # "--phase-from" is always the word right after the flag, wherever the flag
  # appears on the command line.
  while (( $# > 0 )); do
    case "$1" in
      --wait-crawl)
        WAIT_CRAWL=true
        ;;
      --phase-from=*)
        PHASE_FROM="${1#*=}"
        ;;
      --phase-from)
        if (( $# < 2 )); then
          printf '%s\n' "--phase-from requires a value: consolidate | sft | dpo | merge" >&2
          exit 2
        fi
        PHASE_FROM="$2"
        shift
        ;;
      *)
        # Unknown words are still tolerated, but no longer silently.
        printf 'Warning: ignoring unknown argument: %s\n' "$1" >&2
        ;;
    esac
    shift
  done

  # A misspelled phase would otherwise skip every training phase and jump
  # straight to the convert step, so reject it up front.
  case "$PHASE_FROM" in
    consolidate | sft | dpo | merge) ;;
    *)
      printf 'Invalid --phase-from value: %s (expected consolidate | sft | dpo | merge)\n' \
        "$PHASE_FROM" >&2
      exit 2
      ;;
  esac
}

parse_args "$@"
# ─── Step 0: Wait for external crawler ────────────────────────────────────────

#######################################
# Print the number of lines in a file as a bare integer.
# A missing or unreadable file yields 0 and produces no error output (the
# file is tested before it is opened, so the shell never reports a failed
# redirection).
# Arguments: $1 - path of the file to count
# Outputs:   the line count on stdout
# Returns:   0 always
#######################################
count_lines() {
  local file=$1
  local n=0
  if [[ -f "$file" && -r "$file" ]]; then
    n=$(wc -l < "$file" 2>/dev/null) || n=0
  fi
  # Arithmetic expansion drops the blank padding some wc implementations add.
  printf '%s\n' "$(( n ))"
}

if [[ "$WAIT_CRAWL" == "true" ]]; then
  step "Warte auf v8 External Crawler (crawl_v8_sources.py)"

  # Poll every two minutes for as long as a process whose command line
  # matches the crawler script exists, reporting progress in between.
  while pgrep -f "crawl_v8_sources.py" > /dev/null 2>&1; do
    EXT_COUNT=$(count_lines "$DATA_DIR/v8-external-sft.jsonl")
    log " Crawler läuft noch... $EXT_COUNT externe Artikel bisher"
    sleep 120
  done
  EXT_COUNT=$(count_lines "$DATA_DIR/v8-external-sft.jsonl")
  log "✓ Crawler fertig: $EXT_COUNT externe Artikel → $DATA_DIR/v8-external-sft.jsonl"

  # Also wait for v6 DPO generation
  while pgrep -f "process_v6_blogs.py" > /dev/null 2>&1; do
    DPO_COUNT=$(count_lines "$DATA_DIR/v8-v6blogs-dpo.jsonl")
    log " v6 DPO Generator läuft... $DPO_COUNT Pairs bisher"
    sleep 120
  done
  DPO_V6_COUNT=$(count_lines "$DATA_DIR/v8-v6blogs-dpo.jsonl")
  log "✓ v6 DPO fertig: $DPO_V6_COUNT Pairs"
fi
# ─── Step 1: Check data availability ──────────────────────────────────────────
step "Datenlage prüfen"
cd "$FINE_TUNER_DIR"
$PYTHON "$SCRIPTS/consolidate_v8_dataset.py" --stats-only

# Line count of each SFT source file; a file that cannot be read counts as 0.
REAL_COUNT=$(wc -l < "$DATA_DIR/v8-real-posts-sft.jsonl" 2>/dev/null || echo 0)
V7GEN_COUNT=$(wc -l < "$DATA_DIR/v7-generated-sft.jsonl" 2>/dev/null || echo 0)
V6BLOG_COUNT=$(wc -l < "$DATA_DIR/v8-v6blogs-sft.jsonl" 2>/dev/null || echo 0)
EXT_COUNT=$(wc -l < "$DATA_DIR/v8-external-sft.jsonl" 2>/dev/null || echo 0)

log "SFT Quellen:"
log " Real posts (Gold ×3): $REAL_COUNT"
log " v7 Generated (×1): $V7GEN_COUNT"
log " v6 TIP Blogs (×2): $V6BLOG_COUNT"
log " External crawled (×1.5): $EXT_COUNT"

# Weighted estimate using exactly the factors printed above. Shell arithmetic
# is integer-only, so the ×1.5 weight of the external articles is expressed as
# *3/2 (rounded down) rather than being rounded up to ×2, which would inflate
# the estimate that the minimum-data gate below relies on.
TOTAL_EST=$(( REAL_COUNT*3 + V7GEN_COUNT + V6BLOG_COUNT*2 + EXT_COUNT*3/2 ))
log " Geschätzt total effective: $TOTAL_EST"

# Refuse to continue with fewer than 80 effective samples.
if [[ "$TOTAL_EST" -lt 80 ]]; then
  err "Zu wenig Daten ($TOTAL_EST effective) — mindestens 80 nötig!"
  err "Warte auf v7-generation oder crawl_v8_sources.py"
  exit 1
fi
# ─── Step 2: Consolidate dataset ──────────────────────────────────────────────
# A fresh run ("consolidate") always rebuilds the merged dataset and logs the
# resulting sizes. When resuming at a later phase the merge is skipped unless
# the merged SFT file does not exist yet. Any other phase value does nothing.
case "$PHASE_FROM" in
  consolidate)
    step "Phase 0: Dataset Konsolidierung"
    CONS_LOG="$LOG_DIR/consolidate-$TIMESTAMP.log"
    log "Starte consolidate_v8_dataset.py..."
    $PYTHON "$SCRIPTS/consolidate_v8_dataset.py" 2>&1 \
      | tee "$CONS_LOG"

    # Report how many lines ended up in each merged file (0 if unreadable).
    SFT_MERGED=$(wc -l < "$DATA_DIR/v8-sft-merged.jsonl" 2>/dev/null || echo 0)
    DPO_MERGED=$(wc -l < "$DATA_DIR/v8-dpo-merged.jsonl" 2>/dev/null || echo 0)
    log "✓ Merged: $SFT_MERGED SFT + $DPO_MERGED DPO"
    ;;
  sft | dpo | merge)
    log "Phase: $PHASE_FROM — Konsolidierung übersprungen"
    if [[ ! -f "$DATA_DIR/v8-sft-merged.jsonl" ]]; then
      warn "v8-sft-merged.jsonl fehlt — erstelle schnell..."
      $PYTHON "$SCRIPTS/consolidate_v8_dataset.py"
    fi
    ;;
esac
# ─── Step 3: SFT Training ─────────────────────────────────────────────────────
# Runs only when the pipeline starts at "consolidate" or "sft". The training
# output is mirrored into a timestamped log file; afterwards the adapter
# directory must exist, otherwise the pipeline stops.
case "$PHASE_FROM" in
  consolidate | sft)
    step "Phase 1: SFT Training (Qwen2.5-14B, LoRA r=64, 5 Epochs)"
    SFT_LOG="$LOG_DIR/sft-$TIMESTAMP.log"
    log "Starte train_blog_v8.py --phase sft..."
    log "Log: $SFT_LOG"
    log "Estimated: ~10-14 Stunden (run overnight!)"
    $PYTHON "$SCRIPTS/train_blog_v8.py" --phase sft 2>&1 \
      | tee "$SFT_LOG"

    ADAPTER="$FINE_TUNER_DIR/adapters/fo-blog-v8/adapter"
    if [[ -d "$ADAPTER" ]]; then
      log "✓ SFT Adapter: $ADAPTER"
    else
      err "SFT Adapter nicht gefunden: $ADAPTER"
      exit 1
    fi
    ;;
esac
# ─── Step 4: DPO Training ─────────────────────────────────────────────────────
# Runs for every start phase except "merge". Without a merged DPO file the
# phase is skipped with a warning instead of failing the pipeline.
case "$PHASE_FROM" in
  consolidate | sft | dpo)
    step "Phase 2: DPO Training (2 Epochs)"
    DPO_LOG="$LOG_DIR/dpo-$TIMESTAMP.log"
    DPO_FILE="$DATA_DIR/v8-dpo-merged.jsonl"
    if [[ -f "$DPO_FILE" ]]; then
      DPO_COUNT=$(wc -l < "$DPO_FILE")
      log "DPO Pairs: $DPO_COUNT"
      log "Starte train_blog_v8.py --phase dpo..."
      $PYTHON "$SCRIPTS/train_blog_v8.py" --phase dpo 2>&1 \
        | tee "$DPO_LOG"
      log "✓ DPO Training abgeschlossen"
    else
      warn "DPO File fehlt — überspringe DPO Phase"
    fi
    ;;
esac
# ─── Step 5: Merge + GGUF + Ollama ───────────────────────────────────────────
# Always executed, whatever the start phase: hands over to the trainer's
# "convert" phase and mirrors its output into a timestamped log file.
CONV_LOG="${LOG_DIR}/convert-${TIMESTAMP}.log"
step "Phase 3: Merge + GGUF + Ollama Registrierung"
log "Starte train_blog_v8.py --phase convert..."
$PYTHON "${SCRIPTS}/train_blog_v8.py" --phase convert 2>&1 \
  | tee "${CONV_LOG}"
log "✓ fo-blog-v8 in Ollama registriert"
# ─── Wrap-up ──────────────────────────────────────────────────────────────────
# Prints an empty line followed by one log line per argument.
_summary_section() {
  local line
  echo ""
  for line in "$@"; do
    log "$line"
  done
}

step "v8 Pipeline ABGESCHLOSSEN"

# How to use the freshly registered model.
_summary_section \
  "fo-blog-v8 ist bereit:" \
  " Ollama: ollama run fo-blog-v8" \
  " API: OLLAMA_LLM_MODEL=fo-blog-v8"

# Manual deployment checklist.
_summary_section \
  "Auf Erik deployen:" \
  " 1. GGUF rsync: rsync -avz models/fo-blog-v8/fo-blog-v8.gguf root@erik:/opt/ollama-models/" \
  " 2. Ollama: ssh erik 'ollama create fo-blog-v8 -f /opt/tip/Modelfile-v8'" \
  " 3. TIP: ecosystem.config.js → OLLAMA_LLM_MODEL=fo-blog-v8" \
  " 4. Restart: ssh erik 'cd /opt/tip && pm2 restart ecosystem.config.js --update-env'"

# Where the per-phase logs of this run were written.
_summary_section "Logs: $LOG_DIR/"

# Summary of what changed relative to v7.
_summary_section \
  "v8 vs v7 Verbesserungen:" \
  " - 14B statt 7B (4× Parameter)" \
  " - Echte Blog-Posts ×3 gewichtet" \
  " - Echte Modell-Failures als DPO (v6 too-long posts)" \
  " - Externe Quellen: APNIC, RIPE Labs, potaroo.net, Cloudflare" \
  " - 5 SFT + 2 DPO Epochs (war 4 + 1)"