Full v8 training pipeline for the optical networking blog model:

- train_blog_v8.py: SFT (LoRA r=64, 5 epochs) + DPO (2 epochs) on Qwen2.5-14B-Instruct.
  Fixed for trl 1.2.x: SFTConfig instead of TrainingArguments, processing_class= instead of tokenizer=, eval_strategy= instead of the deprecated evaluation_strategy=.
- consolidate_v8_dataset.py: weighted merge of all data sources (820 effective SFT / 235 DPO)
- crawl_v8_sources.py: APNIC/RIPE Labs/potaroo/Cloudflare crawler with balanced div extraction
- process_v6_blogs.py: converts 101 real v6 TIP blog outputs into SFT + DPO pairs
- label_v7_quality.py: Claude-judged quality labels → v8 quality DPO pairs
- parse_real_posts.py: parses blog.fichtmueller.org Ghost CMS HTML → gold SFT records
- run_v8_pipeline.sh: autopilot (consolidate → SFT → DPO → GGUF → Ollama)
- blog-v8-training.yaml: training config reference

Dataset breakdown: 19 real posts ×3 + 196 v7-gen + 28 v6blogs ×2 + 135 external ×1.5
179 lines
8.8 KiB
Bash
Executable File
179 lines
8.8 KiB
Bash
Executable File
#!/usr/bin/env bash
# ═══════════════════════════════════════════════════════════════════════════════
# run_v8_pipeline.sh — fo-blog-v8 Autopilot Pipeline
#
# Qwen2.5-14B, LoRA r=64, 5 epochs SFT + 2 epochs DPO
#
# Expects the following data files to be in place:
#   ~/transceiver-training-data/v8-real-posts-sft.jsonl   (19 real posts)
#   ~/transceiver-training-data/v7-generated-sft.jsonl    (v7 generated, ≥100)
#   ~/transceiver-training-data/v8-v6blogs-sft.jsonl      (v6 tip blogs good)
#   ~/transceiver-training-data/v8-external-sft.jsonl     (crawled external)
#   ~/transceiver-training-data/v7-dpo-pairs.jsonl        (v7 DPO)
#   ~/transceiver-training-data/v8-v6blogs-dpo.jsonl      (real v6 failures)
#
# Usage:
#   bash scripts/run_v8_pipeline.sh                       # full auto
#   bash scripts/run_v8_pipeline.sh --wait-crawl          # wait for crawler first
#   bash scripts/run_v8_pipeline.sh --phase-from merge    # skip training
#   bash scripts/run_v8_pipeline.sh --phase-from dpo      # skip SFT, do DPO + merge
# ═══════════════════════════════════════════════════════════════════════════════

# Strict mode: abort on command failure, on unset variables, and when any
# stage of a pipeline fails (so `python ... | tee log` propagates errors).
set -euo pipefail

# Repository root: the parent of the directory this script lives in.
FINE_TUNER_DIR="$(cd "$(dirname "$0")/.." && pwd)"
# Interpreter used for every Python helper this pipeline invokes.
PYTHON="/opt/homebrew/bin/python3.13"
# Directory containing the Python pipeline scripts.
SCRIPTS="$FINE_TUNER_DIR/scripts"
# Directory holding the source and merged JSONL datasets.
DATA_DIR="$HOME/transceiver-training-data"
# Per-phase log files (suffixed with TIMESTAMP) are written here.
LOG_DIR="/tmp/v8-pipeline"
# One timestamp per run so all log files of a run share the same suffix.
TIMESTAMP="$(date +%Y%m%d-%H%M%S)"

mkdir -p "$LOG_DIR"

# ─── Colors ───────────────────────────────────────────────────────────────────
|
||
GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; NC='\033[0m'; BOLD='\033[1m'
|
||
log() { echo -e "${GREEN}[$(date +%H:%M:%S)]${NC} $*"; }
|
||
warn() { echo -e "${YELLOW}[$(date +%H:%M:%S)] ⚠${NC} $*"; }
|
||
err() { echo -e "${RED}[$(date +%H:%M:%S)] ✗${NC} $*"; }
|
||
step() { echo -e "\n${BOLD}${GREEN}══ $* ══${NC}"; }
|
||
|
||
# ─── Args ─────────────────────────────────────────────────────────────────────
WAIT_CRAWL=false
PHASE_FROM="consolidate"   # consolidate | sft | dpo | merge

#######################################
# Parse command-line options into the globals WAIT_CRAWL and PHASE_FROM.
#
# Supported options:
#   --wait-crawl          wait for the running data generators first
#   --phase-from=PHASE    start the pipeline at PHASE
#   --phase-from PHASE    same, with the value as a separate argument
#
# The argument list is consumed with `shift` in a while loop. A `for` loop
# over "$@" iterates a snapshot of the arguments, so a `shift` inside it does
# not line `$1` up with the value that follows `--phase-from` unless that
# option happens to come first.
#
# Globals:   WAIT_CRAWL (written), PHASE_FROM (written)
# Arguments: the script's command-line arguments
# Outputs:   diagnostics on stderr
# Returns:   0 on success; exits with status 2 on a missing or invalid phase
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --wait-crawl)
        WAIT_CRAWL=true
        ;;
      --phase-from=*)
        PHASE_FROM="${1#*=}"
        ;;
      --phase-from)
        if (( $# < 2 )); then
          printf 'run_v8_pipeline.sh: --phase-from requires a value (consolidate|sft|dpo|merge)\n' >&2
          exit 2
        fi
        PHASE_FROM="$2"
        shift   # drop the option; the value is dropped by the shift below
        ;;
      *)
        # Unknown arguments were always ignored; keep going but say so.
        printf 'run_v8_pipeline.sh: ignoring unknown argument: %s\n' "$1" >&2
        ;;
    esac
    shift
  done

  # Reject unknown phase names up front: none of the phase guards would match
  # them, so the run would otherwise skip straight to the convert step.
  case "$PHASE_FROM" in
    consolidate | sft | dpo | merge) ;;
    *)
      printf 'run_v8_pipeline.sh: invalid phase "%s" (expected consolidate|sft|dpo|merge)\n' "$PHASE_FROM" >&2
      exit 2
      ;;
  esac
}

parse_args "$@"

#######################################
# Print the number of lines in a file, or 0 if it is missing or unreadable.
#
# The existence check avoids the "No such file" message that
# `wc -l < "$f" 2>/dev/null` still prints: redirections are processed left to
# right, so the failing `<` is reported before `2>/dev/null` takes effect.
# Wrapping the count in $(( )) strips the leading padding BSD `wc` emits.
#
# Arguments: $1 - path of the file to count
# Outputs:   the line count on stdout
#######################################
count_lines() {
  local file=$1
  if [[ -r "$file" ]]; then
    echo $(( $(wc -l < "$file") ))
  else
    echo 0
  fi
}

# ─── Step 0: Wait for external crawler ────────────────────────────────────────
# Only with --wait-crawl: poll every 120 s until no crawl_v8_sources.py and no
# process_v6_blogs.py process is running, reporting progress from the line
# counts of their output files.
if [[ "$WAIT_CRAWL" == "true" ]]; then
  step "Warte auf v8 External Crawler (crawl_v8_sources.py)"
  while pgrep -f "crawl_v8_sources.py" > /dev/null 2>&1; do
    EXT_COUNT=$(count_lines "$DATA_DIR/v8-external-sft.jsonl")
    log " Crawler läuft noch... $EXT_COUNT externe Artikel bisher"
    sleep 120
  done
  EXT_COUNT=$(count_lines "$DATA_DIR/v8-external-sft.jsonl")
  log "✓ Crawler fertig: $EXT_COUNT externe Artikel → $DATA_DIR/v8-external-sft.jsonl"

  # Also wait for v6 DPO generation
  while pgrep -f "process_v6_blogs.py" > /dev/null 2>&1; do
    DPO_COUNT=$(count_lines "$DATA_DIR/v8-v6blogs-dpo.jsonl")
    log " v6 DPO Generator läuft... $DPO_COUNT Pairs bisher"
    sleep 120
  done
  DPO_V6_COUNT=$(count_lines "$DATA_DIR/v8-v6blogs-dpo.jsonl")
  log "✓ v6 DPO fertig: $DPO_V6_COUNT Pairs"
fi

# ─── Step 1: Check available data ─────────────────────────────────────────────
step "Datenlage prüfen"
cd "$FINE_TUNER_DIR"
"$PYTHON" "$SCRIPTS/consolidate_v8_dataset.py" --stats-only

REAL_COUNT=$(count_lines "$DATA_DIR/v8-real-posts-sft.jsonl")
V7GEN_COUNT=$(count_lines "$DATA_DIR/v7-generated-sft.jsonl")
V6BLOG_COUNT=$(count_lines "$DATA_DIR/v8-v6blogs-sft.jsonl")
EXT_COUNT=$(count_lines "$DATA_DIR/v8-external-sft.jsonl")

log "SFT Quellen:"
log " Real posts (Gold ×3): $REAL_COUNT"
log " v7 Generated (×1): $V7GEN_COUNT"
log " v6 TIP Blogs (×2): $V6BLOG_COUNT"
log " External crawled (×1.5): $EXT_COUNT"

# Weighted estimate using the same factors shown in the log lines above.
# External articles count ×1.5, computed as *3/2 in integer arithmetic
# (rounds down).
TOTAL_EST=$(( REAL_COUNT * 3 + V7GEN_COUNT + V6BLOG_COUNT * 2 + EXT_COUNT * 3 / 2 ))
log " Geschätzt total effective: $TOTAL_EST"

# Refuse to start a training run on fewer than 80 effective samples.
if [[ "$TOTAL_EST" -lt 80 ]]; then
  err "Zu wenig Daten ($TOTAL_EST effective) — mindestens 80 nötig!"
  err "Warte auf v7-generation oder crawl_v8_sources.py"
  exit 1
fi

# ─── Step 2: Consolidate dataset ──────────────────────────────────────────────
# Starting at "consolidate" runs the full merge and logs its output. Starting
# at any later phase skips the merge, but rebuilds the merged SFT file if it
# does not exist. Any other PHASE_FROM value does nothing here.
case "$PHASE_FROM" in
  consolidate)
    step "Phase 0: Dataset Konsolidierung"
    CONS_LOG="$LOG_DIR/consolidate-$TIMESTAMP.log"
    log "Starte consolidate_v8_dataset.py..."
    "$PYTHON" "$SCRIPTS/consolidate_v8_dataset.py" 2>&1 | tee "$CONS_LOG"
    # The redirect is grouped so that 2>/dev/null also silences the shell's
    # own "No such file" message when a merged file was not produced.
    SFT_MERGED=$({ wc -l < "$DATA_DIR/v8-sft-merged.jsonl"; } 2>/dev/null || echo 0)
    DPO_MERGED=$({ wc -l < "$DATA_DIR/v8-dpo-merged.jsonl"; } 2>/dev/null || echo 0)
    log "✓ Merged: $SFT_MERGED SFT + $DPO_MERGED DPO"
    ;;
  sft | dpo | merge)
    log "Phase: $PHASE_FROM — Konsolidierung übersprungen"
    if [[ ! -f "$DATA_DIR/v8-sft-merged.jsonl" ]]; then
      warn "v8-sft-merged.jsonl fehlt — erstelle schnell..."
      "$PYTHON" "$SCRIPTS/consolidate_v8_dataset.py"
    fi
    ;;
esac

# ─── Step 3: SFT Training ─────────────────────────────────────────────────────
# Runs only when starting at "consolidate" or "sft". Training output is shown
# on the terminal and copied to a timestamped log; with `pipefail` set, a
# failing trainer aborts the script even though `tee` succeeds.
if [[ "$PHASE_FROM" == "consolidate" || "$PHASE_FROM" == "sft" ]]; then
  step "Phase 1: SFT Training (Qwen2.5-14B, LoRA r=64, 5 Epochs)"
  SFT_LOG="$LOG_DIR/sft-$TIMESTAMP.log"
  log "Starte train_blog_v8.py --phase sft..."
  log "Log: $SFT_LOG"
  log "Estimated: ~10-14 Stunden (run overnight!)"
  "$PYTHON" "$SCRIPTS/train_blog_v8.py" --phase sft 2>&1 | tee "$SFT_LOG"

  # Sanity check: the adapter directory must exist after training.
  # NOTE(review): a directory left over from an earlier run also passes this
  # check — confirm whether train_blog_v8.py clears it first.
  ADAPTER="$FINE_TUNER_DIR/adapters/fo-blog-v8/adapter"
  if [[ ! -d "$ADAPTER" ]]; then
    err "SFT Adapter nicht gefunden: $ADAPTER"
    exit 1
  fi
  log "✓ SFT Adapter: $ADAPTER"
fi

# ─── Step 4: DPO Training ─────────────────────────────────────────────────────
# Runs unless the pipeline starts at "merge". A missing merged DPO file is not
# fatal: the phase is skipped with a warning and the pipeline continues.
if [[ "$PHASE_FROM" == "consolidate" || "$PHASE_FROM" == "sft" || "$PHASE_FROM" == "dpo" ]]; then
  step "Phase 2: DPO Training (2 Epochs)"
  DPO_LOG="$LOG_DIR/dpo-$TIMESTAMP.log"

  DPO_FILE="$DATA_DIR/v8-dpo-merged.jsonl"
  if [[ ! -f "$DPO_FILE" ]]; then
    warn "DPO File fehlt — überspringe DPO Phase"
  else
    # $(( )) strips the leading padding BSD `wc` puts in front of the count.
    DPO_COUNT=$(( $(wc -l < "$DPO_FILE") ))
    log "DPO Pairs: $DPO_COUNT"
    log "Starte train_blog_v8.py --phase dpo..."
    "$PYTHON" "$SCRIPTS/train_blog_v8.py" --phase dpo 2>&1 | tee "$DPO_LOG"
    log "✓ DPO Training abgeschlossen"
  fi
fi

# ─── Step 5: Merge + GGUF + Ollama ───────────────────────────────────────────
# Always runs, whatever PHASE_FROM is: hands over to the trainer's "convert"
# phase and copies its output to a timestamped log.
step "Phase 3: Merge + GGUF + Ollama Registrierung"
CONV_LOG="$LOG_DIR/convert-$TIMESTAMP.log"
log "Starte train_blog_v8.py --phase convert..."
"$PYTHON" "$SCRIPTS/train_blog_v8.py" --phase convert 2>&1 | tee "$CONV_LOG"
log "✓ fo-blog-v8 in Ollama registriert"

# ─── Wrap-up ──────────────────────────────────────────────────────────────────
# Final banner: how to use the new model, manual deployment steps, where the
# logs are, and a short changelog against v7. Nothing here runs a deployment;
# the commands are only printed for the operator.
step "v8 Pipeline ABGESCHLOSSEN"
echo ""
log "fo-blog-v8 ist bereit:"
log " Ollama: ollama run fo-blog-v8"
log " API: OLLAMA_LLM_MODEL=fo-blog-v8"
echo ""
log "Auf Erik deployen:"
log " 1. GGUF rsync: rsync -avz models/fo-blog-v8/fo-blog-v8.gguf root@erik:/opt/ollama-models/"
log " 2. Ollama: ssh erik 'ollama create fo-blog-v8 -f /opt/tip/Modelfile-v8'"
log " 3. TIP: ecosystem.config.js → OLLAMA_LLM_MODEL=fo-blog-v8"
log " 4. Restart: ssh erik 'cd /opt/tip && pm2 restart ecosystem.config.js --update-env'"
echo ""
log "Logs: $LOG_DIR/"
echo ""
log "v8 vs v7 Verbesserungen:"
# 14B vs. 7B is twice the parameter count (the message previously said 4×).
log " - 14B statt 7B (2× Parameter)"
log " - Echte Blog-Posts ×3 gewichtet"
log " - Echte Modell-Failures als DPO (v6 too-long posts)"
log " - Externe Quellen: APNIC, RIPE Labs, potaroo.net, Cloudflare"
log " - 5 SFT + 2 DPO Epochs (war 4 + 1)"