#!/usr/bin/env bash
# ═══════════════════════════════════════════════════════════════════════════════
# run_v8_pipeline.sh — fo-blog-v8 Autopilot Pipeline
#
# Qwen2.5-14B, LoRA r=64, 5 epochs SFT + 2 epochs DPO
#
# Expects the following data files to be present:
#   ~/transceiver-training-data/v8-real-posts-sft.jsonl   (19 real posts)
#   ~/transceiver-training-data/v7-generated-sft.jsonl    (v7 generated, ≥100)
#   ~/transceiver-training-data/v8-v6blogs-sft.jsonl      (v6 tip blogs good)
#   ~/transceiver-training-data/v8-external-sft.jsonl     (crawled external)
#   ~/transceiver-training-data/v7-dpo-pairs.jsonl        (v7 DPO)
#   ~/transceiver-training-data/v8-v6blogs-dpo.jsonl      (real v6 failures)
#
# Usage:
#   bash scripts/run_v8_pipeline.sh                      # full auto
#   bash scripts/run_v8_pipeline.sh --wait-crawl         # wait for crawler first
#   bash scripts/run_v8_pipeline.sh --phase-from merge   # skip training
#   bash scripts/run_v8_pipeline.sh --phase-from dpo     # skip SFT, do DPO + merge
#
# Options may be given in any order; --phase-from accepts both
# "--phase-from VALUE" and "--phase-from=VALUE".
# ═══════════════════════════════════════════════════════════════════════════════
set -euo pipefail

FINE_TUNER_DIR="$(cd "$(dirname "$0")/.." && pwd)"
PYTHON="/opt/homebrew/bin/python3.13"
SCRIPTS="$FINE_TUNER_DIR/scripts"
DATA_DIR="$HOME/transceiver-training-data"
LOG_DIR="/tmp/v8-pipeline"
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
mkdir -p "$LOG_DIR"

# ─── Colors ───────────────────────────────────────────────────────────────────
# ANSI-C quoting stores the real escape bytes, so the helpers below can print
# with plain printf '%s' and never interpret backslashes inside a message.
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
RED=$'\033[0;31m'
NC=$'\033[0m'
BOLD=$'\033[1m'

# Timestamped info line on stdout.
log() { printf '%s[%s]%s %s\n' "$GREEN" "$(date +%H:%M:%S)" "$NC" "$*"; }
# Timestamped warning on stderr.
warn() { printf '%s[%s] ⚠%s %s\n' "$YELLOW" "$(date +%H:%M:%S)" "$NC" "$*" >&2; }
# Timestamped error on stderr.
err() { printf '%s[%s] ✗%s %s\n' "$RED" "$(date +%H:%M:%S)" "$NC" "$*" >&2; }
# Section banner, preceded by a blank line.
step() { printf '\n%s%s══ %s ══%s\n' "$BOLD" "$GREEN" "$*" "$NC"; }

#######################################
# Print the number of lines in a file, or 0 when it is missing or unreadable.
# Arguments: $1 - path to the file
# Outputs:   the bare line count on stdout (no padding)
#######################################
count_lines() {
  local file=$1
  local n=0
  if [[ -r "$file" ]]; then
    # The file may vanish between the test and the read; fall back to 0.
    n=$(wc -l < "$file") || n=0
  fi
  # BSD wc pads its output with leading blanks; strip them.
  printf '%s\n' "${n//[[:space:]]/}"
}

# ─── Args ─────────────────────────────────────────────────────────────────────
WAIT_CRAWL=false
PHASE_FROM="consolidate" # consolidate | sft | dpo | merge

# A while/shift loop is required here: a `for arg in "$@"` loop iterates over a
# snapshot of the arguments, so shifting inside it reads the wrong "$1" unless
# --phase-from happens to be the very first argument.
while (( $# > 0 )); do
  case "$1" in
    --wait-crawl)
      WAIT_CRAWL=true
      ;;
    --phase-from=*)
      PHASE_FROM="${1#*=}"
      ;;
    --phase-from)
      if (( $# < 2 )); then
        err "--phase-from benötigt einen Wert (consolidate | sft | dpo | merge)"
        exit 2
      fi
      PHASE_FROM="$2"
      shift
      ;;
    *)
      # Unknown arguments were always ignored; keep that, but say so.
      warn "Unbekanntes Argument ignoriert: $1"
      ;;
  esac
  shift
done

# Reject typos early: an unrecognised phase would otherwise match none of the
# phase guards below and silently run only the final convert step.
case "$PHASE_FROM" in
  consolidate | sft | dpo | merge) ;;
  *)
    err "Unbekannte Phase: '$PHASE_FROM' (erlaubt: consolidate | sft | dpo | merge)"
    exit 2
    ;;
esac

# ─── Step 0: Wait for external crawler ────────────────────────────────────────
if [[ "$WAIT_CRAWL" == "true" ]]; then
  step "Warte auf v8 External Crawler (crawl_v8_sources.py)"
  # Poll every two minutes while a process whose command line mentions the
  # crawler script is still alive, reporting progress from its output file.
  while pgrep -f "crawl_v8_sources.py" > /dev/null 2>&1; do
    EXT_COUNT=$(count_lines "$DATA_DIR/v8-external-sft.jsonl")
    log " Crawler läuft noch... $EXT_COUNT externe Artikel bisher"
    sleep 120
  done
  EXT_COUNT=$(count_lines "$DATA_DIR/v8-external-sft.jsonl")
  log "✓ Crawler fertig: $EXT_COUNT externe Artikel → $DATA_DIR/v8-external-sft.jsonl"

  # Also wait for v6 DPO generation
  while pgrep -f "process_v6_blogs.py" > /dev/null 2>&1; do
    DPO_COUNT=$(count_lines "$DATA_DIR/v8-v6blogs-dpo.jsonl")
    log " v6 DPO Generator läuft... $DPO_COUNT Pairs bisher"
    sleep 120
  done
  DPO_V6_COUNT=$(count_lines "$DATA_DIR/v8-v6blogs-dpo.jsonl")
  log "✓ v6 DPO fertig: $DPO_V6_COUNT Pairs"
fi

# ─── Step 1: Check available data ─────────────────────────────────────────────
step "Datenlage prüfen"
cd "$FINE_TUNER_DIR"
"$PYTHON" "$SCRIPTS/consolidate_v8_dataset.py" --stats-only

REAL_COUNT=$(count_lines "$DATA_DIR/v8-real-posts-sft.jsonl")
V7GEN_COUNT=$(count_lines "$DATA_DIR/v7-generated-sft.jsonl")
V6BLOG_COUNT=$(count_lines "$DATA_DIR/v8-v6blogs-sft.jsonl")
EXT_COUNT=$(count_lines "$DATA_DIR/v8-external-sft.jsonl")

log "SFT Quellen:"
log " Real posts (Gold ×3): $REAL_COUNT"
log " v7 Generated (×1): $V7GEN_COUNT"
log " v6 TIP Blogs (×2): $V6BLOG_COUNT"
log " External crawled (×1.5): $EXT_COUNT"

# NOTE(review): external articles are labelled ×1.5 above but counted ×2 here.
# Left unchanged because the real weighting lives in consolidate_v8_dataset.py;
# confirm which of the two is intended.
TOTAL_EST=$(( REAL_COUNT * 3 + V7GEN_COUNT + V6BLOG_COUNT * 2 + EXT_COUNT * 2 ))
log " Geschätzt total effective: $TOTAL_EST"

if (( TOTAL_EST < 80 )); then
  err "Zu wenig Daten ($TOTAL_EST effective) — mindestens 80 nötig!"
  err "Warte auf v7-generation oder crawl_v8_sources.py"
  exit 1
fi

# ─── Step 2: Consolidate dataset ──────────────────────────────────────────────
if [[ "$PHASE_FROM" == "consolidate" ]]; then
  step "Phase 0: Dataset Konsolidierung"
  CONS_LOG="$LOG_DIR/consolidate-$TIMESTAMP.log"
  log "Starte consolidate_v8_dataset.py..."
  "$PYTHON" "$SCRIPTS/consolidate_v8_dataset.py" 2>&1 | tee "$CONS_LOG"

  SFT_MERGED=$(count_lines "$DATA_DIR/v8-sft-merged.jsonl")
  DPO_MERGED=$(count_lines "$DATA_DIR/v8-dpo-merged.jsonl")
  log "✓ Merged: $SFT_MERGED SFT + $DPO_MERGED DPO"
else
  log "Phase: $PHASE_FROM — Konsolidierung übersprungen"
  # Starting from a later phase still needs the merged SFT file; rebuild it
  # on demand when it is absent.
  if [[ ! -f "$DATA_DIR/v8-sft-merged.jsonl" ]]; then
    warn "v8-sft-merged.jsonl fehlt — erstelle schnell..."
    "$PYTHON" "$SCRIPTS/consolidate_v8_dataset.py"
  fi
fi

# ─── Step 3: SFT Training ─────────────────────────────────────────────────────
if [[ "$PHASE_FROM" == "consolidate" || "$PHASE_FROM" == "sft" ]]; then
  step "Phase 1: SFT Training (Qwen2.5-14B, LoRA r=64, 5 Epochs)"
  SFT_LOG="$LOG_DIR/sft-$TIMESTAMP.log"
  log "Starte train_blog_v8.py --phase sft..."
  log "Log: $SFT_LOG"
  log "Estimated: ~10-14 Stunden (run overnight!)"

  # pipefail (set above) makes a training failure abort the script even though
  # tee is the last stage of the pipeline.
  "$PYTHON" "$SCRIPTS/train_blog_v8.py" --phase sft 2>&1 | tee "$SFT_LOG"

  ADAPTER="$FINE_TUNER_DIR/adapters/fo-blog-v8/adapter"
  if [[ ! -d "$ADAPTER" ]]; then
    err "SFT Adapter nicht gefunden: $ADAPTER"
    exit 1
  fi
  log "✓ SFT Adapter: $ADAPTER"
fi

# ─── Step 4: DPO Training ─────────────────────────────────────────────────────
if [[ "$PHASE_FROM" != "merge" ]]; then
  step "Phase 2: DPO Training (2 Epochs)"
  DPO_LOG="$LOG_DIR/dpo-$TIMESTAMP.log"
  DPO_FILE="$DATA_DIR/v8-dpo-merged.jsonl"

  # A missing DPO file is not fatal: the phase is skipped with a warning.
  if [[ ! -f "$DPO_FILE" ]]; then
    warn "DPO File fehlt — überspringe DPO Phase"
  else
    DPO_COUNT=$(count_lines "$DPO_FILE")
    log "DPO Pairs: $DPO_COUNT"
    log "Starte train_blog_v8.py --phase dpo..."
    "$PYTHON" "$SCRIPTS/train_blog_v8.py" --phase dpo 2>&1 | tee "$DPO_LOG"
    log "✓ DPO Training abgeschlossen"
  fi
fi

# ─── Step 5: Merge + GGUF + Ollama ────────────────────────────────────────────
# Runs for every --phase-from value.
step "Phase 3: Merge + GGUF + Ollama Registrierung"
CONV_LOG="$LOG_DIR/convert-$TIMESTAMP.log"
log "Starte train_blog_v8.py --phase convert..."
"$PYTHON" "$SCRIPTS/train_blog_v8.py" --phase convert 2>&1 | tee "$CONV_LOG"
log "✓ fo-blog-v8 in Ollama registriert"

# ─── Wrap-up ──────────────────────────────────────────────────────────────────
step "v8 Pipeline ABGESCHLOSSEN"
echo ""
log "fo-blog-v8 ist bereit:"
log " Ollama: ollama run fo-blog-v8"
log " API: OLLAMA_LLM_MODEL=fo-blog-v8"
echo ""
log "Auf Erik deployen:"
log " 1. GGUF rsync: rsync -avz models/fo-blog-v8/fo-blog-v8.gguf root@erik:/opt/ollama-models/"
log " 2. Ollama: ssh erik 'ollama create fo-blog-v8 -f /opt/tip/Modelfile-v8'"
log " 3. TIP: ecosystem.config.js → OLLAMA_LLM_MODEL=fo-blog-v8"
log " 4. Restart: ssh erik 'cd /opt/tip && pm2 restart ecosystem.config.js --update-env'"
echo ""
log "Logs: $LOG_DIR/"
echo ""
log "v8 vs v7 Verbesserungen:"
log " - 14B statt 7B (4× Parameter)"
log " - Echte Blog-Posts ×3 gewichtet"
log " - Echte Modell-Failures als DPO (v6 too-long posts)"
log " - Externe Quellen: APNIC, RIPE Labs, potaroo.net, Cloudflare"
log " - 5 SFT + 2 DPO Epochs (war 4 + 1)"