(.*?)',
+ r'class="post-full-content[^"]*"[^>]*>(.*?)(?:div|section)>',
+ r'
]*>.*?', '', raw_content, flags=re.DOTALL)
+ raw_content = re.sub(r'', '', raw_content, flags=re.DOTALL)
+ raw_content = re.sub(r'
', '', raw_content, flags=re.DOTALL)
+
+ # HTML → Markdown-ähnliches Format
+ # Headers
+ for level in [6, 5, 4, 3, 2, 1]:
+ hashes = "#" * level
+ raw_content = re.sub(
+ rf'
]*>(.*?)',
+ lambda m: f"\n{hashes} {re.sub('<[^>]+>', '', m.group(1)).strip()}\n",
+ raw_content,
+ flags=re.DOTALL | re.IGNORECASE,
+ )
+
+ # Bold/italic
+ raw_content = re.sub(r'
]*>(.*?)', r'**\1**', raw_content, flags=re.DOTALL)
+ raw_content = re.sub(r'
]*>(.*?)', r'**\1**', raw_content, flags=re.DOTALL)
+ raw_content = re.sub(r'
]*>(.*?)', r'*\1*', raw_content, flags=re.DOTALL)
+
+ # Lists
+ raw_content = re.sub(r'
]*>(.*?)', r'\n- \1', raw_content, flags=re.DOTALL)
+ raw_content = re.sub(r'<[uo]l[^>]*>', '\n', raw_content)
+ raw_content = re.sub(r'[uo]l>', '\n', raw_content)
+
+ # Paragraphs → newlines
+ raw_content = re.sub(r'
', '\n', raw_content)
+ raw_content = re.sub(r'
]*>', '\n', raw_content)
+ raw_content = re.sub(r'
', '\n', raw_content)
+
+ # Code blocks
+ raw_content = re.sub(r'
]*>]*>(.*?)
',
+ lambda m: f"\n```\n{m.group(1)}\n```\n",
+ raw_content, flags=re.DOTALL)
+
+ # Alle verbleibenden Tags entfernen
+ raw_content = re.sub(r'<[^>]+>', ' ', raw_content)
+
+ # HTML entities
+ raw_content = raw_content.replace('&', '&').replace('<', '<').replace('>', '>')
+ raw_content = raw_content.replace(' ', ' ').replace('"', '"').replace(''', "'")
+ raw_content = raw_content.replace('—', '—').replace('–', '–').replace('…', '…')
+
+ # Whitespace normalisieren
+ raw_content = re.sub(r'\n{3,}', '\n\n', raw_content)
+ raw_content = re.sub(r'[ \t]+', ' ', raw_content)
+ raw_content = re.sub(r'\n ', '\n', raw_content)
+ clean = raw_content.strip()
+
+ if len(clean.split()) < 200:
+ return None
+
+ return title, clean
+
+
def slug_to_topic(slug: str) -> str:
    """Turn a URL slug into a readable, title-cased topic string."""
    words = slug.split("-")
    return " ".join(words).title()
+
+
def main() -> None:
    """Convert the downloaded blog HTML files into weighted SFT records.

    Every ``*.html`` file in ``POSTS_DIR`` that is larger than 10 kB, is not
    listed in ``SKIP_SLUGS`` and whose name starts with a lowercase letter or
    digit is passed through ``extract_ghost_content``. Each post that yields
    content becomes one JSONL record in ``OUTPUT_FILE``; a short word-count
    summary is printed at the end.
    """
    html_files = sorted(
        f for f in POSTS_DIR.glob("*.html")
        if f.stat().st_size > 10_000
        and f.stem not in SKIP_SLUGS
        and re.match(r'^[a-z0-9]', f.stem)  # skip hidden/garbage files
    )

    print(f"Parsing {len(html_files)} HTML files...")

    results = []
    skipped = 0
    seen_slugs: set[str] = set()

    for fpath in html_files:
        slug = fpath.stem
        if slug in seen_slugs:
            print(f" SKIP (duplicate slug): {fpath.name}")
            skipped += 1
            continue
        seen_slugs.add(slug)

        # Decode explicitly as UTF-8: relying on the platform default encoding
        # combined with errors="ignore" silently drops or mangles non-ASCII
        # characters on systems whose locale is not UTF-8.
        html = fpath.read_text(encoding="utf-8", errors="ignore")
        extracted = extract_ghost_content(html)

        if extracted is None:
            print(f" SKIP (no content): {fpath.name}")
            skipped += 1
            continue

        title, clean_text = extracted
        word_count = len(clean_text.split())

        if not title:
            title = slug_to_topic(slug)

        print(f" OK: {word_count:4d}w | {title[:60]}")

        # Input text mirrors the generation prompt: topic + audience + reminder.
        # The audience is chosen from keywords found in the slug.
        if any(k in slug for k in ["shieldx", "claude", "papercortex", "llm", "slop", "sync"]):
            audience = "developers and infrastructure engineers building AI-powered tools"
        elif any(k in slug for k in ["aspa", "bgp", "peercortex", "infrastructure"]):
            audience = "network engineers and NOC operators"
        else:
            audience = "network engineers and IT professionals who evaluate and operate optical infrastructure"

        input_text = (
            f"Write a blog post on the following topic:\n\n"
            f"**Topic:** {title}\n\n"
            f"**Target audience:** {audience}\n\n"
            f"Remember: 700–1000 words, hook + technical sections + 3 takeaways. "
            f"Stay strictly on-topic. No filler. Start writing now."
        )

        results.append({
            "system_prompt": SYSTEM_PROMPT,
            "input_text": input_text,
            "output_text": clean_text,
            "meta": {
                "title": title,
                "slug": slug,
                "source": "blog.fichtmueller.org",
                "word_count": word_count,
                "quality": "human_written",
                "weight": 3.0,  # 3x weighting in training — gold standard
                "dataset_version": "v8",
            },
        })

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for r in results:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

    print(f"\nGespeichert: {len(results)} Posts → {OUTPUT_FILE}")
    print(f"Übersprungen: {skipped}")

    # Word-count statistics over the written records.
    wcs = [r["meta"]["word_count"] for r in results]
    if wcs:
        print(f"Word count: min={min(wcs)}, max={max(wcs)}, avg={sum(wcs)//len(wcs)}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/packages/fine-tuner/scripts/process_v6_blogs.py b/packages/fine-tuner/scripts/process_v6_blogs.py
new file mode 100644
index 0000000..f0c0ae8
--- /dev/null
+++ b/packages/fine-tuner/scripts/process_v6_blogs.py
@@ -0,0 +1,323 @@
+#!/usr/bin/env python3
+"""
+process_v6_blogs.py — Verarbeitet echte fo-blog-v6 Outputs als v8 Trainingsdaten
+
+Analysiert 101 v6-generierte Blog-Posts aus /opt/tip/blog-training-data/
+und erstellt daraus:
+
+ 1. SFT records — Posts mit 700-1100w → direkt als Training-Beispiele
+ 2. DPO pairs — Posts >1100w:
+ rejected = der zu lange Originalpost
+ chosen = Claude rewritet ihn als saubere 700-1000w Version
+
+Dies sind echte Modell-Failures (nicht synthetisch!) — besonders wertvoll für DPO.
+
+Input: ~/transceiver-training-data/v6-tip-blogs/*.md
+Output:
+ ~/transceiver-training-data/v8-v6blogs-sft.jsonl (gute Posts als SFT)
+ ~/transceiver-training-data/v8-v6blogs-dpo.jsonl (zu lange Posts als DPO)
+
+Usage:
+ python3 scripts/process_v6_blogs.py
+ python3 scripts/process_v6_blogs.py --max-dpo 30 # nur 30 DPO Pairs
+ python3 scripts/process_v6_blogs.py --sft-only # nur SFT Records
+ python3 scripts/process_v6_blogs.py --dry-run # Stats only
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import subprocess
+import time
+from pathlib import Path
+
# All inputs and outputs of this script live under one data directory.
DATA_DIR = Path.home() / "transceiver-training-data"
BLOGS_DIR = DATA_DIR / "v6-tip-blogs"
SFT_OUTPUT = DATA_DIR / "v8-v6blogs-sft.jsonl"
DPO_OUTPUT = DATA_DIR / "v8-v6blogs-dpo.jsonl"
PROGRESS_FILE = DATA_DIR / "v8-v6blogs-progress.json"

# Word count ranges
GOOD_MIN = 700
GOOD_MAX = 1100
# Posts above the "good" ceiling are "too long" → rejected examples.
# Derived from GOOD_MAX so the two buckets can never overlap or leave a gap.
REJECTED_MIN = GOOD_MAX

# Timeout in seconds passed to subprocess.run() for one Claude rewrite.
CLAUDE_TIMEOUT = 180
+
+SYSTEM_PROMPT = """You are an expert technical writer specializing in optical networking, transceiver technology, and network infrastructure.
+
+STRICT CONSTRAINTS — Follow exactly, no exceptions:
+- LENGTH: 700–1000 words. Count carefully. Stop at 1000 words maximum.
+- STRUCTURE (mandatory, in this order):
+ 1. HOOK paragraph — 2–3 sentences stating the problem this post addresses
+ 2. Technical sections — 3–4 H2 sections covering the topic in depth
+ 3. PRACTICAL TAKEAWAYS — exactly 3 bullet points, actionable
+- TOPIC DISCIPLINE: Write ONLY about the exact topic requested. Zero drift.
+- NO REPETITION: Every sentence must add new information. No restating.
+- VOICE: Confident, direct. No hedging phrases like "it's worth noting".
+- AUDIENCE: Network engineers and IT professionals. Assume technical fluency.
+- FORMAT: Markdown. Use ## for section headers. Use **bold** for key terms.
+
+Do not summarize what you are about to write. Start with the hook directly."""
+
+
def parse_blog_md(path: Path) -> dict | None:
    """Read a markdown blog file and split it into frontmatter fields and body.

    Returns a dict with ``title``, ``slug``, ``category``, ``content``,
    ``word_count`` and ``path``, or ``None`` when the body has fewer than
    200 whitespace-separated words.
    """
    raw = path.read_text(encoding="utf-8", errors="ignore")

    # Optional frontmatter: a leading block fenced by '---' lines.
    meta: dict = {}
    body = raw
    header = re.match(r'^---\s*\n(.*?)\n---\s*\n', raw, re.DOTALL)
    if header is not None:
        body = raw[header.end():]
        # Only flat `key: value` / `key: "value"` entries are recognised.
        for entry in header.group(1).split('\n'):
            pair = re.match(r'^(\w+):\s*"?(.+?)"?\s*$', entry)
            if pair is not None:
                key, value = pair.groups()
                meta[key] = value.strip('"').strip()

    title = meta.get("title", "").strip('"')
    if not title:
        # Fall back to the first H1 heading, then to the file name.
        heading = re.search(r'^# (.+)$', body, re.MULTILINE)
        if heading is not None:
            title = heading.group(1).strip()
        else:
            title = path.stem.replace("-", " ").title()

    n_words = len(body.split())
    if n_words < 200:
        return None

    return {
        "title": title,
        "slug": meta.get("slug", path.stem),
        "category": meta.get("category", "optical networking"),
        "content": body.strip(),
        "word_count": n_words,
        "path": str(path),
    }
+
+
def build_input_text(title: str, audience: str) -> str:
    """Render the user prompt for one blog post: topic, audience and the fixed reminder."""
    segments = [
        "Write a blog post on the following topic:",
        f"**Topic:** {title}",
        f"**Target audience:** {audience}",
        "Remember: 700–1000 words, hook + technical sections + 3 takeaways. "
        "Stay strictly on-topic. No filler. Start writing now.",
    ]
    # Segments are separated by one blank line.
    return "\n\n".join(segments)
+
+
def get_audience(category: str) -> str:
    """Pick the target-audience sentence for a blog category.

    Keyword groups are checked in order against the lower-cased category;
    the first group containing a substring match decides the audience.
    """
    lowered = category.lower()
    rules = (
        (
            ("fiber", "cabling", "mtp", "mpo", "connector"),
            "data center engineers and cabling specialists",
        ),
        (
            ("coherent", "dwdm", "zr", "metro"),
            "network architects and optical engineers designing long-haul links",
        ),
        (
            ("compatible", "procurement", "vendor", "cost", "price"),
            "network procurement teams and IT managers evaluating transceiver vendors",
        ),
    )
    for keywords, audience in rules:
        for keyword in keywords:
            if keyword in lowered:
                return audience
    # No keyword matched: generic audience.
    return "network engineers and IT professionals who evaluate and operate optical infrastructure"
+
+
def rewrite_with_claude(title: str, source_content: str, audience: str) -> str | None:
    """Ask the ``claude`` CLI for a 700-1000 word rewrite of a too-long post.

    Returns the rewritten text (the DPO "chosen" sample), or ``None`` when the
    command fails, times out, prints nothing, or the result falls outside the
    400-2000 word sanity window.
    """
    # Only the first 800 words are handed over as topic reference.
    tokens = source_content.split()
    if len(tokens) > 800:
        source_content = " ".join(tokens[:800]) + "\n\n[Source truncated for reference]"

    prompt = "".join([
        "Rewrite this blog post to be 700-1000 words with the correct structure.\n\n",
        f"**Topic:** {title}\n",
        f"**Target audience:** {audience}\n\n",
        "**Original post (DO NOT COPY — use only as topic reference, rewrite completely):**\n\n",
        f"{source_content}\n\n",
        "Rewrite now. 700-1000 words. Hook + technical sections + 3 takeaways. Start directly.",
    ])

    try:
        command = ["claude", "--print", "--system-prompt", SYSTEM_PROMPT, "-p", prompt]
        proc = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=CLAUDE_TIMEOUT,
        )
        rewritten = proc.stdout.strip()
        if proc.returncode != 0 or not rewritten:
            return None
        # Reject answers that are obviously too short or too long.
        if not 400 <= len(rewritten.split()) <= 2000:
            return None
        return rewritten
    except subprocess.TimeoutExpired:
        print(f" TIMEOUT: Claude took too long for {title[:50]}")
        return None
    except Exception as e:
        print(f" ERROR: {e}")
        return None
+
+
def load_progress() -> set[str]:
    """Return the set of slugs recorded as done in ``PROGRESS_FILE``.

    A missing file is the normal first-run case and yields an empty set
    silently. An unreadable or malformed file also yields an empty set, but
    with a warning: starting from scratch means every DPO candidate is
    processed again.
    """
    try:
        raw = PROGRESS_FILE.read_text(encoding="utf-8")
    except FileNotFoundError:
        return set()
    except OSError as exc:
        print(f"WARNING: cannot read progress file {PROGRESS_FILE}: {exc} — starting fresh")
        return set()

    try:
        # ValueError covers json.JSONDecodeError; AttributeError/TypeError
        # cover valid JSON of an unexpected shape (not an object, or "done"
        # not a list of hashable values).
        return set(json.loads(raw).get("done", []))
    except (ValueError, AttributeError, TypeError) as exc:
        print(f"WARNING: progress file {PROGRESS_FILE} is malformed: {exc} — starting fresh")
        return set()
+
+
def save_progress(done: set[str]) -> None:
    """Persist the set of finished slugs to ``PROGRESS_FILE``.

    The JSON is written to a sibling temporary file and then renamed over the
    target, so an interruption mid-write cannot leave a truncated progress
    file behind. Slugs are sorted to keep the file stable between runs.
    """
    payload = json.dumps({"done": sorted(done)})
    tmp_path = PROGRESS_FILE.with_name(PROGRESS_FILE.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    tmp_path.replace(PROGRESS_FILE)
+
+
def main() -> None:
    """Turn the parsed v6 blog posts into SFT records and DPO pairs.

    Posts with ``GOOD_MIN``..``GOOD_MAX`` words are written to ``SFT_OUTPUT``
    (rewritten from scratch on every run). Posts longer than ``REJECTED_MIN``
    words become the "rejected" side of a DPO pair whose "chosen" side is a
    Claude rewrite; pairs are appended to ``DPO_OUTPUT`` and finished slugs
    are tracked in the progress file so an interrupted run can resume.
    """
    parser = argparse.ArgumentParser(description="Process v6 TIP blogs into v8 training data")
    parser.add_argument("--max-dpo", type=int, default=None, help="Max DPO pairs to generate")
    parser.add_argument("--sft-only", action="store_true", help="Only create SFT records (skip DPO)")
    parser.add_argument("--dpo-only", action="store_true", help="Only create DPO pairs (skip SFT)")
    parser.add_argument("--dry-run", action="store_true", help="Statistics only, no output")
    args = parser.parse_args()

    # Load all blog files
    md_files = sorted(BLOGS_DIR.glob("*.md"))
    if not md_files:
        print(f"No .md files found in {BLOGS_DIR}")
        return

    blogs = []
    for path in md_files:
        parsed = parse_blog_md(path)
        if parsed:
            blogs.append(parsed)

    # Categorize by word count
    good = [b for b in blogs if GOOD_MIN <= b["word_count"] <= GOOD_MAX]
    too_long = [b for b in blogs if b["word_count"] > REJECTED_MIN]
    too_short = [b for b in blogs if b["word_count"] < GOOD_MIN]

    print("=== v6 TIP Blog Analysis ===")
    print(f"Total: {len(blogs)} files")
    print(f"Good (SFT): {len(good)} files ({GOOD_MIN}-{GOOD_MAX}w)")
    print(f"Too long: {len(too_long)} files (>{REJECTED_MIN}w) → DPO rejected")
    print(f"Too short: {len(too_short)} files (<{GOOD_MIN}w) → skip")
    print()

    if args.dry_run:
        print("Good posts:")
        for b in sorted(good, key=lambda x: x["word_count"]):
            print(f" {b['word_count']:4d}w | {b['title'][:60]}")
        print("\nToo long (top 10):")
        for b in sorted(too_long, key=lambda x: x["word_count"], reverse=True)[:10]:
            print(f" {b['word_count']:4d}w | {b['title'][:60]}")
        return

    done = load_progress()

    # ─── Phase 1: SFT Records from good posts ──────────────────────────────────
    if not args.dpo_only:
        print("=== Phase 1: SFT Records (good posts) ===")
        SFT_OUTPUT.parent.mkdir(parents=True, exist_ok=True)
        sft_count = 0
        with open(SFT_OUTPUT, "w", encoding="utf-8") as f:
            for blog in good:
                audience = get_audience(blog["category"])
                record = {
                    "system_prompt": SYSTEM_PROMPT,
                    "input_text": build_input_text(blog["title"], audience),
                    "output_text": blog["content"],
                    "meta": {
                        "title": blog["title"],
                        "slug": blog["slug"],
                        "source": "tip-v6-blogs",
                        "word_count": blog["word_count"],
                        "quality": "v6_output_good",
                        "weight": 2.0,  # Real model output, good length
                        "dataset_version": "v8",
                    },
                }
                f.write(json.dumps(record, ensure_ascii=False) + "\n")
                sft_count += 1
                print(f" SFT: {blog['word_count']:4d}w | {blog['title'][:60]}")

        print(f"\nSFT saved: {sft_count} records → {SFT_OUTPUT}")

    # ─── Phase 2: DPO Pairs from too-long posts ────────────────────────────────
    if not args.sft_only:
        print("\n=== Phase 2: DPO Pairs (too-long posts → rewrite) ===")
        DPO_OUTPUT.parent.mkdir(parents=True, exist_ok=True)

        candidates = too_long
        # Compare against None: a plain truthiness test would treat
        # `--max-dpo 0` as "no limit" and process every candidate.
        if args.max_dpo is not None:
            candidates = candidates[:args.max_dpo]

        dpo_saved = 0
        dpo_skipped = 0

        # Append mode: pairs from earlier (interrupted) runs are kept.
        with open(DPO_OUTPUT, "a", encoding="utf-8") as f:
            for i, blog in enumerate(candidates):
                slug = blog["slug"]
                if slug in done:
                    dpo_skipped += 1
                    continue

                print(f" [{i+1}/{len(candidates)}] {blog['word_count']:4d}w → rewrite: {blog['title'][:55]}")
                audience = get_audience(blog["category"])

                # Get Claude to write a GOOD version → chosen
                chosen = rewrite_with_claude(blog["title"], blog["content"], audience)

                if not chosen:
                    print(f" SKIP (Claude failed)")
                    # NOTE(review): the failed slug is only added in memory
                    # here, but the next successful save_progress() persists
                    # it as "done", so it will not be retried on later runs.
                    done.add(slug)
                    dpo_skipped += 1
                    continue

                chosen_wc = len(chosen.split())
                print(f" OK: chosen={chosen_wc}w (was {blog['word_count']}w)")

                # Build DPO prompt (ChatML prefix)
                # NOTE(review): unlike the SFT ChatML format, this prompt does
                # not end with "<|im_start|>assistant\n" — confirm the trainer
                # or consolidation step adds the assistant header.
                prompt = (
                    f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
                    f"<|im_start|>user\n{build_input_text(blog['title'], audience)}<|im_end|>\n"
                )

                pair = {
                    "prompt": prompt,
                    "chosen": chosen,
                    "rejected": blog["content"],
                    "meta": {
                        "title": blog["title"],
                        "slug": slug,
                        "source": "tip-v6-dpo",
                        "rejection_strategy": "too_long_real",
                        "chosen_words": chosen_wc,
                        "rejected_words": blog["word_count"],
                        "dataset_version": "v8",
                    },
                }
                f.write(json.dumps(pair, ensure_ascii=False) + "\n")
                # Flush the pair before recording it as done, so the progress
                # file never claims a pair that is not yet in the output.
                f.flush()
                done.add(slug)
                save_progress(done)
                dpo_saved += 1
                time.sleep(1)

        print(f"\nDPO saved: {dpo_saved} pairs | skipped: {dpo_skipped} → {DPO_OUTPUT}")

    # ─── Summary ───────────────────────────────────────────────────────────────
    print("\n=== Summary ===")
    # The JSONL files contain raw non-ASCII text (ensure_ascii=False), so they
    # must be reopened as UTF-8 even just to count lines.
    if SFT_OUTPUT.exists():
        with open(SFT_OUTPUT, encoding="utf-8") as f:
            sft_n = sum(1 for _ in f)
        print(f"SFT: {sft_n} records → {SFT_OUTPUT}")
    if DPO_OUTPUT.exists():
        with open(DPO_OUTPUT, encoding="utf-8") as f:
            dpo_n = sum(1 for _ in f)
        print(f"DPO: {dpo_n} pairs → {DPO_OUTPUT}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/packages/fine-tuner/scripts/run_v8_pipeline.sh b/packages/fine-tuner/scripts/run_v8_pipeline.sh
new file mode 100755
index 0000000..bb7c409
--- /dev/null
+++ b/packages/fine-tuner/scripts/run_v8_pipeline.sh
@@ -0,0 +1,178 @@
#!/usr/bin/env bash
# ═══════════════════════════════════════════════════════════════════════════════
# run_v8_pipeline.sh — fo-blog-v8 autopilot pipeline
#
# Qwen2.5-14B, LoRA r=64, 5 epochs SFT + 2 epochs DPO
#
# Expects the following data files to be ready:
#   ~/transceiver-training-data/v8-real-posts-sft.jsonl   (19 real posts)
#   ~/transceiver-training-data/v7-generated-sft.jsonl    (v7 generated, ≥100)
#   ~/transceiver-training-data/v8-v6blogs-sft.jsonl      (v6 tip blogs good)
#   ~/transceiver-training-data/v8-external-sft.jsonl     (crawled external)
#   ~/transceiver-training-data/v7-dpo-pairs.jsonl        (v7 DPO)
#   ~/transceiver-training-data/v8-v6blogs-dpo.jsonl      (real v6 failures)
#
# Usage:
#   bash scripts/run_v8_pipeline.sh                      # full auto
#   bash scripts/run_v8_pipeline.sh --wait-crawl         # wait for crawler first
#   bash scripts/run_v8_pipeline.sh --phase-from merge   # skip training
#   bash scripts/run_v8_pipeline.sh --phase-from dpo     # skip SFT, do DPO + merge
#
# The interpreter can be overridden with the PYTHON environment variable.
# ═══════════════════════════════════════════════════════════════════════════════

set -euo pipefail

FINE_TUNER_DIR="$(cd "$(dirname "$0")/.." && pwd)"
PYTHON="${PYTHON:-/opt/homebrew/bin/python3.13}"
SCRIPTS="$FINE_TUNER_DIR/scripts"
DATA_DIR="$HOME/transceiver-training-data"
LOG_DIR="/tmp/v8-pipeline"
TIMESTAMP=$(date +%Y%m%d-%H%M%S)

mkdir -p "$LOG_DIR"

# ─── Colors / logging helpers ─────────────────────────────────────────────────
GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; NC='\033[0m'; BOLD='\033[1m'
log()  { echo -e "${GREEN}[$(date +%H:%M:%S)]${NC} $*"; }
warn() { echo -e "${YELLOW}[$(date +%H:%M:%S)] ⚠${NC} $*"; }
err()  { echo -e "${RED}[$(date +%H:%M:%S)] ✗${NC} $*"; }
step() { echo -e "\n${BOLD}${GREEN}══ $* ══${NC}"; }

# Print the number of lines in a file, or 0 when the file does not exist.
# Whitespace is stripped because some `wc` implementations pad their output.
count_lines() {
  local file="$1"
  if [[ -f "$file" ]]; then
    wc -l < "$file" | tr -d '[:space:]'
  else
    echo 0
  fi
}

# ─── Args ─────────────────────────────────────────────────────────────────────
WAIT_CRAWL=false
PHASE_FROM="consolidate"   # consolidate | sft | dpo | merge

# A while/shift loop is required here: `shift` inside `for arg in "$@"` does
# not advance the iteration, so `--phase-from VALUE` picked up the wrong word
# whenever it was not the first argument.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --wait-crawl) WAIT_CRAWL=true ;;
    --phase-from=*) PHASE_FROM="${1#*=}" ;;
    --phase-from)
      if [[ $# -lt 2 ]]; then
        err "--phase-from requires a value (consolidate | sft | dpo | merge)"
        exit 2
      fi
      PHASE_FROM="$2"
      shift
      ;;
    *) warn "Unknown argument ignored: $1" ;;
  esac
  shift
done

# Reject unknown phases up front; otherwise every phase check below would be
# skipped and the script would jump straight to the convert step.
case "$PHASE_FROM" in
  consolidate|sft|dpo|merge) ;;
  *)
    err "Invalid --phase-from value: '$PHASE_FROM' (expected consolidate | sft | dpo | merge)"
    exit 2
    ;;
esac

# ─── Step 0: Wait for external crawler ────────────────────────────────────────
if [[ "$WAIT_CRAWL" == "true" ]]; then
  step "Warte auf v8 External Crawler (crawl_v8_sources.py)"
  while pgrep -f "crawl_v8_sources.py" > /dev/null 2>&1; do
    EXT_COUNT=$(count_lines "$DATA_DIR/v8-external-sft.jsonl")
    log " Crawler läuft noch... $EXT_COUNT externe Artikel bisher"
    sleep 120
  done
  EXT_COUNT=$(count_lines "$DATA_DIR/v8-external-sft.jsonl")
  log "✓ Crawler fertig: $EXT_COUNT externe Artikel → $DATA_DIR/v8-external-sft.jsonl"

  # Also wait for v6 DPO generation
  while pgrep -f "process_v6_blogs.py" > /dev/null 2>&1; do
    DPO_COUNT=$(count_lines "$DATA_DIR/v8-v6blogs-dpo.jsonl")
    log " v6 DPO Generator läuft... $DPO_COUNT Pairs bisher"
    sleep 120
  done
  DPO_V6_COUNT=$(count_lines "$DATA_DIR/v8-v6blogs-dpo.jsonl")
  log "✓ v6 DPO fertig: $DPO_V6_COUNT Pairs"
fi

# ─── Step 1: Check available data ─────────────────────────────────────────────
step "Datenlage prüfen"
cd "$FINE_TUNER_DIR"
"$PYTHON" "$SCRIPTS/consolidate_v8_dataset.py" --stats-only

REAL_COUNT=$(count_lines "$DATA_DIR/v8-real-posts-sft.jsonl")
V7GEN_COUNT=$(count_lines "$DATA_DIR/v7-generated-sft.jsonl")
V6BLOG_COUNT=$(count_lines "$DATA_DIR/v8-v6blogs-sft.jsonl")
EXT_COUNT=$(count_lines "$DATA_DIR/v8-external-sft.jsonl")

log "SFT Quellen:"
log " Real posts (Gold ×3): $REAL_COUNT"
log " v7 Generated (×1): $V7GEN_COUNT"
log " v6 TIP Blogs (×2): $V6BLOG_COUNT"
log " External crawled (×1.5): $EXT_COUNT"

# External articles are weighted ×1.5 as logged above; integer arithmetic
# expresses that as *3/2 (the previous formula used ×2).
TOTAL_EST=$(( REAL_COUNT*3 + V7GEN_COUNT + V6BLOG_COUNT*2 + EXT_COUNT*3/2 ))
log " Geschätzt total effective: $TOTAL_EST"

if [[ "$TOTAL_EST" -lt 80 ]]; then
  err "Zu wenig Daten ($TOTAL_EST effective) — mindestens 80 nötig!"
  err "Warte auf v7-generation oder crawl_v8_sources.py"
  exit 1
fi

# ─── Step 2: Consolidate dataset ──────────────────────────────────────────────
if [[ "$PHASE_FROM" == "consolidate" ]]; then
  step "Phase 0: Dataset Konsolidierung"
  CONS_LOG="$LOG_DIR/consolidate-$TIMESTAMP.log"
  log "Starte consolidate_v8_dataset.py..."
  "$PYTHON" "$SCRIPTS/consolidate_v8_dataset.py" 2>&1 | tee "$CONS_LOG"
  SFT_MERGED=$(count_lines "$DATA_DIR/v8-sft-merged.jsonl")
  DPO_MERGED=$(count_lines "$DATA_DIR/v8-dpo-merged.jsonl")
  log "✓ Merged: $SFT_MERGED SFT + $DPO_MERGED DPO"
else
  log "Phase: $PHASE_FROM — Konsolidierung übersprungen"
  # Later phases still need the merged file; build it if it is missing.
  if [[ ! -f "$DATA_DIR/v8-sft-merged.jsonl" ]]; then
    warn "v8-sft-merged.jsonl fehlt — erstelle schnell..."
    "$PYTHON" "$SCRIPTS/consolidate_v8_dataset.py"
  fi
fi

# ─── Step 3: SFT Training ─────────────────────────────────────────────────────
if [[ "$PHASE_FROM" == "consolidate" || "$PHASE_FROM" == "sft" ]]; then
  step "Phase 1: SFT Training (Qwen2.5-14B, LoRA r=64, 5 Epochs)"
  SFT_LOG="$LOG_DIR/sft-$TIMESTAMP.log"
  log "Starte train_blog_v8.py --phase sft..."
  log "Log: $SFT_LOG"
  log "Estimated: ~10-14 Stunden (run overnight!)"
  "$PYTHON" "$SCRIPTS/train_blog_v8.py" --phase sft 2>&1 | tee "$SFT_LOG"

  ADAPTER="$FINE_TUNER_DIR/adapters/fo-blog-v8/adapter"
  if [[ ! -d "$ADAPTER" ]]; then
    err "SFT Adapter nicht gefunden: $ADAPTER"
    exit 1
  fi
  log "✓ SFT Adapter: $ADAPTER"
fi

# ─── Step 4: DPO Training ─────────────────────────────────────────────────────
if [[ "$PHASE_FROM" == "consolidate" || "$PHASE_FROM" == "sft" || "$PHASE_FROM" == "dpo" ]]; then
  step "Phase 2: DPO Training (2 Epochs)"
  DPO_LOG="$LOG_DIR/dpo-$TIMESTAMP.log"

  DPO_FILE="$DATA_DIR/v8-dpo-merged.jsonl"
  if [[ ! -f "$DPO_FILE" ]]; then
    warn "DPO File fehlt — überspringe DPO Phase"
  else
    DPO_COUNT=$(count_lines "$DPO_FILE")
    log "DPO Pairs: $DPO_COUNT"
    log "Starte train_blog_v8.py --phase dpo..."
    "$PYTHON" "$SCRIPTS/train_blog_v8.py" --phase dpo 2>&1 | tee "$DPO_LOG"
    log "✓ DPO Training abgeschlossen"
  fi
fi

# ─── Step 5: Merge + GGUF + Ollama ────────────────────────────────────────────
# Runs for every --phase-from value, including "merge".
step "Phase 3: Merge + GGUF + Ollama Registrierung"
CONV_LOG="$LOG_DIR/convert-$TIMESTAMP.log"
log "Starte train_blog_v8.py --phase convert..."
"$PYTHON" "$SCRIPTS/train_blog_v8.py" --phase convert 2>&1 | tee "$CONV_LOG"
log "✓ fo-blog-v8 in Ollama registriert"

# ─── Done ─────────────────────────────────────────────────────────────────────
step "v8 Pipeline ABGESCHLOSSEN"
echo ""
log "fo-blog-v8 ist bereit:"
log " Ollama: ollama run fo-blog-v8"
log " API: OLLAMA_LLM_MODEL=fo-blog-v8"
echo ""
log "Auf Erik deployen:"
log " 1. GGUF rsync: rsync -avz models/fo-blog-v8/fo-blog-v8.gguf root@erik:/opt/ollama-models/"
log " 2. Ollama: ssh erik 'ollama create fo-blog-v8 -f /opt/tip/Modelfile-v8'"
log " 3. TIP: ecosystem.config.js → OLLAMA_LLM_MODEL=fo-blog-v8"
log " 4. Restart: ssh erik 'cd /opt/tip && pm2 restart ecosystem.config.js --update-env'"
echo ""
log "Logs: $LOG_DIR/"
echo ""
log "v8 vs v7 Verbesserungen:"
log " - 14B statt 7B (4× Parameter)"
log " - Echte Blog-Posts ×3 gewichtet"
log " - Echte Modell-Failures als DPO (v6 too-long posts)"
log " - Externe Quellen: APNIC, RIPE Labs, potaroo.net, Cloudflare"
log " - 5 SFT + 2 DPO Epochs (war 4 + 1)"
diff --git a/packages/fine-tuner/scripts/train_blog_v8.py b/packages/fine-tuner/scripts/train_blog_v8.py
new file mode 100644
index 0000000..dbe4fa3
--- /dev/null
+++ b/packages/fine-tuner/scripts/train_blog_v8.py
@@ -0,0 +1,409 @@
+#!/usr/bin/env python3
+"""
+train_blog_v8.py — fo-blog-v8 Training (Qwen2.5-14B, MPS LoRA)
+
+Phase 1: SFT (5 epochs, LoRA r=64, from merged v8 dataset)
+Phase 2: DPO (2 epochs, from SFT adapter)
+
+Usage:
+ python3 scripts/train_blog_v8.py --phase sft
+ python3 scripts/train_blog_v8.py --phase dpo
+ python3 scripts/train_blog_v8.py --phase both # SFT then DPO sequentially
+
+Hardware: Apple Silicon M4 Max (48GB), MPS backend
+Estimated: SFT ~10-14h, DPO ~3-5h (run overnight)
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+
+# ─── Paths ────────────────────────────────────────────────────────────────────
+FINE_TUNER_DIR = Path(__file__).parent.parent
+DATA_DIR = Path.home() / "transceiver-training-data"
+SFT_DATA = DATA_DIR / "v8-sft-merged.jsonl"
+DPO_DATA = DATA_DIR / "v8-dpo-merged.jsonl"
+SFT_ADAPTER = FINE_TUNER_DIR / "adapters" / "fo-blog-v8" / "adapter"
+DPO_ADAPTER = FINE_TUNER_DIR / "adapters" / "fo-blog-v8-dpo" / "adapter"
+MERGED_DIR = FINE_TUNER_DIR / "models" / "fo-blog-v8" / "merged"
+
+BASE_MODEL = "Qwen/Qwen2.5-14B-Instruct"
+
+SYSTEM_PROMPT = """You are an expert technical writer specializing in optical networking, transceiver technology, and network infrastructure.
+
+STRICT CONSTRAINTS — Follow exactly, no exceptions:
+- LENGTH: 700–1000 words. Count carefully. Stop at 1000 words maximum.
+- STRUCTURE (mandatory, in this order):
+ 1. HOOK paragraph — 2–3 sentences stating the problem this post addresses
+ 2. Technical sections — 3–4 H2 sections covering the topic in depth
+ 3. PRACTICAL TAKEAWAYS — exactly 3 bullet points, actionable
+- TOPIC DISCIPLINE: Write ONLY about the exact topic requested. Zero drift.
+- NO REPETITION: Every sentence must add new information. No restating.
+- VOICE: Confident, direct. No hedging phrases like "it's worth noting".
+- AUDIENCE: Network engineers and IT professionals. Assume technical fluency.
+- FORMAT: Markdown. Use ## for section headers. Use **bold** for key terms.
+
+Do not summarize what you are about to write. Start with the hook directly."""
+
+
def build_chatml(system: str, user: str, assistant: str) -> str:
    """Render one system/user/assistant exchange as a single ChatML string."""
    turns = (
        ("system", system),
        ("user", user),
        ("assistant", assistant),
    )
    # Turns are newline-separated; the last one carries no trailing newline.
    return "\n".join(
        f"<|im_start|>{role}\n{content}<|im_end|>" for role, content in turns
    )
+
+
def load_sft_dataset(tokenizer, max_seq_length: int = 4096):
    """Load the merged SFT JSONL file and render each record as ChatML text.

    No tokenization happens here: the returned ``Dataset`` has a single
    ``text`` column. ``tokenizer`` and ``max_seq_length`` are accepted for
    call-site compatibility but are not used by this function.

    Records that are not valid JSON objects, or that lack a non-empty
    ``input_text`` / ``output_text``, are skipped and counted in a warning.

    Raises:
        FileNotFoundError: if ``SFT_DATA`` does not exist.
    """
    from datasets import Dataset

    if not SFT_DATA.exists():
        raise FileNotFoundError(
            f"SFT data not found: {SFT_DATA}\n"
            "Run: python3 scripts/consolidate_v8_dataset.py"
        )

    records = []
    skipped = 0
    with open(SFT_DATA, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue
            if not isinstance(item, dict):
                # Valid JSON, but not an object (e.g. a bare list or string).
                skipped += 1
                continue

            system = item.get("system_prompt")
            if system is None:
                # Missing key or JSON null: use the default prompt instead of
                # rendering the literal text "None" into the training sample.
                system = SYSTEM_PROMPT
            user = item.get("input_text", "")
            assistant = item.get("output_text", "")
            if not (user and assistant):
                skipped += 1
                continue
            records.append({"text": build_chatml(system, user, assistant)})

    print(f"Loaded {len(records)} SFT examples from {SFT_DATA.name}")
    if skipped:
        print(f"WARNING: skipped {skipped} malformed or incomplete SFT lines in {SFT_DATA.name}")
    return Dataset.from_list(records)
+
+
def load_dpo_dataset():
    """Load the merged DPO JSONL file as a ``Dataset`` of preference pairs.

    Each kept record has exactly the columns ``prompt``, ``chosen`` and
    ``rejected``; any other fields in the file are dropped. Lines that are
    not valid JSON objects or that miss one of the three keys are skipped and
    counted in a warning.

    Raises:
        FileNotFoundError: if ``DPO_DATA`` does not exist.
    """
    from datasets import Dataset

    if not DPO_DATA.exists():
        raise FileNotFoundError(
            f"DPO data not found: {DPO_DATA}\n"
            "Run: python3 scripts/consolidate_v8_dataset.py"
        )

    records = []
    skipped = 0
    with open(DPO_DATA, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                item = json.loads(line)
                records.append({
                    "prompt": item["prompt"],
                    "chosen": item["chosen"],
                    "rejected": item["rejected"],
                })
            except (json.JSONDecodeError, KeyError, TypeError):
                # TypeError: valid JSON that is not an object (list, number, ...).
                skipped += 1

    print(f"Loaded {len(records)} DPO pairs from {DPO_DATA.name}")
    if skipped:
        print(f"WARNING: skipped {skipped} malformed or incomplete DPO lines in {DPO_DATA.name}")
    return Dataset.from_list(records)
+
+
def run_sft() -> None:
    """Phase 1: supervised fine-tuning of ``BASE_MODEL`` with a LoRA adapter.

    Loads the merged SFT dataset, trains for 5 epochs on MPS when available
    (CPU otherwise) and saves the adapter and tokenizer to ``SFT_ADAPTER``.
    """
    import torch
    from peft import LoraConfig, TaskType
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from trl import SFTTrainer, SFTConfig

    max_len = 4096
    device = "mps" if torch.backends.mps.is_available() else "cpu"

    print(f"=== fo-blog-v8 SFT: {BASE_MODEL} → LoRA r=64 ===")
    print(f"Device: {'MPS' if device == 'mps' else 'CPU'}")

    # ── Tokenizer ──
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    # Only fall back to EOS when the tokenizer ships without a pad token.
    # Overwriting an existing pad token with EOS makes the two
    # indistinguishable, which can cause EOS positions to be treated as
    # padding during training.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # ── Dataset ──
    dataset = load_sft_dataset(tokenizer, max_seq_length=max_len)

    # ── Model ──
    print(f"Loading base model: {BASE_MODEL}")
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        dtype=torch.bfloat16,  # bf16 for M4 Max (transformers 5.x: dtype= not torch_dtype=)
        device_map=device,
        trust_remote_code=True,
    )
    # The KV cache is not used with gradient checkpointing during training.
    model.config.use_cache = False

    # ── LoRA Config ──
    lora_config = LoraConfig(
        r=64,
        lora_alpha=128,
        lora_dropout=0.05,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )

    # ── Training Config (trl 1.x: SFTConfig carries both TrainingArguments + SFT params) ──
    SFT_ADAPTER.mkdir(parents=True, exist_ok=True)
    training_args = SFTConfig(
        output_dir=str(SFT_ADAPTER),
        num_train_epochs=5,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        learning_rate=1.2e-4,
        warmup_ratio=0.05,
        lr_scheduler_type="cosine",
        bf16=True,
        fp16=False,
        optim="adamw_torch",
        weight_decay=0.01,
        max_grad_norm=1.0,
        logging_steps=10,
        save_steps=100,
        save_total_limit=2,
        eval_strategy="no",
        dataloader_num_workers=0,
        remove_unused_columns=False,
        gradient_checkpointing=True,
        report_to="none",
        # SFT-specific (moved from SFTTrainer in trl 1.x)
        dataset_text_field="text",
        # Current TRL names this option `max_length`; `max_seq_length` is the
        # older spelling. NOTE(review): confirm against the installed TRL.
        max_length=max_len,
        packing=False,
    )

    # ── Trainer ──
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        peft_config=lora_config,
        processing_class=tokenizer,
        args=training_args,
    )

    print(f"Starting SFT training: {len(dataset)} examples, 5 epochs...")
    trainer.train()

    print(f"Saving SFT adapter → {SFT_ADAPTER}")
    trainer.save_model(str(SFT_ADAPTER))
    tokenizer.save_pretrained(str(SFT_ADAPTER))
    print("SFT Phase COMPLETE.")
+
+
def run_dpo() -> None:
    """Phase 2: Direct Preference Optimization on top of the SFT adapter.

    Loads ``BASE_MODEL`` plus the adapter in ``SFT_ADAPTER``, continues
    training that adapter for 2 epochs on the merged DPO pairs and saves the
    result (adapter and tokenizer) to ``DPO_ADAPTER``.

    Raises:
        FileNotFoundError: if the SFT adapter directory does not exist.
    """
    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from trl import DPOTrainer, DPOConfig

    print(f"=== fo-blog-v8 DPO: SFT adapter → DPO ===")

    if not SFT_ADAPTER.exists():
        raise FileNotFoundError(
            f"SFT adapter not found at {SFT_ADAPTER}\n"
            "Run: python3 scripts/train_blog_v8.py --phase sft"
        )

    # ── Tokenizer ──
    tokenizer = AutoTokenizer.from_pretrained(str(SFT_ADAPTER), trust_remote_code=True)
    # Keep the tokenizer's own pad token when it has one; only fall back to
    # EOS if none is defined.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # ── Dataset ──
    dataset = load_dpo_dataset()

    # ── Model (base + SFT adapter) ──
    print(f"Loading model + SFT adapter...")
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        dtype=torch.bfloat16,
        device_map=device,
        trust_remote_code=True,
    )
    # is_trainable=True is required: by default PEFT loads an adapter in
    # inference mode with its weights frozen, leaving nothing for DPO to update.
    model = PeftModel.from_pretrained(base_model, str(SFT_ADAPTER), is_trainable=True)

    # ── DPO Config ──
    DPO_ADAPTER.mkdir(parents=True, exist_ok=True)
    dpo_config = DPOConfig(
        output_dir=str(DPO_ADAPTER),
        num_train_epochs=2,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        learning_rate=5e-5,
        warmup_ratio=0.05,
        lr_scheduler_type="cosine",
        bf16=True,
        fp16=False,
        optim="adamw_torch",
        max_grad_norm=1.0,
        logging_steps=5,
        save_steps=50,
        save_total_limit=2,
        eval_strategy="no",
        dataloader_num_workers=0,
        gradient_checkpointing=True,
        report_to="none",
        # DPO-specific
        beta=0.1,
        loss_type="sigmoid",
        max_prompt_length=512,
        max_length=4096,
    )

    # ── Trainer ──
    # NOTE(review): with ref_model=None and a PEFT model, the reference
    # log-probs presumably come from the model with the adapter disabled,
    # i.e. the plain base model rather than the SFT model — confirm this is
    # the intended reference policy.
    trainer = DPOTrainer(
        model=model,
        ref_model=None,  # use implicit reference via peft
        args=dpo_config,
        train_dataset=dataset,
        processing_class=tokenizer,
    )

    print(f"Starting DPO training: {len(dataset)} pairs, 2 epochs...")
    trainer.train()

    print(f"Saving DPO adapter → {DPO_ADAPTER}")
    trainer.save_model(str(DPO_ADAPTER))
    tokenizer.save_pretrained(str(DPO_ADAPTER))
    print("DPO Phase COMPLETE.")
+
+
+def run_merge_and_convert() -> None:
+ """Merge adapter → full model, convert to GGUF, register in Ollama.
+
+ Uses DPO_ADAPTER when it exists, otherwise SFT_ADAPTER; prints a message
+ and returns without raising when neither exists. Each stage is skipped
+ when its output is already on disk (merged safetensors, f16 GGUF,
+ Q4_K_M GGUF). Finally writes a Modelfile, runs ``ollama create`` and
+ reports whether ``fo-blog-v8`` shows up in ``ollama list``.
+
+ Raises:
+ subprocess.CalledProcessError: If a subprocess started with
+ ``check=True`` (conversion, quantization, ``ollama create``)
+ exits with a non-zero status.
+ """
+ import subprocess
+ import shutil
+ import torch
+ from peft import PeftModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ # Prefer DPO adapter, fall back to SFT
+ adapter_path = DPO_ADAPTER if DPO_ADAPTER.exists() else SFT_ADAPTER
+ if not adapter_path.exists():
+ print(f"No adapter found. Run --phase sft first.")
+ return  # nothing to merge; exits without raising
+
+ print(f"=== fo-blog-v8 Merge + GGUF ===")
+ print(f"Adapter: {adapter_path}")
+
+ # ── Merge ──
+ MERGED_DIR.mkdir(parents=True, exist_ok=True)
+ safetensors = MERGED_DIR / "model.safetensors"
+ if safetensors.exists() and safetensors.stat().st_size > 10_000_000_000:  # an existing file above 10 GB counts as a finished merge
+ print(f" Already merged ({safetensors.stat().st_size/1e9:.1f} GB) — skip merge")
+ else:
+ print(" Loading base model on CPU for merge (avoids MPS OOM)...")
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(
+ BASE_MODEL, dtype=torch.float16,
+ device_map="cpu", trust_remote_code=True,
+ )
+ print(" Loading adapter...")
+ model = PeftModel.from_pretrained(model, str(adapter_path))
+ print(" Merging...")
+ model = model.merge_and_unload()  # folds the adapter weights into the base model
+ print(f" Saving merged model → {MERGED_DIR}")
+ model.save_pretrained(str(MERGED_DIR), safe_serialization=True)
+ tokenizer.save_pretrained(str(MERGED_DIR))
+ del model  # drop the reference to the merged model
+ print(" Merge done.")
+
+ # ── Copy tokenizer files from HF cache if needed ──
+ hf_cache = Path.home() / ".cache/huggingface/hub"
+ snaps = list(hf_cache.glob("models--Qwen--Qwen2.5-14B-Instruct/snapshots/*/tokenizer.json"))  # NOTE(review): repo name is hard-coded; presumably matches BASE_MODEL — confirm
+ if snaps:
+ snap_dir = snaps[0].parent  # first match wins; which snapshot that is when several exist is not controlled here
+ for fname in ["tokenizer.json", "tokenizer_config.json", "vocab.json", "merges.txt"]:
+ if (snap_dir / fname).exists() and not (MERGED_DIR / fname).exists():  # only fill in missing files, never overwrite
+ shutil.copy2(snap_dir / fname, MERGED_DIR / fname)
+
+ # ── GGUF Conversion ──
+ gguf_dir = FINE_TUNER_DIR / "models" / "fo-blog-v8"
+ gguf_f16 = gguf_dir / "fo-blog-v8-f16.gguf"
+ gguf_q4 = gguf_dir / "fo-blog-v8.gguf"
+ convert_script = "/opt/homebrew/Cellar/llama.cpp/8680/bin/convert_hf_to_gguf.py"  # NOTE(review): pinned to one Homebrew Cellar version; likely breaks on upgrade
+ quantize_bin = "/opt/homebrew/bin/llama-quantize"
+ python_bin = "/opt/homebrew/bin/python3.13"
+
+ if not gguf_f16.exists():  # an existing f16 file is reused as-is
+ print(" Converting to GGUF f16...")
+ subprocess.run(
+ [python_bin, convert_script, str(MERGED_DIR),
+ "--outfile", str(gguf_f16), "--outtype", "f16"],
+ check=True,  # non-zero exit raises CalledProcessError
+ )
+ else:
+ print(f" F16 GGUF exists ({gguf_f16.stat().st_size/1e9:.1f} GB) — skip")
+
+ if not gguf_q4.exists():  # quantize only when the Q4 file is missing
+ print(" Quantizing to Q4_K_M...")
+ subprocess.run(
+ [quantize_bin, str(gguf_f16), str(gguf_q4), "Q4_K_M"],
+ check=True,  # non-zero exit raises CalledProcessError
+ )
+ gguf_f16.unlink(missing_ok=True)  # remove the f16 intermediate; no error if it is already gone
+ print(f" Q4_K_M GGUF: {gguf_q4} ({gguf_q4.stat().st_size/1e9:.1f} GB)")
+
+ # ── Ollama Registration ──
+ modelfile_path = gguf_dir / "Modelfile-v8"
+ modelfile_content = f"""FROM {gguf_q4.resolve()}
+
+SYSTEM \"\"\"{SYSTEM_PROMPT}\"\"\"
+
+PARAMETER temperature 0.7
+PARAMETER top_p 0.9
+PARAMETER top_k 40
+PARAMETER repeat_penalty 1.15
+PARAMETER num_predict 1500
+"""
+ modelfile_path.write_text(modelfile_content)  # overwrites any existing Modelfile-v8
+ print(" Registering in Ollama as fo-blog-v8...")
+ subprocess.run(["ollama", "create", "fo-blog-v8", "-f", str(modelfile_path)], check=True)
+
+ import subprocess as sp  # NOTE(review): redundant — subprocess is already imported at the top of this function
+ result = sp.run(["ollama", "list"], capture_output=True, text=True)  # no check=True: a failing 'ollama list' is not raised
+ registered = "fo-blog-v8" in result.stdout  # plain substring match on the listing output
+ print(f" Ollama registration: {'✓ SUCCESS' if registered else '✗ FAILED'}")
+ print(f"\nDONE: {gguf_q4}")
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(description="Train fo-blog-v8 (Qwen2.5-14B LoRA)")
+ parser.add_argument(
+ "--phase",
+ choices=["sft", "dpo", "both", "convert"],
+ default="sft",
+ help="Training phase to run (default: sft)",
+ )
+ args = parser.parse_args()
+
+ if args.phase in ("sft", "both"):
+ run_sft()
+
+ if args.phase in ("dpo", "both"):
+ run_dpo()
+
+ if args.phase == "convert":
+ run_merge_and_convert()
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+ main()