llm-gateway/packages/fine-tuner/scripts/parse_real_posts.py
Rene Fichtmueller c3ab87b167 feat: add fo-blog-v8 training pipeline (Qwen2.5-14B, SFT+DPO)
Full v8 training pipeline for the optical networking blog model:
- train_blog_v8.py: SFT (LoRA r=64, 5 epochs) + DPO (2 epochs) on Qwen2.5-14B-Instruct
  Fixed for trl 1.2.x: SFTConfig instead of TrainingArguments, processing_class= instead
  of tokenizer=, eval_strategy= instead of deprecated evaluation_strategy=
- consolidate_v8_dataset.py: weighted merge of all data sources (820 effective SFT / 235 DPO)
- crawl_v8_sources.py: APNIC/RIPE Labs/potaroo/Cloudflare crawler with balanced div extraction
- process_v6_blogs.py: converts 101 real v6 TIP blog outputs into SFT + DPO pairs
- label_v7_quality.py: Claude-judged quality labels → v8 quality DPO pairs
- parse_real_posts.py: parses blog.fichtmueller.org Ghost CMS HTML → gold SFT records
- run_v8_pipeline.sh: autopilot (consolidate → SFT → DPO → GGUF → Ollama)
- blog-v8-training.yaml: training config reference

Dataset breakdown: 19 real posts ×3 + 196 v7-gen + 28 v6blogs ×2 + 135 external ×1.5
2026-04-19 11:44:09 +02:00

229 lines
8.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
parse_real_posts.py — Converts real blog HTML → v8 SFT training data
Extracts title + content from Ghost CMS HTML and builds high-quality
training examples from them using the v7/v8 system prompt.
These real posts are GOLD — Rene's own voice, real expertise,
no AI hallucinations. They are weighted as top priority in v8.
Input: ~/transceiver-training-data/v8-real-posts/*.html
Output: ~/transceiver-training-data/v8-real-posts-sft.jsonl
"""
from __future__ import annotations
import json
import re
from pathlib import Path
# Directory that is scanned for downloaded post pages (``*.html``).
POSTS_DIR = Path.home() / "transceiver-training-data" / "v8-real-posts"

# JSONL file the generated SFT records are written to (one JSON object per line).
OUTPUT_FILE = Path.home() / "transceiver-training-data" / "v8-real-posts-sft.jsonl"

# File stems (slugs) to exclude from the dataset.
# Intentionally empty: even non-transceiver posts demonstrate the author's
# writing style and structure, so every post is kept.
SKIP_SLUGS: set[str] = set()

# System prompt stored with every record. The numeric ranges use an en dash
# (U+2013) on purpose; without a separator they read as "7001000 words",
# "23 sentences" and "34 H2 sections", which contradicts the 1000-word cap
# stated on the same line.
SYSTEM_PROMPT = """You are an expert technical writer specializing in optical networking, transceiver technology, and network infrastructure.
STRICT CONSTRAINTS — Follow exactly, no exceptions:
- LENGTH: 700–1000 words. Count carefully. Stop at 1000 words maximum.
- STRUCTURE (mandatory, in this order):
1. HOOK paragraph — 2–3 sentences stating the problem this post addresses
2. Technical sections — 3–4 H2 sections covering the topic in depth
3. PRACTICAL TAKEAWAYS — exactly 3 bullet points, actionable
- TOPIC DISCIPLINE: Write ONLY about the exact topic requested. Zero drift.
- NO REPETITION: Every sentence must add new information. No restating.
- VOICE: Confident, direct. No hedging phrases like "it's worth noting".
- AUDIENCE: Network engineers and IT professionals. Assume technical fluency.
- FORMAT: Markdown. Use ## for section headers. Use **bold** for key terms.
Do not summarize what you are about to write. Start with the hook directly."""
# Ghost CMS content containers, tried in priority order; group 1 is the post body.
_GHOST_CONTENT_PATTERNS = (
    r'<div class="gh-content[^"]*">(.*?)</(?:div|section)>\s*(?:<div|<section|<footer|<aside)',
    r'<section class="gh-content[^"]*">(.*?)</section>',
    r'class="post-full-content[^"]*"[^>]*>(.*?)</(?:div|section)>',
    r'<div class="post-content[^"]*">(.*?)</div>\s*(?:<div class="post|<section|<footer)',
    r'<article[^>]*>(.*?)</article>',
)

# Posts with fewer whitespace-separated words than this are rejected.
_MIN_WORD_COUNT = 200

# HTML tag names are case-insensitive and elements may span several lines.
_TAG_FLAGS = re.DOTALL | re.IGNORECASE


def _decode_entities(text: str) -> str:
    """Decode HTML character references; non-breaking spaces become plain spaces."""
    # Function-scope import: the public entry point has a parameter called
    # ``html``, so the module is deliberately not bound at file level.
    from html import unescape

    return unescape(text).replace("\xa0", " ")


def _extract_title(html: str) -> str:
    """Return the post title from ``og:title`` or ``<title>``, or "" if neither exists."""
    match = (
        re.search(r'<meta property="og:title" content="([^"]+)"', html)
        or re.search(r'<title>([\s\S]+?)</title>', html)
    )
    if match is None:
        return ""
    # A <title> element may be wrapped over several lines.
    title = re.sub(r"\s+", " ", match.group(1)).strip()
    title = _decode_entities(title)
    # Drop a trailing " | Blog" / " – Site Name" suffix. Whitespace is required
    # on both sides of the separator (\s+, not \s*) so hyphenated words are
    # never cut. \u2013 is the en dash, escaped to keep the source ASCII.
    return re.sub(r"\s+[|\u2013]\s+.+$", "", title).strip()


def _fragment_to_markdown(fragment: str) -> str:
    """Convert a Ghost post-body HTML fragment into Markdown-like plain text."""
    # Remove elements whose content is never part of the article text.
    for tag in ("script", "style", "noscript"):
        fragment = re.sub(rf"<{tag}\b[^>]*>.*?</{tag}>", "", fragment, flags=_TAG_FLAGS)

    # Headings -> "#"-prefixed lines; inline markup inside a heading is dropped.
    for level in range(6, 0, -1):
        prefix = "#" * level
        fragment = re.sub(
            rf"<h{level}\b[^>]*>(.*?)</h{level}>",
            lambda m, prefix=prefix: (
                f"\n{prefix} {re.sub('<[^>]+>', '', m.group(1)).strip()}\n"
            ),
            fragment,
            flags=_TAG_FLAGS,
        )

    # The \b after each tag name matters: without it "<b" also matches <br> and
    # <blockquote>, "<em" matches <embed>, "<li" matches <link>, and "<p"
    # matches <pre>, which would destroy code blocks before they are fenced.
    fragment = re.sub(r"<strong\b[^>]*>(.*?)</strong>", r"**\1**", fragment, flags=_TAG_FLAGS)
    fragment = re.sub(r"<b\b[^>]*>(.*?)</b>", r"**\1**", fragment, flags=_TAG_FLAGS)
    fragment = re.sub(r"<em\b[^>]*>(.*?)</em>", r"*\1*", fragment, flags=_TAG_FLAGS)

    # Lists -> "- item" lines.
    fragment = re.sub(r"<li\b[^>]*>(.*?)</li>", r"\n- \1", fragment, flags=_TAG_FLAGS)
    fragment = re.sub(r"<[uo]l\b[^>]*>", "\n", fragment, flags=re.IGNORECASE)
    fragment = re.sub(r"</[uo]l>", "\n", fragment, flags=re.IGNORECASE)

    # Line breaks and paragraphs -> newlines.
    fragment = re.sub(r"<br\s*/?>", "\n", fragment, flags=re.IGNORECASE)
    fragment = re.sub(r"<p\b[^>]*>", "\n", fragment, flags=re.IGNORECASE)
    fragment = re.sub(r"</p>", "\n", fragment, flags=re.IGNORECASE)

    # <pre><code> -> fenced code block.
    fragment = re.sub(
        r"<pre\b[^>]*><code\b[^>]*>(.*?)</code></pre>",
        lambda m: f"\n```\n{m.group(1)}\n```\n",
        fragment,
        flags=_TAG_FLAGS,
    )

    # Any tag still present is replaced by a space so adjacent words stay apart.
    fragment = re.sub(r"<[^>]+>", " ", fragment)

    # Entities are decoded only after tag removal, so escaped markup inside
    # code samples (&lt;div&gt;) survives as literal text.
    fragment = _decode_entities(fragment)

    # Normalise whitespace. Spaces are cleaned up first so that lines holding
    # only a space count as blank when runs of newlines are collapsed.
    fragment = re.sub(r"[ \t]+", " ", fragment)
    fragment = re.sub(r"\n ", "\n", fragment)
    fragment = re.sub(r"\n{3,}", "\n\n", fragment)
    return fragment.strip()


def extract_ghost_content(html: str) -> tuple[str, str] | None:
    """Extract ``(title, clean_text)`` from a Ghost CMS post page.

    Args:
        html: Full HTML source of the post page.

    Returns:
        A ``(title, clean_text)`` tuple, where ``title`` may be an empty string
        when the page has neither an ``og:title`` meta tag nor a ``<title>``
        element, and ``clean_text`` is the post body converted to
        Markdown-like text. ``None`` is returned when no known content
        container is found or the cleaned body has fewer than 200 words.
    """
    title = _extract_title(html)

    raw_content = ""
    for pattern in _GHOST_CONTENT_PATTERNS:
        found = re.search(pattern, html, re.DOTALL | re.IGNORECASE)
        if found:
            raw_content = found.group(1)
            break
    if not raw_content:
        return None

    clean = _fragment_to_markdown(raw_content)
    # Very short bodies are treated as failed extractions / non-articles.
    if len(clean.split()) < _MIN_WORD_COUNT:
        return None
    return title, clean
def slug_to_topic(slug: str) -> str:
    """Turn a URL slug into a readable, title-cased topic string."""
    words = slug.split("-")
    spaced = " ".join(words)
    return spaced.title()
def _audience_for_slug(slug: str) -> str:
    """Pick the target-audience phrase for a post from keywords in its slug."""
    ai_keywords = ("shieldx", "claude", "papercortex", "llm", "slop", "sync")
    network_keywords = ("aspa", "bgp", "peercortex", "infrastructure")
    if any(key in slug for key in ai_keywords):
        return "developers and infrastructure engineers building AI-powered tools"
    if any(key in slug for key in network_keywords):
        return "network engineers and NOC operators"
    return "network engineers and IT professionals who evaluate and operate optical infrastructure"


def main() -> None:
    """Parse every post page in ``POSTS_DIR`` and write SFT records to ``OUTPUT_FILE``.

    Each ``*.html`` file larger than 10,000 bytes whose stem starts with a
    lowercase letter or digit and is not listed in ``SKIP_SLUGS`` is passed to
    ``extract_ghost_content``. Every successfully extracted post becomes one
    JSON line holding the system prompt, a generated user prompt, the cleaned
    post text and a metadata dict. Progress and summary statistics are printed
    to stdout.
    """
    html_files = sorted(
        f
        for f in POSTS_DIR.glob("*.html")
        if f.stat().st_size > 10_000
        and f.stem not in SKIP_SLUGS
        and re.match(r'^[a-z0-9]', f.stem)  # skip hidden/garbage files
    )
    print(f"Parsing {len(html_files)} HTML files...")

    results = []
    skipped = 0
    seen_slugs: set[str] = set()
    for fpath in html_files:
        slug = fpath.stem
        # Defensive guard: never emit two records for the same slug.
        if slug in seen_slugs:
            print(f" SKIP (duplicate slug): {fpath.name}")
            skipped += 1
            continue
        seen_slugs.add(slug)

        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding (the output file is UTF-8 as well);
        # undecodable bytes are dropped rather than aborting the run.
        html = fpath.read_text(encoding="utf-8", errors="ignore")
        extracted = extract_ghost_content(html)
        if extracted is None:
            print(f" SKIP (no content): {fpath.name}")
            skipped += 1
            continue

        title, clean_text = extracted
        word_count = len(clean_text.split())
        if not title:
            # No usable title in the page: derive one from the file name.
            title = slug_to_topic(slug)
        print(f" OK: {word_count:4d}w | {title[:60]}")

        # User prompt: topic + audience + reminder of the constraints.
        audience = _audience_for_slug(slug)
        input_text = (
            f"Write a blog post on the following topic:\n\n"
            f"**Topic:** {title}\n\n"
            f"**Target audience:** {audience}\n\n"
            # En dash (U+2013) range; without a separator it reads "7001000 words".
            f"Remember: 700–1000 words, hook + technical sections + 3 takeaways. "
            f"Stay strictly on-topic. No filler. Start writing now."
        )
        results.append(
            {
                "system_prompt": SYSTEM_PROMPT,
                "input_text": input_text,
                "output_text": clean_text,
                "meta": {
                    "title": title,
                    "slug": slug,
                    "source": "blog.fichtmueller.org",
                    "word_count": word_count,
                    "quality": "human_written",
                    "weight": 3.0,  # 3x weighting in training — gold standard
                    "dataset_version": "v8",
                },
            }
        )

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the JSONL.
        f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in results)

    print(f"\nGespeichert: {len(results)} Posts → {OUTPUT_FILE}")
    print(f"Übersprungen: {skipped}")

    # Word-count statistics over the written records.
    wcs = [r["meta"]["word_count"] for r in results]
    if wcs:
        print(f"Word count: min={min(wcs)}, max={max(wcs)}, avg={sum(wcs)//len(wcs)}")


if __name__ == "__main__":
    main()