Full v8 training pipeline for the optical networking blog model:

- train_blog_v8.py: SFT (LoRA r=64, 5 epochs) + DPO (2 epochs) on Qwen2.5-14B-Instruct. Fixed for trl 1.2.x: SFTConfig instead of TrainingArguments, processing_class= instead of tokenizer=, eval_strategy= instead of the deprecated evaluation_strategy=.
- consolidate_v8_dataset.py: weighted merge of all data sources (820 effective SFT / 235 DPO).
- crawl_v8_sources.py: APNIC / RIPE Labs / potaroo / Cloudflare crawler with balanced div extraction.
- process_v6_blogs.py: converts 101 real v6 TIP blog outputs into SFT + DPO pairs.
- label_v7_quality.py: Claude-judged quality labels → v8 quality DPO pairs.
- parse_real_posts.py: parses blog.fichtmueller.org Ghost CMS HTML → gold SFT records.
- run_v8_pipeline.sh: autopilot (consolidate → SFT → DPO → GGUF → Ollama).
- blog-v8-training.yaml: training config reference.

Dataset breakdown: 19 real posts ×3 + 196 v7-gen + 28 v6blogs ×2 + 135 external ×1.5

229 lines · 8.9 KiB · Python
#!/usr/bin/env python3
"""
parse_real_posts.py — Convert real blog HTML into v8 SFT training data.

Extracts title and content from Ghost CMS HTML and builds high-quality
training examples from them using the v7/v8 system prompt.

These real posts are GOLD — Rene's own voice, real expertise, no AI
hallucinations. They are weighted as top priority in v8.

Input:  ~/transceiver-training-data/v8-real-posts/*.html
Output: ~/transceiver-training-data/v8-real-posts-sft.jsonl
"""

from __future__ import annotations

import json
import re
from html import unescape
from pathlib import Path

# Directory holding the downloaded Ghost CMS pages (one ``<slug>.html`` per post).
POSTS_DIR = Path.home() / "transceiver-training-data" / "v8-real-posts"
# JSONL file the SFT records are written to (one JSON object per line).
OUTPUT_FILE = Path.home() / "transceiver-training-data" / "v8-real-posts-sft.jsonl"

# Slugs to exclude from training. Intentionally empty: even posts that are not
# about transceivers still demonstrate Rene's writing style and structure.
SKIP_SLUGS: set[str] = set()

# System prompt stored verbatim in every record.
# NOTE(review): the numbered STRUCTURE sub-items are flush-left here because
# that is how they appear in the recovered source; if the prompt used during
# generation indented them, restore that indentation — confirm against v7.
SYSTEM_PROMPT = """You are an expert technical writer specializing in optical networking, transceiver technology, and network infrastructure.

STRICT CONSTRAINTS — Follow exactly, no exceptions:
- LENGTH: 700–1000 words. Count carefully. Stop at 1000 words maximum.
- STRUCTURE (mandatory, in this order):
1. HOOK paragraph — 2–3 sentences stating the problem this post addresses
2. Technical sections — 3–4 H2 sections covering the topic in depth
3. PRACTICAL TAKEAWAYS — exactly 3 bullet points, actionable
- TOPIC DISCIPLINE: Write ONLY about the exact topic requested. Zero drift.
- NO REPETITION: Every sentence must add new information. No restating.
- VOICE: Confident, direct. No hedging phrases like "it's worth noting".
- AUDIENCE: Network engineers and IT professionals. Assume technical fluency.
- FORMAT: Markdown. Use ## for section headers. Use **bold** for key terms.

Do not summarize what you are about to write. Start with the hook directly."""


def extract_ghost_content(html: str) -> tuple[str, str] | None:
|
||
"""Extrahiert (title, clean_text) aus Ghost CMS HTML."""
|
||
|
||
# Titel aus og:title oder h1
|
||
title_match = (
|
||
re.search(r'<meta property="og:title" content="([^"]+)"', html)
|
||
or re.search(r'<title>([\s\S]+?)</title>', html)
|
||
)
|
||
title = title_match.group(1).strip() if title_match else ""
|
||
# Multiline title collapse
|
||
title = re.sub(r'\s+', ' ', title).strip()
|
||
# HTML entities in title
|
||
title = (title.replace('&', '&').replace('<', '<').replace('>', '>')
|
||
.replace('"', '"').replace(''', "'").replace(''', "'")
|
||
.replace(' ', ' ').replace('—', '—').replace('–', '–'))
|
||
# Ghost appends " | Blog" or " – Site Name" — only strip at a pipe or en-dash
|
||
# surrounded by spaces (require \s+, NOT \s* to avoid cutting hyphenated words)
|
||
title = re.sub(r"\s+[|–]\s+.+$", "", title).strip()
|
||
|
||
# Ghost CMS Content-Selektoren (in Prioritätsreihenfolge)
|
||
content_patterns = [
|
||
r'<div class="gh-content[^"]*">(.*?)</(?:div|section)>\s*(?:<div|<section|<footer|<aside)',
|
||
r'<section class="gh-content[^"]*">(.*?)</section>',
|
||
r'class="post-full-content[^"]*"[^>]*>(.*?)</(?:div|section)>',
|
||
r'<div class="post-content[^"]*">(.*?)</div>\s*(?:<div class="post|<section|<footer)',
|
||
r'<article[^>]*>(.*?)</article>',
|
||
]
|
||
|
||
raw_content = ""
|
||
for pat in content_patterns:
|
||
m = re.search(pat, html, re.DOTALL | re.IGNORECASE)
|
||
if m:
|
||
raw_content = m.group(1)
|
||
break
|
||
|
||
if not raw_content:
|
||
return None
|
||
|
||
# Cleanup
|
||
raw_content = re.sub(r'<script[^>]*>.*?</script>', '', raw_content, flags=re.DOTALL)
|
||
raw_content = re.sub(r'<style[^>]*>.*?</style>', '', raw_content, flags=re.DOTALL)
|
||
raw_content = re.sub(r'<noscript[^>]*>.*?</noscript>', '', raw_content, flags=re.DOTALL)
|
||
|
||
# HTML → Markdown-ähnliches Format
|
||
# Headers
|
||
for level in [6, 5, 4, 3, 2, 1]:
|
||
hashes = "#" * level
|
||
raw_content = re.sub(
|
||
rf'<h{level}[^>]*>(.*?)</h{level}>',
|
||
lambda m: f"\n{hashes} {re.sub('<[^>]+>', '', m.group(1)).strip()}\n",
|
||
raw_content,
|
||
flags=re.DOTALL | re.IGNORECASE,
|
||
)
|
||
|
||
# Bold/italic
|
||
raw_content = re.sub(r'<strong[^>]*>(.*?)</strong>', r'**\1**', raw_content, flags=re.DOTALL)
|
||
raw_content = re.sub(r'<b[^>]*>(.*?)</b>', r'**\1**', raw_content, flags=re.DOTALL)
|
||
raw_content = re.sub(r'<em[^>]*>(.*?)</em>', r'*\1*', raw_content, flags=re.DOTALL)
|
||
|
||
# Lists
|
||
raw_content = re.sub(r'<li[^>]*>(.*?)</li>', r'\n- \1', raw_content, flags=re.DOTALL)
|
||
raw_content = re.sub(r'<[uo]l[^>]*>', '\n', raw_content)
|
||
raw_content = re.sub(r'</[uo]l>', '\n', raw_content)
|
||
|
||
# Paragraphs → newlines
|
||
raw_content = re.sub(r'<br\s*/?>', '\n', raw_content)
|
||
raw_content = re.sub(r'<p[^>]*>', '\n', raw_content)
|
||
raw_content = re.sub(r'</p>', '\n', raw_content)
|
||
|
||
# Code blocks
|
||
raw_content = re.sub(r'<pre[^>]*><code[^>]*>(.*?)</code></pre>',
|
||
lambda m: f"\n```\n{m.group(1)}\n```\n",
|
||
raw_content, flags=re.DOTALL)
|
||
|
||
# Alle verbleibenden Tags entfernen
|
||
raw_content = re.sub(r'<[^>]+>', ' ', raw_content)
|
||
|
||
# HTML entities
|
||
raw_content = raw_content.replace('&', '&').replace('<', '<').replace('>', '>')
|
||
raw_content = raw_content.replace(' ', ' ').replace('"', '"').replace(''', "'")
|
||
raw_content = raw_content.replace('—', '—').replace('–', '–').replace('…', '…')
|
||
|
||
# Whitespace normalisieren
|
||
raw_content = re.sub(r'\n{3,}', '\n\n', raw_content)
|
||
raw_content = re.sub(r'[ \t]+', ' ', raw_content)
|
||
raw_content = re.sub(r'\n ', '\n', raw_content)
|
||
clean = raw_content.strip()
|
||
|
||
if len(clean.split()) < 200:
|
||
return None
|
||
|
||
return title, clean
|
||
|
||
|
||
def slug_to_topic(slug: str) -> str:
    """Turn a URL slug into a readable, title-cased topic string."""
    # Hyphens separate the words of a slug; title-case each word and
    # re-join them with single spaces.
    words = slug.split("-")
    return " ".join(word.title() for word in words)


# Only files strictly larger than this many bytes are parsed.
# NOTE(review): presumably this drops stub or error-page downloads — confirm.
_MIN_HTML_BYTES = 10_000

# Slug substrings that select the audience line of the prompt. The AI-tooling
# list is checked first, so a slug matching both groups gets that audience.
_AI_TOOLING_KEYWORDS = ("shieldx", "claude", "papercortex", "llm", "slop", "sync")
_NETWORK_OPS_KEYWORDS = ("aspa", "bgp", "peercortex", "infrastructure")


def _audience_for_slug(slug: str) -> str:
    """Pick the target-audience phrase for a post from keywords in its slug."""
    if any(keyword in slug for keyword in _AI_TOOLING_KEYWORDS):
        return "developers and infrastructure engineers building AI-powered tools"
    if any(keyword in slug for keyword in _NETWORK_OPS_KEYWORDS):
        return "network engineers and NOC operators"
    return "network engineers and IT professionals who evaluate and operate optical infrastructure"


def _build_record(title: str, slug: str, clean_text: str) -> dict:
    """Assemble one SFT record (system prompt, user input, gold output, metadata)."""
    # The input text mirrors the prompt used at generation time:
    # topic + audience + a short reminder of the constraints.
    input_text = (
        f"Write a blog post on the following topic:\n\n"
        f"**Topic:** {title}\n\n"
        f"**Target audience:** {_audience_for_slug(slug)}\n\n"
        f"Remember: 700–1000 words, hook + technical sections + 3 takeaways. "
        f"Stay strictly on-topic. No filler. Start writing now."
    )
    return {
        "system_prompt": SYSTEM_PROMPT,
        "input_text": input_text,
        "output_text": clean_text,
        "meta": {
            "title": title,
            "slug": slug,
            "source": "blog.fichtmueller.org",
            "word_count": len(clean_text.split()),
            "quality": "human_written",
            "weight": 3.0,  # 3x weighting in training — gold standard
            "dataset_version": "v8",
        },
    }


def main() -> None:
    """Parse every post under POSTS_DIR and write the SFT records to OUTPUT_FILE.

    Prints one status line per file plus a summary with word-count statistics.

    Raises:
        SystemExit: If POSTS_DIR does not exist. Exiting here keeps an
            existing OUTPUT_FILE from being overwritten with an empty file.
    """
    if not POSTS_DIR.is_dir():
        raise SystemExit(f"Input directory not found: {POSTS_DIR}")

    html_files = sorted(
        path
        for path in POSTS_DIR.glob("*.html")
        if path.stat().st_size > _MIN_HTML_BYTES
        and path.stem not in SKIP_SLUGS
        and re.match(r'^[a-z0-9]', path.stem)  # skip hidden/garbage files
    )

    print(f"Parsing {len(html_files)} HTML files...")

    results = []
    skipped = 0

    for fpath in html_files:
        # Decode explicitly as UTF-8 (the output is written as UTF-8 too)
        # instead of relying on the platform's default encoding.
        page = fpath.read_text(encoding="utf-8", errors="ignore")
        extracted = extract_ghost_content(page)

        if extracted is None:
            print(f" SKIP (no content): {fpath.name}")
            skipped += 1
            continue

        title, clean_text = extracted
        if not title:
            # No usable title in the page: derive one from the file name.
            title = slug_to_topic(fpath.stem)

        record = _build_record(title, fpath.stem, clean_text)
        print(f" OK: {record['meta']['word_count']:4d}w | {title[:60]}")
        results.append(record)

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in results)

    print(f"\nGespeichert: {len(results)} Posts → {OUTPUT_FILE}")
    print(f"Übersprungen: {skipped}")

    # Word-count statistics over the written records.
    wcs = [r["meta"]["word_count"] for r in results]
    if wcs:
        print(f"Word count: min={min(wcs)}, max={max(wcs)}, avg={sum(wcs)//len(wcs)}")


# Script entry point: run the conversion only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()