#!/usr/bin/env python3 """ parse_real_posts.py — Konvertiert echte Blog-HTML → v8 SFT Trainingsdaten Extrahiert Titel + Inhalt aus Ghost CMS HTML, baut daraus hochwertige Training-Beispiele mit dem v7/v8 System Prompt. Diese echten Posts sind GOLD — Rene's eigene Stimme, echte Expertise, keine AI-Halluzinationen. Werden als Top-Priorität in v8 gewichtet. Input: ~/transceiver-training-data/v8-real-posts/*.html Output: ~/transceiver-training-data/v8-real-posts-sft.jsonl """ from __future__ import annotations import json import re from pathlib import Path POSTS_DIR = Path.home() / "transceiver-training-data" / "v8-real-posts" OUTPUT_FILE = Path.home() / "transceiver-training-data" / "v8-real-posts-sft.jsonl" # Welche Posts sind für Blog-Training relevant (nicht rein persönlich/tool-spezifisch) # Alle nehmen — auch nicht-Transceiver Posts zeigen Schreibstil + Struktur SKIP_SLUGS = set() # nichts überspringen — alle Posts zeigen Renes Voice SYSTEM_PROMPT = """You are an expert technical writer specializing in optical networking, transceiver technology, and network infrastructure. STRICT CONSTRAINTS — Follow exactly, no exceptions: - LENGTH: 700–1000 words. Count carefully. Stop at 1000 words maximum. - STRUCTURE (mandatory, in this order): 1. HOOK paragraph — 2–3 sentences stating the problem this post addresses 2. Technical sections — 3–4 H2 sections covering the topic in depth 3. PRACTICAL TAKEAWAYS — exactly 3 bullet points, actionable - TOPIC DISCIPLINE: Write ONLY about the exact topic requested. Zero drift. - NO REPETITION: Every sentence must add new information. No restating. - VOICE: Confident, direct. No hedging phrases like "it's worth noting". - AUDIENCE: Network engineers and IT professionals. Assume technical fluency. - FORMAT: Markdown. Use ## for section headers. Use **bold** for key terms. Do not summarize what you are about to write. Start with the hook directly.""" def extract_ghost_content(html: str) -> tuple[str, str] | None: """Extrahiert (title, clean_text) aus Ghost CMS HTML.""" # Titel aus og:title oder h1 title_match = ( re.search(r'([\s\S]+?)', html) ) title = title_match.group(1).strip() if title_match else "" # Multiline title collapse title = re.sub(r'\s+', ' ', title).strip() # HTML entities in title title = (title.replace('&', '&').replace('<', '<').replace('>', '>') .replace('"', '"').replace(''', "'").replace(''', "'") .replace(' ', ' ').replace('—', '—').replace('–', '–')) # Ghost appends " | Blog" or " – Site Name" — only strip at a pipe or en-dash # surrounded by spaces (require \s+, NOT \s* to avoid cutting hyphenated words) title = re.sub(r"\s+[|–]\s+.+$", "", title).strip() # Ghost CMS Content-Selektoren (in Prioritätsreihenfolge) content_patterns = [ r'