#!/usr/bin/env python3 """ parse_real_posts.py — Konvertiert echte Blog-HTML → v8 SFT Trainingsdaten Extrahiert Titel + Inhalt aus Ghost CMS HTML, baut daraus hochwertige Training-Beispiele mit dem v7/v8 System Prompt. Diese echten Posts sind GOLD — Rene's eigene Stimme, echte Expertise, keine AI-Halluzinationen. Werden als Top-Priorität in v8 gewichtet. Input: ~/transceiver-training-data/v8-real-posts/*.html Output: ~/transceiver-training-data/v8-real-posts-sft.jsonl """ from __future__ import annotations import json import re from pathlib import Path POSTS_DIR = Path.home() / "transceiver-training-data" / "v8-real-posts" OUTPUT_FILE = Path.home() / "transceiver-training-data" / "v8-real-posts-sft.jsonl" # Welche Posts sind für Blog-Training relevant (nicht rein persönlich/tool-spezifisch) # Alle nehmen — auch nicht-Transceiver Posts zeigen Schreibstil + Struktur SKIP_SLUGS = set() # nichts überspringen — alle Posts zeigen Renes Voice SYSTEM_PROMPT = """You are an expert technical writer specializing in optical networking, transceiver technology, and network infrastructure. STRICT CONSTRAINTS — Follow exactly, no exceptions: - LENGTH: 700–1000 words. Count carefully. Stop at 1000 words maximum. - STRUCTURE (mandatory, in this order): 1. HOOK paragraph — 2–3 sentences stating the problem this post addresses 2. Technical sections — 3–4 H2 sections covering the topic in depth 3. PRACTICAL TAKEAWAYS — exactly 3 bullet points, actionable - TOPIC DISCIPLINE: Write ONLY about the exact topic requested. Zero drift. - NO REPETITION: Every sentence must add new information. No restating. - VOICE: Confident, direct. No hedging phrases like "it's worth noting". - AUDIENCE: Network engineers and IT professionals. Assume technical fluency. - FORMAT: Markdown. Use ## for section headers. Use **bold** for key terms. Do not summarize what you are about to write. Start with the hook directly.""" def extract_ghost_content(html: str) -> tuple[str, str] | None: """Extrahiert (title, clean_text) aus Ghost CMS HTML.""" # Titel aus og:title oder h1 title_match = ( re.search(r'([\s\S]+?)', html) ) title = title_match.group(1).strip() if title_match else "" # Multiline title collapse title = re.sub(r'\s+', ' ', title).strip() # HTML entities in title title = (title.replace('&', '&').replace('<', '<').replace('>', '>') .replace('"', '"').replace(''', "'").replace(''', "'") .replace(' ', ' ').replace('—', '—').replace('–', '–')) # Ghost appends " | Blog" or " – Site Name" — only strip at a pipe or en-dash # surrounded by spaces (require \s+, NOT \s* to avoid cutting hyphenated words) title = re.sub(r"\s+[|–]\s+.+$", "", title).strip() # Ghost CMS Content-Selektoren (in Prioritätsreihenfolge) content_patterns = [ r'
(.*?)\s*(?:(.*?)', r'class="post-full-content[^"]*"[^>]*>(.*?)', r'
(.*?)
\s*(?:
]*>.*?', '', raw_content, flags=re.DOTALL) raw_content = re.sub(r']*>.*?', '', raw_content, flags=re.DOTALL) raw_content = re.sub(r']*>.*?', '', raw_content, flags=re.DOTALL) # HTML → Markdown-ähnliches Format # Headers for level in [6, 5, 4, 3, 2, 1]: hashes = "#" * level raw_content = re.sub( rf']*>(.*?)', lambda m: f"\n{hashes} {re.sub('<[^>]+>', '', m.group(1)).strip()}\n", raw_content, flags=re.DOTALL | re.IGNORECASE, ) # Bold/italic raw_content = re.sub(r']*>(.*?)', r'**\1**', raw_content, flags=re.DOTALL) raw_content = re.sub(r']*>(.*?)', r'**\1**', raw_content, flags=re.DOTALL) raw_content = re.sub(r']*>(.*?)', r'*\1*', raw_content, flags=re.DOTALL) # Lists raw_content = re.sub(r']*>(.*?)', r'\n- \1', raw_content, flags=re.DOTALL) raw_content = re.sub(r'<[uo]l[^>]*>', '\n', raw_content) raw_content = re.sub(r'', '\n', raw_content) # Paragraphs → newlines raw_content = re.sub(r'', '\n', raw_content) raw_content = re.sub(r']*>', '\n', raw_content) raw_content = re.sub(r'

', '\n', raw_content) # Code blocks raw_content = re.sub(r']*>]*>(.*?)', lambda m: f"\n```\n{m.group(1)}\n```\n", raw_content, flags=re.DOTALL) # Alle verbleibenden Tags entfernen raw_content = re.sub(r'<[^>]+>', ' ', raw_content) # HTML entities raw_content = raw_content.replace('&', '&').replace('<', '<').replace('>', '>') raw_content = raw_content.replace(' ', ' ').replace('"', '"').replace(''', "'") raw_content = raw_content.replace('—', '—').replace('–', '–').replace('…', '…') # Whitespace normalisieren raw_content = re.sub(r'\n{3,}', '\n\n', raw_content) raw_content = re.sub(r'[ \t]+', ' ', raw_content) raw_content = re.sub(r'\n ', '\n', raw_content) clean = raw_content.strip() if len(clean.split()) < 200: return None return title, clean def slug_to_topic(slug: str) -> str: """Wandelt URL-Slug in lesbaren Topic-String.""" return slug.replace("-", " ").title() def main() -> None: html_files = sorted(f for f in POSTS_DIR.glob("*.html") if f.stat().st_size > 10_000 and f.stem not in SKIP_SLUGS and re.match(r'^[a-z0-9]', f.stem)) # skip hidden/garbage files print(f"Parsing {len(html_files)} HTML files...") results = [] skipped = 0 seen_slugs: set[str] = set() for fpath in html_files: if fpath.stem in seen_slugs: print(f" SKIP (duplicate slug): {fpath.name}") skipped += 1 continue seen_slugs.add(fpath.stem) html = fpath.read_text(errors="ignore") extracted = extract_ghost_content(html) if extracted is None: print(f" SKIP (no content): {fpath.name}") skipped += 1 continue title, clean_text = extracted word_count = len(clean_text.split()) if not title: title = slug_to_topic(fpath.stem) print(f" OK: {word_count:4d}w | {title[:60]}") # Input-Text: genau wie beim Generieren — topic + audience + reminder # Audience basierend auf Slug-Keywords bestimmen slug = fpath.stem if any(k in slug for k in ["shieldx", "claude", "papercortex", "llm", "slop", "sync"]): audience = "developers and infrastructure engineers building AI-powered tools" elif any(k in slug for k in ["aspa", "bgp", "peercortex", "infrastructure"]): audience = "network engineers and NOC operators" else: audience = "network engineers and IT professionals who evaluate and operate optical infrastructure" input_text = ( f"Write a blog post on the following topic:\n\n" f"**Topic:** {title}\n\n" f"**Target audience:** {audience}\n\n" f"Remember: 700–1000 words, hook + technical sections + 3 takeaways. " f"Stay strictly on-topic. No filler. Start writing now." ) record = { "system_prompt": SYSTEM_PROMPT, "input_text": input_text, "output_text": clean_text, "meta": { "title": title, "slug": slug, "source": "blog.fichtmueller.org", "word_count": word_count, "quality": "human_written", "weight": 3.0, # 3x Gewichtung in Training — Gold Standard "dataset_version": "v8", }, } results.append(record) OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True) with open(OUTPUT_FILE, "w", encoding="utf-8") as f: for r in results: f.write(json.dumps(r, ensure_ascii=False) + "\n") print(f"\nGespeichert: {len(results)} Posts → {OUTPUT_FILE}") print(f"Übersprungen: {skipped}") # Statistik wcs = [r["meta"]["word_count"] for r in results] if wcs: print(f"Word count: min={min(wcs)}, max={max(wcs)}, avg={sum(wcs)//len(wcs)}") if __name__ == "__main__": main()