llm-gateway/packages/fine-tuner/scripts/parse_real_posts.py

#!/usr/bin/env python3
"""
parse_real_posts.py — Konvertiert echte Blog-HTML → v8 SFT Trainingsdaten

Extrahiert Titel + Inhalt aus Ghost CMS HTML, baut daraus hochwertige
Training-Beispiele mit dem v7/v8 System Prompt.

Diese echten Posts sind GOLD — Rene's eigene Stimme, echte Expertise,
keine AI-Halluzinationen. Werden als Top-Priorität in v8 gewichtet.

Input:  ~/transceiver-training-data/v8-real-posts/*.html
Output: ~/transceiver-training-data/v8-real-posts-sft.jsonl
"""

from __future__ import annotations

import json
import re
from pathlib import Path

# Input directory holding the blog HTML files and the JSONL file that main()
# writes; both are resolved relative to the current user's home directory.
POSTS_DIR = Path.home() / "transceiver-training-data" / "v8-real-posts"
OUTPUT_FILE = Path.home() / "transceiver-training-data" / "v8-real-posts-sft.jsonl"

# Which posts are relevant for blog training (not purely personal/tool-specific)?
# Take all of them — even non-transceiver posts show writing style + structure.
SKIP_SLUGS: set[str] = set()  # skip nothing — all posts show Rene's voice

# System prompt stored verbatim in the "system_prompt" field of every record
# that main() emits. It is runtime data: do not reformat or re-wrap the text.
SYSTEM_PROMPT = """You are an expert technical writer specializing in optical networking, transceiver technology, and network infrastructure.

STRICT CONSTRAINTS — Follow exactly, no exceptions:
- LENGTH: 700–1000 words. Count carefully. Stop at 1000 words maximum.
- STRUCTURE (mandatory, in this order):
  1. HOOK paragraph — 2–3 sentences stating the problem this post addresses
  2. Technical sections — 3–4 H2 sections covering the topic in depth
  3. PRACTICAL TAKEAWAYS — exactly 3 bullet points, actionable
- TOPIC DISCIPLINE: Write ONLY about the exact topic requested. Zero drift.
- NO REPETITION: Every sentence must add new information. No restating.
- VOICE: Confident, direct. No hedging phrases like "it's worth noting".
- AUDIENCE: Network engineers and IT professionals. Assume technical fluency.
- FORMAT: Markdown. Use ## for section headers. Use **bold** for key terms.

Do not summarize what you are about to write. Start with the hook directly."""


def extract_ghost_content(html: str) -> tuple[str, str] | None:
    """Extrahiert (title, clean_text) aus Ghost CMS HTML."""

    # Titel aus og:title oder h1
    title_match = (
        re.search(r'<meta property="og:title" content="([^"]+)"', html)
        or re.search(r'<title>([\s\S]+?)</title>', html)
    )
    title = title_match.group(1).strip() if title_match else ""
    # Multiline title collapse
    title = re.sub(r'\s+', ' ', title).strip()
    # HTML entities in title
    title = (title.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>')
             .replace('&quot;', '"').replace('&#39;', "'").replace('&#x27;', "'")
             .replace('&nbsp;', ' ').replace('&mdash;', '—').replace('&ndash;', '–'))
    # Ghost appends " | Blog" or " – Site Name" — only strip at a pipe or en-dash
    # surrounded by spaces (require \s+, NOT \s* to avoid cutting hyphenated words)
    title = re.sub(r"\s+[|–]\s+.+$", "", title).strip()

    # Ghost CMS Content-Selektoren (in Prioritätsreihenfolge)
    content_patterns = [
        r'<div class="gh-content[^"]*">(.*?)</(?:div|section)>\s*(?:<div|<section|<footer|<aside)',
        r'<section class="gh-content[^"]*">(.*?)</section>',
        r'class="post-full-content[^"]*"[^>]*>(.*?)</(?:div|section)>',
        r'<div class="post-content[^"]*">(.*?)</div>\s*(?:<div class="post|<section|<footer)',
        r'<article[^>]*>(.*?)</article>',
    ]

    raw_content = ""
    for pat in content_patterns:
        m = re.search(pat, html, re.DOTALL | re.IGNORECASE)
        if m:
            raw_content = m.group(1)
            break

    if not raw_content:
        return None

    # Cleanup
    raw_content = re.sub(r'<script[^>]*>.*?</script>', '', raw_content, flags=re.DOTALL)
    raw_content = re.sub(r'<style[^>]*>.*?</style>', '', raw_content, flags=re.DOTALL)
    raw_content = re.sub(r'<noscript[^>]*>.*?</noscript>', '', raw_content, flags=re.DOTALL)

    # HTML → Markdown-ähnliches Format
    # Headers
    for level in [6, 5, 4, 3, 2, 1]:
        hashes = "#" * level
        raw_content = re.sub(
            rf'<h{level}[^>]*>(.*?)</h{level}>',
            lambda m: f"\n{hashes} {re.sub('<[^>]+>', '', m.group(1)).strip()}\n",
            raw_content,
            flags=re.DOTALL | re.IGNORECASE,
        )

    # Bold/italic
    raw_content = re.sub(r'<strong[^>]*>(.*?)</strong>', r'**\1**', raw_content, flags=re.DOTALL)
    raw_content = re.sub(r'<b[^>]*>(.*?)</b>', r'**\1**', raw_content, flags=re.DOTALL)
    raw_content = re.sub(r'<em[^>]*>(.*?)</em>', r'*\1*', raw_content, flags=re.DOTALL)

    # Lists
    raw_content = re.sub(r'<li[^>]*>(.*?)</li>', r'\n- \1', raw_content, flags=re.DOTALL)
    raw_content = re.sub(r'<[uo]l[^>]*>', '\n', raw_content)
    raw_content = re.sub(r'</[uo]l>', '\n', raw_content)

    # Paragraphs → newlines
    raw_content = re.sub(r'<br\s*/?>', '\n', raw_content)
    raw_content = re.sub(r'<p[^>]*>', '\n', raw_content)
    raw_content = re.sub(r'</p>', '\n', raw_content)

    # Code blocks
    raw_content = re.sub(r'<pre[^>]*><code[^>]*>(.*?)</code></pre>',
                         lambda m: f"\n```\n{m.group(1)}\n```\n",
                         raw_content, flags=re.DOTALL)

    # Alle verbleibenden Tags entfernen
    raw_content = re.sub(r'<[^>]+>', ' ', raw_content)

    # HTML entities
    raw_content = raw_content.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>')
    raw_content = raw_content.replace('&nbsp;', ' ').replace('&quot;', '"').replace('&#39;', "'")
    raw_content = raw_content.replace('&mdash;', '—').replace('&ndash;', '–').replace('&hellip;', '…')

    # Whitespace normalisieren
    raw_content = re.sub(r'\n{3,}', '\n\n', raw_content)
    raw_content = re.sub(r'[ \t]+', ' ', raw_content)
    raw_content = re.sub(r'\n ', '\n', raw_content)
    clean = raw_content.strip()

    if len(clean.split()) < 200:
        return None

    return title, clean


def slug_to_topic(slug: str) -> str:
    """Turn a URL slug into a human-readable, title-cased topic string."""
    words = slug.split("-")
    spaced = " ".join(words)
    return spaced.title()


def main() -> None:
    """Convert the HTML posts in ``POSTS_DIR`` into SFT records in ``OUTPUT_FILE``.

    A file is processed when it is larger than 10,000 bytes, its stem is not
    in ``SKIP_SLUGS`` and the stem starts with a lowercase letter or a digit.
    Each file is passed to ``extract_ghost_content``; files for which that
    returns ``None`` are counted as skipped. Every remaining post becomes one
    JSON line with the system prompt, a generated instruction, the cleaned
    post text and a ``meta`` dict. Progress and word-count statistics are
    printed to stdout.
    """
    html_files = sorted(f for f in POSTS_DIR.glob("*.html")
                        if f.stat().st_size > 10_000
                        and f.stem not in SKIP_SLUGS
                        and re.match(r'^[a-z0-9]', f.stem))  # skip hidden/garbage files

    print(f"Parsing {len(html_files)} HTML files...")

    results = []
    skipped = 0
    seen_slugs: set[str] = set()

    for fpath in html_files:
        # Safeguard against emitting the same slug twice.
        if fpath.stem in seen_slugs:
            print(f"  SKIP (duplicate slug): {fpath.name}")
            skipped += 1
            continue
        seen_slugs.add(fpath.stem)

        # Decode explicitly as UTF-8 instead of the locale's default encoding,
        # which would garble non-ASCII text on non-UTF-8 systems. Undecodable
        # bytes are still dropped (best effort), as before.
        html = fpath.read_text(encoding="utf-8", errors="ignore")
        extracted = extract_ghost_content(html)

        if extracted is None:
            print(f"  SKIP (no content): {fpath.name}")
            skipped += 1
            continue

        title, clean_text = extracted
        word_count = len(clean_text.split())

        # Fall back to a title derived from the file name
        if not title:
            title = slug_to_topic(fpath.stem)

        print(f"  OK: {word_count:4d}w | {title[:60]}")

        # Input text: exactly as at generation time — topic + audience + reminder.
        # The audience is chosen from keywords contained in the slug.
        slug = fpath.stem
        if any(k in slug for k in ["shieldx", "claude", "papercortex", "llm", "slop", "sync"]):
            audience = "developers and infrastructure engineers building AI-powered tools"
        elif any(k in slug for k in ["aspa", "bgp", "peercortex", "infrastructure"]):
            audience = "network engineers and NOC operators"
        else:
            audience = "network engineers and IT professionals who evaluate and operate optical infrastructure"

        input_text = (
            f"Write a blog post on the following topic:\n\n"
            f"**Topic:** {title}\n\n"
            f"**Target audience:** {audience}\n\n"
            f"Remember: 700–1000 words, hook + technical sections + 3 takeaways. "
            f"Stay strictly on-topic. No filler. Start writing now."
        )

        record = {
            "system_prompt": SYSTEM_PROMPT,
            "input_text": input_text,
            "output_text": clean_text,
            "meta": {
                "title": title,
                "slug": slug,
                "source": "blog.fichtmueller.org",
                "word_count": word_count,
                "quality": "human_written",
                "weight": 3.0,   # 3x weighting in training — gold standard
                "dataset_version": "v8",
            },
        }
        results.append(record)

    # One JSON object per line; ensure_ascii=False keeps non-ASCII text readable
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for r in results:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

    print(f"\nGespeichert: {len(results)} Posts → {OUTPUT_FILE}")
    print(f"Übersprungen: {skipped}")

    # Statistics
    wcs = [r["meta"]["word_count"] for r in results]
    if wcs:
        print(f"Word count: min={min(wcs)}, max={max(wcs)}, avg={sum(wcs)//len(wcs)}")


if __name__ == "__main__":
    main()