# llm-gateway/packages/fine-tuner/scripts/label_v7_quality.py

#!/usr/bin/env python3
"""
label_v7_quality.py — Bewertet v7-generierte Blogs und erstellt DPO-Labels

Claude liest jeden generierten Blog und bewertet ihn nach 5 Kriterien:
  1. Wortanzahl (700-1000 = gut, sonst schlecht)
  2. Hook vorhanden (klares Einstiegsproblem)
  3. Technische Tiefe (nicht generisch)
  4. Struktur (## Headers, 3 Takeaways)
  5. Kein Drift (bleibt beim Thema)

Aus gut/schlecht Bewertungen:
  - "gut" + "schlecht" vom gleichen Thema → DPO-Pair
  - Oder: "schlecht" → Claude schreibt bessere Version → DPO-Pair

Output:
  ~/transceiver-training-data/v8-quality-dpo.jsonl

Usage:
  python3 scripts/label_v7_quality.py
  python3 scripts/label_v7_quality.py --input v7-generated-sft.jsonl
  python3 scripts/label_v7_quality.py --max 50 --rewrite-bad
"""

from __future__ import annotations

import argparse
import json
import logging
import re
import subprocess
from pathlib import Path

# Configure logging once at import time: INFO level, "HH:MM:SS [LEVEL] message" lines.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)

# All files this script reads and writes live in one directory under the user's home.
DATA_DIR = Path.home() / "transceiver-training-data"
# Input JSONL used when --input is not given.
DEFAULT_INPUT = DATA_DIR / "v7-generated-sft.jsonl"
# DPO pairs are appended to this JSONL file, one JSON object per line.
OUTPUT_FILE = DATA_DIR / "v8-quality-dpo.jsonl"
# JSON file of the form {"done": [...]} holding the IDs of already-handled items.
PROGRESS_FILE = DATA_DIR / "v8-quality-progress.json"

# Timeout in seconds for the judge subprocess call in judge_blog().
CLAUDE_TIMEOUT = 120

# System prompt handed to the Claude CLI when rewriting a bad post
# (rewrite_for_chosen). process_examples also uses it as the fallback system
# prompt inside emitted DPO prompts when an input item has no "system_prompt".
SYSTEM_PROMPT = """You are an expert technical writer specializing in optical networking, transceiver technology, and network infrastructure.

STRICT CONSTRAINTS — Follow exactly, no exceptions:
- LENGTH: 700–1000 words. Count carefully. Stop at 1000 words maximum.
- STRUCTURE (mandatory, in this order):
  1. HOOK paragraph — 2–3 sentences stating the problem this post addresses
  2. Technical sections — 3–4 H2 sections covering the topic in depth
  3. PRACTICAL TAKEAWAYS — exactly 3 bullet points, actionable
- TOPIC DISCIPLINE: Write ONLY about the exact topic requested. Zero drift.
- NO REPETITION: Every sentence must add new information. No restating.
- VOICE: Confident, direct. No hedging phrases like "it's worth noting".
- AUDIENCE: Network engineers and IT professionals. Assume technical fluency.
- FORMAT: Markdown. Use ## for section headers. Use **bold** for key terms.

Do not summarize what you are about to write. Start with the hook directly."""

# Instruction prefix for the quality judge. judge_blog() appends the blog text
# directly after the trailing blank line and expects the reply to contain a
# single JSON object with the keys listed under "Response format".
JUDGE_PROMPT = """Evaluate this blog post against the following criteria. Respond with ONLY a JSON object — no explanation, no markdown.

Criteria:
1. word_count_ok: true if 700-1000 words (count the words)
2. has_hook: true if first paragraph clearly states a real problem (not generic intro)
3. technical_depth: true if contains specific technical details (numbers, standards, product names)
4. good_structure: true if has ## H2 headers AND ends with 3 bullet point takeaways
5. on_topic: true if stays focused on the exact topic throughout (no generic drift)
6. overall: "good" if at least 4 of 5 criteria pass, otherwise "bad"
7. issues: list of failed criteria names

Response format exactly:
{"word_count_ok": true/false, "has_hook": true/false, "technical_depth": true/false, "good_structure": true/false, "on_topic": true/false, "overall": "good"/"bad", "issues": ["..."]}

Blog post to evaluate:

"""


def load_progress() -> set[str]:
    if not PROGRESS_FILE.exists():
        return set()
    try:
        return set(json.loads(PROGRESS_FILE.read_text()).get("done", []))
    except Exception:
        return set()


def save_progress(done: set[str]) -> None:
    PROGRESS_FILE.write_text(json.dumps({"done": list(done)}))


def _extract_json_object(text: str) -> dict | None:
    """Return the first JSON object embedded in *text*, or None if there is none."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        # Not valid JSON from this brace: try the next one.
        start = text.find("{", start + 1)
    return None


def judge_blog(blog_text: str) -> dict | None:
    """Ask Claude to evaluate the quality of a blog post.

    Runs the ``claude`` CLI with ``JUDGE_PROMPT`` followed by the blog text and
    parses the JSON object out of its reply. Surrounding text (for example a
    markdown code fence) is tolerated.

    Args:
        blog_text: The blog post to evaluate.

    Returns:
        The parsed judgment dict, or None when the CLI could not be run, timed
        out, exited non-zero, produced no output, or the reply contained no
        JSON object. The reason is logged as a warning.
    """
    prompt = JUDGE_PROMPT + blog_text

    try:
        result = subprocess.run(
            ["claude", "--print", "-p", prompt],
            capture_output=True, text=True, timeout=CLAUDE_TIMEOUT,
        )
    except subprocess.TimeoutExpired:
        # Do not format the exception itself: its message embeds the whole prompt.
        logger.warning("  Judge call timed out after %ds", CLAUDE_TIMEOUT)
        return None
    except (OSError, ValueError) as exc:
        # OSError: CLI missing / not executable / argument list too long.
        # ValueError: e.g. a NUL byte in the prompt or undecodable output.
        logger.warning("  Judge call failed: %s", exc)
        return None

    output = result.stdout.strip()
    if result.returncode != 0 or not output:
        logger.warning(
            "  Judge call failed: exit code %d, stderr: %s",
            result.returncode, result.stderr.strip()[:200],
        )
        return None

    judgment = _extract_json_object(output)
    if judgment is None:
        logger.warning("  Judge reply contained no JSON object")
    return judgment


def quick_check(blog_text: str) -> dict:
    """Classify a blog post with cheap deterministic checks (no Claude call).

    Args:
        blog_text: The blog post in Markdown.

    Returns:
        A dict with these keys:

        - ``word_count``: number of whitespace-separated tokens.
        - ``has_h2``: True if some line starts with ``"## "``.
        - ``has_bullets``: True if at least one bullet line was found.
        - ``bullet_count``: number of lines starting with ``-``, ``*`` or ``•``
          followed by a space.
        - ``clear_bad``: True if the post has fewer than 500 or more than 1500
          words, or has no H2 header.
        - ``clear_good``: True if the post has 700-1050 words, an H2 header, at
          least 3 bullet lines, and does not open with ``"In this "``.
        - ``needs_claude``: True if the post is neither clearly bad nor
          clearly good.
    """
    word_count = len(blog_text.split())
    has_h2 = re.search(r'^## ', blog_text, re.MULTILINE) is not None
    bullet_count = len(re.findall(r'^[-*•] ', blog_text, re.MULTILINE))
    starts_ok = not blog_text.strip().startswith("In this ")

    # Deterministic verdicts; everything in between is left to the judge.
    clear_bad = word_count < 500 or word_count > 1500 or not has_h2
    clear_good = (
        700 <= word_count <= 1050
        and has_h2
        and bullet_count >= 3
        and starts_ok
    )

    return {
        "word_count": word_count,
        "has_h2": has_h2,
        "has_bullets": bullet_count > 0,
        "bullet_count": bullet_count,
        "clear_bad": clear_bad,
        "clear_good": clear_good,
        "needs_claude": not clear_bad and not clear_good,
    }


def rewrite_for_chosen(title: str, input_text: str) -> str | None:
    """Use Claude to write a high-quality version (the 'chosen' half of DPO pair)."""
    try:
        result = subprocess.run(
            ["claude", "--print", "--system-prompt", SYSTEM_PROMPT, "-p", input_text],
            capture_output=True, text=True, timeout=180,
        )
        if result.returncode != 0 or not result.stdout.strip():
            return None
        output = result.stdout.strip()
        wc = len(output.split())
        if wc < 400 or wc > 1500:
            return None
        return output
    except Exception:
        return None


def _as_text(value: object) -> str:
    """Return *value* unchanged if it is a string, otherwise an empty string."""
    return value if isinstance(value, str) else ""


def _build_dpo_pair(
    system: str,
    input_text: str,
    chosen: str,
    rejected: str,
    topic: str,
    verdict: str,
) -> dict:
    """Assemble one DPO record (ChatML-style prompt, chosen, rejected, metadata)."""
    prompt = (
        f"<|im_start|>system\n{system}<|im_end|>\n"
        f"<|im_start|>user\n{input_text}<|im_end|>\n"
    )
    return {
        "prompt": prompt,
        "chosen": chosen,
        "rejected": rejected,
        "meta": {
            "topic": topic,
            "rejection_strategy": "quality_labeled_bad",
            "chosen_words": len(chosen.split()),
            "rejected_words": len(rejected.split()),
            "verdict": verdict,
            "dataset_version": "v8",
        },
    }


def process_examples(
    input_file: Path,
    max_items: int | None,
    rewrite_bad: bool,
) -> None:
    """Label every example in a JSONL file and optionally emit DPO pairs.

    Each example is classified as good or bad, first with ``quick_check`` and,
    when that is inconclusive, with ``judge_blog``. With ``rewrite_bad`` set, a
    bad post is rewritten via ``rewrite_for_chosen`` and the (rewritten,
    original) pair is appended to ``OUTPUT_FILE``. Handled items are recorded
    through ``save_progress`` under the ID ``"<input stem>:<index>"`` and are
    skipped on later runs.

    Args:
        input_file: JSONL file with one example object per line. The keys read
            are ``meta.topic``, ``output_text``, ``input_text`` and
            ``system_prompt``.
        max_items: Process at most this many examples; None means no limit.
        rewrite_bad: Whether to rewrite bad posts and write DPO pairs.

    Notes:
        NOTE(review): an item is recorded as done even when the judge call or
        the rewrite failed, and bad items are recorded as done in runs without
        ``rewrite_bad``; a later ``--rewrite-bad`` run therefore skips them.
        Confirm that this is intended.
    """
    if not input_file.exists():
        logger.error("Input file not found: %s", input_file)
        return

    examples = []
    malformed = 0
    with open(input_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                examples.append(json.loads(line))
            except json.JSONDecodeError:
                malformed += 1
    if malformed:
        logger.warning("Ignored %d malformed JSON line(s) in %s", malformed, input_file.name)

    # "is not None" so that an explicit limit of 0 is honoured instead of
    # being treated as "no limit".
    if max_items is not None:
        examples = examples[:max(max_items, 0)]

    total = len(examples)
    logger.info("Processing %d examples from %s", total, input_file.name)

    done = load_progress()
    stats = {"good": 0, "bad": 0, "dpo_pairs": 0, "skipped": 0}

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

    with open(OUTPUT_FILE, "a", encoding="utf-8") as out_f:
        for i, item in enumerate(examples):
            item_id = f"{input_file.stem}:{i}"
            if item_id in done:
                stats["skipped"] += 1
                continue

            if not isinstance(item, dict):
                # Valid JSON but not an example object; it keeps its index so
                # the IDs of the following items stay stable.
                logger.warning("[%03d/%03d] SKIP (record is not a JSON object)", i + 1, total)
                stats["skipped"] += 1
                continue

            # Tolerate missing, null or non-string fields instead of crashing
            # in the middle of a long run.
            meta = item.get("meta")
            topic = _as_text(meta.get("topic") if isinstance(meta, dict) else None)[:60]
            output_text = _as_text(item.get("output_text"))
            input_text = _as_text(item.get("input_text"))
            system = item.get("system_prompt")
            if not isinstance(system, str):
                system = SYSTEM_PROMPT

            # ── Fast pre-check ──
            check = quick_check(output_text)
            logger.info("[%03d/%03d] %dw | %s", i + 1, total, check["word_count"], topic)

            if check["clear_good"]:
                verdict = "good"
                logger.info("  → CLEAR GOOD (deterministic)")
            elif check["clear_bad"]:
                verdict = "bad"
                logger.info("  → CLEAR BAD (deterministic: %dw, h2=%s)", check["word_count"], check["has_h2"])
            else:
                # Inconclusive: ask Claude to judge.
                judgment = judge_blog(output_text)
                if judgment is None:
                    logger.warning("  → SKIP (judge failed)")
                    done.add(item_id)
                    stats["skipped"] += 1
                    continue
                # Normalise the model's answer: "Good" / " good " count as
                # good; anything else, including a missing or non-string
                # value, counts as bad.
                overall = str(judgment.get("overall", "bad")).strip().lower()
                verdict = "good" if overall == "good" else "bad"
                issues = judgment.get("issues", [])
                logger.info("  → %s | issues: %s", verdict.upper(), issues)

            if verdict == "good":
                stats["good"] += 1
            else:
                stats["bad"] += 1

                if rewrite_bad:
                    # Create DPO pair: bad original → rewritten chosen
                    logger.info("  Rewriting bad post for DPO...")
                    chosen = rewrite_for_chosen(topic, input_text)
                    if chosen and chosen != output_text:
                        pair = _build_dpo_pair(
                            system, input_text, chosen, output_text, topic, verdict,
                        )
                        out_f.write(json.dumps(pair, ensure_ascii=False) + "\n")
                        # Flush per pair so an interrupted run keeps what it paid for.
                        out_f.flush()
                        stats["dpo_pairs"] += 1
                        logger.info("  DPO pair saved: %dw chosen vs %dw rejected",
                                    len(chosen.split()), check["word_count"])

            done.add(item_id)
            save_progress(done)

    logger.info(
        "Done: good=%d bad=%d dpo_pairs=%d skipped=%d",
        stats["good"], stats["bad"], stats["dpo_pairs"], stats["skipped"],
    )
    logger.info("Output: %s", OUTPUT_FILE)

    # Summary over the whole output file, including pairs from earlier runs.
    # Read with the same encoding the file is written with.
    if OUTPUT_FILE.exists():
        with open(OUTPUT_FILE, encoding="utf-8") as f:
            total_pairs = sum(1 for _ in f)
        logger.info("Total DPO pairs in output: %d", total_pairs)


def main() -> None:
    """Parse the command-line options and run the labeling pass."""
    cli = argparse.ArgumentParser(description="Label v7 posts for quality DPO pairs")

    # One (flag, keyword arguments) entry per supported option.
    option_specs = (
        ("--input", {
            "type": Path,
            "default": DEFAULT_INPUT,
            "help": f"Input JSONL with SFT examples (default: {DEFAULT_INPUT})",
        }),
        ("--max", {
            "type": int,
            "default": None,
            "help": "Max examples to process",
        }),
        ("--rewrite-bad", {
            "action": "store_true",
            "help": "Rewrite bad posts with Claude to create DPO pairs (slower, costs more)",
        }),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    opts = cli.parse_args()
    process_examples(
        input_file=opts.input,
        max_items=opts.max,
        rewrite_bad=opts.rewrite_bad,
    )


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()