# transceiver-db/scripts/seed-blog-training-data.py

#!/usr/bin/env python3
"""
seed-blog-training-data.py

Inserts the 15 gold-standard blog training articles from blog-training-data/
into the llm_gateway learning_corpus table as SFT (supervised fine-tuning) examples.

Task type: fo-blog-v1
Confidence: 9.0 (gold-standard, human-authored)
Status: approved

Usage:
    python3 scripts/seed-blog-training-data.py
    python3 scripts/seed-blog-training-data.py --dry-run
    python3 scripts/seed-blog-training-data.py --db-url postgresql://...
"""

from __future__ import annotations

import argparse
import os
import re
import sys
import uuid
from pathlib import Path

import psycopg2
import psycopg2.extras

# ---------------------------------------------------------------------------
# Blog type → topic prompt mapping
#
# Keys are compared against the `type` (or, failing that, `category`)
# frontmatter value of each article in parse_article(). A value with no entry
# here falls back to a generic "Write a <type> blog post about optical
# transceivers." prompt built at the lookup site.
# ---------------------------------------------------------------------------

TOPIC_PROMPTS = {
    "market_alert": "Write a market alert blog post analyzing current pricing trends and market movements in the optical transceiver space. Help readers make timing decisions about when to buy.",
    "technology_deep_dive": "Write a technology deep-dive blog post explaining how a specific optical transceiver technology works, when to use it, and what engineers need to know before deploying it.",
    "tutorial": "Write a practical tutorial blog post giving engineers step-by-step guidance on a specific transceiver deployment, procurement, or troubleshooting topic.",
    "hype_cycle": "Write a hype cycle analysis blog post assessing where a transceiver technology sits in its adoption curve and whether readers should adopt now or wait.",
    "buying_guide": "Write a buying guide blog post helping procurement teams and engineers make informed decisions when purchasing optical transceivers.",
    "migration_guide": "Write a migration guide blog post detailing what actually breaks (and why) when upgrading from one transceiver generation to another.",
    "comparison": "Write a comparison blog post objectively analyzing the differences between two competing transceiver approaches, technologies, or vendors.",
    "new_product": "Write a new product analysis blog post covering what has actually shipped in the 800G/next-gen transceiver space, what is production-ready, and what the deployment realities are.",
    "competitor_analysis": "Write a competitor analysis blog post evaluating the major compatible transceiver vendors: who does proper testing, who has real quality infrastructure, and how to tell the difference.",
}

# System prompt written verbatim into the `system_prompt` column of every row
# seeded by insert_corpus_entry(). It is identical for all articles; only the
# per-article input_text/output_text vary.
SYSTEM_PROMPT = """You are a senior optical network engineer and technical writer with real field experience in data center, ISP, and DWDM environments.

Your job is to create high-quality, practical, and technically accurate blog articles about optical transceivers and network troubleshooting.

Do NOT write generic, shallow, or marketing-style content.
Do NOT use buzzwords, filler phrases, or vague explanations.
Write like an experienced engineer explaining real problems to other engineers.

Your content must:
- Be technically correct and precise
- Include real-world scenarios
- Provide actionable troubleshooting steps
- Explain WHY issues happen, not just WHAT to do
- Include measurements, thresholds, and interpretation
- Reflect field experience (NOC, deployment, escalation cases)

FORMAT RULES:
- Write in flowing paragraphs, not bullet lists
- No markdown headers (##, ###) in the body
- Each section reads like an experienced colleague explaining over coffee
- One clear thesis per article — do not mix topics
- Target length: 800-1200 words

ANTI-PATTERNS (STRICTLY FORBIDDEN):
- Generic introductions ("In today's fast-paced world")
- Empty phrases ("optimize", "leverage", "enhance", "plays a key role")
- Bullet lists as structural elements
- Copy-paste datasheet language
- Surface-level explanations without cause-effect reasoning"""


def parse_article(filepath: Path) -> dict | None:
    """Parse a training article markdown file into a corpus entry.

    The file must begin with a ``---``-delimited frontmatter block followed
    by the article body. Frontmatter is read with simple ``key: value``
    line matching (optionally double-quoted values), not a YAML parser.

    Args:
        filepath: Path to the markdown article.

    Returns:
        A dict with the keys ``title``, ``topic``, ``target_audience``,
        ``input_text``, ``output_text``, ``score`` and ``filename``; or
        ``None`` (after printing a SKIP line) when the file has no
        frontmatter, no title, or an empty body.
    """
    # utf-8-sig decodes plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise stop the frontmatter pattern from matching at offset 0
    # and cause the article to be skipped.
    text = filepath.read_text(encoding="utf-8-sig")

    # Extract frontmatter
    fm_match = re.match(r"^---\n(.*?)\n---\n", text, re.DOTALL)
    if not fm_match:
        print(f"  SKIP {filepath.name}: no frontmatter")
        return None

    fm = fm_match.group(1)
    body = text[fm_match.end():].strip()

    def fm_get(key: str) -> str:
        """Return the single-line value of frontmatter field *key*, or ""."""
        # [ \t]* instead of \s*: \s also matches newlines, so an empty field
        # such as "title:" would swallow the line break and capture the NEXT
        # frontmatter line as its value. The key is escaped so it is always
        # matched literally.
        m = re.search(
            rf'^{re.escape(key)}:[ \t]*"?([^"\n]+)"?', fm, re.MULTILINE
        )
        return m.group(1).strip() if m else ""

    title = fm_get("title")
    topic = fm_get("type") or fm_get("category") or "analysis"
    target_audience = fm_get("target_audience")
    score_str = fm_get("score")

    if not title or not body:
        print(f"  SKIP {filepath.name}: missing title/body")
        return None

    # Build input_text: the generation request
    topic_prompt = TOPIC_PROMPTS.get(topic, f"Write a {topic} blog post about optical transceivers.")
    input_text = f"{topic_prompt}\n\nTitle: {title}\nTarget audience: {target_audience or 'technical'}"

    return {
        "title": title,
        "topic": topic,
        "target_audience": target_audience,
        "input_text": input_text,
        "output_text": body,
        "score": score_str,
        "filename": filepath.name,
    }


def insert_corpus_entry(conn, entry: dict, dry_run: bool = False) -> bool:
    """Insert one SFT example into learning_corpus.

    Args:
        conn: Open psycopg2 connection. Not used (may be None) when
            ``dry_run`` is true.
        entry: Article dict as returned by parse_article(); the keys
            ``filename``, ``topic``, ``target_audience``, ``input_text`` and
            ``output_text`` are read.
        dry_run: When true, only print what would be inserted and return.

    Returns:
        True if a row was inserted (or would be, in dry-run mode); False if
        the INSERT was a no-op because of ``ON CONFLICT DO NOTHING``.

    Raises:
        Any exception raised by the database driver; the transaction is
        rolled back before the exception propagates.
    """
    # Dry-run needs none of the SQL parameters, so bail out before building them.
    if dry_run:
        print(f"  [DRY-RUN] Would insert: {entry['filename']} ({len(entry['output_text'].split())}w)")
        return True

    sql = """
        INSERT INTO learning_corpus (
            id,
            task_type,
            prompt_text,
            completion_text,
            input_text,
            output_text,
            system_prompt,
            confidence_score,
            quality_score,
            status,
            tags,
            human_edited
        ) VALUES (
            %(id)s,
            %(task_type)s,
            %(prompt_text)s,
            %(completion_text)s,
            %(input_text)s,
            %(output_text)s,
            %(system_prompt)s,
            %(confidence_score)s,
            %(quality_score)s,
            'approved',
            %(tags)s,
            true
        )
        ON CONFLICT DO NOTHING
    """

    task_type = "fo-blog-v1"

    # Derive the id from task type + source filename. A random uuid4 can never
    # collide on id, so with it ON CONFLICT DO NOTHING would not stop a second
    # run from inserting every article again. A name-based uuid5 is stable
    # across runs, which makes re-seeding a no-op for rows already present.
    row_id = uuid.uuid5(uuid.NAMESPACE_URL, f"{task_type}/{entry['filename']}")

    # parse_article() always sets target_audience, possibly to "", so fall
    # back on falsiness rather than on a missing key to avoid an empty tag.
    audience_tag = entry.get("target_audience") or "technical"

    params = {
        "id": str(row_id),
        "task_type": task_type,
        "prompt_text": entry["input_text"],
        "completion_text": entry["output_text"],
        "input_text": entry["input_text"],
        "output_text": entry["output_text"],
        "system_prompt": SYSTEM_PROMPT,
        "confidence_score": 9.0,
        "quality_score": 9.0,
        "tags": [entry["topic"], audience_tag, "gold-standard", "blog-training-data"],
    }

    try:
        with conn.cursor() as cur:
            cur.execute(sql, params)
            # rowcount is 0 when ON CONFLICT DO NOTHING suppressed the insert.
            inserted = cur.rowcount > 0
        conn.commit()
    except Exception:
        # Leave the connection usable for the caller instead of stuck in an
        # aborted transaction, then let the error surface.
        conn.rollback()
        raise
    return inserted


def main():
    """CLI entry point: parse the training articles and seed learning_corpus.

    Reads ``blog-*.md`` files from ``<repo_root>/blog-training-data`` (the
    repo root being the parent of this script's directory), parses each one,
    and inserts the results. With ``--dry-run`` no database connection is
    opened. Exits with status 1 if the training directory is missing or the
    database connection fails.
    """
    parser = argparse.ArgumentParser(description="Seed blog training data into learning_corpus")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be inserted without writing")
    parser.add_argument("--db-url", default=None, help="PostgreSQL connection URL (overrides env)")
    args = parser.parse_args()

    # Determine DB URL: --db-url, then LLM_GATEWAY_DB_URL, then the default.
    # NOTE(security): the fallback below embeds a password and host in source
    # control. It is kept so existing invocations keep working, but it should
    # be rotated and removed in favour of the env var / --db-url.
    db_url = args.db_url or os.environ.get("LLM_GATEWAY_DB_URL") or \
             "postgresql://llm:llm_secure_2026@217.154.82.179:5432/llm_gateway"

    # Find training data directory
    script_dir = Path(__file__).parent
    repo_root = script_dir.parent
    training_dir = repo_root / "blog-training-data"

    if not training_dir.exists():
        print(f"ERROR: Training data directory not found: {training_dir}")
        sys.exit(1)

    files = sorted(training_dir.glob("blog-*.md"))
    print(f"Found {len(files)} training articles in {training_dir}")
    print()

    # Parse all articles; parse_article() prints its own SKIP line on failure.
    articles = []
    for f in files:
        entry = parse_article(f)
        if entry:
            articles.append(entry)
            print(f"  OK  {f.name}: {entry['topic']} / {len(entry['output_text'].split())}w")

    print(f"\n{len(articles)} articles parsed successfully")
    print()

    # Connect to DB (skipped entirely in dry-run mode)
    conn = None
    if args.dry_run:
        print("=== DRY RUN — no data will be written ===\n")
    else:
        try:
            conn = psycopg2.connect(db_url)
        except Exception as e:
            print(f"ERROR: Cannot connect to DB: {e}")
            print("Hint: try --db-url or set LLM_GATEWAY_DB_URL env var")
            sys.exit(1)
        print("Connected to LLM gateway DB")

    # Insert
    inserted = 0
    skipped = 0
    try:
        for entry in articles:
            ok = insert_corpus_entry(conn, entry, dry_run=args.dry_run)
            if ok:
                inserted += 1
                if not args.dry_run:
                    print(f"  + Inserted: {entry['filename']}")
            else:
                skipped += 1
                if not args.dry_run:
                    print(f"  ~ Skipped (already exists): {entry['filename']}")
    finally:
        # Close the connection even if an insert raises part-way through.
        if conn is not None:
            conn.close()

    print(f"\nDone: {inserted} inserted, {skipped} skipped")
    if not args.dry_run and inserted > 0:
        print("\nNext step: trigger fine-tuning")
        print("  cd packages/fine-tuner")
        print("  python3 scripts/manual_trigger.py --task-type fo-blog-v1 --min-examples 10")


# Run the seeding CLI only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()