#!/usr/bin/env python3
"""
seed-blog-training-data.py

Inserts the 15 gold-standard blog training articles from blog-training-data/
into the llm_gateway learning_corpus table as SFT (supervised fine-tuning)
examples.

Task type: fo-blog-v1
Confidence: 9.0 (gold-standard, human-authored)
Status: approved

Usage:
    python3 scripts/seed-blog-training-data.py
    python3 scripts/seed-blog-training-data.py --dry-run
    python3 scripts/seed-blog-training-data.py --db-url postgresql://...
"""

from __future__ import annotations

import argparse
import os
import re
import sys
import uuid
from pathlib import Path

try:
    import psycopg2
    import psycopg2.extras
except ImportError:
    # --dry-run never opens a connection, so the driver is only required on
    # the real insert path; main() reports its absence explicitly there.
    psycopg2 = None

# Task type recorded on every seeded row.
TASK_TYPE = "fo-blog-v1"

# Connection URL used when neither --db-url nor LLM_GATEWAY_DB_URL is given.
# NOTE(review): this embeds a database password in source control. The value
# is unchanged so existing no-argument invocations keep working, but the
# credential should be rotated and supplied through LLM_GATEWAY_DB_URL.
DEFAULT_DB_URL = "postgresql://llm:llm_secure_2026@217.154.82.179:5432/llm_gateway"

# ---------------------------------------------------------------------------
# Blog type → topic prompt mapping
# ---------------------------------------------------------------------------

TOPIC_PROMPTS = {
    "market_alert": "Write a market alert blog post analyzing current pricing trends and market movements in the optical transceiver space. Help readers make timing decisions about when to buy.",
    "technology_deep_dive": "Write a technology deep-dive blog post explaining how a specific optical transceiver technology works, when to use it, and what engineers need to know before deploying it.",
    "tutorial": "Write a practical tutorial blog post giving engineers step-by-step guidance on a specific transceiver deployment, procurement, or troubleshooting topic.",
    "hype_cycle": "Write a hype cycle analysis blog post assessing where a transceiver technology sits in its adoption curve and whether readers should adopt now or wait.",
    "buying_guide": "Write a buying guide blog post helping procurement teams and engineers make informed decisions when purchasing optical transceivers.",
    "migration_guide": "Write a migration guide blog post detailing what actually breaks (and why) when upgrading from one transceiver generation to another.",
    "comparison": "Write a comparison blog post objectively analyzing the differences between two competing transceiver approaches, technologies, or vendors.",
    "new_product": "Write a new product analysis blog post covering what has actually shipped in the 800G/next-gen transceiver space, what is production-ready, and what the deployment realities are.",
    "competitor_analysis": "Write a competitor analysis blog post evaluating the major compatible transceiver vendors: who does proper testing, who has real quality infrastructure, and how to tell the difference.",
}

# System prompt stored alongside every example.
# NOTE(review): kept as one space-separated paragraph; confirm whether the
# section headings and "-" items were meant to sit on separate lines.
SYSTEM_PROMPT = (
    "You are a senior optical network engineer and technical writer with "
    "real field experience in data center, ISP, and DWDM environments. "
    "Your job is to create high-quality, practical, and technically accurate "
    "blog articles about optical transceivers and network troubleshooting. "
    "Do NOT write generic, shallow, or marketing-style content. "
    "Do NOT use buzzwords, filler phrases, or vague explanations. "
    "Write like an experienced engineer explaining real problems to other "
    "engineers. "
    "Your content must: "
    "- Be technically correct and precise "
    "- Include real-world scenarios "
    "- Provide actionable troubleshooting steps "
    "- Explain WHY issues happen, not just WHAT to do "
    "- Include measurements, thresholds, and interpretation "
    "- Reflect field experience (NOC, deployment, escalation cases) "
    "FORMAT RULES: "
    "- Write in flowing paragraphs, not bullet lists "
    "- No markdown headers (##, ###) in the body "
    "- Each section reads like an experienced colleague explaining over "
    "coffee "
    "- One clear thesis per article — do not mix topics "
    "- Target length: 800-1200 words "
    "ANTI-PATTERNS (STRICTLY FORBIDDEN): "
    '- Generic introductions ("In today\'s fast-paced world") '
    '- Empty phrases ("optimize", "leverage", "enhance", "plays a key role") '
    "- Bullet lists as structural elements "
    "- Copy-paste datasheet language "
    "- Surface-level explanations without cause-effect reasoning"
)

# Leading '---' fenced frontmatter block; group 1 is the text between fences.
_FRONTMATTER_RE = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)

_INSERT_SQL = """
    INSERT INTO learning_corpus (
        id, task_type, prompt_text, completion_text,
        input_text, output_text, system_prompt,
        confidence_score, quality_score, status, tags, human_edited
    ) VALUES (
        %(id)s, %(task_type)s, %(prompt_text)s, %(completion_text)s,
        %(input_text)s, %(output_text)s, %(system_prompt)s,
        %(confidence_score)s, %(quality_score)s, 'approved', %(tags)s, true
    )
    ON CONFLICT DO NOTHING
"""


def _frontmatter_value(frontmatter: str, key: str) -> str:
    """Return the value of ``key:`` in *frontmatter*, or "" when absent/empty."""
    # Only spaces/tabs may follow the colon. Allowing any whitespace would let
    # the match cross the newline, so an empty field would swallow the whole
    # next line as its value.
    match = re.search(
        rf'^{re.escape(key)}:[ \t]*"?([^"\n]+)"?', frontmatter, re.MULTILINE
    )
    return match.group(1).strip() if match else ""


def _corpus_entry_id(entry: dict) -> str:
    """Return a UUID derived from the example's content.

    Identical content always maps to the same id, so a re-run can collide on
    ``id`` and be skipped by ``ON CONFLICT DO NOTHING``; a random id never
    collides and would insert a duplicate row every time.
    Assumes ``learning_corpus.id`` is unique — TODO confirm against the schema.
    """
    name = "\n".join((TASK_TYPE, entry["input_text"], entry["output_text"]))
    return str(uuid.uuid5(uuid.NAMESPACE_URL, name))


def parse_article(filepath: Path) -> dict | None:
    """Parse a training article markdown file.

    Args:
        filepath: Markdown file that starts with a ``---`` fenced frontmatter
            block followed by the article body.

    Returns:
        A dict with ``title``, ``topic``, ``target_audience``, ``input_text``,
        ``output_text``, ``score`` and ``filename``; or ``None`` (after
        printing a SKIP line) when the frontmatter, title or body is missing.
    """
    text = filepath.read_text(encoding="utf-8")

    fm_match = _FRONTMATTER_RE.match(text)
    if not fm_match:
        print(f" SKIP {filepath.name}: no frontmatter")
        return None

    frontmatter = fm_match.group(1)
    body = text[fm_match.end():].strip()

    title = _frontmatter_value(frontmatter, "title")
    topic = (
        _frontmatter_value(frontmatter, "type")
        or _frontmatter_value(frontmatter, "category")
        or "analysis"
    )
    target_audience = _frontmatter_value(frontmatter, "target_audience")
    score_str = _frontmatter_value(frontmatter, "score")

    if not title or not body:
        print(f" SKIP {filepath.name}: missing title/body")
        return None

    # Build input_text: the generation request
    topic_prompt = TOPIC_PROMPTS.get(
        topic, f"Write a {topic} blog post about optical transceivers."
    )
    input_text = (
        f"{topic_prompt}\n\nTitle: {title}\n"
        f"Target audience: {target_audience or 'technical'}"
    )

    return {
        "title": title,
        "topic": topic,
        "target_audience": target_audience,
        "input_text": input_text,
        "output_text": body,
        "score": score_str,
        "filename": filepath.name,
    }


def insert_corpus_entry(conn, entry: dict, dry_run: bool = False) -> bool:
    """Insert one SFT example into learning_corpus.

    Args:
        conn: Open database connection; not used when *dry_run* is true.
        entry: Article dict as produced by ``parse_article``.
        dry_run: When true, only print what would be inserted.

    Returns:
        True when a row was inserted (always true for a dry run), False when
        the statement affected no rows.

    Raises:
        Any error raised by the driver, after rolling the transaction back.
    """
    if dry_run:
        print(f" [DRY-RUN] Would insert: {entry['filename']} ({len(entry['output_text'].split())}w)")
        return True

    params = {
        "id": _corpus_entry_id(entry),
        "task_type": TASK_TYPE,
        "prompt_text": entry["input_text"],
        "completion_text": entry["output_text"],
        "input_text": entry["input_text"],
        "output_text": entry["output_text"],
        "system_prompt": SYSTEM_PROMPT,
        "confidence_score": 9.0,
        "quality_score": 9.0,
        "tags": [
            entry["topic"],
            # parse_article always sets this key, possibly to "", so a
            # .get() default would never apply; mirror the input_text fallback.
            entry.get("target_audience") or "technical",
            "gold-standard",
            "blog-training-data",
        ],
    }

    try:
        with conn.cursor() as cur:
            cur.execute(_INSERT_SQL, params)
            inserted = cur.rowcount > 0
        conn.commit()
    except Exception:
        # Do not leave the connection inside a failed transaction.
        conn.rollback()
        raise
    return inserted


def main():
    """Parse every blog-*.md article and insert it into the learning corpus."""
    parser = argparse.ArgumentParser(description="Seed blog training data into learning_corpus")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be inserted without writing")
    parser.add_argument("--db-url", default=None, help="PostgreSQL connection URL (overrides env)")
    args = parser.parse_args()

    # Determine DB URL
    db_url = args.db_url or os.environ.get("LLM_GATEWAY_DB_URL") or DEFAULT_DB_URL

    # Find training data directory
    script_dir = Path(__file__).parent
    repo_root = script_dir.parent
    training_dir = repo_root / "blog-training-data"

    if not training_dir.exists():
        print(f"ERROR: Training data directory not found: {training_dir}")
        sys.exit(1)

    files = sorted(training_dir.glob("blog-*.md"))
    print(f"Found {len(files)} training articles in {training_dir}")
    print()

    # Parse all articles
    articles = []
    for f in files:
        entry = parse_article(f)
        if entry:
            articles.append(entry)
            print(f" OK {f.name}: {entry['topic']} / {len(entry['output_text'].split())}w")

    print(f"\n{len(articles)} articles parsed successfully")
    print()

    if args.dry_run:
        print("=== DRY RUN — no data will be written ===\n")

    # Connect to DB
    conn = None
    if not args.dry_run:
        if psycopg2 is None:
            print("ERROR: psycopg2 is not installed (required unless --dry-run is used)")
            sys.exit(1)
        try:
            conn = psycopg2.connect(db_url)
        except Exception as e:
            print(f"ERROR: Cannot connect to DB: {e}")
            print("Hint: try --db-url or set LLM_GATEWAY_DB_URL env var")
            sys.exit(1)
        print("Connected to LLM gateway DB")

    # Insert
    inserted = 0
    skipped = 0
    try:
        for entry in articles:
            ok = insert_corpus_entry(conn, entry, dry_run=args.dry_run)
            if ok:
                inserted += 1
                if not args.dry_run:
                    print(f" + Inserted: {entry['filename']}")
            else:
                skipped += 1
                if not args.dry_run:
                    print(f" ~ Skipped (already exists): {entry['filename']}")
    finally:
        # Close the connection even when an insert raises part-way through.
        if conn is not None:
            conn.close()

    print(f"\nDone: {inserted} inserted, {skipped} skipped")

    if not args.dry_run and inserted > 0:
        print("\nNext step: trigger fine-tuning")
        print(" cd packages/fine-tuner")
        print(" python3 scripts/manual_trigger.py --task-type fo-blog-v1 --min-examples 10")


if __name__ == "__main__":
    main()