# transceiver-db/scripts/seed-blog-training-data.py
#
# 252 lines
# 9.2 KiB
# Python

#!/usr/bin/env python3
"""
seed-blog-training-data.py

Inserts the 15 gold-standard blog training articles from blog-training-data/
into the llm_gateway learning_corpus table as SFT (supervised fine-tuning) examples.

    Task type:  fo-blog-v1
    Confidence: 9.0 (gold-standard, human-authored)
    Status:     approved

Usage:
    python3 scripts/seed-blog-training-data.py
    python3 scripts/seed-blog-training-data.py --dry-run
    python3 scripts/seed-blog-training-data.py --db-url postgresql://...
"""
from __future__ import annotations
import argparse
import os
import re
import sys
import uuid
from pathlib import Path
import psycopg2
import psycopg2.extras
# ---------------------------------------------------------------------------
# Generation prompts, one per blog type.
#
# parse_article() looks an article's frontmatter type up in this table to
# build the request half of the training example; types that are not listed
# here get a generic "Write a <type> blog post ..." prompt instead.
# ---------------------------------------------------------------------------
TOPIC_PROMPTS = {
    "market_alert": (
        "Write a market alert blog post analyzing current pricing trends and market movements in the optical transceiver space. Help readers make timing decisions about when to buy."
    ),
    "technology_deep_dive": (
        "Write a technology deep-dive blog post explaining how a specific optical transceiver technology works, when to use it, and what engineers need to know before deploying it."
    ),
    "tutorial": (
        "Write a practical tutorial blog post giving engineers step-by-step guidance on a specific transceiver deployment, procurement, or troubleshooting topic."
    ),
    "hype_cycle": (
        "Write a hype cycle analysis blog post assessing where a transceiver technology sits in its adoption curve and whether readers should adopt now or wait."
    ),
    "buying_guide": (
        "Write a buying guide blog post helping procurement teams and engineers make informed decisions when purchasing optical transceivers."
    ),
    "migration_guide": (
        "Write a migration guide blog post detailing what actually breaks (and why) when upgrading from one transceiver generation to another."
    ),
    "comparison": (
        "Write a comparison blog post objectively analyzing the differences between two competing transceiver approaches, technologies, or vendors."
    ),
    "new_product": (
        "Write a new product analysis blog post covering what has actually shipped in the 800G/next-gen transceiver space, what is production-ready, and what the deployment realities are."
    ),
    "competitor_analysis": (
        "Write a competitor analysis blog post evaluating the major compatible transceiver vendors: who does proper testing, who has real quality infrastructure, and how to tell the difference."
    ),
}
# System prompt stored with every seeded example: insert_corpus_entry() passes
# it as the `system_prompt` value of each row.
# The text below is runtime data, not documentation; its wording, line breaks
# and lack of indentation are part of the value and are left untouched.
SYSTEM_PROMPT = """You are a senior optical network engineer and technical writer with real field experience in data center, ISP, and DWDM environments.
Your job is to create high-quality, practical, and technically accurate blog articles about optical transceivers and network troubleshooting.
Do NOT write generic, shallow, or marketing-style content.
Do NOT use buzzwords, filler phrases, or vague explanations.
Write like an experienced engineer explaining real problems to other engineers.
Your content must:
- Be technically correct and precise
- Include real-world scenarios
- Provide actionable troubleshooting steps
- Explain WHY issues happen, not just WHAT to do
- Include measurements, thresholds, and interpretation
- Reflect field experience (NOC, deployment, escalation cases)
FORMAT RULES:
- Write in flowing paragraphs, not bullet lists
- No markdown headers (##, ###) in the body
- Each section reads like an experienced colleague explaining over coffee
- One clear thesis per article — do not mix topics
- Target length: 800-1200 words
ANTI-PATTERNS (STRICTLY FORBIDDEN):
- Generic introductions ("In today's fast-paced world")
- Empty phrases ("optimize", "leverage", "enhance", "plays a key role")
- Bullet lists as structural elements
- Copy-paste datasheet language
- Surface-level explanations without cause-effect reasoning"""
def parse_article(filepath: Path) -> dict | None:
"""Parse a training article markdown file."""
text = filepath.read_text(encoding="utf-8")
# Extract frontmatter
fm_match = re.match(r"^---\n(.*?)\n---\n", text, re.DOTALL)
if not fm_match:
print(f" SKIP {filepath.name}: no frontmatter")
return None
fm = fm_match.group(1)
body = text[fm_match.end():].strip()
# Parse frontmatter fields
def fm_get(key: str) -> str:
m = re.search(rf'^{key}:\s*"?([^"\n]+)"?', fm, re.MULTILINE)
return m.group(1).strip() if m else ""
title = fm_get("title")
topic = fm_get("type") or fm_get("category") or "analysis"
target_audience = fm_get("target_audience")
score_str = fm_get("score")
if not title or not body:
print(f" SKIP {filepath.name}: missing title/body")
return None
# Build input_text: the generation request
topic_prompt = TOPIC_PROMPTS.get(topic, f"Write a {topic} blog post about optical transceivers.")
input_text = f"{topic_prompt}\n\nTitle: {title}\nTarget audience: {target_audience or 'technical'}"
return {
"title": title,
"topic": topic,
"target_audience": target_audience,
"input_text": input_text,
"output_text": body,
"score": score_str,
"filename": filepath.name,
}
def insert_corpus_entry(conn, entry: dict, dry_run: bool = False) -> bool:
    """Insert one SFT example into learning_corpus.

    The row id is derived deterministically (UUIDv5) from the task type, the
    prompt and the completion, so seeding the same article twice hits
    ``ON CONFLICT DO NOTHING`` instead of adding a duplicate row. An article
    whose text has changed gets a new id and is inserted as a new row.

    Args:
        conn: Open DB-API connection (psycopg2). Not used when ``dry_run`` is
            true, so ``None`` is acceptable in that case.
        entry: Article dict as produced by ``parse_article``; reads
            ``input_text``, ``output_text``, ``topic``, ``target_audience``
            and (dry-run only) ``filename``.
        dry_run: When true, only print what would be inserted.

    Returns:
        ``True`` if a row was inserted (or would be, in dry-run mode),
        ``False`` if the insert was skipped by the conflict clause.

    Raises:
        Exception: Any error raised by the driver is re-raised after the
            transaction has been rolled back.
    """
    if dry_run:
        print(f" [DRY-RUN] Would insert: {entry['filename']} ({len(entry['output_text'].split())}w)")
        return True

    sql = """
        INSERT INTO learning_corpus (
            id,
            task_type,
            prompt_text,
            completion_text,
            input_text,
            output_text,
            system_prompt,
            confidence_score,
            quality_score,
            status,
            tags,
            human_edited
        ) VALUES (
            %(id)s,
            %(task_type)s,
            %(prompt_text)s,
            %(completion_text)s,
            %(input_text)s,
            %(output_text)s,
            %(system_prompt)s,
            %(confidence_score)s,
            %(quality_score)s,
            'approved',
            %(tags)s,
            true
        )
        ON CONFLICT DO NOTHING
    """
    task_type = "fo-blog-v1"
    # A random uuid4 here would never collide, which made the conflict clause
    # (and the caller's "already exists" reporting) dead code.
    row_id = uuid.uuid5(
        uuid.NAMESPACE_URL,
        f"{task_type}\n{entry['input_text']}\n{entry['output_text']}",
    )
    params = {
        "id": str(row_id),
        "task_type": task_type,
        "prompt_text": entry["input_text"],
        "completion_text": entry["output_text"],
        "input_text": entry["input_text"],
        "output_text": entry["output_text"],
        "system_prompt": SYSTEM_PROMPT,
        "confidence_score": 9.0,
        "quality_score": 9.0,
        # `or`, not a .get() default: the key is normally present but may be
        # an empty string, which must not become an empty tag.
        "tags": [
            entry["topic"],
            entry.get("target_audience") or "technical",
            "gold-standard",
            "blog-training-data",
        ],
    }
    try:
        with conn.cursor() as cur:
            cur.execute(sql, params)
            inserted = cur.rowcount > 0
        conn.commit()
    except Exception:
        # Leave the connection usable for the next entry / for a clean close.
        conn.rollback()
        raise
    return inserted
def main():
    """Command-line entry point: parse the training articles and seed the DB.

    Reads every ``blog-*.md`` file from ``<repo root>/blog-training-data``,
    parses it with ``parse_article`` and inserts the result with
    ``insert_corpus_entry``. With ``--dry-run`` nothing is written and no
    database connection is opened.

    The connection URL comes from ``--db-url`` or, failing that, from the
    ``LLM_GATEWAY_DB_URL`` environment variable. There is deliberately no
    built-in default: credentials do not belong in source control.

    Exits with status 1 when the training directory is missing, when no
    database URL is configured for a real run, or when the connection fails.
    """
    parser = argparse.ArgumentParser(description="Seed blog training data into learning_corpus")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be inserted without writing")
    parser.add_argument("--db-url", default=None, help="PostgreSQL connection URL (overrides env)")
    args = parser.parse_args()

    # Determine DB URL (only needed when we are actually going to write).
    db_url = args.db_url or os.environ.get("LLM_GATEWAY_DB_URL")
    if not args.dry_run and not db_url:
        print("ERROR: No database URL configured")
        print("Hint: try --db-url or set LLM_GATEWAY_DB_URL env var")
        sys.exit(1)

    # Find training data directory: <repo root>/blog-training-data, where the
    # repo root is the parent of the directory holding this script.
    script_dir = Path(__file__).parent
    repo_root = script_dir.parent
    training_dir = repo_root / "blog-training-data"
    if not training_dir.exists():
        print(f"ERROR: Training data directory not found: {training_dir}")
        sys.exit(1)
    files = sorted(training_dir.glob("blog-*.md"))
    print(f"Found {len(files)} training articles in {training_dir}")
    print()

    # Parse all articles; parse_article prints its own SKIP line on failure.
    articles = []
    for f in files:
        entry = parse_article(f)
        if entry:
            articles.append(entry)
            print(f" OK {f.name}: {entry['topic']} / {len(entry['output_text'].split())}w")
    print(f"\n{len(articles)} articles parsed successfully")
    print()

    if args.dry_run:
        print("=== DRY RUN — no data will be written ===\n")

    # Connect to DB (skipped entirely for a dry run).
    conn = None
    if not args.dry_run:
        try:
            conn = psycopg2.connect(db_url)
        except psycopg2.Error as e:
            print(f"ERROR: Cannot connect to DB: {e}")
            print("Hint: try --db-url or set LLM_GATEWAY_DB_URL env var")
            sys.exit(1)
        print("Connected to LLM gateway DB")

    # Insert; the finally clause closes the connection even if an insert raises.
    inserted = 0
    skipped = 0
    try:
        for entry in articles:
            ok = insert_corpus_entry(conn, entry, dry_run=args.dry_run)
            if ok:
                inserted += 1
                if not args.dry_run:
                    print(f" + Inserted: {entry['filename']}")
            else:
                skipped += 1
                if not args.dry_run:
                    print(f" ~ Skipped (already exists): {entry['filename']}")
    finally:
        if conn is not None:
            conn.close()

    print(f"\nDone: {inserted} inserted, {skipped} skipped")
    if not args.dry_run and inserted > 0:
        print("\nNext step: trigger fine-tuning")
        print(" cd packages/fine-tuner")
        print(" python3 scripts/manual_trigger.py --task-type fo-blog-v1 --min-examples 10")


if __name__ == "__main__":
    main()