#!/usr/bin/env python3
"""
seed-blog-training-data.py

Inserts the 15 gold-standard blog training articles from blog-training-data/
into the llm_gateway learning_corpus table as SFT (supervised fine-tuning) examples.

Task type: fo-blog-v1
Confidence: 9.0 (gold-standard, human-authored)
Status: approved

Usage:
    python3 scripts/seed-blog-training-data.py
    python3 scripts/seed-blog-training-data.py --dry-run
    python3 scripts/seed-blog-training-data.py --db-url postgresql://...
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
import sys
|
|
import uuid
|
|
from pathlib import Path
|
|
|
|
import psycopg2
|
|
import psycopg2.extras
|
|
|
|
# ---------------------------------------------------------------------------
# Blog type → topic prompt mapping
# ---------------------------------------------------------------------------

# Keyed by the article's frontmatter type; each value is the generation
# request that opens the SFT example's input text. Long prompts are wrapped
# with implicit string concatenation, so every fragment except the last in a
# group ends with the separating space.
TOPIC_PROMPTS = dict(
    market_alert=(
        "Write a market alert blog post analyzing current pricing trends and "
        "market movements in the optical transceiver space. "
        "Help readers make timing decisions about when to buy."
    ),
    technology_deep_dive=(
        "Write a technology deep-dive blog post explaining how a specific "
        "optical transceiver technology works, when to use it, and what "
        "engineers need to know before deploying it."
    ),
    tutorial=(
        "Write a practical tutorial blog post giving engineers step-by-step "
        "guidance on a specific transceiver deployment, procurement, or "
        "troubleshooting topic."
    ),
    hype_cycle=(
        "Write a hype cycle analysis blog post assessing where a transceiver "
        "technology sits in its adoption curve and whether readers should "
        "adopt now or wait."
    ),
    buying_guide=(
        "Write a buying guide blog post helping procurement teams and "
        "engineers make informed decisions when purchasing optical "
        "transceivers."
    ),
    migration_guide=(
        "Write a migration guide blog post detailing what actually breaks "
        "(and why) when upgrading from one transceiver generation to another."
    ),
    comparison=(
        "Write a comparison blog post objectively analyzing the differences "
        "between two competing transceiver approaches, technologies, or "
        "vendors."
    ),
    new_product=(
        "Write a new product analysis blog post covering what has actually "
        "shipped in the 800G/next-gen transceiver space, what is "
        "production-ready, and what the deployment realities are."
    ),
    competitor_analysis=(
        "Write a competitor analysis blog post evaluating the major "
        "compatible transceiver vendors: who does proper testing, who has "
        "real quality infrastructure, and how to tell the difference."
    ),
)
|
|
|
|
# System prompt stored with every seeded example. Assembled line by line:
# each tuple element is one line of the prompt and "" marks a blank line
# between sections.
SYSTEM_PROMPT = "\n".join(
    (
        (
            "You are a senior optical network engineer and technical writer "
            "with real field experience in data center, ISP, and DWDM "
            "environments."
        ),
        "",
        (
            "Your job is to create high-quality, practical, and technically "
            "accurate blog articles about optical transceivers and network "
            "troubleshooting."
        ),
        "",
        "Do NOT write generic, shallow, or marketing-style content.",
        "Do NOT use buzzwords, filler phrases, or vague explanations.",
        (
            "Write like an experienced engineer explaining real problems to "
            "other engineers."
        ),
        "",
        "Your content must:",
        "- Be technically correct and precise",
        "- Include real-world scenarios",
        "- Provide actionable troubleshooting steps",
        "- Explain WHY issues happen, not just WHAT to do",
        "- Include measurements, thresholds, and interpretation",
        "- Reflect field experience (NOC, deployment, escalation cases)",
        "",
        "FORMAT RULES:",
        "- Write in flowing paragraphs, not bullet lists",
        "- No markdown headers (##, ###) in the body",
        (
            "- Each section reads like an experienced colleague explaining "
            "over coffee"
        ),
        "- One clear thesis per article — do not mix topics",
        "- Target length: 800-1200 words",
        "",
        "ANTI-PATTERNS (STRICTLY FORBIDDEN):",
        '- Generic introductions ("In today\'s fast-paced world")',
        '- Empty phrases ("optimize", "leverage", "enhance", "plays a key role")',
        "- Bullet lists as structural elements",
        "- Copy-paste datasheet language",
        "- Surface-level explanations without cause-effect reasoning",
    )
)
|
|
|
|
|
|
def _frontmatter_value(frontmatter: str, key: str) -> str:
    """Return the single-line value of ``key`` in ``frontmatter``, or ""."""
    # ``[ \t]*`` rather than ``\s*``: whitespace after the colon must not run
    # across the newline, otherwise an empty ``key:`` would swallow the next
    # line and report it as this key's value.
    match = re.search(
        rf"^{re.escape(key)}:[ \t]*(.*)$", frontmatter, re.MULTILINE
    )
    if match is None:
        return ""
    value = match.group(1).strip()
    # Strip one pair of matching surrounding quotes; quotes inside the value
    # are kept instead of truncating the value at the first one.
    if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
        value = value[1:-1].strip()
    return value


def parse_article(filepath: Path) -> dict | None:
    """Parse a training article markdown file.

    The file must start with a ``---`` delimited frontmatter block followed
    by the article body. Only flat ``key: value`` lines are read from the
    frontmatter (``title``, ``type``/``category``, ``target_audience``,
    ``score``); this is not a YAML parser.

    Args:
        filepath: Path of the markdown file to read (decoded as UTF-8, a
            leading BOM is tolerated).

    Returns:
        A dict with the keys ``title``, ``topic``, ``target_audience``,
        ``input_text``, ``output_text``, ``score`` (raw string, possibly
        empty) and ``filename``; or ``None`` when the file has no
        frontmatter, no title, or an empty body. A one-line SKIP notice is
        printed in the ``None`` cases.
    """
    # utf-8-sig drops a leading BOM, and CRLF is normalised, so files saved
    # by Windows editors still match the "\n"-anchored frontmatter pattern.
    text = filepath.read_text(encoding="utf-8-sig").replace("\r\n", "\n")

    # Extract frontmatter
    fm_match = re.match(r"^---\n(.*?)\n---[ \t]*(?:\n|\Z)", text, re.DOTALL)
    if not fm_match:
        print(f" SKIP {filepath.name}: no frontmatter")
        return None

    fm = fm_match.group(1)
    body = text[fm_match.end():].strip()

    # Parse frontmatter fields
    title = _frontmatter_value(fm, "title")
    topic = (
        _frontmatter_value(fm, "type")
        or _frontmatter_value(fm, "category")
        or "analysis"
    )
    target_audience = _frontmatter_value(fm, "target_audience")
    score_str = _frontmatter_value(fm, "score")

    if not title or not body:
        print(f" SKIP {filepath.name}: missing title/body")
        return None

    # Build input_text: the generation request. Unknown topics fall back to
    # a generic prompt built from the topic name itself.
    topic_prompt = TOPIC_PROMPTS.get(
        topic, f"Write a {topic} blog post about optical transceivers."
    )
    input_text = (
        f"{topic_prompt}\n\nTitle: {title}\n"
        f"Target audience: {target_audience or 'technical'}"
    )

    return {
        "title": title,
        "topic": topic,
        "target_audience": target_audience,
        "input_text": input_text,
        "output_text": body,
        "score": score_str,
        "filename": filepath.name,
    }
|
|
|
|
|
|
def insert_corpus_entry(conn, entry: dict, dry_run: bool = False) -> bool:
    """Insert one SFT example into learning_corpus.

    Args:
        conn: Open psycopg2 connection. Not used when ``dry_run`` is true,
            so ``None`` is acceptable in that case.
        entry: Article dict as produced by ``parse_article``; reads
            ``input_text``, ``output_text``, ``topic``, ``target_audience``
            and ``filename``.
        dry_run: When true, only print what would be inserted.

    Returns:
        ``True`` if a row was inserted (always ``True`` for a dry run),
        ``False`` if the insert was skipped by ``ON CONFLICT DO NOTHING``.

    Raises:
        Any database error raised by psycopg2; the transaction is rolled
        back before the error propagates so the connection stays usable.
    """
    if dry_run:
        # Nothing below is needed for a dry run, so return before building
        # the statement parameters.
        print(f" [DRY-RUN] Would insert: {entry['filename']} ({len(entry['output_text'].split())}w)")
        return True

    sql = """
        INSERT INTO learning_corpus (
            id,
            task_type,
            prompt_text,
            completion_text,
            input_text,
            output_text,
            system_prompt,
            confidence_score,
            quality_score,
            status,
            tags,
            human_edited
        ) VALUES (
            %(id)s,
            %(task_type)s,
            %(prompt_text)s,
            %(completion_text)s,
            %(input_text)s,
            %(output_text)s,
            %(system_prompt)s,
            %(confidence_score)s,
            %(quality_score)s,
            'approved',
            %(tags)s,
            true
        )
        ON CONFLICT DO NOTHING
    """

    task_type = "fo-blog-v1"

    # Deterministic id: a random uuid4 never collides, so ON CONFLICT DO
    # NOTHING could not detect a re-run and every invocation would add a
    # duplicate row. Deriving the id from task type + source file makes
    # re-seeding idempotent.
    # NOTE(review): assumes `id` carries the primary-key/unique constraint;
    # the table schema is not visible from this script — confirm.
    identity = entry.get("filename") or entry["input_text"]
    entry_id = uuid.uuid5(uuid.NAMESPACE_URL, f"{task_type}/{identity}")

    # parse_article always sets target_audience (possibly to ""), so a
    # dict.get default would never apply; use `or` for the fallback.
    audience_tag = entry.get("target_audience") or "technical"

    params = {
        "id": str(entry_id),
        "task_type": task_type,
        "prompt_text": entry["input_text"],
        "completion_text": entry["output_text"],
        "input_text": entry["input_text"],
        "output_text": entry["output_text"],
        "system_prompt": SYSTEM_PROMPT,
        "confidence_score": 9.0,
        "quality_score": 9.0,
        "tags": [entry["topic"], audience_tag, "gold-standard", "blog-training-data"],
    }

    try:
        with conn.cursor() as cur:
            cur.execute(sql, params)
            # rowcount is 0 when ON CONFLICT DO NOTHING suppressed the insert.
            inserted = cur.rowcount > 0
        conn.commit()
    except Exception:
        # Leave the connection out of the aborted-transaction state so the
        # caller can keep using (or cleanly close) it, then re-raise.
        conn.rollback()
        raise
    return inserted
|
|
|
|
|
|
def main():
    """Parse every ``blog-*.md`` article and seed it into learning_corpus.

    Command-line flags:
        --dry-run  Parse and report only; no database connection is opened.
        --db-url   PostgreSQL connection URL; takes precedence over the
                   ``LLM_GATEWAY_DB_URL`` environment variable.

    Exits with status 1 when the training data directory is missing or the
    database connection fails.
    """
    parser = argparse.ArgumentParser(description="Seed blog training data into learning_corpus")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be inserted without writing")
    parser.add_argument("--db-url", default=None, help="PostgreSQL connection URL (overrides env)")
    args = parser.parse_args()

    # Determine DB URL
    configured_url = args.db_url or os.environ.get("LLM_GATEWAY_DB_URL")
    # SECURITY(review): this fallback embeds live-looking credentials in
    # source control. It is kept only so existing invocations keep working;
    # rotate the password and drop the default once callers pass --db-url
    # or set LLM_GATEWAY_DB_URL.
    db_url = configured_url or \
        "postgresql://llm:llm_secure_2026@217.154.82.179:5432/llm_gateway"
    if not configured_url and not args.dry_run:
        print(
            "WARNING: using the built-in default DB URL; "
            "prefer --db-url or LLM_GATEWAY_DB_URL",
            file=sys.stderr,
        )

    # Find training data directory: <repo root>/blog-training-data, where the
    # repo root is the parent of the directory holding this script.
    script_dir = Path(__file__).parent
    repo_root = script_dir.parent
    training_dir = repo_root / "blog-training-data"

    if not training_dir.is_dir():
        print(f"ERROR: Training data directory not found: {training_dir}")
        sys.exit(1)

    files = sorted(training_dir.glob("blog-*.md"))
    print(f"Found {len(files)} training articles in {training_dir}")
    print()

    # Parse all articles; parse_article prints its own SKIP line and returns
    # None for files it rejects.
    articles = []
    for article_path in files:
        entry = parse_article(article_path)
        if entry:
            articles.append(entry)
            print(f" OK {article_path.name}: {entry['topic']} / {len(entry['output_text'].split())}w")

    print(f"\n{len(articles)} articles parsed successfully")
    print()

    # Connect to DB (skipped entirely for a dry run)
    conn = None
    if args.dry_run:
        print("=== DRY RUN — no data will be written ===\n")
    else:
        try:
            conn = psycopg2.connect(db_url)
        except psycopg2.Error as e:
            print(f"ERROR: Cannot connect to DB: {e}")
            print("Hint: try --db-url or set LLM_GATEWAY_DB_URL env var")
            sys.exit(1)
        print("Connected to LLM gateway DB")

    # Insert
    inserted = 0
    skipped = 0
    try:
        for entry in articles:
            ok = insert_corpus_entry(conn, entry, dry_run=args.dry_run)
            if ok:
                inserted += 1
                if not args.dry_run:
                    print(f" + Inserted: {entry['filename']}")
            else:
                skipped += 1
                if not args.dry_run:
                    print(f" ~ Skipped (already exists): {entry['filename']}")
    finally:
        # Close the connection even when an insert raises part-way through.
        if conn is not None:
            conn.close()

    print(f"\nDone: {inserted} inserted, {skipped} skipped")
    if not args.dry_run and inserted > 0:
        print("\nNext step: trigger fine-tuning")
        print(" cd packages/fine-tuner")
        print(" python3 scripts/manual_trigger.py --task-type fo-blog-v1 --min-examples 10")


if __name__ == "__main__":
    main()
|