llm-gateway/packages/fine-tuner/scripts/generate_v7_data.py
Rene Fichtmueller 2ca77d0aee feat: Phase 2F — Multi-Agent Integration (ADRs + Client Fallback + Tests)
- ADR-0001: Multi-Agent Coworking Architecture with LLM Gateway Orchestrator
- ADR-0002: Tier Assignment Strategy for Model Selection (cost-first escalation)
- ADR-0003: Confidence Gate Thresholds & Learning Cycle Intervals (6h/12h/24h cycles)
- ADR-0004: External Provider Fallback Chain Ordering (Cerebras → Groq → Mistral)
- Enhanced client SDK: Offline Ollama fallback, health checks, exponential backoff retry
- Integration tests: claude-code-integration.test.ts (14 test cases)
- PHASE_2F_DEPLOYMENT.md: Pre-deployment checklist, automated deploy, rollback plan
- Post-deployment verification procedures for health, client fallback, metrics
2026-04-19 21:39:44 +02:00

529 lines
40 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
generate_v7_data.py — fo-blog-v7 training data generator
Uses 'claude --print -p' subprocess to generate ~200 high-quality,
properly constrained blog posts covering transceiver tech + networking topics.
Key improvements over v6 training data:
1. Anchored system prompt with STRICT length (700-1000w) and structure constraints
2. Diverse topics: not just transceivers — BGP, IPv6, RIPE/APNIC, data center, etc.
3. Full articles as output_text (not keyword stubs)
4. Topic match enforced via explicit input format
Output:
~/transceiver-training-data/v7-generated-sft.jsonl
Usage:
python3 scripts/generate_v7_data.py
python3 scripts/generate_v7_data.py --start 50 --end 100 # resume
python3 scripts/generate_v7_data.py --dry-run # show topics only
"""
from __future__ import annotations
import argparse
import json
import logging
import re
import subprocess
import sys
import time
from pathlib import Path
# Configure the root logger once at import time: INFO level, with a compact
# time-of-day timestamp and the level name on every record.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)
# Module-level logger used by the functions in this script.
logger = logging.getLogger(__name__)
# ─── Output paths ────────────────────────────────────────────────────────────
# Every artifact this script produces lives in one directory under the
# current user's home.
OUTPUT_DIR = Path.home().joinpath("transceiver-training-data")
# JSONL file that receives the generated SFT examples.
OUTPUT_FILE = OUTPUT_DIR.joinpath("v7-generated-sft.jsonl")
# JSON file recording which topic indices are already completed.
PROGRESS_FILE = OUTPUT_DIR.joinpath("v7-progress.json")
# ─── Anchored system prompt ───────────────────────────────────────────────────
# This is the KEY improvement for v7: the model must learn these constraints
# are non-negotiable, not suggestions.
# Fixed system prompt sent with every generation request. The numeric ranges
# (word count, sentence count, section count) are written with plain ASCII
# hyphens so the separator cannot be dropped or mistaken for a look-alike
# Unicode dash.
SYSTEM_PROMPT = """You are an expert technical writer specializing in optical networking, transceiver technology, and network infrastructure.
STRICT CONSTRAINTS — Follow exactly, no exceptions:
- LENGTH: 700-1000 words. Count carefully. Stop at 1000 words maximum.
- STRUCTURE (mandatory, in this order):
1. HOOK paragraph — 2-3 sentences stating the problem this post addresses
2. Technical sections — 3-4 H2 sections covering the topic in depth
3. PRACTICAL TAKEAWAYS — exactly 3 bullet points, actionable
- TOPIC DISCIPLINE: Write ONLY about the exact topic requested. Zero drift.
- NO REPETITION: Every sentence must add new information. No restating.
- VOICE: Confident, direct. No hedging phrases like "it's worth noting".
- AUDIENCE: Network engineers and IT professionals. Assume technical fluency.
- FORMAT: Markdown. Use ## for section headers. Use **bold** for key terms.
Do not summarize what you are about to write. Start with the hook directly."""
# ─── Topic list ──────────────────────────────────────────────────────────────
# ~200 topics (197 entries at the time of writing): transceiver tech + networking + RIPE/APNIC + routing + data center
TOPICS: list[dict] = [
# ── Transceiver form factors ──────────────────────────────────────────
{"topic": "QSFP-DD vs OSFP: Which 400G Form Factor Wins in 2026", "category": "transceiver", "audience": "customer"},
{"topic": "SFP vs SFP+: Why the Upgrade Still Matters for 10G Deployments", "category": "transceiver", "audience": "customer"},
{"topic": "CFP2-DCO: Pluggable Coherent Optics for Metro Networks Explained", "category": "transceiver", "audience": "engineer"},
{"topic": "800G OSFP Transceivers: What Network Architects Need to Know", "category": "transceiver", "audience": "engineer"},
{"topic": "QSFP28 vs QSFP56: Migrating Your 100G Infrastructure to 200G", "category": "transceiver", "audience": "customer"},
{"topic": "XFP vs SFP+: When the Legacy Form Factor Still Makes Sense", "category": "transceiver", "audience": "customer"},
{"topic": "Micro-QSFP and SFP-DD: Small Form Factor Optics for High-Density Switching", "category": "transceiver", "audience": "engineer"},
{"topic": "400G QSFP-DD800: Breaking the 400G Barrier with 8×50G PAM4", "category": "transceiver", "audience": "engineer"},
{"topic": "DSFP: The Emerging Dual Small Form Factor and Its Use Cases", "category": "transceiver", "audience": "customer"},
{"topic": "Form Factor Migration: Planning Your Network for 400G to 800G", "category": "transceiver", "audience": "engineer"},
# ── Speed and wavelength ──────────────────────────────────────────────
{"topic": "100G vs 400G Transceivers: Total Cost of Ownership Compared", "category": "transceiver", "audience": "customer"},
{"topic": "400G SR4 vs LR4 vs PSM4: Choosing the Right 400G Optic", "category": "transceiver", "audience": "engineer"},
{"topic": "1.6T Transceivers: What CPO and On-Board Optics Mean for Data Centers", "category": "transceiver", "audience": "engineer"},
{"topic": "CWDM4 vs FR4 vs DR4+: The 40km 400G Transceiver Landscape", "category": "transceiver", "audience": "engineer"},
{"topic": "LR-Lite Transceivers: The 2km 100G Option Operators Actually Use", "category": "transceiver", "audience": "engineer"},
{"topic": "50G PAM4 vs NRZ: Why Modulation Format Matters for Your SFP56", "category": "transceiver", "audience": "engineer"},
{"topic": "10G DWDM Transceivers for Metro: A Practical Deployment Guide", "category": "transceiver", "audience": "customer"},
{"topic": "ZR vs ZR+: The 400G Long-Haul Transceiver Showdown", "category": "transceiver", "audience": "engineer"},
{"topic": "800G SR8 Transceivers: Short-Reach Options for Hyperscale Data Centers", "category": "transceiver", "audience": "engineer"},
{"topic": "400G FR4 Transceivers: The Workhorse of Campus and DCI Networks", "category": "transceiver", "audience": "customer"},
# ── Coherent optics ───────────────────────────────────────────────────
{"topic": "Coherent vs Direct Detect: Which Technology for Your DCI Link?", "category": "transceiver", "audience": "engineer"},
{"topic": "OpenROADM and Open Coherent: Breaking Vendor Lock-In in Long-Haul", "category": "transceiver", "audience": "engineer"},
{"topic": "400G ZR/ZR+ for Enterprise DCI: Configuration and Gotchas", "category": "transceiver", "audience": "engineer"},
{"topic": "Coherent DSP Chips: ACO vs ICO vs DCO Pluggable Architecture", "category": "transceiver", "audience": "engineer"},
{"topic": "Submarine Cable Coherent Optics: From 100G to 800G Capacity", "category": "transceiver", "audience": "engineer"},
{"topic": "Installed Base: When to Upgrade Coherent Infrastructure to 400G", "category": "transceiver", "audience": "customer"},
# ── Compatibility and vendors ─────────────────────────────────────────
{"topic": "Third-Party Transceivers: The Real Risk vs. Cost Argument in 2026", "category": "transceiver", "audience": "customer"},
{"topic": "Cisco vs Juniper Transceiver Lock-In: What Your Contract Says", "category": "transceiver", "audience": "customer"},
{"topic": "OEM vs Compatible Optics: Decoding the Validation Process", "category": "transceiver", "audience": "customer"},
{"topic": "Transceiver Compatibility Matrices: How to Read Them Without Getting Burned", "category": "transceiver", "audience": "customer"},
{"topic": "FLEXOPTIX Programmable Optics: One SKU, Any Vendor, Any Config", "category": "transceiver", "audience": "customer"},
{"topic": "Gray Market Transceivers: How to Spot Counterfeit Optics Before They Fail", "category": "transceiver", "audience": "customer"},
{"topic": "Arista vs Cisco Transceiver Policy: Which Vendor is More Open?", "category": "transceiver", "audience": "customer"},
{"topic": "Transceiver Procurement for Service Providers: RFQ Checklist", "category": "transceiver", "audience": "customer"},
# ── Fiber and physical layer ──────────────────────────────────────────
{"topic": "OS2 vs OM4 vs OM5: Fiber Type Selection for Your Speed Upgrade", "category": "transceiver", "audience": "customer"},
{"topic": "Fiber Insertion Loss Budget: How to Calculate Before You Buy Optics", "category": "transceiver", "audience": "engineer"},
{"topic": "MTP/MPO vs LC vs SC Connectors: Fiber Cabling for High-Density Racks", "category": "transceiver", "audience": "customer"},
{"topic": "Bend-Insensitive Fiber (BIF): When OM5 Doesn't Cut It for Data Center", "category": "transceiver", "audience": "engineer"},
{"topic": "Fiber Dispersion: PMD and CD Compensation in 400G+ Links", "category": "transceiver", "audience": "engineer"},
# ── Data center networking ────────────────────────────────────────────
{"topic": "Spine-Leaf Architecture: Transceiver Strategy for 400G Data Centers", "category": "datacenter", "audience": "engineer"},
{"topic": "Co-Packaged Optics (CPO): Why 2026 Is the Inflection Point", "category": "datacenter", "audience": "engineer"},
{"topic": "Hyperscale vs Enterprise: Different Transceiver Buying Strategies", "category": "datacenter", "audience": "engineer"},
{"topic": "Data Center Interconnect: Selecting Optics for Your DCI Budget", "category": "datacenter", "audience": "customer"},
{"topic": "Active vs Passive DAC Cables: When Direct-Attach Beats Transceivers", "category": "datacenter", "audience": "customer"},
{"topic": "AOC Cables: Active Optical Cable Use Cases in 2026 Data Centers", "category": "datacenter", "audience": "customer"},
{"topic": "Power Efficiency in Optics: How Watt-per-Bit Changes Your TCO", "category": "datacenter", "audience": "engineer"},
{"topic": "Silicon Photonics: How Intel and Broadcom Are Reshaping Transceiver Design", "category": "datacenter", "audience": "engineer"},
{"topic": "AI/ML Infrastructure: Networking Requirements for GPU Clusters", "category": "datacenter", "audience": "engineer"},
{"topic": "400G Switch Fabric Design: Oversubscription Ratios and Transceiver Placement", "category": "datacenter", "audience": "engineer"},
# ── Routing and BGP ───────────────────────────────────────────────────
{"topic": "BGP Route Leaks: Detection, Impact, and Prevention in 2026", "category": "routing", "audience": "engineer"},
{"topic": "RPKI Route Origin Validation: A Practical Deployment Guide", "category": "routing", "audience": "engineer"},
{"topic": "BGP Prefix Hijacking: How Attackers Exploit Routing and How to Stop Them", "category": "routing", "audience": "engineer"},
{"topic": "BGP ASPA: The Next Layer of Route Security After RPKI", "category": "routing", "audience": "engineer"},
{"topic": "BGP Flowspec: Traffic Engineering and DDoS Mitigation in One Protocol", "category": "routing", "audience": "engineer"},
{"topic": "BGP Communities: A Practical Operator's Guide to Traffic Steering", "category": "routing", "audience": "engineer"},
{"topic": "BGP Large Communities RFC 8092: Why Your NOC Needs This Now", "category": "routing", "audience": "engineer"},
{"topic": "Graceful Restart and LLGR: Keeping BGP Sessions Alive During Maintenance", "category": "routing", "audience": "engineer"},
{"topic": "BGP Add-Path: Solving the Best-Path Problem in Multi-homed Networks", "category": "routing", "audience": "engineer"},
{"topic": "Route Reflectors vs Route Servers: iBGP Scaling for Large Networks", "category": "routing", "audience": "engineer"},
{"topic": "BGPsec: Why RPKI's Successor Is Still Waiting for Deployment", "category": "routing", "audience": "engineer"},
{"topic": "IS-IS vs OSPF: Choosing an IGP for Your Service Provider Network", "category": "routing", "audience": "engineer"},
{"topic": "MPLS Traffic Engineering: Still Relevant in the SR-MPLS Era?", "category": "routing", "audience": "engineer"},
{"topic": "Segment Routing (SR-MPLS and SRv6): Which One for Your Backbone?", "category": "routing", "audience": "engineer"},
{"topic": "BFD: Bidirectional Forwarding Detection for Fast Failure Recovery", "category": "routing", "audience": "engineer"},
{"topic": "EVPN: The Definitive Guide to Data Center and WAN BGP Extensions", "category": "routing", "audience": "engineer"},
{"topic": "QoS in IP Networks: Traffic Marking, Shaping, and Policing Explained", "category": "routing", "audience": "engineer"},
{"topic": "FlowSpec vs RTBH: Choosing the Right DDoS Mitigation Tool", "category": "routing", "audience": "engineer"},
{"topic": "BGP Monitoring Protocol (BMP): Real-Time Route Collection for NOCs", "category": "routing", "audience": "engineer"},
{"topic": "OpenConfig and YANG: Network Automation That Actually Works", "category": "routing", "audience": "engineer"},
# ── IPv6 ──────────────────────────────────────────────────────────────
{"topic": "IPv6 Deployment for ISPs: 12 Steps from Planning to Production", "category": "ipv6", "audience": "engineer"},
{"topic": "IPv4 Exhaustion in 2026: What Service Providers Must Do Now", "category": "ipv6", "audience": "engineer"},
{"topic": "CGNAT: Why It's a Bad Fix for IPv4 Exhaustion and What to Use Instead", "category": "ipv6", "audience": "engineer"},
{"topic": "Dual-Stack vs 464XLAT vs NAT64: IPv6 Transition Mechanisms Compared", "category": "ipv6", "audience": "engineer"},
{"topic": "IPv6 Prefix Delegation: PD Configuration for ISP Customer Networks", "category": "ipv6", "audience": "engineer"},
{"topic": "IPv6 Security: Attack Vectors That Don't Exist in IPv4", "category": "ipv6", "audience": "engineer"},
{"topic": "IPv6 Address Planning: How to Structure /32 Allocation for Growth", "category": "ipv6", "audience": "engineer"},
{"topic": "Mobile IPv6 and 5G: How Carrier Networks Handle Mobility at Scale", "category": "ipv6", "audience": "engineer"},
{"topic": "World IPv6 Launch: Where Are We 13 Years Later?", "category": "ipv6", "audience": "engineer"},
{"topic": "IPv6 ROA and RPKI: Securing Your IPv6 Routing from Day One", "category": "ipv6", "audience": "engineer"},
# ── Internet infrastructure and RIR/APNIC/RIPE ───────────────────────
{"topic": "APNIC and Asia-Pacific IPv6 Leadership: What the Data Shows", "category": "infrastructure", "audience": "engineer"},
{"topic": "RIPE NCC Resource Certification: How to Get Your RPKI Right", "category": "infrastructure", "audience": "engineer"},
{"topic": "Internet Exchange Points: Why IXPs Are Critical Infrastructure", "category": "infrastructure", "audience": "engineer"},
{"topic": "ARIN vs RIPE vs APNIC: How IP Address Policies Differ by Region", "category": "infrastructure", "audience": "engineer"},
{"topic": "BGP Looking Glass Tools: How to Debug Routing Problems Remotely", "category": "infrastructure", "audience": "engineer"},
{"topic": "Peering vs Transit: The Economics of Internet Interconnection", "category": "infrastructure", "audience": "engineer"},
{"topic": "DE-CIX, AMS-IX, LINX: The IXPs That Move Europe's Internet", "category": "infrastructure", "audience": "engineer"},
{"topic": "Route Server Best Practices for IXP Operators", "category": "infrastructure", "audience": "engineer"},
{"topic": "MANRS: Mutually Agreed Norms for Routing Security in 2026", "category": "infrastructure", "audience": "engineer"},
{"topic": "Internet Shutdowns: Technical Analysis of BGP Withdrawal Patterns", "category": "infrastructure", "audience": "engineer"},
{"topic": "Submarine Cable Systems: Routing Resilience for Island Networks", "category": "infrastructure", "audience": "engineer"},
{"topic": "RDAP vs WHOIS: The Modern Way to Query IP and Domain Ownership", "category": "infrastructure", "audience": "engineer"},
{"topic": "DNS Anycast: How Root Servers Handle 50 Billion Queries Per Day", "category": "infrastructure", "audience": "engineer"},
{"topic": "DNSSEC: Deployment Status and Why Operators Still Hesitate", "category": "infrastructure", "audience": "engineer"},
{"topic": "RIPE Atlas: Using Distributed Probes to Measure Internet Reachability", "category": "infrastructure", "audience": "engineer"},
{"topic": "NTP Security: How BGP Leaks Can Desync Your Infrastructure Clocks", "category": "infrastructure", "audience": "engineer"},
{"topic": "Internet Routing Registry (IRR): Why It's Messy and What to Do About It", "category": "infrastructure", "audience": "engineer"},
{"topic": "CDN Architecture: How Akamai and Cloudflare Use BGP for Global Delivery", "category": "infrastructure", "audience": "engineer"},
{"topic": "Anycast BGP for DDoS Mitigation: A NOC Operator's Guide", "category": "infrastructure", "audience": "engineer"},
{"topic": "Public Peer vs Private Peer: IXP Negotiation Strategy for ISPs", "category": "infrastructure", "audience": "engineer"},
# ── Network operations ────────────────────────────────────────────────
{"topic": "SNMP vs gRPC Telemetry: Modernizing Your NOC Monitoring Stack", "category": "operations", "audience": "engineer"},
{"topic": "NetFlow vs IPFIX vs sFlow: Choosing Traffic Analytics for Your Network", "category": "operations", "audience": "engineer"},
{"topic": "Optical Power Budget: How to Diagnose Fiber Link Problems Fast", "category": "operations", "audience": "engineer"},
{"topic": "OTDR Testing: Reading Loss Traces for Fiber Troubleshooting", "category": "operations", "audience": "engineer"},
{"topic": "DDOS Mitigation at Scale: BGP Blackhole and Scrubbing Centers", "category": "operations", "audience": "engineer"},
{"topic": "Network Change Management: Avoiding Outages During Maintenance Windows", "category": "operations", "audience": "engineer"},
{"topic": "MPLS LDP vs RSVP-TE: Label Distribution Protocol Comparison", "category": "operations", "audience": "engineer"},
{"topic": "Transceiver DOM Monitoring: What DDM Data Tells You Before Links Fail", "category": "operations", "audience": "engineer"},
{"topic": "NOC Alert Fatigue: Structuring Alerts to Avoid the Cry-Wolf Effect", "category": "operations", "audience": "engineer"},
{"topic": "Fiber Cuts: Incident Response Procedures for Backbone Operators", "category": "operations", "audience": "engineer"},
{"topic": "MTTR vs MTBF: Optical Transceiver Reliability Metrics That Matter", "category": "operations", "audience": "customer"},
{"topic": "Optics Inventory Management: How to Avoid a Spare-Parts Crisis", "category": "operations", "audience": "customer"},
{"topic": "Transceiver Firmware Upgrades: Risk Management and Rollback Plans", "category": "operations", "audience": "engineer"},
{"topic": "Network Automation with Ansible and NAPALM: Practical Getting Started Guide", "category": "operations", "audience": "engineer"},
{"topic": "gNMI and gNOI: Google's Contribution to Network Operations APIs", "category": "operations", "audience": "engineer"},
# ── Security ──────────────────────────────────────────────────────────
{"topic": "BGP Hijack Case Studies: Real Incidents and Their Technical Aftermath", "category": "security", "audience": "engineer"},
{"topic": "RPKI ROA vs ASPA vs BGPsec: The Routing Security Stack in 2026", "category": "security", "audience": "engineer"},
{"topic": "DDoS Amplification via DNS and NTP: How It Works and How to Block It", "category": "security", "audience": "engineer"},
{"topic": "Supply Chain Attacks on Network Hardware: Counterfeit Optics and Beyond", "category": "security", "audience": "engineer"},
{"topic": "BGP Route Filtering: RPKI-Invalid Drop vs Just-Logging", "category": "security", "audience": "engineer"},
{"topic": "Network Segmentation: How Optical Transceivers Factor into Zero-Trust", "category": "security", "audience": "engineer"},
# ── Market and business ───────────────────────────────────────────────
{"topic": "Transceiver Market 2026: 400G Adoption Rates and What's Driving 800G", "category": "market", "audience": "customer"},
{"topic": "Photonics Supply Chain: TSMC, II-VI, and the Chip Shortage Aftermath", "category": "market", "audience": "customer"},
{"topic": "Price Comparison: QSFP-DD 400G ZR from 8 Vendors — Who Wins?", "category": "market", "audience": "customer"},
{"topic": "Hyperscaler Buying Power: How Meta and AWS Shape the Transceiver Market", "category": "market", "audience": "customer"},
{"topic": "Transceiver Leasing vs Buying: CapEx vs OpEx Decision Framework", "category": "market", "audience": "customer"},
{"topic": "Optical Networking M&A: Coherent, II-VI, Lumentum — What the Consolidation Means", "category": "market", "audience": "customer"},
{"topic": "Open Networking: SONiC Adoption and the Disaggregation Trend in 2026", "category": "market", "audience": "engineer"},
{"topic": "Whitebox Switching and Merchant Silicon: The Business Case for Operators", "category": "market", "audience": "engineer"},
{"topic": "Transceiver Pricing Trends: When Does 400G Hit the 10G Price Point?", "category": "market", "audience": "customer"},
{"topic": "AI Networking Demand: How LLM Training Clusters Are Reshaping Optics Sales", "category": "market", "audience": "customer"},
# ── Standards and MSAs ────────────────────────────────────────────────
{"topic": "IEEE 802.3bs and 400GbE: The Standard That Enabled QSFP-DD", "category": "standards", "audience": "engineer"},
{"topic": "OIF 400ZR Implementation Agreement: What It Means for DCI Deployments", "category": "standards", "audience": "engineer"},
{"topic": "MSA Compliance: How Multi-Source Agreements Enable Interoperability", "category": "standards", "audience": "customer"},
{"topic": "SFF-8024: The Transceiver Identifier Standard Explained", "category": "standards", "audience": "engineer"},
{"topic": "IEEE 802.3cu 100G FR and LR: Simplifying 100G Beyond 10km", "category": "standards", "audience": "engineer"},
{"topic": "CMIS 5.0: The Management Interface That Unlocks 800G OSFP Features", "category": "standards", "audience": "engineer"},
{"topic": "800G Standards Landscape: QSFP-DD800, OSFP, and What Comes Next", "category": "standards", "audience": "engineer"},
{"topic": "CWDM vs DWDM Wavelength Plans: ITU Grid Selection for Metro and Long-Haul", "category": "standards", "audience": "engineer"},
{"topic": "OpenZR+ vs OIF-400ZR: The 400G Coherent Protocol War", "category": "standards", "audience": "engineer"},
{"topic": "Photonic Integrated Circuits: SiPh, InP, and the Future of Transceiver Design", "category": "standards", "audience": "engineer"},
# ── SONiC and open networking ─────────────────────────────────────────
{"topic": "SONiC Architecture: How Microsoft's Switch OS Works Under the Hood", "category": "opennet", "audience": "engineer"},
{"topic": "SONiC vs Cumulus vs OpenWrt: Choosing an Open NOS for Your Lab", "category": "opennet", "audience": "engineer"},
{"topic": "SONiC SAI API: Abstracting Hardware Across ASICs and Vendors", "category": "opennet", "audience": "engineer"},
{"topic": "Open Compute Project (OCP): How Facebook is Driving Network Disaggregation", "category": "opennet", "audience": "engineer"},
{"topic": "P4 Programming: The Future of Programmable Data Plane Networking", "category": "opennet", "audience": "engineer"},
{"topic": "SmartNIC and DPU: Offloading Network Functions from CPUs", "category": "opennet", "audience": "engineer"},
# ── Emerging topics ───────────────────────────────────────────────────
{"topic": "Quantum Key Distribution (QKD) over Fiber: Network Integration Challenges", "category": "emerging", "audience": "engineer"},
{"topic": "Space Optical Communications: LEO Constellation Intersatellite Links", "category": "emerging", "audience": "engineer"},
{"topic": "400G and Beyond for RAN Fronthaul: O-RAN Transceiver Requirements", "category": "emerging", "audience": "engineer"},
{"topic": "Edge Computing: Optical Networking Requirements for 5G MEC", "category": "emerging", "audience": "engineer"},
{"topic": "AI-Driven Network Management: Using LLMs for Optical Layer Analysis", "category": "emerging", "audience": "engineer"},
{"topic": "Green Networking: Power Consumption Optimization for Optical Infrastructure", "category": "emerging", "audience": "engineer"},
{"topic": "Liquid Cooling and Photonics: How Thermal Management Changes at 800G", "category": "emerging", "audience": "engineer"},
{"topic": "Reconfigurable Optical Add-Drop Multiplexers (ROADM): WSS Architecture Guide", "category": "emerging", "audience": "engineer"},
{"topic": "Optical Time Domain Reflectometry in Automated NOC Workflows", "category": "emerging", "audience": "engineer"},
{"topic": "Optical Amplifiers: EDFA vs Raman vs SOA — When Each One Applies", "category": "emerging", "audience": "engineer"},
# ── Regional and service provider ─────────────────────────────────────
{"topic": "African Internet Infrastructure: Submarine Cables and Terrestrial Fiber Gaps", "category": "regional", "audience": "engineer"},
{"topic": "APAC Data Center Boom: Transceiver Requirements for Singapore and Tokyo Hubs", "category": "regional", "audience": "customer"},
{"topic": "European 5G Backbone: Optical Transceiver Demand Through 2028", "category": "regional", "audience": "customer"},
{"topic": "Latin America ISP Connectivity: Low-Cost 100G Options for Emerging Markets", "category": "regional", "audience": "customer"},
{"topic": "Middle East Data Center Growth: IXP and Optical Infrastructure Investments", "category": "regional", "audience": "customer"},
{"topic": "Rural Broadband Access: Optical Technologies for the Last Mile", "category": "regional", "audience": "customer"},
{"topic": "GÉANT Research Network: How European Academia Runs 100Tbps+ at Scale", "category": "regional", "audience": "engineer"},
{"topic": "Carrier Ethernet Services: MEF Framework for Wholesale Optical Transport", "category": "regional", "audience": "engineer"},
# ── How-to guides ─────────────────────────────────────────────────────
{"topic": "How to Read a Transceiver Datasheet: Key Specs That Actually Matter", "category": "howto", "audience": "customer"},
{"topic": "How to Choose the Right Transceiver for a 10km Single-Mode Link", "category": "howto", "audience": "customer"},
{"topic": "How to Configure a 400G ZR Link Between Two Routers", "category": "howto", "audience": "engineer"},
{"topic": "How to Implement RPKI on Cisco IOS-XR: Step-by-Step Guide", "category": "howto", "audience": "engineer"},
{"topic": "How to Set Up BGP Communities for Traffic Engineering", "category": "howto", "audience": "engineer"},
{"topic": "How to Diagnose an Optical Link Failure Using DOM Data", "category": "howto", "audience": "engineer"},
{"topic": "How to Compare Transceiver Prices Across Vendors Without Getting Scammed", "category": "howto", "audience": "customer"},
{"topic": "How to Plan Fiber Capacity for a 5-Year Data Center Expansion", "category": "howto", "audience": "customer"},
{"topic": "How to Implement EVPN VXLAN in a Spine-Leaf Data Center", "category": "howto", "audience": "engineer"},
{"topic": "How to Write a Network RFP for Optical Transceiver Procurement", "category": "howto", "audience": "customer"},
{"topic": "How to Migrate from NRZ to PAM4: A Practical Network Engineer's Guide", "category": "howto", "audience": "engineer"},
{"topic": "How to Calculate Power Budget for a 100G Long-Haul Link", "category": "howto", "audience": "engineer"},
{"topic": "How to Set Up Streaming Telemetry with gNMI and InfluxDB", "category": "howto", "audience": "engineer"},
{"topic": "How to Deploy SONiC in a Production Data Center: Lessons from 6 Months", "category": "howto", "audience": "engineer"},
{"topic": "How to Evaluate Third-Party Transceivers Before Buying 500 Units", "category": "howto", "audience": "customer"},
# ── Troubleshooting ───────────────────────────────────────────────────
{"topic": "Why Is My SFP+ Module Showing Rx Power -40dBm? Fiber Fault Diagnosis", "category": "troubleshoot", "audience": "engineer"},
{"topic": "BGP Session Flapping: 7 Root Causes and How to Debug Each One", "category": "troubleshoot", "audience": "engineer"},
{"topic": "Transceiver Not Recognized: Vendor Lock-In Detection and Workarounds", "category": "troubleshoot", "audience": "customer"},
{"topic": "High BER on 100G Link: Signal Integrity Debugging from DOM to OTDR", "category": "troubleshoot", "audience": "engineer"},
{"topic": "MTU Black Holes: How Jumbo Frame Mismatches Kill Network Performance", "category": "troubleshoot", "audience": "engineer"},
{"topic": "OSPF Adjacency Issues: Debugging Area Type Mismatches and Hello Timers", "category": "troubleshoot", "audience": "engineer"},
# ── Comparison and decision guides ────────────────────────────────────
{"topic": "Cisco ASR 9000 vs Nokia 7750: Backbone Router Optics Ecosystem", "category": "comparison", "audience": "customer"},
{"topic": "Arista 7800 vs Juniper QFX10000: Data Center Fabric Optics Comparison", "category": "comparison", "audience": "customer"},
{"topic": "Vendor Lock-In Scorecard: Cisco vs Juniper vs Arista in 2026", "category": "comparison", "audience": "customer"},
{"topic": "100G LR4 vs ER4: When to Pay for 40km Reach", "category": "comparison", "audience": "customer"},
{"topic": "NaaS vs DIY: Network-as-a-Service vs Owning Your Own Optical Infrastructure", "category": "comparison", "audience": "customer"},
{"topic": "Coherent Pluggables vs Fixed-Line Transponders: TCO for Service Providers", "category": "comparison", "audience": "customer"},
# ── Future and innovation ─────────────────────────────────────────────
{"topic": "1.6T Optics Timeline: When Will 1.6 Terabit Transceivers Hit Production?", "category": "future", "audience": "engineer"},
{"topic": "All-Optical Networks: The Dream of Photonic Switching Without O-E-O", "category": "future", "audience": "engineer"},
{"topic": "Post-Quantum Cryptography in Network Infrastructure: Timing the Transition", "category": "future", "audience": "engineer"},
{"topic": "AI-Predicted Network Failures: How ML Is Entering Optical Layer Management", "category": "future", "audience": "engineer"},
{"topic": "Intent-Based Networking: BGP and Optics Policy Automation in 2026", "category": "future", "audience": "engineer"},
{"topic": "The 10 Year Horizon: How Optical Networking Will Change by 2035", "category": "future", "audience": "customer"},
]
# ─── Prompt construction ──────────────────────────────────────────────────────
def build_user_prompt(entry: dict) -> str:
    """Build the user-turn prompt for a given topic entry.

    Args:
        entry: Topic record. Must contain a "topic" key; an optional
            "audience" key ("customer" or "engineer") selects the audience
            description. A missing "audience" is treated as "engineer"; an
            unrecognized value falls back to the generic "network engineers".

    Returns:
        The prompt text: topic, target audience, and a reminder of the
        length and structure constraints.

    Raises:
        KeyError: If ``entry`` has no "topic" key.
    """
    audience_map = {
        "customer": "IT managers, procurement teams, and operators who evaluate and buy transceivers",
        "engineer": "network engineers and architects who design and operate optical infrastructure",
    }
    audience = audience_map.get(entry.get("audience", "engineer"), "network engineers")
    # The word range is written as "700-1000" with an explicit ASCII hyphen;
    # without a separator the model would be told to write "7001000 words".
    return (
        f"Write a blog post on the following topic:\n\n"
        f'**Topic:** {entry["topic"]}\n\n'
        f"**Target audience:** {audience}\n\n"
        f"Remember: 700-1000 words, hook + technical sections + 3 takeaways. "
        f"Stay strictly on-topic. No filler. Start writing now."
    )
# ─── Claude subprocess ────────────────────────────────────────────────────────
def call_claude(system: str, user_prompt: str, timeout: int = 120) -> str | None:
    """
    Run the 'claude' CLI once in print mode and return what it wrote to stdout.

    Args:
        system: Text passed via --system-prompt.
        user_prompt: Text passed as the prompt.
        timeout: Seconds to wait for the subprocess before giving up.

    Returns:
        The stripped stdout text, or None on any failure (non-zero exit code,
        empty output, timeout, missing CLI, or another unexpected error).
        Failures are logged, never raised.

    NOTE(review): the original author states this goes through the Claude Code
    subscription rather than API billing; nothing in this function checks that.
    """
    command = ["claude", "--print", "--system-prompt", system, "-p", user_prompt]
    try:
        proc = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        if proc.returncode != 0:
            # Only the first 200 characters of stderr are logged to keep the line short.
            logger.warning("claude subprocess error (rc=%d): %s", proc.returncode, proc.stderr[:200])
            return None
        generated = proc.stdout.strip()
        if generated:
            return generated
        logger.warning("claude returned empty output")
        return None
    except subprocess.TimeoutExpired:
        logger.warning("claude subprocess timed out after %ds", timeout)
        return None
    except FileNotFoundError:
        # Raised when the 'claude' executable cannot be found.
        logger.error("claude CLI not found — install Claude Code")
        return None
    except Exception as exc:
        # Last-resort catch so one bad call does not raise into the caller.
        logger.warning("claude subprocess unexpected error: %s", exc)
        return None
# ─── Quality validation ───────────────────────────────────────────────────────
def validate_output(text: str, topic: str) -> tuple[bool, str]:
    """
    Run lightweight sanity checks on a generated blog post.

    Checks are applied in this order and the first failure wins:
      1. whitespace-separated word count is between 400 and 2500 inclusive
      2. the text contains at least one "## " style section header
      3. the first line is not a meta-comment about writing the post

    ``topic`` is accepted for interface compatibility but is not inspected.
    Returns (is_valid, reason); reason is "ok" when every check passes.
    """
    word_total = len(text.split())
    if word_total < 400:
        return False, f"too short: {word_total} words (min 400)"
    if word_total > 2500:
        return False, f"too long: {word_total} words (max 2500 — will be flagged)"

    # Require some structure: a line beginning with "##" followed by text.
    if re.search(r"^##\s+.+", text, re.MULTILINE) is None:
        return False, "missing ## section headers"

    # The post must open with real content, not the model talking about it.
    opening = text.strip().split("\n")[0].lower()
    meta_markers = ("i'll write", "here's a", "here is a", "let me write", "blog post:")
    if any(marker in opening for marker in meta_markers):
        return False, f"starts with meta-comment: '{opening[:60]}'"

    return True, "ok"
# ─── Progress tracking ────────────────────────────────────────────────────────
def load_progress() -> set[int]:
    """Load the set of already-generated topic indices from PROGRESS_FILE.

    Returns:
        The values stored under the ``"completed"`` key, or an empty set when
        the file does not exist, has no such key, or cannot be read or parsed.
    """
    if not PROGRESS_FILE.exists():
        return set()
    try:
        with open(PROGRESS_FILE, encoding="utf-8") as f:
            data = json.load(f)
        return set(data.get("completed", []))
    except Exception as exc:
        # Deliberately best-effort: an unreadable progress file must not abort
        # the run. Starting from an empty set makes generate() redo every topic
        # and append to the existing output file, so make the reset visible
        # instead of swallowing it silently.
        logger.warning(
            "Could not read progress file %s (%s); starting with empty progress",
            PROGRESS_FILE,
            exc,
        )
        return set()
def save_progress(completed: set[int]) -> None:
    """Persist the completed topic indices to PROGRESS_FILE as JSON.

    The file holds ``{"completed": <sorted indices>, "total": len(TOPICS)}``.

    The payload is written to a sibling ``.tmp`` file and then renamed over
    PROGRESS_FILE, so an interruption mid-write cannot leave a truncated file
    that load_progress() would read as "nothing completed".

    Args:
        completed: Indices of topics whose records have been written.
    """
    payload = {"completed": sorted(completed), "total": len(TOPICS)}
    # PROGRESS_FILE is used as a pathlib.Path elsewhere in this file
    # (PROGRESS_FILE.exists()), so Path methods are available here.
    tmp_path = PROGRESS_FILE.with_name(PROGRESS_FILE.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(payload, f)
    # Same-directory rename: replaces the previous progress file in one step.
    tmp_path.replace(PROGRESS_FILE)
# ─── Main generation loop ─────────────────────────────────────────────────────
def generate(start: int = 0, end: int | None = None, dry_run: bool = False) -> None:
    """Generate blog posts for TOPICS[start:end] and append them to OUTPUT_FILE.

    For every topic not yet recorded in the progress file, this builds the user
    prompt, calls the claude subprocess, validates the result, and appends one
    JSON line to OUTPUT_FILE. Progress is saved after each written record so an
    interrupted run can be resumed.

    Args:
        start: Index of the first topic to process.
        end: Index to stop at (exclusive). ``None`` means "through the last
            topic"; an explicit ``0`` selects nothing.
        dry_run: When true, only print the selected topics and return.

    Notes:
        - Topics for which no output was obtained are counted as failed and are
          NOT marked completed, so a later run tries them again.
        - Outputs that fail validation are still written and marked completed;
          they are flagged through ``meta.valid`` / ``meta.reason``. They count
          toward both "invalid" and "generated" in the final summary.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    # Compare against None explicitly: `end or len(TOPICS)` would silently turn
    # an explicit end=0 into "run everything".
    if end is None:
        end = len(TOPICS)
    topics_to_run = TOPICS[start:end]

    if dry_run:
        print(f"DRY RUN: would generate {len(topics_to_run)} topics ({start}-{end})")
        for i, t in enumerate(topics_to_run):
            print(f" [{start + i:03d}] [{t['category']:12s}] {t['topic']}")
        return

    completed = load_progress()
    logger.info("Resuming: %d/%d already done", len(completed), len(TOPICS))
    stats = {"generated": 0, "skipped": 0, "failed": 0, "invalid": 0}
    last_idx = len(TOPICS) - 1

    with open(OUTPUT_FILE, "a", encoding="utf-8") as out_f:
        # idx is the position in the full TOPICS list, not in the slice, so it
        # lines up with the indices stored in the progress file.
        for idx, entry in enumerate(topics_to_run, start=start):
            if idx in completed:
                logger.info("[%03d/%03d] SKIP (already done): %s", idx, last_idx, entry["topic"])
                stats["skipped"] += 1
                continue

            logger.info("[%03d/%03d] Generating: %s", idx, last_idx, entry["topic"])
            user_prompt = build_user_prompt(entry)
            output_text = call_claude(SYSTEM_PROMPT, user_prompt, timeout=180)
            if output_text is None:
                logger.warning("[%03d] FAILED to get output", idx)
                stats["failed"] += 1
                # Brief pause before moving on to the next topic.
                time.sleep(5)
                continue

            word_count = len(output_text.split())
            is_valid, reason = validate_output(output_text, entry["topic"])
            if is_valid:
                logger.info("[%03d] OK: %d words", idx, word_count)
            else:
                # Still saved, but never reported as "OK".
                logger.warning("[%03d] INVALID (%s): %s", idx, reason, entry["topic"])
                logger.warning("[%03d] Saving anyway: %d words", idx, word_count)
                stats["invalid"] += 1

            record = {
                "system_prompt": SYSTEM_PROMPT,
                # Reuse the exact prompt that was sent rather than rebuilding it.
                "input_text": user_prompt,
                "output_text": output_text,
                "meta": {
                    "topic": entry["topic"],
                    "category": entry["category"],
                    "audience": entry["audience"],
                    "word_count": word_count,
                    "valid": is_valid,
                    "reason": reason,
                    "generated_by": "claude-code-subprocess",
                    "model": "claude-sonnet",
                    "dataset_version": "v7",
                },
            }
            out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
            # Flush before recording progress so the record is handed to the OS
            # before the index is marked done.
            out_f.flush()
            completed.add(idx)
            save_progress(completed)
            stats["generated"] += 1
            # Small pause to avoid overwhelming claude subprocess
            time.sleep(2)

    logger.info(
        "Done! Generated: %d | Skipped: %d | Failed: %d | Invalid: %d",
        stats["generated"], stats["skipped"], stats["failed"], stats["invalid"],
    )
    logger.info("Output: %s", OUTPUT_FILE)
# ─── CLI ──────────────────────────────────────────────────────────────────────
def main() -> None:
    """Command-line entry point.

    Parses --start / --end / --dry-run / --list-categories. With
    --list-categories it prints the total topic count and a per-category
    tally (largest first) and exits; otherwise it hands the remaining
    options to generate().
    """
    cli = argparse.ArgumentParser(description="fo-blog-v7 training data generator")
    cli.add_argument("--start", type=int, default=0, help="Start at topic index")
    cli.add_argument("--end", type=int, default=None, help="Stop at topic index (exclusive)")
    cli.add_argument("--dry-run", action="store_true", help="List topics without generating")
    cli.add_argument("--list-categories", action="store_true", help="Show category distribution")
    opts = cli.parse_args()

    if not opts.list_categories:
        generate(start=opts.start, end=opts.end, dry_run=opts.dry_run)
        return

    from collections import Counter

    per_category = Counter(topic["category"] for topic in TOPICS)
    print(f"Total topics: {len(TOPICS)}")
    # most_common() orders by descending count and keeps first-seen order
    # among ties.
    for cat, count in per_category.most_common():
        print(f" {cat:15s}: {count:3d}")


if __name__ == "__main__":
    main()