- ADR-0001: Multi-Agent Coworking Architecture with LLM Gateway Orchestrator - ADR-0002: Tier Assignment Strategy for Model Selection (cost-first escalation) - ADR-0003: Confidence Gate Thresholds & Learning Cycle Intervals (6h/12h/24h cycles) - ADR-0004: External Provider Fallback Chain Ordering (Cerebras → Groq → Mistral) - Enhanced client SDK: Offline Ollama fallback, health checks, exponential backoff retry - Integration tests: claude-code-integration.test.ts (14 test cases) - PHASE_2F_DEPLOYMENT.md: Pre-deployment checklist, automated deploy, rollback plan - Post-deployment verification procedures for health, client fallback, metrics
529 lines
40 KiB
Python
529 lines
40 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
generate_v7_data.py — fo-blog-v7 training data generator
|
||
|
||
Uses 'claude --print -p' subprocess to generate 200+ high-quality,
|
||
properly constrained blog posts covering transceiver tech + networking topics.
|
||
|
||
Key improvements over v6 training data:
|
||
1. Anchored system prompt with STRICT length (700-1000w) and structure constraints
|
||
2. Diverse topics: not just transceivers — BGP, IPv6, RIPE/APNIC, data center, etc.
|
||
3. Full articles as output_text (not keyword stubs)
|
||
4. Topic match enforced via explicit input format
|
||
|
||
Output:
|
||
~/transceiver-training-data/v7-generated-sft.jsonl
|
||
|
||
Usage:
|
||
python3 scripts/generate_v7_data.py
|
||
python3 scripts/generate_v7_data.py --start 50 --end 100 # resume
|
||
python3 scripts/generate_v7_data.py --dry-run # show topics only
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import logging
|
||
import re
|
||
import subprocess
|
||
import sys
|
||
import time
|
||
from pathlib import Path
|
||
|
||
# Configure logging once at import time: INFO level with terse HH:MM:SS
# timestamps, e.g. "12:34:56 [INFO] message".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)

# Module-level logger for this script.
logger = logging.getLogger(__name__)
|
||
|
||
# ─── Output paths ────────────────────────────────────────────────────────────
|
||
|
||
# All generator artifacts live in one directory under the user's home.
OUTPUT_DIR = Path.home() / "transceiver-training-data"
# JSONL output file for the generated SFT data.
OUTPUT_FILE = OUTPUT_DIR / "v7-generated-sft.jsonl"
# NOTE(review): presumably records completed work so a `--start/--end` run can
# resume — confirm against the code that reads and writes this file.
PROGRESS_FILE = OUTPUT_DIR / "v7-progress.json"
|
||
|
||
# ─── Anchored system prompt ───────────────────────────────────────────────────
|
||
# This is the KEY improvement for v7: the model must learn these constraints
|
||
# are non-negotiable, not suggestions.
|
||
|
||
# System prompt sent with every generation request. It pins length, structure,
# voice, and format so each generated post follows the same constraints.
# NOTE(review): any leading indentation the numbered STRUCTURE sub-items had in
# the original file is not recoverable from this view — confirm before use.
SYSTEM_PROMPT = """You are an expert technical writer specializing in optical networking, transceiver technology, and network infrastructure.

STRICT CONSTRAINTS — Follow exactly, no exceptions:
- LENGTH: 700–1000 words. Count carefully. Stop at 1000 words maximum.
- STRUCTURE (mandatory, in this order):
1. HOOK paragraph — 2–3 sentences stating the problem this post addresses
2. Technical sections — 3–4 H2 sections covering the topic in depth
3. PRACTICAL TAKEAWAYS — exactly 3 bullet points, actionable
- TOPIC DISCIPLINE: Write ONLY about the exact topic requested. Zero drift.
- NO REPETITION: Every sentence must add new information. No restating.
- VOICE: Confident, direct. No hedging phrases like "it's worth noting".
- AUDIENCE: Network engineers and IT professionals. Assume technical fluency.
- FORMAT: Markdown. Use ## for section headers. Use **bold** for key terms.

Do not summarize what you are about to write. Start with the hook directly."""
|
||
|
||
# ─── Topic list ──────────────────────────────────────────────────────────────
|
||
# 197 topics: transceiver tech + networking + RIPE/APNIC + routing + data center
|
||
|
||
# Each entry is one blog post to generate:
#   "topic"    — the post title handed to the model verbatim
#   "category" — coarse grouping label for the topic
#   "audience" — "customer" or "engineer"; selects the audience description
#                used when the user prompt is built
TOPICS: list[dict[str, str]] = [
    # ── Transceiver form factors ──
    {"topic": "QSFP-DD vs OSFP: Which 400G Form Factor Wins in 2026", "category": "transceiver", "audience": "customer"},
    {"topic": "SFP vs SFP+: Why the Upgrade Still Matters for 10G Deployments", "category": "transceiver", "audience": "customer"},
    {"topic": "CFP2-DCO: Pluggable Coherent Optics for Metro Networks Explained", "category": "transceiver", "audience": "engineer"},
    {"topic": "800G OSFP Transceivers: What Network Architects Need to Know", "category": "transceiver", "audience": "engineer"},
    {"topic": "QSFP28 vs QSFP56: Migrating Your 100G Infrastructure to 200G", "category": "transceiver", "audience": "customer"},
    {"topic": "XFP vs SFP+: When the Legacy Form Factor Still Makes Sense", "category": "transceiver", "audience": "customer"},
    {"topic": "Micro-QSFP and SFP-DD: Small Form Factor Optics for High-Density Switching", "category": "transceiver", "audience": "engineer"},
    {"topic": "400G QSFP-DD800: Breaking the 400G Barrier with 8×50G PAM4", "category": "transceiver", "audience": "engineer"},
    {"topic": "DSFP: The Emerging Dual Small Form Factor and Its Use Cases", "category": "transceiver", "audience": "customer"},
    {"topic": "Form Factor Migration: Planning Your Network for 400G to 800G", "category": "transceiver", "audience": "engineer"},

    # ── Speed and wavelength ──
    {"topic": "100G vs 400G Transceivers: Total Cost of Ownership Compared", "category": "transceiver", "audience": "customer"},
    {"topic": "400G SR4 vs LR4 vs PSM4: Choosing the Right 400G Optic", "category": "transceiver", "audience": "engineer"},
    {"topic": "1.6T Transceivers: What CPO and On-Board Optics Mean for Data Centers", "category": "transceiver", "audience": "engineer"},
    {"topic": "CWDM4 vs FR4 vs DR4+: The 40km 400G Transceiver Landscape", "category": "transceiver", "audience": "engineer"},
    {"topic": "LR-Lite Transceivers: The 2km 100G Option Operators Actually Use", "category": "transceiver", "audience": "engineer"},
    {"topic": "50G PAM4 vs NRZ: Why Modulation Format Matters for Your SFP56", "category": "transceiver", "audience": "engineer"},
    {"topic": "10G DWDM Transceivers for Metro: A Practical Deployment Guide", "category": "transceiver", "audience": "customer"},
    {"topic": "ZR vs ZR+: The 400G Long-Haul Transceiver Showdown", "category": "transceiver", "audience": "engineer"},
    {"topic": "800G SR8 Transceivers: Short-Reach Options for Hyperscale Data Centers", "category": "transceiver", "audience": "engineer"},
    {"topic": "400G FR4 Transceivers: The Workhorse of Campus and DCI Networks", "category": "transceiver", "audience": "customer"},

    # ── Coherent optics ──
    {"topic": "Coherent vs Direct Detect: Which Technology for Your DCI Link?", "category": "transceiver", "audience": "engineer"},
    {"topic": "OpenROADM and Open Coherent: Breaking Vendor Lock-In in Long-Haul", "category": "transceiver", "audience": "engineer"},
    {"topic": "400G ZR/ZR+ for Enterprise DCI: Configuration and Gotchas", "category": "transceiver", "audience": "engineer"},
    {"topic": "Coherent DSP Chips: ACO vs ICO vs DCO Pluggable Architecture", "category": "transceiver", "audience": "engineer"},
    {"topic": "Submarine Cable Coherent Optics: From 100G to 800G Capacity", "category": "transceiver", "audience": "engineer"},
    {"topic": "Installed Base: When to Upgrade Coherent Infrastructure to 400G", "category": "transceiver", "audience": "customer"},

    # ── Compatibility and vendors ──
    {"topic": "Third-Party Transceivers: The Real Risk vs. Cost Argument in 2026", "category": "transceiver", "audience": "customer"},
    {"topic": "Cisco vs Juniper Transceiver Lock-In: What Your Contract Says", "category": "transceiver", "audience": "customer"},
    {"topic": "OEM vs Compatible Optics: Decoding the Validation Process", "category": "transceiver", "audience": "customer"},
    {"topic": "Transceiver Compatibility Matrices: How to Read Them Without Getting Burned", "category": "transceiver", "audience": "customer"},
    {"topic": "FLEXOPTIX Programmable Optics: One SKU, Any Vendor, Any Config", "category": "transceiver", "audience": "customer"},
    {"topic": "Gray Market Transceivers: How to Spot Counterfeit Optics Before They Fail", "category": "transceiver", "audience": "customer"},
    {"topic": "Arista vs Cisco Transceiver Policy: Which Vendor is More Open?", "category": "transceiver", "audience": "customer"},
    {"topic": "Transceiver Procurement for Service Providers: RFQ Checklist", "category": "transceiver", "audience": "customer"},

    # ── Fiber and physical layer ──
    {"topic": "OS2 vs OM4 vs OM5: Fiber Type Selection for Your Speed Upgrade", "category": "transceiver", "audience": "customer"},
    {"topic": "Fiber Insertion Loss Budget: How to Calculate Before You Buy Optics", "category": "transceiver", "audience": "engineer"},
    {"topic": "MTP/MPO vs LC vs SC Connectors: Fiber Cabling for High-Density Racks", "category": "transceiver", "audience": "customer"},
    {"topic": "Bend-Insensitive Fiber (BIF): When OM5 Doesn't Cut It for Data Center", "category": "transceiver", "audience": "engineer"},
    {"topic": "Fiber Dispersion: PMD and CD Compensation in 400G+ Links", "category": "transceiver", "audience": "engineer"},

    # ── Data center networking ──
    {"topic": "Spine-Leaf Architecture: Transceiver Strategy for 400G Data Centers", "category": "datacenter", "audience": "engineer"},
    {"topic": "Co-Packaged Optics (CPO): Why 2026 Is the Inflection Point", "category": "datacenter", "audience": "engineer"},
    {"topic": "Hyperscale vs Enterprise: Different Transceiver Buying Strategies", "category": "datacenter", "audience": "engineer"},
    {"topic": "Data Center Interconnect: Selecting Optics for Your DCI Budget", "category": "datacenter", "audience": "customer"},
    {"topic": "Active vs Passive DAC Cables: When Direct-Attach Beats Transceivers", "category": "datacenter", "audience": "customer"},
    {"topic": "AOC Cables: Active Optical Cable Use Cases in 2026 Data Centers", "category": "datacenter", "audience": "customer"},
    {"topic": "Power Efficiency in Optics: How Watt-per-Bit Changes Your TCO", "category": "datacenter", "audience": "engineer"},
    {"topic": "Silicon Photonics: How Intel and Broadcom Are Reshaping Transceiver Design", "category": "datacenter", "audience": "engineer"},
    {"topic": "AI/ML Infrastructure: Networking Requirements for GPU Clusters", "category": "datacenter", "audience": "engineer"},
    {"topic": "400G Switch Fabric Design: Oversubscription Ratios and Transceiver Placement", "category": "datacenter", "audience": "engineer"},

    # ── Routing and BGP ──
    {"topic": "BGP Route Leaks: Detection, Impact, and Prevention in 2026", "category": "routing", "audience": "engineer"},
    {"topic": "RPKI Route Origin Validation: A Practical Deployment Guide", "category": "routing", "audience": "engineer"},
    {"topic": "BGP Prefix Hijacking: How Attackers Exploit Routing and How to Stop Them", "category": "routing", "audience": "engineer"},
    {"topic": "BGP ASPA: The Next Layer of Route Security After RPKI", "category": "routing", "audience": "engineer"},
    {"topic": "BGP Flowspec: Traffic Engineering and DDoS Mitigation in One Protocol", "category": "routing", "audience": "engineer"},
    {"topic": "BGP Communities: A Practical Operator's Guide to Traffic Steering", "category": "routing", "audience": "engineer"},
    {"topic": "BGP Large Communities RFC 8092: Why Your NOC Needs This Now", "category": "routing", "audience": "engineer"},
    {"topic": "Graceful Restart and LLGR: Keeping BGP Sessions Alive During Maintenance", "category": "routing", "audience": "engineer"},
    {"topic": "BGP Add-Path: Solving the Best-Path Problem in Multi-homed Networks", "category": "routing", "audience": "engineer"},
    {"topic": "Route Reflectors vs Route Servers: iBGP Scaling for Large Networks", "category": "routing", "audience": "engineer"},
    {"topic": "BGPsec: Why RPKI's Successor Is Still Waiting for Deployment", "category": "routing", "audience": "engineer"},
    {"topic": "IS-IS vs OSPF: Choosing an IGP for Your Service Provider Network", "category": "routing", "audience": "engineer"},
    {"topic": "MPLS Traffic Engineering: Still Relevant in the SR-MPLS Era?", "category": "routing", "audience": "engineer"},
    {"topic": "Segment Routing (SR-MPLS and SRv6): Which One for Your Backbone?", "category": "routing", "audience": "engineer"},
    {"topic": "BFD: Bidirectional Forwarding Detection for Fast Failure Recovery", "category": "routing", "audience": "engineer"},
    {"topic": "EVPN: The Definitive Guide to Data Center and WAN BGP Extensions", "category": "routing", "audience": "engineer"},
    {"topic": "QoS in IP Networks: Traffic Marking, Shaping, and Policing Explained", "category": "routing", "audience": "engineer"},
    {"topic": "FlowSpec vs RTBH: Choosing the Right DDoS Mitigation Tool", "category": "routing", "audience": "engineer"},
    {"topic": "BGP Monitoring Protocol (BMP): Real-Time Route Collection for NOCs", "category": "routing", "audience": "engineer"},
    {"topic": "OpenConfig and YANG: Network Automation That Actually Works", "category": "routing", "audience": "engineer"},

    # ── IPv6 ──
    {"topic": "IPv6 Deployment for ISPs: 12 Steps from Planning to Production", "category": "ipv6", "audience": "engineer"},
    {"topic": "IPv4 Exhaustion in 2026: What Service Providers Must Do Now", "category": "ipv6", "audience": "engineer"},
    {"topic": "CGNAT: Why It's a Bad Fix for IPv4 Exhaustion and What to Use Instead", "category": "ipv6", "audience": "engineer"},
    {"topic": "Dual-Stack vs 464XLAT vs NAT64: IPv6 Transition Mechanisms Compared", "category": "ipv6", "audience": "engineer"},
    {"topic": "IPv6 Prefix Delegation: PD Configuration for ISP Customer Networks", "category": "ipv6", "audience": "engineer"},
    {"topic": "IPv6 Security: Attack Vectors That Don't Exist in IPv4", "category": "ipv6", "audience": "engineer"},
    {"topic": "IPv6 Address Planning: How to Structure /32 Allocation for Growth", "category": "ipv6", "audience": "engineer"},
    {"topic": "Mobile IPv6 and 5G: How Carrier Networks Handle Mobility at Scale", "category": "ipv6", "audience": "engineer"},
    {"topic": "World IPv6 Launch: Where Are We 13 Years Later?", "category": "ipv6", "audience": "engineer"},
    {"topic": "IPv6 ROA and RPKI: Securing Your IPv6 Routing from Day One", "category": "ipv6", "audience": "engineer"},

    # ── Internet infrastructure and RIR/APNIC/RIPE ──
    {"topic": "APNIC and Asia-Pacific IPv6 Leadership: What the Data Shows", "category": "infrastructure", "audience": "engineer"},
    {"topic": "RIPE NCC Resource Certification: How to Get Your RPKI Right", "category": "infrastructure", "audience": "engineer"},
    {"topic": "Internet Exchange Points: Why IXPs Are Critical Infrastructure", "category": "infrastructure", "audience": "engineer"},
    {"topic": "ARIN vs RIPE vs APNIC: How IP Address Policies Differ by Region", "category": "infrastructure", "audience": "engineer"},
    {"topic": "BGP Looking Glass Tools: How to Debug Routing Problems Remotely", "category": "infrastructure", "audience": "engineer"},
    {"topic": "Peering vs Transit: The Economics of Internet Interconnection", "category": "infrastructure", "audience": "engineer"},
    {"topic": "DE-CIX, AMS-IX, LINX: The IXPs That Move Europe's Internet", "category": "infrastructure", "audience": "engineer"},
    {"topic": "Route Server Best Practices for IXP Operators", "category": "infrastructure", "audience": "engineer"},
    {"topic": "MANRS: Mutually Agreed Norms for Routing Security in 2026", "category": "infrastructure", "audience": "engineer"},
    {"topic": "Internet Shutdowns: Technical Analysis of BGP Withdrawal Patterns", "category": "infrastructure", "audience": "engineer"},
    {"topic": "Submarine Cable Systems: Routing Resilience for Island Networks", "category": "infrastructure", "audience": "engineer"},
    {"topic": "RDAP vs WHOIS: The Modern Way to Query IP and Domain Ownership", "category": "infrastructure", "audience": "engineer"},
    {"topic": "DNS Anycast: How Root Servers Handle 50 Billion Queries Per Day", "category": "infrastructure", "audience": "engineer"},
    {"topic": "DNSSEC: Deployment Status and Why Operators Still Hesitate", "category": "infrastructure", "audience": "engineer"},
    {"topic": "RIPE Atlas: Using Distributed Probes to Measure Internet Reachability", "category": "infrastructure", "audience": "engineer"},
    {"topic": "NTP Security: How BGP Leaks Can Desync Your Infrastructure Clocks", "category": "infrastructure", "audience": "engineer"},
    {"topic": "Internet Routing Registry (IRR): Why It's Messy and What to Do About It", "category": "infrastructure", "audience": "engineer"},
    {"topic": "CDN Architecture: How Akamai and Cloudflare Use BGP for Global Delivery", "category": "infrastructure", "audience": "engineer"},
    {"topic": "Anycast BGP for DDoS Mitigation: A NOC Operator's Guide", "category": "infrastructure", "audience": "engineer"},
    {"topic": "Public Peer vs Private Peer: IXP Negotiation Strategy for ISPs", "category": "infrastructure", "audience": "engineer"},

    # ── Network operations ──
    {"topic": "SNMP vs gRPC Telemetry: Modernizing Your NOC Monitoring Stack", "category": "operations", "audience": "engineer"},
    {"topic": "NetFlow vs IPFIX vs sFlow: Choosing Traffic Analytics for Your Network", "category": "operations", "audience": "engineer"},
    {"topic": "Optical Power Budget: How to Diagnose Fiber Link Problems Fast", "category": "operations", "audience": "engineer"},
    {"topic": "OTDR Testing: Reading Loss Traces for Fiber Troubleshooting", "category": "operations", "audience": "engineer"},
    {"topic": "DDOS Mitigation at Scale: BGP Blackhole and Scrubbing Centers", "category": "operations", "audience": "engineer"},
    {"topic": "Network Change Management: Avoiding Outages During Maintenance Windows", "category": "operations", "audience": "engineer"},
    {"topic": "MPLS LDP vs RSVP-TE: Label Distribution Protocol Comparison", "category": "operations", "audience": "engineer"},
    {"topic": "Transceiver DOM Monitoring: What DDM Data Tells You Before Links Fail", "category": "operations", "audience": "engineer"},
    {"topic": "NOC Alert Fatigue: Structuring Alerts to Avoid the Cry-Wolf Effect", "category": "operations", "audience": "engineer"},
    {"topic": "Fiber Cuts: Incident Response Procedures for Backbone Operators", "category": "operations", "audience": "engineer"},
    {"topic": "MTTR vs MTBF: Optical Transceiver Reliability Metrics That Matter", "category": "operations", "audience": "customer"},
    {"topic": "Optics Inventory Management: How to Avoid a Spare-Parts Crisis", "category": "operations", "audience": "customer"},
    {"topic": "Transceiver Firmware Upgrades: Risk Management and Rollback Plans", "category": "operations", "audience": "engineer"},
    {"topic": "Network Automation with Ansible and NAPALM: Practical Getting Started Guide", "category": "operations", "audience": "engineer"},
    {"topic": "gNMI and gNOI: Google's Contribution to Network Operations APIs", "category": "operations", "audience": "engineer"},

    # ── Security ──
    {"topic": "BGP Hijack Case Studies: Real Incidents and Their Technical Aftermath", "category": "security", "audience": "engineer"},
    {"topic": "RPKI ROA vs ASPA vs BGPsec: The Routing Security Stack in 2026", "category": "security", "audience": "engineer"},
    {"topic": "DDoS Amplification via DNS and NTP: How It Works and How to Block It", "category": "security", "audience": "engineer"},
    {"topic": "Supply Chain Attacks on Network Hardware: Counterfeit Optics and Beyond", "category": "security", "audience": "engineer"},
    {"topic": "BGP Route Filtering: RPKI-Invalid Drop vs Just-Logging", "category": "security", "audience": "engineer"},
    {"topic": "Network Segmentation: How Optical Transceivers Factor into Zero-Trust", "category": "security", "audience": "engineer"},

    # ── Market and business ──
    {"topic": "Transceiver Market 2026: 400G Adoption Rates and What's Driving 800G", "category": "market", "audience": "customer"},
    {"topic": "Photonics Supply Chain: TSMC, II-VI, and the Chip Shortage Aftermath", "category": "market", "audience": "customer"},
    {"topic": "Price Comparison: QSFP-DD 400G ZR from 8 Vendors — Who Wins?", "category": "market", "audience": "customer"},
    {"topic": "Hyperscaler Buying Power: How Meta and AWS Shape the Transceiver Market", "category": "market", "audience": "customer"},
    {"topic": "Transceiver Leasing vs Buying: CapEx vs OpEx Decision Framework", "category": "market", "audience": "customer"},
    {"topic": "Optical Networking M&A: Coherent, II-VI, Lumentum — What the Consolidation Means", "category": "market", "audience": "customer"},
    {"topic": "Open Networking: SONiC Adoption and the Disaggregation Trend in 2026", "category": "market", "audience": "engineer"},
    {"topic": "Whitebox Switching and Merchant Silicon: The Business Case for Operators", "category": "market", "audience": "engineer"},
    {"topic": "Transceiver Pricing Trends: When Does 400G Hit the 10G Price Point?", "category": "market", "audience": "customer"},
    {"topic": "AI Networking Demand: How LLM Training Clusters Are Reshaping Optics Sales", "category": "market", "audience": "customer"},

    # ── Standards and MSAs ──
    {"topic": "IEEE 802.3bs and 400GbE: The Standard That Enabled QSFP-DD", "category": "standards", "audience": "engineer"},
    {"topic": "OIF 400ZR Implementation Agreement: What It Means for DCI Deployments", "category": "standards", "audience": "engineer"},
    {"topic": "MSA Compliance: How Multi-Source Agreements Enable Interoperability", "category": "standards", "audience": "customer"},
    {"topic": "SFF-8024: The Transceiver Identifier Standard Explained", "category": "standards", "audience": "engineer"},
    {"topic": "IEEE 802.3cu 100G FR and LR: Simplifying 100G Beyond 10km", "category": "standards", "audience": "engineer"},
    {"topic": "CMIS 5.0: The Management Interface That Unlocks 800G OSFP Features", "category": "standards", "audience": "engineer"},
    {"topic": "800G Standards Landscape: QSFP-DD800, OSFP, and What Comes Next", "category": "standards", "audience": "engineer"},
    {"topic": "CWDM vs DWDM Wavelength Plans: ITU Grid Selection for Metro and Long-Haul", "category": "standards", "audience": "engineer"},
    {"topic": "OpenZR+ vs OIF-400ZR: The 400G Coherent Protocol War", "category": "standards", "audience": "engineer"},
    {"topic": "Photonic Integrated Circuits: SiPh, InP, and the Future of Transceiver Design", "category": "standards", "audience": "engineer"},

    # ── SONiC and open networking ──
    {"topic": "SONiC Architecture: How Microsoft's Switch OS Works Under the Hood", "category": "opennet", "audience": "engineer"},
    {"topic": "SONiC vs Cumulus vs OpenWrt: Choosing an Open NOS for Your Lab", "category": "opennet", "audience": "engineer"},
    {"topic": "SONiC SAI API: Abstracting Hardware Across ASICs and Vendors", "category": "opennet", "audience": "engineer"},
    {"topic": "Open Compute Project (OCP): How Facebook is Driving Network Disaggregation", "category": "opennet", "audience": "engineer"},
    {"topic": "P4 Programming: The Future of Programmable Data Plane Networking", "category": "opennet", "audience": "engineer"},
    {"topic": "SmartNIC and DPU: Offloading Network Functions from CPUs", "category": "opennet", "audience": "engineer"},

    # ── Emerging topics ──
    {"topic": "Quantum Key Distribution (QKD) over Fiber: Network Integration Challenges", "category": "emerging", "audience": "engineer"},
    {"topic": "Space Optical Communications: LEO Constellation Intersatellite Links", "category": "emerging", "audience": "engineer"},
    {"topic": "400G and Beyond for RAN Fronthaul: O-RAN Transceiver Requirements", "category": "emerging", "audience": "engineer"},
    {"topic": "Edge Computing: Optical Networking Requirements for 5G MEC", "category": "emerging", "audience": "engineer"},
    {"topic": "AI-Driven Network Management: Using LLMs for Optical Layer Analysis", "category": "emerging", "audience": "engineer"},
    {"topic": "Green Networking: Power Consumption Optimization for Optical Infrastructure", "category": "emerging", "audience": "engineer"},
    {"topic": "Liquid Cooling and Photonics: How Thermal Management Changes at 800G", "category": "emerging", "audience": "engineer"},
    {"topic": "Reconfigurable Optical Add-Drop Multiplexers (ROADM): WSS Architecture Guide", "category": "emerging", "audience": "engineer"},
    {"topic": "Optical Time Domain Reflectometry in Automated NOC Workflows", "category": "emerging", "audience": "engineer"},
    {"topic": "Optical Amplifiers: EDFA vs Raman vs SOA — When Each One Applies", "category": "emerging", "audience": "engineer"},

    # ── Regional and service provider ──
    {"topic": "African Internet Infrastructure: Submarine Cables and Terrestrial Fiber Gaps", "category": "regional", "audience": "engineer"},
    {"topic": "APAC Data Center Boom: Transceiver Requirements for Singapore and Tokyo Hubs", "category": "regional", "audience": "customer"},
    {"topic": "European 5G Backbone: Optical Transceiver Demand Through 2028", "category": "regional", "audience": "customer"},
    {"topic": "Latin America ISP Connectivity: Low-Cost 100G Options for Emerging Markets", "category": "regional", "audience": "customer"},
    {"topic": "Middle East Data Center Growth: IXP and Optical Infrastructure Investments", "category": "regional", "audience": "customer"},
    {"topic": "Rural Broadband Access: Optical Technologies for the Last Mile", "category": "regional", "audience": "customer"},
    {"topic": "GÉANT Research Network: How European Academia Runs 100Tbps+ at Scale", "category": "regional", "audience": "engineer"},
    {"topic": "Carrier Ethernet Services: MEF Framework for Wholesale Optical Transport", "category": "regional", "audience": "engineer"},

    # ── How-to guides ──
    {"topic": "How to Read a Transceiver Datasheet: Key Specs That Actually Matter", "category": "howto", "audience": "customer"},
    {"topic": "How to Choose the Right Transceiver for a 10km Single-Mode Link", "category": "howto", "audience": "customer"},
    {"topic": "How to Configure a 400G ZR Link Between Two Routers", "category": "howto", "audience": "engineer"},
    {"topic": "How to Implement RPKI on Cisco IOS-XR: Step-by-Step Guide", "category": "howto", "audience": "engineer"},
    {"topic": "How to Set Up BGP Communities for Traffic Engineering", "category": "howto", "audience": "engineer"},
    {"topic": "How to Diagnose an Optical Link Failure Using DOM Data", "category": "howto", "audience": "engineer"},
    {"topic": "How to Compare Transceiver Prices Across Vendors Without Getting Scammed", "category": "howto", "audience": "customer"},
    {"topic": "How to Plan Fiber Capacity for a 5-Year Data Center Expansion", "category": "howto", "audience": "customer"},
    {"topic": "How to Implement EVPN VXLAN in a Spine-Leaf Data Center", "category": "howto", "audience": "engineer"},
    {"topic": "How to Write a Network RFP for Optical Transceiver Procurement", "category": "howto", "audience": "customer"},
    {"topic": "How to Migrate from NRZ to PAM4: A Practical Network Engineer's Guide", "category": "howto", "audience": "engineer"},
    {"topic": "How to Calculate Power Budget for a 100G Long-Haul Link", "category": "howto", "audience": "engineer"},
    {"topic": "How to Set Up Streaming Telemetry with gNMI and InfluxDB", "category": "howto", "audience": "engineer"},
    {"topic": "How to Deploy SONiC in a Production Data Center: Lessons from 6 Months", "category": "howto", "audience": "engineer"},
    {"topic": "How to Evaluate Third-Party Transceivers Before Buying 500 Units", "category": "howto", "audience": "customer"},

    # ── Troubleshooting ──
    {"topic": "Why Is My SFP+ Module Showing Rx Power -40dBm? Fiber Fault Diagnosis", "category": "troubleshoot", "audience": "engineer"},
    {"topic": "BGP Session Flapping: 7 Root Causes and How to Debug Each One", "category": "troubleshoot", "audience": "engineer"},
    {"topic": "Transceiver Not Recognized: Vendor Lock-In Detection and Workarounds", "category": "troubleshoot", "audience": "customer"},
    {"topic": "High BER on 100G Link: Signal Integrity Debugging from DOM to OTDR", "category": "troubleshoot", "audience": "engineer"},
    {"topic": "MTU Black Holes: How Jumbo Frame Mismatches Kill Network Performance", "category": "troubleshoot", "audience": "engineer"},
    {"topic": "OSPF Adjacency Issues: Debugging Area Type Mismatches and Hello Timers", "category": "troubleshoot", "audience": "engineer"},

    # ── Comparison and decision guides ──
    {"topic": "Cisco ASR 9000 vs Nokia 7750: Backbone Router Optics Ecosystem", "category": "comparison", "audience": "customer"},
    {"topic": "Arista 7800 vs Juniper QFX10000: Data Center Fabric Optics Comparison", "category": "comparison", "audience": "customer"},
    {"topic": "Vendor Lock-In Scorecard: Cisco vs Juniper vs Arista in 2026", "category": "comparison", "audience": "customer"},
    {"topic": "100G LR4 vs ER4: When to Pay for 40km Reach", "category": "comparison", "audience": "customer"},
    {"topic": "NaaS vs DIY: Network-as-a-Service vs Owning Your Own Optical Infrastructure", "category": "comparison", "audience": "customer"},
    {"topic": "Coherent Pluggables vs Fixed-Line Transponders: TCO for Service Providers", "category": "comparison", "audience": "customer"},

    # ── Future and innovation ──
    {"topic": "1.6T Optics Timeline: When Will 1.6 Terabit Transceivers Hit Production?", "category": "future", "audience": "engineer"},
    {"topic": "All-Optical Networks: The Dream of Photonic Switching Without O-E-O", "category": "future", "audience": "engineer"},
    {"topic": "Post-Quantum Cryptography in Network Infrastructure: Timing the Transition", "category": "future", "audience": "engineer"},
    {"topic": "AI-Predicted Network Failures: How ML Is Entering Optical Layer Management", "category": "future", "audience": "engineer"},
    {"topic": "Intent-Based Networking: BGP and Optics Policy Automation in 2026", "category": "future", "audience": "engineer"},
    {"topic": "The 10 Year Horizon: How Optical Networking Will Change by 2035", "category": "future", "audience": "customer"},
]
|
||
|
||
|
||
# ─── Prompt construction ──────────────────────────────────────────────────────
|
||
|
||
def build_user_prompt(entry: dict) -> str:
    """Assemble the user-turn prompt text for one topic entry.

    Reads ``entry["topic"]`` (required) and ``entry["audience"]`` (optional,
    treated as ``"engineer"`` when absent). An audience key that is not one of
    the known readerships falls back to the generic "network engineers".
    """
    readerships = {
        "customer": "IT managers, procurement teams, and operators who evaluate and buy transceivers",
        "engineer": "network engineers and architects who design and operate optical infrastructure",
    }
    audience_key = entry.get("audience", "engineer")
    reader = readerships.get(audience_key, "network engineers")
    topic = entry["topic"]

    # Paragraphs of the prompt; they are separated by one blank line each.
    paragraphs = [
        "Write a blog post on the following topic:",
        f"**Topic:** {topic}",
        f"**Target audience:** {reader}",
        "Remember: 700–1000 words, hook + technical sections + 3 takeaways. "
        "Stay strictly on-topic. No filler. Start writing now.",
    ]
    return "\n\n".join(paragraphs)


# ─── Claude subprocess ────────────────────────────────────────────────────────

def call_claude(system: str, user_prompt: str, timeout: int = 120) -> str | None:
    """
    Run the 'claude' CLI in print mode and return what it wrote to stdout.

    The system prompt is passed via --system-prompt and the user prompt via -p.
    Per the original author's note, this goes through the Claude Code
    subscription rather than API billing.

    Returns the stripped stdout text, or None when the process exits non-zero,
    prints nothing, exceeds `timeout` seconds, cannot be launched, or raises
    any other exception (each case is logged).
    """
    command = ["claude", "--print", "--system-prompt", system, "-p", user_prompt]

    try:
        proc = subprocess.run(command, capture_output=True, text=True, timeout=timeout)

        if proc.returncode != 0:
            # Only the first 200 characters of stderr are logged.
            logger.warning("claude subprocess error (rc=%d): %s", proc.returncode, proc.stderr[:200])
            return None

        generated = proc.stdout.strip()
        if generated:
            return generated

        logger.warning("claude returned empty output")
        return None
    except subprocess.TimeoutExpired:
        logger.warning("claude subprocess timed out after %ds", timeout)
        return None
    except FileNotFoundError:
        # Raised when the 'claude' executable cannot be found.
        logger.error("claude CLI not found — install Claude Code")
        return None
    except Exception as exc:
        # Last-resort boundary: any other failure is reported as "no output".
        logger.warning("claude subprocess unexpected error: %s", exc)
        return None


# ─── Quality validation ───────────────────────────────────────────────────────

def validate_output(text: str, topic: str) -> tuple[bool, str]:
    """
    Run cheap sanity checks over a generated blog post.

    Checks, in order:
      1. whitespace-separated word count is between 400 and 2500 inclusive,
      2. at least one line starts with a "##" markdown header,
      3. the first line is not a meta-comment about writing the post.

    `topic` is accepted for interface compatibility; none of the checks
    consult it.

    Returns (is_valid, reason); reason is "ok" when every check passes.
    """
    word_total = len(text.split())
    if word_total < 400:
        return False, f"too short: {word_total} words (min 400)"
    if word_total > 2500:
        return False, f"too long: {word_total} words (max 2500 — will be flagged)"

    # Structure check: some line must begin with a "##" heading.
    if re.search(r"^##\s+.+", text, re.MULTILINE) is None:
        return False, "missing ## section headers"

    # The post must open with real content, not chatter about the post.
    opening = text.strip().partition("\n")[0].lower()
    meta_markers = ("i'll write", "here's a", "here is a", "let me write", "blog post:")
    if any(marker in opening for marker in meta_markers):
        return False, f"starts with meta-comment: '{opening[:60]}'"

    return True, "ok"


# ─── Progress tracking ────────────────────────────────────────────────────────

def load_progress() -> set[int]:
    """Load the set of topic indices already recorded as generated.

    Reads the "completed" list from the JSON document at PROGRESS_FILE.

    Returns:
        The recorded indices, or an empty set when the file does not exist.
        If the file exists but cannot be read, is not valid JSON, or does not
        have the expected shape, a warning is logged and an empty set is
        returned (best-effort resume), so the operator can see that
        already-generated topics may be produced again.
    """
    try:
        with open(PROGRESS_FILE, encoding="utf-8") as f:
            data = json.load(f)
        return set(data.get("completed", []))
    except FileNotFoundError:
        # First run: nothing has been generated yet.
        return set()
    except (OSError, ValueError, AttributeError, TypeError) as exc:
        # ValueError covers json.JSONDecodeError and decoding errors;
        # AttributeError/TypeError cover a document of the wrong shape.
        logger.warning(
            "Ignoring unreadable progress file %s (%s); starting with no completed topics",
            PROGRESS_FILE,
            exc,
        )
        return set()
|
||
|
||
|
||
def save_progress(completed: set[int]) -> None:
    """Persist the completed topic indices to PROGRESS_FILE as JSON.

    The document has the form {"completed": [sorted indices], "total": N},
    where N is len(TOPICS).

    The JSON is written to a sibling "<name>.tmp" file first and then renamed
    over PROGRESS_FILE, so an interruption in the middle of a write cannot
    leave a truncated progress file behind.
    """
    payload = {"completed": sorted(completed), "total": len(TOPICS)}

    target = Path(PROGRESS_FILE)
    tmp_path = target.with_name(target.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(payload, f)
    # Rename within the same directory replaces the old file in one step.
    tmp_path.replace(target)


# ─── Main generation loop ─────────────────────────────────────────────────────

def generate(start: int = 0, end: int | None = None, dry_run: bool = False) -> None:
    """Generate blog posts for TOPICS[start:end] and append them to OUTPUT_FILE.

    Args:
        start: Index of the first topic to process.
        end: Index to stop at (exclusive); None means through the last topic.
        dry_run: When True, only print the selected topics and return.

    For each selected topic not already recorded by load_progress(), the
    prompt is built, call_claude() is invoked with SYSTEM_PROMPT (180 s
    timeout), and the output is checked with validate_output(). Every output
    that was obtained is appended to OUTPUT_FILE as one JSON line, including
    outputs that fail validation (marked through meta.valid / meta.reason),
    and its index is stored via save_progress(). Topics whose call returned
    nothing are counted as failed and are not marked completed.

    In the final summary, "Generated" counts every record written, so records
    that failed validation are counted under both "Generated" and "Invalid".
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Only a missing end means "run to the last topic"; testing truthiness
    # would also turn an explicit end of 0 into a full run.
    if end is None:
        end = len(TOPICS)
    topics_to_run = TOPICS[start:end]

    if dry_run:
        print(f"DRY RUN: would generate {len(topics_to_run)} topics ({start}–{end})")
        for idx, t in enumerate(topics_to_run, start=start):
            print(f" [{idx:03d}] [{t['category']:12s}] {t['topic']}")
        return

    completed = load_progress()
    logger.info("Resuming: %d/%d already done", len(completed), len(TOPICS))

    stats = {"generated": 0, "skipped": 0, "failed": 0, "invalid": 0}
    last_idx = len(TOPICS) - 1

    with open(OUTPUT_FILE, "a", encoding="utf-8") as out_f:
        for idx, entry in enumerate(topics_to_run, start=start):
            if idx in completed:
                logger.info("[%03d/%03d] SKIP (already done): %s", idx, last_idx, entry["topic"])
                stats["skipped"] += 1
                continue

            logger.info("[%03d/%03d] Generating: %s", idx, last_idx, entry["topic"])

            user_prompt = build_user_prompt(entry)
            output_text = call_claude(SYSTEM_PROMPT, user_prompt, timeout=180)

            if output_text is None:
                logger.warning("[%03d] FAILED to get output", idx)
                stats["failed"] += 1
                # Brief pause before moving on. The index is not marked
                # completed, so a later run will attempt this topic again.
                time.sleep(5)
                continue

            is_valid, reason = validate_output(output_text, entry["topic"])
            word_count = len(output_text.split())
            if is_valid:
                logger.info("[%03d] OK: %d words", idx, word_count)
            else:
                # Invalid outputs are still saved, flagged in the record's meta.
                logger.warning("[%03d] INVALID (%s): %s", idx, reason, entry["topic"])
                stats["invalid"] += 1
                logger.warning("[%03d] Saving anyway: %d words", idx, word_count)

            record = {
                "system_prompt": SYSTEM_PROMPT,
                # Store exactly the prompt that was sent to the CLI.
                "input_text": user_prompt,
                "output_text": output_text,
                "meta": {
                    "topic": entry["topic"],
                    "category": entry["category"],
                    "audience": entry["audience"],
                    "word_count": word_count,
                    "valid": is_valid,
                    "reason": reason,
                    "generated_by": "claude-code-subprocess",
                    # NOTE(review): hard-coded label; confirm it matches the
                    # model the CLI actually runs.
                    "model": "claude-sonnet",
                    "dataset_version": "v7",
                },
            }

            # Flush each line so finished records survive an interrupted run.
            out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
            out_f.flush()

            completed.add(idx)
            save_progress(completed)
            stats["generated"] += 1

            # Small pause to avoid overwhelming claude subprocess
            time.sleep(2)

    logger.info(
        "Done! Generated: %d | Skipped: %d | Failed: %d | Invalid: %d",
        stats["generated"],
        stats["skipped"],
        stats["failed"],
        stats["invalid"],
    )
    logger.info("Output: %s", OUTPUT_FILE)


# ─── CLI ──────────────────────────────────────────────────────────────────────

def main() -> None:
    """Parse the command-line flags and run the requested action.

    With --list-categories, prints the total topic count followed by the
    number of topics per category (largest first) and exits. Otherwise
    forwards --start, --end and --dry-run to generate().
    """
    cli = argparse.ArgumentParser(description="fo-blog-v7 training data generator")
    cli.add_argument("--start", type=int, default=0, help="Start at topic index")
    cli.add_argument("--end", type=int, default=None, help="Stop at topic index (exclusive)")
    cli.add_argument("--dry-run", action="store_true", help="List topics without generating")
    cli.add_argument("--list-categories", action="store_true", help="Show category distribution")
    opts = cli.parse_args()

    if not opts.list_categories:
        generate(start=opts.start, end=opts.end, dry_run=opts.dry_run)
        return

    from collections import Counter

    tally = Counter(topic["category"] for topic in TOPICS)
    print(f"Total topics: {len(TOPICS)}")
    # most_common() orders by descending count, ties in first-seen order.
    for name, n_topics in tally.most_common():
        print(f" {name:15s}: {n_topics:3d}")
|
||
|
||
|
||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|