#!/usr/bin/env python3
"""
generate_v7_data.py — fo-blog-v7 training data generator

Uses 'claude --print -p' subprocess to generate about 200 high-quality,
properly constrained blog posts covering transceiver tech + networking topics.

Key improvements over v6 training data:
  1. Anchored system prompt with STRICT length (700-1000w) and structure constraints
  2. Diverse topics: not just transceivers — BGP, IPv6, RIPE/APNIC, data center, etc.
  3. Full articles as output_text (not keyword stubs)
  4. Topic match enforced via explicit input format

Output: ~/transceiver-training-data/v7-generated-sft.jsonl

Usage:
  python3 scripts/generate_v7_data.py
  python3 scripts/generate_v7_data.py --start 50 --end 100   # resume
  python3 scripts/generate_v7_data.py --dry-run               # show topics only
  python3 scripts/generate_v7_data.py --list-categories       # category counts
"""

from __future__ import annotations

import argparse
import json
import logging
import re
import shutil
import subprocess
import sys
import time
from collections import Counter
from pathlib import Path

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)

# ─── Output paths ────────────────────────────────────────────────────────────
OUTPUT_DIR = Path.home() / "transceiver-training-data"
OUTPUT_FILE = OUTPUT_DIR / "v7-generated-sft.jsonl"
PROGRESS_FILE = OUTPUT_DIR / "v7-progress.json"

# ─── Anchored system prompt ───────────────────────────────────────────────────
# This is the KEY improvement for v7: the model must learn these constraints
# are non-negotiable, not suggestions.
SYSTEM_PROMPT = """You are an expert technical writer specializing in optical networking,
transceiver technology, and network infrastructure.

STRICT CONSTRAINTS — Follow exactly, no exceptions:
- LENGTH: 700–1000 words. Count carefully. Stop at 1000 words maximum.
- STRUCTURE (mandatory, in this order):
  1. HOOK paragraph — 2–3 sentences stating the problem this post addresses
  2. Technical sections — 3–4 H2 sections covering the topic in depth
  3. PRACTICAL TAKEAWAYS — exactly 3 bullet points, actionable
- TOPIC DISCIPLINE: Write ONLY about the exact topic requested. Zero drift.
- NO REPETITION: Every sentence must add new information. No restating.
- VOICE: Confident, direct. No hedging phrases like "it's worth noting".
- AUDIENCE: Network engineers and IT professionals. Assume technical fluency.
- FORMAT: Markdown. Use ## for section headers. Use **bold** for key terms.

Do not summarize what you are about to write. Start with the hook directly."""

# ─── Topic list ──────────────────────────────────────────────────────────────
# About 200 topics: transceiver tech + networking + RIPE/APNIC + routing + data center.
# The position of an entry in this list is its progress index, so append new
# topics at the end rather than inserting them, or resume state will shift.
TOPICS: list[dict[str, str]] = [
    # ── Transceiver form factors ──────────────────────────────────────────
    {"topic": "QSFP-DD vs OSFP: Which 400G Form Factor Wins in 2026", "category": "transceiver", "audience": "customer"},
    {"topic": "SFP vs SFP+: Why the Upgrade Still Matters for 10G Deployments", "category": "transceiver", "audience": "customer"},
    {"topic": "CFP2-DCO: Pluggable Coherent Optics for Metro Networks Explained", "category": "transceiver", "audience": "engineer"},
    {"topic": "800G OSFP Transceivers: What Network Architects Need to Know", "category": "transceiver", "audience": "engineer"},
    {"topic": "QSFP28 vs QSFP56: Migrating Your 100G Infrastructure to 200G", "category": "transceiver", "audience": "customer"},
    {"topic": "XFP vs SFP+: When the Legacy Form Factor Still Makes Sense", "category": "transceiver", "audience": "customer"},
    {"topic": "Micro-QSFP and SFP-DD: Small Form Factor Optics for High-Density Switching", "category": "transceiver", "audience": "engineer"},
    {"topic": "400G QSFP-DD800: Breaking the 400G Barrier with 8×50G PAM4", "category": "transceiver", "audience": "engineer"},
    {"topic": "DSFP: The Emerging Dual Small Form Factor and Its Use Cases", "category": "transceiver", "audience": "customer"},
    {"topic": "Form Factor Migration: Planning Your Network for 400G to 800G", "category": "transceiver", "audience": "engineer"},
    # ── Speed and wavelength ──────────────────────────────────────────────
    {"topic": "100G vs 400G Transceivers: Total Cost of Ownership Compared", "category": "transceiver", "audience": "customer"},
    {"topic": "400G SR4 vs LR4 vs PSM4: Choosing the Right 400G Optic", "category": "transceiver", "audience": "engineer"},
    {"topic": "1.6T Transceivers: What CPO and On-Board Optics Mean for Data Centers", "category": "transceiver", "audience": "engineer"},
    {"topic": "CWDM4 vs FR4 vs DR4+: The 40km 400G Transceiver Landscape", "category": "transceiver", "audience": "engineer"},
    {"topic": "LR-Lite Transceivers: The 2km 100G Option Operators Actually Use", "category": "transceiver", "audience": "engineer"},
    {"topic": "50G PAM4 vs NRZ: Why Modulation Format Matters for Your SFP56", "category": "transceiver", "audience": "engineer"},
    {"topic": "10G DWDM Transceivers for Metro: A Practical Deployment Guide", "category": "transceiver", "audience": "customer"},
    {"topic": "ZR vs ZR+: The 400G Long-Haul Transceiver Showdown", "category": "transceiver", "audience": "engineer"},
    {"topic": "800G SR8 Transceivers: Short-Reach Options for Hyperscale Data Centers", "category": "transceiver", "audience": "engineer"},
    {"topic": "400G FR4 Transceivers: The Workhorse of Campus and DCI Networks", "category": "transceiver", "audience": "customer"},
    # ── Coherent optics ───────────────────────────────────────────────────
    {"topic": "Coherent vs Direct Detect: Which Technology for Your DCI Link?", "category": "transceiver", "audience": "engineer"},
    {"topic": "OpenROADM and Open Coherent: Breaking Vendor Lock-In in Long-Haul", "category": "transceiver", "audience": "engineer"},
    {"topic": "400G ZR/ZR+ for Enterprise DCI: Configuration and Gotchas", "category": "transceiver", "audience": "engineer"},
    {"topic": "Coherent DSP Chips: ACO vs ICO vs DCO Pluggable Architecture", "category": "transceiver", "audience": "engineer"},
    {"topic": "Submarine Cable Coherent Optics: From 100G to 800G Capacity", "category": "transceiver", "audience": "engineer"},
    {"topic": "Installed Base: When to Upgrade Coherent Infrastructure to 400G", "category": "transceiver", "audience": "customer"},
    # ── Compatibility and vendors ─────────────────────────────────────────
    {"topic": "Third-Party Transceivers: The Real Risk vs. Cost Argument in 2026", "category": "transceiver", "audience": "customer"},
    {"topic": "Cisco vs Juniper Transceiver Lock-In: What Your Contract Says", "category": "transceiver", "audience": "customer"},
    {"topic": "OEM vs Compatible Optics: Decoding the Validation Process", "category": "transceiver", "audience": "customer"},
    {"topic": "Transceiver Compatibility Matrices: How to Read Them Without Getting Burned", "category": "transceiver", "audience": "customer"},
    {"topic": "FLEXOPTIX Programmable Optics: One SKU, Any Vendor, Any Config", "category": "transceiver", "audience": "customer"},
    {"topic": "Gray Market Transceivers: How to Spot Counterfeit Optics Before They Fail", "category": "transceiver", "audience": "customer"},
    {"topic": "Arista vs Cisco Transceiver Policy: Which Vendor is More Open?", "category": "transceiver", "audience": "customer"},
    {"topic": "Transceiver Procurement for Service Providers: RFQ Checklist", "category": "transceiver", "audience": "customer"},
    # ── Fiber and physical layer ──────────────────────────────────────────
    {"topic": "OS2 vs OM4 vs OM5: Fiber Type Selection for Your Speed Upgrade", "category": "transceiver", "audience": "customer"},
    {"topic": "Fiber Insertion Loss Budget: How to Calculate Before You Buy Optics", "category": "transceiver", "audience": "engineer"},
    {"topic": "MTP/MPO vs LC vs SC Connectors: Fiber Cabling for High-Density Racks", "category": "transceiver", "audience": "customer"},
    {"topic": "Bend-Insensitive Fiber (BIF): When OM5 Doesn't Cut It for Data Center", "category": "transceiver", "audience": "engineer"},
    {"topic": "Fiber Dispersion: PMD and CD Compensation in 400G+ Links", "category": "transceiver", "audience": "engineer"},
    # ── Data center networking ────────────────────────────────────────────
    {"topic": "Spine-Leaf Architecture: Transceiver Strategy for 400G Data Centers", "category": "datacenter", "audience": "engineer"},
    {"topic": "Co-Packaged Optics (CPO): Why 2026 Is the Inflection Point", "category": "datacenter", "audience": "engineer"},
    {"topic": "Hyperscale vs Enterprise: Different Transceiver Buying Strategies", "category": "datacenter", "audience": "engineer"},
    {"topic": "Data Center Interconnect: Selecting Optics for Your DCI Budget", "category": "datacenter", "audience": "customer"},
    {"topic": "Active vs Passive DAC Cables: When Direct-Attach Beats Transceivers", "category": "datacenter", "audience": "customer"},
    {"topic": "AOC Cables: Active Optical Cable Use Cases in 2026 Data Centers", "category": "datacenter", "audience": "customer"},
    {"topic": "Power Efficiency in Optics: How Watt-per-Bit Changes Your TCO", "category": "datacenter", "audience": "engineer"},
    {"topic": "Silicon Photonics: How Intel and Broadcom Are Reshaping Transceiver Design", "category": "datacenter", "audience": "engineer"},
    {"topic": "AI/ML Infrastructure: Networking Requirements for GPU Clusters", "category": "datacenter", "audience": "engineer"},
    {"topic": "400G Switch Fabric Design: Oversubscription Ratios and Transceiver Placement", "category": "datacenter", "audience": "engineer"},
    # ── Routing and BGP ───────────────────────────────────────────────────
    {"topic": "BGP Route Leaks: Detection, Impact, and Prevention in 2026", "category": "routing", "audience": "engineer"},
    {"topic": "RPKI Route Origin Validation: A Practical Deployment Guide", "category": "routing", "audience": "engineer"},
    {"topic": "BGP Prefix Hijacking: How Attackers Exploit Routing and How to Stop Them", "category": "routing", "audience": "engineer"},
    {"topic": "BGP ASPA: The Next Layer of Route Security After RPKI", "category": "routing", "audience": "engineer"},
    {"topic": "BGP Flowspec: Traffic Engineering and DDoS Mitigation in One Protocol", "category": "routing", "audience": "engineer"},
    {"topic": "BGP Communities: A Practical Operator's Guide to Traffic Steering", "category": "routing", "audience": "engineer"},
    {"topic": "BGP Large Communities RFC 8092: Why Your NOC Needs This Now", "category": "routing", "audience": "engineer"},
    {"topic": "Graceful Restart and LLGR: Keeping BGP Sessions Alive During Maintenance", "category": "routing", "audience": "engineer"},
    {"topic": "BGP Add-Path: Solving the Best-Path Problem in Multi-homed Networks", "category": "routing", "audience": "engineer"},
    {"topic": "Route Reflectors vs Route Servers: iBGP Scaling for Large Networks", "category": "routing", "audience": "engineer"},
    {"topic": "BGPsec: Why RPKI's Successor Is Still Waiting for Deployment", "category": "routing", "audience": "engineer"},
    {"topic": "IS-IS vs OSPF: Choosing an IGP for Your Service Provider Network", "category": "routing", "audience": "engineer"},
    {"topic": "MPLS Traffic Engineering: Still Relevant in the SR-MPLS Era?", "category": "routing", "audience": "engineer"},
    {"topic": "Segment Routing (SR-MPLS and SRv6): Which One for Your Backbone?", "category": "routing", "audience": "engineer"},
    {"topic": "BFD: Bidirectional Forwarding Detection for Fast Failure Recovery", "category": "routing", "audience": "engineer"},
    {"topic": "EVPN: The Definitive Guide to Data Center and WAN BGP Extensions", "category": "routing", "audience": "engineer"},
    {"topic": "QoS in IP Networks: Traffic Marking, Shaping, and Policing Explained", "category": "routing", "audience": "engineer"},
    {"topic": "FlowSpec vs RTBH: Choosing the Right DDoS Mitigation Tool", "category": "routing", "audience": "engineer"},
    {"topic": "BGP Monitoring Protocol (BMP): Real-Time Route Collection for NOCs", "category": "routing", "audience": "engineer"},
    {"topic": "OpenConfig and YANG: Network Automation That Actually Works", "category": "routing", "audience": "engineer"},
    # ── IPv6 ──────────────────────────────────────────────────────────────
    {"topic": "IPv6 Deployment for ISPs: 12 Steps from Planning to Production", "category": "ipv6", "audience": "engineer"},
    {"topic": "IPv4 Exhaustion in 2026: What Service Providers Must Do Now", "category": "ipv6", "audience": "engineer"},
    {"topic": "CGNAT: Why It's a Bad Fix for IPv4 Exhaustion and What to Use Instead", "category": "ipv6", "audience": "engineer"},
    {"topic": "Dual-Stack vs 464XLAT vs NAT64: IPv6 Transition Mechanisms Compared", "category": "ipv6", "audience": "engineer"},
    {"topic": "IPv6 Prefix Delegation: PD Configuration for ISP Customer Networks", "category": "ipv6", "audience": "engineer"},
    {"topic": "IPv6 Security: Attack Vectors That Don't Exist in IPv4", "category": "ipv6", "audience": "engineer"},
    {"topic": "IPv6 Address Planning: How to Structure /32 Allocation for Growth", "category": "ipv6", "audience": "engineer"},
    {"topic": "Mobile IPv6 and 5G: How Carrier Networks Handle Mobility at Scale", "category": "ipv6", "audience": "engineer"},
    {"topic": "World IPv6 Launch: Where Are We 13 Years Later?", "category": "ipv6", "audience": "engineer"},
    {"topic": "IPv6 ROA and RPKI: Securing Your IPv6 Routing from Day One", "category": "ipv6", "audience": "engineer"},
    # ── Internet infrastructure and RIR/APNIC/RIPE ───────────────────────
    {"topic": "APNIC and Asia-Pacific IPv6 Leadership: What the Data Shows", "category": "infrastructure", "audience": "engineer"},
    {"topic": "RIPE NCC Resource Certification: How to Get Your RPKI Right", "category": "infrastructure", "audience": "engineer"},
    {"topic": "Internet Exchange Points: Why IXPs Are Critical Infrastructure", "category": "infrastructure", "audience": "engineer"},
    {"topic": "ARIN vs RIPE vs APNIC: How IP Address Policies Differ by Region", "category": "infrastructure", "audience": "engineer"},
    {"topic": "BGP Looking Glass Tools: How to Debug Routing Problems Remotely", "category": "infrastructure", "audience": "engineer"},
    {"topic": "Peering vs Transit: The Economics of Internet Interconnection", "category": "infrastructure", "audience": "engineer"},
    {"topic": "DE-CIX, AMS-IX, LINX: The IXPs That Move Europe's Internet", "category": "infrastructure", "audience": "engineer"},
    {"topic": "Route Server Best Practices for IXP Operators", "category": "infrastructure", "audience": "engineer"},
    {"topic": "MANRS: Mutually Agreed Norms for Routing Security in 2026", "category": "infrastructure", "audience": "engineer"},
    {"topic": "Internet Shutdowns: Technical Analysis of BGP Withdrawal Patterns", "category": "infrastructure", "audience": "engineer"},
    {"topic": "Submarine Cable Systems: Routing Resilience for Island Networks", "category": "infrastructure", "audience": "engineer"},
    {"topic": "RDAP vs WHOIS: The Modern Way to Query IP and Domain Ownership", "category": "infrastructure", "audience": "engineer"},
    {"topic": "DNS Anycast: How Root Servers Handle 50 Billion Queries Per Day", "category": "infrastructure", "audience": "engineer"},
    {"topic": "DNSSEC: Deployment Status and Why Operators Still Hesitate", "category": "infrastructure", "audience": "engineer"},
    {"topic": "RIPE Atlas: Using Distributed Probes to Measure Internet Reachability", "category": "infrastructure", "audience": "engineer"},
    {"topic": "NTP Security: How BGP Leaks Can Desync Your Infrastructure Clocks", "category": "infrastructure", "audience": "engineer"},
    {"topic": "Internet Routing Registry (IRR): Why It's Messy and What to Do About It", "category": "infrastructure", "audience": "engineer"},
    {"topic": "CDN Architecture: How Akamai and Cloudflare Use BGP for Global Delivery", "category": "infrastructure", "audience": "engineer"},
    {"topic": "Anycast BGP for DDoS Mitigation: A NOC Operator's Guide", "category": "infrastructure", "audience": "engineer"},
    {"topic": "Public Peer vs Private Peer: IXP Negotiation Strategy for ISPs", "category": "infrastructure", "audience": "engineer"},
    # ── Network operations ────────────────────────────────────────────────
    {"topic": "SNMP vs gRPC Telemetry: Modernizing Your NOC Monitoring Stack", "category": "operations", "audience": "engineer"},
    {"topic": "NetFlow vs IPFIX vs sFlow: Choosing Traffic Analytics for Your Network", "category": "operations", "audience": "engineer"},
    {"topic": "Optical Power Budget: How to Diagnose Fiber Link Problems Fast", "category": "operations", "audience": "engineer"},
    {"topic": "OTDR Testing: Reading Loss Traces for Fiber Troubleshooting", "category": "operations", "audience": "engineer"},
    {"topic": "DDOS Mitigation at Scale: BGP Blackhole and Scrubbing Centers", "category": "operations", "audience": "engineer"},
    {"topic": "Network Change Management: Avoiding Outages During Maintenance Windows", "category": "operations", "audience": "engineer"},
    {"topic": "MPLS LDP vs RSVP-TE: Label Distribution Protocol Comparison", "category": "operations", "audience": "engineer"},
    {"topic": "Transceiver DOM Monitoring: What DDM Data Tells You Before Links Fail", "category": "operations", "audience": "engineer"},
    {"topic": "NOC Alert Fatigue: Structuring Alerts to Avoid the Cry-Wolf Effect", "category": "operations", "audience": "engineer"},
    {"topic": "Fiber Cuts: Incident Response Procedures for Backbone Operators", "category": "operations", "audience": "engineer"},
    {"topic": "MTTR vs MTBF: Optical Transceiver Reliability Metrics That Matter", "category": "operations", "audience": "customer"},
    {"topic": "Optics Inventory Management: How to Avoid a Spare-Parts Crisis", "category": "operations", "audience": "customer"},
    {"topic": "Transceiver Firmware Upgrades: Risk Management and Rollback Plans", "category": "operations", "audience": "engineer"},
    {"topic": "Network Automation with Ansible and NAPALM: Practical Getting Started Guide", "category": "operations", "audience": "engineer"},
    {"topic": "gNMI and gNOI: Google's Contribution to Network Operations APIs", "category": "operations", "audience": "engineer"},
    # ── Security ──────────────────────────────────────────────────────────
    {"topic": "BGP Hijack Case Studies: Real Incidents and Their Technical Aftermath", "category": "security", "audience": "engineer"},
    {"topic": "RPKI ROA vs ASPA vs BGPsec: The Routing Security Stack in 2026", "category": "security", "audience": "engineer"},
    {"topic": "DDoS Amplification via DNS and NTP: How It Works and How to Block It", "category": "security", "audience": "engineer"},
    {"topic": "Supply Chain Attacks on Network Hardware: Counterfeit Optics and Beyond", "category": "security", "audience": "engineer"},
    {"topic": "BGP Route Filtering: RPKI-Invalid Drop vs Just-Logging", "category": "security", "audience": "engineer"},
    {"topic": "Network Segmentation: How Optical Transceivers Factor into Zero-Trust", "category": "security", "audience": "engineer"},
    # ── Market and business ───────────────────────────────────────────────
    {"topic": "Transceiver Market 2026: 400G Adoption Rates and What's Driving 800G", "category": "market", "audience": "customer"},
    {"topic": "Photonics Supply Chain: TSMC, II-VI, and the Chip Shortage Aftermath", "category": "market", "audience": "customer"},
    {"topic": "Price Comparison: QSFP-DD 400G ZR from 8 Vendors — Who Wins?", "category": "market", "audience": "customer"},
    {"topic": "Hyperscaler Buying Power: How Meta and AWS Shape the Transceiver Market", "category": "market", "audience": "customer"},
    {"topic": "Transceiver Leasing vs Buying: CapEx vs OpEx Decision Framework", "category": "market", "audience": "customer"},
    {"topic": "Optical Networking M&A: Coherent, II-VI, Lumentum — What the Consolidation Means", "category": "market", "audience": "customer"},
    {"topic": "Open Networking: SONiC Adoption and the Disaggregation Trend in 2026", "category": "market", "audience": "engineer"},
    {"topic": "Whitebox Switching and Merchant Silicon: The Business Case for Operators", "category": "market", "audience": "engineer"},
    {"topic": "Transceiver Pricing Trends: When Does 400G Hit the 10G Price Point?", "category": "market", "audience": "customer"},
    {"topic": "AI Networking Demand: How LLM Training Clusters Are Reshaping Optics Sales", "category": "market", "audience": "customer"},
    # ── Standards and MSAs ────────────────────────────────────────────────
    {"topic": "IEEE 802.3bs and 400GbE: The Standard That Enabled QSFP-DD", "category": "standards", "audience": "engineer"},
    {"topic": "OIF 400ZR Implementation Agreement: What It Means for DCI Deployments", "category": "standards", "audience": "engineer"},
    {"topic": "MSA Compliance: How Multi-Source Agreements Enable Interoperability", "category": "standards", "audience": "customer"},
    {"topic": "SFF-8024: The Transceiver Identifier Standard Explained", "category": "standards", "audience": "engineer"},
    {"topic": "IEEE 802.3cu 100G FR and LR: Simplifying 100G Beyond 10km", "category": "standards", "audience": "engineer"},
    {"topic": "CMIS 5.0: The Management Interface That Unlocks 800G OSFP Features", "category": "standards", "audience": "engineer"},
    {"topic": "800G Standards Landscape: QSFP-DD800, OSFP, and What Comes Next", "category": "standards", "audience": "engineer"},
    {"topic": "CWDM vs DWDM Wavelength Plans: ITU Grid Selection for Metro and Long-Haul", "category": "standards", "audience": "engineer"},
    {"topic": "OpenZR+ vs OIF-400ZR: The 400G Coherent Protocol War", "category": "standards", "audience": "engineer"},
    {"topic": "Photonic Integrated Circuits: SiPh, InP, and the Future of Transceiver Design", "category": "standards", "audience": "engineer"},
    # ── SONiC and open networking ─────────────────────────────────────────
    {"topic": "SONiC Architecture: How Microsoft's Switch OS Works Under the Hood", "category": "opennet", "audience": "engineer"},
    {"topic": "SONiC vs Cumulus vs OpenWrt: Choosing an Open NOS for Your Lab", "category": "opennet", "audience": "engineer"},
    {"topic": "SONiC SAI API: Abstracting Hardware Across ASICs and Vendors", "category": "opennet", "audience": "engineer"},
    {"topic": "Open Compute Project (OCP): How Facebook is Driving Network Disaggregation", "category": "opennet", "audience": "engineer"},
    {"topic": "P4 Programming: The Future of Programmable Data Plane Networking", "category": "opennet", "audience": "engineer"},
    {"topic": "SmartNIC and DPU: Offloading Network Functions from CPUs", "category": "opennet", "audience": "engineer"},
    # ── Emerging topics ───────────────────────────────────────────────────
    {"topic": "Quantum Key Distribution (QKD) over Fiber: Network Integration Challenges", "category": "emerging", "audience": "engineer"},
    {"topic": "Space Optical Communications: LEO Constellation Intersatellite Links", "category": "emerging", "audience": "engineer"},
    {"topic": "400G and Beyond for RAN Fronthaul: O-RAN Transceiver Requirements", "category": "emerging", "audience": "engineer"},
    {"topic": "Edge Computing: Optical Networking Requirements for 5G MEC", "category": "emerging", "audience": "engineer"},
    {"topic": "AI-Driven Network Management: Using LLMs for Optical Layer Analysis", "category": "emerging", "audience": "engineer"},
    {"topic": "Green Networking: Power Consumption Optimization for Optical Infrastructure", "category": "emerging", "audience": "engineer"},
    {"topic": "Liquid Cooling and Photonics: How Thermal Management Changes at 800G", "category": "emerging", "audience": "engineer"},
    {"topic": "Reconfigurable Optical Add-Drop Multiplexers (ROADM): WSS Architecture Guide", "category": "emerging", "audience": "engineer"},
    {"topic": "Optical Time Domain Reflectometry in Automated NOC Workflows", "category": "emerging", "audience": "engineer"},
    {"topic": "Optical Amplifiers: EDFA vs Raman vs SOA — When Each One Applies", "category": "emerging", "audience": "engineer"},
    # ── Regional and service provider ─────────────────────────────────────
    {"topic": "African Internet Infrastructure: Submarine Cables and Terrestrial Fiber Gaps", "category": "regional", "audience": "engineer"},
    {"topic": "APAC Data Center Boom: Transceiver Requirements for Singapore and Tokyo Hubs", "category": "regional", "audience": "customer"},
    {"topic": "European 5G Backbone: Optical Transceiver Demand Through 2028", "category": "regional", "audience": "customer"},
    {"topic": "Latin America ISP Connectivity: Low-Cost 100G Options for Emerging Markets", "category": "regional", "audience": "customer"},
    {"topic": "Middle East Data Center Growth: IXP and Optical Infrastructure Investments", "category": "regional", "audience": "customer"},
    {"topic": "Rural Broadband Access: Optical Technologies for the Last Mile", "category": "regional", "audience": "customer"},
    {"topic": "GÉANT Research Network: How European Academia Runs 100Tbps+ at Scale", "category": "regional", "audience": "engineer"},
    {"topic": "Carrier Ethernet Services: MEF Framework for Wholesale Optical Transport", "category": "regional", "audience": "engineer"},
    # ── How-to guides ─────────────────────────────────────────────────────
    {"topic": "How to Read a Transceiver Datasheet: Key Specs That Actually Matter", "category": "howto", "audience": "customer"},
    {"topic": "How to Choose the Right Transceiver for a 10km Single-Mode Link", "category": "howto", "audience": "customer"},
    {"topic": "How to Configure a 400G ZR Link Between Two Routers", "category": "howto", "audience": "engineer"},
    {"topic": "How to Implement RPKI on Cisco IOS-XR: Step-by-Step Guide", "category": "howto", "audience": "engineer"},
    {"topic": "How to Set Up BGP Communities for Traffic Engineering", "category": "howto", "audience": "engineer"},
    {"topic": "How to Diagnose an Optical Link Failure Using DOM Data", "category": "howto", "audience": "engineer"},
    {"topic": "How to Compare Transceiver Prices Across Vendors Without Getting Scammed", "category": "howto", "audience": "customer"},
    {"topic": "How to Plan Fiber Capacity for a 5-Year Data Center Expansion", "category": "howto", "audience": "customer"},
    {"topic": "How to Implement EVPN VXLAN in a Spine-Leaf Data Center", "category": "howto", "audience": "engineer"},
    {"topic": "How to Write a Network RFP for Optical Transceiver Procurement", "category": "howto", "audience": "customer"},
    {"topic": "How to Migrate from NRZ to PAM4: A Practical Network Engineer's Guide", "category": "howto", "audience": "engineer"},
    {"topic": "How to Calculate Power Budget for a 100G Long-Haul Link", "category": "howto", "audience": "engineer"},
    {"topic": "How to Set Up Streaming Telemetry with gNMI and InfluxDB", "category": "howto", "audience": "engineer"},
    {"topic": "How to Deploy SONiC in a Production Data Center: Lessons from 6 Months", "category": "howto", "audience": "engineer"},
    {"topic": "How to Evaluate Third-Party Transceivers Before Buying 500 Units", "category": "howto", "audience": "customer"},
    # ── Troubleshooting ───────────────────────────────────────────────────
    {"topic": "Why Is My SFP+ Module Showing Rx Power -40dBm? Fiber Fault Diagnosis", "category": "troubleshoot", "audience": "engineer"},
    {"topic": "BGP Session Flapping: 7 Root Causes and How to Debug Each One", "category": "troubleshoot", "audience": "engineer"},
    {"topic": "Transceiver Not Recognized: Vendor Lock-In Detection and Workarounds", "category": "troubleshoot", "audience": "customer"},
    {"topic": "High BER on 100G Link: Signal Integrity Debugging from DOM to OTDR", "category": "troubleshoot", "audience": "engineer"},
    {"topic": "MTU Black Holes: How Jumbo Frame Mismatches Kill Network Performance", "category": "troubleshoot", "audience": "engineer"},
    {"topic": "OSPF Adjacency Issues: Debugging Area Type Mismatches and Hello Timers", "category": "troubleshoot", "audience": "engineer"},
    # ── Comparison and decision guides ────────────────────────────────────
    {"topic": "Cisco ASR 9000 vs Nokia 7750: Backbone Router Optics Ecosystem", "category": "comparison", "audience": "customer"},
    {"topic": "Arista 7800 vs Juniper QFX10000: Data Center Fabric Optics Comparison", "category": "comparison", "audience": "customer"},
    {"topic": "Vendor Lock-In Scorecard: Cisco vs Juniper vs Arista in 2026", "category": "comparison", "audience": "customer"},
    {"topic": "100G LR4 vs ER4: When to Pay for 40km Reach", "category": "comparison", "audience": "customer"},
    {"topic": "NaaS vs DIY: Network-as-a-Service vs Owning Your Own Optical Infrastructure", "category": "comparison", "audience": "customer"},
    {"topic": "Coherent Pluggables vs Fixed-Line Transponders: TCO for Service Providers", "category": "comparison", "audience": "customer"},
    # ── Future and innovation ─────────────────────────────────────────────
    {"topic": "1.6T Optics Timeline: When Will 1.6 Terabit Transceivers Hit Production?", "category": "future", "audience": "engineer"},
    {"topic": "All-Optical Networks: The Dream of Photonic Switching Without O-E-O", "category": "future", "audience": "engineer"},
    {"topic": "Post-Quantum Cryptography in Network Infrastructure: Timing the Transition", "category": "future", "audience": "engineer"},
    {"topic": "AI-Predicted Network Failures: How ML Is Entering Optical Layer Management", "category": "future", "audience": "engineer"},
    {"topic": "Intent-Based Networking: BGP and Optics Policy Automation in 2026", "category": "future", "audience": "engineer"},
    {"topic": "The 10 Year Horizon: How Optical Networking Will Change by 2035", "category": "future", "audience": "customer"},
]

# ─── Prompt construction ──────────────────────────────────────────────────────

# Audience key (as used in TOPICS) → reader description inserted into the prompt.
_AUDIENCE_DESCRIPTIONS = {
    "customer": "IT managers, procurement teams, and operators who evaluate and buy transceivers",
    "engineer": "network engineers and architects who design and operate optical infrastructure",
}


def build_user_prompt(entry: dict) -> str:
    """Build the user-turn prompt for a given topic entry.

    Args:
        entry: A TOPICS-style dict. ``"topic"`` is required. ``"audience"`` is
            optional and defaults to ``"engineer"``; an unrecognised audience
            key falls back to the generic description "network engineers".

    Returns:
        The prompt text: topic, target audience, and a one-line reminder of
        the length/structure constraints.

    Raises:
        KeyError: If ``entry`` has no ``"topic"`` key.
    """
    audience = _AUDIENCE_DESCRIPTIONS.get(entry.get("audience", "engineer"), "network engineers")
    return (
        f"Write a blog post on the following topic:\n\n"
        f'**Topic:** {entry["topic"]}\n\n'
        f"**Target audience:** {audience}\n\n"
        f"Remember: 700–1000 words, hook + technical sections + 3 takeaways. "
        f"Stay strictly on-topic. No filler. Start writing now."
    )


# ─── Claude subprocess ────────────────────────────────────────────────────────

def call_claude(system: str, user_prompt: str, timeout: int = 120) -> str | None:
    """
    Call 'claude --print' via subprocess.
    Uses the Claude Code subscription — no API billing.

    The command is passed as an argument list (no shell), so prompt text is
    never interpreted by a shell.

    Args:
        system: System prompt passed via ``--system-prompt``.
        user_prompt: User-turn prompt passed via ``-p``.
        timeout: Seconds to wait for the subprocess before giving up.

    Returns:
        The generated text with surrounding whitespace stripped, or None on
        any failure (non-zero exit code, empty output, timeout, CLI missing,
        or any other exception). Failures are logged, never raised.
    """
    try:
        result = subprocess.run(
            [
                "claude",
                "--print",
                "--system-prompt", system,
                "-p", user_prompt,
            ],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        if result.returncode != 0:
            # Only the first 200 chars of stderr are logged to keep the log readable.
            logger.warning("claude subprocess error (rc=%d): %s", result.returncode, result.stderr[:200])
            return None
        output = result.stdout.strip()
        if not output:
            logger.warning("claude returned empty output")
            return None
        return output
    except subprocess.TimeoutExpired:
        logger.warning("claude subprocess timed out after %ds", timeout)
        return None
    except FileNotFoundError:
        logger.error("claude CLI not found — install Claude Code")
        return None
    except Exception as exc:
        # Deliberate catch-all: one bad call must not abort a long batch run.
        logger.warning("claude subprocess unexpected error: %s", exc)
        return None


# ─── Quality validation ───────────────────────────────────────────────────────

# Sanity bounds only — deliberately looser than the 700–1000 words the prompt asks for.
MIN_WORDS = 400
MAX_WORDS = 2500

# At least one markdown H2 header ("## Title") must appear at the start of a line.
_H2_HEADER_RE = re.compile(r"^##\s+.+", re.MULTILINE)

# Lower-cased fragments that mark a first line as a meta-comment about the post.
_META_OPENERS = ("i'll write", "here's a", "here is a", "let me write", "blog post:")


def validate_output(text: str, topic: str) -> tuple[bool, str]:
    """
    Basic quality check on generated blog post.

    Checks, in order: minimum word count, maximum word count, presence of at
    least one ``##`` header, and that the first line is not a meta-comment
    ("Here's a ...", "I'll write ...").

    Args:
        text: The generated post.
        topic: The requested topic. Currently unused by the checks; kept so
            the signature stays stable for callers.

    Returns (is_valid, reason). ``reason`` is ``"ok"`` when valid, otherwise a
    short description of the first failed check.
    """
    words = len(text.split())
    if words < MIN_WORDS:
        return False, f"too short: {words} words (min {MIN_WORDS})"
    if words > MAX_WORDS:
        return False, f"too long: {words} words (max {MAX_WORDS} — will be flagged)"

    # Must have some structure
    if not _H2_HEADER_RE.search(text):
        return False, "missing ## section headers"

    # Must start with actual content (not a meta-comment about the post)
    first_line = text.strip().split("\n")[0].lower()
    if any(opener in first_line for opener in _META_OPENERS):
        return False, f"starts with meta-comment: '{first_line[:60]}'"

    return True, "ok"


# ─── Progress tracking ────────────────────────────────────────────────────────

def load_progress() -> set[int]:
    """Load set of already-generated topic indices.

    Returns:
        The indices stored under ``"completed"`` in PROGRESS_FILE, or an empty
        set when the file is missing, unreadable, or malformed. A malformed
        file is logged as a warning rather than silently ignored, because
        restarting from zero appends duplicate records to the output file.
    """
    if not PROGRESS_FILE.exists():
        return set()
    try:
        with open(PROGRESS_FILE, encoding="utf-8") as f:
            data = json.load(f)
        return set(data.get("completed", []))
    except (OSError, ValueError, TypeError, AttributeError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
        # AttributeError/TypeError cover valid JSON of the wrong shape.
        logger.warning("Could not read progress file %s (%s) — starting from scratch", PROGRESS_FILE, exc)
        return set()


def save_progress(completed: set[int]) -> None:
    """Persist the set of completed topic indices to PROGRESS_FILE.

    The data is written to a sibling temp file and then renamed over the real
    file, so an interrupted run never leaves a half-written progress file.

    Args:
        completed: Indices into TOPICS that have been generated and saved.
    """
    payload = {"completed": sorted(completed), "total": len(TOPICS)}
    tmp_path = PROGRESS_FILE.with_name(PROGRESS_FILE.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(payload, f)
    tmp_path.replace(PROGRESS_FILE)


# ─── Main generation loop ─────────────────────────────────────────────────────

def generate(start: int = 0, end: int | None = None, dry_run: bool = False) -> None:
    """Generate blog posts for TOPICS[start:end] and append them to OUTPUT_FILE.

    Topics whose index is already recorded in the progress file are skipped.
    Each generated post is written as one JSON line and the progress file is
    updated straight away, so the run can be interrupted and resumed. Output
    that fails validation is still saved, with ``meta.valid`` set to False.

    Args:
        start: Index of the first topic to process. Negative values count
            from the end of TOPICS, as in a Python slice.
        end: Index to stop at (exclusive); None means "through the last topic".
        dry_run: If True, only print the selected topics; nothing is created
            or written and the claude CLI is not called.
    """
    # Normalise the bounds with slice semantics so the index logged and stored
    # in the progress file always matches the entry's real position in TOPICS
    # (this also makes end=0 mean "nothing" and clamps out-of-range values).
    start, end, _ = slice(start, end).indices(len(TOPICS))
    topics_to_run = TOPICS[start:end]

    if dry_run:
        print(f"DRY RUN: would generate {len(topics_to_run)} topics ({start}–{end})")
        for idx, entry in enumerate(topics_to_run, start=start):
            print(f"  [{idx:03d}] [{entry['category']:12s}] {entry['topic']}")
        return

    # Fail once, up front, instead of failing (and sleeping) once per topic.
    if shutil.which("claude") is None:
        logger.error("claude CLI not found — install Claude Code")
        return

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    completed = load_progress()
    logger.info("Resuming: %d/%d already done", len(completed), len(TOPICS))

    stats = {"generated": 0, "skipped": 0, "failed": 0, "invalid": 0}
    last_index = len(TOPICS) - 1

    with open(OUTPUT_FILE, "a", encoding="utf-8") as out_f:
        for idx, entry in enumerate(topics_to_run, start=start):
            if idx in completed:
                logger.info("[%03d/%03d] SKIP (already done): %s", idx, last_index, entry["topic"])
                stats["skipped"] += 1
                continue

            logger.info("[%03d/%03d] Generating: %s", idx, last_index, entry["topic"])
            user_prompt = build_user_prompt(entry)
            output_text = call_claude(SYSTEM_PROMPT, user_prompt, timeout=180)

            if output_text is None:
                logger.warning("[%03d] FAILED to get output", idx)
                stats["failed"] += 1
                # Brief pause before the next topic. The index is not marked
                # completed, so a later run will try this topic again.
                time.sleep(5)
                continue

            word_count = len(output_text.split())
            is_valid, reason = validate_output(output_text, entry["topic"])
            if is_valid:
                logger.info("[%03d] OK: %d words", idx, word_count)
            else:
                logger.warning("[%03d] INVALID (%s): %s", idx, reason, entry["topic"])
                stats["invalid"] += 1
                # Still save it but log the issue; meta.valid/meta.reason let
                # downstream filtering drop or inspect these records.
                logger.warning("[%03d] Saving anyway: %d words", idx, word_count)

            record = {
                "system_prompt": SYSTEM_PROMPT,
                "input_text": user_prompt,
                "output_text": output_text,
                "meta": {
                    "topic": entry["topic"],
                    "category": entry["category"],
                    "audience": entry["audience"],
                    "word_count": word_count,
                    "valid": is_valid,
                    "reason": reason,
                    "generated_by": "claude-code-subprocess",
                    "model": "claude-sonnet",
                    "dataset_version": "v7",
                },
            }
            out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
            # Flush the record before recording progress, so the progress file
            # never claims an index whose record has not been written out.
            out_f.flush()

            completed.add(idx)
            save_progress(completed)
            stats["generated"] += 1

            # Small pause to avoid overwhelming claude subprocess
            time.sleep(2)

    logger.info(
        "Done! Generated: %d | Skipped: %d | Failed: %d | Invalid: %d",
        stats["generated"], stats["skipped"], stats["failed"], stats["invalid"],
    )
    logger.info("Output: %s", OUTPUT_FILE)


# ─── CLI ──────────────────────────────────────────────────────────────────────

def main() -> None:
    """Parse command-line arguments and run the requested action."""
    parser = argparse.ArgumentParser(description="fo-blog-v7 training data generator")
    parser.add_argument("--start", type=int, default=0, help="Start at topic index")
    parser.add_argument("--end", type=int, default=None, help="Stop at topic index (exclusive)")
    parser.add_argument("--dry-run", action="store_true", help="List topics without generating")
    parser.add_argument("--list-categories", action="store_true", help="Show category distribution")
    args = parser.parse_args()

    if args.list_categories:
        cats = Counter(t["category"] for t in TOPICS)
        print(f"Total topics: {len(TOPICS)}")
        # most_common() orders by descending count, ties in first-seen order.
        for cat, count in cats.most_common():
            print(f"  {cat:15s}: {count:3d}")
        return

    generate(start=args.start, end=args.end, dry_run=args.dry_run)


if __name__ == "__main__":
    main()