llm-gateway/packages/fine-tuner/scripts/ingest_ripe_apnic.py
Rene Fichtmueller 2ca77d0aee feat: Phase 2F — Multi-Agent Integration (ADRs + Client Fallback + Tests)
- ADR-0001: Multi-Agent Coworking Architecture with LLM Gateway Orchestrator
- ADR-0002: Tier Assignment Strategy for Model Selection (cost-first escalation)
- ADR-0003: Confidence Gate Thresholds & Learning Cycle Intervals (6h/12h/24h cycles)
- ADR-0004: External Provider Fallback Chain Ordering (Cerebras → Groq → Mistral)
- Enhanced client SDK: Offline Ollama fallback, health checks, exponential backoff retry
- Integration tests: claude-code-integration.test.ts (14 test cases)
- PHASE_2F_DEPLOYMENT.md: Pre-deployment checklist, automated deploy, rollback plan
- Post-deployment verification procedures for health, client fallback, metrics
2026-04-19 21:39:44 +02:00

468 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
ingest_ripe_apnic.py — Convert raw RIPE/APNIC/NANOG content into v7 training format
Reads raw content JSONL files (from previous collection sessions and Fearghas NAS)
and converts them into proper SFT training examples with the anchored v7 system prompt.
For each raw content item:
1. Uses Claude subprocess to expand the raw summary into a full 700-1000w blog post
2. Saves in {system_prompt, input_text, output_text} format
Input files (auto-discovered):
~/transceiver-training-data/nanog-ripe-labs-content.jsonl — 34 entries
~/transceiver-training-data/rir-infrastructure-data.jsonl — 150 entries
[Fearghas NAS mount]/training-data/*.jsonl — (not auto-discovered: read only when --nas-path is given and the path is reachable)
Output:
~/transceiver-training-data/v7-ripe-apnic-sft.jsonl
Usage:
python3 scripts/ingest_ripe_apnic.py
python3 scripts/ingest_ripe_apnic.py --nas-path /Volumes/Fearghas/training-data
python3 scripts/ingest_ripe_apnic.py --dry-run
"""
from __future__ import annotations
import argparse
import json
import logging
import subprocess
import time
from pathlib import Path
# Root logging setup: timestamped INFO-level lines, e.g. "12:34:56 [INFO] message".
logging.basicConfig(
    datefmt="%H:%M:%S",
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# ─── Paths ────────────────────────────────────────────────────────────────────
# Everything this script reads or writes locally lives under one directory in
# the user's home.
TRAINING_DATA_DIR = Path.home().joinpath("transceiver-training-data")
# SFT records are appended here, one JSON object per line.
OUTPUT_FILE = TRAINING_DATA_DIR.joinpath("v7-ripe-apnic-sft.jsonl")
# Keys of already-handled items, so an interrupted run can resume.
PROGRESS_FILE = TRAINING_DATA_DIR.joinpath("v7-ripe-apnic-progress.json")
# Raw content files that are always read, in this order.
LOCAL_RAW_FILES = [
    TRAINING_DATA_DIR.joinpath(filename)
    for filename in (
        "nanog-ripe-labs-content.jsonl",
        "rir-infrastructure-data.jsonl",
    )
]
# ─── System prompt (same anchored v7 prompt) ──────────────────────────────────
# This text is written verbatim into every generated training record and is
# also passed to the claude CLI as the system prompt.
# The numeric ranges are written with an en dash (700–1000, 2–3, 3–4). Without
# the dash they collapse into "7001000", "23" and "34", which contradicts the
# "Stop at 1000 words maximum" rule on the same line.
# NOTE(review): confirm the dash character matches the canonical v7 prompt used
# by the other dataset builders, so all v7 records share one identical prompt.
SYSTEM_PROMPT = """You are an expert technical writer specializing in optical networking, transceiver technology, and network infrastructure.
STRICT CONSTRAINTS — Follow exactly, no exceptions:
- LENGTH: 700–1000 words. Count carefully. Stop at 1000 words maximum.
- STRUCTURE (mandatory, in this order):
1. HOOK paragraph — 2–3 sentences stating the problem this post addresses
2. Technical sections — 3–4 H2 sections covering the topic in depth
3. PRACTICAL TAKEAWAYS — exactly 3 bullet points, actionable
- TOPIC DISCIPLINE: Write ONLY about the exact topic requested. Zero drift.
- NO REPETITION: Every sentence must add new information. No restating.
- VOICE: Confident, direct. No hedging phrases like "it's worth noting".
- AUDIENCE: Network engineers and IT professionals. Assume technical fluency.
- FORMAT: Markdown. Use ## for section headers. Use **bold** for key terms.
Do not summarize what you are about to write. Start with the hook directly."""
def _as_text(value: object) -> str:
    """Coerce a raw JSON field to a stripped string; ``None`` becomes ``""``."""
    if value is None:
        return ""
    return str(value).strip()


def build_expansion_prompt(raw: dict) -> tuple[str, str] | None:
    """Build the training prompt and the Claude expansion prompt for one raw item.

    Two raw schemas are supported: the category is read from ``"category"``,
    falling back to ``"rir"`` and then to ``"networking"`` when the key is
    absent.

    Args:
        raw: One raw content entry. Reads ``title``, ``content`` and
            ``category``/``rir``. Null or non-string values are tolerated
            (they are coerced to text instead of raising).

    Returns:
        ``(input_text, expansion_prompt)`` where ``input_text`` is the user turn
        stored in the training record and ``expansion_prompt`` is the same text
        plus up to 800 characters of the raw content as background context.
        ``None`` when the title is empty or the content is shorter than 50
        characters after stripping.
    """
    title = _as_text(raw.get("title"))
    content = _as_text(raw.get("content"))
    category = _as_text(raw.get("category", raw.get("rir", "networking"))).lower()

    if not title or len(content) < 50:
        return None

    # Derive the audience from category keywords; routing/RIR keywords win over
    # market keywords when both are present.
    if any(k in category for k in ("ipv6", "bgp", "rpki", "routing", "rir", "dns")):
        audience = "network engineers and NOC operators"
    elif any(k in category for k in ("market", "business", "vendor", "price")):
        audience = "IT decision makers, procurement, and network architects"
    else:
        audience = "network engineers and infrastructure operators"

    header = (
        "Write a blog post on the following topic:\n\n"
        f"**Topic:** {title}\n\n"
        f"**Target audience:** {audience}\n\n"
    )
    # The word range is written with an en dash; without it the instruction
    # reads "7001000 words".
    footer = (
        "Remember: 700–1000 words, hook + technical sections + 3 takeaways. "
        "Stay strictly on-topic. No filler. Start writing now."
    )

    input_text = header + footer
    # Only the expansion prompt carries the raw content, so the stored training
    # input does not depend on it.
    expansion_prompt = (
        header
        + "**Background context (do NOT copy verbatim, use as technical grounding):**\n"
        + f"{content[:800]}\n\n"
        + footer
    )
    return input_text, expansion_prompt
def call_claude(system: str, user_prompt: str, timeout: int = 180) -> str | None:
    """Run the ``claude`` CLI once in print mode and return its trimmed stdout.

    Args:
        system: Text passed via ``--system-prompt``.
        user_prompt: The prompt text handed to the CLI.
        timeout: Seconds to wait for the subprocess before giving up.

    Returns:
        The stripped stdout, or ``None`` when the process exits non-zero,
        prints nothing, times out, or cannot be started. Every failure is
        logged as a warning; nothing is raised to the caller.
    """
    command = ["claude", "--print", "--system-prompt", system, "-p", user_prompt]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        if completed.returncode == 0:
            # An empty answer is reported the same way as a failure.
            return completed.stdout.strip() or None
        # Only the first 200 characters of stderr are logged.
        logger.warning("claude error (rc=%d): %s", completed.returncode, completed.stderr[:200])
        return None
    except subprocess.TimeoutExpired:
        logger.warning("claude timed out")
        return None
    except Exception as exc:
        # Best-effort boundary: e.g. the CLI binary is not installed.
        logger.warning("claude error: %s", exc)
        return None
def load_raw_from_file(path: Path) -> list[dict]:
    """Load every JSON object from a raw JSONL file.

    Blank lines are ignored. Lines that are not valid JSON, or that decode to
    something other than a JSON object, are skipped; the number skipped is
    logged as a warning so lost data is visible.

    Args:
        path: JSONL file to read (UTF-8).

    Returns:
        The decoded objects in file order; an empty list when the file does
        not exist.
    """
    if not path.exists():
        logger.warning("File not found: %s", path)
        return []

    items: list[dict] = []
    rejected = 0
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                rejected += 1
                continue
            # Callers use dict methods on every entry, so a bare list, string
            # or number is as unusable as a malformed line.
            if not isinstance(entry, dict):
                rejected += 1
                continue
            items.append(entry)

    if rejected:
        logger.warning("Skipped %d malformed or non-object lines in %s", rejected, path.name)
    logger.info("Loaded %d entries from %s", len(items), path.name)
    return items
def load_raw_from_dir(nas_path: Path) -> list[dict]:
    """Load raw items from a NAS directory.

    Two layouts are handled, and they are mutually exclusive:

    - tashi-crawler layout: detected by a ``tashi-crawler`` directory, or a
      ``ripe`` or ``nog`` entry, directly under ``nas_path``. The result is
      whatever ``_load_tashi_crawler`` extracts, and JSONL files are NOT
      scanned in this case.
        - tashi-crawler/2026-03-06/ripe/ripe_data.json → RIPE ASN/RIR data
        - tashi-crawler/2026-03-06/nog/nog_all.json → NOG community data
        - tashi-crawler/2026-03-06/bgp/bgp_*.json → BGP hourly snapshots
    - Generic layout: every ``*.jsonl`` file below ``nas_path``, recursively.
        - Records that already have ``system_prompt``/``input_text``/
          ``output_text`` and more than 100 words of output are passed through
          with ``_direct`` set, the v7 system prompt substituted and ``meta``
          tagged.
        - Other records need a ``title`` (or ``name``) and are normalised to
          ``title``/``content`` for later expansion.

    Args:
        nas_path: Root directory to scan.

    Returns:
        Raw and pass-through items, in sorted file order.
    """
    # ── tashi-crawler detection ────────────────────────────────────────────
    # Plain existence checks: walking the whole crawler tree just to learn
    # that it exists is needlessly slow on a network mount.
    if (
        (nas_path / "tashi-crawler").is_dir()
        or (nas_path / "ripe").exists()
        or (nas_path / "nog").exists()
    ):
        return list(_load_tashi_crawler(nas_path))

    # ── Generic JSONL pass-through ─────────────────────────────────────────
    # Sorted so item order is the same on every run; the resume keys built by
    # the caller include the item index.
    jsonl_files = sorted(nas_path.glob("**/*.jsonl"))
    logger.info("Found %d JSONL files on NAS at %s", len(jsonl_files), nas_path)

    items: list[dict] = []
    for fpath in jsonl_files:
        logger.info("Loading NAS file: %s", fpath.name)
        for item in load_raw_from_file(fpath):
            if not isinstance(item, dict):
                continue

            if "system_prompt" in item and "input_text" in item and "output_text" in item:
                output_text = item.get("output_text")
                if isinstance(output_text, str) and len(output_text.split()) > 100:
                    existing_meta = item.get("meta")
                    if not isinstance(existing_meta, dict):
                        existing_meta = {}
                    items.append({
                        "_direct": True,
                        **item,
                        "system_prompt": SYSTEM_PROMPT,
                        "meta": {**existing_meta, "source": "fearghas-nas", "dataset_version": "v7"},
                    })
                    continue
                # Training-format records with too little output fall through
                # and are treated like any other raw record.

            if not item.get("title") and not item.get("name"):
                continue
            if not item.get("title"):
                item["title"] = item.get("name", "")
            if not item.get("content"):
                # First non-empty candidate wins; a present-but-empty "body"
                # must not hide a usable "text" or "summary".
                item["content"] = (
                    item.get("body") or item.get("text") or item.get("summary") or ""
                )
            items.append(item)
    return items
def _latest_crawler_root(base_path: Path) -> Path:
    """Return the directory holding the crawler's ``ripe``/``nog``/``bgp`` data."""
    crawler_dir = base_path / "tashi-crawler"
    if not crawler_dir.exists():
        # The data may sit directly under base_path.
        return base_path
    # Directories only: a stray file must never be picked as the latest run.
    # The last name in sort order is taken as the most recent date directory.
    date_dirs = sorted(p for p in crawler_dir.glob("*") if p.is_dir())
    return date_dirs[-1] if date_dirs else crawler_dir


def _ripe_topics(ripe_file: Path) -> list[dict]:
    """Build ASN and per-RIR topics from ``ripe_data.json`` (best effort)."""
    topics: list[dict] = []
    try:
        with open(ripe_file, encoding="utf-8") as f:
            ripe = json.load(f)

        # One topic per sampled ASN that has both an ASN and a holder. Every
        # such topic shares the same title; only the content differs.
        for asn_data in ripe.get("sample_asns", []):
            overview = asn_data.get("overview", {})
            prefixes = asn_data.get("prefixes", {})
            asn = overview.get("asn", "")
            holder = overview.get("holder", "")
            if not asn or not holder:
                continue
            prefix_count = prefixes.get("prefix_count", 0)
            topics.append({
                "title": "RIPE NCC and the Internet Routing Registry: ASN Allocation and BGP Routing",
                "content": (
                    f"Real routing data from {asn} ({holder}): "
                    f"{prefix_count} announced prefixes. "
                    "RIPE NCC manages IP address allocation and AS number registration "
                    "for Europe, the Middle East, and Central Asia. "
                    "Understanding how ASNs are allocated and how BGP routing works is "
                    "fundamental to network operations."
                ),
                "category": "ripe-rir",
                "source": "fearghas-tashi-ripe",
            })

        # One topic per RIR entry in the delegation summary.
        for rir_name, rir_data in ripe.get("rir_delegated_summary", {}).items():
            if not isinstance(rir_data, dict):
                continue
            topics.append({
                "title": f"{rir_name} Internet Resource Delegation: IPv4, IPv6, and ASN Statistics",
                "content": (
                    f"{rir_name} is one of the five Regional Internet Registries. "
                    "RIR delegation data shows the distribution of IP addresses and AS numbers "
                    "across the registry's service region. "
                    "This data is critical for understanding internet growth, IPv4 exhaustion, "
                    "and the pace of IPv6 adoption in each region."
                ),
                "category": "rir-statistics",
                "source": "fearghas-tashi-ripe",
            })
        logger.info("RIPE: extracted %d topics", len(topics))
    except Exception as exc:
        # Best effort: keep whatever was extracted before the failure.
        logger.warning("RIPE data parse error: %s", exc)
    return topics


def _nog_topics(nog_file: Path) -> list[dict]:
    """Build one community topic per named NOG in ``nog_all.json`` (best effort)."""
    topics: list[dict] = []
    try:
        with open(nog_file, encoding="utf-8") as f:
            nog_all = json.load(f)

        for nog in nog_all.get("results", []):
            nog_name = nog.get("nog", "")
            if not nog_name:
                continue
            region = nog.get("region", "")
            events = nog.get("events", [])
            # With no event list, say "meets periodically" rather than the
            # ungrammatical "meets periodically times per year".
            cadence = (
                f"meets {len(events)} times per year" if events else "meets periodically"
            )
            topics.append({
                "title": f"{nog_name}: The Network Operators Group for {region}",
                "content": (
                    f"{nog_name} is the Network Operators Group for {region}. "
                    "NOGs are professional communities where network engineers share operational "
                    "knowledge, present technical case studies, and discuss emerging challenges. "
                    f"{nog_name} {cadence} "
                    "and features presentations on routing, security, optical networking, and more."
                ),
                "category": "nog-community",
                "source": "fearghas-tashi-nog",
            })
        # Report what was actually extracted, not the number of raw entries.
        logger.info("NOG: extracted %d topics", len(topics))
    except Exception as exc:
        logger.warning("NOG data parse error: %s", exc)
    return topics


def _bgp_topics(bgp_file: Path) -> list[dict]:
    """Build the two BGP topics, using one snapshot file for summary figures."""
    try:
        with open(bgp_file, encoding="utf-8") as f:
            bgp = json.load(f)
        summary = bgp.get("summary", {})
        return [
            {
                "title": "RouteViews and RIPE RIS: How BGP Routing Tables Are Monitored",
                "content": (
                    "RouteViews and RIPE RIS collect BGP routing tables from collectors worldwide. "
                    f"These systems monitor {summary.get('collectors_queried', 'multiple')} collectors "
                    f"with {summary.get('total_update_files_listed', 'thousands of')} update files daily. "
                    "This data is essential for routing security research, anomaly detection, "
                    "and understanding global internet routing behavior."
                ),
                "category": "bgp-monitoring",
                "source": "fearghas-tashi-bgp",
            },
            {
                "title": "BGP Table Size and Internet Growth: What the Data Tells Us",
                "content": (
                    "The global BGP routing table has grown steadily since the early days of the internet. "
                    "Route collectors at RouteViews and RIPE RIS track MRT dumps and UPDATE messages "
                    "in real-time. Understanding BGP table growth is critical for router memory planning, "
                    "route reflector design, and understanding the impact of IPv4 deaggregation."
                ),
                "category": "bgp-routing-table",
                "source": "fearghas-tashi-bgp",
            },
        ]
    except Exception as exc:
        logger.warning("BGP data parse error: %s", exc)
        return []


def _load_tashi_crawler(base_path: Path) -> list[dict]:
    """Extract training topics from tashi-crawler data on Fearghas NAS.

    The crawler data is looked up either in the last (by name) sub-directory
    of ``base_path/tashi-crawler`` or, when that directory does not exist,
    directly under ``base_path``. Three optional sources are read:

    - ``ripe/ripe_data.json``: one topic per sampled ASN, one per RIR
    - ``nog/nog_all.json``: one topic per named NOG
    - ``bgp/bgp_*.json``: two fixed BGP topics, using the first snapshot

    Each source is best effort: a missing file is skipped and a parse error is
    logged as a warning without aborting the other sources.

    Args:
        base_path: NAS directory passed on the command line.

    Returns:
        Raw items with ``title``, ``content``, ``category`` and ``source``
        keys, ready for expansion.
    """
    crawler_root = _latest_crawler_root(base_path)
    logger.info("Loading tashi-crawler data from: %s", crawler_root)
    items: list[dict] = []

    # ── RIPE data → ASN and RIR training topics ───────────────────────────
    ripe_file = crawler_root / "ripe" / "ripe_data.json"
    if ripe_file.exists():
        logger.info("Processing RIPE data: %s", ripe_file)
        items.extend(_ripe_topics(ripe_file))

    # ── NOG data → NOG community training topics ──────────────────────────
    nog_file = crawler_root / "nog" / "nog_all.json"
    if nog_file.exists():
        logger.info("Processing NOG data: %s", nog_file)
        items.extend(_nog_topics(nog_file))

    # ── BGP snapshots → BGP operational topics ────────────────────────────
    bgp_files = sorted(crawler_root.glob("bgp/bgp_*.json"))
    if bgp_files:
        logger.info("Processing %d BGP snapshot files", len(bgp_files))
        # Only the first snapshot is read; it supplies the summary figures.
        items.extend(_bgp_topics(bgp_files[0]))

    logger.info("tashi-crawler total topics extracted: %d", len(items))
    return items
def load_progress() -> set[str]:
    """Return the keys of items already handled by a previous run.

    Returns:
        The ``"completed"`` list of ``PROGRESS_FILE`` as a set. An empty set
        when the file is missing, or when it cannot be read or parsed. The
        latter is logged, because it means every item will be processed again
        and appended to the output a second time.
    """
    if not PROGRESS_FILE.exists():
        return set()
    try:
        with open(PROGRESS_FILE, encoding="utf-8") as f:
            return set(json.load(f).get("completed", []))
    except (OSError, ValueError, AttributeError, TypeError) as exc:
        # ValueError covers invalid JSON and undecodable bytes; AttributeError
        # and TypeError cover a well-formed file with an unexpected shape.
        logger.warning(
            "Progress file %s is unreadable (%s); starting with no completed items",
            PROGRESS_FILE,
            exc,
        )
        return set()
def save_progress(completed: set[str]) -> None:
    """Persist the set of handled item keys to ``PROGRESS_FILE``.

    The data is written to a sibling temporary file and then moved over the
    real file, so an interrupt in the middle of a write cannot leave a
    truncated progress file behind.

    Args:
        completed: Keys of every item handled so far; stored sorted under the
            ``"completed"`` key.
    """
    tmp_path = PROGRESS_FILE.with_name(PROGRESS_FILE.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump({"completed": sorted(completed)}, f)
    tmp_path.replace(PROGRESS_FILE)
def make_key(item: dict, idx: int) -> str:
    """Build the resume key for an item: ``"<idx>:<first 60 chars of title>"``.

    The key depends on the item's position, so it is only stable while the
    input files keep their order and contents.

    Args:
        item: Raw item; only ``title`` is read. A missing or null title counts
            as empty, and a non-string title is converted to text.
        idx: Position of the item in the combined input list.

    Returns:
        The key string.
    """
    title = item.get("title")
    title = "" if title is None else str(title)
    return f"{idx}:{title[:60]}"
def _display_title(item: dict, limit: int, default: str = "") -> str:
    """Return the item's title clipped to ``limit`` chars; null/missing → ``default``."""
    title = item.get("title", default)
    if title is None:
        title = default
    return str(title)[:limit]


def process(nas_path: Path | None = None, dry_run: bool = False) -> None:
    """Convert all raw items into SFT records appended to ``OUTPUT_FILE``.

    Steps:
      1. Load the local raw files and, when ``nas_path`` is given and exists,
         the NAS data.
      2. In dry-run mode, print the first ten items and stop.
      3. Otherwise handle every item whose key is not yet in the progress
         file: pass-through items are written as-is, other items are expanded
         through ``call_claude``. Progress is saved after each handled item,
         so an interrupted run can resume.

    Items that fail to expand are NOT marked as completed and are retried on
    the next run. Items without usable content are marked completed.

    Args:
        nas_path: Optional NAS directory with additional data.
        dry_run: List what would be processed without generating anything.
    """
    TRAINING_DATA_DIR.mkdir(parents=True, exist_ok=True)

    # Collect raw items
    all_raw: list[dict] = []
    for fpath in LOCAL_RAW_FILES:
        all_raw.extend(load_raw_from_file(fpath))
    if nas_path and nas_path.exists():
        logger.info("Loading NAS data from: %s", nas_path)
        all_raw.extend(load_raw_from_dir(nas_path))
    elif nas_path:
        logger.warning("NAS path not accessible: %s — skipping", nas_path)
    logger.info("Total raw items to process: %d", len(all_raw))

    if dry_run:
        print(f"\nDRY RUN: {len(all_raw)} items found")
        for i, item in enumerate(all_raw[:10]):
            print(f" [{i:03d}] {'[DIRECT]' if item.get('_direct') else '[EXPAND]'} {_display_title(item, 70, '?')}")
        if len(all_raw) > 10:
            print(f" ... and {len(all_raw) - 10} more")
        return

    completed = load_progress()
    stats = {"direct": 0, "expanded": 0, "skipped": 0, "failed": 0}

    # Append mode: records from earlier (partial) runs are kept.
    with open(OUTPUT_FILE, "a", encoding="utf-8") as out_f:
        for idx, item in enumerate(all_raw):
            key = make_key(item, idx)
            if key in completed:
                stats["skipped"] += 1
                continue

            # Direct pass-through (already formatted, from NAS)
            if item.get("_direct"):
                record = {k: v for k, v in item.items() if k != "_direct"}
                out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
                out_f.flush()
                completed.add(key)
                save_progress(completed)
                stats["direct"] += 1
                logger.info("[%03d] DIRECT: %s", idx, _display_title(item, 60))
                continue

            # Expand raw content via Claude
            result = build_expansion_prompt(item)
            if result is None:
                logger.info("[%03d] SKIP (no usable content): %s", idx, _display_title(item, 60))
                stats["skipped"] += 1
                # Marked completed so the item is not re-examined every run.
                completed.add(key)
                save_progress(completed)
                continue

            input_text, expansion_prompt = result
            logger.info("[%03d] Expanding: %s", idx, _display_title(item, 60))
            output_text = call_claude(SYSTEM_PROMPT, expansion_prompt)
            if output_text is None:
                logger.warning("[%03d] FAILED: %s", idx, _display_title(item, 60))
                stats["failed"] += 1
                # Longer pause after a failure before trying the next item.
                time.sleep(5)
                continue

            word_count = len(output_text.split())
            logger.info("[%03d] OK: %d words", idx, word_count)
            record = {
                "system_prompt": SYSTEM_PROMPT,
                "input_text": input_text,
                "output_text": output_text,
                "meta": {
                    "title": item.get("title", ""),
                    "source": item.get("source", item.get("rir", "ripe-apnic")),
                    "category": item.get("category", ""),
                    "word_count": word_count,
                    "raw_content_len": len(str(item.get("content") or "")),
                    "generated_by": "claude-code-subprocess",
                    "dataset_version": "v7",
                },
            }
            # Write and flush before recording progress, so a key is only
            # marked completed once its record is in the output file.
            out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
            out_f.flush()
            completed.add(key)
            save_progress(completed)
            stats["expanded"] += 1
            time.sleep(2)

    logger.info("Done: direct=%d expanded=%d skipped=%d failed=%d",
                stats["direct"], stats["expanded"], stats["skipped"], stats["failed"])
    logger.info("Output: %s", OUTPUT_FILE)
def main() -> None:
    """Command-line entry point: parse the flags and run the ingestion.

    Flags:
        --nas-path PATH   optional NAS directory with additional data
        --dry-run         list the discovered items without generating anything
    """
    cli = argparse.ArgumentParser(
        description="Ingest RIPE/APNIC raw content into v7 training format",
    )
    cli.add_argument(
        "--nas-path",
        default=None,
        type=Path,
        help="Path to Fearghas NAS training data directory (e.g. /Volumes/Fearghas/training-data)",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="List items without generating",
    )
    options = cli.parse_args()
    process(dry_run=options.dry_run, nas_path=options.nas_path)


if __name__ == "__main__":
    main()