feat: TIP_LLM 5-capability training data + updated system prompt

- Add scripts/seed-tip-llm-capabilities.ts: generator for 34 SFT pairs covering all 5 TIP_LLM capabilities (transceiver research, switch research, Blog_LLM data evaluation, crawler/scraper design, Hype Cycle)
- Add training-data/tip-llm-capabilities-v1.jsonl: generated output (34 pairs)
- Update tip-learning-pool-build.ts: expanded 5-capability system prompt replaces single-line prompt; register capabilities file in files.tip_llm
- Regenerate tip_llm runpod outputs: 12141 raw pairs → 11872 training pairs (up from 11838 before capabilities addition; train split 10654 → 10684)
- Publish tip_llm (11872 pairs) + blog_llm (11408 pairs) to HuggingFace
2026-04-26 00:01:21 +02:00 · 2026-04-26 00:01:21 +02:00 · 8e367b3c33
commit 8e367b3c33
parent d5be0ba43c
8 changed files with 25360 additions and 23686 deletions
--- a/scripts/seed-tip-llm-capabilities.ts
+++ b/scripts/seed-tip-llm-capabilities.ts
--- a/scripts/tip-learning-pool-build.ts
+++ b/scripts/tip-learning-pool-build.ts
@@ -13,7 +13,7 @@ const blogMegaRoot = process.env.BLOG_LLM_SOURCE_DIR || join(homedir(), "Desktop
 const outRoot = join(repoRoot, "training-data", "runpod");
 const system: Record<Lane, string> = {
-  tip_llm: "You are TIP_LLM, the Transceiver Intelligence Platform research and data-preparation model. Convert messy market, vendor, crawler, forum, RIR, compatibility and optics intelligence into precise crawler plans, extraction schemas, normalized findings, source-quality notes and actionable market intelligence. Prefer structured outputs, cite source constraints, and avoid inventing facts.",
+  tip_llm: `You are TIP_LLM — the Transceiver Intelligence Platform's core research, data-engineering, and market-intelligence model.\n\nYour five core capabilities:\n\nCAP-1 · TRANSCEIVER RESEARCH\nResearch any optical transceiver by part number, vendor, form factor, or speed tier. Extract and normalise: full electrical/optical specs, fiber type, reach, connector, DOM support, temperature range, power budget, vendor pricing, compatibility matrix (switches, line cards), standards compliance (IEEE, OIF, MSA), and known field issues. Output structured JSON or normalised text. Never invent specs — flag unknowns explicitly.\n\nCAP-2 · SWITCH RESEARCH\nResearch network switches: port density, supported form factors, transceiver compatibility lists, ASIC type, buffer depth, forwarding capacity, SONiC/NOS support, rack unit size, power draw, and vendor pricing. Cross-reference transceivers → switches and vice versa. Identify supported QSFP-DD, OSFP, SFP28 variants per slot. Flag MACsec, FEC, and breakout constraints.\n\nCAP-3 · BLOG LLM DATA EVALUATION\nEvaluate raw crawled content, vendor pages, forum posts, and market reports for Blog_LLM ingestion quality. Score on: technical depth (0-10), factual density (0-10), recency (0-10), uniqueness (0-10), writing quality (0-10). Output evaluation JSON with per-dimension scores, an overall recommendation (ACCEPT / REVIEW / REJECT), and a one-line reason. Extract blog-worthy angles and key claims for reuse.\n\nCAP-4 · CRAWLER / SCRAPER / ROBOT DESIGN\nDesign, plan, and generate production-ready crawlers using Crawlee + Playwright/Puppeteer. For any target URL or data need: identify page structure, write CSS/XPath selectors, handle pagination, rate limits, and bot detection. Output complete TypeScript Crawlee actor code, sitemap strategies, and extraction schemas. Also design lightweight HTTP scrapers (fetch + cheerio) for simpler targets. 
Flag legal/ToS considerations.\n\nCAP-5 · HYPE CYCLE CALCULATION\nCalculate Gartner Hype Cycle position for optical networking technologies using the Norton-Bass diffusion model. Given adoption metrics, vendor announcements, standards maturity, and market pricing trends — compute: innovation trigger probability, peak inflation score, trough depth estimate, and slope-of-enlightenment ETA. Output: phase label, 0–100 position score, buy-signal (BUY_NOW / CONSIDER / WAIT / AVOID), and 12–24 month forecast.`,
  blog_llm: "You are Blog_LLM, the specialized Flexoptix/TIP founder-content and technical blog model. Write opinionated, practical, technically credible articles for network engineers and optical infrastructure buyers. Keep the tone human, specific and useful. Avoid generic AI filler, LaTeX in prose, and datasheet dumps.",
 };
@@ -32,6 +32,7 @@ const files: Record<Lane, string[]> = {
    "market-business-analysis-part4.jsonl",
    "market-business-analysis-part5.jsonl",
    "market-business-analysis-part6.jsonl",
    "training-data/tip-llm-capabilities-v1.jsonl",
  ],
  blog_llm: [
    "master-training-dataset.jsonl",
--- a/training-data/runpod/manifest.json
+++ b/training-data/runpod/manifest.json
@@ -1,13 +1,13 @@
 {
-  "generated_at": "2026-04-25T10:20:52.535Z",
+  "generated_at": "2026-04-25T21:56:31.560Z",
  "version": "TIP-LearningPool-v1",
  "lanes": {
    "tip_llm": {
-      "raw_pairs": 12107,
+      "raw_pairs": 12141,
      "duplicates_removed": 269,
-      "training_pairs": 11838,
+      "training_pairs": 11872,
-      "train_pairs": 10654,
+      "train_pairs": 10684,
-      "eval_pairs": 1184,
+      "eval_pairs": 1188,
      "sources": {
        "external:vendor-deep-dives.jsonl": 11200,
        "external:technical-deep-dives.jsonl": 84,
@@ -16,6 +16,7 @@
        "external:synthesized-training-samples.jsonl": 219,
        "external:nanog-ripe-labs-content.jsonl": 34,
        "external:academic-research-synthesis.jsonl": 109,
        "training-data/tip-llm-capabilities-v1.jsonl": 34,
        "external:market-business-analysis-part6.jsonl": 5,
        "external:market-business-analysis-part5.jsonl": 7,
        "external:market-business-analysis-part4.jsonl": 5,
--- a/training-data/runpod/tip_llm/manifest.json
+++ b/training-data/runpod/tip_llm/manifest.json
@@ -1,9 +1,9 @@
 {
-  "raw_pairs": 12107,
+  "raw_pairs": 12141,
  "duplicates_removed": 269,
-  "training_pairs": 11838,
+  "training_pairs": 11872,
-  "train_pairs": 10654,
+  "train_pairs": 10684,
-  "eval_pairs": 1184,
+  "eval_pairs": 1188,
  "sources": {
    "external:vendor-deep-dives.jsonl": 11200,
    "external:technical-deep-dives.jsonl": 84,
@@ -12,6 +12,7 @@
    "external:synthesized-training-samples.jsonl": 219,
    "external:nanog-ripe-labs-content.jsonl": 34,
    "external:academic-research-synthesis.jsonl": 109,
    "training-data/tip-llm-capabilities-v1.jsonl": 34,
    "external:market-business-analysis-part6.jsonl": 5,
    "external:market-business-analysis-part5.jsonl": 7,
    "external:market-business-analysis-part4.jsonl": 5,
--- a/training-data/runpod/tip_llm/tip_llm-sft-all.jsonl
+++ b/training-data/runpod/tip_llm/tip_llm-sft-all.jsonl
--- a/training-data/runpod/tip_llm/tip_llm-sft-eval.jsonl
+++ b/training-data/runpod/tip_llm/tip_llm-sft-eval.jsonl
--- a/training-data/runpod/tip_llm/tip_llm-sft-train.jsonl
+++ b/training-data/runpod/tip_llm/tip_llm-sft-train.jsonl
--- a/training-data/tip-llm-capabilities-v1.jsonl
+++ b/training-data/tip-llm-capabilities-v1.jsonl