import { createHash } from "crypto";
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync } from "fs";
import { homedir } from "os";
import { basename, extname, isAbsolute, join, relative, sep } from "path";

/** A training lane; each lane has its own system prompt, source list and output directory. */
type Lane = "tip_llm" | "blog_llm";

/** One chat turn in an SFT example. */
type Message = { role: "system" | "user" | "assistant"; content: string };

/** One SFT example: a system/user/assistant triple plus provenance. */
type Row = { id: string; lane: Lane; source: string; kind: string; messages: Message[] };

/** Per-lane summary written to `<lane>/manifest.json` and embedded in the top-level manifest. */
type LaneManifest = {
  raw_pairs: number;
  duplicates_removed: number;
  training_pairs: number;
  train_pairs: number;
  eval_pairs: number;
  sources: Record<string, number>;
  files: { train: string; eval: string; all: string; manifest: string };
};

/** Top-level manifest written to `training-data/runpod/manifest.json`. */
type Manifest = {
  generated_at: string;
  version: string;
  lanes: Partial<Record<Lane, LaneManifest>>;
};

const repoRoot = process.cwd();
const externalRoot = process.env.TIP_LEARNING_SOURCE_DIR || join(homedir(), "transceiver-training-data");
const blogMegaRoot = process.env.BLOG_LLM_SOURCE_DIR || join(homedir(), "Desktop", "BlogLLM-v5-Mega-Training");
const outRoot = join(repoRoot, "training-data", "runpod");

/** System prompt injected as the first message of every row, keyed by lane. */
const system: Record<Lane, string> = {
  // One array entry per prompt line; joined with "\n" so the runtime string has no stray breaks.
  tip_llm: [
    "You are TIP_LLM — the Transceiver Intelligence Platform's core research, data-engineering, and market-intelligence model.",
    "",
    "Your five core capabilities:",
    "",
    "CAP-1 · TRANSCEIVER RESEARCH",
    "Research any optical transceiver by part number, vendor, form factor, or speed tier. Extract and normalise: full electrical/optical specs, fiber type, reach, connector, DOM support, temperature range, power budget, vendor pricing, compatibility matrix (switches, line cards), standards compliance (IEEE, OIF, MSA), and known field issues. Output structured JSON or normalised text. Never invent specs — flag unknowns explicitly.",
    "",
    "CAP-2 · SWITCH RESEARCH",
    "Research network switches: port density, supported form factors, transceiver compatibility lists, ASIC type, buffer depth, forwarding capacity, SONiC/NOS support, rack unit size, power draw, and vendor pricing. Cross-reference transceivers → switches and vice versa. Identify supported QSFP-DD, OSFP, SFP28 variants per slot. Flag MACsec, FEC, and breakout constraints.",
    "",
    "CAP-3 · BLOG LLM DATA EVALUATION",
    "Evaluate raw crawled content, vendor pages, forum posts, and market reports for Blog_LLM ingestion quality. Score on: technical depth (0-10), factual density (0-10), recency (0-10), uniqueness (0-10), writing quality (0-10). Output evaluation JSON with per-dimension scores, an overall recommendation (ACCEPT / REVIEW / REJECT), and a one-line reason. Extract blog-worthy angles and key claims for reuse.",
    "",
    "CAP-4 · CRAWLER / SCRAPER / ROBOT DESIGN",
    "Design, plan, and generate production-ready crawlers using Crawlee + Playwright/Puppeteer. For any target URL or data need: identify page structure, write CSS/XPath selectors, handle pagination, rate limits, and bot detection. Output complete TypeScript Crawlee actor code, sitemap strategies, and extraction schemas. Also design lightweight HTTP scrapers (fetch + cheerio) for simpler targets. Flag legal/ToS considerations.",
    "",
    "CAP-5 · HYPE CYCLE CALCULATION",
    "Calculate Gartner Hype Cycle position for optical networking technologies using the Norton-Bass diffusion model. Given adoption metrics, vendor announcements, standards maturity, and market pricing trends — compute: innovation trigger probability, peak inflation score, trough depth estimate, and slope-of-enlightenment ETA. Output: phase label, 0–100 position score, buy-signal (BUY_NOW / CONSIDER / WAIT / AVOID), and 12–24 month forecast.",
  ].join("\n"),
  blog_llm:
    "You are Blog_LLM, the specialized Flexoptix/TIP founder-content and technical blog model. Write opinionated, practical, technically credible articles for network engineers and optical infrastructure buyers. Keep the tone human, specific and useful. Avoid generic AI filler, LaTeX in prose, and datasheet dumps.",
};

/** JSONL files to ingest per lane; each name is looked up under every source root. */
const files: Record<Lane, string[]> = {
  tip_llm: [
    "master-training-dataset.jsonl",
    "technical-deep-dives.jsonl",
    "vendor-deep-dives.jsonl",
    "rir-infrastructure-data.jsonl",
    "nanog-ripe-labs-content.jsonl",
    "academic-research-synthesis.jsonl",
    "synthesized-training-samples.jsonl",
    "market-business-analysis-part1.jsonl",
    "market-business-analysis-part2.jsonl",
    "market-business-analysis-part3.jsonl",
    "market-business-analysis-part4.jsonl",
    "market-business-analysis-part5.jsonl",
    "market-business-analysis-part6.jsonl",
    "training-data/tip-llm-capabilities-v1.jsonl",
  ],
  blog_llm: [
    "master-training-dataset.jsonl",
    "technical-deep-dives.jsonl",
    "vendor-deep-dives.jsonl",
    "v8-real-posts-sft.jsonl",
    "v7-ripe-apnic-sft.jsonl",
    "v8-v6blogs-sft.jsonl",
    "v8-external-sft.jsonl",
    "blog-fichtmueller-posts.jsonl",
    "mega-training-dataset.jsonl",
  ],
};

/**
 * Normalises arbitrary input to trimmed text: nullish becomes "", CRLF becomes LF,
 * and runs of four or more newlines collapse to three.
 */
function clean(v: unknown): string {
  return String(v ?? "")
    .replace(/\r\n/g, "\n")
    .replace(/\n{4,}/g, "\n\n\n")
    .trim();
}

/** Returns the first 24 hex characters of the SHA-256 of `parts` joined by a fixed separator. */
function id(parts: string[]): string {
  return createHash("sha256").update(parts.join("\n---\n")).digest("hex").slice(0, 24);
}

/**
 * Returns `path` relative to `root` when `path` lies inside `root`, otherwise `undefined`.
 * Only a real ".." segment (or an absolute result, e.g. another drive) counts as outside,
 * so names that merely begin with two dots, such as "..data", are still inside.
 */
function relativeWithin(root: string, path: string): string | undefined {
  const rel = relative(root, path);
  const escapes = rel === ".." || rel.startsWith(`..${sep}`) || isAbsolute(rel);
  return escapes ? undefined : rel;
}

/**
 * Builds the provenance label stored on each row: the repo-relative path, or an
 * `external:` / `blog-mega:` prefixed path for the other roots, or the bare file name
 * when the path is under none of them.
 */
function sourceLabel(path: string): string {
  const repoRel = relativeWithin(repoRoot, path);
  if (repoRel !== undefined) return repoRel;
  const extRel = relativeWithin(externalRoot, path);
  if (extRel !== undefined) return `external:${extRel}`;
  const blogRel = relativeWithin(blogMegaRoot, path);
  if (blogRel !== undefined) return `blog-mega:${blogRel}`;
  return basename(path);
}

/**
 * Appends one system/user/assistant row to `rows`.
 * Pairs whose cleaned prompt is under 20 characters or whose cleaned answer is under
 * 40 characters are dropped without being added.
 */
function add(rows: Row[], lane: Lane, source: string, kind: string, prompt: unknown, answer: unknown): void {
  const user = clean(prompt);
  const assistant = clean(answer);
  if (user.length < 20 || assistant.length < 40) return;
  rows.push({
    id: id([lane, user, assistant]),
    lane,
    source,
    kind,
    messages: [
      { role: "system", content: system[lane] },
      { role: "user", content: user },
      { role: "assistant", content: assistant },
    ],
  });
}

/**
 * Reads a JSONL file into rows for `lane`. Returns [] when the file does not exist.
 * Lines with a `messages` array contribute the first user turn and the last assistant
 * turn; other lines are mapped from the first truthy prompt-like and answer-like field.
 */
function jsonl(path: string, lane: Lane): Row[] {
  if (!existsSync(path)) return [];
  const rows: Row[] = [];
  const source = sourceLabel(path);
  for (const line of readFileSync(path, "utf8").split(/\n+/).filter(Boolean)) {
    try {
      const parsed: unknown = JSON.parse(line);
      // Non-object JSON values (numbers, strings, null) carry no usable fields.
      if (typeof parsed !== "object" || parsed === null) continue;
      const obj = parsed as Record<string, unknown>;
      const messages = obj.messages;
      if (Array.isArray(messages)) {
        const turns: Message[] = messages;
        const user = turns.find((m) => m.role === "user")?.content;
        const assistant = [...turns].reverse().find((m) => m.role === "assistant")?.content;
        add(rows, lane, source, "chat-jsonl", user, assistant);
        continue;
      }
      // `||` rather than `??` on purpose: an empty-string field must fall through to the next candidate.
      add(
        rows,
        lane,
        source,
        "sft-jsonl",
        obj.prompt || obj.instruction || obj.question || obj.input || obj.title || obj.topic,
        obj.completion || obj.output || obj.answer || obj.response || obj.article || obj.content || obj.text || obj.summary,
      );
    } catch {
      // Ignore malformed legacy rows; usable row counts are tracked in the manifest.
    }
  }
  return rows;
}

/**
 * Turns one markdown article into a single blog_llm row, or [] when the trimmed file is
 * shorter than 400 characters. The title is the first `# ` heading, falling back to the
 * file name with dashes and underscores replaced by spaces.
 */
function markdownBlog(path: string): Row[] {
  const article = readFileSync(path, "utf8").trim();
  if (article.length < 400) return [];
  const title = article.match(/^#\s+(.+)$/m)?.[1]?.trim() || basename(path, extname(path)).replace(/[-_]/g, " ");
  return [
    {
      id: id(["blog_llm", title, article]),
      lane: "blog_llm",
      source: sourceLabel(path),
      kind: "markdown-blog",
      messages: [
        { role: "system", content: system.blog_llm },
        {
          role: "user",
          content: `Write a TIP/Flexoptix technical blog article with this angle: ${title}\n\nUse a practical network-engineer voice, include concrete deployment context, and avoid generic marketing language.`,
        },
        { role: "assistant", content: article },
      ],
    },
  ];
}

/**
 * Gathers every raw row for `lane`: each configured JSONL file under each source root,
 * plus, for blog_llm, every `.md` file except readme.md in the two blog directories.
 */
function collect(lane: Lane): Row[] {
  const rows: Row[] = [];
  for (const file of files[lane]) {
    for (const root of [externalRoot, blogMegaRoot, repoRoot]) rows.push(...jsonl(join(root, file), lane));
  }
  if (lane === "blog_llm") {
    for (const dir of [join(repoRoot, "blog-training-data"), join(externalRoot, "v6-tip-blogs")]) {
      if (!existsSync(dir)) continue;
      for (const file of readdirSync(dir)) {
        if (file.endsWith(".md") && file.toLowerCase() !== "readme.md") rows.push(...markdownBlog(join(dir, file)));
      }
    }
  }
  return rows;
}

/**
 * Removes rows whose lane + user + assistant content hash to the same key, keeping the
 * first occurrence. Surviving rows get that key as their id and are returned sorted by id.
 */
function dedupe(rows: Row[]): { rows: Row[]; duplicates: number } {
  const seen = new Set<string>();
  const unique: Row[] = [];
  let duplicates = 0;
  for (const row of rows) {
    const key = id([row.lane, row.messages[1]?.content || "", row.messages[2]?.content || ""]);
    if (seen.has(key)) {
      duplicates++;
    } else {
      seen.add(key);
      unique.push({ ...row, id: key });
    }
  }
  // Ids are lowercase hex, so plain code-unit order equals localeCompare order here
  // without depending on the host locale; the train/eval split relies on this order.
  unique.sort((a, b) => (a.id < b.id ? -1 : a.id > b.id ? 1 : 0));
  return { rows: unique, duplicates };
}

/** Counts rows per `source` label in a single pass. */
function countSources(rows: Row[]): Record<string, number> {
  const counts: Record<string, number> = {};
  for (const row of rows) counts[row.source] = (counts[row.source] ?? 0) + 1;
  return counts;
}

/**
 * Serialises rows as JSONL, one newline-terminated object per row, omitting `lane`.
 * An empty list yields an empty string so an empty lane does not produce a blank line.
 */
function serializeRows(rows: Row[]): string {
  return rows
    .map((r) => JSON.stringify({ id: r.id, source: r.source, kind: r.kind, messages: r.messages }) + "\n")
    .join("");
}

/**
 * Collects, de-duplicates and writes one lane: the all/train/eval JSONL files and the
 * lane manifest under `training-data/runpod/<lane>/`, and records the same summary in
 * `manifest.lanes[lane]`. Every `evalEvery`-th row (by sorted position) goes to eval.
 */
function writeLane(lane: Lane, manifest: Manifest): void {
  const raw = collect(lane);
  const { rows, duplicates } = dedupe(raw);

  // Hold out 1 in 10 rows once there are at least 20 rows, otherwise 1 in 5.
  const evalEvery = rows.length >= 20 ? 10 : 5;
  const evalRows: Row[] = [];
  const trainRows: Row[] = [];
  rows.forEach((row, i) => {
    (i % evalEvery === 0 ? evalRows : trainRows).push(row);
  });

  const laneDir = join(outRoot, lane);
  mkdirSync(laneDir, { recursive: true });
  writeFileSync(join(laneDir, `${lane}-sft-all.jsonl`), serializeRows(rows));
  writeFileSync(join(laneDir, `${lane}-sft-train.jsonl`), serializeRows(trainRows));
  writeFileSync(join(laneDir, `${lane}-sft-eval.jsonl`), serializeRows(evalRows));

  const summary: LaneManifest = {
    raw_pairs: raw.length,
    duplicates_removed: duplicates,
    training_pairs: rows.length,
    train_pairs: trainRows.length,
    eval_pairs: evalRows.length,
    sources: countSources(rows),
    files: {
      train: `training-data/runpod/${lane}/${lane}-sft-train.jsonl`,
      eval: `training-data/runpod/${lane}/${lane}-sft-eval.jsonl`,
      all: `training-data/runpod/${lane}/${lane}-sft-all.jsonl`,
      manifest: `training-data/runpod/${lane}/manifest.json`,
    },
  };
  manifest.lanes[lane] = summary;
  writeFileSync(join(laneDir, "manifest.json"), JSON.stringify(summary, null, 2) + "\n");
}

// Script entry point: build both lanes, then write and print the combined manifest.
const manifest: Manifest = {
  generated_at: new Date().toISOString(),
  version: "TIP-LearningPool-v1",
  lanes: {},
};
writeLane("tip_llm", manifest);
writeLane("blog_llm", manifest);
mkdirSync(outRoot, { recursive: true });
writeFileSync(join(outRoot, "manifest.json"), JSON.stringify(manifest, null, 2) + "\n");
console.log(JSON.stringify(manifest, null, 2));