transceiver-db/scripts/tip-learning-pool-build.ts
2026-04-25 12:21:56 +02:00

197 lines
7.9 KiB
TypeScript

import { createHash } from "crypto";
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync } from "fs";
import { homedir } from "os";
import { basename, extname, isAbsolute, join, relative, sep } from "path";
/** Name of an output dataset lane; used as the key for the system prompts and source-file lists and as the output sub-directory name. */
type Lane = "tip_llm" | "blog_llm";
/** A single chat turn in the emitted training rows. */
type Message = { role: "system" | "user" | "assistant"; content: string };
/**
 * One training example.
 * `id` is a truncated SHA-256 hex digest, `source` is the label of the file the row came from,
 * `kind` records which loader produced it ("chat-jsonl", "sft-jsonl" or "markdown-blog"),
 * and `messages` is the system / user / assistant triple written to disk.
 */
type Row = { id: string; lane: Lane; source: string; kind: string; messages: Message[] };
// Repository root is the current working directory, so every repo-relative path below
// resolves against wherever the script is launched from.
const repoRoot = process.cwd();
// External training-data directory: TIP_LEARNING_SOURCE_DIR if set (and non-empty),
// otherwise ~/transceiver-training-data.
const externalRoot = process.env.TIP_LEARNING_SOURCE_DIR || join(homedir(), "transceiver-training-data");
// Blog "mega" training directory: BLOG_LLM_SOURCE_DIR if set (and non-empty),
// otherwise ~/Desktop/BlogLLM-v5-Mega-Training.
const blogMegaRoot = process.env.BLOG_LLM_SOURCE_DIR || join(homedir(), "Desktop", "BlogLLM-v5-Mega-Training");
// All generated files are written below <repoRoot>/training-data/runpod.
const outRoot = join(repoRoot, "training-data", "runpod");
// System prompt per lane. The matching prompt is inserted as the first message of every
// row built for that lane, so editing these strings changes every emitted training example.
const system: Record<Lane, string> = {
tip_llm: "You are TIP_LLM, the Transceiver Intelligence Platform research and data-preparation model. Convert messy market, vendor, crawler, forum, RIR, compatibility and optics intelligence into precise crawler plans, extraction schemas, normalized findings, source-quality notes and actionable market intelligence. Prefer structured outputs, cite source constraints, and avoid inventing facts.",
blog_llm: "You are Blog_LLM, the specialized Flexoptix/TIP founder-content and technical blog model. Write opinionated, practical, technically credible articles for network engineers and optical infrastructure buyers. Keep the tone human, specific and useful. Avoid generic AI filler, LaTeX in prose, and datasheet dumps.",
};
// JSONL file names to load for each lane. Each name is looked up under every source root
// (external, blog-mega and repo root); files that do not exist are skipped by the loader.
// The first three names are shared by both lanes.
const files: Record<Lane, string[]> = {
tip_llm: [
"master-training-dataset.jsonl",
"technical-deep-dives.jsonl",
"vendor-deep-dives.jsonl",
"rir-infrastructure-data.jsonl",
"nanog-ripe-labs-content.jsonl",
"academic-research-synthesis.jsonl",
"synthesized-training-samples.jsonl",
"market-business-analysis-part1.jsonl",
"market-business-analysis-part2.jsonl",
"market-business-analysis-part3.jsonl",
"market-business-analysis-part4.jsonl",
"market-business-analysis-part5.jsonl",
"market-business-analysis-part6.jsonl",
],
blog_llm: [
"master-training-dataset.jsonl",
"technical-deep-dives.jsonl",
"vendor-deep-dives.jsonl",
"v8-real-posts-sft.jsonl",
"v7-ripe-apnic-sft.jsonl",
"v8-v6blogs-sft.jsonl",
"v8-external-sft.jsonl",
"blog-fichtmueller-posts.jsonl",
"mega-training-dataset.jsonl",
],
};
/**
 * Normalizes a raw prompt/answer value into trimmed text.
 *
 * Strings are used as-is; numbers, booleans and bigints are stringified. Every other
 * value (null, undefined, objects, arrays, functions, symbols) yields "" instead of
 * being coerced with `String()`, which would produce text such as "[object Object]"
 * and let structured values masquerade as real prompt/answer content.
 *
 * The text then has CRLF converted to LF, runs of four or more newlines collapsed to
 * exactly three, and surrounding whitespace trimmed.
 *
 * @param v Raw value taken from a parsed record.
 * @returns The normalized text, or "" when `v` carries no usable text.
 */
function clean(v: unknown): string {
  let text: string;
  switch (typeof v) {
    case "string":
      text = v;
      break;
    case "number":
    case "boolean":
    case "bigint":
      text = String(v);
      break;
    default:
      // Objects and arrays have no meaningful plain-text form here; treat them as missing.
      text = "";
  }
  return text.replace(/\r\n/g, "\n").replace(/\n{4,}/g, "\n\n\n").trim();
}
/**
 * Derives a short, stable identifier from a list of text parts.
 *
 * The parts are joined with a "\n---\n" separator, hashed with SHA-256, and the first
 * 24 hexadecimal characters of the digest are returned.
 *
 * @param parts Text fragments that together identify a row.
 * @returns A 24-character lowercase hex string.
 */
function id(parts: string[]): string {
  const hasher = createHash("sha256");
  hasher.update(parts.join("\n---\n"));
  const hex = hasher.digest("hex");
  return hex.slice(0, 24);
}
/**
 * Builds the human-readable source label stored on each row.
 *
 * The path is tested against the known roots in order:
 *  - inside the repository      -> the repo-relative path
 *  - inside the external root   -> "external:<relative path>"
 *  - inside the blog-mega root  -> "blog-mega:<relative path>"
 *  - anywhere else              -> just the file name
 *
 * @param path Path of the file a row was loaded from.
 * @returns The label described above.
 */
function sourceLabel(path: string): string {
  // A path lies inside `root` when the relative path neither climbs out through a leading
  // ".." segment nor is absolute (path.relative returns an absolute path on Windows when
  // the two paths are on different drives). Testing for a whole ".." segment rather than
  // a ".." prefix keeps entries such as "..cache/file" classified as inside the root.
  const within = (root: string): string | undefined => {
    const rel = relative(root, path);
    const escapes = rel === ".." || rel.startsWith(`..${sep}`) || isAbsolute(rel);
    return escapes ? undefined : rel;
  };

  const repoRel = within(repoRoot);
  if (repoRel !== undefined) return repoRel;

  const extRel = within(externalRoot);
  if (extRel !== undefined) return `external:${extRel}`;

  const blogRel = within(blogMegaRoot);
  if (blogRel !== undefined) return `blog-mega:${blogRel}`;

  return basename(path);
}
/**
 * Appends one system/user/assistant training row to `rows`, unless the pair is too short.
 *
 * Both texts are normalized with `clean`. The pair is skipped when the prompt is shorter
 * than 20 characters or the answer is shorter than 40 characters. The row id is derived
 * from the lane and the two normalized texts.
 *
 * @param rows   Accumulator that receives the new row (mutated in place).
 * @param lane   Lane the row belongs to; also selects the system prompt.
 * @param source Label of the file the pair came from.
 * @param kind   Loader tag stored on the row.
 * @param prompt Raw user-side value.
 * @param answer Raw assistant-side value.
 */
function add(rows: Row[], lane: Lane, source: string, kind: string, prompt: unknown, answer: unknown) {
  const question = clean(prompt);
  const reply = clean(answer);

  const longEnough = question.length >= 20 && reply.length >= 40;
  if (!longEnough) return;

  const messages: Message[] = [
    { role: "system", content: system[lane] },
    { role: "user", content: question },
    { role: "assistant", content: reply },
  ];
  rows.push({ id: id([lane, question, reply]), lane, source, kind, messages });
}
/**
 * Loads prompt/answer pairs from a JSONL file into training rows for `lane`.
 *
 * Two record shapes are understood:
 *  - chat records with a `messages` array: the first "user" message and the last
 *    "assistant" message are used (kind "chat-jsonl");
 *  - flat records: the first truthy field among prompt / instruction / question / input /
 *    title / topic is the prompt, and the first truthy field among completion / output /
 *    answer / response / article / content / text / summary is the answer (kind "sft-jsonl").
 *
 * Lines that fail to parse, or whose shape makes processing throw, are skipped.
 *
 * @param path Path of the JSONL file; a missing file yields an empty result.
 * @param lane Lane the rows are built for.
 * @returns The rows accepted by `add`, in file order.
 */
function jsonl(path: string, lane: Lane): Row[] {
  if (!existsSync(path)) return [];
  const rows: Row[] = [];
  const source = sourceLabel(path);
  // Decoding as "utf8" keeps a leading byte order mark, and JSON.parse rejects it, which
  // would silently drop the first record of BOM-prefixed files. Strip it up front.
  const text = readFileSync(path, "utf8").replace(/^\uFEFF/, "");
  for (const line of text.split(/\n+/).filter(Boolean)) {
    try {
      const obj = JSON.parse(line);
      if (Array.isArray(obj.messages)) {
        const user = obj.messages.find((m: Message) => m.role === "user")?.content;
        // Reverse a copy so the last assistant turn is found without mutating the record.
        const assistant = [...obj.messages].reverse().find((m: Message) => m.role === "assistant")?.content;
        add(rows, lane, source, "chat-jsonl", user, assistant);
        continue;
      }
      add(
        rows,
        lane,
        source,
        "sft-jsonl",
        obj.prompt || obj.instruction || obj.question || obj.input || obj.title || obj.topic,
        obj.completion || obj.output || obj.answer || obj.response || obj.article || obj.content || obj.text || obj.summary,
      );
    } catch {
      // Ignore malformed legacy rows; usable row counts are tracked in the manifest.
    }
  }
  return rows;
}
/**
 * Turns one Markdown article into a single blog_llm training row.
 *
 * The trimmed file content becomes the assistant message. The title is the first
 * level-one heading ("# ...") found in the article; when there is none (or it is empty),
 * the file name without extension is used, with "-" and "_" replaced by spaces. Articles
 * shorter than 400 characters are ignored.
 *
 * @param path Path of the Markdown file.
 * @returns An array with the single row, or an empty array for short articles.
 */
function markdownBlog(path: string): Row[] {
  const article = readFileSync(path, "utf8").trim();
  if (article.length < 400) return [];

  const heading = /^#\s+(.+)$/m.exec(article);
  const fromHeading = heading?.[1]?.trim();
  const title = fromHeading || basename(path, extname(path)).replace(/[-_]/g, " ");

  const request = `Write a TIP/Flexoptix technical blog article with this angle: ${title}\n\nUse a practical network-engineer voice, include concrete deployment context, and avoid generic marketing language.`;
  const row: Row = {
    id: id(["blog_llm", title, article]),
    lane: "blog_llm",
    source: sourceLabel(path),
    kind: "markdown-blog",
    messages: [
      { role: "system", content: system.blog_llm },
      { role: "user", content: request },
      { role: "assistant", content: article },
    ],
  };
  return [row];
}
/**
 * Gathers every raw row for a lane, before de-duplication.
 *
 * Each configured JSONL file name is looked up under the external root, the blog-mega
 * root and the repository root, in that order. For the blog_llm lane, Markdown articles
 * from <repo>/blog-training-data and <external>/v6-tip-blogs are appended afterwards;
 * files named readme.md (any casing) are skipped.
 *
 * @param lane Lane to collect rows for.
 * @returns All rows found, in discovery order.
 */
function collect(lane: Lane): Row[] {
  const roots = [externalRoot, blogMegaRoot, repoRoot];
  const rows: Row[] = files[lane].flatMap((file) => roots.flatMap((root) => jsonl(join(root, file), lane)));
  if (lane !== "blog_llm") return rows;

  const blogDirs = [join(repoRoot, "blog-training-data"), join(externalRoot, "v6-tip-blogs")];
  for (const dir of blogDirs) {
    if (!existsSync(dir)) continue;
    const articles = readdirSync(dir).filter((name) => name.endsWith(".md") && name.toLowerCase() !== "readme.md");
    for (const name of articles) rows.push(...markdownBlog(join(dir, name)));
  }
  return rows;
}
/**
 * Removes rows whose lane, user text and assistant text are identical.
 *
 * The de-duplication key is the hash of the lane plus the second and third message
 * contents (missing or empty content counts as ""). The first occurrence of each key is
 * kept and its `id` is replaced by that key. The surviving rows are returned sorted by id.
 *
 * @param rows Rows to de-duplicate; the input array is not modified.
 * @returns The unique rows (sorted by id) and the number of duplicates dropped.
 */
function dedupe(rows: Row[]) {
  const firstByKey = new Map<string, Row>();
  let duplicates = 0;

  for (const row of rows) {
    const [, user, assistant] = row.messages;
    const key = id([row.lane, user?.content || "", assistant?.content || ""]);
    if (firstByKey.has(key)) {
      duplicates += 1;
      continue;
    }
    firstByKey.set(key, { ...row, id: key });
  }

  // Map iteration follows insertion order, so this is the first-seen order before sorting.
  const unique = [...firstByKey.values()];
  unique.sort((a, b) => a.id.localeCompare(b.id));
  return { rows: unique, duplicates };
}
/**
 * Builds and writes the dataset files for one lane and records its summary.
 *
 * Rows are collected, de-duplicated and split by position: every `evalEvery`-th row
 * (index 0, evalEvery, 2*evalEvery, ...) goes to the eval set and the rest to the train
 * set, where `evalEvery` is 10 when there are at least 20 rows and 5 otherwise.
 *
 * Written to <outRoot>/<lane>/:
 *  - <lane>-sft-all.jsonl, <lane>-sft-train.jsonl, <lane>-sft-eval.jsonl
 *  - manifest.json with the lane summary
 *
 * @param lane     Lane to build.
 * @param manifest Run manifest; the lane summary is stored at `manifest.lanes[lane]`.
 */
function writeLane(lane: Lane, manifest: { lanes: Partial<Record<Lane, unknown>> }) {
  const raw = collect(lane);
  const { rows, duplicates } = dedupe(raw);

  const evalEvery = rows.length >= 20 ? 10 : 5;
  const evalRows = rows.filter((_r, i) => i % evalEvery === 0);
  const trainRows = rows.filter((_r, i) => i % evalEvery !== 0);

  const laneDir = join(outRoot, lane);
  mkdirSync(laneDir, { recursive: true });

  const write = (name: string, data: Row[]) => {
    const lines = data.map((r) => JSON.stringify({ id: r.id, source: r.source, kind: r.kind, messages: r.messages }));
    // An empty split must produce an empty file; a lone "\n" would be a blank JSONL record.
    writeFileSync(join(laneDir, name), lines.length > 0 ? lines.join("\n") + "\n" : "");
  };
  write(`${lane}-sft-all.jsonl`, rows);
  write(`${lane}-sft-train.jsonl`, trainRows);
  write(`${lane}-sft-eval.jsonl`, evalRows);

  // Count rows per source in a single pass; keys keep first-seen order.
  const counts = new Map<string, number>();
  for (const r of rows) counts.set(r.source, (counts.get(r.source) ?? 0) + 1);
  const sources: Record<string, number> = Object.fromEntries(counts);

  const summary = {
    raw_pairs: raw.length,
    duplicates_removed: duplicates,
    training_pairs: rows.length,
    train_pairs: trainRows.length,
    eval_pairs: evalRows.length,
    sources,
    files: {
      train: `training-data/runpod/${lane}/${lane}-sft-train.jsonl`,
      eval: `training-data/runpod/${lane}/${lane}-sft-eval.jsonl`,
      all: `training-data/runpod/${lane}/${lane}-sft-all.jsonl`,
      manifest: `training-data/runpod/${lane}/manifest.json`,
    },
  };
  manifest.lanes[lane] = summary;
  writeFileSync(join(laneDir, "manifest.json"), JSON.stringify(summary, null, 2) + "\n");
}
// Script entry point: build both lanes, then write the combined manifest to
// <outRoot>/manifest.json and echo the same JSON to stdout.
const manifest = {
  generated_at: new Date().toISOString(),
  version: "TIP-LearningPool-v1",
  lanes: {} as Record<Lane, unknown>,
};
for (const lane of ["tip_llm", "blog_llm"] as const) writeLane(lane, manifest);
mkdirSync(outRoot, { recursive: true });
const manifestJson = JSON.stringify(manifest, null, 2);
writeFileSync(join(outRoot, "manifest.json"), manifestJson + "\n");
console.log(manifestJson);