feat(training): add TIP selflearning workflows
This commit is contained in:
parent
ba998f4c01
commit
8c625ff1d2
33
docs/TIP_SELFLEARNING_WORKFLOW.md
Normal file
33
docs/TIP_SELFLEARNING_WORKFLOW.md
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
# TIP Selflearning Workflow
|
||||||
|
|
||||||
|
TIP now has two separate learning lanes:
|
||||||
|
|
||||||
|
- `TIP_LLM`: research, crawler planning, vendor/market intelligence and data preparation.
|
||||||
|
- `Blog_LLM`: FO_BlogLLM/founder content and practical technical blog generation.
|
||||||
|
|
||||||
|
Commands:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm run learning-pool:build
|
||||||
|
npm run learning-pool:publish-hf
|
||||||
|
```
|
||||||
|
|
||||||
|
Dashboard/API:
|
||||||
|
|
||||||
|
- `GET /api/selflearning/status`
|
||||||
|
- `POST /api/selflearning/build`
|
||||||
|
- `POST /api/selflearning/publish-hf`
|
||||||
|
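- `POST /api/selflearning/train` with `{ "lane": "tip_llm"|"blog_llm", "provider": "runpod"|"local", "seed_only"?: boolean, "max_steps"?: number }` — `provider` defaults to `runpod`, `seed_only` defaults to `true`, and `max_steps` defaults to 200 for seed runs and 2000 for full runs.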
|
||||||
|
|
||||||
|
Secrets are read from environment variables or macOS Keychain, never from committed files:
|
||||||
|
|
||||||
|
- RunPod: `RUNPOD_API_KEY` / `TIP_RUNPOD_API_KEY`, Keychain `magatama.runpod.api` / `tip.runpod.api`
|
||||||
|
- Hugging Face: `HF_TOKEN` / `HUGGINGFACE_TOKEN`, Keychain `magatama.huggingface.token` / `tip.huggingface.token`
|
||||||
|
- Endpoint: `TIP_RUNPOD_ENDPOINT_ID` or `RUNPOD_ENDPOINT_ID`
|
||||||
|
|
||||||
|
Default private Hugging Face datasets:
|
||||||
|
|
||||||
|
- `renefichtmueller/tip-llm-sft`
|
||||||
|
- `renefichtmueller/blog-llm-sft`
|
||||||
|
|
||||||
|
Local training is enabled by setting `TIP_LOCAL_TRAIN_COMMAND`. The API runs that command through `sh -lc` from the repository root and appends the lane name (`tip_llm` or `blog_llm`) as its last argument.
|
||||||
@ -11,6 +11,8 @@
|
|||||||
"build:core": "npm run build -w packages/core",
|
"build:core": "npm run build -w packages/core",
|
||||||
"build:api": "npm run build -w packages/api",
|
"build:api": "npm run build -w packages/api",
|
||||||
"dev": "npm run dev -w packages/api",
|
"dev": "npm run dev -w packages/api",
|
||||||
|
"learning-pool:build": "tsx scripts/tip-learning-pool-build.ts",
|
||||||
|
"learning-pool:publish-hf": "python3 scripts/tip-publish-hf-datasets.py",
|
||||||
"migrate": "tsx scripts/migrate.ts",
|
"migrate": "tsx scripts/migrate.ts",
|
||||||
"seed": "tsx scripts/seed-from-npm.ts",
|
"seed": "tsx scripts/seed-from-npm.ts",
|
||||||
"db:reset": "npm run migrate && npm run seed"
|
"db:reset": "npm run migrate && npm run seed"
|
||||||
|
|||||||
@ -31,6 +31,7 @@ import { proxyRouter } from "./routes/proxy";
|
|||||||
import { reviewRouter } from "./routes/review";
|
import { reviewRouter } from "./routes/review";
|
||||||
import { stockRouter } from "./routes/stock";
|
import { stockRouter } from "./routes/stock";
|
||||||
import { priceComparisonRouter } from "./routes/price-comparison";
|
import { priceComparisonRouter } from "./routes/price-comparison";
|
||||||
|
import { selflearningRouter } from "./routes/selflearning";
|
||||||
|
|
||||||
const app = express();
|
const app = express();
|
||||||
|
|
||||||
@ -91,6 +92,7 @@ app.use("/api/news", newsRouter);
|
|||||||
app.use("/api/review", reviewRouter);
|
app.use("/api/review", reviewRouter);
|
||||||
app.use("/api/stock", stockRouter);
|
app.use("/api/stock", stockRouter);
|
||||||
app.use("/api/price-comparison", priceComparisonRouter);
|
app.use("/api/price-comparison", priceComparisonRouter);
|
||||||
|
app.use("/api/selflearning", selflearningRouter);
|
||||||
|
|
||||||
// Dashboard (static HTML)
|
// Dashboard (static HTML)
|
||||||
app.use("/dashboard", express.static(join(__dirname, "..", "..", "dashboard")));
|
app.use("/dashboard", express.static(join(__dirname, "..", "..", "dashboard")));
|
||||||
@ -128,6 +130,10 @@ app.get("/api", (_req, res) => {
|
|||||||
"GET /api/blog",
|
"GET /api/blog",
|
||||||
"GET /api/blog/:id",
|
"GET /api/blog/:id",
|
||||||
"PUT /api/blog/:id/status {status: draft|review|approved|published}",
|
"PUT /api/blog/:id/status {status: draft|review|approved|published}",
|
||||||
|
"GET /api/selflearning/status",
|
||||||
|
"POST /api/selflearning/build",
|
||||||
|
"POST /api/selflearning/publish-hf",
|
||||||
|
"POST /api/selflearning/train {lane, provider, seed_only?, max_steps?}",
|
||||||
],
|
],
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
197
packages/api/src/routes/selflearning.ts
Normal file
197
packages/api/src/routes/selflearning.ts
Normal file
@ -0,0 +1,197 @@
|
|||||||
|
import { execFile } from "child_process";
|
||||||
|
import { existsSync, readFileSync } from "fs";
|
||||||
|
import { join } from "path";
|
||||||
|
import { promisify } from "util";
|
||||||
|
import { Router, Request, Response } from "express";
|
||||||
|
|
||||||
|
/** The two independent training lanes handled by this router. */
type Lane = "tip_llm" | "blog_llm";

/** Where a training run is executed. */
type Provider = "runpod" | "local";

/** Promise-returning variant of `child_process.execFile`. */
const execFileAsync = promisify(execFile);

/** Router with the selflearning endpoints; the API entry point mounts it under `/api/selflearning`. */
export const selflearningRouter = Router();

// Four directory levels above this file's directory; used as the working
// directory for every spawned command.
const repoRoot = join(__dirname, "../../../..");

// Manifest file whose presence the handlers use to decide whether the
// learning pool still has to be built.
const manifestPath = join(repoRoot, "training-data/runpod/manifest.json");
|
||||||
|
|
||||||
|
/**
 * Type guard for request input: true only for the exact strings
 * `"tip_llm"` and `"blog_llm"`.
 */
function isLane(value: unknown): value is Lane {
  switch (value) {
    case "tip_llm":
    case "blog_llm":
      return true;
    default:
      return false;
  }
}
|
||||||
|
|
||||||
|
/**
 * Loads and parses the learning-pool manifest.
 *
 * @returns The parsed manifest, or `null` when the file does not exist or
 *   does not contain valid JSON. Every caller in this file treats `null` as
 *   "no manifest yet, build the pool first", so a truncated or half-written
 *   file is handled the same way instead of throwing into a request handler.
 * @throws Re-throws any read error other than "file not found" (for example
 *   a permission problem), because that is not something a rebuild would fix.
 */
function readManifest(): unknown {
  let raw: string;
  try {
    // Read directly instead of existsSync + readFileSync: the file can
    // disappear between the two calls while a build is rewriting it.
    raw = readFileSync(manifestPath, "utf8");
  } catch (err: unknown) {
    const code = typeof err === "object" && err !== null ? (err as { code?: unknown }).code : undefined;
    if (code === "ENOENT") return null;
    throw err;
  }

  try {
    return JSON.parse(raw);
  } catch (err: unknown) {
    console.warn(`Ignoring unreadable manifest at ${manifestPath}: ${String(err)}`);
    return null;
  }
}
|
||||||
|
|
||||||
|
/**
 * Runs an external program from the repository root and collects its output.
 *
 * @param command Executable to start.
 * @param args Arguments passed to the executable.
 * @param timeoutMs Time limit handed to `execFile`; 20 minutes by default.
 * @returns The captured `stdout` and `stderr`.
 * @throws Whatever the promisified `execFile` rejects with.
 */
async function runCommand(command: string, args: string[], timeoutMs = 20 * 60 * 1000) {
  const outputLimitBytes = 20 * 1024 * 1024;
  const result = await execFileAsync(command, args, {
    cwd: repoRoot,
    timeout: timeoutMs,
    maxBuffer: outputLimitBytes,
    env: process.env,
  });
  return { stdout: result.stdout, stderr: result.stderr };
}
|
||||||
|
|
||||||
|
/**
 * Looks up a generic password by service name with the macOS `security`
 * command-line tool.
 *
 * @param service Service name passed to `find-generic-password -s`.
 * @returns The trimmed password, or `null` when not running on macOS, when
 *   the lookup fails for any reason, or when the output is empty.
 */
async function keychain(service: string): Promise<string | null> {
  if (process.platform !== "darwin") return null;

  const lookupArgs = ["find-generic-password", "-s", service, "-w"];
  try {
    const result = await execFileAsync("security", lookupArgs, { timeout: 5000 });
    const password = result.stdout.trim();
    return password === "" ? null : password;
  } catch {
    // Best effort: any lookup failure is reported as "not available".
    return null;
  }
}
|
||||||
|
|
||||||
|
/**
 * Resolves a secret, preferring environment variables over the Keychain.
 *
 * @param envNames Environment variable names, checked in order; the first
 *   non-empty value wins.
 * @param services Keychain service names, tried in order only when no
 *   environment variable is set.
 * @returns The secret, or `null` when no source provides one.
 */
async function secret(envNames: string[], services: string[]): Promise<string | null> {
  const fromEnv = envNames.map((name) => process.env[name]).find((candidate) => Boolean(candidate));
  if (fromEnv) return fromEnv;

  // Sequential on purpose: stop at the first service that yields a value.
  for (const service of services) {
    const fromKeychain = await keychain(service);
    if (fromKeychain) return fromKeychain;
  }
  return null;
}
|
||||||
|
|
||||||
|
/**
 * Returns the Hugging Face dataset id for a lane.
 *
 * The per-lane environment variable (`TIP_HF_DATASET_TIP_LLM` or
 * `TIP_HF_DATASET_BLOG_LLM`) takes precedence when it is set and non-empty;
 * otherwise the built-in default is used.
 */
function dataset(lane: Lane): string {
  if (lane === "tip_llm") {
    return process.env.TIP_HF_DATASET_TIP_LLM || "renefichtmueller/tip-llm-sft";
  }
  return process.env.TIP_HF_DATASET_BLOG_LLM || "renefichtmueller/blog-llm-sft";
}
|
||||||
|
|
||||||
|
/**
 * Builds the Hugging Face model repository id for one training run.
 *
 * @param lane Lane being trained; selects the override variable
 *   (`TIP_HF_MODEL_TIP_LLM` or `TIP_HF_MODEL_BLOG_LLM`) and the default prefix.
 * @param runId Run identifier appended after a hyphen.
 * @returns `<prefix>-<runId>`, where the prefix is the non-empty override or
 *   the built-in default.
 */
function modelRepo(lane: Lane, runId: string): string {
  const [override, fallback] =
    lane === "tip_llm"
      ? [process.env.TIP_HF_MODEL_TIP_LLM, "renefichtmueller/TIP_LLM"]
      : [process.env.TIP_HF_MODEL_BLOG_LLM, "renefichtmueller/Blog_LLM"];
  const prefix = override || fallback;
  return prefix + "-" + runId;
}
|
||||||
|
|
||||||
|
/**
 * Assembles the `input` object sent to the RunPod endpoint for one run.
 *
 * @param lane Lane to train; selects the dataset and the per-lane
 *   hyperparameters (sequence length, LoRA rank/alpha, epochs, learning rate).
 * @param seedOnly When true: one epoch, checkpoints every 50 steps, and
 *   `push_to_hub` is false. Otherwise per-lane epochs, checkpoints every 250
 *   steps, and `push_to_hub` is true.
 * @param maxSteps Value forwarded as `max_steps`.
 * @param runId Run identifier, used for `model_id`, `run_id`, the output
 *   directory and the hub model id.
 * @param hfToken Hugging Face token forwarded under `credentials.hf_token`.
 * @returns The job input; property order is kept stable because the object
 *   is serialised into the request body.
 */
function runpodInput(lane: Lane, seedOnly: boolean, maxSteps: number, runId: string, hfToken: string) {
  // Values that differ between the two lanes.
  const tuning =
    lane === "blog_llm"
      ? { sequenceLen: 4096, loraR: 48, loraAlpha: 96, fullEpochs: 2, learningRate: 0.00016 }
      : { sequenceLen: 3072, loraR: 32, loraAlpha: 64, fullEpochs: 3, learningRate: 0.00018 };

  const jobName = `${lane}-${runId}`;
  const epochs = seedOnly ? 1 : tuning.fullEpochs;
  const checkpointEvery = seedOnly ? 50 : 250;

  return {
    user_id: "tip-selflearning",
    model_id: jobName,
    run_id: runId,
    credentials: { hf_token: hfToken },
    args: {
      base_model: process.env.TIP_RUNPOD_BASE_MODEL || "Qwen/Qwen2.5-Coder-7B-Instruct",
      model_type: "AutoModelForCausalLM",
      tokenizer_type: "AutoTokenizer",
      load_in_4bit: true,
      strict: false,
      datasets: [{ path: dataset(lane), type: "chat_template", split: "train" }],
      val_set_size: 0.02,
      output_dir: `/workspace/outputs/${jobName}`,
      sequence_len: tuning.sequenceLen,
      sample_packing: true,
      eval_sample_packing: false,
      pad_to_sequence_len: true,
      adapter: "qlora",
      lora_r: tuning.loraR,
      lora_alpha: tuning.loraAlpha,
      lora_dropout: 0.05,
      lora_target_linear: true,
      lora_modules_to_save: ["embed_tokens", "lm_head"],
      gradient_accumulation_steps: 2,
      micro_batch_size: 1,
      num_epochs: epochs,
      optimizer: "adamw_torch_fused",
      lr_scheduler: "cosine",
      learning_rate: tuning.learningRate,
      train_on_inputs: false,
      bf16: "auto",
      tf32: true,
      gradient_checkpointing: true,
      flash_attention: true,
      logging_steps: 5,
      warmup_steps: 10,
      evals_per_epoch: 1,
      save_steps: checkpointEvery,
      max_steps: maxSteps,
      push_to_hub: !seedOnly,
      hub_model_id: modelRepo(lane, runId),
      hub_strategy: "end",
      hub_private_repo: true,
      hf_use_auth_token: true,
      special_tokens: { pad_token: "<|endoftext|>" },
    },
  };
}
|
||||||
|
|
||||||
|
/**
 * GET /status — reports the current manifest, the dataset and model prefix
 * per lane, and which credentials / local command are configured.
 *
 * Secrets themselves are never returned, only booleans. Any failure is
 * answered with HTTP 500 and `{ success: false, error }`, matching the other
 * handlers in this router, so a throw never leaves the request without a
 * response.
 */
selflearningRouter.get("/status", async (_req: Request, res: Response) => {
  try {
    const runpodEndpoint = process.env.TIP_RUNPOD_ENDPOINT_ID || process.env.RUNPOD_ENDPOINT_ID || null;

    // The two lookups are independent, so run them concurrently.
    const [runpodToken, hfToken] = await Promise.all([
      secret(["RUNPOD_API_KEY", "TIP_RUNPOD_API_KEY"], ["magatama.runpod.api", "tip.runpod.api"]),
      secret(["HF_TOKEN", "HUGGINGFACE_TOKEN"], ["magatama.huggingface.token", "tip.huggingface.token"]),
    ]);

    res.json({
      success: true,
      manifest: readManifest(),
      lanes: {
        tip_llm: { dataset: dataset("tip_llm"), target_model_prefix: process.env.TIP_HF_MODEL_TIP_LLM || "renefichtmueller/TIP_LLM" },
        blog_llm: { dataset: dataset("blog_llm"), target_model_prefix: process.env.TIP_HF_MODEL_BLOG_LLM || "renefichtmueller/Blog_LLM" },
      },
      runpod: { endpoint_configured: Boolean(runpodEndpoint), api_key_configured: Boolean(runpodToken) },
      huggingface: { token_configured: Boolean(hfToken) },
      local: { command: process.env.TIP_LOCAL_TRAIN_COMMAND || "not configured", ready: Boolean(process.env.TIP_LOCAL_TRAIN_COMMAND) },
    });
  } catch (err) {
    res.status(500).json({ success: false, error: String(err) });
  }
});
|
||||||
|
|
||||||
|
/**
 * POST /build — runs `npm run learning-pool:build` and returns the resulting
 * manifest together with the tail of the command output (last 4000
 * characters of each stream). Failures are answered with HTTP 500.
 */
selflearningRouter.post("/build", async (_req: Request, res: Response) => {
  try {
    const { stdout, stderr } = await runCommand("npm", ["run", "learning-pool:build"]);
    const payload = {
      success: true,
      manifest: readManifest(),
      stdout: stdout.slice(-4000),
      stderr: stderr.slice(-4000),
    };
    res.json(payload);
  } catch (err) {
    const failure = { success: false, error: String(err) };
    res.status(500).json(failure);
  }
});
|
||||||
|
|
||||||
|
/**
 * POST /publish-hf — runs `npm run learning-pool:publish-hf` with a
 * 30-minute limit, building the learning pool first when no manifest is
 * available. Returns the tail of the publish output (6000 characters of
 * stdout, 4000 of stderr); failures are answered with HTTP 500.
 */
selflearningRouter.post("/publish-hf", async (_req: Request, res: Response) => {
  const publishTimeoutMs = 30 * 60 * 1000;
  try {
    const manifestMissing = !readManifest();
    if (manifestMissing) {
      await runCommand("npm", ["run", "learning-pool:build"]);
    }

    const { stdout, stderr } = await runCommand("npm", ["run", "learning-pool:publish-hf"], publishTimeoutMs);
    res.json({ success: true, stdout: stdout.slice(-6000), stderr: stderr.slice(-4000) });
  } catch (err) {
    const failure = { success: false, error: String(err) };
    res.status(500).json(failure);
  }
});
|
||||||
|
|
||||||
|
/**
 * POST /train — starts a training run for one lane.
 *
 * Request body:
 * - `lane` (required): `"tip_llm"` or `"blog_llm"`.
 * - `provider`: `"runpod"` (default) or `"local"`; any other value is
 *   rejected with HTTP 400 instead of silently starting a RunPod job.
 * - `seed_only`: treated as true unless it is exactly `false`.
 * - `max_steps`: RunPod only; defaults to 200 for seed runs and 2000 for
 *   full runs, and must convert to an integer.
 *
 * Local runs execute `TIP_LOCAL_TRAIN_COMMAND` through `sh -lc` with the lane
 * name appended and answer only once the command has finished (limit: 12
 * hours). RunPod runs submit a job to the configured endpoint and answer as
 * soon as RunPod has accepted it.
 *
 * Responses: 400 for invalid input, 409 when the chosen provider is not
 * configured, the upstream status when RunPod rejects the job, 500 for any
 * other failure.
 */
selflearningRouter.post("/train", async (req: Request, res: Response) => {
  const lane = req.body?.lane;
  const provider: unknown = req.body?.provider || "runpod";
  const seedOnly = req.body?.seed_only !== false;

  if (!isLane(lane)) {
    res.status(400).json({ success: false, error: "lane must be tip_llm or blog_llm" });
    return;
  }
  if (provider !== "runpod" && provider !== "local") {
    res.status(400).json({ success: false, error: "provider must be runpod or local" });
    return;
  }

  if (provider === "local") {
    const command = process.env.TIP_LOCAL_TRAIN_COMMAND;
    if (!command) {
      res.status(409).json({ success: false, error: "Local training command is not configured.", suggestion: "Set TIP_LOCAL_TRAIN_COMMAND; the lane name is appended automatically." });
      return;
    }
    try {
      // `lane` passed isLane above, so only one of two fixed strings is ever
      // interpolated into the shell command line.
      const out = await runCommand("sh", ["-lc", `${command} ${lane}`], 12 * 60 * 60 * 1000);
      res.json({ success: true, provider, lane, stdout: out.stdout.slice(-6000), stderr: out.stderr.slice(-4000) });
    } catch (err) {
      // A failing or timed-out command must still produce a response.
      res.status(500).json({ success: false, error: String(err) });
    }
    return;
  }

  // Checked only on the RunPod path, the only place the value is used.
  const maxSteps = Number(req.body?.max_steps || (seedOnly ? 200 : 2000));
  if (!Number.isInteger(maxSteps)) {
    res.status(400).json({ success: false, error: "max_steps must be an integer" });
    return;
  }

  try {
    const endpoint = process.env.TIP_RUNPOD_ENDPOINT_ID || process.env.RUNPOD_ENDPOINT_ID;
    const runpodToken = await secret(["RUNPOD_API_KEY", "TIP_RUNPOD_API_KEY"], ["magatama.runpod.api", "tip.runpod.api"]);
    const hfToken = await secret(["HF_TOKEN", "HUGGINGFACE_TOKEN"], ["magatama.huggingface.token", "tip.huggingface.token"]);
    if (!endpoint || !runpodToken || !hfToken) {
      res.status(409).json({ success: false, error: "RunPod/Hugging Face credentials are incomplete.", runpod_endpoint_configured: Boolean(endpoint), runpod_api_key_configured: Boolean(runpodToken), hf_token_configured: Boolean(hfToken) });
      return;
    }

    if (!readManifest()) await runCommand("npm", ["run", "learning-pool:build"]);

    // Run id: lane name plus the first 12 digits of the ISO timestamp with
    // its separators removed (YYYYMMDDHHMM).
    const runId = `${lane}-v${new Date().toISOString().replace(/[-:T.Z]/g, "").slice(0, 12)}`;
    const input = runpodInput(lane, seedOnly, maxSteps, runId, hfToken);
    const response = await fetch(`https://api.runpod.ai/v2/${endpoint}/run`, {
      method: "POST",
      headers: { "Content-Type": "application/json", Authorization: `Bearer ${runpodToken}` },
      body: JSON.stringify({ input, policy: { executionTimeout: Number(process.env.TIP_RUNPOD_EXECUTION_TIMEOUT || 12 * 60 * 60), ttl: Number(process.env.TIP_RUNPOD_JOB_TTL || 24 * 60 * 60) } }),
    });
    // A non-JSON reply is reported as an empty object.
    const body = await response.json().catch(() => ({}));
    if (!response.ok) {
      res.status(response.status).json({ success: false, error: "RunPod request failed", details: body });
      return;
    }
    res.json({ success: true, provider, lane, seed_only: seedOnly, run_id: runId, dataset: dataset(lane), target_model: input.args.hub_model_id, runpod: body });
  } catch (err) {
    res.status(500).json({ success: false, error: String(err) });
  }
});
|
||||||
@ -800,6 +800,7 @@
|
|||||||
<div class="tab" data-tab="blog">Blog Engine</div>
|
<div class="tab" data-tab="blog">Blog Engine</div>
|
||||||
<div class="tab" data-tab="procurement">Procurement Intelligence</div>
|
<div class="tab" data-tab="procurement">Procurement Intelligence</div>
|
||||||
<div class="tab" data-tab="crawlers">🕷 Crawler Intelligence</div>
|
<div class="tab" data-tab="crawlers">🕷 Crawler Intelligence</div>
|
||||||
|
<div class="tab" data-tab="selflearning">Selflearning</div>
|
||||||
<div class="tab" data-tab="network">🌐 Network</div>
|
<div class="tab" data-tab="network">🌐 Network</div>
|
||||||
<div class="tab" data-tab="review" id="tab-review-nav">✎ Review <span id="review-pending-badge" style="display:none;background:#f97316;color:#fff;border-radius:10px;padding:1px 7px;font-size:0.68rem;margin-left:4px;font-weight:700"></span></div>
|
<div class="tab" data-tab="review" id="tab-review-nav">✎ Review <span id="review-pending-badge" style="display:none;background:#f97316;color:#fff;border-radius:10px;padding:1px 7px;font-size:0.68rem;margin-left:4px;font-weight:700"></span></div>
|
||||||
<div class="tab" data-tab="stock">🏭 Stock <span style="font-size:0.6rem;color:#f59e0b;vertical-align:middle">⚠DEMO</span></div>
|
<div class="tab" data-tab="stock">🏭 Stock <span style="font-size:0.6rem;color:#f59e0b;vertical-align:middle">⚠DEMO</span></div>
|
||||||
@ -1618,6 +1619,54 @@
|
|||||||
</div>
|
</div>
|
||||||
</div><!-- end tab-crawlers -->
|
</div><!-- end tab-crawlers -->
|
||||||
|
|
||||||
|
<!-- SELFLEARNING -->
|
||||||
|
<div id="tab-selflearning" class="hidden fade-in">
|
||||||
|
<div style="display:flex;justify-content:space-between;align-items:flex-start;gap:1rem;margin-bottom:1.25rem">
|
||||||
|
<div>
|
||||||
|
<h2 style="margin:0 0 0.35rem;font-size:1.1rem;font-weight:800;color:var(--text-bright)">Selflearning Control Center</h2>
|
||||||
|
<div style="font-size:0.8rem;color:var(--text-dim)">Getrennte Trainingsketten fuer TIP_LLM und Blog_LLM: Pool bauen, deduplizieren, HF syncen, lokal oder RunPod starten.</div>
|
||||||
|
</div>
|
||||||
|
<div style="display:flex;gap:0.5rem;flex-wrap:wrap;justify-content:flex-end">
|
||||||
|
<button onclick="loadSelflearning()" class="b b-dim" style="padding:7px 12px;border-radius:7px;cursor:pointer">Refresh</button>
|
||||||
|
<button onclick="buildSelflearningPool()" style="background:var(--accent);color:#fff;border:none;padding:7px 12px;border-radius:7px;cursor:pointer;font-weight:700">Build Pool</button>
|
||||||
|
<button onclick="publishSelflearningHF()" style="background:#2563eb;color:#fff;border:none;padding:7px 12px;border-radius:7px;cursor:pointer;font-weight:700">Publish HF</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div id="selflearning-status-banner" class="card" style="margin-bottom:1rem;border-left:3px solid var(--accent);font-size:0.82rem;color:var(--text-dim)">Loading selflearning status...</div>
|
||||||
|
<div class="grid mb" style="grid-template-columns:repeat(2,1fr);gap:1rem">
|
||||||
|
<div class="card">
|
||||||
|
<div style="display:flex;justify-content:space-between;gap:0.75rem;align-items:flex-start;margin-bottom:0.75rem">
|
||||||
|
<div><div style="font-size:0.88rem;font-weight:800;color:var(--text-bright)">TIP_LLM_Vx.x</div><div style="font-size:0.72rem;color:var(--text-dim);margin-top:2px">Research, Crawler, Wettbewerbsdaten, Vendor-/Market-Intelligence</div></div>
|
||||||
|
<span id="sl-tip-state" class="b b-blue">unknown</span>
|
||||||
|
</div>
|
||||||
|
<div id="sl-tip-metrics" style="display:grid;grid-template-columns:repeat(4,1fr);gap:0.5rem;margin-bottom:0.75rem"></div>
|
||||||
|
<div style="font-size:0.7rem;color:var(--text-dim);margin-bottom:0.5rem">Dataset: <code id="sl-tip-dataset">-</code></div>
|
||||||
|
<div style="display:flex;gap:0.5rem;flex-wrap:wrap">
|
||||||
|
<button onclick="startSelflearningTrain('tip_llm','runpod',true)" style="background:#0f766e;color:#fff;border:none;padding:6px 10px;border-radius:6px;cursor:pointer;font-size:0.75rem;font-weight:700">RunPod Seed</button>
|
||||||
|
<button onclick="startSelflearningTrain('tip_llm','runpod',false)" style="background:#b45309;color:#fff;border:none;padding:6px 10px;border-radius:6px;cursor:pointer;font-size:0.75rem;font-weight:700">RunPod Full</button>
|
||||||
|
<button onclick="startSelflearningTrain('tip_llm','local',true)" style="background:var(--surface2);color:var(--text);border:1px solid var(--border);padding:6px 10px;border-radius:6px;cursor:pointer;font-size:0.75rem;font-weight:700">Local Train</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<div style="display:flex;justify-content:space-between;gap:0.75rem;align-items:flex-start;margin-bottom:0.75rem">
|
||||||
|
<div><div style="font-size:0.88rem;font-weight:800;color:var(--text-bright)">Blog_LLM_Vx.x</div><div style="font-size:0.72rem;color:var(--text-dim);margin-top:2px">FO_BlogLLM, Founder Content, technische TIP-/Flexoptix-Artikel</div></div>
|
||||||
|
<span id="sl-blog-state" class="b b-blue">unknown</span>
|
||||||
|
</div>
|
||||||
|
<div id="sl-blog-metrics" style="display:grid;grid-template-columns:repeat(4,1fr);gap:0.5rem;margin-bottom:0.75rem"></div>
|
||||||
|
<div style="font-size:0.7rem;color:var(--text-dim);margin-bottom:0.5rem">Dataset: <code id="sl-blog-dataset">-</code></div>
|
||||||
|
<div style="display:flex;gap:0.5rem;flex-wrap:wrap">
|
||||||
|
<button onclick="startSelflearningTrain('blog_llm','runpod',true)" style="background:#0f766e;color:#fff;border:none;padding:6px 10px;border-radius:6px;cursor:pointer;font-size:0.75rem;font-weight:700">RunPod Seed</button>
|
||||||
|
<button onclick="startSelflearningTrain('blog_llm','runpod',false)" style="background:#b45309;color:#fff;border:none;padding:6px 10px;border-radius:6px;cursor:pointer;font-size:0.75rem;font-weight:700">RunPod Full</button>
|
||||||
|
<button onclick="startSelflearningTrain('blog_llm','local',true)" style="background:var(--surface2);color:var(--text);border:1px solid var(--border);padding:6px 10px;border-radius:6px;cursor:pointer;font-size:0.75rem;font-weight:700">Local Train</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<div style="font-size:0.82rem;font-weight:800;color:var(--text-bright);margin-bottom:0.6rem">Training Log</div>
|
||||||
|
<pre id="selflearning-log" style="margin:0;max-height:260px;overflow:auto;background:var(--surface2);border:1px solid var(--border);border-radius:8px;padding:0.8rem;font-size:0.72rem;color:var(--text);white-space:pre-wrap">Noch kein Lauf in dieser Dashboard-Session.</pre>
|
||||||
|
</div>
|
||||||
|
</div><!-- end tab-selflearning -->
|
||||||
|
|
||||||
<!-- NETWORK TAB -->
|
<!-- NETWORK TAB -->
|
||||||
<div id="tab-network" class="hidden fade-in">
|
<div id="tab-network" class="hidden fade-in">
|
||||||
<h2 style="margin-bottom:1.25rem;font-size:1.1rem;font-weight:700">🌐 TIP Proxy Network</h2>
|
<h2 style="margin-bottom:1.25rem;font-size:1.1rem;font-weight:700">🌐 TIP Proxy Network</h2>
|
||||||
@ -2350,6 +2399,7 @@ function goToTab(tabName) {
|
|||||||
if (tabName === 'blog') { loadBlogDrafts(); loadSLLInsights(); loadBlogLLMStatus(); loadPostingTime(); }
|
if (tabName === 'blog') { loadBlogDrafts(); loadSLLInsights(); loadBlogLLMStatus(); loadPostingTime(); }
|
||||||
if (tabName === 'finder') document.getElementById('finder-switch-input').focus();
|
if (tabName === 'finder') document.getElementById('finder-switch-input').focus();
|
||||||
if (tabName === 'crawlers') loadCrawlerStatus();
|
if (tabName === 'crawlers') loadCrawlerStatus();
|
||||||
|
if (tabName === 'selflearning') loadSelflearning();
|
||||||
if (tabName === 'procurement') loadProcurement();
|
if (tabName === 'procurement') loadProcurement();
|
||||||
if (tabName === 'network') loadProxyNetwork();
|
if (tabName === 'network') loadProxyNetwork();
|
||||||
if (tabName === 'review') loadReview();
|
if (tabName === 'review') loadReview();
|
||||||
@ -6033,6 +6083,107 @@ setTimeout(function() {
|
|||||||
if (window.loadToken && window.loadToken()) loadReviewStats().catch(function() {});
|
if (window.loadToken && window.loadToken()) loadReviewStats().catch(function() {});
|
||||||
}, 1500);
|
}, 1500);
|
||||||
|
|
||||||
|
// ── SELFLEARNING TRAINING ───────────────────────────────────────────
|
||||||
|
// Builds the HTML for one metric tile: a large value above a small label.
// `value` and `label` are passed through esc(); null/undefined values are
// shown as '-'. `color` is inserted into the style attribute as given and
// defaults to the accent colour.
function selflearningMetric(label, value, color) {
  var valueColor = color || 'var(--accent)';
  var shownValue = value == null ? '-' : value;
  var tileOpen = '<div style="background:var(--surface2);border:1px solid var(--border);border-radius:8px;padding:0.6rem;text-align:center">';
  var valueRow = '<div style="font-size:1rem;font-weight:800;color:' + valueColor + ';font-family:var(--mono)">' + esc(shownValue) + '</div>';
  var labelRow = '<div style="font-size:0.62rem;color:var(--text-dim);text-transform:uppercase;letter-spacing:0.06em;margin-top:2px">' + esc(label) + '</div></div>';
  return tileOpen + valueRow + labelRow;
}
|
||||||
|
|
||||||
|
// Renders one lane card (metrics, state badge, dataset name).
//   key      - 'tip_llm' selects the sl-tip-* elements, anything else sl-blog-*
//   data     - the lane's manifest entry, or a falsy value when there is none
//   laneInfo - the lane's entry from the status response (dataset name)
function renderSelflearningLane(key, data, laneInfo) {
  var idPrefix = key === 'tip_llm' ? 'sl-tip' : 'sl-blog';
  var metricsEl = el(idPrefix + '-metrics');
  var stateEl = el(idPrefix + '-state');
  var datasetEl = el(idPrefix + '-dataset');

  if (datasetEl) datasetEl.textContent = (laneInfo && laneInfo.dataset) || '-';
  if (!metricsEl) return;

  if (!data) {
    // No manifest entry yet: show a hint and mark the lane as needing a build.
    metricsEl.innerHTML = '<div style="grid-column:1/-1;color:var(--text-dim);font-size:0.75rem">Noch kein Manifest. Erst Build Pool starten.</div>';
    if (stateEl) {
      stateEl.textContent = 'needs build';
      stateEl.className = 'b b-yellow';
    }
    return;
  }

  var pairs = data.training_pairs || 0;
  var tiles = [
    ['Pairs', pairs, '#22c55e'],
    ['Train', data.train_pairs || 0, 'var(--accent)'],
    ['Eval', data.eval_pairs || 0, '#60a5fa'],
    ['Dedupe', data.duplicates_removed || 0, '#f59e0b']
  ];
  metricsEl.innerHTML = tiles.map(function (tile) {
    return selflearningMetric(tile[0], tile[1].toLocaleString('de-DE'), tile[2]);
  }).join('');

  if (stateEl) {
    var hasPairs = pairs > 0;
    stateEl.textContent = hasPairs ? 'ready' : 'empty';
    stateEl.className = hasPairs ? 'b b-green' : 'b b-yellow';
  }
}
|
||||||
|
|
||||||
|
// Fetches /api/selflearning/status, re-renders both lane cards and fills the
// status banner. Any failure is shown inside the banner instead of thrown.
async function loadSelflearning() {
  var banner = el('selflearning-status-banner');
  try {
    var d = await api('/api/selflearning/status');
    var manifest = d.manifest || null;
    var manifestLanes = manifest && manifest.lanes;
    var laneInfos = d.lanes;

    renderSelflearningLane('tip_llm', manifestLanes && manifestLanes.tip_llm, laneInfos && laneInfos.tip_llm);
    renderSelflearningLane('blog_llm', manifestLanes && manifestLanes.blog_llm, laneInfos && laneInfos.blog_llm);

    if (!banner) return;

    var endpointOk = d.runpod && d.runpod.endpoint_configured;
    var apiKeyOk = d.runpod && d.runpod.api_key_configured;
    var hfOk = d.huggingface && d.huggingface.token_configured;
    var localOk = d.local && d.local.ready;
    var parts = [
      'RunPod Endpoint: ' + (endpointOk ? 'ok' : 'fehlt'),
      'RunPod API: ' + (apiKeyOk ? 'ok' : 'fehlt'),
      'HF Token: ' + (hfOk ? 'ok' : 'fehlt'),
      'Local: ' + (localOk ? 'konfiguriert' : 'noch kein TIP_LOCAL_TRAIN_COMMAND')
    ];

    var manifestLine;
    if (manifest) {
      manifestLine = '<br><span style="font-size:0.72rem">Manifest: ' + esc(manifest.version || '-') + ' · ' + esc(manifest.generated_at || '-') + '</span>';
    } else {
      manifestLine = '<br><span style="font-size:0.72rem">Noch kein Manifest vorhanden.</span>';
    }
    banner.innerHTML = '<strong>Status:</strong> ' + parts.map(esc).join(' · ') + manifestLine;
  } catch (err) {
    if (banner) banner.innerHTML = '<span style="color:#f87171">Selflearning status failed: ' + esc(err.message) + '</span>';
  }
}
|
||||||
|
|
||||||
|
// Replaces the content of the training log panel; a no-op when the panel
// element is not found.
function setSelflearningLog(text) {
  var panel = el('selflearning-log');
  if (!panel) return;
  panel.textContent = text;
}
|
||||||
|
|
||||||
|
// "Build Pool" button: POSTs to /api/selflearning/build, shows the returned
// manifest (or the whole response) in the log, then refreshes the status.
async function buildSelflearningPool() {
  setSelflearningLog('Build Pool laeuft...');
  var request = { method: 'POST', headers: { 'Content-Type': 'application/json' } };
  try {
    var result = await api('/api/selflearning/build', request);
    setSelflearningLog(JSON.stringify(result.manifest || result, null, 2));
    showToast('Learning Pool gebaut', 'Deduplizierte TIP_LLM und Blog_LLM Datasets sind aktualisiert.');
    loadSelflearning();
  } catch (err) {
    var details = err.body ? '\n' + JSON.stringify(err.body, null, 2) : '';
    setSelflearningLog(err.message + details);
    showToast('Build fehlgeschlagen', err.message, true);
  }
}
|
||||||
|
|
||||||
|
// "Publish HF" button: POSTs to /api/selflearning/publish-hf, shows the
// returned stdout (plus stderr when present) in the log, then refreshes the
// status.
async function publishSelflearningHF() {
  setSelflearningLog('HF Publish laeuft...');
  var request = { method: 'POST', headers: { 'Content-Type': 'application/json' } };
  try {
    var result = await api('/api/selflearning/publish-hf', request);
    var stderrPart = result.stderr ? '\nSTDERR:\n' + result.stderr : '';
    setSelflearningLog((result.stdout || '') + stderrPart);
    showToast('HF Sync fertig', 'Private TIP_LLM und Blog_LLM Datasets sind publiziert.');
    loadSelflearning();
  } catch (err) {
    var details = err.body ? '\n' + JSON.stringify(err.body, null, 2) : '';
    setSelflearningLog(err.message + details);
    showToast('HF Sync fehlgeschlagen', err.message, true);
  }
}
|
||||||
|
|
||||||
|
// Training buttons: POSTs lane, provider and seed/full mode to
// /api/selflearning/train (200 max steps for seed runs, 2000 otherwise) and
// writes the response or the error into the log.
async function startSelflearningTrain(lane, provider, seedOnly) {
  var mode = seedOnly ? ' / seed' : ' / full';
  var label = lane + ' / ' + provider + mode;
  setSelflearningLog('Training Start: ' + label);

  var payload = {
    lane: lane,
    provider: provider,
    seed_only: seedOnly,
    max_steps: seedOnly ? 200 : 2000
  };
  try {
    var result = await api('/api/selflearning/train', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload)
    });
    setSelflearningLog(JSON.stringify(result, null, 2));
    showToast('Training gestartet', label);
  } catch (err) {
    var details = err.body ? '\n' + JSON.stringify(err.body, null, 2) : '';
    setSelflearningLog(err.message + details);
    showToast('Training fehlgeschlagen', err.message, true);
  }
}
|
||||||
|
|
||||||
// ── CRAWLER INTELLIGENCE ────────────────────────────────────────────
|
// ── CRAWLER INTELLIGENCE ────────────────────────────────────────────
|
||||||
async function loadCrawlerStatus() {
|
async function loadCrawlerStatus() {
|
||||||
loadCrawlerJobs(); // load live job queue in parallel
|
loadCrawlerJobs(); // load live job queue in parallel
|
||||||
|
|||||||
196
scripts/tip-learning-pool-build.ts
Normal file
196
scripts/tip-learning-pool-build.ts
Normal file
@ -0,0 +1,196 @@
|
|||||||
|
import { createHash } from "crypto";
|
||||||
|
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync } from "fs";
|
||||||
|
import { homedir } from "os";
|
||||||
|
import { basename, extname, join, relative } from "path";
|
||||||
|
|
||||||
|
/** The two datasets this script builds. */
type Lane = "tip_llm" | "blog_llm";

/** One chat turn. */
type Message = {
  role: "system" | "user" | "assistant";
  content: string;
};

/** One training example, tagged with the lane and source it came from. */
type Row = {
  id: string;
  lane: Lane;
  source: string;
  kind: string;
  messages: Message[];
};

// The current working directory is taken as the repository root.
const repoRoot = process.cwd();

// External source directories; each can be overridden through its
// environment variable and otherwise defaults to a folder under the home
// directory.
const externalRoot = process.env.TIP_LEARNING_SOURCE_DIR || join(homedir(), "transceiver-training-data");
const blogMegaRoot = process.env.BLOG_LLM_SOURCE_DIR || join(homedir(), "Desktop/BlogLLM-v5-Mega-Training");

// Output directory inside the repository.
const outRoot = join(repoRoot, "training-data/runpod");
|
||||||
|
|
||||||
|
// System prompt per lane. add() places it as the first message of every row
// built for that lane, so the text becomes part of the training data and
// must not be reworded casually.
const system: Record<Lane, string> = {
  // Research / data-preparation lane.
  tip_llm: "You are TIP_LLM, the Transceiver Intelligence Platform research and data-preparation model. Convert messy market, vendor, crawler, forum, RIR, compatibility and optics intelligence into precise crawler plans, extraction schemas, normalized findings, source-quality notes and actionable market intelligence. Prefer structured outputs, cite source constraints, and avoid inventing facts.",
  // Founder-content / technical blog lane.
  blog_llm: "You are Blog_LLM, the specialized Flexoptix/TIP founder-content and technical blog model. Write opinionated, practical, technically credible articles for network engineers and optical infrastructure buyers. Keep the tone human, specific and useful. Avoid generic AI filler, LaTeX in prose, and datasheet dumps.",
};
|
||||||
|
|
||||||
|
// Source file names per lane, in load order. Both lanes start with the same
// three files; the TIP lane ends with the six numbered market-analysis parts.
const files: Record<Lane, string[]> = (() => {
  const shared = [
    "master-training-dataset.jsonl",
    "technical-deep-dives.jsonl",
    "vendor-deep-dives.jsonl",
  ];
  const marketParts = Array.from({ length: 6 }, (_, index) => `market-business-analysis-part${index + 1}.jsonl`);

  return {
    tip_llm: [
      ...shared,
      "rir-infrastructure-data.jsonl",
      "nanog-ripe-labs-content.jsonl",
      "academic-research-synthesis.jsonl",
      "synthesized-training-samples.jsonl",
      ...marketParts,
    ],
    blog_llm: [
      ...shared,
      "v8-real-posts-sft.jsonl",
      "v7-ripe-apnic-sft.jsonl",
      "v8-v6blogs-sft.jsonl",
      "v8-external-sft.jsonl",
      "blog-fichtmueller-posts.jsonl",
      "mega-training-dataset.jsonl",
    ],
  };
})();
|
||||||
|
|
||||||
|
/**
 * Normalizes an arbitrary field value into trimmed training text.
 *
 * - `null`/`undefined` and non-primitive values (objects, arrays, functions) become `""`.
 *   `String()` on those would produce text such as "[object Object]", which is long enough
 *   to slip past the caller's minimum-length filter and end up in a training row.
 * - CRLF and lone CR line endings are converted to LF.
 * - Runs of four or more newlines are collapsed to three.
 *
 * @param v Raw value taken from a parsed dataset record.
 * @returns The normalized text, or an empty string when there is no usable text.
 */
function clean(v: unknown): string {
  // typeof null === "object", so the object check also covers null.
  if (v === undefined || typeof v === "object" || typeof v === "function") return "";
  return String(v)
    .replace(/\r\n?/g, "\n")
    .replace(/\n{4,}/g, "\n\n\n")
    .trim();
}
|
||||||
|
|
||||||
|
/**
 * Derives a short content identifier from a list of text parts.
 *
 * @param parts Text fragments; they are joined with a "\n---\n" separator before hashing.
 * @returns The first 24 hex characters of the SHA-256 digest of the joined text.
 */
function id(parts: string[]): string {
  const hasher = createHash("sha256");
  hasher.update(parts.join("\n---\n"));
  const hex = hasher.digest("hex");
  return hex.substring(0, 24);
}
|
||||||
|
|
||||||
|
/**
 * Builds the provenance label stored on each row for a source file.
 *
 * Resolution order: path relative to the repository root, then "external:"-prefixed path
 * relative to the external root, then "blog-mega:"-prefixed path relative to the blog-mega
 * root, and finally just the file's base name.
 *
 * @param path Location of the source file.
 * @returns The label for that file.
 */
function sourceLabel(path: string): string {
  const repoRel = relative(repoRoot, path);
  if (!repoRel.startsWith("..")) return repoRel;

  // A relative path starting with ".." means the file is not below that root.
  const prefixedRoots: Array<[string, string]> = [
    ["external:", externalRoot],
    ["blog-mega:", blogMegaRoot],
  ];
  for (const [prefix, root] of prefixedRoots) {
    const rel = relative(root, path);
    if (!rel.startsWith("..")) return `${prefix}${rel}`;
  }
  return basename(path);
}
|
||||||
|
|
||||||
|
/**
 * Appends one system/user/assistant training row to `rows` when both texts are long enough.
 *
 * @param rows   Accumulator that receives the new row (mutated in place).
 * @param lane   Lane the row belongs to; also selects the system prompt.
 * @param source Provenance label of the originating file.
 * @param kind   Short tag describing the record format the row came from.
 * @param prompt Raw user-side value; normalized with `clean`.
 * @param answer Raw assistant-side value; normalized with `clean`.
 */
function add(rows: Row[], lane: Lane, source: string, kind: string, prompt: unknown, answer: unknown) {
  const userText = clean(prompt);
  const assistantText = clean(answer);

  // Drop pairs that are too short to be useful: prompts under 20 chars or answers under 40.
  const longEnough = userText.length >= 20 && assistantText.length >= 40;
  if (!longEnough) return;

  rows.push({
    id: id([lane, userText, assistantText]),
    lane,
    source,
    kind,
    messages: [
      { role: "system", content: system[lane] },
      { role: "user", content: userText },
      { role: "assistant", content: assistantText },
    ],
  });
}
|
||||||
|
|
||||||
|
/**
 * Reads a JSONL file and converts every usable line into a training row for `lane`.
 *
 * Two record shapes are understood:
 * - chat records with a `messages` array: the first user message and the last assistant
 *   message form the pair ("chat-jsonl");
 * - flat records: the first truthy prompt-like field and the first truthy answer-like
 *   field form the pair ("sft-jsonl").
 *
 * @param path File to read; a missing file yields an empty result.
 * @param lane Lane the resulting rows are assigned to.
 * @returns Rows accepted by `add`; lines that fail to parse are skipped.
 */
function jsonl(path: string, lane: Lane): Row[] {
  const rows: Row[] = [];
  if (!existsSync(path)) return rows;

  const source = sourceLabel(path);
  const lines = readFileSync(path, "utf8").split(/\n+/).filter(Boolean);

  for (const line of lines) {
    try {
      const obj = JSON.parse(line);

      if (!Array.isArray(obj.messages)) {
        const prompt = obj.prompt || obj.instruction || obj.question || obj.input || obj.title || obj.topic;
        const answer =
          obj.completion || obj.output || obj.answer || obj.response || obj.article || obj.content || obj.text || obj.summary;
        add(rows, lane, source, "sft-jsonl", prompt, answer);
        continue;
      }

      const firstUser = obj.messages.find((m: Message) => m.role === "user")?.content;
      // Reverse a copy so the search starts from the end without mutating the parsed record.
      const lastAssistant = [...obj.messages].reverse().find((m: Message) => m.role === "assistant")?.content;
      add(rows, lane, source, "chat-jsonl", firstUser, lastAssistant);
    } catch {
      // Ignore malformed legacy rows; usable row counts are tracked in the manifest.
    }
  }

  return rows;
}
|
||||||
|
|
||||||
|
/**
 * Turns one Markdown article into a single blog_llm training row.
 *
 * The title comes from the first `# ` heading; when there is none, the file name without
 * its extension is used, with dashes and underscores replaced by spaces.
 *
 * @param path Markdown file to read.
 * @returns A one-element array, or an empty array when the trimmed article has fewer than 400 characters.
 */
function markdownBlog(path: string): Row[] {
  const article = readFileSync(path, "utf8").trim();
  if (article.length < 400) return [];

  const heading = /^#\s+(.+)$/m.exec(article);
  const title = heading?.[1]?.trim() || basename(path, extname(path)).replace(/[-_]/g, " ");
  const prompt = `Write a TIP/Flexoptix technical blog article with this angle: ${title}\n\nUse a practical network-engineer voice, include concrete deployment context, and avoid generic marketing language.`;

  const row: Row = {
    id: id(["blog_llm", title, article]),
    lane: "blog_llm",
    source: sourceLabel(path),
    kind: "markdown-blog",
    messages: [
      { role: "system", content: system.blog_llm },
      { role: "user", content: prompt },
      { role: "assistant", content: article },
    ],
  };
  return [row];
}
|
||||||
|
|
||||||
|
/**
 * Gathers every raw (not yet de-duplicated) row for a lane.
 *
 * Each configured JSONL file name is looked up under the external, blog-mega and repository
 * roots, in that order. For the blog_llm lane, Markdown articles (except README.md) from the
 * two blog directories are appended afterwards.
 *
 * @param lane Lane to collect rows for.
 * @returns All rows found, in discovery order.
 */
function collect(lane: Lane): Row[] {
  const roots = [externalRoot, blogMegaRoot, repoRoot];
  const rows: Row[] = files[lane].flatMap((file) => roots.flatMap((root) => jsonl(join(root, file), lane)));

  if (lane !== "blog_llm") return rows;

  const blogDirs = [join(repoRoot, "blog-training-data"), join(externalRoot, "v6-tip-blogs")];
  for (const dir of blogDirs) {
    if (!existsSync(dir)) continue;
    const articles = readdirSync(dir).filter((name) => name.endsWith(".md") && name.toLowerCase() !== "readme.md");
    for (const name of articles) rows.push(...markdownBlog(join(dir, name)));
  }
  return rows;
}
|
||||||
|
|
||||||
|
/**
 * Removes rows whose lane, user text and assistant text are identical.
 *
 * The first occurrence wins. Every surviving row gets its `id` replaced by the de-duplication
 * key, and the result is sorted by that id.
 *
 * @param rows Raw rows, possibly containing repeats.
 * @returns The unique rows sorted by id, plus the number of rows that were dropped.
 */
function dedupe(rows: Row[]) {
  // Map keeps insertion order, so the first row seen for a key is the one retained.
  const byKey = new Map<string, Row>();
  let duplicates = 0;

  for (const row of rows) {
    const [, userMsg, assistantMsg] = row.messages;
    const key = id([row.lane, userMsg?.content || "", assistantMsg?.content || ""]);
    if (byKey.has(key)) {
      duplicates += 1;
      continue;
    }
    byKey.set(key, { ...row, id: key });
  }

  const unique = [...byKey.values()];
  unique.sort((a, b) => a.id.localeCompare(b.id));
  return { rows: unique, duplicates };
}
|
||||||
|
|
||||||
|
/**
 * Builds and writes the dataset for one lane.
 *
 * Steps: collect the raw rows, de-duplicate them, split them by index into eval and train
 * (every 10th row goes to eval when there are at least 20 rows, otherwise every 5th), write
 * the all/train/eval JSONL files and a per-lane manifest.json into `<outRoot>/<lane>`, and
 * store the lane summary on `manifest.lanes[lane]`.
 *
 * @param lane     Lane to build.
 * @param manifest Combined manifest object; its `lanes[lane]` entry is overwritten.
 */
function writeLane(lane: Lane, manifest: { lanes: Partial<Record<Lane, unknown>> }) {
  const raw = collect(lane);
  const { rows, duplicates } = dedupe(raw);

  // Rows arrive sorted by content hash, so an index-based split is reproducible between runs.
  const evalEvery = rows.length >= 20 ? 10 : 5;
  const evalRows = rows.filter((_r, i) => i % evalEvery === 0);
  const trainRows = rows.filter((_r, i) => i % evalEvery !== 0);

  const laneDir = join(outRoot, lane);
  mkdirSync(laneDir, { recursive: true });

  // Terminate each record with its own newline so an empty split produces an empty file
  // instead of a file containing a single blank line.
  const write = (name: string, data: Row[]) =>
    writeFileSync(
      join(laneDir, name),
      data.map((r) => JSON.stringify({ id: r.id, source: r.source, kind: r.kind, messages: r.messages }) + "\n").join(""),
    );
  write(`${lane}-sft-all.jsonl`, rows);
  write(`${lane}-sft-train.jsonl`, trainRows);
  write(`${lane}-sft-eval.jsonl`, evalRows);

  // Count rows per source in one pass; keys keep first-seen order.
  const sources: Record<string, number> = {};
  for (const r of rows) sources[r.source] = (sources[r.source] ?? 0) + 1;

  const laneManifest = {
    raw_pairs: raw.length,
    duplicates_removed: duplicates,
    training_pairs: rows.length,
    train_pairs: trainRows.length,
    eval_pairs: evalRows.length,
    sources,
    files: {
      train: `training-data/runpod/${lane}/${lane}-sft-train.jsonl`,
      eval: `training-data/runpod/${lane}/${lane}-sft-eval.jsonl`,
      all: `training-data/runpod/${lane}/${lane}-sft-all.jsonl`,
      manifest: `training-data/runpod/${lane}/manifest.json`,
    },
  };
  manifest.lanes[lane] = laneManifest;
  writeFileSync(join(laneDir, "manifest.json"), JSON.stringify(laneManifest, null, 2) + "\n");
}
|
||||||
|
|
||||||
|
// Script entry: build both lanes, then write the combined manifest and echo it to stdout.
const manifest = {
  generated_at: new Date().toISOString(),
  version: "TIP-LearningPool-v1",
  lanes: {} as Record<Lane, unknown>,
};

for (const lane of ["tip_llm", "blog_llm"] as const) writeLane(lane, manifest);

mkdirSync(outRoot, { recursive: true });
const manifestJson = JSON.stringify(manifest, null, 2);
writeFileSync(join(outRoot, "manifest.json"), manifestJson + "\n");
console.log(manifestJson);
|
||||||
71
scripts/tip-publish-hf-datasets.py
Normal file
71
scripts/tip-publish-hf-datasets.py
Normal file
@ -0,0 +1,71 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Publish private TIP selflearning datasets to Hugging Face."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from huggingface_hub import HfApi
|
||||||
|
|
||||||
|
# Repository root: two levels up from this file's resolved path.
ROOT = Path(__file__).resolve().parents[1]
# Directory holding the combined manifest and the per-lane dataset folders.
RUNPOD_DIR = ROOT.joinpath("training-data", "runpod")
# Lane name -> Hugging Face dataset repo id; each default can be overridden by its environment variable.
LANES = dict(
    tip_llm=os.environ.get("TIP_HF_DATASET_TIP_LLM", "renefichtmueller/tip-llm-sft"),
    blog_llm=os.environ.get("TIP_HF_DATASET_BLOG_LLM", "renefichtmueller/blog-llm-sft"),
)
|
||||||
|
|
||||||
|
|
||||||
|
def keychain(service: str) -> str | None:
    """Look up a generic-password secret by service name via the `security` CLI.

    Args:
        service: Service name passed to `security find-generic-password -s`.

    Returns:
        The secret with surrounding whitespace stripped, or None when the command
        cannot be run, exits with an error, or prints nothing.
    """
    command = ["security", "find-generic-password", "-s", service, "-w"]
    try:
        output = subprocess.check_output(command, stderr=subprocess.DEVNULL, text=True)
    except Exception:
        # Best-effort lookup: a missing binary or missing item simply means "no secret here".
        return None
    secret = output.strip()
    return secret if secret else None
|
||||||
|
|
||||||
|
|
||||||
|
def hf_token() -> str:
    """Resolve the Hugging Face access token.

    Lookup order: the HF_TOKEN and HUGGINGFACE_TOKEN environment variables, then the
    `magatama.huggingface.token` and `tip.huggingface.token` keychain services. Empty
    values are skipped.

    Returns:
        The first non-empty token found.

    Raises:
        SystemExit: If no source provides a token.
    """
    for env_name in ("HF_TOKEN", "HUGGINGFACE_TOKEN"):
        value = os.getenv(env_name)
        if value:
            return value
    for service in ("magatama.huggingface.token", "tip.huggingface.token"):
        value = keychain(service)
        if value:
            return value
    raise SystemExit("No Hugging Face token found.")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Publish each lane's generated dataset files to its private Hugging Face dataset repo.

    For every entry in LANES this creates the dataset repo if needed (private), uploads the
    train/eval/all JSONL files and the lane manifest, uploads a generated README.md, and
    finally prints a JSON summary of what was published.

    All local inputs are checked before the first Hub call, so a missing manifest, lane entry
    or dataset file aborts the run without leaving any dataset partially updated.

    Raises:
        SystemExit: If no token is available, or if the manifest, a lane entry in it, or any
            lane file is missing.
    """
    api = HfApi(token=hf_token())

    manifest_path = RUNPOD_DIR / "manifest.json"
    if not manifest_path.is_file():
        raise SystemExit(f"Missing {manifest_path}; run `npm run learning-pool:build` first.")
    # The manifest is written as UTF-8 JSON; do not depend on the locale's default encoding.
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))

    # Validate every lane up front, before anything is created or uploaded.
    uploads = {}
    for lane in LANES:
        if lane not in manifest.get("lanes", {}):
            raise SystemExit(f"Lane {lane!r} is missing from {manifest_path}; rebuild the learning pool.")
        names = (f"{lane}-sft-train.jsonl", f"{lane}-sft-eval.jsonl", f"{lane}-sft-all.jsonl", "manifest.json")
        missing = [name for name in names if not (RUNPOD_DIR / lane / name).is_file()]
        if missing:
            raise SystemExit(f"Lane {lane!r} is missing files in {RUNPOD_DIR / lane}: {', '.join(missing)}")
        uploads[lane] = names

    published = {}
    for lane, repo_id in LANES.items():
        lane_dir = RUNPOD_DIR / lane
        lane_manifest = manifest["lanes"][lane]
        api.create_repo(repo_id, repo_type="dataset", private=True, exist_ok=True)
        for name in uploads[lane]:
            api.upload_file(
                repo_id=repo_id,
                repo_type="dataset",
                path_or_fileobj=str(lane_dir / name),
                path_in_repo=name,
                commit_message=f"Update {lane} selflearning dataset",
            )
        card = (
            f"# {repo_id}\n\n"
            "Private TIP selflearning dataset generated from the Gitea/local learning pool.\n\n"
            f"- Lane: `{lane}`\n"
            f"- Version: `{manifest['version']}`\n"
            f"- Generated: `{manifest['generated_at']}`\n"
            f"- Training pairs after dedupe: `{lane_manifest['training_pairs']}`\n"
            f"- Train/Eval split: `{lane_manifest['train_pairs']}` / `{lane_manifest['eval_pairs']}`\n"
            f"- Duplicates removed: `{lane_manifest['duplicates_removed']}`\n"
        )
        api.upload_file(
            repo_id=repo_id,
            repo_type="dataset",
            path_or_fileobj=card.encode(),
            path_in_repo="README.md",
            commit_message=f"Document {lane} selflearning dataset",
        )
        published[lane] = {
            "repo_id": repo_id,
            "training_pairs": lane_manifest["training_pairs"],
            "train_pairs": lane_manifest["train_pairs"],
            "eval_pairs": lane_manifest["eval_pairs"],
        }
    print(json.dumps({"success": True, "published": published}, indent=2))


if __name__ == "__main__":
    main()
|
||||||
11408
training-data/runpod/blog_llm/blog_llm-sft-all.jsonl
Normal file
11408
training-data/runpod/blog_llm/blog_llm-sft-all.jsonl
Normal file
File diff suppressed because one or more lines are too long
1141
training-data/runpod/blog_llm/blog_llm-sft-eval.jsonl
Normal file
1141
training-data/runpod/blog_llm/blog_llm-sft-eval.jsonl
Normal file
File diff suppressed because one or more lines are too long
10267
training-data/runpod/blog_llm/blog_llm-sft-train.jsonl
Normal file
10267
training-data/runpod/blog_llm/blog_llm-sft-train.jsonl
Normal file
File diff suppressed because one or more lines are too long
118
training-data/runpod/blog_llm/manifest.json
Normal file
118
training-data/runpod/blog_llm/manifest.json
Normal file
@ -0,0 +1,118 @@
|
|||||||
|
{
|
||||||
|
"raw_pairs": 11508,
|
||||||
|
"duplicates_removed": 100,
|
||||||
|
"training_pairs": 11408,
|
||||||
|
"train_pairs": 10267,
|
||||||
|
"eval_pairs": 1141,
|
||||||
|
"sources": {
|
||||||
|
"external:vendor-deep-dives.jsonl": 11200,
|
||||||
|
"external:technical-deep-dives.jsonl": 84,
|
||||||
|
"blog-training-data/blog-025-sfp28-lab-vs-rack.md": 1,
|
||||||
|
"blog-training-data/blog-091-wavelength-selective-switch-wss-explainer.md": 1,
|
||||||
|
"blog-training-data/blog-008-oem-vs-compatible-real-numbers.md": 1,
|
||||||
|
"blog-training-data/blog-014-800g-new-products-what-ships.md": 1,
|
||||||
|
"blog-training-data/blog-045-osnr-link-budget-practical-guide.md": 1,
|
||||||
|
"blog-training-data/blog-024-rx-power-budgets-400g.md": 1,
|
||||||
|
"blog-training-data/blog-017-dom-readings-lie.md": 1,
|
||||||
|
"blog-training-data/blog-010-qsfp-dd-vs-osfp-form-factor-reality.md": 1,
|
||||||
|
"blog-training-data/blog-072-optical-amplifier-edfa-raman-basics.md": 1,
|
||||||
|
"blog-training-data/blog-028-400g-dac-3m-vs-5m.md": 1,
|
||||||
|
"blog-training-data/blog-011-transceiver-procurement-checklist.md": 1,
|
||||||
|
"external:blog-fichtmueller-posts.jsonl": 24,
|
||||||
|
"blog-training-data/blog-083-fiber-optic-testing-otdr-basics.md": 1,
|
||||||
|
"blog-training-data/blog-038-cpo-pluggable-future.md": 1,
|
||||||
|
"blog-training-data/blog-054-multimode-fiber-om3-om4-om5-guide.md": 1,
|
||||||
|
"blog-training-data/blog-015-compatible-vendor-comparison-who-to-trust.md": 1,
|
||||||
|
"blog-training-data/blog-063-100g-zr-coherent-pluggable-timing.md": 1,
|
||||||
|
"blog-training-data/blog-069-optical-budget-calculator-guide.md": 1,
|
||||||
|
"blog-training-data/blog-070-mtp-mpo-cassette-fiber-management.md": 1,
|
||||||
|
"blog-training-data/blog-092-sfp-sfp-plus-backward-compatibility.md": 1,
|
||||||
|
"blog-training-data/blog-086-hyperscale-optics-purchasing-strategy.md": 1,
|
||||||
|
"blog-training-data/blog-055-transceiver-lifecycle-management-enterprise.md": 1,
|
||||||
|
"blog-training-data/blog-066-400g-zr-interoperability-matrix.md": 1,
|
||||||
|
"blog-training-data/blog-093-google-meta-microsoft-optics-strategy.md": 1,
|
||||||
|
"blog-training-data/blog-019-cleaning-fiber-400g-tolerance.md": 1,
|
||||||
|
"blog-training-data/blog-026-400g-zr-vs-zrplus.md": 1,
|
||||||
|
"blog-training-data/blog-035-esd-damage-transceivers.md": 1,
|
||||||
|
"blog-training-data/blog-087-rj45-vs-sfp-copper-1g-switches.md": 1,
|
||||||
|
"blog-training-data/blog-009-100g-to-400g-migration-what-breaks.md": 1,
|
||||||
|
"blog-training-data/blog-034-grey-optics-vs-dwdm-metro-aggregation.md": 1,
|
||||||
|
"blog-training-data/blog-082-coherent-dsp-power-consumption.md": 1,
|
||||||
|
"blog-training-data/blog-062-transceiver-inventory-management-excel-vs-cmdb.md": 1,
|
||||||
|
"blog-training-data/blog-088-transceiver-sff-committee-history.md": 1,
|
||||||
|
"blog-training-data/blog-098-carrier-ethernet-timing-syncE-ptp-optics.md": 1,
|
||||||
|
"blog-training-data/blog-003-silicon-photonics.md": 1,
|
||||||
|
"blog-training-data/blog-037-fec-deep-dive.md": 1,
|
||||||
|
"blog-training-data/blog-099-transceiver-market-2026-pricing-forecast.md": 1,
|
||||||
|
"blog-training-data/blog-021-validating-compatible-optics.md": 1,
|
||||||
|
"blog-training-data/blog-023-pam4-800g-fec-errors.md": 1,
|
||||||
|
"blog-training-data/blog-030-when-to-upgrade-from-10g.md": 1,
|
||||||
|
"blog-training-data/blog-002-vendor-lock-in-optics.md": 1,
|
||||||
|
"blog-training-data/blog-081-transceiver-rma-process-best-practices.md": 1,
|
||||||
|
"blog-training-data/blog-013-price-drop-timing-when-to-buy.md": 1,
|
||||||
|
"blog-training-data/blog-095-optical-lan-versus-fiber-ethernet.md": 1,
|
||||||
|
"blog-training-data/blog-067-single-mode-fiber-types-g652-g657.md": 1,
|
||||||
|
"blog-training-data/blog-039-cmis-400g-management.md": 1,
|
||||||
|
"blog-training-data/blog-071-sff-8024-transceiver-id-codes.md": 1,
|
||||||
|
"blog-training-data/blog-097-liquid-cooling-impact-optical-transceivers.md": 1,
|
||||||
|
"blog-training-data/blog-007-800g-readiness.md": 1,
|
||||||
|
"blog-training-data/blog-058-arista-eos-optic-compatibility.md": 1,
|
||||||
|
"blog-training-data/blog-068-25g-vs-10g-upgrade-path-decision.md": 1,
|
||||||
|
"blog-training-data/blog-061-cfp2-cfp4-qsfp28-form-factor-migration.md": 1,
|
||||||
|
"blog-training-data/blog-079-ip-optical-integration-disaggregation.md": 1,
|
||||||
|
"blog-training-data/blog-046-transceiver-counterfeit-detection.md": 1,
|
||||||
|
"blog-training-data/blog-056-cisco-qsfp28-compatibility-list.md": 1,
|
||||||
|
"blog-training-data/blog-005-coherent-400zr-reality.md": 1,
|
||||||
|
"blog-training-data/blog-065-dwdm-channel-plan-100ghz-vs-50ghz.md": 1,
|
||||||
|
"blog-training-data/blog-078-pon-gpon-xgspon-optics-explainer.md": 1,
|
||||||
|
"blog-training-data/blog-051-spine-leaf-transceiver-strategy.md": 1,
|
||||||
|
"blog-training-data/blog-032-msa-compliance-vs-interoperability.md": 1,
|
||||||
|
"blog-training-data/blog-064-optic-burn-in-testing.md": 1,
|
||||||
|
"blog-training-data/blog-001-400g-dr4-price-war.md": 1,
|
||||||
|
"blog-training-data/blog-040-evaluating-compatible-vendor.md": 1,
|
||||||
|
"blog-training-data/blog-042-800g-osfp-vs-qsfp-dd-port-density.md": 1,
|
||||||
|
"blog-training-data/blog-100-flexoptix-programming-service-technical.md": 1,
|
||||||
|
"blog-training-data/blog-076-cisco-nexus-vs-catalyst-optic-behavior.md": 1,
|
||||||
|
"blog-training-data/blog-053-cisco-juniper-arista-optic-lock-in.md": 1,
|
||||||
|
"blog-training-data/blog-044-laser-safety-class-1m-transceivers.md": 1,
|
||||||
|
"blog-training-data/blog-094-transceiver-programming-eeprom-guide.md": 1,
|
||||||
|
"blog-training-data/blog-085-ai-inference-cluster-optics-requirements.md": 1,
|
||||||
|
"blog-training-data/blog-052-roa-replacing-optics-proactively.md": 1,
|
||||||
|
"blog-training-data/blog-090-optics-for-5g-fronthaul-midhaul.md": 1,
|
||||||
|
"blog-training-data/blog-041-silicon-photonics-co-packaging-2026.md": 1,
|
||||||
|
"blog-training-data/blog-096-dark-fiber-leasing-optics-considerations.md": 1,
|
||||||
|
"blog-training-data/blog-084-ieee-802.3-standards-transceiver-reference.md": 1,
|
||||||
|
"blog-training-data/blog-012-coherent-vs-direct-detect-decision.md": 1,
|
||||||
|
"blog-training-data/blog-004-400g-migration-fiber-plant.md": 1,
|
||||||
|
"blog-training-data/blog-060-fiber-connector-cleaning-protocol.md": 1,
|
||||||
|
"blog-training-data/blog-027-fiber-plant-audit-100g-upgrade.md": 1,
|
||||||
|
"blog-training-data/blog-016-400g-qsfp-dd-after-fiber-moves.md": 1,
|
||||||
|
"blog-training-data/blog-074-fiber-optic-patch-cord-standards.md": 1,
|
||||||
|
"blog-training-data/blog-057-juniper-optic-unlock-ex-qfx.md": 1,
|
||||||
|
"blog-training-data/blog-022-oem-vs-compatible-lab-tests.md": 1,
|
||||||
|
"blog-training-data/blog-020-100g-link-drops-temperature.md": 1,
|
||||||
|
"blog-training-data/blog-050-optical-transceiver-temperature-grades.md": 1,
|
||||||
|
"blog-training-data/blog-036-coherent-tunable-vs-fixed-wavelength.md": 1,
|
||||||
|
"blog-training-data/blog-077-pam4-vs-nrz-modulation-transceivers.md": 1,
|
||||||
|
"blog-training-data/blog-080-fcoe-fibre-channel-sfp-differences.md": 1,
|
||||||
|
"blog-training-data/blog-043-zr-zr-plus-coherent-pluggables-comparison.md": 1,
|
||||||
|
"blog-training-data/blog-049-wavelength-division-multiplexing-primer.md": 1,
|
||||||
|
"blog-training-data/blog-089-metro-dwdm-open-vs-proprietary.md": 1,
|
||||||
|
"blog-training-data/blog-073-qsfp-dd-800g-ecosystem-2026.md": 1,
|
||||||
|
"blog-training-data/blog-018-800g-sr8-dr8-fr8-comparison.md": 1,
|
||||||
|
"blog-training-data/blog-029-800g-osfp-spineleaf-checklist.md": 1,
|
||||||
|
"blog-training-data/blog-006-dom-diagnostics.md": 1,
|
||||||
|
"blog-training-data/blog-075-transceiver-failure-root-cause-analysis.md": 1,
|
||||||
|
"blog-training-data/blog-048-400g-dr4-fr4-lr4-comparison.md": 1,
|
||||||
|
"blog-training-data/blog-031-cwdm4-vs-psm4-100g-datacenter.md": 1,
|
||||||
|
"blog-training-data/blog-059-100g-sr4-multimode-distance-limits.md": 1,
|
||||||
|
"blog-training-data/blog-047-dom-digital-optical-monitoring-guide.md": 1,
|
||||||
|
"blog-training-data/blog-033-25g-dac-aoc-optical-tco.md": 1
|
||||||
|
},
|
||||||
|
"files": {
|
||||||
|
"train": "training-data/runpod/blog_llm/blog_llm-sft-train.jsonl",
|
||||||
|
"eval": "training-data/runpod/blog_llm/blog_llm-sft-eval.jsonl",
|
||||||
|
"all": "training-data/runpod/blog_llm/blog_llm-sft-all.jsonl",
|
||||||
|
"manifest": "training-data/runpod/blog_llm/manifest.json"
|
||||||
|
}
|
||||||
|
}
|
||||||
151
training-data/runpod/manifest.json
Normal file
151
training-data/runpod/manifest.json
Normal file
@ -0,0 +1,151 @@
|
|||||||
|
{
|
||||||
|
"generated_at": "2026-04-25T10:20:52.535Z",
|
||||||
|
"version": "TIP-LearningPool-v1",
|
||||||
|
"lanes": {
|
||||||
|
"tip_llm": {
|
||||||
|
"raw_pairs": 12107,
|
||||||
|
"duplicates_removed": 269,
|
||||||
|
"training_pairs": 11838,
|
||||||
|
"train_pairs": 10654,
|
||||||
|
"eval_pairs": 1184,
|
||||||
|
"sources": {
|
||||||
|
"external:vendor-deep-dives.jsonl": 11200,
|
||||||
|
"external:technical-deep-dives.jsonl": 84,
|
||||||
|
"external:rir-infrastructure-data.jsonl": 150,
|
||||||
|
"external:market-business-analysis-part1.jsonl": 10,
|
||||||
|
"external:synthesized-training-samples.jsonl": 219,
|
||||||
|
"external:nanog-ripe-labs-content.jsonl": 34,
|
||||||
|
"external:academic-research-synthesis.jsonl": 109,
|
||||||
|
"external:market-business-analysis-part6.jsonl": 5,
|
||||||
|
"external:market-business-analysis-part5.jsonl": 7,
|
||||||
|
"external:market-business-analysis-part4.jsonl": 5,
|
||||||
|
"external:market-business-analysis-part2.jsonl": 8,
|
||||||
|
"external:market-business-analysis-part3.jsonl": 7
|
||||||
|
},
|
||||||
|
"files": {
|
||||||
|
"train": "training-data/runpod/tip_llm/tip_llm-sft-train.jsonl",
|
||||||
|
"eval": "training-data/runpod/tip_llm/tip_llm-sft-eval.jsonl",
|
||||||
|
"all": "training-data/runpod/tip_llm/tip_llm-sft-all.jsonl",
|
||||||
|
"manifest": "training-data/runpod/tip_llm/manifest.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"blog_llm": {
|
||||||
|
"raw_pairs": 11508,
|
||||||
|
"duplicates_removed": 100,
|
||||||
|
"training_pairs": 11408,
|
||||||
|
"train_pairs": 10267,
|
||||||
|
"eval_pairs": 1141,
|
||||||
|
"sources": {
|
||||||
|
"external:vendor-deep-dives.jsonl": 11200,
|
||||||
|
"external:technical-deep-dives.jsonl": 84,
|
||||||
|
"blog-training-data/blog-025-sfp28-lab-vs-rack.md": 1,
|
||||||
|
"blog-training-data/blog-091-wavelength-selective-switch-wss-explainer.md": 1,
|
||||||
|
"blog-training-data/blog-008-oem-vs-compatible-real-numbers.md": 1,
|
||||||
|
"blog-training-data/blog-014-800g-new-products-what-ships.md": 1,
|
||||||
|
"blog-training-data/blog-045-osnr-link-budget-practical-guide.md": 1,
|
||||||
|
"blog-training-data/blog-024-rx-power-budgets-400g.md": 1,
|
||||||
|
"blog-training-data/blog-017-dom-readings-lie.md": 1,
|
||||||
|
"blog-training-data/blog-010-qsfp-dd-vs-osfp-form-factor-reality.md": 1,
|
||||||
|
"blog-training-data/blog-072-optical-amplifier-edfa-raman-basics.md": 1,
|
||||||
|
"blog-training-data/blog-028-400g-dac-3m-vs-5m.md": 1,
|
||||||
|
"blog-training-data/blog-011-transceiver-procurement-checklist.md": 1,
|
||||||
|
"external:blog-fichtmueller-posts.jsonl": 24,
|
||||||
|
"blog-training-data/blog-083-fiber-optic-testing-otdr-basics.md": 1,
|
||||||
|
"blog-training-data/blog-038-cpo-pluggable-future.md": 1,
|
||||||
|
"blog-training-data/blog-054-multimode-fiber-om3-om4-om5-guide.md": 1,
|
||||||
|
"blog-training-data/blog-015-compatible-vendor-comparison-who-to-trust.md": 1,
|
||||||
|
"blog-training-data/blog-063-100g-zr-coherent-pluggable-timing.md": 1,
|
||||||
|
"blog-training-data/blog-069-optical-budget-calculator-guide.md": 1,
|
||||||
|
"blog-training-data/blog-070-mtp-mpo-cassette-fiber-management.md": 1,
|
||||||
|
"blog-training-data/blog-092-sfp-sfp-plus-backward-compatibility.md": 1,
|
||||||
|
"blog-training-data/blog-086-hyperscale-optics-purchasing-strategy.md": 1,
|
||||||
|
"blog-training-data/blog-055-transceiver-lifecycle-management-enterprise.md": 1,
|
||||||
|
"blog-training-data/blog-066-400g-zr-interoperability-matrix.md": 1,
|
||||||
|
"blog-training-data/blog-093-google-meta-microsoft-optics-strategy.md": 1,
|
||||||
|
"blog-training-data/blog-019-cleaning-fiber-400g-tolerance.md": 1,
|
||||||
|
"blog-training-data/blog-026-400g-zr-vs-zrplus.md": 1,
|
||||||
|
"blog-training-data/blog-035-esd-damage-transceivers.md": 1,
|
||||||
|
"blog-training-data/blog-087-rj45-vs-sfp-copper-1g-switches.md": 1,
|
||||||
|
"blog-training-data/blog-009-100g-to-400g-migration-what-breaks.md": 1,
|
||||||
|
"blog-training-data/blog-034-grey-optics-vs-dwdm-metro-aggregation.md": 1,
|
||||||
|
"blog-training-data/blog-082-coherent-dsp-power-consumption.md": 1,
|
||||||
|
"blog-training-data/blog-062-transceiver-inventory-management-excel-vs-cmdb.md": 1,
|
||||||
|
"blog-training-data/blog-088-transceiver-sff-committee-history.md": 1,
|
||||||
|
"blog-training-data/blog-098-carrier-ethernet-timing-syncE-ptp-optics.md": 1,
|
||||||
|
"blog-training-data/blog-003-silicon-photonics.md": 1,
|
||||||
|
"blog-training-data/blog-037-fec-deep-dive.md": 1,
|
||||||
|
"blog-training-data/blog-099-transceiver-market-2026-pricing-forecast.md": 1,
|
||||||
|
"blog-training-data/blog-021-validating-compatible-optics.md": 1,
|
||||||
|
"blog-training-data/blog-023-pam4-800g-fec-errors.md": 1,
|
||||||
|
"blog-training-data/blog-030-when-to-upgrade-from-10g.md": 1,
|
||||||
|
"blog-training-data/blog-002-vendor-lock-in-optics.md": 1,
|
||||||
|
"blog-training-data/blog-081-transceiver-rma-process-best-practices.md": 1,
|
||||||
|
"blog-training-data/blog-013-price-drop-timing-when-to-buy.md": 1,
|
||||||
|
"blog-training-data/blog-095-optical-lan-versus-fiber-ethernet.md": 1,
|
||||||
|
"blog-training-data/blog-067-single-mode-fiber-types-g652-g657.md": 1,
|
||||||
|
"blog-training-data/blog-039-cmis-400g-management.md": 1,
|
||||||
|
"blog-training-data/blog-071-sff-8024-transceiver-id-codes.md": 1,
|
||||||
|
"blog-training-data/blog-097-liquid-cooling-impact-optical-transceivers.md": 1,
|
||||||
|
"blog-training-data/blog-007-800g-readiness.md": 1,
|
||||||
|
"blog-training-data/blog-058-arista-eos-optic-compatibility.md": 1,
|
||||||
|
"blog-training-data/blog-068-25g-vs-10g-upgrade-path-decision.md": 1,
|
||||||
|
"blog-training-data/blog-061-cfp2-cfp4-qsfp28-form-factor-migration.md": 1,
|
||||||
|
"blog-training-data/blog-079-ip-optical-integration-disaggregation.md": 1,
|
||||||
|
"blog-training-data/blog-046-transceiver-counterfeit-detection.md": 1,
|
||||||
|
"blog-training-data/blog-056-cisco-qsfp28-compatibility-list.md": 1,
|
||||||
|
"blog-training-data/blog-005-coherent-400zr-reality.md": 1,
|
||||||
|
"blog-training-data/blog-065-dwdm-channel-plan-100ghz-vs-50ghz.md": 1,
|
||||||
|
"blog-training-data/blog-078-pon-gpon-xgspon-optics-explainer.md": 1,
|
||||||
|
"blog-training-data/blog-051-spine-leaf-transceiver-strategy.md": 1,
|
||||||
|
"blog-training-data/blog-032-msa-compliance-vs-interoperability.md": 1,
|
||||||
|
"blog-training-data/blog-064-optic-burn-in-testing.md": 1,
|
||||||
|
"blog-training-data/blog-001-400g-dr4-price-war.md": 1,
|
||||||
|
"blog-training-data/blog-040-evaluating-compatible-vendor.md": 1,
|
||||||
|
"blog-training-data/blog-042-800g-osfp-vs-qsfp-dd-port-density.md": 1,
|
||||||
|
"blog-training-data/blog-100-flexoptix-programming-service-technical.md": 1,
|
||||||
|
"blog-training-data/blog-076-cisco-nexus-vs-catalyst-optic-behavior.md": 1,
|
||||||
|
"blog-training-data/blog-053-cisco-juniper-arista-optic-lock-in.md": 1,
|
||||||
|
"blog-training-data/blog-044-laser-safety-class-1m-transceivers.md": 1,
|
||||||
|
"blog-training-data/blog-094-transceiver-programming-eeprom-guide.md": 1,
|
||||||
|
"blog-training-data/blog-085-ai-inference-cluster-optics-requirements.md": 1,
|
||||||
|
"blog-training-data/blog-052-roa-replacing-optics-proactively.md": 1,
|
||||||
|
"blog-training-data/blog-090-optics-for-5g-fronthaul-midhaul.md": 1,
|
||||||
|
"blog-training-data/blog-041-silicon-photonics-co-packaging-2026.md": 1,
|
||||||
|
"blog-training-data/blog-096-dark-fiber-leasing-optics-considerations.md": 1,
|
||||||
|
"blog-training-data/blog-084-ieee-802.3-standards-transceiver-reference.md": 1,
|
||||||
|
"blog-training-data/blog-012-coherent-vs-direct-detect-decision.md": 1,
|
||||||
|
"blog-training-data/blog-004-400g-migration-fiber-plant.md": 1,
|
||||||
|
"blog-training-data/blog-060-fiber-connector-cleaning-protocol.md": 1,
|
||||||
|
"blog-training-data/blog-027-fiber-plant-audit-100g-upgrade.md": 1,
|
||||||
|
"blog-training-data/blog-016-400g-qsfp-dd-after-fiber-moves.md": 1,
|
||||||
|
"blog-training-data/blog-074-fiber-optic-patch-cord-standards.md": 1,
|
||||||
|
"blog-training-data/blog-057-juniper-optic-unlock-ex-qfx.md": 1,
|
||||||
|
"blog-training-data/blog-022-oem-vs-compatible-lab-tests.md": 1,
|
||||||
|
"blog-training-data/blog-020-100g-link-drops-temperature.md": 1,
|
||||||
|
"blog-training-data/blog-050-optical-transceiver-temperature-grades.md": 1,
|
||||||
|
"blog-training-data/blog-036-coherent-tunable-vs-fixed-wavelength.md": 1,
|
||||||
|
"blog-training-data/blog-077-pam4-vs-nrz-modulation-transceivers.md": 1,
|
||||||
|
"blog-training-data/blog-080-fcoe-fibre-channel-sfp-differences.md": 1,
|
||||||
|
"blog-training-data/blog-043-zr-zr-plus-coherent-pluggables-comparison.md": 1,
|
||||||
|
"blog-training-data/blog-049-wavelength-division-multiplexing-primer.md": 1,
|
||||||
|
"blog-training-data/blog-089-metro-dwdm-open-vs-proprietary.md": 1,
|
||||||
|
"blog-training-data/blog-073-qsfp-dd-800g-ecosystem-2026.md": 1,
|
||||||
|
"blog-training-data/blog-018-800g-sr8-dr8-fr8-comparison.md": 1,
|
||||||
|
"blog-training-data/blog-029-800g-osfp-spineleaf-checklist.md": 1,
|
||||||
|
"blog-training-data/blog-006-dom-diagnostics.md": 1,
|
||||||
|
"blog-training-data/blog-075-transceiver-failure-root-cause-analysis.md": 1,
|
||||||
|
"blog-training-data/blog-048-400g-dr4-fr4-lr4-comparison.md": 1,
|
||||||
|
"blog-training-data/blog-031-cwdm4-vs-psm4-100g-datacenter.md": 1,
|
||||||
|
"blog-training-data/blog-059-100g-sr4-multimode-distance-limits.md": 1,
|
||||||
|
"blog-training-data/blog-047-dom-digital-optical-monitoring-guide.md": 1,
|
||||||
|
"blog-training-data/blog-033-25g-dac-aoc-optical-tco.md": 1
|
||||||
|
},
|
||||||
|
"files": {
|
||||||
|
"train": "training-data/runpod/blog_llm/blog_llm-sft-train.jsonl",
|
||||||
|
"eval": "training-data/runpod/blog_llm/blog_llm-sft-eval.jsonl",
|
||||||
|
"all": "training-data/runpod/blog_llm/blog_llm-sft-all.jsonl",
|
||||||
|
"manifest": "training-data/runpod/blog_llm/manifest.json"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
27
training-data/runpod/tip_llm/manifest.json
Normal file
27
training-data/runpod/tip_llm/manifest.json
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
{
|
||||||
|
"raw_pairs": 12107,
|
||||||
|
"duplicates_removed": 269,
|
||||||
|
"training_pairs": 11838,
|
||||||
|
"train_pairs": 10654,
|
||||||
|
"eval_pairs": 1184,
|
||||||
|
"sources": {
|
||||||
|
"external:vendor-deep-dives.jsonl": 11200,
|
||||||
|
"external:technical-deep-dives.jsonl": 84,
|
||||||
|
"external:rir-infrastructure-data.jsonl": 150,
|
||||||
|
"external:market-business-analysis-part1.jsonl": 10,
|
||||||
|
"external:synthesized-training-samples.jsonl": 219,
|
||||||
|
"external:nanog-ripe-labs-content.jsonl": 34,
|
||||||
|
"external:academic-research-synthesis.jsonl": 109,
|
||||||
|
"external:market-business-analysis-part6.jsonl": 5,
|
||||||
|
"external:market-business-analysis-part5.jsonl": 7,
|
||||||
|
"external:market-business-analysis-part4.jsonl": 5,
|
||||||
|
"external:market-business-analysis-part2.jsonl": 8,
|
||||||
|
"external:market-business-analysis-part3.jsonl": 7
|
||||||
|
},
|
||||||
|
"files": {
|
||||||
|
"train": "training-data/runpod/tip_llm/tip_llm-sft-train.jsonl",
|
||||||
|
"eval": "training-data/runpod/tip_llm/tip_llm-sft-eval.jsonl",
|
||||||
|
"all": "training-data/runpod/tip_llm/tip_llm-sft-all.jsonl",
|
||||||
|
"manifest": "training-data/runpod/tip_llm/manifest.json"
|
||||||
|
}
|
||||||
|
}
|
||||||
11838
training-data/runpod/tip_llm/tip_llm-sft-all.jsonl
Normal file
11838
training-data/runpod/tip_llm/tip_llm-sft-all.jsonl
Normal file
File diff suppressed because one or more lines are too long
1184
training-data/runpod/tip_llm/tip_llm-sft-eval.jsonl
Normal file
1184
training-data/runpod/tip_llm/tip_llm-sft-eval.jsonl
Normal file
File diff suppressed because one or more lines are too long
10654
training-data/runpod/tip_llm/tip_llm-sft-train.jsonl
Normal file
10654
training-data/runpod/tip_llm/tip_llm-sft-train.jsonl
Normal file
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user