chore: sync API routes, dashboard hot-topics, MCP server, scraper package, scripts

This commit is contained in:
Rene Fichtmueller 2026-05-06 23:39:04 +02:00
parent a8529d166b
commit a1a525b332
7 changed files with 224 additions and 24 deletions

View File

@ -31,3 +31,28 @@ Default private Hugging Face datasets:
- `renefichtmueller/blog-llm-sft` - `renefichtmueller/blog-llm-sft`
Local training is enabled by setting `TIP_LOCAL_TRAIN_COMMAND`; the API appends the lane name automatically. Local training is enabled by setting `TIP_LOCAL_TRAIN_COMMAND`; the API appends the lane name automatically.
## TIPLLM Robot Experience Pool
Crawler and verification robots must use TIPLLM only for planning/extraction feedback. Operational experience is written to the Gitea-backed TIP training pool:
- Default local clone: `/tmp/tip-training-data`
- Override: `TIP_TRAINING_REPO=/path/to/tip-training-data`
- Gitea repo: `rene/tip-training-data`
- SFT records: `qa-pairs/robot-control-high.jsonl`
- Raw audit records: `robot-experiences/YYYY-MM-DD.jsonl`
Useful commands:
```bash
npm run robots:verification -w packages/scraper -- --status
npm run robots:verification -w packages/scraper -- --tipllm-plan --limit=5
npm run robots:verification -w packages/scraper -- --enqueue=details-fast-lane --profile=erik-safe --dry-run
```
Safety defaults:
- `erik-safe` is the default profile and is capped at 3 lightweight queues.
- Playwright/discovery work belongs on Proxmox or Pi workers, not Erik.
- Every status snapshot, TIPLLM plan, dry-run plan, enqueue result and crawler result should become a TIPLLM training example.
- `learning-pool:build` automatically imports Gitea pool SFT rows from `qa-pairs/` into the `tip_llm` lane.

View File

@ -25,6 +25,10 @@ interface HotTopic {
data_context?: Record<string, unknown>; data_context?: Record<string, unknown>;
suggested_angle?: string; suggested_angle?: string;
date?: string; date?: string;
blog_title_created?: boolean;
last_blog_created_at?: string;
rank_score?: number;
llm_context?: string;
} }
/** /**
@ -32,10 +36,11 @@ interface HotTopic {
* *
* Returns dynamically ranked blog topics based on real signals. * Returns dynamically ranked blog topics based on real signals.
*/ */
hotTopicsRouter.get("/", async (_req, res) => { hotTopicsRouter.get("/", async (req, res) => {
try { try {
const topics: HotTopic[] = []; const topics: HotTopic[] = [];
const year = new Date().getFullYear(); const year = new Date().getFullYear();
const limit = Math.max(1, Math.min(50, parseInt(String(req.query.limit || "20"), 10) || 20));
// ═══ SOURCE 1: Internal Data — Price Movements ═══ // ═══ SOURCE 1: Internal Data — Price Movements ═══
const priceDrops = await pool.query(` const priceDrops = await pool.query(`
@ -246,9 +251,33 @@ hotTopicsRouter.get("/", async (_req, res) => {
// ═══ SOURCE 7: Evergreen High-Value Topics ═══ // ═══ SOURCE 7: Evergreen High-Value Topics ═══
topics.push(...getEvergreenTopics(year)); topics.push(...getEvergreenTopics(year));
// Sort by urgency: breaking > hot > trending > emerging // Mark already-created topics and rank with daily rotation + source diversity.
const urgencyOrder: Record<string, number> = { breaking: 0, hot: 1, trending: 2, emerging: 3 }; const recentDrafts = await pool.query(`
topics.sort((a, b) => (urgencyOrder[a.urgency] ?? 4) - (urgencyOrder[b.urgency] ?? 4)); SELECT title, created_at
FROM blog_drafts
WHERE created_at > NOW() - INTERVAL '180 days'
ORDER BY created_at DESC
`).catch(() => ({ rows: [] }));
const createdByTitle = new Map<string, string>();
for (const draft of recentDrafts.rows) {
const key = normalizeTopicTitle(String(draft.title || ""));
if (key && !createdByTitle.has(key)) {
createdByTitle.set(key, draft.created_at ? new Date(draft.created_at).toISOString() : new Date().toISOString());
}
}
const daySeed = getDaySeed();
const rotationSeed = daySeed + getQuerySeed(req.query.shuffle);
for (const topic of topics) {
const createdAt = createdByTitle.get(normalizeTopicTitle(topic.title));
topic.blog_title_created = Boolean(createdAt);
topic.last_blog_created_at = createdAt;
topic.rank_score = scoreTopic(topic, rotationSeed);
topic.llm_context = buildTopicBriefing(topic);
}
const rankedTopics = selectDiverseTopics(topics, limit);
// Next daily rotation: tomorrow 00:00 UTC // Next daily rotation: tomorrow 00:00 UTC
const tomorrow = new Date(); const tomorrow = new Date();
@ -256,11 +285,12 @@ hotTopicsRouter.get("/", async (_req, res) => {
tomorrow.setUTCHours(0, 0, 0, 0); tomorrow.setUTCHours(0, 0, 0, 0);
res.json({ res.json({
topics: topics.slice(0, 20), topics: rankedTopics,
total: topics.length, total: topics.length,
generated_at: new Date().toISOString(), generated_at: new Date().toISOString(),
refreshes_at: tomorrow.toISOString(), refreshes_at: tomorrow.toISOString(),
day_seed: getDaySeed(), day_seed: daySeed,
rotation_seed: rotationSeed,
sources: ["market_intelligence", "nog_talks", "internal_price_data", "competitor_alerts", "hype_cycle_model", "news_articles", "conference_calendar", "research_papers"], sources: ["market_intelligence", "nog_talks", "internal_price_data", "competitor_alerts", "hype_cycle_model", "news_articles", "conference_calendar", "research_papers"],
}); });
} catch (err) { } catch (err) {
@ -269,6 +299,131 @@ hotTopicsRouter.get("/", async (_req, res) => {
} }
}); });
/**
 * Reduces a title to a canonical key so that near-identical titles compare equal.
 *
 * Steps, in order: lower-case the text, swap every standalone four-digit
 * `20xx` number for a year placeholder, turn each run of characters outside
 * `a-z` / `0-9` into one space, collapse whitespace and trim the ends.
 *
 * @param title Raw title text.
 * @returns The normalized key (empty when no `a-z` / `0-9` characters remain).
 */
function normalizeTopicTitle(title: string): string {
  const lowered = title.toLowerCase();
  // Year-specific titles ("... 2025" vs "... 2026") should map to the same key.
  const yearNeutral = lowered.replace(/\b20\d{2}\b/g, "{year}");
  // The braces of the placeholder are stripped here too, leaving the word "year".
  const alphanumericOnly = yearNeutral.replace(/[^a-z0-9]+/g, " ");
  const singleSpaced = alphanumericOnly.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
/**
 * Derives a deterministic pseudo-random offset for a title from a numeric seed.
 *
 * The normalized title is folded character by character into a running state
 * using a multiply-add step masked to 31 bits; the final state is reduced
 * modulo 140. The same (title, seed) pair always yields the same value.
 *
 * @param title Topic title; it is normalized before hashing.
 * @param seed Starting state for the mix.
 * @returns `state % 140`. When the normalized title is empty no mixing happens
 *   and the result is simply `seed % 140`.
 */
function seededTopicJitter(title: string, seed: number): number {
  const key = normalizeTopicTitle(title);
  let state = seed;
  let index = 0;
  while (index < key.length) {
    // The mask keeps the state a non-negative 31-bit integer after every step.
    state = (state * 1664525 + key.charCodeAt(index) + 1013904223) & 0x7fffffff;
    index += 1;
  }
  return state % 140;
}
/**
 * Turns an arbitrary query-string value into a small non-negative integer seed.
 *
 * @param value Raw query value. For an array only the first element is used.
 * @returns 0 for falsy or empty input; otherwise a 32-bit rolling hash of the
 *   string form (multiplier 31, UTF-16 code units), reduced to the range
 *   0..99999.
 */
function getQuerySeed(value: unknown): number {
  if (!value) return 0;

  let text: string;
  if (Array.isArray(value)) {
    text = String(value[0] || "");
  } else {
    text = String(value);
  }
  if (text.length === 0) return 0;

  let acc = 0;
  for (let pos = 0; pos < text.length; pos += 1) {
    // Math.imul(acc, 31) is the 32-bit wrapping form of (acc << 5) - acc.
    acc = (Math.imul(acc, 31) + text.charCodeAt(pos)) | 0;
  }
  return Math.abs(acc % 100000);
}
/**
 * Serializes a topic's structured data into a size-bounded, pretty-printed string.
 *
 * Every array (at any depth) is cut to its first 5 entries, every string longer
 * than 260 characters is cut to 257 characters plus "...", and the resulting
 * text is truncated to at most 1800 characters. Because of that final cut the
 * output is not guaranteed to be parseable JSON.
 *
 * @param data Structured context, or `undefined` when the topic has none.
 * @returns The compacted text, or an empty string when `data` is missing.
 */
function compactDataContext(data: Record<string, unknown> | undefined): string {
  if (!data) return "";

  function shrink(_key: string, value: unknown): unknown {
    if (Array.isArray(value)) return value.slice(0, 5);
    if (typeof value !== "string" || value.length <= 260) return value;
    return `${value.slice(0, 257)}...`;
  }

  const serialized = JSON.stringify(data, shrink, 2);
  return serialized.slice(0, 1800);
}
/**
 * Assembles a newline-separated, plain-text briefing for one topic.
 *
 * The title, urgency and source lines are always present. The signal date,
 * summary, recommended angle, the "already covered" editorial note and the
 * compacted structured data are appended only when the topic carries them.
 * A fixed editorial instruction is always the last line.
 *
 * @param topic Topic to describe. The editorial note is emitted only when both
 *   `blog_title_created` and `last_blog_created_at` are set.
 * @returns The briefing text.
 */
function buildTopicBriefing(topic: HotTopic): string {
  const briefing: string[] = [];
  const add = (line: string): void => {
    briefing.push(line);
  };

  add(`Topic: ${topic.title}`);
  add(`Urgency: ${topic.urgency}`);
  add(`Source: ${topic.source_type} / ${topic.source}`);

  if (topic.date) {
    add(`Signal date: ${topic.date}`);
  }
  if (topic.description) {
    add(`Signal summary: ${topic.description}`);
  }
  if (topic.suggested_angle) {
    add(`Recommended angle: ${topic.suggested_angle}`);
  }

  const alreadyCovered = topic.blog_title_created && topic.last_blog_created_at;
  if (alreadyCovered) {
    add(`Editorial note: A blog with a very similar title already exists from ${topic.last_blog_created_at}. If used anyway, choose a materially different angle.`);
  }

  const supportingData = compactDataContext(topic.data_context);
  if (supportingData) {
    add(`Structured supporting data:\n${supportingData}`);
  }

  add("Editorial instruction: turn this into a practical optical networking article with procurement/engineering consequences, not a generic news summary.");
  return briefing.join("\n");
}
/**
 * Computes the ranking score for a topic.
 *
 * The score is the sum of:
 * - an urgency weight (400 when the urgency is not in the table),
 * - a source-type weight (40 when the source type is not in the table),
 * - a freshness bonus that starts at 90 and drops 4 points per day of age,
 *   never going below 0,
 * - a seeded, per-title jitter from `seededTopicJitter`,
 * - a penalty of -950 when the topic is flagged `blog_title_created`.
 *
 * @param topic Topic to score. `blog_title_created` must already be set for the
 *   penalty to apply.
 * @param seed Rotation seed passed through to the jitter.
 * @returns A finite score. A missing or unparseable `topic.date` contributes a
 *   freshness of 0.
 */
function scoreTopic(topic: HotTopic, seed: number): number {
  const urgencyScore: Record<HotTopic["urgency"], number> = {
    breaking: 900,
    hot: 760,
    trending: 620,
    emerging: 500,
  };
  const sourceScore: Record<HotTopic["source_type"], number> = {
    internal_data: 90,
    competitor: 85,
    trade_press: 75,
    conference: 70,
    manufacturer: 65,
    research: 60,
  };

  let freshness = 0;
  if (topic.date) {
    const signalTime = new Date(topic.date).getTime();
    // An unparseable date gives NaN, and Math.max propagates NaN, so without
    // this guard the whole score would be NaN and break the ranking sort.
    if (Number.isFinite(signalTime)) {
      // Dates in the future clamp to an age of 0, i.e. the full bonus.
      const ageDays = Math.max(0, (Date.now() - signalTime) / 86400000);
      freshness = Math.max(0, 90 - ageDays * 4);
    }
  }

  const createdPenalty = topic.blog_title_created ? -950 : 0;

  return (
    (urgencyScore[topic.urgency] ?? 400) +
    (sourceScore[topic.source_type] ?? 40) +
    freshness +
    seededTopicJitter(topic.title, seed) +
    createdPenalty
  );
}
/**
 * Picks up to `limit` topics in descending `rank_score` order while spreading
 * the selection across sources.
 *
 * First pass: walk the ranked list and accept a topic only while fewer than
 * 5 topics of its `source_type` and fewer than 3 topics of its `source`
 * (missing source counts as "unknown") have been accepted.
 * Second pass: if the caps left the selection short, fill the remaining slots
 * with the best-ranked topics that were skipped, ignoring the caps.
 *
 * @param topics Candidate topics; the array itself is not reordered.
 * @param limit Maximum number of topics to return.
 * @returns The selected topics: capped picks first, back-filled picks after.
 */
function selectDiverseTopics(topics: HotTopic[], limit: number): HotTopic[] {
  const ranked = topics
    .slice()
    .sort((left, right) => (right.rank_score ?? 0) - (left.rank_score ?? 0));
  const picked: HotTopic[] = [];
  const perType = new Map<string, number>();
  const perSource = new Map<string, number>();

  for (const candidate of ranked) {
    if (picked.length >= limit) break;

    const typeKey = candidate.source_type;
    const sourceKey = candidate.source || "unknown";
    const usedByType = perType.get(typeKey) ?? 0;
    const usedBySource = perSource.get(sourceKey) ?? 0;

    if (usedByType < 5 && usedBySource < 3) {
      picked.push(candidate);
      perType.set(typeKey, usedByType + 1);
      perSource.set(sourceKey, usedBySource + 1);
    }
  }

  // Back-fill: this loop exits immediately when the first pass already
  // reached the limit.
  for (const candidate of ranked) {
    if (picked.length >= limit) break;
    if (!picked.includes(candidate)) picked.push(candidate);
  }

  return picked;
}
function detectNewsTheme(title: string): string { function detectNewsTheme(title: string): string {
const tl = title.toLowerCase(); const tl = title.toLowerCase();
if (tl.includes("800g") || tl.includes("osfp")) return "800G Deployment Wave"; if (tl.includes("800g") || tl.includes("osfp")) return "800G Deployment Wave";

View File

@ -37,7 +37,7 @@
'<div style="background:linear-gradient(135deg,#1a1a1a,#2a2a2a);color:white;padding:2rem;border-radius:12px;text-align:center;margin-bottom:1rem">' + '<div style="background:linear-gradient(135deg,#1a1a1a,#2a2a2a);color:white;padding:2rem;border-radius:12px;text-align:center;margin-bottom:1rem">' +
'<div style="font-size:1.4rem;font-weight:700;margin-bottom:1rem">Generating Blog with AI...</div>' + '<div style="font-size:1.4rem;font-weight:700;margin-bottom:1rem">Generating Blog with AI...</div>' +
'<div id="bp-status" style="font-size:1rem;color:#FF8100;margin-bottom:0.5rem">Starting 10-step Flexoptix Style pipeline...</div>' + '<div id="bp-status" style="font-size:1rem;color:#FF8100;margin-bottom:0.5rem">Starting 10-step Flexoptix Style pipeline...</div>' +
'<div id="bp-step" style="font-size:0.85rem;color:#aaa">Connecting to LLM (qwen2.5:14b)</div>' + '<div id="bp-step" style="font-size:0.85rem;color:#aaa">Connecting to FO_BlogLLM (fo-blog-v7)</div>' +
'<div style="margin-top:1.5rem;background:#333;border-radius:8px;height:8px;overflow:hidden">' + '<div style="margin-top:1.5rem;background:#333;border-radius:8px;height:8px;overflow:hidden">' +
'<div id="bp-bar" style="width:2%;height:100%;background:#FF8100;transition:width 0.5s ease"></div></div>' + '<div id="bp-bar" style="width:2%;height:100%;background:#FF8100;transition:width 0.5s ease"></div></div>' +
'<div id="bp-pct" style="font-size:0.8rem;color:#666;margin-top:0.5rem">0%</div>' + '<div id="bp-pct" style="font-size:0.8rem;color:#666;margin-top:0.5rem">0%</div>' +
@ -46,8 +46,8 @@
var body = { topic: topic }; var body = { topic: topic };
if (speed) body.speed = speed; if (speed) body.speed = speed;
if (customTitle) body.customTitle = customTitle; if (customTitle) body.custom_title = customTitle;
if (customAngle) body.customAngle = customAngle; if (customAngle) body.additional_context = customAngle;
fetch(API + '/api/blog/generate', { fetch(API + '/api/blog/generate', {
method: 'POST', method: 'POST',
@ -137,7 +137,7 @@
if (bar) bar.style.width = prog.pct + '%'; if (bar) bar.style.width = prog.pct + '%';
if (pct) pct.textContent = prog.pct + '%'; if (pct) pct.textContent = prog.pct + '%';
if (status) { status.style.color = '#FF8100'; status.textContent = prog.label || ('Step ' + prog.step + '/10'); } if (status) { status.style.color = '#FF8100'; status.textContent = prog.label || ('Step ' + prog.step + '/10'); }
if (step) step.textContent = 'Step ' + prog.step + '/10 · qwen2.5:14b via Ollama'; if (step) step.textContent = 'Step ' + prog.step + '/10 · fo-blog-v7 via adapter bridge';
} else { } else {
_stallCount++; _stallCount++;
// After 5 consecutive non-running polls (~40s), show stall warning // After 5 consecutive non-running polls (~40s), show stall warning
@ -185,7 +185,8 @@
if (!grid) return; if (!grid) return;
grid.innerHTML = '<div class="loading pulse">Discovering hot topics...</div>'; grid.innerHTML = '<div class="loading pulse">Discovering hot topics...</div>';
fetch(API + '/api/hot-topics', { headers: authHeaders() }) var shuffle = Date.now().toString(36);
fetch(API + '/api/hot-topics?limit=20&shuffle=' + encodeURIComponent(shuffle), { headers: authHeaders({ 'Cache-Control': 'no-cache' }) })
.then(function(r) { return r.json(); }) .then(function(r) { return r.json(); })
.then(function(data) { .then(function(data) {
if (!data.topics || data.topics.length === 0) { if (!data.topics || data.topics.length === 0) {
@ -196,7 +197,7 @@
if (subtitle && data.refreshes_at) { if (subtitle && data.refreshes_at) {
var nextRefresh = new Date(data.refreshes_at); var nextRefresh = new Date(data.refreshes_at);
var hoursLeft = Math.round((nextRefresh - new Date()) / 3600000); var hoursLeft = Math.round((nextRefresh - new Date()) / 3600000);
subtitle.textContent = data.total + ' topics · rotates daily · next refresh in ' + hoursLeft + 'h · sources: ' + (data.sources || []).join(', '); subtitle.textContent = data.total + ' topics · refresh reshuffles · daily base rotation in ' + hoursLeft + 'h · sources: ' + (data.sources || []).join(', ');
} }
var colors = { breaking: '#c1121f', hot: '#FF8100', trending: '#e6a800', emerging: '#2d6a4f' }; var colors = { breaking: '#c1121f', hot: '#FF8100', trending: '#e6a800', emerging: '#2d6a4f' };
@ -227,7 +228,7 @@
window._generateFromHotTopic = function(cardId) { window._generateFromHotTopic = function(cardId) {
var t = window['_ht_' + cardId]; var t = window['_ht_' + cardId];
if (!t) return; if (!t) return;
generateBlog(t.blog_type || 'hype_cycle', null, t.title, t.suggested_angle || t.description); generateBlog(t.blog_type || 'hype_cycle', null, t.title, t.llm_context || t.suggested_angle || t.description);
}; };
// Auto-load hot topics when blog tab activates // Auto-load hot topics when blog tab activates

View File

@ -371,12 +371,15 @@ async function main() {
); );
} }
// --- Ollama LLM tools: market analysis (qwen2.5:14b) + blog generation (fo-blog-v5) --- // --- Ollama-compatible LLM tools: market analysis (TIP_LLM) + blog generation (FO_BlogLLM) ---
const OLLAMA_BASE = process.env["OLLAMA_BASE_URL"] ?? "https://ollama.fichtmueller.org"; const OLLAMA_BASE = process.env["OLLAMA_BASE_URL"] ?? "https://ollama.fichtmueller.org";
const TIP_LLM_MODEL = process.env["TIP_LLM_MODEL"] ?? "tip-llm-v1";
const BLOG_LLM_MODEL = process.env["BLOG_LLM_MODEL"] ?? "fo-blog-v7";
const BLOG_LLM_FALLBACK = process.env["BLOG_LLM_FALLBACK_MODEL"] ?? "qwen2.5:14b";
server.tool( server.tool(
"analyze_market_with_llm", "analyze_market_with_llm",
"Deep market analysis for a transceiver technology using local LLM (qwen2.5:14b). Provides expert narrative on adoption trends, pricing trajectory, competitive dynamics, and buy/wait/hold recommendation.", "Deep market analysis for a transceiver technology using TIP_LLM. Provides expert narrative on adoption trends, pricing trajectory, competitive dynamics, and buy/wait/hold recommendation.",
{ {
technology: z.string().describe("Technology to analyze, e.g. '400G QSFP-DD', '800G OSFP', '100G ZR'"), technology: z.string().describe("Technology to analyze, e.g. '400G QSFP-DD', '800G OSFP', '100G ZR'"),
context: z.string().optional().describe("Additional context or specific questions to address"), context: z.string().optional().describe("Additional context or specific questions to address"),
@ -435,7 +438,7 @@ Keep the analysis actionable and data-driven. Under 400 words.`;
const resp = await fetch(`${OLLAMA_BASE}/api/generate`, { const resp = await fetch(`${OLLAMA_BASE}/api/generate`, {
method: "POST", method: "POST",
headers: { "Content-Type": "application/json" }, headers: { "Content-Type": "application/json" },
body: JSON.stringify({ model: "qwen2.5:14b", prompt, stream: false }), body: JSON.stringify({ model: TIP_LLM_MODEL, prompt, stream: false }),
signal: AbortSignal.timeout(120_000), signal: AbortSignal.timeout(120_000),
}); });
if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}`); if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}`);
@ -449,7 +452,7 @@ Keep the analysis actionable and data-driven. Under 400 words.`;
server.tool( server.tool(
"generate_blog_post", "generate_blog_post",
"Generate a professional Flexoptix blog post using the fine-tuned fo-blog-v5 model (Ollama). Automatically enriched with live pricing, hype cycle data, and competitor analysis.", "Generate a professional Flexoptix blog post using the latest FO_BlogLLM model. Automatically enriched with live pricing, hype cycle data, and competitor analysis.",
{ {
topic: z.string().describe("Blog topic, e.g. '400G QSFP-DD vs 400G ZR — which for your DC?'"), topic: z.string().describe("Blog topic, e.g. '400G QSFP-DD vs 400G ZR — which for your DC?'"),
target_audience: z.enum(["network_engineer", "procurement", "executive", "general"]).default("network_engineer").describe("Target reader"), target_audience: z.enum(["network_engineer", "procurement", "executive", "general"]).default("network_engineer").describe("Target reader"),
@ -492,7 +495,7 @@ Do not include a title (added separately). Start directly with the article body.
method: "POST", method: "POST",
headers: { "Content-Type": "application/json" }, headers: { "Content-Type": "application/json" },
body: JSON.stringify({ body: JSON.stringify({
model: "fo-blog-v5", model: BLOG_LLM_MODEL,
messages: [ messages: [
{ role: "system", content: systemPrompt }, { role: "system", content: systemPrompt },
{ role: "user", content: userPrompt }, { role: "user", content: userPrompt },
@ -502,12 +505,12 @@ Do not include a title (added separately). Start directly with the article body.
signal: AbortSignal.timeout(180_000), signal: AbortSignal.timeout(180_000),
}); });
if (!resp.ok) { if (!resp.ok) {
// Fallback to qwen2.5:14b if fo-blog-v5 not available // Fallback to generic local model if FO_BlogLLM is unavailable
const fallbackResp = await fetch(`${OLLAMA_BASE}/api/chat`, { const fallbackResp = await fetch(`${OLLAMA_BASE}/api/chat`, {
method: "POST", method: "POST",
headers: { "Content-Type": "application/json" }, headers: { "Content-Type": "application/json" },
body: JSON.stringify({ body: JSON.stringify({
model: "qwen2.5:14b", model: BLOG_LLM_FALLBACK,
messages: [ messages: [
{ role: "system", content: systemPrompt }, { role: "system", content: systemPrompt },
{ role: "user", content: userPrompt }, { role: "user", content: userPrompt },
@ -516,9 +519,9 @@ Do not include a title (added separately). Start directly with the article body.
}), }),
signal: AbortSignal.timeout(180_000), signal: AbortSignal.timeout(180_000),
}); });
if (!fallbackResp.ok) throw new Error(`Both fo-blog-v5 and qwen2.5:14b unavailable`); if (!fallbackResp.ok) throw new Error(`Both ${BLOG_LLM_MODEL} and ${BLOG_LLM_FALLBACK} unavailable`);
const fallbackData = await fallbackResp.json() as { message?: { content?: string } }; const fallbackData = await fallbackResp.json() as { message?: { content?: string } };
return { content: [{ type: "text" as const, text: `[Generated with qwen2.5:14b — fo-blog-v5 unavailable]\n\n${fallbackData.message?.content ?? "No content"}` }] }; return { content: [{ type: "text" as const, text: `[Generated with ${BLOG_LLM_FALLBACK}${BLOG_LLM_MODEL} unavailable]\n\n${fallbackData.message?.content ?? "No content"}` }] };
} }
const data = await resp.json() as { message?: { content?: string } }; const data = await resp.json() as { message?: { content?: string } };
return { content: [{ type: "text" as const, text: data.message?.content ?? "No content generated." }] }; return { content: [{ type: "text" as const, text: data.message?.content ?? "No content generated." }] };

View File

@ -11,7 +11,8 @@
"scrape:cisco": "tsx src/scrapers/cisco-tmg.ts", "scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
"scrape:optcore": "tsx src/scrapers/optcore.ts", "scrape:optcore": "tsx src/scrapers/optcore.ts",
"scrape:news": "tsx src/scrapers/news.ts", "scrape:news": "tsx src/scrapers/news.ts",
"scrape:all": "tsx src/index.ts --all" "scrape:all": "tsx src/index.ts --all",
"robots:verification": "tsx src/robots/verification-robots.ts"
}, },
"dependencies": { "dependencies": {
"crawlee": "^3.12.0", "crawlee": "^3.12.0",

View File

@ -15,7 +15,7 @@ import { VENDOR_PROFILES } from "./stock-schema";
import { validateStockExtraction } from "./validator"; import { validateStockExtraction } from "./validator";
const OLLAMA_HOST = process.env.OLLAMA_HOST || "http://192.168.178.169:11434"; const OLLAMA_HOST = process.env.OLLAMA_HOST || "http://192.168.178.169:11434";
const OLLAMA_MODEL = process.env.CRAWLER_LLM_MODEL || "qwen2.5:14b"; const OLLAMA_MODEL = process.env.CRAWLER_LLM_MODEL || "tip-llm-v1";
const MAX_HTML_CHARS = 12_000; // truncate to keep prompt manageable const MAX_HTML_CHARS = 12_000; // truncate to keep prompt manageable
// ───────────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────────

View File

@ -9,6 +9,7 @@ type Row = { id: string; lane: Lane; source: string; kind: string; messages: Mes
const repoRoot = process.cwd(); const repoRoot = process.cwd();
const externalRoot = process.env.TIP_LEARNING_SOURCE_DIR || join(homedir(), "transceiver-training-data"); const externalRoot = process.env.TIP_LEARNING_SOURCE_DIR || join(homedir(), "transceiver-training-data");
const giteaTrainingRoot = process.env.TIP_TRAINING_REPO || "/tmp/tip-training-data";
const blogMegaRoot = process.env.BLOG_LLM_SOURCE_DIR || join(homedir(), "Desktop", "BlogLLM-v5-Mega-Training"); const blogMegaRoot = process.env.BLOG_LLM_SOURCE_DIR || join(homedir(), "Desktop", "BlogLLM-v5-Mega-Training");
const outRoot = join(repoRoot, "training-data", "runpod"); const outRoot = join(repoRoot, "training-data", "runpod");
@ -128,11 +129,25 @@ function markdownBlog(path: string): Row[] {
}]; }];
} }
/**
 * Recursively gathers rows from every `.jsonl` file below a directory.
 *
 * Sub-directories are descended into; regular files whose name ends in
 * `.jsonl` are handed to `jsonl()` together with the lane. Everything else is
 * ignored. Rows are appended in the order `readdirSync` lists the entries.
 *
 * @param dir Directory to scan.
 * @param lane Lane passed through to `jsonl()` for every file found.
 * @returns All collected rows, or an empty array when `dir` does not exist.
 */
function collectJsonlDir(dir: string, lane: Lane): Row[] {
  const collected: Row[] = [];
  if (!existsSync(dir)) return collected;

  const entries = readdirSync(dir, { withFileTypes: true });
  for (const entry of entries) {
    const fullPath = join(dir, entry.name);
    if (entry.isDirectory()) {
      collected.push(...collectJsonlDir(fullPath, lane));
      continue;
    }
    if (!entry.isFile() || !entry.name.endsWith(".jsonl")) continue;
    collected.push(...jsonl(fullPath, lane));
  }
  return collected;
}
function collect(lane: Lane): Row[] { function collect(lane: Lane): Row[] {
const rows: Row[] = []; const rows: Row[] = [];
for (const file of files[lane]) { for (const file of files[lane]) {
for (const root of [externalRoot, blogMegaRoot, repoRoot]) rows.push(...jsonl(join(root, file), lane)); for (const root of [externalRoot, blogMegaRoot, repoRoot]) rows.push(...jsonl(join(root, file), lane));
} }
if (lane === "tip_llm") {
rows.push(...collectJsonlDir(join(giteaTrainingRoot, "qa-pairs"), lane));
}
if (lane === "blog_llm") { if (lane === "blog_llm") {
for (const dir of [join(repoRoot, "blog-training-data"), join(externalRoot, "v6-tip-blogs")]) { for (const dir of [join(repoRoot, "blog-training-data"), join(externalRoot, "v6-tip-blogs")]) {
if (!existsSync(dir)) continue; if (!existsSync(dir)) continue;