From a1a525b332945a94670b4dae52193e29c6498445 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Wed, 6 May 2026 23:39:04 +0200 Subject: [PATCH] chore: sync API routes, dashboard hot-topics, MCP server, scraper package, scripts --- docs/TIP_SELFLEARNING_WORKFLOW.md | 25 ++++ packages/api/src/routes/hot-topics.ts | 167 ++++++++++++++++++++++- packages/dashboard/hot-topics.js | 15 +- packages/mcp-server/src/index.ts | 21 +-- packages/scraper/package.json | 3 +- packages/scraper/src/crawler-llm/core.ts | 2 +- scripts/tip-learning-pool-build.ts | 15 ++ 7 files changed, 224 insertions(+), 24 deletions(-) diff --git a/docs/TIP_SELFLEARNING_WORKFLOW.md b/docs/TIP_SELFLEARNING_WORKFLOW.md index 8501ac7..892e59f 100644 --- a/docs/TIP_SELFLEARNING_WORKFLOW.md +++ b/docs/TIP_SELFLEARNING_WORKFLOW.md @@ -31,3 +31,28 @@ Default private Hugging Face datasets: - `renefichtmueller/blog-llm-sft` Local training is enabled by setting `TIP_LOCAL_TRAIN_COMMAND`; the API appends the lane name automatically. + +## TIPLLM Robot Experience Pool + +Crawler and verification robots must use TIPLLM only for planning/extraction feedback. Operational experience is written to the Gitea-backed TIP training pool: + +- Default local clone: `/tmp/tip-training-data` +- Override: `TIP_TRAINING_REPO=/path/to/tip-training-data` +- Gitea repo: `rene/tip-training-data` +- SFT records: `qa-pairs/robot-control-high.jsonl` +- Raw audit records: `robot-experiences/YYYY-MM-DD.jsonl` + +Useful commands: + +```bash +npm run robots:verification -w packages/scraper -- --status +npm run robots:verification -w packages/scraper -- --tipllm-plan --limit=5 +npm run robots:verification -w packages/scraper -- --enqueue=details-fast-lane --profile=erik-safe --dry-run +``` + +Safety defaults: + +- `erik-safe` is the default profile and caps to 3 lightweight queues. +- Playwright/discovery work belongs on Proxmox or Pi workers, not Erik. +- Every status snapshot, TIPLLM plan, dry-run plan, enqueue result and crawler result should become a TIPLLM training example. +- `learning-pool:build` automatically imports Gitea pool SFT rows from `qa-pairs/` into the `tip_llm` lane. diff --git a/packages/api/src/routes/hot-topics.ts b/packages/api/src/routes/hot-topics.ts index 35ff876..78725a9 100644 --- a/packages/api/src/routes/hot-topics.ts +++ b/packages/api/src/routes/hot-topics.ts @@ -25,6 +25,10 @@ interface HotTopic { data_context?: Record; suggested_angle?: string; date?: string; + blog_title_created?: boolean; + last_blog_created_at?: string; + rank_score?: number; + llm_context?: string; } /** @@ -32,10 +36,11 @@ interface HotTopic { * * Returns dynamically ranked blog topics based on real signals. */ -hotTopicsRouter.get("/", async (_req, res) => { +hotTopicsRouter.get("/", async (req, res) => { try { const topics: HotTopic[] = []; const year = new Date().getFullYear(); + const limit = Math.max(1, Math.min(50, parseInt(String(req.query.limit || "20"), 10) || 20)); // ═══ SOURCE 1: Internal Data — Price Movements ═══ const priceDrops = await pool.query(` @@ -246,9 +251,33 @@ hotTopicsRouter.get("/", async (_req, res) => { // ═══ SOURCE 7: Evergreen High-Value Topics ═══ topics.push(...getEvergreenTopics(year)); - // Sort by urgency: breaking > hot > trending > emerging - const urgencyOrder: Record = { breaking: 0, hot: 1, trending: 2, emerging: 3 }; - topics.sort((a, b) => (urgencyOrder[a.urgency] ?? 4) - (urgencyOrder[b.urgency] ?? 4)); + // Mark already-created topics and rank with daily rotation + source diversity. + const recentDrafts = await pool.query(` + SELECT title, created_at + FROM blog_drafts + WHERE created_at > NOW() - INTERVAL '180 days' + ORDER BY created_at DESC + `).catch(() => ({ rows: [] })); + + const createdByTitle = new Map(); + for (const draft of recentDrafts.rows) { + const key = normalizeTopicTitle(String(draft.title || "")); + if (key && !createdByTitle.has(key)) { + createdByTitle.set(key, draft.created_at ? new Date(draft.created_at).toISOString() : new Date().toISOString()); + } + } + + const daySeed = getDaySeed(); + const rotationSeed = daySeed + getQuerySeed(req.query.shuffle); + for (const topic of topics) { + const createdAt = createdByTitle.get(normalizeTopicTitle(topic.title)); + topic.blog_title_created = Boolean(createdAt); + topic.last_blog_created_at = createdAt; + topic.rank_score = scoreTopic(topic, rotationSeed); + topic.llm_context = buildTopicBriefing(topic); + } + + const rankedTopics = selectDiverseTopics(topics, limit); // Next daily rotation: tomorrow 00:00 UTC const tomorrow = new Date(); @@ -256,11 +285,12 @@ hotTopicsRouter.get("/", async (_req, res) => { tomorrow.setUTCHours(0, 0, 0, 0); res.json({ - topics: topics.slice(0, 20), + topics: rankedTopics, total: topics.length, generated_at: new Date().toISOString(), refreshes_at: tomorrow.toISOString(), - day_seed: getDaySeed(), + day_seed: daySeed, + rotation_seed: rotationSeed, sources: ["market_intelligence", "nog_talks", "internal_price_data", "competitor_alerts", "hype_cycle_model", "news_articles", "conference_calendar", "research_papers"], }); } catch (err) { @@ -269,6 +299,131 @@ hotTopicsRouter.get("/", async (_req, res) => { } }); +function normalizeTopicTitle(title: string): string { + return title + .toLowerCase() + .replace(/\b20\d{2}\b/g, "{year}") + .replace(/[^a-z0-9]+/g, " ") + .replace(/\s+/g, " ") + .trim(); +} + +function seededTopicJitter(title: string, seed: number): number { + let s = seed; + const normalized = normalizeTopicTitle(title); + for (let i = 0; i < normalized.length; i++) { + s = (s * 1664525 + normalized.charCodeAt(i) + 1013904223) & 0x7fffffff; + } + return s % 140; +} + +function getQuerySeed(value: unknown): number { + if (!value) return 0; + const raw = Array.isArray(value) ? String(value[0] || "") : String(value); + if (!raw) return 0; + let hash = 0; + for (let i = 0; i < raw.length; i++) { + hash = ((hash << 5) - hash + raw.charCodeAt(i)) | 0; + } + return Math.abs(hash % 100000); +} + +function compactDataContext(data: Record | undefined): string { + if (!data) return ""; + + const replacer = (_key: string, value: unknown) => { + if (Array.isArray(value)) return value.slice(0, 5); + if (typeof value === "string" && value.length > 260) return `${value.slice(0, 257)}...`; + return value; + }; + + return JSON.stringify(data, replacer, 2).slice(0, 1800); +} + +function buildTopicBriefing(topic: HotTopic): string { + const lines = [ + `Topic: ${topic.title}`, + `Urgency: ${topic.urgency}`, + `Source: ${topic.source_type} / ${topic.source}`, + ]; + + if (topic.date) lines.push(`Signal date: ${topic.date}`); + if (topic.description) lines.push(`Signal summary: ${topic.description}`); + if (topic.suggested_angle) lines.push(`Recommended angle: ${topic.suggested_angle}`); + if (topic.blog_title_created && topic.last_blog_created_at) { + lines.push(`Editorial note: A blog with a very similar title already exists from ${topic.last_blog_created_at}. If used anyway, choose a materially different angle.`); + } + + const dataContext = compactDataContext(topic.data_context); + if (dataContext) lines.push(`Structured supporting data:\n${dataContext}`); + + lines.push("Editorial instruction: turn this into a practical optical networking article with procurement/engineering consequences, not a generic news summary."); + return lines.join("\n"); +} + +function scoreTopic(topic: HotTopic, seed: number): number { + const urgencyScore: Record = { + breaking: 900, + hot: 760, + trending: 620, + emerging: 500, + }; + const sourceScore: Record = { + internal_data: 90, + competitor: 85, + trade_press: 75, + conference: 70, + manufacturer: 65, + research: 60, + }; + + let freshness = 0; + if (topic.date) { + const ageDays = Math.max(0, (Date.now() - new Date(topic.date).getTime()) / 86400000); + freshness = Math.max(0, 90 - ageDays * 4); + } + + const createdPenalty = topic.blog_title_created ? -950 : 0; + return ( + (urgencyScore[topic.urgency] ?? 400) + + (sourceScore[topic.source_type] ?? 40) + + freshness + + seededTopicJitter(topic.title, seed) + + createdPenalty + ); +} + +function selectDiverseTopics(topics: HotTopic[], limit: number): HotTopic[] { + const sorted = [...topics].sort((a, b) => (b.rank_score ?? 0) - (a.rank_score ?? 0)); + const selected: HotTopic[] = []; + const sourceTypeCount = new Map(); + const sourceCount = new Map(); + + for (const topic of sorted) { + if (selected.length >= limit) break; + const sourceType = topic.source_type; + const source = topic.source || "unknown"; + const typeCount = sourceTypeCount.get(sourceType) ?? 0; + const srcCount = sourceCount.get(source) ?? 0; + + if (typeCount >= 5) continue; + if (srcCount >= 3) continue; + + selected.push(topic); + sourceTypeCount.set(sourceType, typeCount + 1); + sourceCount.set(source, srcCount + 1); + } + + if (selected.length < limit) { + for (const topic of sorted) { + if (selected.length >= limit) break; + if (!selected.includes(topic)) selected.push(topic); + } + } + + return selected; +} + function detectNewsTheme(title: string): string { const tl = title.toLowerCase(); if (tl.includes("800g") || tl.includes("osfp")) return "800G Deployment Wave"; diff --git a/packages/dashboard/hot-topics.js b/packages/dashboard/hot-topics.js index d8ee2fe..119fed7 100644 --- a/packages/dashboard/hot-topics.js +++ b/packages/dashboard/hot-topics.js @@ -37,7 +37,7 @@ '
' + '
Generating Blog with AI...
' + '
Starting 10-step Flexoptix Style pipeline...
' + - '
Connecting to LLM (qwen2.5:14b)
' + + '
Connecting to FO_BlogLLM (fo-blog-v7)
' + '
' + '
' + '
0%
' + @@ -46,8 +46,8 @@ var body = { topic: topic }; if (speed) body.speed = speed; - if (customTitle) body.customTitle = customTitle; - if (customAngle) body.customAngle = customAngle; + if (customTitle) body.custom_title = customTitle; + if (customAngle) body.additional_context = customAngle; fetch(API + '/api/blog/generate', { method: 'POST', @@ -137,7 +137,7 @@ if (bar) bar.style.width = prog.pct + '%'; if (pct) pct.textContent = prog.pct + '%'; if (status) { status.style.color = '#FF8100'; status.textContent = prog.label || ('Step ' + prog.step + '/10'); } - if (step) step.textContent = 'Step ' + prog.step + '/10 · qwen2.5:14b via Ollama'; + if (step) step.textContent = 'Step ' + prog.step + '/10 · fo-blog-v7 via adapter bridge'; } else { _stallCount++; // After 5 consecutive non-running polls (~40s), show stall warning @@ -185,7 +185,8 @@ if (!grid) return; grid.innerHTML = '
Discovering hot topics...
'; - fetch(API + '/api/hot-topics', { headers: authHeaders() }) + var shuffle = Date.now().toString(36); + fetch(API + '/api/hot-topics?limit=20&shuffle=' + encodeURIComponent(shuffle), { headers: authHeaders({ 'Cache-Control': 'no-cache' }) }) .then(function(r) { return r.json(); }) .then(function(data) { if (!data.topics || data.topics.length === 0) { @@ -196,7 +197,7 @@ if (subtitle && data.refreshes_at) { var nextRefresh = new Date(data.refreshes_at); var hoursLeft = Math.round((nextRefresh - new Date()) / 3600000); - subtitle.textContent = data.total + ' topics · rotates daily · next refresh in ' + hoursLeft + 'h · sources: ' + (data.sources || []).join(', '); + subtitle.textContent = data.total + ' topics · refresh reshuffles · daily base rotation in ' + hoursLeft + 'h · sources: ' + (data.sources || []).join(', '); } var colors = { breaking: '#c1121f', hot: '#FF8100', trending: '#e6a800', emerging: '#2d6a4f' }; @@ -227,7 +228,7 @@ window._generateFromHotTopic = function(cardId) { var t = window['_ht_' + cardId]; if (!t) return; - generateBlog(t.blog_type || 'hype_cycle', null, t.title, t.suggested_angle || t.description); + generateBlog(t.blog_type || 'hype_cycle', null, t.title, t.llm_context || t.suggested_angle || t.description); }; // Auto-load hot topics when blog tab activates diff --git a/packages/mcp-server/src/index.ts b/packages/mcp-server/src/index.ts index bcd7619..c09a728 100644 --- a/packages/mcp-server/src/index.ts +++ b/packages/mcp-server/src/index.ts @@ -371,12 +371,15 @@ async function main() { ); } - // --- Ollama LLM tools: market analysis (qwen2.5:14b) + blog generation (fo-blog-v5) --- + // --- Ollama-compatible LLM tools: market analysis (TIP_LLM) + blog generation (FO_BlogLLM) --- const OLLAMA_BASE = process.env["OLLAMA_BASE_URL"] ?? "https://ollama.fichtmueller.org"; + const TIP_LLM_MODEL = process.env["TIP_LLM_MODEL"] ?? "tip-llm-v1"; + const BLOG_LLM_MODEL = process.env["BLOG_LLM_MODEL"] ?? "fo-blog-v7"; + const BLOG_LLM_FALLBACK = process.env["BLOG_LLM_FALLBACK_MODEL"] ?? "qwen2.5:14b"; server.tool( "analyze_market_with_llm", - "Deep market analysis for a transceiver technology using local LLM (qwen2.5:14b). Provides expert narrative on adoption trends, pricing trajectory, competitive dynamics, and buy/wait/hold recommendation.", + "Deep market analysis for a transceiver technology using TIP_LLM. Provides expert narrative on adoption trends, pricing trajectory, competitive dynamics, and buy/wait/hold recommendation.", { technology: z.string().describe("Technology to analyze, e.g. '400G QSFP-DD', '800G OSFP', '100G ZR'"), context: z.string().optional().describe("Additional context or specific questions to address"), @@ -435,7 +438,7 @@ Keep the analysis actionable and data-driven. Under 400 words.`; const resp = await fetch(`${OLLAMA_BASE}/api/generate`, { method: "POST", headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ model: "qwen2.5:14b", prompt, stream: false }), + body: JSON.stringify({ model: TIP_LLM_MODEL, prompt, stream: false }), signal: AbortSignal.timeout(120_000), }); if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}`); @@ -449,7 +452,7 @@ Keep the analysis actionable and data-driven. Under 400 words.`; server.tool( "generate_blog_post", - "Generate a professional Flexoptix blog post using the fine-tuned fo-blog-v5 model (Ollama). Automatically enriched with live pricing, hype cycle data, and competitor analysis.", + "Generate a professional Flexoptix blog post using the latest FO_BlogLLM model. Automatically enriched with live pricing, hype cycle data, and competitor analysis.", { topic: z.string().describe("Blog topic, e.g. '400G QSFP-DD vs 400G ZR — which for your DC?'"), target_audience: z.enum(["network_engineer", "procurement", "executive", "general"]).default("network_engineer").describe("Target reader"), @@ -492,7 +495,7 @@ Do not include a title (added separately). Start directly with the article body. method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ - model: "fo-blog-v5", + model: BLOG_LLM_MODEL, messages: [ { role: "system", content: systemPrompt }, { role: "user", content: userPrompt }, @@ -502,12 +505,12 @@ Do not include a title (added separately). Start directly with the article body. signal: AbortSignal.timeout(180_000), }); if (!resp.ok) { - // Fallback to qwen2.5:14b if fo-blog-v5 not available + // Fallback to generic local model if FO_BlogLLM is unavailable const fallbackResp = await fetch(`${OLLAMA_BASE}/api/chat`, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ - model: "qwen2.5:14b", + model: BLOG_LLM_FALLBACK, messages: [ { role: "system", content: systemPrompt }, { role: "user", content: userPrompt }, @@ -516,9 +519,9 @@ Do not include a title (added separately). Start directly with the article body. }), signal: AbortSignal.timeout(180_000), }); - if (!fallbackResp.ok) throw new Error(`Both fo-blog-v5 and qwen2.5:14b unavailable`); + if (!fallbackResp.ok) throw new Error(`Both ${BLOG_LLM_MODEL} and ${BLOG_LLM_FALLBACK} unavailable`); const fallbackData = await fallbackResp.json() as { message?: { content?: string } }; - return { content: [{ type: "text" as const, text: `[Generated with qwen2.5:14b — fo-blog-v5 unavailable]\n\n${fallbackData.message?.content ?? "No content"}` }] }; + return { content: [{ type: "text" as const, text: `[Generated with ${BLOG_LLM_FALLBACK} — ${BLOG_LLM_MODEL} unavailable]\n\n${fallbackData.message?.content ?? "No content"}` }] }; } const data = await resp.json() as { message?: { content?: string } }; return { content: [{ type: "text" as const, text: data.message?.content ?? "No content generated." }] }; diff --git a/packages/scraper/package.json b/packages/scraper/package.json index 9c941f2..812525c 100644 --- a/packages/scraper/package.json +++ b/packages/scraper/package.json @@ -11,7 +11,8 @@ "scrape:cisco": "tsx src/scrapers/cisco-tmg.ts", "scrape:optcore": "tsx src/scrapers/optcore.ts", "scrape:news": "tsx src/scrapers/news.ts", - "scrape:all": "tsx src/index.ts --all" + "scrape:all": "tsx src/index.ts --all", + "robots:verification": "tsx src/robots/verification-robots.ts" }, "dependencies": { "crawlee": "^3.12.0", diff --git a/packages/scraper/src/crawler-llm/core.ts b/packages/scraper/src/crawler-llm/core.ts index 1622457..2e42d64 100644 --- a/packages/scraper/src/crawler-llm/core.ts +++ b/packages/scraper/src/crawler-llm/core.ts @@ -15,7 +15,7 @@ import { VENDOR_PROFILES } from "./stock-schema"; import { validateStockExtraction } from "./validator"; const OLLAMA_HOST = process.env.OLLAMA_HOST || "http://192.168.178.169:11434"; -const OLLAMA_MODEL = process.env.CRAWLER_LLM_MODEL || "qwen2.5:14b"; +const OLLAMA_MODEL = process.env.CRAWLER_LLM_MODEL || "tip-llm-v1"; const MAX_HTML_CHARS = 12_000; // truncate to keep prompt manageable // ───────────────────────────────────────────────────────────────────────────── diff --git a/scripts/tip-learning-pool-build.ts b/scripts/tip-learning-pool-build.ts index 2f7a802..bfd1699 100644 --- a/scripts/tip-learning-pool-build.ts +++ b/scripts/tip-learning-pool-build.ts @@ -9,6 +9,7 @@ type Row = { id: string; lane: Lane; source: string; kind: string; messages: Mes const repoRoot = process.cwd(); const externalRoot = process.env.TIP_LEARNING_SOURCE_DIR || join(homedir(), "transceiver-training-data"); +const giteaTrainingRoot = process.env.TIP_TRAINING_REPO || "/tmp/tip-training-data"; const blogMegaRoot = process.env.BLOG_LLM_SOURCE_DIR || join(homedir(), "Desktop", "BlogLLM-v5-Mega-Training"); const outRoot = join(repoRoot, "training-data", "runpod"); @@ -128,11 +129,25 @@ function markdownBlog(path: string): Row[] { }]; } +function collectJsonlDir(dir: string, lane: Lane): Row[] { + if (!existsSync(dir)) return []; + const rows: Row[] = []; + for (const entry of readdirSync(dir, { withFileTypes: true })) { + const path = join(dir, entry.name); + if (entry.isDirectory()) rows.push(...collectJsonlDir(path, lane)); + else if (entry.isFile() && entry.name.endsWith(".jsonl")) rows.push(...jsonl(path, lane)); + } + return rows; +} + function collect(lane: Lane): Row[] { const rows: Row[] = []; for (const file of files[lane]) { for (const root of [externalRoot, blogMegaRoot, repoRoot]) rows.push(...jsonl(join(root, file), lane)); } + if (lane === "tip_llm") { + rows.push(...collectJsonlDir(join(giteaTrainingRoot, "qa-pairs"), lane)); + } if (lane === "blog_llm") { for (const dir of [join(repoRoot, "blog-training-data"), join(externalRoot, "v6-tip-blogs")]) { if (!existsSync(dir)) continue;