Compare commits
2 Commits
5a77fce9f3
...
a1a525b332
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a1a525b332 | ||
|
|
a8529d166b |
@ -281,3 +281,4 @@ Types: FEAT · FIX · UI · DATA · AI · INFRA
|
|||||||
{"d":"2026-04-26","t":"FEAT","m":"tip-llm-guided.ts: Structured inference engine for tip-llm-v1. Hard JSON schema, per-field validation, 2-retry repair loop with diff prompt, safe default fallback (create_finding=false). Temperature 0.1→0.05 on retry. Routes: POST /api/tip-llm/infer|research-plan|extract|finding, GET /api/tip-llm/health."}
|
{"d":"2026-04-26","t":"FEAT","m":"tip-llm-guided.ts: Structured inference engine for tip-llm-v1. Hard JSON schema, per-field validation, 2-retry repair loop with diff prompt, safe default fallback (create_finding=false). Temperature 0.1→0.05 on retry. Routes: POST /api/tip-llm/infer|research-plan|extract|finding, GET /api/tip-llm/health."}
|
||||||
{"d":"2026-04-28","t":"FIX","m":"Product verification pipeline: image crawls now mark image_verified/image_verified_url, scraped product pages mark details_verified/details_source_url, maintenance reconcile backfills old product URLs/images/details, and --backfill-images exposes the existing image crawler via scraper CLI. Migration 102 reconciles existing data."}
|
{"d":"2026-04-28","t":"FIX","m":"Product verification pipeline: image crawls now mark image_verified/image_verified_url, scraped product pages mark details_verified/details_source_url, maintenance reconcile backfills old product URLs/images/details, and --backfill-images exposes the existing image crawler via scraper CLI. Migration 102 reconciles existing data."}
|
||||||
{"d":"2026-04-28","t":"FIX","m":"Blog Engine Hot Topics: diversified ranking with refresh shuffle/source caps/already-created-topic demotion, plus richer LLM context briefings passed into topic expansion and master-draft context via custom_title/additional_context."}
|
{"d":"2026-04-28","t":"FIX","m":"Blog Engine Hot Topics: diversified ranking with refresh shuffle/source caps/already-created-topic demotion, plus richer LLM context briefings passed into topic expansion and master-draft context via custom_title/additional_context."}
|
||||||
|
{"d":"2026-04-29","t":"FEAT","m":"TIPLLM robot learning loop: verification robot controller writes status, TIPLLM plans, queue dry-runs/enqueues and crawler outcomes into the Gitea-backed TIP training pool; learning-pool build imports qa-pairs from TIP_TRAINING_REPO into the tip_llm lane. Removed hardcoded Gitea token fallback; existing git remotes or env tokens are used."}
|
||||||
|
|||||||
@ -31,3 +31,28 @@ Default private Hugging Face datasets:
|
|||||||
- `renefichtmueller/blog-llm-sft`
|
- `renefichtmueller/blog-llm-sft`
|
||||||
|
|
||||||
Local training is enabled by setting `TIP_LOCAL_TRAIN_COMMAND`; the API appends the lane name automatically.
|
Local training is enabled by setting `TIP_LOCAL_TRAIN_COMMAND`; the API appends the lane name automatically.
|
||||||
|
|
||||||
|
## TIPLLM Robot Experience Pool
|
||||||
|
|
||||||
|
Crawler and verification robots must use TIPLLM only for planning/extraction feedback. Operational experience is written to the Gitea-backed TIP training pool:
|
||||||
|
|
||||||
|
- Default local clone: `/tmp/tip-training-data`
|
||||||
|
- Override: `TIP_TRAINING_REPO=/path/to/tip-training-data`
|
||||||
|
- Gitea repo: `rene/tip-training-data`
|
||||||
|
- SFT records: `qa-pairs/robot-control-high.jsonl`
|
||||||
|
- Raw audit records: `robot-experiences/YYYY-MM-DD.jsonl`
|
||||||
|
|
||||||
|
Useful commands:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm run robots:verification -w packages/scraper -- --status
|
||||||
|
npm run robots:verification -w packages/scraper -- --tipllm-plan --limit=5
|
||||||
|
npm run robots:verification -w packages/scraper -- --enqueue=details-fast-lane --profile=erik-safe --dry-run
|
||||||
|
```
|
||||||
|
|
||||||
|
Safety defaults:
|
||||||
|
|
||||||
|
- `erik-safe` is the default profile and caps to 3 lightweight queues.
|
||||||
|
- Playwright/discovery work belongs on Proxmox or Pi workers, not Erik.
|
||||||
|
- Every status snapshot, TIPLLM plan, dry-run plan, enqueue result and crawler result should become a TIPLLM training example.
|
||||||
|
- `learning-pool:build` automatically imports Gitea pool SFT rows from `qa-pairs/` into the `tip_llm` lane.
|
||||||
|
|||||||
@ -25,6 +25,10 @@ interface HotTopic {
|
|||||||
data_context?: Record<string, unknown>;
|
data_context?: Record<string, unknown>;
|
||||||
suggested_angle?: string;
|
suggested_angle?: string;
|
||||||
date?: string;
|
date?: string;
|
||||||
|
blog_title_created?: boolean;
|
||||||
|
last_blog_created_at?: string;
|
||||||
|
rank_score?: number;
|
||||||
|
llm_context?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -32,10 +36,11 @@ interface HotTopic {
|
|||||||
*
|
*
|
||||||
* Returns dynamically ranked blog topics based on real signals.
|
* Returns dynamically ranked blog topics based on real signals.
|
||||||
*/
|
*/
|
||||||
hotTopicsRouter.get("/", async (_req, res) => {
|
hotTopicsRouter.get("/", async (req, res) => {
|
||||||
try {
|
try {
|
||||||
const topics: HotTopic[] = [];
|
const topics: HotTopic[] = [];
|
||||||
const year = new Date().getFullYear();
|
const year = new Date().getFullYear();
|
||||||
|
const limit = Math.max(1, Math.min(50, parseInt(String(req.query.limit || "20"), 10) || 20));
|
||||||
|
|
||||||
// ═══ SOURCE 1: Internal Data — Price Movements ═══
|
// ═══ SOURCE 1: Internal Data — Price Movements ═══
|
||||||
const priceDrops = await pool.query(`
|
const priceDrops = await pool.query(`
|
||||||
@ -246,9 +251,33 @@ hotTopicsRouter.get("/", async (_req, res) => {
|
|||||||
// ═══ SOURCE 7: Evergreen High-Value Topics ═══
|
// ═══ SOURCE 7: Evergreen High-Value Topics ═══
|
||||||
topics.push(...getEvergreenTopics(year));
|
topics.push(...getEvergreenTopics(year));
|
||||||
|
|
||||||
// Sort by urgency: breaking > hot > trending > emerging
|
// Mark already-created topics and rank with daily rotation + source diversity.
|
||||||
const urgencyOrder: Record<string, number> = { breaking: 0, hot: 1, trending: 2, emerging: 3 };
|
const recentDrafts = await pool.query(`
|
||||||
topics.sort((a, b) => (urgencyOrder[a.urgency] ?? 4) - (urgencyOrder[b.urgency] ?? 4));
|
SELECT title, created_at
|
||||||
|
FROM blog_drafts
|
||||||
|
WHERE created_at > NOW() - INTERVAL '180 days'
|
||||||
|
ORDER BY created_at DESC
|
||||||
|
`).catch(() => ({ rows: [] }));
|
||||||
|
|
||||||
|
const createdByTitle = new Map<string, string>();
|
||||||
|
for (const draft of recentDrafts.rows) {
|
||||||
|
const key = normalizeTopicTitle(String(draft.title || ""));
|
||||||
|
if (key && !createdByTitle.has(key)) {
|
||||||
|
createdByTitle.set(key, draft.created_at ? new Date(draft.created_at).toISOString() : new Date().toISOString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const daySeed = getDaySeed();
|
||||||
|
const rotationSeed = daySeed + getQuerySeed(req.query.shuffle);
|
||||||
|
for (const topic of topics) {
|
||||||
|
const createdAt = createdByTitle.get(normalizeTopicTitle(topic.title));
|
||||||
|
topic.blog_title_created = Boolean(createdAt);
|
||||||
|
topic.last_blog_created_at = createdAt;
|
||||||
|
topic.rank_score = scoreTopic(topic, rotationSeed);
|
||||||
|
topic.llm_context = buildTopicBriefing(topic);
|
||||||
|
}
|
||||||
|
|
||||||
|
const rankedTopics = selectDiverseTopics(topics, limit);
|
||||||
|
|
||||||
// Next daily rotation: tomorrow 00:00 UTC
|
// Next daily rotation: tomorrow 00:00 UTC
|
||||||
const tomorrow = new Date();
|
const tomorrow = new Date();
|
||||||
@ -256,11 +285,12 @@ hotTopicsRouter.get("/", async (_req, res) => {
|
|||||||
tomorrow.setUTCHours(0, 0, 0, 0);
|
tomorrow.setUTCHours(0, 0, 0, 0);
|
||||||
|
|
||||||
res.json({
|
res.json({
|
||||||
topics: topics.slice(0, 20),
|
topics: rankedTopics,
|
||||||
total: topics.length,
|
total: topics.length,
|
||||||
generated_at: new Date().toISOString(),
|
generated_at: new Date().toISOString(),
|
||||||
refreshes_at: tomorrow.toISOString(),
|
refreshes_at: tomorrow.toISOString(),
|
||||||
day_seed: getDaySeed(),
|
day_seed: daySeed,
|
||||||
|
rotation_seed: rotationSeed,
|
||||||
sources: ["market_intelligence", "nog_talks", "internal_price_data", "competitor_alerts", "hype_cycle_model", "news_articles", "conference_calendar", "research_papers"],
|
sources: ["market_intelligence", "nog_talks", "internal_price_data", "competitor_alerts", "hype_cycle_model", "news_articles", "conference_calendar", "research_papers"],
|
||||||
});
|
});
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
@ -269,6 +299,131 @@ hotTopicsRouter.get("/", async (_req, res) => {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
function normalizeTopicTitle(title: string): string {
|
||||||
|
return title
|
||||||
|
.toLowerCase()
|
||||||
|
.replace(/\b20\d{2}\b/g, "{year}")
|
||||||
|
.replace(/[^a-z0-9]+/g, " ")
|
||||||
|
.replace(/\s+/g, " ")
|
||||||
|
.trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
function seededTopicJitter(title: string, seed: number): number {
|
||||||
|
let s = seed;
|
||||||
|
const normalized = normalizeTopicTitle(title);
|
||||||
|
for (let i = 0; i < normalized.length; i++) {
|
||||||
|
s = (s * 1664525 + normalized.charCodeAt(i) + 1013904223) & 0x7fffffff;
|
||||||
|
}
|
||||||
|
return s % 140;
|
||||||
|
}
|
||||||
|
|
||||||
|
function getQuerySeed(value: unknown): number {
|
||||||
|
if (!value) return 0;
|
||||||
|
const raw = Array.isArray(value) ? String(value[0] || "") : String(value);
|
||||||
|
if (!raw) return 0;
|
||||||
|
let hash = 0;
|
||||||
|
for (let i = 0; i < raw.length; i++) {
|
||||||
|
hash = ((hash << 5) - hash + raw.charCodeAt(i)) | 0;
|
||||||
|
}
|
||||||
|
return Math.abs(hash % 100000);
|
||||||
|
}
|
||||||
|
|
||||||
|
function compactDataContext(data: Record<string, unknown> | undefined): string {
|
||||||
|
if (!data) return "";
|
||||||
|
|
||||||
|
const replacer = (_key: string, value: unknown) => {
|
||||||
|
if (Array.isArray(value)) return value.slice(0, 5);
|
||||||
|
if (typeof value === "string" && value.length > 260) return `${value.slice(0, 257)}...`;
|
||||||
|
return value;
|
||||||
|
};
|
||||||
|
|
||||||
|
return JSON.stringify(data, replacer, 2).slice(0, 1800);
|
||||||
|
}
|
||||||
|
|
||||||
|
function buildTopicBriefing(topic: HotTopic): string {
|
||||||
|
const lines = [
|
||||||
|
`Topic: ${topic.title}`,
|
||||||
|
`Urgency: ${topic.urgency}`,
|
||||||
|
`Source: ${topic.source_type} / ${topic.source}`,
|
||||||
|
];
|
||||||
|
|
||||||
|
if (topic.date) lines.push(`Signal date: ${topic.date}`);
|
||||||
|
if (topic.description) lines.push(`Signal summary: ${topic.description}`);
|
||||||
|
if (topic.suggested_angle) lines.push(`Recommended angle: ${topic.suggested_angle}`);
|
||||||
|
if (topic.blog_title_created && topic.last_blog_created_at) {
|
||||||
|
lines.push(`Editorial note: A blog with a very similar title already exists from ${topic.last_blog_created_at}. If used anyway, choose a materially different angle.`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const dataContext = compactDataContext(topic.data_context);
|
||||||
|
if (dataContext) lines.push(`Structured supporting data:\n${dataContext}`);
|
||||||
|
|
||||||
|
lines.push("Editorial instruction: turn this into a practical optical networking article with procurement/engineering consequences, not a generic news summary.");
|
||||||
|
return lines.join("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
function scoreTopic(topic: HotTopic, seed: number): number {
|
||||||
|
const urgencyScore: Record<HotTopic["urgency"], number> = {
|
||||||
|
breaking: 900,
|
||||||
|
hot: 760,
|
||||||
|
trending: 620,
|
||||||
|
emerging: 500,
|
||||||
|
};
|
||||||
|
const sourceScore: Record<HotTopic["source_type"], number> = {
|
||||||
|
internal_data: 90,
|
||||||
|
competitor: 85,
|
||||||
|
trade_press: 75,
|
||||||
|
conference: 70,
|
||||||
|
manufacturer: 65,
|
||||||
|
research: 60,
|
||||||
|
};
|
||||||
|
|
||||||
|
let freshness = 0;
|
||||||
|
if (topic.date) {
|
||||||
|
const ageDays = Math.max(0, (Date.now() - new Date(topic.date).getTime()) / 86400000);
|
||||||
|
freshness = Math.max(0, 90 - ageDays * 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
const createdPenalty = topic.blog_title_created ? -950 : 0;
|
||||||
|
return (
|
||||||
|
(urgencyScore[topic.urgency] ?? 400) +
|
||||||
|
(sourceScore[topic.source_type] ?? 40) +
|
||||||
|
freshness +
|
||||||
|
seededTopicJitter(topic.title, seed) +
|
||||||
|
createdPenalty
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
function selectDiverseTopics(topics: HotTopic[], limit: number): HotTopic[] {
|
||||||
|
const sorted = [...topics].sort((a, b) => (b.rank_score ?? 0) - (a.rank_score ?? 0));
|
||||||
|
const selected: HotTopic[] = [];
|
||||||
|
const sourceTypeCount = new Map<string, number>();
|
||||||
|
const sourceCount = new Map<string, number>();
|
||||||
|
|
||||||
|
for (const topic of sorted) {
|
||||||
|
if (selected.length >= limit) break;
|
||||||
|
const sourceType = topic.source_type;
|
||||||
|
const source = topic.source || "unknown";
|
||||||
|
const typeCount = sourceTypeCount.get(sourceType) ?? 0;
|
||||||
|
const srcCount = sourceCount.get(source) ?? 0;
|
||||||
|
|
||||||
|
if (typeCount >= 5) continue;
|
||||||
|
if (srcCount >= 3) continue;
|
||||||
|
|
||||||
|
selected.push(topic);
|
||||||
|
sourceTypeCount.set(sourceType, typeCount + 1);
|
||||||
|
sourceCount.set(source, srcCount + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (selected.length < limit) {
|
||||||
|
for (const topic of sorted) {
|
||||||
|
if (selected.length >= limit) break;
|
||||||
|
if (!selected.includes(topic)) selected.push(topic);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return selected;
|
||||||
|
}
|
||||||
|
|
||||||
function detectNewsTheme(title: string): string {
|
function detectNewsTheme(title: string): string {
|
||||||
const tl = title.toLowerCase();
|
const tl = title.toLowerCase();
|
||||||
if (tl.includes("800g") || tl.includes("osfp")) return "800G Deployment Wave";
|
if (tl.includes("800g") || tl.includes("osfp")) return "800G Deployment Wave";
|
||||||
|
|||||||
@ -37,7 +37,7 @@
|
|||||||
'<div style="background:linear-gradient(135deg,#1a1a1a,#2a2a2a);color:white;padding:2rem;border-radius:12px;text-align:center;margin-bottom:1rem">' +
|
'<div style="background:linear-gradient(135deg,#1a1a1a,#2a2a2a);color:white;padding:2rem;border-radius:12px;text-align:center;margin-bottom:1rem">' +
|
||||||
'<div style="font-size:1.4rem;font-weight:700;margin-bottom:1rem">Generating Blog with AI...</div>' +
|
'<div style="font-size:1.4rem;font-weight:700;margin-bottom:1rem">Generating Blog with AI...</div>' +
|
||||||
'<div id="bp-status" style="font-size:1rem;color:#FF8100;margin-bottom:0.5rem">Starting 10-step Flexoptix Style pipeline...</div>' +
|
'<div id="bp-status" style="font-size:1rem;color:#FF8100;margin-bottom:0.5rem">Starting 10-step Flexoptix Style pipeline...</div>' +
|
||||||
'<div id="bp-step" style="font-size:0.85rem;color:#aaa">Connecting to LLM (qwen2.5:14b)</div>' +
|
'<div id="bp-step" style="font-size:0.85rem;color:#aaa">Connecting to FO_BlogLLM (fo-blog-v7)</div>' +
|
||||||
'<div style="margin-top:1.5rem;background:#333;border-radius:8px;height:8px;overflow:hidden">' +
|
'<div style="margin-top:1.5rem;background:#333;border-radius:8px;height:8px;overflow:hidden">' +
|
||||||
'<div id="bp-bar" style="width:2%;height:100%;background:#FF8100;transition:width 0.5s ease"></div></div>' +
|
'<div id="bp-bar" style="width:2%;height:100%;background:#FF8100;transition:width 0.5s ease"></div></div>' +
|
||||||
'<div id="bp-pct" style="font-size:0.8rem;color:#666;margin-top:0.5rem">0%</div>' +
|
'<div id="bp-pct" style="font-size:0.8rem;color:#666;margin-top:0.5rem">0%</div>' +
|
||||||
@ -46,8 +46,8 @@
|
|||||||
|
|
||||||
var body = { topic: topic };
|
var body = { topic: topic };
|
||||||
if (speed) body.speed = speed;
|
if (speed) body.speed = speed;
|
||||||
if (customTitle) body.customTitle = customTitle;
|
if (customTitle) body.custom_title = customTitle;
|
||||||
if (customAngle) body.customAngle = customAngle;
|
if (customAngle) body.additional_context = customAngle;
|
||||||
|
|
||||||
fetch(API + '/api/blog/generate', {
|
fetch(API + '/api/blog/generate', {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
@ -137,7 +137,7 @@
|
|||||||
if (bar) bar.style.width = prog.pct + '%';
|
if (bar) bar.style.width = prog.pct + '%';
|
||||||
if (pct) pct.textContent = prog.pct + '%';
|
if (pct) pct.textContent = prog.pct + '%';
|
||||||
if (status) { status.style.color = '#FF8100'; status.textContent = prog.label || ('Step ' + prog.step + '/10'); }
|
if (status) { status.style.color = '#FF8100'; status.textContent = prog.label || ('Step ' + prog.step + '/10'); }
|
||||||
if (step) step.textContent = 'Step ' + prog.step + '/10 · qwen2.5:14b via Ollama';
|
if (step) step.textContent = 'Step ' + prog.step + '/10 · fo-blog-v7 via adapter bridge';
|
||||||
} else {
|
} else {
|
||||||
_stallCount++;
|
_stallCount++;
|
||||||
// After 5 consecutive non-running polls (~40s), show stall warning
|
// After 5 consecutive non-running polls (~40s), show stall warning
|
||||||
@ -185,7 +185,8 @@
|
|||||||
if (!grid) return;
|
if (!grid) return;
|
||||||
grid.innerHTML = '<div class="loading pulse">Discovering hot topics...</div>';
|
grid.innerHTML = '<div class="loading pulse">Discovering hot topics...</div>';
|
||||||
|
|
||||||
fetch(API + '/api/hot-topics', { headers: authHeaders() })
|
var shuffle = Date.now().toString(36);
|
||||||
|
fetch(API + '/api/hot-topics?limit=20&shuffle=' + encodeURIComponent(shuffle), { headers: authHeaders({ 'Cache-Control': 'no-cache' }) })
|
||||||
.then(function(r) { return r.json(); })
|
.then(function(r) { return r.json(); })
|
||||||
.then(function(data) {
|
.then(function(data) {
|
||||||
if (!data.topics || data.topics.length === 0) {
|
if (!data.topics || data.topics.length === 0) {
|
||||||
@ -196,7 +197,7 @@
|
|||||||
if (subtitle && data.refreshes_at) {
|
if (subtitle && data.refreshes_at) {
|
||||||
var nextRefresh = new Date(data.refreshes_at);
|
var nextRefresh = new Date(data.refreshes_at);
|
||||||
var hoursLeft = Math.round((nextRefresh - new Date()) / 3600000);
|
var hoursLeft = Math.round((nextRefresh - new Date()) / 3600000);
|
||||||
subtitle.textContent = data.total + ' topics · rotates daily · next refresh in ' + hoursLeft + 'h · sources: ' + (data.sources || []).join(', ');
|
subtitle.textContent = data.total + ' topics · refresh reshuffles · daily base rotation in ' + hoursLeft + 'h · sources: ' + (data.sources || []).join(', ');
|
||||||
}
|
}
|
||||||
|
|
||||||
var colors = { breaking: '#c1121f', hot: '#FF8100', trending: '#e6a800', emerging: '#2d6a4f' };
|
var colors = { breaking: '#c1121f', hot: '#FF8100', trending: '#e6a800', emerging: '#2d6a4f' };
|
||||||
@ -227,7 +228,7 @@
|
|||||||
window._generateFromHotTopic = function(cardId) {
|
window._generateFromHotTopic = function(cardId) {
|
||||||
var t = window['_ht_' + cardId];
|
var t = window['_ht_' + cardId];
|
||||||
if (!t) return;
|
if (!t) return;
|
||||||
generateBlog(t.blog_type || 'hype_cycle', null, t.title, t.suggested_angle || t.description);
|
generateBlog(t.blog_type || 'hype_cycle', null, t.title, t.llm_context || t.suggested_angle || t.description);
|
||||||
};
|
};
|
||||||
|
|
||||||
// Auto-load hot topics when blog tab activates
|
// Auto-load hot topics when blog tab activates
|
||||||
|
|||||||
@ -371,12 +371,15 @@ async function main() {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// --- Ollama LLM tools: market analysis (qwen2.5:14b) + blog generation (fo-blog-v5) ---
|
// --- Ollama-compatible LLM tools: market analysis (TIP_LLM) + blog generation (FO_BlogLLM) ---
|
||||||
const OLLAMA_BASE = process.env["OLLAMA_BASE_URL"] ?? "https://ollama.fichtmueller.org";
|
const OLLAMA_BASE = process.env["OLLAMA_BASE_URL"] ?? "https://ollama.fichtmueller.org";
|
||||||
|
const TIP_LLM_MODEL = process.env["TIP_LLM_MODEL"] ?? "tip-llm-v1";
|
||||||
|
const BLOG_LLM_MODEL = process.env["BLOG_LLM_MODEL"] ?? "fo-blog-v7";
|
||||||
|
const BLOG_LLM_FALLBACK = process.env["BLOG_LLM_FALLBACK_MODEL"] ?? "qwen2.5:14b";
|
||||||
|
|
||||||
server.tool(
|
server.tool(
|
||||||
"analyze_market_with_llm",
|
"analyze_market_with_llm",
|
||||||
"Deep market analysis for a transceiver technology using local LLM (qwen2.5:14b). Provides expert narrative on adoption trends, pricing trajectory, competitive dynamics, and buy/wait/hold recommendation.",
|
"Deep market analysis for a transceiver technology using TIP_LLM. Provides expert narrative on adoption trends, pricing trajectory, competitive dynamics, and buy/wait/hold recommendation.",
|
||||||
{
|
{
|
||||||
technology: z.string().describe("Technology to analyze, e.g. '400G QSFP-DD', '800G OSFP', '100G ZR'"),
|
technology: z.string().describe("Technology to analyze, e.g. '400G QSFP-DD', '800G OSFP', '100G ZR'"),
|
||||||
context: z.string().optional().describe("Additional context or specific questions to address"),
|
context: z.string().optional().describe("Additional context or specific questions to address"),
|
||||||
@ -435,7 +438,7 @@ Keep the analysis actionable and data-driven. Under 400 words.`;
|
|||||||
const resp = await fetch(`${OLLAMA_BASE}/api/generate`, {
|
const resp = await fetch(`${OLLAMA_BASE}/api/generate`, {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: { "Content-Type": "application/json" },
|
headers: { "Content-Type": "application/json" },
|
||||||
body: JSON.stringify({ model: "qwen2.5:14b", prompt, stream: false }),
|
body: JSON.stringify({ model: TIP_LLM_MODEL, prompt, stream: false }),
|
||||||
signal: AbortSignal.timeout(120_000),
|
signal: AbortSignal.timeout(120_000),
|
||||||
});
|
});
|
||||||
if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}`);
|
if (!resp.ok) throw new Error(`Ollama HTTP ${resp.status}`);
|
||||||
@ -449,7 +452,7 @@ Keep the analysis actionable and data-driven. Under 400 words.`;
|
|||||||
|
|
||||||
server.tool(
|
server.tool(
|
||||||
"generate_blog_post",
|
"generate_blog_post",
|
||||||
"Generate a professional Flexoptix blog post using the fine-tuned fo-blog-v5 model (Ollama). Automatically enriched with live pricing, hype cycle data, and competitor analysis.",
|
"Generate a professional Flexoptix blog post using the latest FO_BlogLLM model. Automatically enriched with live pricing, hype cycle data, and competitor analysis.",
|
||||||
{
|
{
|
||||||
topic: z.string().describe("Blog topic, e.g. '400G QSFP-DD vs 400G ZR — which for your DC?'"),
|
topic: z.string().describe("Blog topic, e.g. '400G QSFP-DD vs 400G ZR — which for your DC?'"),
|
||||||
target_audience: z.enum(["network_engineer", "procurement", "executive", "general"]).default("network_engineer").describe("Target reader"),
|
target_audience: z.enum(["network_engineer", "procurement", "executive", "general"]).default("network_engineer").describe("Target reader"),
|
||||||
@ -492,7 +495,7 @@ Do not include a title (added separately). Start directly with the article body.
|
|||||||
method: "POST",
|
method: "POST",
|
||||||
headers: { "Content-Type": "application/json" },
|
headers: { "Content-Type": "application/json" },
|
||||||
body: JSON.stringify({
|
body: JSON.stringify({
|
||||||
model: "fo-blog-v5",
|
model: BLOG_LLM_MODEL,
|
||||||
messages: [
|
messages: [
|
||||||
{ role: "system", content: systemPrompt },
|
{ role: "system", content: systemPrompt },
|
||||||
{ role: "user", content: userPrompt },
|
{ role: "user", content: userPrompt },
|
||||||
@ -502,12 +505,12 @@ Do not include a title (added separately). Start directly with the article body.
|
|||||||
signal: AbortSignal.timeout(180_000),
|
signal: AbortSignal.timeout(180_000),
|
||||||
});
|
});
|
||||||
if (!resp.ok) {
|
if (!resp.ok) {
|
||||||
// Fallback to qwen2.5:14b if fo-blog-v5 not available
|
// Fallback to generic local model if FO_BlogLLM is unavailable
|
||||||
const fallbackResp = await fetch(`${OLLAMA_BASE}/api/chat`, {
|
const fallbackResp = await fetch(`${OLLAMA_BASE}/api/chat`, {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: { "Content-Type": "application/json" },
|
headers: { "Content-Type": "application/json" },
|
||||||
body: JSON.stringify({
|
body: JSON.stringify({
|
||||||
model: "qwen2.5:14b",
|
model: BLOG_LLM_FALLBACK,
|
||||||
messages: [
|
messages: [
|
||||||
{ role: "system", content: systemPrompt },
|
{ role: "system", content: systemPrompt },
|
||||||
{ role: "user", content: userPrompt },
|
{ role: "user", content: userPrompt },
|
||||||
@ -516,9 +519,9 @@ Do not include a title (added separately). Start directly with the article body.
|
|||||||
}),
|
}),
|
||||||
signal: AbortSignal.timeout(180_000),
|
signal: AbortSignal.timeout(180_000),
|
||||||
});
|
});
|
||||||
if (!fallbackResp.ok) throw new Error(`Both fo-blog-v5 and qwen2.5:14b unavailable`);
|
if (!fallbackResp.ok) throw new Error(`Both ${BLOG_LLM_MODEL} and ${BLOG_LLM_FALLBACK} unavailable`);
|
||||||
const fallbackData = await fallbackResp.json() as { message?: { content?: string } };
|
const fallbackData = await fallbackResp.json() as { message?: { content?: string } };
|
||||||
return { content: [{ type: "text" as const, text: `[Generated with qwen2.5:14b — fo-blog-v5 unavailable]\n\n${fallbackData.message?.content ?? "No content"}` }] };
|
return { content: [{ type: "text" as const, text: `[Generated with ${BLOG_LLM_FALLBACK} — ${BLOG_LLM_MODEL} unavailable]\n\n${fallbackData.message?.content ?? "No content"}` }] };
|
||||||
}
|
}
|
||||||
const data = await resp.json() as { message?: { content?: string } };
|
const data = await resp.json() as { message?: { content?: string } };
|
||||||
return { content: [{ type: "text" as const, text: data.message?.content ?? "No content generated." }] };
|
return { content: [{ type: "text" as const, text: data.message?.content ?? "No content generated." }] };
|
||||||
|
|||||||
@ -11,7 +11,8 @@
|
|||||||
"scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
|
"scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
|
||||||
"scrape:optcore": "tsx src/scrapers/optcore.ts",
|
"scrape:optcore": "tsx src/scrapers/optcore.ts",
|
||||||
"scrape:news": "tsx src/scrapers/news.ts",
|
"scrape:news": "tsx src/scrapers/news.ts",
|
||||||
"scrape:all": "tsx src/index.ts --all"
|
"scrape:all": "tsx src/index.ts --all",
|
||||||
|
"robots:verification": "tsx src/robots/verification-robots.ts"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"crawlee": "^3.12.0",
|
"crawlee": "^3.12.0",
|
||||||
|
|||||||
@ -15,7 +15,7 @@ import { VENDOR_PROFILES } from "./stock-schema";
|
|||||||
import { validateStockExtraction } from "./validator";
|
import { validateStockExtraction } from "./validator";
|
||||||
|
|
||||||
const OLLAMA_HOST = process.env.OLLAMA_HOST || "http://192.168.178.169:11434";
|
const OLLAMA_HOST = process.env.OLLAMA_HOST || "http://192.168.178.169:11434";
|
||||||
const OLLAMA_MODEL = process.env.CRAWLER_LLM_MODEL || "qwen2.5:14b";
|
const OLLAMA_MODEL = process.env.CRAWLER_LLM_MODEL || "tip-llm-v1";
|
||||||
const MAX_HTML_CHARS = 12_000; // truncate to keep prompt manageable
|
const MAX_HTML_CHARS = 12_000; // truncate to keep prompt manageable
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
import { execSync } from "child_process";
|
import { execSync } from "child_process";
|
||||||
import { appendFileSync, mkdirSync, existsSync } from "fs";
|
import { appendFileSync, mkdirSync, existsSync, writeFileSync } from "fs";
|
||||||
import { join } from "path";
|
import { join } from "path";
|
||||||
import { createHash } from "crypto";
|
import { createHash } from "crypto";
|
||||||
|
|
||||||
@ -28,8 +28,9 @@ import type { CombinedValidationResult } from "./spec-validator";
|
|||||||
// ─────────────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
const REPO_DIR = process.env.TIP_TRAINING_REPO || "/tmp/tip-training-data";
|
const REPO_DIR = process.env.TIP_TRAINING_REPO || "/tmp/tip-training-data";
|
||||||
const GITEA_TOKEN = process.env.GITEA_TOKEN || "0e758f30abf86ffb49b2d7bb5b1f0be12c7f0b46";
|
const GITEA_TOKEN = process.env.GITEA_TOKEN || "";
|
||||||
const GITEA_BASE = "http://192.168.178.196:3000";
|
const GITEA_BASE = process.env.GITEA_BASE || "http://192.168.178.196:3000";
|
||||||
|
const GITEA_REPO = process.env.TIP_TRAINING_GITEA_REPO || "rene/tip-training-data";
|
||||||
|
|
||||||
// Minimum confidence for a spec to enter the high-quality training set
|
// Minimum confidence for a spec to enter the high-quality training set
|
||||||
const MIN_CONFIDENCE_HIGH = 0.75;
|
const MIN_CONFIDENCE_HIGH = 0.75;
|
||||||
@ -67,6 +68,20 @@ export interface CrawlExtraction {
|
|||||||
crawled_at: string; // ISO timestamp
|
crawled_at: string; // ISO timestamp
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export interface RobotExperience {
|
||||||
|
event: "status" | "tipllm_plan" | "queue_plan" | "queue_enqueued" | "queue_rejected" | "crawler_result";
|
||||||
|
observed_at: string;
|
||||||
|
actor: string;
|
||||||
|
profile?: string;
|
||||||
|
wave?: string;
|
||||||
|
vendor?: string;
|
||||||
|
summary: string;
|
||||||
|
input: Record<string, unknown>;
|
||||||
|
decision: Record<string, unknown>;
|
||||||
|
outcome?: Record<string, unknown>;
|
||||||
|
safety_notes?: string[];
|
||||||
|
}
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
// Helpers
|
// Helpers
|
||||||
// ─────────────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
@ -94,7 +109,39 @@ function ensureDir(dir: string): void {
|
|||||||
if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
|
if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function ensureTrainingRepoLayout(): void {
|
||||||
|
ensureDir(REPO_DIR);
|
||||||
|
ensureDir(join(REPO_DIR, "qa-pairs"));
|
||||||
|
ensureDir(join(REPO_DIR, "robot-experiences"));
|
||||||
|
ensureDir(join(REPO_DIR, "crawl-extractions"));
|
||||||
|
ensureDir(join(REPO_DIR, "validated-specs"));
|
||||||
|
|
||||||
|
const readme = join(REPO_DIR, "README.md");
|
||||||
|
if (!existsSync(readme)) {
|
||||||
|
writeFileSync(
|
||||||
|
readme,
|
||||||
|
[
|
||||||
|
"# TIP Training Data Pool",
|
||||||
|
"",
|
||||||
|
"This repository stores TIPLLM learning records produced by TIP crawlers, scrapers and verification robots.",
|
||||||
|
"",
|
||||||
|
"Primary lanes:",
|
||||||
|
"",
|
||||||
|
"- `qa-pairs/`: supervised fine-tuning JSONL records with chat messages.",
|
||||||
|
"- `robot-experiences/`: raw operational experience logs for audit and future dataset builders.",
|
||||||
|
"- `crawl-extractions/`: crawler extraction audit logs.",
|
||||||
|
"- `validated-specs/`: validated structured transceiver specifications.",
|
||||||
|
"",
|
||||||
|
"Only TIPLLM should be used for crawler planning and extraction feedback in this lane.",
|
||||||
|
"",
|
||||||
|
].join("\n"),
|
||||||
|
"utf8",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
function appendRecord(filePath: string, record: SftRecord): void {
|
function appendRecord(filePath: string, record: SftRecord): void {
|
||||||
|
ensureTrainingRepoLayout();
|
||||||
appendFileSync(filePath, JSON.stringify(record) + "\n", "utf8");
|
appendFileSync(filePath, JSON.stringify(record) + "\n", "utf8");
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -216,6 +263,7 @@ function getOutputFile(type: "spec_qa" | "crawl_reasoning" | "validation" | "dis
|
|||||||
}
|
}
|
||||||
|
|
||||||
function getRawExtractionFile(vendorSlug: string): string {
|
function getRawExtractionFile(vendorSlug: string): string {
|
||||||
|
ensureTrainingRepoLayout();
|
||||||
const dir = join(REPO_DIR, "crawl-extractions", vendorSlug);
|
const dir = join(REPO_DIR, "crawl-extractions", vendorSlug);
|
||||||
ensureDir(dir);
|
ensureDir(dir);
|
||||||
const date = new Date().toISOString().split("T")[0];
|
const date = new Date().toISOString().split("T")[0];
|
||||||
@ -223,6 +271,7 @@ function getRawExtractionFile(vendorSlug: string): string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function getValidatedSpecFile(tier: string): string {
|
function getValidatedSpecFile(tier: string): string {
|
||||||
|
ensureTrainingRepoLayout();
|
||||||
const dir = join(REPO_DIR, "validated-specs");
|
const dir = join(REPO_DIR, "validated-specs");
|
||||||
ensureDir(dir);
|
ensureDir(dir);
|
||||||
return join(dir, `${tier}.jsonl`);
|
return join(dir, `${tier}.jsonl`);
|
||||||
@ -251,12 +300,17 @@ function gitCommit(message: string): void {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function gitPush(): void {
|
function gitPush(): void {
|
||||||
const remote = `http://rene:${GITEA_TOKEN}@${GITEA_BASE.replace("http://", "")}/rene/tip-training-data.git`;
|
if (!GITEA_TOKEN) {
|
||||||
|
execSync("git push origin main", { cwd: REPO_DIR, stdio: "pipe" });
|
||||||
|
} else {
|
||||||
|
const remote = `http://rene:${GITEA_TOKEN}@${GITEA_BASE.replace("http://", "")}/${GITEA_REPO}.git`;
|
||||||
execSync(`git push "${remote}" main`, { cwd: REPO_DIR, stdio: "pipe" });
|
execSync(`git push "${remote}" main`, { cwd: REPO_DIR, stdio: "pipe" });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export function flushToGitea(label = "batch"): void {
|
export function flushToGitea(label = "batch"): void {
|
||||||
try {
|
try {
|
||||||
|
ensureTrainingRepoLayout();
|
||||||
gitAddAll();
|
gitAddAll();
|
||||||
gitCommit(`crawl: add ${label} training records [${new Date().toISOString()}]`);
|
gitCommit(`crawl: add ${label} training records [${new Date().toISOString()}]`);
|
||||||
gitPush();
|
gitPush();
|
||||||
@ -267,6 +321,67 @@ export function flushToGitea(label = "batch"): void {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function makeRobotPair(experience: RobotExperience): SftRecord {
|
||||||
|
const prompt = [
|
||||||
|
"You are TIPLLM controlling TIP crawler and verification robots.",
|
||||||
|
"Given this current observation, choose the safest next robot action.",
|
||||||
|
"",
|
||||||
|
JSON.stringify({
|
||||||
|
event: experience.event,
|
||||||
|
actor: experience.actor,
|
||||||
|
profile: experience.profile,
|
||||||
|
wave: experience.wave,
|
||||||
|
vendor: experience.vendor,
|
||||||
|
input: experience.input,
|
||||||
|
safety_notes: experience.safety_notes ?? [],
|
||||||
|
}, null, 2),
|
||||||
|
].join("\n");
|
||||||
|
|
||||||
|
const response = {
|
||||||
|
summary: experience.summary,
|
||||||
|
decision: experience.decision,
|
||||||
|
outcome: experience.outcome ?? null,
|
||||||
|
constraints: [
|
||||||
|
"Use TIPLLM only for planning/extraction feedback.",
|
||||||
|
"Do not start heavy crawler waves on Erik.",
|
||||||
|
"Prefer small, observable queue batches.",
|
||||||
|
"Write useful outcomes back into the TIPLLM training pool.",
|
||||||
|
],
|
||||||
|
};
|
||||||
|
|
||||||
|
return {
|
||||||
|
id: makeId("robot-exp", JSON.stringify(experience)),
|
||||||
|
source: `robot-experience:${experience.event}:${experience.actor}`,
|
||||||
|
kind: "sft-jsonl",
|
||||||
|
messages: [
|
||||||
|
{ role: "system", content: SYSTEM_PROMPT },
|
||||||
|
{ role: "user", content: prompt },
|
||||||
|
{ role: "assistant", content: JSON.stringify(response, null, 2) },
|
||||||
|
],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function getRobotExperienceFile(): string {
|
||||||
|
ensureTrainingRepoLayout();
|
||||||
|
const date = new Date().toISOString().split("T")[0];
|
||||||
|
return join(REPO_DIR, "robot-experiences", `${date}.jsonl`);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write one operational crawler/robot experience to the TIPLLM training pool.
|
||||||
|
* This produces both a raw audit row and an SFT chat row under qa-pairs.
|
||||||
|
*/
|
||||||
|
export function writeRobotExperience(experience: RobotExperience, options: { flush?: boolean } = {}): void {
|
||||||
|
ensureTrainingRepoLayout();
|
||||||
|
appendFileSync(getRobotExperienceFile(), JSON.stringify(experience) + "\n", "utf8");
|
||||||
|
appendRecord(join(REPO_DIR, "qa-pairs", "robot-control-high.jsonl"), makeRobotPair(experience));
|
||||||
|
pendingChanges += 2;
|
||||||
|
|
||||||
|
if (options.flush || pendingChanges >= BATCH_COMMIT_THRESHOLD) {
|
||||||
|
flushToGitea(`robot-${experience.event}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
// Public API
|
// Public API
|
||||||
// ─────────────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|||||||
481
packages/scraper/src/robots/verification-robots.ts
Normal file
481
packages/scraper/src/robots/verification-robots.ts
Normal file
@ -0,0 +1,481 @@
|
|||||||
|
/**
|
||||||
|
* Verification Robot Command Center
|
||||||
|
*
|
||||||
|
* Read-only status:
|
||||||
|
* tsx src/robots/verification-robots.ts --status
|
||||||
|
*
|
||||||
|
* Enqueue targeted crawler waves:
|
||||||
|
* tsx src/robots/verification-robots.ts --enqueue=details-fast-lane --profile=erik-safe
|
||||||
|
* tsx src/robots/verification-robots.ts --enqueue=priority-vendors --profile=pi-fetch
|
||||||
|
* tsx src/robots/verification-robots.ts --enqueue=all --profile=proxmox-heavy
|
||||||
|
*
|
||||||
|
* Dry run:
|
||||||
|
* tsx src/robots/verification-robots.ts --enqueue=all --dry-run
|
||||||
|
*
|
||||||
|
* TIPLLM planning, no crawl load:
|
||||||
|
* tsx src/robots/verification-robots.ts --tipllm-plan --limit=5
|
||||||
|
*/
|
||||||
|
import { config } from "dotenv";
|
||||||
|
import { join } from "path";
|
||||||
|
import PgBoss from "pg-boss";
|
||||||
|
import { pool } from "../utils/db";
|
||||||
|
import { writeRobotExperience } from "../crawler-llm/training-data-writer";
|
||||||
|
|
||||||
|
config({ path: join(__dirname, "..", "..", "..", "..", ".env") });
|
||||||
|
|
||||||
|
type RobotWave = "details-fast-lane" | "priority-vendors" | "all";
|
||||||
|
type RobotProfile = "erik-safe" | "pi-fetch" | "proxmox-heavy";
|
||||||
|
|
||||||
|
interface VendorBlockerRow {
|
||||||
|
vendor: string;
|
||||||
|
total: string;
|
||||||
|
missing_image: string;
|
||||||
|
missing_details: string;
|
||||||
|
missing_price: string;
|
||||||
|
near_full_missing_details: string;
|
||||||
|
near_full_missing_image: string;
|
||||||
|
not_fully_verified: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface TipLlmResearchPlanResponse {
|
||||||
|
success?: boolean;
|
||||||
|
data?: {
|
||||||
|
search_patterns?: string[];
|
||||||
|
extraction_fields?: Array<{ field: string; selector_or_pattern: string; example?: string }>;
|
||||||
|
relevance_rules?: string[];
|
||||||
|
confidence?: number;
|
||||||
|
summary?: string;
|
||||||
|
};
|
||||||
|
error?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
const connectionString = `postgres://${process.env.POSTGRES_USER || "tip"}:${process.env.POSTGRES_PASSWORD || "tip_dev_2026"}@${process.env.POSTGRES_HOST || "localhost"}:${process.env.POSTGRES_PORT || "5433"}/${process.env.POSTGRES_DB || "transceiver_db"}`;
|
||||||
|
const TIP_API_URL = process.env.TIP_API_URL || "http://127.0.0.1:3201";
|
||||||
|
const TIP_API_TOKEN = process.env.TIP_API_TOKEN || process.env.TIP_TOKEN || "";
|
||||||
|
|
||||||
|
const DETAILS_FAST_LANE_QUEUES = [
|
||||||
|
"scrape:pricing:flexoptix",
|
||||||
|
"scrape:pricing:fibermall",
|
||||||
|
"scrape:pricing:atgbics",
|
||||||
|
"scrape:pricing:fiber24",
|
||||||
|
"scrape:pricing:qsfptek",
|
||||||
|
"scrape:pricing:naddod",
|
||||||
|
"scrape:pricing:gaotek",
|
||||||
|
"maintenance:reconcile-verification",
|
||||||
|
];
|
||||||
|
|
||||||
|
const VENDOR_QUEUE_MAP: Array<{ match: RegExp; queues: string[] }> = [
|
||||||
|
{ match: /juniper/i, queues: ["scrape:catalog:juniper-oem", "scrape:catalog:juniper-mx-oem", "scrape:catalog:juniper-qfx-oem", "scrape:compat:juniper", "discover:vendor:juniper"] },
|
||||||
|
{ match: /cisco/i, queues: ["scrape:catalog:cisco-nexus-oem", "scrape:catalog:cisco-catalyst-oem", "scrape:catalog:cisco-asr-oem", "scrape:compat:cisco", "discover:vendor:cisco-tmg"] },
|
||||||
|
{ match: /fs\.?com/i, queues: ["scrape:pricing:fs", "discover:vendor:fs-com"] },
|
||||||
|
{ match: /gao/i, queues: ["scrape:pricing:gaotek"] },
|
||||||
|
{ match: /ascent/i, queues: ["scrape:pricing:ascentoptics"] },
|
||||||
|
{ match: /eoptolink/i, queues: ["scrape:catalog:eoptolink"] },
|
||||||
|
{ match: /atgbics/i, queues: ["scrape:pricing:atgbics"] },
|
||||||
|
{ match: /naddod/i, queues: ["scrape:pricing:naddod"] },
|
||||||
|
{ match: /10gtek/i, queues: ["scrape:pricing:10gtek"] },
|
||||||
|
{ match: /qsfptek/i, queues: ["scrape:pricing:qsfptek"] },
|
||||||
|
{ match: /shopfiber24|fiber24/i, queues: ["scrape:pricing:fiber24"] },
|
||||||
|
{ match: /flexoptix/i, queues: ["scrape:pricing:flexoptix", "scrape:vendors:flexoptix-supported"] },
|
||||||
|
{ match: /fibermall/i, queues: ["scrape:pricing:fibermall"] },
|
||||||
|
{ match: /nokia/i, queues: ["scrape:catalog:nokia-oem", "scrape:catalog:nokia-access-oem", "discover:vendor:nokia"] },
|
||||||
|
{ match: /arista/i, queues: ["scrape:catalog:arista-oem", "scrape:catalog:arista-7000-oem", "discover:vendor:arista"] },
|
||||||
|
{ match: /ciena/i, queues: ["scrape:catalog:ciena-oem", "scrape:catalog:ciena-waveserver-oem"] },
|
||||||
|
];
|
||||||
|
|
||||||
|
const HEAVY_QUEUES = new Set([
|
||||||
|
"scrape:pricing:fs",
|
||||||
|
"scrape:pricing:10gtek",
|
||||||
|
"scrape:pricing:prolabs",
|
||||||
|
"scrape:compat:cisco",
|
||||||
|
"scrape:compat:juniper",
|
||||||
|
"scrape:compat:ufispace",
|
||||||
|
"scrape:compat:edgecore",
|
||||||
|
]);
|
||||||
|
|
||||||
|
const ERIK_SAFE_QUEUES = new Set([
|
||||||
|
"scrape:pricing:flexoptix",
|
||||||
|
"scrape:pricing:fibermall",
|
||||||
|
"scrape:pricing:atgbics",
|
||||||
|
"scrape:pricing:fiber24",
|
||||||
|
"scrape:pricing:qsfptek",
|
||||||
|
"scrape:pricing:naddod",
|
||||||
|
"scrape:pricing:gaotek",
|
||||||
|
"maintenance:reconcile-verification",
|
||||||
|
]);
|
||||||
|
|
||||||
|
function isDiscoveryQueue(queue: string): boolean {
|
||||||
|
return queue.startsWith("discover:");
|
||||||
|
}
|
||||||
|
|
||||||
|
function queuesForProfile(queues: string[], profile: RobotProfile): string[] {
|
||||||
|
if (profile === "proxmox-heavy") return queues;
|
||||||
|
|
||||||
|
if (profile === "erik-safe") {
|
||||||
|
return queues.filter((queue) => ERIK_SAFE_QUEUES.has(queue));
|
||||||
|
}
|
||||||
|
|
||||||
|
return queues.filter((queue) => !HEAVY_QUEUES.has(queue) && !isDiscoveryQueue(queue));
|
||||||
|
}
|
||||||
|
|
||||||
|
function defaultMaxQueues(profile: RobotProfile): number {
|
||||||
|
if (profile === "erik-safe") return 3;
|
||||||
|
if (profile === "pi-fetch") return 10;
|
||||||
|
return 30;
|
||||||
|
}
|
||||||
|
|
||||||
|
function toInt(value: string | number | null | undefined): number {
|
||||||
|
return Number.parseInt(String(value ?? "0"), 10) || 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
function unique(items: string[]): string[] {
|
||||||
|
return [...new Set(items)];
|
||||||
|
}
|
||||||
|
|
||||||
|
async function createBoss(): Promise<PgBoss> {
|
||||||
|
const boss = new PgBoss({
|
||||||
|
connectionString,
|
||||||
|
retryLimit: 1,
|
||||||
|
retryDelay: 30,
|
||||||
|
expireInSeconds: 7200,
|
||||||
|
monitorStateIntervalSeconds: 60,
|
||||||
|
});
|
||||||
|
boss.on("error", (err) => console.error("[verification-robots] pg-boss:", err.message));
|
||||||
|
await boss.start();
|
||||||
|
return boss;
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getVerificationStatus(): Promise<{ summary: Record<string, string>; vendors: VendorBlockerRow[] }> {
|
||||||
|
const summaryResult = await pool.query(`
|
||||||
|
SELECT
|
||||||
|
COUNT(*)::text AS total,
|
||||||
|
COUNT(*) FILTER (WHERE NOT price_verified)::text AS missing_price,
|
||||||
|
COUNT(*) FILTER (WHERE NOT image_verified)::text AS missing_image,
|
||||||
|
COUNT(*) FILTER (WHERE NOT details_verified)::text AS missing_details,
|
||||||
|
COUNT(*) FILTER (WHERE price_verified AND image_verified AND NOT details_verified)::text AS near_full_missing_details,
|
||||||
|
COUNT(*) FILTER (WHERE price_verified AND details_verified AND NOT image_verified)::text AS near_full_missing_image,
|
||||||
|
COUNT(*) FILTER (WHERE image_verified AND details_verified AND NOT price_verified)::text AS near_full_missing_price,
|
||||||
|
COUNT(*) FILTER (WHERE NOT fully_verified)::text AS not_fully_verified
|
||||||
|
FROM transceivers
|
||||||
|
`);
|
||||||
|
|
||||||
|
const vendorResult = await pool.query<VendorBlockerRow>(`
|
||||||
|
SELECT
|
||||||
|
COALESCE(v.name, 'unknown') AS vendor,
|
||||||
|
COUNT(*)::text AS total,
|
||||||
|
COUNT(*) FILTER (WHERE NOT t.image_verified)::text AS missing_image,
|
||||||
|
COUNT(*) FILTER (WHERE NOT t.details_verified)::text AS missing_details,
|
||||||
|
COUNT(*) FILTER (WHERE NOT t.price_verified)::text AS missing_price,
|
||||||
|
COUNT(*) FILTER (WHERE t.price_verified AND t.image_verified AND NOT t.details_verified)::text AS near_full_missing_details,
|
||||||
|
COUNT(*) FILTER (WHERE t.price_verified AND t.details_verified AND NOT t.image_verified)::text AS near_full_missing_image,
|
||||||
|
COUNT(*) FILTER (WHERE NOT t.fully_verified)::text AS not_fully_verified
|
||||||
|
FROM transceivers t
|
||||||
|
LEFT JOIN vendors v ON v.id = t.vendor_id
|
||||||
|
GROUP BY v.name
|
||||||
|
HAVING COUNT(*) FILTER (WHERE NOT t.fully_verified) > 0
|
||||||
|
ORDER BY
|
||||||
|
COUNT(*) FILTER (WHERE t.price_verified AND t.image_verified AND NOT t.details_verified) DESC,
|
||||||
|
COUNT(*) FILTER (WHERE NOT t.fully_verified) DESC
|
||||||
|
LIMIT 30
|
||||||
|
`);
|
||||||
|
|
||||||
|
return {
|
||||||
|
summary: summaryResult.rows[0] ?? {},
|
||||||
|
vendors: vendorResult.rows,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function queuesForPriorityVendors(vendors: VendorBlockerRow[]): string[] {
|
||||||
|
const queues: string[] = [];
|
||||||
|
|
||||||
|
for (const row of vendors.slice(0, 20)) {
|
||||||
|
for (const entry of VENDOR_QUEUE_MAP) {
|
||||||
|
if (entry.match.test(row.vendor)) {
|
||||||
|
queues.push(...entry.queues);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
queues.push("maintenance:reconcile-verification");
|
||||||
|
return unique(queues);
|
||||||
|
}
|
||||||
|
|
||||||
|
function queuesForWave(wave: RobotWave, vendors: VendorBlockerRow[]): string[] {
|
||||||
|
if (wave === "details-fast-lane") return DETAILS_FAST_LANE_QUEUES;
|
||||||
|
if (wave === "priority-vendors") return queuesForPriorityVendors(vendors);
|
||||||
|
return unique([...DETAILS_FAST_LANE_QUEUES, ...queuesForPriorityVendors(vendors)]);
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function enqueueRobotWave(
|
||||||
|
wave: RobotWave,
|
||||||
|
options: { dryRun?: boolean; profile?: RobotProfile; maxQueues?: number } = {},
|
||||||
|
): Promise<string[]> {
|
||||||
|
const profile = options.profile ?? "erik-safe";
|
||||||
|
const dryRun = options.dryRun ?? false;
|
||||||
|
const status = await getVerificationStatus();
|
||||||
|
const rawQueues = queuesForWave(wave, status.vendors);
|
||||||
|
const maxQueues = options.maxQueues ?? defaultMaxQueues(profile);
|
||||||
|
const queues = queuesForProfile(rawQueues, profile).slice(0, maxQueues);
|
||||||
|
const runId = `verification-${wave}-${new Date().toISOString()}`;
|
||||||
|
|
||||||
|
console.log(`\nRobot wave: ${wave}`);
|
||||||
|
console.log(`Profile: ${profile}`);
|
||||||
|
console.log(`Run ID: ${runId}`);
|
||||||
|
console.log(`Queues: ${queues.length} selected from ${rawQueues.length} candidates`);
|
||||||
|
for (const q of queues) console.log(` - ${q}`);
|
||||||
|
|
||||||
|
if (queues.length === 0) {
|
||||||
|
writeRobotExperience({
|
||||||
|
event: "queue_rejected",
|
||||||
|
observed_at: new Date().toISOString(),
|
||||||
|
actor: "verification-robots",
|
||||||
|
profile,
|
||||||
|
wave,
|
||||||
|
summary: "Robot queue wave rejected because the selected profile had no safe eligible queues.",
|
||||||
|
input: { wave, profile, raw_queues: rawQueues, max_queues: maxQueues },
|
||||||
|
decision: { selected_queues: [], reason: "profile_filter_removed_all_candidates" },
|
||||||
|
safety_notes: ["No jobs enqueued.", "This protects Erik from accidental heavy crawler work."],
|
||||||
|
}, { flush: true });
|
||||||
|
console.log("\nNo queues selected for this profile.");
|
||||||
|
return queues;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dryRun) {
|
||||||
|
writeRobotExperience({
|
||||||
|
event: "queue_plan",
|
||||||
|
observed_at: new Date().toISOString(),
|
||||||
|
actor: "verification-robots",
|
||||||
|
profile,
|
||||||
|
wave,
|
||||||
|
summary: "Dry-run queue plan generated for TIPLLM learning without starting crawler jobs.",
|
||||||
|
input: { wave, profile, raw_queues: rawQueues, max_queues: maxQueues },
|
||||||
|
decision: { selected_queues: queues, dry_run: true },
|
||||||
|
safety_notes: ["Dry run only.", "No pg-boss jobs were enqueued."],
|
||||||
|
}, { flush: true });
|
||||||
|
console.log("\nDry run: no jobs enqueued.");
|
||||||
|
return queues;
|
||||||
|
}
|
||||||
|
|
||||||
|
const boss = await createBoss();
|
||||||
|
try {
|
||||||
|
for (const queue of queues) {
|
||||||
|
await boss.createQueue(queue).catch(() => {});
|
||||||
|
await (boss as unknown as { send: (name: string, data?: object, options?: object) => Promise<string | null> }).send(
|
||||||
|
queue,
|
||||||
|
{ source: "verification-robots", wave, run_id: runId, enqueued_at: new Date().toISOString() },
|
||||||
|
{ retryLimit: 1, expireInSeconds: 7200 },
|
||||||
|
);
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
await boss.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log("\nJobs enqueued.");
|
||||||
|
writeRobotExperience({
|
||||||
|
event: "queue_enqueued",
|
||||||
|
observed_at: new Date().toISOString(),
|
||||||
|
actor: "verification-robots",
|
||||||
|
profile,
|
||||||
|
wave,
|
||||||
|
summary: "Limited verification robot queue wave enqueued.",
|
||||||
|
input: { wave, profile, raw_queues: rawQueues, max_queues: maxQueues },
|
||||||
|
decision: { selected_queues: queues, run_id: runId },
|
||||||
|
outcome: { enqueued_count: queues.length },
|
||||||
|
safety_notes: [
|
||||||
|
profile === "erik-safe" ? "Erik-safe profile excludes Playwright and discovery queues." : "Profile selected explicitly.",
|
||||||
|
"Queue count is capped.",
|
||||||
|
],
|
||||||
|
}, { flush: true });
|
||||||
|
return queues;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function fetchTipLlmPlan(vendor: string, product: string, context: string): Promise<TipLlmResearchPlanResponse> {
|
||||||
|
const resp = await fetch(`${TIP_API_URL}/api/tip-llm/research-plan`, {
|
||||||
|
method: "POST",
|
||||||
|
headers: {
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
...(TIP_API_TOKEN ? { Authorization: `Bearer ${TIP_API_TOKEN}` } : {}),
|
||||||
|
},
|
||||||
|
body: JSON.stringify({ vendor, product, context }),
|
||||||
|
signal: AbortSignal.timeout(45000),
|
||||||
|
});
|
||||||
|
|
||||||
|
const data = await resp.json().catch(() => ({})) as TipLlmResearchPlanResponse;
|
||||||
|
if (!resp.ok) {
|
||||||
|
return { success: false, error: data.error || `HTTP ${resp.status}` };
|
||||||
|
}
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function planWithTipLlm(limit: number): Promise<void> {
|
||||||
|
const status = await getVerificationStatus();
|
||||||
|
const targets = status.vendors
|
||||||
|
.filter((row) => toInt(row.not_fully_verified) > 0)
|
||||||
|
.slice(0, Math.max(1, Math.min(10, limit)));
|
||||||
|
|
||||||
|
console.log(`\n=== TIPLLM Verification Planning (${targets.length} vendor targets) ===\n`);
|
||||||
|
console.log("No crawler jobs are started in this mode.\n");
|
||||||
|
|
||||||
|
for (const row of targets) {
|
||||||
|
const product = "missing transceiver image/details verification";
|
||||||
|
const context =
|
||||||
|
`TIP verification blocker: vendor=${row.vendor}, ` +
|
||||||
|
`missing_image=${row.missing_image}, missing_details=${row.missing_details}, ` +
|
||||||
|
`missing_price=${row.missing_price}, near_full_missing_details=${row.near_full_missing_details}, ` +
|
||||||
|
`near_full_missing_image=${row.near_full_missing_image}. ` +
|
||||||
|
"Return safe search patterns, extraction fields, and relevance rules for targeted crawler work.";
|
||||||
|
|
||||||
|
console.log(`Vendor: ${row.vendor}`);
|
||||||
|
const plan = await fetchTipLlmPlan(row.vendor, product, context);
|
||||||
|
if (!plan.success || !plan.data) {
|
||||||
|
writeRobotExperience({
|
||||||
|
event: "tipllm_plan",
|
||||||
|
observed_at: new Date().toISOString(),
|
||||||
|
actor: "verification-robots",
|
||||||
|
vendor: row.vendor,
|
||||||
|
summary: "TIPLLM planning failed or returned no usable data.",
|
||||||
|
input: { vendor: row.vendor, product, context },
|
||||||
|
decision: { usable_plan: false, error: plan.error || "unknown error" },
|
||||||
|
safety_notes: ["No crawler jobs started from failed TIPLLM plan."],
|
||||||
|
});
|
||||||
|
console.log(` TIPLLM plan failed: ${plan.error || "unknown error"}\n`);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const patterns = plan.data.search_patterns ?? [];
|
||||||
|
const fields = plan.data.extraction_fields ?? [];
|
||||||
|
const rules = plan.data.relevance_rules ?? [];
|
||||||
|
console.log(` Summary: ${plan.data.summary || "(none)"}`);
|
||||||
|
console.log(` Search patterns: ${patterns.slice(0, 4).join(" | ") || "(none)"}`);
|
||||||
|
console.log(` Extraction fields: ${fields.map((f) => f.field).slice(0, 8).join(", ") || "(none)"}`);
|
||||||
|
console.log(` Rules: ${rules.slice(0, 3).join(" | ") || "(none)"}`);
|
||||||
|
console.log("");
|
||||||
|
|
||||||
|
writeRobotExperience({
|
||||||
|
event: "tipllm_plan",
|
||||||
|
observed_at: new Date().toISOString(),
|
||||||
|
actor: "verification-robots",
|
||||||
|
vendor: row.vendor,
|
||||||
|
summary: plan.data.summary || "TIPLLM produced a crawler research plan.",
|
||||||
|
input: { vendor: row.vendor, product, context },
|
||||||
|
decision: {
|
||||||
|
usable_plan: true,
|
||||||
|
search_patterns: patterns,
|
||||||
|
extraction_fields: fields,
|
||||||
|
relevance_rules: rules,
|
||||||
|
confidence: plan.data.confidence ?? null,
|
||||||
|
},
|
||||||
|
safety_notes: [
|
||||||
|
"TIPLLM planning mode starts no crawler jobs.",
|
||||||
|
"Plans must be executed through capped profiles.",
|
||||||
|
],
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
writeRobotExperience({
|
||||||
|
event: "tipllm_plan",
|
||||||
|
observed_at: new Date().toISOString(),
|
||||||
|
actor: "verification-robots",
|
||||||
|
summary: "TIPLLM planning batch completed.",
|
||||||
|
input: { requested_limit: limit, planned_vendors: targets.map((row) => row.vendor) },
|
||||||
|
decision: { next_safe_step: "Review plans, then enqueue a small erik-safe dry run or dispatch heavy work to Proxmox/Pi profiles." },
|
||||||
|
safety_notes: ["No crawler jobs were started by the planning batch."],
|
||||||
|
}, { flush: true });
|
||||||
|
}
|
||||||
|
|
||||||
|
function printStatus(summary: Record<string, string>, vendors: VendorBlockerRow[]): void {
|
||||||
|
console.log("\n=== TIP Verification Robot Status ===\n");
|
||||||
|
console.log(`Total products: ${summary.total ?? "0"}`);
|
||||||
|
console.log(`Not fully verified: ${summary.not_fully_verified ?? "0"}`);
|
||||||
|
console.log(`Missing price: ${summary.missing_price ?? "0"}`);
|
||||||
|
console.log(`Missing image: ${summary.missing_image ?? "0"}`);
|
||||||
|
console.log(`Missing details: ${summary.missing_details ?? "0"}`);
|
||||||
|
console.log(`Near-full, missing details: ${summary.near_full_missing_details ?? "0"}`);
|
||||||
|
console.log(`Near-full, missing image: ${summary.near_full_missing_image ?? "0"}`);
|
||||||
|
console.log(`Near-full, missing price: ${summary.near_full_missing_price ?? "0"}`);
|
||||||
|
|
||||||
|
console.log("\nTop vendor blockers:");
|
||||||
|
for (const row of vendors.slice(0, 15)) {
|
||||||
|
const nearDetails = toInt(row.near_full_missing_details);
|
||||||
|
const nearImage = toInt(row.near_full_missing_image);
|
||||||
|
console.log(
|
||||||
|
` ${row.vendor.padEnd(22)} not_full=${row.not_fully_verified.padStart(4)} ` +
|
||||||
|
`missing_img=${row.missing_image.padStart(4)} missing_details=${row.missing_details.padStart(4)} ` +
|
||||||
|
`near_details=${String(nearDetails).padStart(4)} near_image=${String(nearImage).padStart(4)}`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function recordStatusExperience(summary: Record<string, string>, vendors: VendorBlockerRow[]): void {
|
||||||
|
writeRobotExperience({
|
||||||
|
event: "status",
|
||||||
|
observed_at: new Date().toISOString(),
|
||||||
|
actor: "verification-robots",
|
||||||
|
summary: "Verification robot status snapshot captured for TIPLLM learning.",
|
||||||
|
input: {
|
||||||
|
total: summary.total,
|
||||||
|
not_fully_verified: summary.not_fully_verified,
|
||||||
|
missing_price: summary.missing_price,
|
||||||
|
missing_image: summary.missing_image,
|
||||||
|
missing_details: summary.missing_details,
|
||||||
|
near_full_missing_details: summary.near_full_missing_details,
|
||||||
|
near_full_missing_image: summary.near_full_missing_image,
|
||||||
|
top_vendors: vendors.slice(0, 10),
|
||||||
|
},
|
||||||
|
decision: {
|
||||||
|
recommended_sequence: [
|
||||||
|
"Use TIPLLM planning for top vendor blockers before new crawler work.",
|
||||||
|
"Run details-fast-lane first because near-full missing-details products convert fastest.",
|
||||||
|
"Keep Erik on erik-safe profile; send Playwright/discovery work to Proxmox or Pi workers.",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
safety_notes: [
|
||||||
|
"Status collection is read-only.",
|
||||||
|
"No crawler jobs are started by status collection.",
|
||||||
|
],
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
async function main(): Promise<void> {
|
||||||
|
const args = process.argv.slice(2);
|
||||||
|
const enqueueArg = args.find((a) => a.startsWith("--enqueue="));
|
||||||
|
const wave = enqueueArg?.split("=")[1] as RobotWave | undefined;
|
||||||
|
const profileArg = args.find((a) => a.startsWith("--profile="));
|
||||||
|
const profile = (profileArg?.split("=")[1] || "erik-safe") as RobotProfile;
|
||||||
|
const maxQueuesArg = args.find((a) => a.startsWith("--max-queues="));
|
||||||
|
const maxQueues = maxQueuesArg ? Number.parseInt(maxQueuesArg.split("=")[1], 10) : undefined;
|
||||||
|
const limitArg = args.find((a) => a.startsWith("--limit="));
|
||||||
|
const limit = limitArg ? Number.parseInt(limitArg.split("=")[1], 10) : 5;
|
||||||
|
const dryRun = args.includes("--dry-run");
|
||||||
|
const tipLlmPlan = args.includes("--tipllm-plan");
|
||||||
|
|
||||||
|
if (args.includes("--status") || (!wave && !tipLlmPlan)) {
|
||||||
|
const status = await getVerificationStatus();
|
||||||
|
printStatus(status.summary, status.vendors);
|
||||||
|
recordStatusExperience(status.summary, status.vendors);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tipLlmPlan) {
|
||||||
|
await planWithTipLlm(limit);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (wave) {
|
||||||
|
if (!["details-fast-lane", "priority-vendors", "all"].includes(wave)) {
|
||||||
|
throw new Error("Invalid --enqueue value. Use details-fast-lane, priority-vendors, or all.");
|
||||||
|
}
|
||||||
|
if (!["erik-safe", "pi-fetch", "proxmox-heavy"].includes(profile)) {
|
||||||
|
throw new Error("Invalid --profile value. Use erik-safe, pi-fetch, or proxmox-heavy.");
|
||||||
|
}
|
||||||
|
await enqueueRobotWave(wave, { dryRun, profile, maxQueues });
|
||||||
|
}
|
||||||
|
|
||||||
|
await pool.end();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (require.main === module) {
|
||||||
|
main().catch(async (err) => {
|
||||||
|
console.error("Fatal:", err);
|
||||||
|
await pool.end().catch(() => {});
|
||||||
|
process.exit(1);
|
||||||
|
});
|
||||||
|
}
|
||||||
@ -14,7 +14,7 @@
|
|||||||
*
|
*
|
||||||
* Rate limited: 1 req/1.5 sec.
|
* Rate limited: 1 req/1.5 sec.
|
||||||
*/
|
*/
|
||||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, markImageVerified } from "../utils/db";
|
||||||
import { contentHash } from "../utils/hash";
|
import { contentHash } from "../utils/hash";
|
||||||
import * as cheerio from "cheerio";
|
import * as cheerio from "cheerio";
|
||||||
import * as zlib from "zlib";
|
import * as zlib from "zlib";
|
||||||
@ -220,6 +220,7 @@ export async function scrapeFiber24(): Promise<void> {
|
|||||||
const txId = await findOrCreateScrapedTransceiver({
|
const txId = await findOrCreateScrapedTransceiver({
|
||||||
partNumber: product.partNumber,
|
partNumber: product.partNumber,
|
||||||
vendorId,
|
vendorId,
|
||||||
|
productUrl: product.url,
|
||||||
formFactor: product.formFactor,
|
formFactor: product.formFactor,
|
||||||
speedGbps: product.speedGbps,
|
speedGbps: product.speedGbps,
|
||||||
speed: product.speed,
|
speed: product.speed,
|
||||||
@ -246,12 +247,8 @@ export async function scrapeFiber24(): Promise<void> {
|
|||||||
|
|
||||||
// Save image URL to transceivers table if present
|
// Save image URL to transceivers table if present
|
||||||
if (product.imageUrl) {
|
if (product.imageUrl) {
|
||||||
await pool.query(
|
const updatedImage = await markImageVerified(txId, product.imageUrl);
|
||||||
`UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = true
|
if (updatedImage) imageUpdates++;
|
||||||
WHERE id = $2 AND (image_url IS NULL OR image_url = '')`,
|
|
||||||
[product.imageUrl, txId],
|
|
||||||
);
|
|
||||||
imageUpdates++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
totalProducts++;
|
totalProducts++;
|
||||||
|
|||||||
@ -11,7 +11,7 @@
|
|||||||
* Pagination: /store-XXXXX-name.htm?page=N
|
* Pagination: /store-XXXXX-name.htm?page=N
|
||||||
* Product list: CSS class "new_proList_mainListLi"
|
* Product list: CSS class "new_proList_mainListLi"
|
||||||
*/
|
*/
|
||||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, markImageVerified } from "../utils/db";
|
||||||
import { contentHash } from "../utils/hash";
|
import { contentHash } from "../utils/hash";
|
||||||
|
|
||||||
const BASE = "https://www.fibermall.com";
|
const BASE = "https://www.fibermall.com";
|
||||||
@ -228,6 +228,7 @@ export async function scrapeFiberMall(): Promise<void> {
|
|||||||
const txId = await findOrCreateScrapedTransceiver({
|
const txId = await findOrCreateScrapedTransceiver({
|
||||||
partNumber: product.partNumber,
|
partNumber: product.partNumber,
|
||||||
vendorId,
|
vendorId,
|
||||||
|
productUrl: product.url,
|
||||||
formFactor: product.formFactor,
|
formFactor: product.formFactor,
|
||||||
speedGbps: product.speedGbps,
|
speedGbps: product.speedGbps,
|
||||||
speed: product.speed,
|
speed: product.speed,
|
||||||
@ -254,13 +255,8 @@ export async function scrapeFiberMall(): Promise<void> {
|
|||||||
|
|
||||||
// Save image URL if found and not yet stored
|
// Save image URL if found and not yet stored
|
||||||
if (product.imageUrl) {
|
if (product.imageUrl) {
|
||||||
const imgResult = await pool.query(
|
const updatedImage = await markImageVerified(txId, product.imageUrl);
|
||||||
`UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = true
|
if (updatedImage) imageUpdates++;
|
||||||
WHERE id = $2 AND (image_url IS NULL OR image_url = '')
|
|
||||||
RETURNING id`,
|
|
||||||
[product.imageUrl, txId],
|
|
||||||
);
|
|
||||||
if (imgResult.rowCount && imgResult.rowCount > 0) imageUpdates++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
totalProducts++;
|
totalProducts++;
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
* node dist/utils/backfill-images.js
|
* node dist/utils/backfill-images.js
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { pool } from "./db";
|
import { pool, markImageVerified } from "./db";
|
||||||
import { logger } from "./logger";
|
import { logger } from "./logger";
|
||||||
|
|
||||||
function sleep(ms: number): Promise<void> {
|
function sleep(ms: number): Promise<void> {
|
||||||
@ -24,10 +24,7 @@ function sleep(ms: number): Promise<void> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async function updateImageUrl(id: string, imageUrl: string): Promise<void> {
|
async function updateImageUrl(id: string, imageUrl: string): Promise<void> {
|
||||||
await pool.query(
|
await markImageVerified(id, imageUrl);
|
||||||
`UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = TRUE WHERE id = $2`,
|
|
||||||
[imageUrl, id]
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchJson(url: string, init?: RequestInit): Promise<unknown> {
|
async function fetchJson(url: string, init?: RequestInit): Promise<unknown> {
|
||||||
@ -484,7 +481,7 @@ async function backfillOtherVendors(): Promise<{ total: number; updated: number
|
|||||||
// Main
|
// Main
|
||||||
// =============================================================================
|
// =============================================================================
|
||||||
|
|
||||||
async function main(): Promise<void> {
|
export async function backfillImages(): Promise<void> {
|
||||||
logger.info("=== TIP Image Backfill Script ===");
|
logger.info("=== TIP Image Backfill Script ===");
|
||||||
logger.info(`DB: ${process.env.POSTGRES_HOST ?? "localhost"}:${process.env.POSTGRES_PORT ?? "5433"}`);
|
logger.info(`DB: ${process.env.POSTGRES_HOST ?? "localhost"}:${process.env.POSTGRES_PORT ?? "5433"}`);
|
||||||
|
|
||||||
@ -531,10 +528,12 @@ async function main(): Promise<void> {
|
|||||||
logger.info("=== Backfill complete ===", { results, elapsedSec });
|
logger.info("=== Backfill complete ===", { results, elapsedSec });
|
||||||
}
|
}
|
||||||
|
|
||||||
main()
|
if (require.main === module) {
|
||||||
|
backfillImages()
|
||||||
.then(() => pool.end())
|
.then(() => pool.end())
|
||||||
.catch((err) => {
|
.catch((err) => {
|
||||||
logger.error("Fatal error", { error: (err as Error).message });
|
logger.error("Fatal error", { error: (err as Error).message });
|
||||||
pool.end();
|
pool.end();
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
});
|
});
|
||||||
|
}
|
||||||
|
|||||||
@ -25,7 +25,13 @@ export async function setTransceiverImage(
|
|||||||
source?: string
|
source?: string
|
||||||
): Promise<void> {
|
): Promise<void> {
|
||||||
await pool.query(
|
await pool.query(
|
||||||
`UPDATE transceivers SET image_url = $2, has_image = true, image_scraped_at = NOW()
|
`UPDATE transceivers
|
||||||
|
SET image_url = $2,
|
||||||
|
has_image = true,
|
||||||
|
image_verified = true,
|
||||||
|
image_verified_at = COALESCE(image_verified_at, NOW()),
|
||||||
|
image_verified_url = COALESCE(NULLIF(image_verified_url, ''), $2),
|
||||||
|
image_scraped_at = NOW()
|
||||||
WHERE id = $1 AND (image_url IS NULL OR image_url = '')`,
|
WHERE id = $1 AND (image_url IS NULL OR image_url = '')`,
|
||||||
[transceiverId, imageUrl]
|
[transceiverId, imageUrl]
|
||||||
);
|
);
|
||||||
|
|||||||
@ -77,7 +77,7 @@ export async function updateVerifiedSpecs(specs: VerifiedSpecs): Promise<boolean
|
|||||||
idx++;
|
idx++;
|
||||||
}
|
}
|
||||||
if (specs.imageUrl) {
|
if (specs.imageUrl) {
|
||||||
updates.push(`image_url = $${idx}, has_image = true`);
|
updates.push(`image_url = $${idx}, has_image = true, image_verified = true, image_verified_at = COALESCE(image_verified_at, NOW()), image_verified_url = COALESCE(NULLIF(image_verified_url, ''), $${idx})`);
|
||||||
values.push(specs.imageUrl);
|
values.push(specs.imageUrl);
|
||||||
idx++;
|
idx++;
|
||||||
}
|
}
|
||||||
@ -92,6 +92,11 @@ export async function updateVerifiedSpecs(specs: VerifiedSpecs): Promise<boolean
|
|||||||
|
|
||||||
// Always upgrade confidence from estimated to scraped
|
// Always upgrade confidence from estimated to scraped
|
||||||
updates.push(`data_confidence = 'scraped_unverified'`);
|
updates.push(`data_confidence = 'scraped_unverified'`);
|
||||||
|
updates.push(`details_verified = true`);
|
||||||
|
updates.push(`details_verified_at = COALESCE(details_verified_at, NOW())`);
|
||||||
|
updates.push(`details_source_url = COALESCE(NULLIF(details_source_url, ''), $${idx})`);
|
||||||
|
values.push(specs.source);
|
||||||
|
idx++;
|
||||||
updates.push(`updated_at = NOW()`);
|
updates.push(`updated_at = NOW()`);
|
||||||
|
|
||||||
values.push(specs.transceiverId);
|
values.push(specs.transceiverId);
|
||||||
|
|||||||
@ -9,6 +9,7 @@ type Row = { id: string; lane: Lane; source: string; kind: string; messages: Mes
|
|||||||
|
|
||||||
const repoRoot = process.cwd();
|
const repoRoot = process.cwd();
|
||||||
const externalRoot = process.env.TIP_LEARNING_SOURCE_DIR || join(homedir(), "transceiver-training-data");
|
const externalRoot = process.env.TIP_LEARNING_SOURCE_DIR || join(homedir(), "transceiver-training-data");
|
||||||
|
const giteaTrainingRoot = process.env.TIP_TRAINING_REPO || "/tmp/tip-training-data";
|
||||||
const blogMegaRoot = process.env.BLOG_LLM_SOURCE_DIR || join(homedir(), "Desktop", "BlogLLM-v5-Mega-Training");
|
const blogMegaRoot = process.env.BLOG_LLM_SOURCE_DIR || join(homedir(), "Desktop", "BlogLLM-v5-Mega-Training");
|
||||||
const outRoot = join(repoRoot, "training-data", "runpod");
|
const outRoot = join(repoRoot, "training-data", "runpod");
|
||||||
|
|
||||||
@ -128,11 +129,25 @@ function markdownBlog(path: string): Row[] {
|
|||||||
}];
|
}];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function collectJsonlDir(dir: string, lane: Lane): Row[] {
|
||||||
|
if (!existsSync(dir)) return [];
|
||||||
|
const rows: Row[] = [];
|
||||||
|
for (const entry of readdirSync(dir, { withFileTypes: true })) {
|
||||||
|
const path = join(dir, entry.name);
|
||||||
|
if (entry.isDirectory()) rows.push(...collectJsonlDir(path, lane));
|
||||||
|
else if (entry.isFile() && entry.name.endsWith(".jsonl")) rows.push(...jsonl(path, lane));
|
||||||
|
}
|
||||||
|
return rows;
|
||||||
|
}
|
||||||
|
|
||||||
function collect(lane: Lane): Row[] {
|
function collect(lane: Lane): Row[] {
|
||||||
const rows: Row[] = [];
|
const rows: Row[] = [];
|
||||||
for (const file of files[lane]) {
|
for (const file of files[lane]) {
|
||||||
for (const root of [externalRoot, blogMegaRoot, repoRoot]) rows.push(...jsonl(join(root, file), lane));
|
for (const root of [externalRoot, blogMegaRoot, repoRoot]) rows.push(...jsonl(join(root, file), lane));
|
||||||
}
|
}
|
||||||
|
if (lane === "tip_llm") {
|
||||||
|
rows.push(...collectJsonlDir(join(giteaTrainingRoot, "qa-pairs"), lane));
|
||||||
|
}
|
||||||
if (lane === "blog_llm") {
|
if (lane === "blog_llm") {
|
||||||
for (const dir of [join(repoRoot, "blog-training-data"), join(externalRoot, "v6-tip-blogs")]) {
|
for (const dir of [join(repoRoot, "blog-training-data"), join(externalRoot, "v6-tip-blogs")]) {
|
||||||
if (!existsSync(dir)) continue;
|
if (!existsSync(dir)) continue;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user