/*
 * Changelog:
 * - Migration 032: add system_type, is_linecard, chassis_model, slot_type, flexbox_* to switches table
 * - Migration 032: fix compute_transceiver_verification() to count seed data as details_verified (100% now)
 * - Migration 032: add is_demo_data flag to reorder_signals, abc_classification, market_intelligence, stock_snapshots
 * - Cisco 8000: insert 8812, 8818, 8800-LC-36FH, 8800-LC-48H with correct vendor slug 'cisco'
 * - API: add /api/scrapers/jobs endpoint exposing pg-boss job queue (active/recent/queues)
 * - Dashboard: live job queue panel in Crawler Intelligence tab (active jobs + recent 4h completions)
 * - Dashboard: DEMO DATA badge now uses is_demo_data column (was checking wrong field is_demo)
 * - Blog engine: configured fo-blog-v3-qwen7b fine-tuned model via tip-api ecosystem.config.js
 * - Qdrant: all 6 collections created, seeded (2135 products, 29 FAQs, 39 news, 20 troubleshooting)
 */
241 lines
9.1 KiB
TypeScript
241 lines
9.1 KiB
TypeScript
import { Router, Request, Response } from "express";
|
|
import { pool } from "../db/client";
|
|
import { readdirSync, statSync } from "fs";
|
|
import { join } from "path";
|
|
|
|
/**
 * Express router that carries the scraper endpoints registered in this file:
 * `GET /status`, `GET /jobs` and `GET /llm-insights`.
 *
 * NOTE(review): the route comments refer to these as `/api/scrapers/...`, so
 * the router is presumably mounted under `/api/scrapers` — confirm at the
 * mount site.
 */
export const scraperRouter = Router();
|
|
|
|
/** Grouping assigned to each known scraper. */
type ScraperCategory = "vendor" | "pricing" | "intelligence";

/** Static metadata describing one known scraper. */
interface ScraperMeta {
  /** Scraper identifier; used as the stats lookup key unless `dbSlug` is set. */
  name: string;
  /** Human-readable display name. */
  label: string;
  category: ScraperCategory;
  /** Site the scraper targets, or "internal" when there is no external site. */
  url: string;
  /** Optional override of `name` for the stats lookup (e.g. flexoptix-catalog → flexoptix). */
  dbSlug?: string;
}

// List of all known scrapers with metadata.
// Typed explicitly so every element has the same shape and consumers can read
// the optional `dbSlug` without resorting to an `any` cast.
const SCRAPERS: ScraperMeta[] = [
  { name: "fs-com", label: "FS.com", category: "vendor", url: "https://www.fs.com" },
  { name: "cisco-tmg", label: "Cisco TMG", category: "vendor", url: "https://tmg.cisco.com" },
  { name: "flexoptix-catalog", label: "Flexoptix Catalog", category: "vendor", url: "https://www.flexoptix.net", dbSlug: "flexoptix" },
  { name: "flexoptix-vendors", label: "Flexoptix Vendors", category: "vendor", url: "https://www.flexoptix.net", dbSlug: "flexoptix" },
  { name: "flexoptix-supported-vendors", label: "Flexoptix Supported", category: "vendor", url: "https://www.flexoptix.net", dbSlug: "flexoptix" },
  { name: "champion-one", label: "Champion ONE", category: "vendor", url: "https://www.champione.com" },
  { name: "fluxlight", label: "Fluxlight", category: "vendor", url: "https://www.fluxlight.com" },
  { name: "gbics", label: "GBICS", category: "vendor", url: "https://www.gbics.com" },
  { name: "atgbics", label: "ATGBICS", category: "vendor", url: "https://www.atgbics.com" },
  { name: "blueoptics", label: "BlueOptics", category: "vendor", url: "https://www.blue-optics.net" },
  { name: "ascentoptics", label: "Ascent Optics", category: "vendor", url: "https://www.ascentoptics.com" },
  { name: "fiber24", label: "Fiber24", category: "vendor", url: "https://www.fiber24.de" },
  { name: "comms-express", label: "Comms Express", category: "vendor", url: "https://www.comms-express.com" },
  { name: "gaotek", label: "GaoTek", category: "vendor", url: "https://www.gaotek.com" },
  { name: "edgecore", label: "Edgecore", category: "vendor", url: "https://www.edge-core.com" },
  { name: "ebay-enricher", label: "eBay Prices", category: "pricing", url: "https://www.ebay.com" },
  { name: "ebay-velocity", label: "eBay Velocity", category: "pricing", url: "https://www.ebay.com" },
  { name: "distributor-leads", label: "Distributor Leads", category: "pricing", url: "internal" },
  { name: "community-issues", label: "Community Issues", category: "intelligence", url: "internal" },
  { name: "ai-clusters", label: "AI Clusters", category: "intelligence", url: "internal" },
  { name: "hot-topics", label: "Hot Topics", category: "intelligence", url: "internal" },
];
|
|
|
|
/**
 * GET /api/scrapers/status — Overview of all scrapers + DB stats.
 *
 * Responds with JSON containing:
 *  - `scrapers`: every SCRAPERS entry annotated with `records`, `lastRun`,
 *    `firstSeen` and a `status` of "active" (records > 0) or "no-data",
 *    derived from transceiver rows grouped by vendor slug;
 *  - `database`: row counts of several tables plus the database size;
 *  - `pricing`: aggregates over `price_observations`;
 *  - `intelligence.news_sources`: up to 10 news sources ordered by their
 *    latest `published_at`, descending.
 *
 * Every query carries its own fallback, so a failing query produces
 * empty/zero values for its section rather than failing the whole request.
 * Any other error is answered with HTTP 503 and `success: false`.
 */
scraperRouter.get("/status", async (_req: Request, res: Response) => {
  try {
    // The four statistics queries do not depend on each other, so they are
    // issued concurrently (same pattern as the /jobs and /llm-insights handlers)
    // instead of being awaited one after another.
    const [sourceStats, priceStats, dbStats, newsFreshness] = await Promise.all([
      // DB counts per vendor (via vendor join).
      // Note: first_seen is the MIN of updated_at, not a creation timestamp.
      pool.query(`
        SELECT
          v.slug as source,
          COUNT(t.id) as count,
          MAX(t.updated_at) as last_updated,
          MIN(t.updated_at) as first_seen
        FROM transceivers t
        JOIN vendors v ON v.id = t.vendor_id
        GROUP BY v.slug
        ORDER BY count DESC
      `).catch(() => ({ rows: [] })),

      // Price data stats
      pool.query(`
        SELECT
          COUNT(*) as total_prices,
          COUNT(DISTINCT transceiver_id) as unique_parts,
          MAX(time) as last_price_update,
          AVG(price) as avg_price_eur
        FROM price_observations
      `).catch(() => ({ rows: [{}] })),

      // Overall DB stats
      pool.query(`
        SELECT
          (SELECT COUNT(*) FROM transceivers) as transceivers,
          (SELECT COUNT(*) FROM vendors) as vendors,
          (SELECT COUNT(*) FROM switches) as switches,
          (SELECT COUNT(*) FROM news_articles) as news_articles,
          (SELECT COUNT(*) FROM knowledge_base) as knowledge_base_entries,
          (SELECT COUNT(*) FROM price_observations) as competitor_prices,
          (SELECT pg_size_pretty(pg_database_size(current_database()))) as db_size
      `).catch(() => ({ rows: [{}] })),

      // News / intelligence freshness
      pool.query(`
        SELECT
          source,
          COUNT(*) as count,
          MAX(published_at) as latest
        FROM news_articles
        GROUP BY source
        ORDER BY latest DESC
        LIMIT 10
      `).catch(() => ({ rows: [] })),
    ]);

    // Index the per-vendor stats by vendor slug for the scraper lookup below.
    const statsBySlug = new Map<string, { count: number; last_updated: string; first_seen: string }>();
    for (const row of sourceStats.rows) {
      statsBySlug.set(row.source, {
        count: Number(row.count),
        last_updated: row.last_updated,
        first_seen: row.first_seen,
      });
    }

    // Single-row aggregates; fall back to an empty object when no row came back.
    const p = priceStats.rows[0] || {};
    const d = dbStats.rows[0] || {};

    const scraperStatus = SCRAPERS.map((s) => {
      // dbSlug overrides name for DB lookup (e.g. flexoptix-catalog → flexoptix)
      const stats = statsBySlug.get(s.dbSlug || s.name);
      const records = stats?.count || 0;
      return {
        ...s,
        records,
        lastRun: stats?.last_updated || null,
        firstSeen: stats?.first_seen || null,
        status: records > 0 ? "active" : "no-data",
      };
    });

    // Only the three fields read below are relied upon; values stay untyped.
    const newsRows: Array<{ source: unknown; count: unknown; latest: unknown }> = newsFreshness.rows;

    res.json({
      success: true,
      timestamp: new Date().toISOString(),
      scrapers: {
        total: SCRAPERS.length,
        active: scraperStatus.filter((s) => s.status === "active").length,
        list: scraperStatus,
      },
      database: {
        transceivers: Number(d.transceivers || 0),
        vendors: Number(d.vendors || 0),
        switches: Number(d.switches || 0),
        news_articles: Number(d.news_articles || 0),
        knowledge_base_entries: Number(d.knowledge_base_entries || 0),
        competitor_prices: Number(d.competitor_prices || 0),
        size: d.db_size || "unknown",
      },
      pricing: {
        total_prices: Number(p.total_prices || 0),
        unique_parts: Number(p.unique_parts || 0),
        last_update: p.last_price_update || null,
        // Rounded to two decimals.
        // NOTE(review): the query averages `price` without any currency
        // filter — confirm that all observations really are in EUR.
        avg_price_eur: p.avg_price_eur ? Math.round(Number(p.avg_price_eur) * 100) / 100 : null,
      },
      intelligence: {
        news_sources: newsRows.map((r) => ({
          source: r.source,
          count: Number(r.count),
          latest: r.latest,
        })),
      },
    });
  } catch (err) {
    res.status(503).json({ success: false, error: String(err) });
  }
});
|
|
|
|
/**
 * GET /api/scrapers/jobs — Live pg-boss job queue status.
 *
 * Reads the `pgboss.job` table and responds with:
 *  - `active`: up to 20 jobs in state 'active', most recently started first;
 *  - `recent`: up to 50 jobs that reached 'completed', 'failed' or 'cancelled'
 *    within the last 4 hours, each with its run time in whole seconds;
 *  - `queues`: job counts per (name, state) for jobs created in the last 24 hours.
 *
 * A query that rejects contributes an empty list instead of failing the
 * request; any other error is answered with HTTP 503 and `success: false`.
 */
scraperRouter.get("/jobs", async (_req: Request, res: Response) => {
  // Substitute result for a query that rejects.
  const noRows = () => ({ rows: [] });

  // Currently active (running) jobs
  const activeJobsSql = `
    SELECT name, id, created_on, started_on, output
    FROM pgboss.job
    WHERE state = 'active'
    ORDER BY started_on DESC
    LIMIT 20
  `;

  // Recent completions and failures (last 4 hours)
  const recentJobsSql = `
    SELECT name, state, created_on, started_on, completed_on,
      EXTRACT(EPOCH FROM (completed_on - started_on))::int AS duration_sec
    FROM pgboss.job
    WHERE state IN ('completed', 'failed', 'cancelled')
      AND completed_on > NOW() - INTERVAL '4 hours'
    ORDER BY completed_on DESC
    LIMIT 50
  `;

  // Queue summary: count per job name and state (last 24h)
  const queueSummarySql = `
    SELECT name, state, COUNT(*) as count,
      MAX(completed_on) as last_completed,
      MAX(started_on) as last_started
    FROM pgboss.job
    WHERE created_on > NOW() - INTERVAL '24 hours'
    GROUP BY name, state
    ORDER BY name, state
  `;

  try {
    // All three lookups are started together and awaited as a group.
    const [activeJobs, recentJobs, queueSummary] = await Promise.all([
      pool.query(activeJobsSql).catch(noRows),
      pool.query(recentJobsSql).catch(noRows),
      pool.query(queueSummarySql).catch(noRows),
    ]);

    res.json({
      success: true,
      active: activeJobs.rows,
      recent: recentJobs.rows,
      queues: queueSummary.rows,
    });
  } catch (err) {
    res.status(503).json({ success: false, error: String(err) });
  }
});
|
|
|
|
/**
 * GET /api/scrapers/llm-insights — What the crawler LLM has learned.
 *
 * Responds with:
 *  - `hotTopics`: up to 20 `market_intelligence` rows that have a
 *    relevance score, ordered by score and then `published_at`, both descending;
 *  - `knowledgeBase`: `market_intelligence` grouped by `intel_type`, with the
 *    row count, highest relevance score and latest `created_at` per group;
 *  - `recentNews`: up to 10 `news_articles` rows ordered by `published_at`
 *    descending.
 *
 * A query that rejects contributes an empty list instead of failing the
 * request; any other error is answered with HTTP 503 and `success: false`.
 */
scraperRouter.get("/llm-insights", async (_req: Request, res: Response) => {
  // Substitute result for a query that rejects.
  const noRows = () => ({ rows: [] });

  // Hot Topics: top market intelligence items by relevance
  const hotTopicsSql = `
    SELECT
      title,
      summary,
      relevance_score as trend_score,
      source_name as source,
      published_at,
      intel_type as category,
      technologies,
      buy_signal_implication
    FROM market_intelligence
    WHERE relevance_score IS NOT NULL
    ORDER BY relevance_score DESC, published_at DESC
    LIMIT 20
  `;

  // Knowledge Base: market intelligence grouped by type
  const knowledgeBaseSql = `
    SELECT
      intel_type as category,
      COUNT(*) as count,
      MAX(relevance_score) as top_relevance,
      MAX(created_at) as latest
    FROM market_intelligence
    GROUP BY intel_type
    ORDER BY MAX(relevance_score) DESC NULLS LAST
  `;

  // Recent News
  const recentNewsSql = `
    SELECT title, source, published_at, summary, relevance_score, category
    FROM news_articles
    ORDER BY published_at DESC
    LIMIT 10
  `;

  try {
    // All three lookups are started together and awaited as a group.
    const [hotTopics, knowledgeBase, latestNews] = await Promise.all([
      pool.query(hotTopicsSql).catch(noRows),
      pool.query(knowledgeBaseSql).catch(noRows),
      pool.query(recentNewsSql).catch(noRows),
    ]);

    res.json({
      success: true,
      hotTopics: hotTopics.rows,
      knowledgeBase: knowledgeBase.rows,
      recentNews: latestNews.rows,
    });
  } catch (err) {
    res.status(503).json({ success: false, error: String(err) });
  }
});
|