import { Router, Request, Response } from "express";
import { pool } from "../db/client";
import { readdirSync, statSync } from "fs";
import { join } from "path";

export const scraperRouter = Router();

/** Static metadata describing one known scraper. */
interface ScraperInfo {
  /** Scraper identifier; also used as the vendor slug for the DB lookup unless `dbSlug` is set. */
  name: string;
  /** Human-readable display name. */
  label: string;
  category: "vendor" | "pricing" | "intelligence";
  /** Target site URL, or the literal "internal". */
  url: string;
  /** Overrides `name` for the DB lookup (e.g. flexoptix-catalog → flexoptix). */
  dbSlug?: string;
}

/** Per-vendor-slug aggregates taken from the `transceivers` table. */
interface SourceStats {
  count: number;
  last_updated: unknown;
  first_seen: unknown;
}

/** A result row whose column values have not been validated. */
type Row = Record<string, unknown>;

// List of all known scrapers with metadata
const SCRAPERS: ScraperInfo[] = [
  { name: "fs-com", label: "FS.com", category: "vendor", url: "https://www.fs.com" },
  { name: "cisco-tmg", label: "Cisco TMG", category: "vendor", url: "https://tmg.cisco.com" },
  { name: "flexoptix-catalog", label: "Flexoptix Catalog", category: "vendor", url: "https://www.flexoptix.net", dbSlug: "flexoptix" },
  { name: "flexoptix-vendors", label: "Flexoptix Vendors", category: "vendor", url: "https://www.flexoptix.net", dbSlug: "flexoptix" },
  { name: "flexoptix-supported-vendors", label: "Flexoptix Supported", category: "vendor", url: "https://www.flexoptix.net", dbSlug: "flexoptix" },
  { name: "champion-one", label: "Champion ONE", category: "vendor", url: "https://www.champione.com" },
  { name: "fluxlight", label: "Fluxlight", category: "vendor", url: "https://www.fluxlight.com" },
  { name: "gbics", label: "GBICS", category: "vendor", url: "https://www.gbics.com" },
  { name: "atgbics", label: "ATGBICS", category: "vendor", url: "https://www.atgbics.com" },
  { name: "blueoptics", label: "BlueOptics", category: "vendor", url: "https://www.blue-optics.net" },
  { name: "ascentoptics", label: "Ascent Optics", category: "vendor", url: "https://www.ascentoptics.com" },
  { name: "fiber24", label: "Fiber24", category: "vendor", url: "https://www.fiber24.de" },
  { name: "comms-express", label: "Comms Express", category: "vendor", url: "https://www.comms-express.com" },
  { name: "gaotek", label: "GaoTek", category: "vendor", url: "https://www.gaotek.com" },
  { name: "edgecore", label: "Edgecore", category: "vendor", url: "https://www.edge-core.com" },
  { name: "ebay-enricher", label: "eBay Prices", category: "pricing", url: "https://www.ebay.com" },
  { name: "ebay-velocity", label: "eBay Velocity", category: "pricing", url: "https://www.ebay.com" },
  { name: "distributor-leads", label: "Distributor Leads", category: "pricing", url: "internal" },
  { name: "community-issues", label: "Community Issues", category: "intelligence", url: "internal" },
  { name: "ai-clusters", label: "AI Clusters", category: "intelligence", url: "internal" },
  { name: "hot-topics", label: "Hot Topics", category: "intelligence", url: "internal" },
];

/**
 * Runs a best-effort query and resolves to its rows.
 *
 * A rejected query is logged and resolves to an empty array instead of
 * rejecting, so one failing query does not take down a whole dashboard
 * response.
 *
 * @param label Short description of the query, used only in the warning log.
 * @param sql   SQL text to execute (no bind parameters).
 * @returns The result rows, or `[]` if the query was rejected.
 */
function queryRows(label: string, sql: string): Promise<Row[]> {
  return pool
    .query(sql)
    .then((result: { rows: Row[] }) => result.rows)
    .catch((err: unknown) => {
      // Keep the fallback, but leave a trace: otherwise an outage looks like "no data".
      console.warn(`[scrapers] ${label} query failed: ${String(err)}`);
      return [];
    });
}

/**
 * GET /api/scrapers/status — Overview of all scrapers + DB stats.
 *
 * Responds with per-scraper record counts, overall table counts, price
 * statistics and news freshness. Sections whose query failed fall back to
 * zero / null / empty values. Responds 503 if building the response throws.
 */
scraperRouter.get("/status", async (_req: Request, res: Response) => {
  try {
    // The four queries are independent, so run them concurrently.
    const [sourceRows, priceRows, dbRows, newsRows] = await Promise.all([
      // DB counts per vendor (via vendor join)
      queryRows(
        "source stats",
        `
        SELECT
          v.slug as source,
          COUNT(t.id) as count,
          MAX(t.updated_at) as last_updated,
          MIN(t.updated_at) as first_seen
        FROM transceivers t
        JOIN vendors v ON v.id = t.vendor_id
        GROUP BY v.slug
        ORDER BY count DESC
        `,
      ),
      // Price data stats
      queryRows(
        "price stats",
        `
        SELECT
          COUNT(*) as total_prices,
          COUNT(DISTINCT transceiver_id) as unique_parts,
          MAX(time) as last_price_update,
          AVG(price) as avg_price_eur
        FROM price_observations
        `,
      ),
      // Overall DB stats
      queryRows(
        "db stats",
        `
        SELECT
          (SELECT COUNT(*) FROM transceivers) as transceivers,
          (SELECT COUNT(*) FROM vendors) as vendors,
          (SELECT COUNT(*) FROM switches) as switches,
          (SELECT COUNT(*) FROM news_articles) as news_articles,
          (SELECT COUNT(*) FROM knowledge_base) as knowledge_base_entries,
          (SELECT COUNT(*) FROM price_observations) as competitor_prices,
          (SELECT pg_size_pretty(pg_database_size(current_database()))) as db_size
        `,
      ),
      // News / intelligence freshness
      queryRows(
        "news freshness",
        `
        SELECT source, COUNT(*) as count, MAX(published_at) as latest
        FROM news_articles
        GROUP BY source
        ORDER BY latest DESC
        LIMIT 10
        `,
      ),
    ]);

    // Index the per-vendor aggregates by slug for the scraper lookup below.
    const statsBySlug = new Map<string, SourceStats>();
    for (const row of sourceRows) {
      statsBySlug.set(String(row.source), {
        count: Number(row.count),
        last_updated: row.last_updated,
        first_seen: row.first_seen,
      });
    }

    // Single-row aggregates; an empty object makes every field take its default.
    const p: Row = priceRows[0] ?? {};
    const d: Row = dbRows[0] ?? {};

    const scraperStatus = SCRAPERS.map((s) => {
      // dbSlug overrides name for DB lookup (e.g. flexoptix-catalog → flexoptix)
      const stats = statsBySlug.get(s.dbSlug || s.name);
      // `||` rather than `??` on purpose: a NaN count is also reported as 0.
      const records = stats?.count || 0;
      return {
        ...s,
        records,
        lastRun: stats?.last_updated || null,
        firstSeen: stats?.first_seen || null,
        status: records > 0 ? "active" : "no-data",
      };
    });

    res.json({
      success: true,
      timestamp: new Date().toISOString(),
      scrapers: {
        total: SCRAPERS.length,
        active: scraperStatus.filter((s) => s.status === "active").length,
        list: scraperStatus,
      },
      database: {
        transceivers: Number(d.transceivers || 0),
        vendors: Number(d.vendors || 0),
        switches: Number(d.switches || 0),
        news_articles: Number(d.news_articles || 0),
        knowledge_base_entries: Number(d.knowledge_base_entries || 0),
        competitor_prices: Number(d.competitor_prices || 0),
        size: d.db_size || "unknown",
      },
      pricing: {
        total_prices: Number(p.total_prices || 0),
        unique_parts: Number(p.unique_parts || 0),
        last_update: p.last_price_update || null,
        // Rounded to two decimals; null when no average is available.
        avg_price_eur: p.avg_price_eur
          ? Math.round(Number(p.avg_price_eur) * 100) / 100
          : null,
      },
      intelligence: {
        news_sources: newsRows.map((r) => ({
          source: r.source,
          count: Number(r.count),
          latest: r.latest,
        })),
      },
    });
  } catch (err) {
    res.status(503).json({ success: false, error: String(err) });
  }
});

/**
 * GET /api/scrapers/jobs — Live pg-boss job queue status.
 *
 * Responds with currently active jobs, jobs finished in the last 4 hours, and
 * a per-name/per-state summary of jobs created in the last 24 hours. Each
 * list is empty if its query failed.
 */
scraperRouter.get("/jobs", async (_req: Request, res: Response) => {
  try {
    const [active, recent, queues] = await Promise.all([
      // Currently active (running) jobs
      queryRows(
        "active jobs",
        `
        SELECT name, id, created_on, started_on, output
        FROM pgboss.job
        WHERE state = 'active'
        ORDER BY started_on DESC
        LIMIT 20
        `,
      ),
      // Recent completions and failures (last 4 hours)
      queryRows(
        "recent jobs",
        `
        SELECT
          name, state, created_on, started_on, completed_on,
          EXTRACT(EPOCH FROM (completed_on - started_on))::int AS duration_sec
        FROM pgboss.job
        WHERE state IN ('completed', 'failed', 'cancelled')
          AND completed_on > NOW() - INTERVAL '4 hours'
        ORDER BY completed_on DESC
        LIMIT 50
        `,
      ),
      // Queue summary: count per job name and state (last 24h)
      queryRows(
        "queue summary",
        `
        SELECT
          name, state, COUNT(*) as count,
          MAX(completed_on) as last_completed,
          MAX(started_on) as last_started
        FROM pgboss.job
        WHERE created_on > NOW() - INTERVAL '24 hours'
        GROUP BY name, state
        ORDER BY name, state
        `,
      ),
    ]);

    res.json({
      success: true,
      active,
      recent,
      queues,
    });
  } catch (err) {
    res.status(503).json({ success: false, error: String(err) });
  }
});

/**
 * GET /api/scrapers/llm-insights — What the crawler LLM has learned.
 *
 * Responds with the top market-intelligence items by relevance, a per-type
 * summary of market intelligence, and the ten most recent news articles.
 * Each list is empty if its query failed.
 */
scraperRouter.get("/llm-insights", async (_req: Request, res: Response) => {
  try {
    const [hotTopics, knowledgeBase, recentNews] = await Promise.all([
      // Hot Topics: top market intelligence items by relevance
      queryRows(
        "hot topics",
        `
        SELECT
          title, summary,
          relevance_score as trend_score,
          source_name as source,
          published_at,
          intel_type as category,
          technologies,
          buy_signal_implication
        FROM market_intelligence
        WHERE relevance_score IS NOT NULL
        ORDER BY relevance_score DESC, published_at DESC
        LIMIT 20
        `,
      ),
      // Knowledge Base: market intelligence grouped by type
      queryRows(
        "knowledge base",
        `
        SELECT
          intel_type as category,
          COUNT(*) as count,
          MAX(relevance_score) as top_relevance,
          MAX(created_at) as latest
        FROM market_intelligence
        GROUP BY intel_type
        ORDER BY MAX(relevance_score) DESC NULLS LAST
        `,
      ),
      // Recent News
      queryRows(
        "recent news",
        `
        SELECT title, source, published_at, summary, relevance_score, category
        FROM news_articles
        ORDER BY published_at DESC
        LIMIT 10
        `,
      ),
    ]);

    res.json({
      success: true,
      hotTopics,
      knowledgeBase,
      recentNews,
    });
  } catch (err) {
    res.status(503).json({ success: false, error: String(err) });
  }
});