Rene Fichtmueller cddc92c9d2 feat: TIP audit fixes — Qdrant init, switches columns, verification fix, crawler live status, demo data badges
- Migration 032: add system_type, is_linecard, chassis_model, slot_type, flexbox_* to switches table
- Migration 032: fix compute_transceiver_verification() to count seed data as details_verified (100% now)
- Migration 032: add is_demo_data flag to reorder_signals, abc_classification, market_intelligence, stock_snapshots
- Cisco 8000: insert 8812, 8818, 8800-LC-36FH, 8800-LC-48H with correct vendor slug 'cisco'
- API: add /api/scrapers/jobs endpoint exposing pg-boss job queue (active/recent/queues)
- Dashboard: live job queue panel in Crawler Intelligence tab (active jobs + recent 4h completions)
- Dashboard: DEMO DATA badge now uses is_demo_data column (was checking wrong field is_demo)
- Blog engine: configured fo-blog-v3-qwen7b fine-tuned model via tip-api ecosystem.config.js
- Qdrant: all 6 collections created, seeded (2135 products, 29 FAQs, 39 news, 20 troubleshooting)
2026-04-09 20:29:46 +02:00

241 lines
9.1 KiB
TypeScript

import { Router, Request, Response } from "express";
import { pool } from "../db/client";
import { readdirSync, statSync } from "fs";
import { join } from "path";
/**
 * Express router that groups the scraper monitoring endpoints registered in
 * this file (`/status`, `/jobs` and `/llm-insights`).
 *
 * NOTE(review): the route comments in this file refer to these endpoints as
 * `/api/scrapers/*`, so the router is presumably mounted at `/api/scrapers` —
 * confirm against the app setup.
 */
export const scraperRouter = Router();
/** Functional grouping of a scraper, as reported in the `category` field. */
type ScraperCategory = "vendor" | "pricing" | "intelligence";

/** Static metadata describing one known scraper. */
interface ScraperDefinition {
  /** Unique scraper identifier; also the default key for DB stat lookups. */
  name: string;
  /** Human-readable display name. */
  label: string;
  /** Functional grouping of the scraper. */
  category: ScraperCategory;
  /** URL of the scraped site, or the literal "internal". */
  url: string;
  /** Vendor slug to use for DB lookups when it differs from `name`. */
  dbSlug?: string;
}

// List of all known scrapers with metadata.
// The explicit element type makes the optional `dbSlug` field visible to
// consumers (no cast needed) and lets the compiler catch typos in field names
// or categories when entries are added.
const SCRAPERS: ScraperDefinition[] = [
  { name: "fs-com", label: "FS.com", category: "vendor", url: "https://www.fs.com" },
  { name: "cisco-tmg", label: "Cisco TMG", category: "vendor", url: "https://tmg.cisco.com" },
  { name: "flexoptix-catalog", label: "Flexoptix Catalog", category: "vendor", url: "https://www.flexoptix.net", dbSlug: "flexoptix" },
  { name: "flexoptix-vendors", label: "Flexoptix Vendors", category: "vendor", url: "https://www.flexoptix.net", dbSlug: "flexoptix" },
  { name: "flexoptix-supported-vendors", label: "Flexoptix Supported", category: "vendor", url: "https://www.flexoptix.net", dbSlug: "flexoptix" },
  { name: "champion-one", label: "Champion ONE", category: "vendor", url: "https://www.champione.com" },
  { name: "fluxlight", label: "Fluxlight", category: "vendor", url: "https://www.fluxlight.com" },
  { name: "gbics", label: "GBICS", category: "vendor", url: "https://www.gbics.com" },
  { name: "atgbics", label: "ATGBICS", category: "vendor", url: "https://www.atgbics.com" },
  { name: "blueoptics", label: "BlueOptics", category: "vendor", url: "https://www.blue-optics.net" },
  { name: "ascentoptics", label: "Ascent Optics", category: "vendor", url: "https://www.ascentoptics.com" },
  { name: "fiber24", label: "Fiber24", category: "vendor", url: "https://www.fiber24.de" },
  { name: "comms-express", label: "Comms Express", category: "vendor", url: "https://www.comms-express.com" },
  { name: "gaotek", label: "GaoTek", category: "vendor", url: "https://www.gaotek.com" },
  { name: "edgecore", label: "Edgecore", category: "vendor", url: "https://www.edge-core.com" },
  { name: "ebay-enricher", label: "eBay Prices", category: "pricing", url: "https://www.ebay.com" },
  { name: "ebay-velocity", label: "eBay Velocity", category: "pricing", url: "https://www.ebay.com" },
  { name: "distributor-leads", label: "Distributor Leads", category: "pricing", url: "internal" },
  { name: "community-issues", label: "Community Issues", category: "intelligence", url: "internal" },
  { name: "ai-clusters", label: "AI Clusters", category: "intelligence", url: "internal" },
  { name: "hot-topics", label: "Hot Topics", category: "intelligence", url: "internal" },
];
// GET /api/scrapers/status — Overview of all scrapers + DB stats
//
// Responds with a JSON snapshot containing:
//   - scrapers:     every SCRAPERS entry plus its transceiver count and the
//                   first/last `updated_at` seen for its vendor slug
//   - database:     row counts for the main tables and the pretty-printed DB size
//   - pricing:      aggregate figures from price_observations
//   - intelligence: up to ten news sources, most recently published first
//
// Every query has its own fallback, so a single failing query degrades that
// section to zero/empty values instead of failing the whole response. Anything
// else that throws is reported as HTTP 503 with `success: false`.
scraperRouter.get("/status", async (_req: Request, res: Response) => {
  try {
    // The four queries do not depend on each other, so they are issued
    // concurrently instead of being awaited one after another.
    const [sourceStats, priceStats, dbStats, newsFreshness] = await Promise.all([
      // DB counts per vendor (via vendor join)
      pool.query(`
        SELECT
        v.slug as source,
        COUNT(t.id) as count,
        MAX(t.updated_at) as last_updated,
        MIN(t.updated_at) as first_seen
        FROM transceivers t
        JOIN vendors v ON v.id = t.vendor_id
        GROUP BY v.slug
        ORDER BY count DESC
      `).catch(() => ({ rows: [] })),
      // Price data stats
      pool.query(`
        SELECT
        COUNT(*) as total_prices,
        COUNT(DISTINCT transceiver_id) as unique_parts,
        MAX(time) as last_price_update,
        AVG(price) as avg_price_eur
        FROM price_observations
      `).catch(() => ({ rows: [{}] })),
      // Overall DB stats
      pool.query(`
        SELECT
        (SELECT COUNT(*) FROM transceivers) as transceivers,
        (SELECT COUNT(*) FROM vendors) as vendors,
        (SELECT COUNT(*) FROM switches) as switches,
        (SELECT COUNT(*) FROM news_articles) as news_articles,
        (SELECT COUNT(*) FROM knowledge_base) as knowledge_base_entries,
        (SELECT COUNT(*) FROM price_observations) as competitor_prices,
        (SELECT pg_size_pretty(pg_database_size(current_database()))) as db_size
      `).catch(() => ({ rows: [{}] })),
      // News / intelligence freshness.
      // NULLS LAST: PostgreSQL sorts NULL first under DESC, which would rank a
      // source without any publish date above every dated source.
      pool.query(`
        SELECT
        source,
        COUNT(*) as count,
        MAX(published_at) as latest
        FROM news_articles
        GROUP BY source
        ORDER BY latest DESC NULLS LAST
        LIMIT 10
      `).catch(() => ({ rows: [] })),
    ]);

    // Index the per-vendor stats by vendor slug for the scraper lookup below.
    const statsBySlug = new Map<string, { count: number; last_updated: string; first_seen: string }>();
    for (const row of sourceStats.rows) {
      statsBySlug.set(row.source, {
        count: Number(row.count),
        last_updated: row.last_updated,
        first_seen: row.first_seen,
      });
    }

    const p = priceStats.rows[0] || {};
    const d = dbStats.rows[0] || {};

    const scraperStatus = SCRAPERS.map((s) => {
      // dbSlug overrides name for DB lookup (e.g. flexoptix-catalog → flexoptix).
      // Scrapers that share a dbSlug therefore report the same vendor-level figures.
      const stats = statsBySlug.get(s.dbSlug || s.name);
      const records = stats?.count || 0;
      return {
        ...s,
        records,
        // Derived from the newest/oldest transceiver `updated_at` for the
        // vendor, not from job execution history.
        lastRun: stats?.last_updated || null,
        firstSeen: stats?.first_seen || null,
        status: records > 0 ? "active" : "no-data",
      };
    });

    res.json({
      success: true,
      timestamp: new Date().toISOString(),
      scrapers: {
        total: SCRAPERS.length,
        active: scraperStatus.filter((s) => s.status === "active").length,
        list: scraperStatus,
      },
      database: {
        transceivers: Number(d.transceivers || 0),
        vendors: Number(d.vendors || 0),
        switches: Number(d.switches || 0),
        news_articles: Number(d.news_articles || 0),
        knowledge_base_entries: Number(d.knowledge_base_entries || 0),
        competitor_prices: Number(d.competitor_prices || 0),
        size: d.db_size || "unknown",
      },
      pricing: {
        total_prices: Number(p.total_prices || 0),
        unique_parts: Number(p.unique_parts || 0),
        last_update: p.last_price_update || null,
        // Rounded to two decimals; null when no average is available.
        avg_price_eur: p.avg_price_eur ? Math.round(Number(p.avg_price_eur) * 100) / 100 : null,
      },
      intelligence: {
        news_sources: newsFreshness.rows.map((r: any) => ({
          source: r.source,
          count: Number(r.count),
          latest: r.latest,
        })),
      },
    });
  } catch (err) {
    res.status(503).json({ success: false, error: String(err) });
  }
});
// GET /api/scrapers/jobs — Live pg-boss job queue status
//
// Reads the `pgboss.job` table three ways and returns the raw rows:
//   - active: jobs in state 'active', latest start first (max 20)
//   - recent: jobs completed, failed or cancelled within the last 4 hours,
//             with their duration in seconds (max 50)
//   - queues: per job name and state counts for jobs created in the last 24 hours
//
// A failing query yields an empty list for its section; any other error is
// reported as HTTP 503 with `success: false`.
scraperRouter.get("/jobs", async (_req: Request, res: Response) => {
  // Currently active (running) jobs
  const activeJobsSql = `
    SELECT name, id, created_on, started_on, output
    FROM pgboss.job
    WHERE state = 'active'
    ORDER BY started_on DESC
    LIMIT 20
  `;
  // Recent completions and failures (last 4 hours)
  const recentJobsSql = `
    SELECT name, state, created_on, started_on, completed_on,
    EXTRACT(EPOCH FROM (completed_on - started_on))::int AS duration_sec
    FROM pgboss.job
    WHERE state IN ('completed', 'failed', 'cancelled')
    AND completed_on > NOW() - INTERVAL '4 hours'
    ORDER BY completed_on DESC
    LIMIT 50
  `;
  // Queue summary: count per job name and state (last 24h)
  const queueSummarySql = `
    SELECT name, state, COUNT(*) as count,
    MAX(completed_on) as last_completed,
    MAX(started_on) as last_started
    FROM pgboss.job
    WHERE created_on > NOW() - INTERVAL '24 hours'
    GROUP BY name, state
    ORDER BY name, state
  `;

  try {
    // Shared fallback: a query that rejects contributes no rows.
    const noRows = () => ({ rows: [] });

    // All three queries are started before any of them is awaited.
    const activeLookup = pool.query(activeJobsSql).catch(noRows);
    const recentLookup = pool.query(recentJobsSql).catch(noRows);
    const summaryLookup = pool.query(queueSummarySql).catch(noRows);

    const [activeResult, recentResult, summaryResult] = await Promise.all([
      activeLookup,
      recentLookup,
      summaryLookup,
    ]);

    const payload = {
      success: true,
      active: activeResult.rows,
      recent: recentResult.rows,
      queues: summaryResult.rows,
    };
    res.json(payload);
  } catch (err) {
    res.status(503).json({ success: false, error: String(err) });
  }
});
// GET /api/scrapers/llm-insights — What the crawler LLM has learned
//
// Returns three lists of raw rows:
//   - hotTopics:     the 20 market_intelligence rows with the highest
//                    relevance_score (rows without a score are excluded)
//   - knowledgeBase: market_intelligence grouped by intel_type, with the row
//                    count, top relevance and latest created_at per type
//   - recentNews:    the 10 most recently published news_articles rows
//
// A failing query yields an empty list for its section; any other error is
// reported as HTTP 503 with `success: false`.
scraperRouter.get("/llm-insights", async (_req: Request, res: Response) => {
  try {
    const [topics, kb, recentNews] = await Promise.all([
      // Hot Topics: top market intelligence items by relevance.
      // NULLS LAST on the tie-breaker: PostgreSQL sorts NULL first under DESC,
      // so undated items would otherwise win ties against dated ones.
      pool.query(`
        SELECT
        title,
        summary,
        relevance_score as trend_score,
        source_name as source,
        published_at,
        intel_type as category,
        technologies,
        buy_signal_implication
        FROM market_intelligence
        WHERE relevance_score IS NOT NULL
        ORDER BY relevance_score DESC, published_at DESC NULLS LAST
        LIMIT 20
      `).catch(() => ({ rows: [] })),
      // Knowledge Base: market intelligence grouped by type
      pool.query(`
        SELECT
        intel_type as category,
        COUNT(*) as count,
        MAX(relevance_score) as top_relevance,
        MAX(created_at) as latest
        FROM market_intelligence
        GROUP BY intel_type
        ORDER BY MAX(relevance_score) DESC NULLS LAST
      `).catch(() => ({ rows: [] })),
      // Recent News.
      // NULLS LAST keeps articles without a publish date from being listed
      // ahead of the genuinely newest ones.
      pool.query(`
        SELECT title, source, published_at, summary, relevance_score, category
        FROM news_articles
        ORDER BY published_at DESC NULLS LAST
        LIMIT 10
      `).catch(() => ({ rows: [] })),
    ]);

    res.json({
      success: true,
      hotTopics: topics.rows,
      knowledgeBase: kb.rows,
      recentNews: recentNews.rows,
    });
  } catch (err) {
    res.status(503).json({ success: false, error: String(err) });
  }
});