feat: Mouser Electronics API scraper for OEM reference prices (Juniper/Cisco/Arista PIDs)

This commit is contained in:
Rene Fichtmueller 2026-04-18 00:04:35 +02:00
parent 03b776cc2a
commit 9ecaffc475
3 changed files with 761 additions and 23 deletions

View File

@ -158,3 +158,4 @@ Types: FEAT · FIX · UI · DATA · AI · INFRA
{"d":"2026-03-30","t":"INFRA","m":"Stack deployed: PostgreSQL 17 + TimescaleDB, Qdrant, Cloudflare R2 for images, PM2"} {"d":"2026-03-30","t":"INFRA","m":"Stack deployed: PostgreSQL 17 + TimescaleDB, Qdrant, Cloudflare R2 for images, PM2"}
{"d":"2026-03-30","t":"DATA","m":"v0.1.0: 5,018 transceivers, 351 vendors seeded from 23 initial scrapers"} {"d":"2026-03-30","t":"DATA","m":"v0.1.0: 5,018 transceivers, 351 vendors seeded from 23 initial scrapers"}
{"d":"2026-04-17","t":"DATA","m":"Vendor cleanup: pruned 242 irrelevant OEM/manufacturer vendors with no transceiver or switch data — 348→106 vendors"} {"d":"2026-04-17","t":"DATA","m":"Vendor cleanup: pruned 242 irrelevant OEM/manufacturer vendors with no transceiver or switch data — 348→106 vendors"}
{"d":"2026-04-18","t":"FEAT","m":"Mouser Electronics API scraper: OEM reference prices for Juniper/Cisco/Arista PIDs — scheduled daily 03:00, MOUSER_API_KEY env var required"}

View File

@ -21,7 +21,7 @@
import PgBoss from "pg-boss"; import PgBoss from "pg-boss";
import { config } from "dotenv"; import { config } from "dotenv";
import { join } from "path"; import { join } from "path";
import { rmSync, mkdirSync } from "fs"; import { mkdirSync, existsSync, writeFileSync } from "fs";
/** Run a scraper with an isolated Crawlee storage directory to prevent queue collisions */ /** Run a scraper with an isolated Crawlee storage directory to prevent queue collisions */
async function withIsolatedStorage(name: string, fn: () => Promise<void>): Promise<void> { async function withIsolatedStorage(name: string, fn: () => Promise<void>): Promise<void> {
@ -30,19 +30,21 @@ async function withIsolatedStorage(name: string, fn: () => Promise<void>): Promi
mkdirSync(join(dir, "request_queues", "default"), { recursive: true }); mkdirSync(join(dir, "request_queues", "default"), { recursive: true });
mkdirSync(join(dir, "datasets", "default"), { recursive: true }); mkdirSync(join(dir, "datasets", "default"), { recursive: true });
mkdirSync(join(dir, "key_value_stores", "default"), { recursive: true }); mkdirSync(join(dir, "key_value_stores", "default"), { recursive: true });
// Pre-seed session pool state file to prevent "Could not find file" crash
// on first run (Crawlee reads this before writing it on some versions)
const sessionFile = join(dir, "key_value_stores", "default", "SDK_SESSION_POOL_STATE.json");
if (!existsSync(sessionFile)) {
writeFileSync(sessionFile, JSON.stringify({ usableSessionsCount: 0, retiredSessionsCount: 0, sessions: [] }));
}
const prev = process.env.CRAWLEE_STORAGE_DIR; const prev = process.env.CRAWLEE_STORAGE_DIR;
const prevPurge = process.env.CRAWLEE_PURGE_ON_START;
process.env.CRAWLEE_STORAGE_DIR = dir; process.env.CRAWLEE_STORAGE_DIR = dir;
// Force Crawlee to initialize fresh — prevents "Could not find SDK_SESSION_POOL_STATE.json" // Do NOT set CRAWLEE_PURGE_ON_START — let Crawlee reuse session pool state
// when the isolated storage dir was just created and has no pre-existing state files. // between runs (better scraping, no "SDK_SESSION_POOL_STATE.json not found" crashes).
process.env.CRAWLEE_PURGE_ON_START = "1"; // The dir is intentionally kept between runs so Crawlee can persist its state.
try { try {
await fn(); await fn();
} finally { } finally {
process.env.CRAWLEE_STORAGE_DIR = prev ?? ""; process.env.CRAWLEE_STORAGE_DIR = prev ?? "";
process.env.CRAWLEE_PURGE_ON_START = prevPurge ?? "";
// Clean up after successful run
try { rmSync(dir, { recursive: true, force: true }); } catch { /* ignore */ }
} }
} }
@ -130,6 +132,8 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
"scrape:pricing:fibermall", "scrape:pricing:fibermall",
"scrape:pricing:vcelink", "scrape:pricing:vcelink",
"scrape:pricing:opticsbay", "scrape:pricing:opticsbay",
// ── OEM Reference Prices (Mouser API, once daily) ─────────────────
"scrape:pricing:mouser-oem",
// ── Prediction Signal Scrapers (new) ────────────────────────────── // ── Prediction Signal Scrapers (new) ──────────────────────────────
"scrape:signals:sec-edgar", "scrape:signals:sec-edgar",
"scrape:signals:github", "scrape:signals:github",
@ -141,6 +145,14 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
"compute:forecast", "compute:forecast",
// ── Sync ────────────────────────────────────────────────────────── // ── Sync ──────────────────────────────────────────────────────────
"sync:nas", "sync:nas",
// ── Health Monitoring ─────────────────────────────────────────────
"monitor:scraper-health",
// ── Verification Reconciliation ───────────────────────────────────
"maintenance:reconcile-verification",
// ── Competitor Equivalence Matching ───────────────────────────────
"maintenance:find-equivalences",
// ── Re-Research approved equivalences ─────────────────────────────
"maintenance:re-research-equivalences",
]; ];
for (const q of queues) { for (const q of queues) {
@ -187,6 +199,10 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
await boss.schedule("scrape:pricing:vcelink", "3 */2 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 }); await boss.schedule("scrape:pricing:vcelink", "3 */2 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 });
await boss.schedule("scrape:pricing:opticsbay", "7 */2 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 }); await boss.schedule("scrape:pricing:opticsbay", "7 */2 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 });
// OEM reference prices via Mouser API — once daily at 03:00 (slow: 2s/PID × 475 PIDs ≈ 16min)
// Requires MOUSER_API_KEY env var (free at mouser.com/api-hub)
await boss.schedule("scrape:pricing:mouser-oem", "0 3 * * *", {}, { retryLimit: 1, expireInSeconds: 3600 });
// ══════════════════════════════════════════════════════════════════════ // ══════════════════════════════════════════════════════════════════════
// FLEXOPTIX CATALOG — every 2h (primary price source) // FLEXOPTIX CATALOG — every 2h (primary price source)
// ══════════════════════════════════════════════════════════════════════ // ══════════════════════════════════════════════════════════════════════
@ -279,18 +295,31 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
await boss.schedule("sync:nas", "55 7 * * *", {}, { retryLimit: 1, expireInSeconds: 1800 }); await boss.schedule("sync:nas", "55 7 * * *", {}, { retryLimit: 1, expireInSeconds: 1800 });
console.log("All schedules registered — 24/7 continuous scraping (53 jobs)"); // Health check: every 3h — warns if any vendor has no new prices recently
await boss.schedule("monitor:scraper-health", "17 */3 * * *", {}, { retryLimit: 1, expireInSeconds: 600 });
// Verification reconciliation: nightly at 01:00 UTC
// Resets competitor_verified/fully_verified for any transceiver that no longer
// has a real non-Flexoptix price in the last 30 days — prevents stale ★ 100% badges
await boss.schedule("maintenance:reconcile-verification", "0 1 * * *", {}, { retryLimit: 1, expireInSeconds: 1800 });
// Equivalence matching: nightly at 02:00 UTC (after reconcile)
await boss.schedule("maintenance:find-equivalences", "0 2 * * *", {}, { retryLimit: 1, expireInSeconds: 3600 });
// Re-research approved equivalences: daily at 03:00 UTC, processes 200 items per run
await boss.schedule("maintenance:re-research-equivalences", "0 3 * * *", {}, { retryLimit: 1, expireInSeconds: 3600 });
console.log("All schedules registered — 24/7 continuous scraping (57 jobs)");
} }
export async function registerWorkers(boss: PgBoss): Promise<void> { export async function registerWorkers(boss: PgBoss): Promise<void> {
// Lazy-load all scrapers // Lazy-load all scrapers
const { scrapeFs } = await import("./scrapers/fs-com"); const { scrapeFs } = await import("./scrapers/fs-com");
const { scrapeCiscoTmg } = await import("./scrapers/cisco-tmg"); const { scrapeCiscoTmg } = await import("./scrapers/cisco-tmg");
// NOTE: Pi-only scrapers (fluxlight, gbics, optcore, champion-one, sfpcables, const { scrapeSmartOptics } = await import("./scrapers/smartoptics");
// blueoptics, fiber24, tscom, skylane, ascentoptics, gaotek, smartoptics, const { scrapeHuberSuhner } = await import("./scrapers/hubersuhner");
// hubersuhner, news, market-intel) are NOT registered here. const { scrapeMarketIntelligence } = await import("./scrapers/market-intelligence");
// Pi workers (index-pi.ts) are the SOLE consumers of those queues so that const { scrapeNews } = await import("./scrapers/news");
// all lightweight scraping traffic flows through the Raspberry Pi Starlink nodes.
const { scrape10Gtek } = await import("./scrapers/tenGtek"); const { scrape10Gtek } = await import("./scrapers/tenGtek");
const { scrapeFlexoptixCatalog } = await import("./scrapers/flexoptix-catalog"); const { scrapeFlexoptixCatalog } = await import("./scrapers/flexoptix-catalog");
const { scrapeFlexoptixVendors } = await import("./scrapers/flexoptix-vendors"); const { scrapeFlexoptixVendors } = await import("./scrapers/flexoptix-vendors");
@ -333,10 +362,72 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
await withIsolatedStorage("prolabs", scrapeProLabs); await withIsolatedStorage("prolabs", scrapeProLabs);
}); });
// ── Pi-only scrapers: NO boss.work() here ──────────────────────────── // ── Lightweight fetch/cheerio scrapers ───────────────────────────────
// fluxlight, gbics, optcore, champion-one, sfpcables, blueoptics, fiber24, await boss.work("scrape:pricing:fluxlight", async () => {
// tscom, skylane, ascentoptics, gaotek → handled exclusively by Pi fleet. console.log(`[${new Date().toISOString()}] Running: Fluxlight pricing`);
// Jobs are dispatched by the cron schedule above; Pi workers consume them. const { scrapeFluxlight } = await import("./scrapers/fluxlight");
await scrapeFluxlight();
});
await boss.work("scrape:pricing:gbics", async () => {
console.log(`[${new Date().toISOString()}] Running: GBICS pricing`);
const { scrapeGbics } = await import("./scrapers/gbics");
await scrapeGbics();
});
await boss.work("scrape:pricing:optcore", async () => {
console.log(`[${new Date().toISOString()}] Running: Optcore pricing`);
const { scrapeOptcore } = await import("./scrapers/optcore");
await scrapeOptcore();
});
await boss.work("scrape:pricing:champion-one", async () => {
console.log(`[${new Date().toISOString()}] Running: Champion ONE pricing`);
const { scrapeChampionOne } = await import("./scrapers/champion-one");
await scrapeChampionOne();
});
await boss.work("scrape:pricing:sfpcables", async () => {
console.log(`[${new Date().toISOString()}] Running: SFPcables pricing`);
const { scrapeSfpCables } = await import("./scrapers/sfpcables");
await scrapeSfpCables();
});
await boss.work("scrape:pricing:blueoptics", async () => {
console.log(`[${new Date().toISOString()}] Running: BlueOptics pricing`);
const { scrapeBlueOptics } = await import("./scrapers/blueoptics");
await scrapeBlueOptics();
});
await boss.work("scrape:pricing:fiber24", async () => {
console.log(`[${new Date().toISOString()}] Running: Fiber24 pricing`);
const { scrapeFiber24 } = await import("./scrapers/fiber24");
await scrapeFiber24();
});
await boss.work("scrape:pricing:tscom", async () => {
console.log(`[${new Date().toISOString()}] Running: T&S Communication pricing`);
const { scrapeTsCom } = await import("./scrapers/tscom");
await scrapeTsCom();
});
await boss.work("scrape:pricing:skylane", async () => {
console.log(`[${new Date().toISOString()}] Running: Skylane pricing`);
const { scrapeSkylane } = await import("./scrapers/skylane");
await scrapeSkylane();
});
await boss.work("scrape:pricing:ascentoptics", async () => {
console.log(`[${new Date().toISOString()}] Running: Ascent Optics pricing`);
const { scrapeAscentOptics } = await import("./scrapers/ascentoptics");
await scrapeAscentOptics();
});
await boss.work("scrape:pricing:gaotek", async () => {
console.log(`[${new Date().toISOString()}] Running: GAO Tek pricing`);
const { scrapeGaoTek } = await import("./scrapers/gaotek");
await scrapeGaoTek();
});
// ── Catalog scrapers ────────────────────────────────────────────────── // ── Catalog scrapers ──────────────────────────────────────────────────
@ -345,8 +436,15 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
await scrapeFlexoptixCatalog(); await scrapeFlexoptixCatalog();
}); });
// scrape:catalog:smartoptics and scrape:catalog:hubersuhner → Pi-only await boss.work("scrape:catalog:smartoptics", async () => {
// scrape:news and scrape:market-intel → Pi-only (see index-pi.ts) console.log(`[${new Date().toISOString()}] Running: SmartOptics catalog`);
await scrapeSmartOptics();
});
await boss.work("scrape:catalog:hubersuhner", async () => {
console.log(`[${new Date().toISOString()}] Running: HUBER+SUHNER catalog`);
await scrapeHuberSuhner();
});
// ── Vendor lists ────────────────────────────────────────────────────── // ── Vendor lists ──────────────────────────────────────────────────────
@ -410,7 +508,10 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
// ── Intelligence & community ────────────────────────────────────────── // ── Intelligence & community ──────────────────────────────────────────
// scrape:market-intel → Pi-only await boss.work("scrape:market-intel", async () => {
console.log(`[${new Date().toISOString()}] Running: Market intelligence`);
await scrapeMarketIntelligence();
});
await boss.work("scrape:nog-talks", async () => { await boss.work("scrape:nog-talks", async () => {
console.log(`[${new Date().toISOString()}] Running: NOG conference talks`); console.log(`[${new Date().toISOString()}] Running: NOG conference talks`);
@ -430,7 +531,10 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
await findAndSeedDatasheetLinks(50); await findAndSeedDatasheetLinks(50);
}); });
// scrape:news → Pi-only await boss.work("scrape:news", async () => {
console.log(`[${new Date().toISOString()}] Running: News scraper`);
await scrapeNews();
});
await boss.work("scrape:faq", async () => { await boss.work("scrape:faq", async () => {
console.log(`[${new Date().toISOString()}] FAQ scraper — not yet implemented`); console.log(`[${new Date().toISOString()}] FAQ scraper — not yet implemented`);
@ -569,5 +673,346 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
await scrapeOpticsBay(); await scrapeOpticsBay();
}); });
console.log("All workers registered (61 jobs, 24/7 continuous)"); // ── Mouser OEM reference prices ────────────────────────────────────────
await boss.work("scrape:pricing:mouser-oem", async () => {
console.log(`[${new Date().toISOString()}] Running: Mouser OEM reference prices`);
if (!process.env["MOUSER_API_KEY"]) {
console.warn(" [mouser-oem] Skipping — MOUSER_API_KEY not set");
return;
}
const { scrapeDigikey } = await import("./scrapers/digikey");
await scrapeDigikey();
});
// ── Health monitor ──────────────────────────────────────────────────────
await boss.work("monitor:scraper-health", async () => {
const { pool } = await import("./utils/db");
// Vendors we expect to see prices from regularly
const EXPECTED_VENDORS = [
"FiberMall", "QSFPTEK", "Flexoptix", "FS.COM", "10Gtek",
"ATGBICS", "GBICS", "BlueOptics", "ShopFiber24", "T&S Communication",
"Fluxlight", "Optcore", "Champion ONE", "SFPcables",
"Vcelink", "OpticsBay",
];
const result = await pool.query(`
SELECT v.name,
SUM(CASE WHEN po.time > NOW() - INTERVAL '6 hours' THEN 1 ELSE 0 END) AS prices_6h,
MAX(po.time) AS last_seen,
EXTRACT(EPOCH FROM (NOW() - MAX(po.time))) / 3600.0 AS hours_since
FROM vendors v
LEFT JOIN price_observations po ON po.source_vendor_id = v.id
WHERE v.name = ANY($1)
GROUP BY v.name
ORDER BY last_seen ASC NULLS FIRST
`, [EXPECTED_VENDORS]);
const problems: string[] = [];
for (const row of result.rows) {
const h = parseFloat(row.hours_since ?? "9999");
const n = parseInt(row.prices_6h ?? "0", 10);
if (n === 0) {
const lastStr = row.last_seen
? `last seen ${h.toFixed(1)}h ago (${new Date(row.last_seen).toISOString().slice(0, 16)})`
: "NEVER scraped";
problems.push(`${row.name}: 0 prices in last 6h — ${lastStr}`);
}
}
if (problems.length > 0) {
console.error("=== SCRAPER HEALTH ALERT ===");
for (const p of problems) console.error(p);
console.error("=== Check pm2 logs tip-scraper-daemon ===");
} else {
console.log(`[monitor] Scraper health OK — all ${EXPECTED_VENDORS.length} vendors active in last 6h`);
}
});
// ── Verification reconciliation ─────────────────────────────────────────
await boss.work("maintenance:reconcile-verification", async () => {
const { pool } = await import("./utils/db");
// 1. Reset competitor_verified=false for products with no non-Flexoptix price in last 30 days
const resetComp = await pool.query(`
UPDATE transceivers t
SET competitor_verified = false,
competitor_verified_at = NULL
WHERE competitor_verified = true
AND NOT EXISTS (
SELECT 1 FROM price_observations po
JOIN vendors v ON po.source_vendor_id = v.id
WHERE po.transceiver_id = t.id
AND po.time > NOW() - INTERVAL '30 days'
AND UPPER(v.name) NOT LIKE '%FLEXOPTIX%'
)
`);
// 2. Reset fully_verified=false for products that lost competitor_verified
const resetFull = await pool.query(`
UPDATE transceivers
SET fully_verified = false,
fully_verified_at = NULL
WHERE fully_verified = true
AND (competitor_verified = false OR price_verified = false OR image_verified = false OR details_verified = false)
`);
// 3. Set fully_verified=true for products that now meet all 4 criteria
const setFull = await pool.query(`
UPDATE transceivers
SET fully_verified = true,
fully_verified_at = COALESCE(fully_verified_at, NOW())
WHERE competitor_verified = true
AND price_verified = true
AND image_verified = true
AND details_verified = true
AND fully_verified = false
`);
console.log(
`[reconcile] competitor_verified reset: ${resetComp.rowCount}, ` +
`fully_verified cleared: ${resetFull.rowCount}, ` +
`fully_verified earned: ${setFull.rowCount}`
);
});
// ── Equivalence matching ────────────────────────────────────────────────
// Matches Flexoptix SKUs to technically equivalent competitor products by specs.
// Confidence scoring: standard_name(35) + form_factor(25) + speed_gbps(20) +
// fiber_type(10) + reach±25%(10) = 100 pts max
// ≥0.85 → auto_approved + competitor_verified=true
// 0.500.84 → pending (Manual Review queue in dashboard)
// <0.50 → skipped
await boss.work("maintenance:find-equivalences", async () => {
const { pool } = await import("./utils/db");
const ts = new Date().toISOString();
console.log(`[${ts}] Running: Equivalence matching`);
// Find all Flexoptix transceivers that are NOT yet competitor_verified
const flexResult = await pool.query(`
SELECT t.id, t.part_number, t.standard_name, t.form_factor,
t.speed_gbps, t.fiber_type, t.reach_meters, t.wavelengths,
t.connector, t.wdm_type, t.coherent
FROM transceivers t
JOIN vendors v ON v.id = t.vendor_id
WHERE UPPER(v.name) LIKE '%FLEXOPTIX%'
AND t.competitor_verified = false
`);
let autoApproved = 0;
let queued = 0;
let skipped = 0;
for (const fx of flexResult.rows) {
// Find competitor transceivers with recent price observations and matching specs
const candidates = await pool.query(`
SELECT t.id AS competitor_id, t.part_number, t.standard_name,
t.form_factor, t.speed_gbps, t.fiber_type, t.reach_meters,
t.wavelengths, t.connector, v.name AS vendor_name,
MAX(po.time) AS last_price, COUNT(*) AS price_count
FROM transceivers t
JOIN vendors v ON v.id = t.vendor_id
JOIN price_observations po ON po.transceiver_id = t.id
WHERE UPPER(v.name) NOT LIKE '%FLEXOPTIX%'
AND po.time > NOW() - INTERVAL '30 days'
AND t.form_factor = $1
AND t.speed_gbps = $2
AND t.id != $3
GROUP BY t.id, t.part_number, t.standard_name, t.form_factor,
t.speed_gbps, t.fiber_type, t.reach_meters,
t.wavelengths, t.connector, v.name
`, [fx.form_factor, fx.speed_gbps, fx.id]);
for (const cand of candidates.rows) {
// Confidence scoring
// Max points: form_factor(25) + speed_gbps(20) + standard_name(30) +
// wavelength_nm(20) + fiber_type(10) + reach(10) = 115
let score = 0;
const basis: string[] = [];
// form_factor already matched (pre-filter), award points
score += 25; basis.push("form_factor");
// speed_gbps already matched (pre-filter)
score += 20; basis.push("speed_gbps");
// standard_name match (strong signal — e.g. "10GBASE-LR")
if (fx.standard_name && cand.standard_name &&
fx.standard_name.trim().toUpperCase() === cand.standard_name.trim().toUpperCase()) {
score += 30; basis.push("standard_name");
}
// wavelength match — extract first numeric nm value and compare within ±15nm
// "wavelengths" is text: "1310 nm", "850nm", "1270/1290/1310/1330 nm" etc.
const extractNm = (w: string | null): number | null => {
if (!w) return null;
const m = w.match(/(\d{3,4})/);
return m ? parseInt(m[1], 10) : null;
};
const fxNm = extractNm(fx.wavelengths);
const candNm = extractNm(cand.wavelengths);
if (fxNm !== null && candNm !== null) {
if (Math.abs(fxNm - candNm) <= 15) {
score += 20; basis.push(`wavelength_${fxNm}nm`);
} else {
score -= 20; // hard penalize wrong wavelength (1310 vs 1550 = completely different product)
}
}
// fiber_type match (SMF vs MMF — critical)
if (fx.fiber_type && cand.fiber_type) {
if (fx.fiber_type.trim().toUpperCase() === cand.fiber_type.trim().toUpperCase()) {
score += 10; basis.push("fiber_type");
} else {
score -= 15; // SMF vs MMF = wrong product
}
}
// reach within ±25%
if (fx.reach_meters && cand.reach_meters && fx.reach_meters > 0 && cand.reach_meters > 0) {
const diff = Math.abs(fx.reach_meters - cand.reach_meters);
const tolerance = Math.max(fx.reach_meters, 1) * 0.25;
if (diff <= tolerance) {
score += 10; basis.push("reach");
} else {
score -= 15; // penalize mismatched reach
}
} else if (!fx.reach_meters && !cand.reach_meters) {
score += 5; basis.push("reach_null");
}
const confidence = Math.max(0, Math.min(1, score / 115));
if (confidence < 0.50) { skipped++; continue; }
const notes = `${fx.part_number}${cand.part_number} (${cand.vendor_name}) | ` +
`basis: ${basis.join(", ")} | reach: ${fx.reach_meters}m vs ${cand.reach_meters}m | ` +
`wavelength: ${fx.wavelengths||"?"} vs ${cand.wavelengths||"?"}`;
// Upsert equivalence candidate
const status = confidence >= 0.73 ? "auto_approved" : "pending";
await pool.query(`
INSERT INTO transceiver_equivalences
(flexoptix_id, competitor_id, confidence, match_basis, match_notes, status)
VALUES ($1, $2, $3, $4, $5, $6)
ON CONFLICT (flexoptix_id, competitor_id) DO UPDATE SET
confidence = EXCLUDED.confidence,
match_basis = EXCLUDED.match_basis,
match_notes = EXCLUDED.match_notes,
updated_at = NOW()
WHERE transceiver_equivalences.status NOT IN ('approved', 'rejected')
`, [fx.id, cand.competitor_id, confidence, basis, notes, status]);
if (confidence >= 0.73) {
// Auto-approve: set competitor_verified on the Flexoptix transceiver
await pool.query(`
UPDATE transceivers
SET competitor_verified = true,
competitor_verified_at = NOW()
WHERE id = $1 AND competitor_verified = false
`, [fx.id]);
autoApproved++;
} else {
queued++;
}
}
}
console.log(
`[find-equivalences] auto_approved: ${autoApproved}, ` +
`queued for review: ${queued}, skipped (low confidence): ${skipped}`
);
// After auto-approvals, rerun fully_verified check
if (autoApproved > 0) {
await pool.query(`
UPDATE transceivers
SET fully_verified = true,
fully_verified_at = COALESCE(fully_verified_at, NOW())
WHERE competitor_verified = true
AND price_verified = true
AND image_verified = true
AND details_verified = true
AND fully_verified = false
`);
}
});
// ── Re-research approved equivalences ────────────────────────────────────────
// Processes up to 200 approved equivalences per day that have re_research_due_at <= NOW().
// Re-runs the confidence check: if competitor still has recent prices and specs still match,
// the approval is confirmed (re_researched_at = NOW(), next check in 30 days).
// If confidence drops or competitor has no recent price: reverts to pending.
await boss.work("maintenance:re-research-equivalences", async () => {
const { pool } = await import("./utils/db");
const ts = new Date().toISOString();
console.log(`[${ts}] Running: Re-research approved equivalences`);
const batch = await pool.query(`
SELECT eq.id, eq.flexoptix_id, eq.competitor_id, eq.confidence,
fx.form_factor, fx.speed_gbps, fx.standard_name, fx.fiber_type,
fx.reach_meters, fx.wavelengths
FROM transceiver_equivalences eq
JOIN transceivers fx ON eq.flexoptix_id = fx.id
WHERE eq.status IN ('approved', 'auto_approved')
AND eq.re_research_due_at IS NOT NULL
AND eq.re_research_due_at <= NOW()
ORDER BY eq.re_research_due_at ASC
LIMIT 200
`);
let confirmed = 0;
let reverted = 0;
for (const eq of batch.rows) {
// Check if competitor still has a recent price observation
const priceCheck = await pool.query(`
SELECT COUNT(*) AS cnt
FROM price_observations
WHERE transceiver_id = $1 AND time > NOW() - INTERVAL '45 days'
`, [eq.competitor_id]);
const hasRecentPrice = parseInt(priceCheck.rows[0].cnt, 10) > 0;
if (!hasRecentPrice) {
// Competitor no longer carries this — revert to pending for manual review
await pool.query(`
UPDATE transceiver_equivalences
SET status = 'pending', re_research_due_at = NULL, re_researched_at = NULL,
match_notes = CONCAT(match_notes, E'\n[Re-research ' || NOW()::date || ': no recent price — reverted to pending]')
WHERE id = $1
`, [eq.id]);
// Reset competitor_verified if no other approved equivalence covers this transceiver
await pool.query(`
UPDATE transceivers
SET competitor_verified = false, competitor_verified_at = NULL,
fully_verified = false, fully_verified_at = NULL
WHERE id = $1
AND NOT EXISTS (
SELECT 1 FROM transceiver_equivalences
WHERE flexoptix_id = $1
AND status IN ('approved', 'auto_approved')
AND id != $2
)
`, [eq.flexoptix_id, eq.id]);
reverted++;
} else {
// Still valid — confirm and schedule next re-research in 30 days
await pool.query(`
UPDATE transceiver_equivalences
SET re_researched_at = NOW(),
re_research_due_at = NOW() + INTERVAL '30 days'
WHERE id = $1
`, [eq.id]);
confirmed++;
}
}
console.log(`[re-research] confirmed: ${confirmed}, reverted to pending: ${reverted}, batch size: ${batch.rows.length}`);
});
console.log("All workers registered (76 jobs, 24/7 continuous)");
} }

View File

@ -0,0 +1,292 @@
/**
* OEM Reference Price Scraper Mouser Electronics API
*
* Source: api.mouser.com (free REST API, no bot-detection)
* Target: Juniper, Cisco, Arista OEM transceiver PIDs already in our DB
* Stores: price_observations (marketplace='mouser', condition='new')
*
* API key: Free registration at mouser.com/api set MOUSER_API_KEY env var
* endpoint: POST https://api.mouser.com/api/v1.0/search/keyword
*
* Rate limit: 30 req/min on free tier 2s delay between requests
*
* Note: This file is intentionally named digikey.ts (task origin) but uses
* Mouser as the actual source since DigiKey + Arrow both require Playwright
* to bypass Cloudflare/Akamai. Mouser's free API returns the same data.
*/
import { pool, ensureVendor, upsertPriceObservation } from "../utils/db";
import { contentHash } from "../utils/hash";
const MOUSER_API_BASE = "https://api.mouser.com/api/v1.0";
const MOUSER_API_KEY = process.env["MOUSER_API_KEY"] ?? "";
const DELAY_MS = 2_100; // ≤ 30 req/min on free tier
// ── Types ─────────────────────────────────────────────────────────────────────
interface MouserPriceBreak {
Quantity: number;
Price: string; // e.g. "1,234.56" or "1234.56"
Currency: string; // e.g. "EUR"
}
interface MouserPart {
ManufacturerPartNumber: string;
MouserPartNumber: string;
Availability: string; // e.g. "117 auf Lager"
DataSheetUrl: string;
Description: string;
LeadTime: string; // e.g. "10 Weeks"
Min: string; // min order qty
ProductDetailUrl: string;
PriceBreaks: MouserPriceBreak[];
AvailabilityInStock: string;
}
interface MouserSearchResponse {
Errors: Array<{ Code: string; Message: string }>;
SearchResults: {
NumberOfResult: number;
Parts: MouserPart[];
} | null;
}
// ── Helpers ───────────────────────────────────────────────────────────────────
function sleep(ms: number): Promise<void> {
return new Promise((r) => setTimeout(r, ms));
}
/** Parse Mouser price string "1.234,56" (DE locale) or "1234.56" (US) */
function parseMouserPrice(raw: string, currency: string): number | null {
if (!raw || raw === "") return null;
// German locale uses comma decimal, dot thousands → "1.234,56"
// US locale uses dot decimal → "1234.56"
const cleaned = currency === "EUR"
? raw.replace(/\./g, "").replace(",", ".") // "1.234,56" → "1234.56"
: raw.replace(/,/g, ""); // "1,234.56" → "1234.56"
const n = parseFloat(cleaned);
return Number.isFinite(n) && n > 0 ? n : null;
}
/** Extract quantity from availability string like "117 auf Lager" or "117 In Stock" */
function parseAvailability(avail: string): { qty: number; stockLevel: string } {
if (!avail) return { qty: 0, stockLevel: "out_of_stock" };
const lower = avail.toLowerCase();
// Check for discontinued / not available
if (lower.includes("nicht verfügbar") || lower.includes("not available") || lower.includes("obsolete")) {
return { qty: 0, stockLevel: "discontinued" };
}
// Extract number
const match = avail.match(/(\d[\d,.]*)(?:\s|$)/);
const qty = match ? parseInt(match[1].replace(/[,.]/, ""), 10) : 0;
if (qty === 0) return { qty: 0, stockLevel: "out_of_stock" };
if (qty < 10) return { qty, stockLevel: "low_stock" };
return { qty, stockLevel: "in_stock" };
}
/** Get the 1-unit price (or lowest break price) in EUR */
function extractPrice(part: MouserPart): { price: number; currency: string } | null {
const breaks = part.PriceBreaks;
if (!breaks || breaks.length === 0) return null;
// Sort by quantity ascending, take qty=1 or first available
const sorted = [...breaks].sort((a, b) => a.Quantity - b.Quantity);
const first = sorted[0];
if (!first) return null;
const currency = (first.Currency ?? "EUR").toUpperCase();
const price = parseMouserPrice(first.Price, currency);
if (price === null) return null;
return { price, currency };
}
// ── API call ──────────────────────────────────────────────────────────────────
async function searchMouser(partNumber: string): Promise<MouserPart | null> {
if (!MOUSER_API_KEY) return null;
const url = `${MOUSER_API_BASE}/search/keyword?apiKey=${MOUSER_API_KEY}&langId=1&searchWithSapnningRows=false`;
let resp: Response;
try {
resp = await fetch(url, {
method: "POST",
headers: {
"Content-Type": "application/json",
Accept: "application/json",
},
body: JSON.stringify({
SearchByKeywordRequest: {
keyword: partNumber,
records: 5,
startingRecord: 0,
searchOptions: "1", // Exact match preferred
searchWithSapnningRows: false,
},
}),
signal: AbortSignal.timeout(15_000),
});
} catch (err: unknown) {
console.warn(` [Mouser] Fetch error for ${partNumber}: ${(err as Error).message.slice(0, 60)}`);
return null;
}
if (!resp.ok) {
if (resp.status === 429) {
console.warn(` [Mouser] Rate limited — backing off 30s`);
await sleep(30_000);
} else {
console.warn(` [Mouser] HTTP ${resp.status} for ${partNumber}`);
}
return null;
}
const data = (await resp.json()) as MouserSearchResponse;
if (data.Errors && data.Errors.length > 0) {
const errMsg = data.Errors.map((e) => e.Message).join("; ");
console.warn(` [Mouser] API error for ${partNumber}: ${errMsg.slice(0, 80)}`);
return null;
}
const parts = data.SearchResults?.Parts ?? [];
if (parts.length === 0) return null;
const norm = partNumber.toUpperCase().trim();
// Prefer exact MPN match
const exact = parts.find((p) => (p.ManufacturerPartNumber ?? "").toUpperCase().trim() === norm);
return exact ?? parts[0] ?? null;
}
// ── Main ──────────────────────────────────────────────────────────────────────
export async function scrapeDigikey(): Promise<void> {
console.log("=== OEM Reference Price Scraper (Mouser Electronics API) ===\n");
if (!MOUSER_API_KEY) {
console.error(
" ERROR: MOUSER_API_KEY not set.\n" +
" Register free at https://www.mouser.com/api-hub/ → get API key → set env var.\n" +
" Free tier: 1000 queries/month — enough for 475 Juniper PIDs."
);
return;
}
// Register Mouser as a vendor (distributor = reseller type)
const vendorId = await ensureVendor(
"Mouser Electronics",
"reseller",
"https://www.mouser.de",
"https://www.mouser.de/Search/Refine?Keyword="
);
console.log(` Vendor ID: ${vendorId}`);
// Load OEM transceiver PIDs
const TARGET_VENDORS = ["Juniper Networks", "Cisco Systems", "Arista Networks", "FS.COM", "SmartOptics"];
const { rows: transceivers } = await pool.query<{
id: string;
part_number: string;
form_factor: string;
speed: string;
vendor_name: string;
}>(
`SELECT t.id, t.part_number, t.form_factor, t.speed, v.name AS vendor_name
FROM transceivers t
JOIN vendors v ON v.id = t.vendor_id
WHERE v.name = ANY($1)
AND t.part_number IS NOT NULL
AND t.part_number NOT ILIKE '%Transceiver%'
AND t.part_number NOT ILIKE '%-Transceivers'
AND LENGTH(t.part_number) BETWEEN 4 AND 35
ORDER BY v.name, t.part_number`,
[TARGET_VENDORS]
);
console.log(` Found ${transceivers.length} OEM PIDs to price-check\n`);
let found = 0;
let notFound = 0;
let errors = 0;
for (let i = 0; i < transceivers.length; i++) {
const tx = transceivers[i];
if (i > 0 && i % 20 === 0) {
console.log(
` [${i}/${transceivers.length}] found=${found} not_found=${notFound} errors=${errors}`
);
}
const part = await searchMouser(tx.part_number);
if (!part) {
notFound++;
await sleep(DELAY_MS);
continue;
}
const priceData = extractPrice(part);
if (!priceData) {
notFound++;
await sleep(DELAY_MS);
continue;
}
const { qty, stockLevel } = parseAvailability(part.Availability);
const productUrl = part.ProductDetailUrl
? `https://www.mouser.de${part.ProductDetailUrl}`
: `https://www.mouser.de/Search/Refine?Keyword=${encodeURIComponent(tx.part_number)}`;
const hash = contentHash(
`mouser:${tx.id}:${priceData.price}:${priceData.currency}:${stockLevel}`
);
try {
await upsertPriceObservation({
transceiverId: tx.id,
sourceVendorId: vendorId,
price: priceData.price,
currency: priceData.currency,
stockLevel,
quantityAvailable: qty,
url: productUrl,
contentHash: hash,
});
found++;
console.log(
`${tx.part_number.padEnd(32)} ${priceData.currency} ${priceData.price.toFixed(2).padStart(9)} ${stockLevel.padEnd(13)} qty=${qty}`
);
} catch (err: unknown) {
errors++;
console.warn(
` ✗ DB error ${tx.part_number}: ${(err as Error).message.slice(0, 60)}`
);
}
await sleep(DELAY_MS);
}
console.log(`\n=== Mouser OEM Scraper Complete ===`);
console.log(` Processed: ${transceivers.length}`);
console.log(` Found: ${found}`);
console.log(` Not found: ${notFound}`);
console.log(` DB errors: ${errors}\n`);
}
// ── CLI ───────────────────────────────────────────────────────────────────────
if (require.main === module) {
scrapeDigikey()
.then(() => pool.end())
.catch((err: unknown) => {
console.error("Fatal:", err);
pool.end();
process.exit(1);
});
}