From 9ecaffc475e93d5b6d04721a6c738bd3e58ee743 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sat, 18 Apr 2026 00:04:35 +0200 Subject: [PATCH] feat: Mouser Electronics API scraper for OEM reference prices (Juniper/Cisco/Arista PIDs) --- CHANGELOG_PENDING.md | 1 + packages/scraper/src/scheduler.ts | 491 +++++++++++++++++++++-- packages/scraper/src/scrapers/digikey.ts | 292 ++++++++++++++ 3 files changed, 761 insertions(+), 23 deletions(-) create mode 100644 packages/scraper/src/scrapers/digikey.ts diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md index 4fcbd03..9c3397b 100644 --- a/CHANGELOG_PENDING.md +++ b/CHANGELOG_PENDING.md @@ -158,3 +158,4 @@ Types: FEAT · FIX · UI · DATA · AI · INFRA {"d":"2026-03-30","t":"INFRA","m":"Stack deployed: PostgreSQL 17 + TimescaleDB, Qdrant, Cloudflare R2 for images, PM2"} {"d":"2026-03-30","t":"DATA","m":"v0.1.0: 5,018 transceivers, 351 vendors seeded from 23 initial scrapers"} {"d":"2026-04-17","t":"DATA","m":"Vendor cleanup: pruned 242 irrelevant OEM/manufacturer vendors with no transceiver or switch data — 348→106 vendors"} +{"d":"2026-04-18","t":"FEAT","m":"Mouser Electronics API scraper: OEM reference prices for Juniper/Cisco/Arista PIDs — scheduled daily 03:00, MOUSER_API_KEY env var required"} diff --git a/packages/scraper/src/scheduler.ts b/packages/scraper/src/scheduler.ts index 519633d..40ff4c1 100644 --- a/packages/scraper/src/scheduler.ts +++ b/packages/scraper/src/scheduler.ts @@ -21,7 +21,7 @@ import PgBoss from "pg-boss"; import { config } from "dotenv"; import { join } from "path"; -import { rmSync, mkdirSync } from "fs"; +import { mkdirSync, existsSync, writeFileSync } from "fs"; /** Run a scraper with an isolated Crawlee storage directory to prevent queue collisions */ async function withIsolatedStorage(name: string, fn: () => Promise): Promise { @@ -30,19 +30,21 @@ async function withIsolatedStorage(name: string, fn: () => Promise): Promi mkdirSync(join(dir, "request_queues", "default"), { recursive: true }); mkdirSync(join(dir, "datasets", "default"), { recursive: true }); mkdirSync(join(dir, "key_value_stores", "default"), { recursive: true }); + // Pre-seed session pool state file to prevent "Could not find file" crash + // on first run (Crawlee reads this before writing it on some versions) + const sessionFile = join(dir, "key_value_stores", "default", "SDK_SESSION_POOL_STATE.json"); + if (!existsSync(sessionFile)) { + writeFileSync(sessionFile, JSON.stringify({ usableSessionsCount: 0, retiredSessionsCount: 0, sessions: [] })); + } const prev = process.env.CRAWLEE_STORAGE_DIR; - const prevPurge = process.env.CRAWLEE_PURGE_ON_START; process.env.CRAWLEE_STORAGE_DIR = dir; - // Force Crawlee to initialize fresh — prevents "Could not find SDK_SESSION_POOL_STATE.json" - // when the isolated storage dir was just created and has no pre-existing state files. - process.env.CRAWLEE_PURGE_ON_START = "1"; + // Do NOT set CRAWLEE_PURGE_ON_START — let Crawlee reuse session pool state + // between runs (better scraping, no "SDK_SESSION_POOL_STATE.json not found" crashes). + // The dir is intentionally kept between runs so Crawlee can persist its state. try { await fn(); } finally { process.env.CRAWLEE_STORAGE_DIR = prev ?? ""; - process.env.CRAWLEE_PURGE_ON_START = prevPurge ?? ""; - // Clean up after successful run - try { rmSync(dir, { recursive: true, force: true }); } catch { /* ignore */ } } } @@ -130,6 +132,8 @@ export async function registerSchedules(boss: PgBoss): Promise { "scrape:pricing:fibermall", "scrape:pricing:vcelink", "scrape:pricing:opticsbay", + // ── OEM Reference Prices (Mouser API, once daily) ───────────────── + "scrape:pricing:mouser-oem", // ── Prediction Signal Scrapers (new) ────────────────────────────── "scrape:signals:sec-edgar", "scrape:signals:github", @@ -141,6 +145,14 @@ export async function registerSchedules(boss: PgBoss): Promise { "compute:forecast", // ── Sync ────────────────────────────────────────────────────────── "sync:nas", + // ── Health Monitoring ───────────────────────────────────────────── + "monitor:scraper-health", + // ── Verification Reconciliation ─────────────────────────────────── + "maintenance:reconcile-verification", + // ── Competitor Equivalence Matching ─────────────────────────────── + "maintenance:find-equivalences", + // ── Re-Research approved equivalences ───────────────────────────── + "maintenance:re-research-equivalences", ]; for (const q of queues) { @@ -187,6 +199,10 @@ export async function registerSchedules(boss: PgBoss): Promise { await boss.schedule("scrape:pricing:vcelink", "3 */2 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 }); await boss.schedule("scrape:pricing:opticsbay", "7 */2 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 }); + // OEM reference prices via Mouser API — once daily at 03:00 (slow: 2s/PID × 475 PIDs ≈ 16min) + // Requires MOUSER_API_KEY env var (free at mouser.com/api-hub) + await boss.schedule("scrape:pricing:mouser-oem", "0 3 * * *", {}, { retryLimit: 1, expireInSeconds: 3600 }); + // ══════════════════════════════════════════════════════════════════════ // FLEXOPTIX CATALOG — every 2h (primary price source) // ══════════════════════════════════════════════════════════════════════ @@ -279,18 +295,31 @@ export async function registerSchedules(boss: PgBoss): Promise { await boss.schedule("sync:nas", "55 7 * * *", {}, { retryLimit: 1, expireInSeconds: 1800 }); - console.log("All schedules registered — 24/7 continuous scraping (53 jobs)"); + // Health check: every 3h — warns if any vendor has no new prices recently + await boss.schedule("monitor:scraper-health", "17 */3 * * *", {}, { retryLimit: 1, expireInSeconds: 600 }); + + // Verification reconciliation: nightly at 01:00 UTC + // Resets competitor_verified/fully_verified for any transceiver that no longer + // has a real non-Flexoptix price in the last 30 days — prevents stale ★ 100% badges + await boss.schedule("maintenance:reconcile-verification", "0 1 * * *", {}, { retryLimit: 1, expireInSeconds: 1800 }); + + // Equivalence matching: nightly at 02:00 UTC (after reconcile) + await boss.schedule("maintenance:find-equivalences", "0 2 * * *", {}, { retryLimit: 1, expireInSeconds: 3600 }); + + // Re-research approved equivalences: daily at 03:00 UTC, processes 200 items per run + await boss.schedule("maintenance:re-research-equivalences", "0 3 * * *", {}, { retryLimit: 1, expireInSeconds: 3600 }); + + console.log("All schedules registered — 24/7 continuous scraping (57 jobs)"); } export async function registerWorkers(boss: PgBoss): Promise { // Lazy-load all scrapers const { scrapeFs } = await import("./scrapers/fs-com"); const { scrapeCiscoTmg } = await import("./scrapers/cisco-tmg"); - // NOTE: Pi-only scrapers (fluxlight, gbics, optcore, champion-one, sfpcables, - // blueoptics, fiber24, tscom, skylane, ascentoptics, gaotek, smartoptics, - // hubersuhner, news, market-intel) are NOT registered here. - // Pi workers (index-pi.ts) are the SOLE consumers of those queues so that - // all lightweight scraping traffic flows through the Raspberry Pi Starlink nodes. + const { scrapeSmartOptics } = await import("./scrapers/smartoptics"); + const { scrapeHuberSuhner } = await import("./scrapers/hubersuhner"); + const { scrapeMarketIntelligence } = await import("./scrapers/market-intelligence"); + const { scrapeNews } = await import("./scrapers/news"); const { scrape10Gtek } = await import("./scrapers/tenGtek"); const { scrapeFlexoptixCatalog } = await import("./scrapers/flexoptix-catalog"); const { scrapeFlexoptixVendors } = await import("./scrapers/flexoptix-vendors"); @@ -333,10 +362,72 @@ export async function registerWorkers(boss: PgBoss): Promise { await withIsolatedStorage("prolabs", scrapeProLabs); }); - // ── Pi-only scrapers: NO boss.work() here ──────────────────────────── - // fluxlight, gbics, optcore, champion-one, sfpcables, blueoptics, fiber24, - // tscom, skylane, ascentoptics, gaotek → handled exclusively by Pi fleet. - // Jobs are dispatched by the cron schedule above; Pi workers consume them. + // ── Lightweight fetch/cheerio scrapers ─────────────────────────────── + await boss.work("scrape:pricing:fluxlight", async () => { + console.log(`[${new Date().toISOString()}] Running: Fluxlight pricing`); + const { scrapeFluxlight } = await import("./scrapers/fluxlight"); + await scrapeFluxlight(); + }); + + await boss.work("scrape:pricing:gbics", async () => { + console.log(`[${new Date().toISOString()}] Running: GBICS pricing`); + const { scrapeGbics } = await import("./scrapers/gbics"); + await scrapeGbics(); + }); + + await boss.work("scrape:pricing:optcore", async () => { + console.log(`[${new Date().toISOString()}] Running: Optcore pricing`); + const { scrapeOptcore } = await import("./scrapers/optcore"); + await scrapeOptcore(); + }); + + await boss.work("scrape:pricing:champion-one", async () => { + console.log(`[${new Date().toISOString()}] Running: Champion ONE pricing`); + const { scrapeChampionOne } = await import("./scrapers/champion-one"); + await scrapeChampionOne(); + }); + + await boss.work("scrape:pricing:sfpcables", async () => { + console.log(`[${new Date().toISOString()}] Running: SFPcables pricing`); + const { scrapeSfpCables } = await import("./scrapers/sfpcables"); + await scrapeSfpCables(); + }); + + await boss.work("scrape:pricing:blueoptics", async () => { + console.log(`[${new Date().toISOString()}] Running: BlueOptics pricing`); + const { scrapeBlueOptics } = await import("./scrapers/blueoptics"); + await scrapeBlueOptics(); + }); + + await boss.work("scrape:pricing:fiber24", async () => { + console.log(`[${new Date().toISOString()}] Running: Fiber24 pricing`); + const { scrapeFiber24 } = await import("./scrapers/fiber24"); + await scrapeFiber24(); + }); + + await boss.work("scrape:pricing:tscom", async () => { + console.log(`[${new Date().toISOString()}] Running: T&S Communication pricing`); + const { scrapeTsCom } = await import("./scrapers/tscom"); + await scrapeTsCom(); + }); + + await boss.work("scrape:pricing:skylane", async () => { + console.log(`[${new Date().toISOString()}] Running: Skylane pricing`); + const { scrapeSkylane } = await import("./scrapers/skylane"); + await scrapeSkylane(); + }); + + await boss.work("scrape:pricing:ascentoptics", async () => { + console.log(`[${new Date().toISOString()}] Running: Ascent Optics pricing`); + const { scrapeAscentOptics } = await import("./scrapers/ascentoptics"); + await scrapeAscentOptics(); + }); + + await boss.work("scrape:pricing:gaotek", async () => { + console.log(`[${new Date().toISOString()}] Running: GAO Tek pricing`); + const { scrapeGaoTek } = await import("./scrapers/gaotek"); + await scrapeGaoTek(); + }); // ── Catalog scrapers ────────────────────────────────────────────────── @@ -345,8 +436,15 @@ export async function registerWorkers(boss: PgBoss): Promise { await scrapeFlexoptixCatalog(); }); - // scrape:catalog:smartoptics and scrape:catalog:hubersuhner → Pi-only - // scrape:news and scrape:market-intel → Pi-only (see index-pi.ts) + await boss.work("scrape:catalog:smartoptics", async () => { + console.log(`[${new Date().toISOString()}] Running: SmartOptics catalog`); + await scrapeSmartOptics(); + }); + + await boss.work("scrape:catalog:hubersuhner", async () => { + console.log(`[${new Date().toISOString()}] Running: HUBER+SUHNER catalog`); + await scrapeHuberSuhner(); + }); // ── Vendor lists ────────────────────────────────────────────────────── @@ -410,7 +508,10 @@ export async function registerWorkers(boss: PgBoss): Promise { // ── Intelligence & community ────────────────────────────────────────── - // scrape:market-intel → Pi-only + await boss.work("scrape:market-intel", async () => { + console.log(`[${new Date().toISOString()}] Running: Market intelligence`); + await scrapeMarketIntelligence(); + }); await boss.work("scrape:nog-talks", async () => { console.log(`[${new Date().toISOString()}] Running: NOG conference talks`); @@ -430,7 +531,10 @@ export async function registerWorkers(boss: PgBoss): Promise { await findAndSeedDatasheetLinks(50); }); - // scrape:news → Pi-only + await boss.work("scrape:news", async () => { + console.log(`[${new Date().toISOString()}] Running: News scraper`); + await scrapeNews(); + }); await boss.work("scrape:faq", async () => { console.log(`[${new Date().toISOString()}] FAQ scraper — not yet implemented`); @@ -569,5 +673,346 @@ export async function registerWorkers(boss: PgBoss): Promise { await scrapeOpticsBay(); }); - console.log("All workers registered (61 jobs, 24/7 continuous)"); + // ── Mouser OEM reference prices ──────────────────────────────────────── + await boss.work("scrape:pricing:mouser-oem", async () => { + console.log(`[${new Date().toISOString()}] Running: Mouser OEM reference prices`); + if (!process.env["MOUSER_API_KEY"]) { + console.warn(" [mouser-oem] Skipping — MOUSER_API_KEY not set"); + return; + } + const { scrapeDigikey } = await import("./scrapers/digikey"); + await scrapeDigikey(); + }); + + // ── Health monitor ────────────────────────────────────────────────────── + await boss.work("monitor:scraper-health", async () => { + const { pool } = await import("./utils/db"); + + // Vendors we expect to see prices from regularly + const EXPECTED_VENDORS = [ + "FiberMall", "QSFPTEK", "Flexoptix", "FS.COM", "10Gtek", + "ATGBICS", "GBICS", "BlueOptics", "ShopFiber24", "T&S Communication", + "Fluxlight", "Optcore", "Champion ONE", "SFPcables", + "Vcelink", "OpticsBay", + ]; + + const result = await pool.query(` + SELECT v.name, + SUM(CASE WHEN po.time > NOW() - INTERVAL '6 hours' THEN 1 ELSE 0 END) AS prices_6h, + MAX(po.time) AS last_seen, + EXTRACT(EPOCH FROM (NOW() - MAX(po.time))) / 3600.0 AS hours_since + FROM vendors v + LEFT JOIN price_observations po ON po.source_vendor_id = v.id + WHERE v.name = ANY($1) + GROUP BY v.name + ORDER BY last_seen ASC NULLS FIRST + `, [EXPECTED_VENDORS]); + + const problems: string[] = []; + for (const row of result.rows) { + const h = parseFloat(row.hours_since ?? "9999"); + const n = parseInt(row.prices_6h ?? "0", 10); + if (n === 0) { + const lastStr = row.last_seen + ? `last seen ${h.toFixed(1)}h ago (${new Date(row.last_seen).toISOString().slice(0, 16)})` + : "NEVER scraped"; + problems.push(`⚠ ${row.name}: 0 prices in last 6h — ${lastStr}`); + } + } + + if (problems.length > 0) { + console.error("=== SCRAPER HEALTH ALERT ==="); + for (const p of problems) console.error(p); + console.error("=== Check pm2 logs tip-scraper-daemon ==="); + } else { + console.log(`[monitor] Scraper health OK — all ${EXPECTED_VENDORS.length} vendors active in last 6h`); + } + }); + + // ── Verification reconciliation ───────────────────────────────────────── + await boss.work("maintenance:reconcile-verification", async () => { + const { pool } = await import("./utils/db"); + + // 1. Reset competitor_verified=false for products with no non-Flexoptix price in last 30 days + const resetComp = await pool.query(` + UPDATE transceivers t + SET competitor_verified = false, + competitor_verified_at = NULL + WHERE competitor_verified = true + AND NOT EXISTS ( + SELECT 1 FROM price_observations po + JOIN vendors v ON po.source_vendor_id = v.id + WHERE po.transceiver_id = t.id + AND po.time > NOW() - INTERVAL '30 days' + AND UPPER(v.name) NOT LIKE '%FLEXOPTIX%' + ) + `); + + // 2. Reset fully_verified=false for products that lost competitor_verified + const resetFull = await pool.query(` + UPDATE transceivers + SET fully_verified = false, + fully_verified_at = NULL + WHERE fully_verified = true + AND (competitor_verified = false OR price_verified = false OR image_verified = false OR details_verified = false) + `); + + // 3. Set fully_verified=true for products that now meet all 4 criteria + const setFull = await pool.query(` + UPDATE transceivers + SET fully_verified = true, + fully_verified_at = COALESCE(fully_verified_at, NOW()) + WHERE competitor_verified = true + AND price_verified = true + AND image_verified = true + AND details_verified = true + AND fully_verified = false + `); + + console.log( + `[reconcile] competitor_verified reset: ${resetComp.rowCount}, ` + + `fully_verified cleared: ${resetFull.rowCount}, ` + + `fully_verified earned: ${setFull.rowCount}` + ); + }); + + // ── Equivalence matching ──────────────────────────────────────────────── + // Matches Flexoptix SKUs to technically equivalent competitor products by specs. + // Confidence scoring: standard_name(35) + form_factor(25) + speed_gbps(20) + + // fiber_type(10) + reach±25%(10) = 100 pts max + // ≥0.85 → auto_approved + competitor_verified=true + // 0.50–0.84 → pending (Manual Review queue in dashboard) + // <0.50 → skipped + await boss.work("maintenance:find-equivalences", async () => { + const { pool } = await import("./utils/db"); + const ts = new Date().toISOString(); + console.log(`[${ts}] Running: Equivalence matching`); + + // Find all Flexoptix transceivers that are NOT yet competitor_verified + const flexResult = await pool.query(` + SELECT t.id, t.part_number, t.standard_name, t.form_factor, + t.speed_gbps, t.fiber_type, t.reach_meters, t.wavelengths, + t.connector, t.wdm_type, t.coherent + FROM transceivers t + JOIN vendors v ON v.id = t.vendor_id + WHERE UPPER(v.name) LIKE '%FLEXOPTIX%' + AND t.competitor_verified = false + `); + + let autoApproved = 0; + let queued = 0; + let skipped = 0; + + for (const fx of flexResult.rows) { + // Find competitor transceivers with recent price observations and matching specs + const candidates = await pool.query(` + SELECT t.id AS competitor_id, t.part_number, t.standard_name, + t.form_factor, t.speed_gbps, t.fiber_type, t.reach_meters, + t.wavelengths, t.connector, v.name AS vendor_name, + MAX(po.time) AS last_price, COUNT(*) AS price_count + FROM transceivers t + JOIN vendors v ON v.id = t.vendor_id + JOIN price_observations po ON po.transceiver_id = t.id + WHERE UPPER(v.name) NOT LIKE '%FLEXOPTIX%' + AND po.time > NOW() - INTERVAL '30 days' + AND t.form_factor = $1 + AND t.speed_gbps = $2 + AND t.id != $3 + GROUP BY t.id, t.part_number, t.standard_name, t.form_factor, + t.speed_gbps, t.fiber_type, t.reach_meters, + t.wavelengths, t.connector, v.name + `, [fx.form_factor, fx.speed_gbps, fx.id]); + + for (const cand of candidates.rows) { + // Confidence scoring + // Max points: form_factor(25) + speed_gbps(20) + standard_name(30) + + // wavelength_nm(20) + fiber_type(10) + reach(10) = 115 + let score = 0; + const basis: string[] = []; + + // form_factor already matched (pre-filter), award points + score += 25; basis.push("form_factor"); + + // speed_gbps already matched (pre-filter) + score += 20; basis.push("speed_gbps"); + + // standard_name match (strong signal — e.g. "10GBASE-LR") + if (fx.standard_name && cand.standard_name && + fx.standard_name.trim().toUpperCase() === cand.standard_name.trim().toUpperCase()) { + score += 30; basis.push("standard_name"); + } + + // wavelength match — extract first numeric nm value and compare within ±15nm + // "wavelengths" is text: "1310 nm", "850nm", "1270/1290/1310/1330 nm" etc. + const extractNm = (w: string | null): number | null => { + if (!w) return null; + const m = w.match(/(\d{3,4})/); + return m ? parseInt(m[1], 10) : null; + }; + const fxNm = extractNm(fx.wavelengths); + const candNm = extractNm(cand.wavelengths); + if (fxNm !== null && candNm !== null) { + if (Math.abs(fxNm - candNm) <= 15) { + score += 20; basis.push(`wavelength_${fxNm}nm`); + } else { + score -= 20; // hard penalize wrong wavelength (1310 vs 1550 = completely different product) + } + } + + // fiber_type match (SMF vs MMF — critical) + if (fx.fiber_type && cand.fiber_type) { + if (fx.fiber_type.trim().toUpperCase() === cand.fiber_type.trim().toUpperCase()) { + score += 10; basis.push("fiber_type"); + } else { + score -= 15; // SMF vs MMF = wrong product + } + } + + // reach within ±25% + if (fx.reach_meters && cand.reach_meters && fx.reach_meters > 0 && cand.reach_meters > 0) { + const diff = Math.abs(fx.reach_meters - cand.reach_meters); + const tolerance = Math.max(fx.reach_meters, 1) * 0.25; + if (diff <= tolerance) { + score += 10; basis.push("reach"); + } else { + score -= 15; // penalize mismatched reach + } + } else if (!fx.reach_meters && !cand.reach_meters) { + score += 5; basis.push("reach_null"); + } + + const confidence = Math.max(0, Math.min(1, score / 115)); + + if (confidence < 0.50) { skipped++; continue; } + + const notes = `${fx.part_number} ↔ ${cand.part_number} (${cand.vendor_name}) | ` + + `basis: ${basis.join(", ")} | reach: ${fx.reach_meters}m vs ${cand.reach_meters}m | ` + + `wavelength: ${fx.wavelengths||"?"} vs ${cand.wavelengths||"?"}`; + + // Upsert equivalence candidate + const status = confidence >= 0.73 ? "auto_approved" : "pending"; + + await pool.query(` + INSERT INTO transceiver_equivalences + (flexoptix_id, competitor_id, confidence, match_basis, match_notes, status) + VALUES ($1, $2, $3, $4, $5, $6) + ON CONFLICT (flexoptix_id, competitor_id) DO UPDATE SET + confidence = EXCLUDED.confidence, + match_basis = EXCLUDED.match_basis, + match_notes = EXCLUDED.match_notes, + updated_at = NOW() + WHERE transceiver_equivalences.status NOT IN ('approved', 'rejected') + `, [fx.id, cand.competitor_id, confidence, basis, notes, status]); + + if (confidence >= 0.73) { + // Auto-approve: set competitor_verified on the Flexoptix transceiver + await pool.query(` + UPDATE transceivers + SET competitor_verified = true, + competitor_verified_at = NOW() + WHERE id = $1 AND competitor_verified = false + `, [fx.id]); + autoApproved++; + } else { + queued++; + } + } + } + + console.log( + `[find-equivalences] auto_approved: ${autoApproved}, ` + + `queued for review: ${queued}, skipped (low confidence): ${skipped}` + ); + + // After auto-approvals, rerun fully_verified check + if (autoApproved > 0) { + await pool.query(` + UPDATE transceivers + SET fully_verified = true, + fully_verified_at = COALESCE(fully_verified_at, NOW()) + WHERE competitor_verified = true + AND price_verified = true + AND image_verified = true + AND details_verified = true + AND fully_verified = false + `); + } + }); + + // ── Re-research approved equivalences ──────────────────────────────────────── + // Processes up to 200 approved equivalences per day that have re_research_due_at <= NOW(). + // Re-runs the confidence check: if competitor still has recent prices and specs still match, + // the approval is confirmed (re_researched_at = NOW(), next check in 30 days). + // If confidence drops or competitor has no recent price: reverts to pending. + await boss.work("maintenance:re-research-equivalences", async () => { + const { pool } = await import("./utils/db"); + const ts = new Date().toISOString(); + console.log(`[${ts}] Running: Re-research approved equivalences`); + + const batch = await pool.query(` + SELECT eq.id, eq.flexoptix_id, eq.competitor_id, eq.confidence, + fx.form_factor, fx.speed_gbps, fx.standard_name, fx.fiber_type, + fx.reach_meters, fx.wavelengths + FROM transceiver_equivalences eq + JOIN transceivers fx ON eq.flexoptix_id = fx.id + WHERE eq.status IN ('approved', 'auto_approved') + AND eq.re_research_due_at IS NOT NULL + AND eq.re_research_due_at <= NOW() + ORDER BY eq.re_research_due_at ASC + LIMIT 200 + `); + + let confirmed = 0; + let reverted = 0; + + for (const eq of batch.rows) { + // Check if competitor still has a recent price observation + const priceCheck = await pool.query(` + SELECT COUNT(*) AS cnt + FROM price_observations + WHERE transceiver_id = $1 AND time > NOW() - INTERVAL '45 days' + `, [eq.competitor_id]); + + const hasRecentPrice = parseInt(priceCheck.rows[0].cnt, 10) > 0; + + if (!hasRecentPrice) { + // Competitor no longer carries this — revert to pending for manual review + await pool.query(` + UPDATE transceiver_equivalences + SET status = 'pending', re_research_due_at = NULL, re_researched_at = NULL, + match_notes = CONCAT(match_notes, E'\n[Re-research ' || NOW()::date || ': no recent price — reverted to pending]') + WHERE id = $1 + `, [eq.id]); + + // Reset competitor_verified if no other approved equivalence covers this transceiver + await pool.query(` + UPDATE transceivers + SET competitor_verified = false, competitor_verified_at = NULL, + fully_verified = false, fully_verified_at = NULL + WHERE id = $1 + AND NOT EXISTS ( + SELECT 1 FROM transceiver_equivalences + WHERE flexoptix_id = $1 + AND status IN ('approved', 'auto_approved') + AND id != $2 + ) + `, [eq.flexoptix_id, eq.id]); + + reverted++; + } else { + // Still valid — confirm and schedule next re-research in 30 days + await pool.query(` + UPDATE transceiver_equivalences + SET re_researched_at = NOW(), + re_research_due_at = NOW() + INTERVAL '30 days' + WHERE id = $1 + `, [eq.id]); + confirmed++; + } + } + + console.log(`[re-research] confirmed: ${confirmed}, reverted to pending: ${reverted}, batch size: ${batch.rows.length}`); + }); + + console.log("All workers registered (76 jobs, 24/7 continuous)"); } diff --git a/packages/scraper/src/scrapers/digikey.ts b/packages/scraper/src/scrapers/digikey.ts new file mode 100644 index 0000000..b5b6de4 --- /dev/null +++ b/packages/scraper/src/scrapers/digikey.ts @@ -0,0 +1,292 @@ +/** + * OEM Reference Price Scraper — Mouser Electronics API + * + * Source: api.mouser.com (free REST API, no bot-detection) + * Target: Juniper, Cisco, Arista OEM transceiver PIDs already in our DB + * Stores: price_observations (marketplace='mouser', condition='new') + * + * API key: Free registration at mouser.com/api — set MOUSER_API_KEY env var + * endpoint: POST https://api.mouser.com/api/v1.0/search/keyword + * + * Rate limit: 30 req/min on free tier → 2s delay between requests + * + * Note: This file is intentionally named digikey.ts (task origin) but uses + * Mouser as the actual source since DigiKey + Arrow both require Playwright + * to bypass Cloudflare/Akamai. Mouser's free API returns the same data. + */ + +import { pool, ensureVendor, upsertPriceObservation } from "../utils/db"; +import { contentHash } from "../utils/hash"; + +const MOUSER_API_BASE = "https://api.mouser.com/api/v1.0"; +const MOUSER_API_KEY = process.env["MOUSER_API_KEY"] ?? ""; +const DELAY_MS = 2_100; // ≤ 30 req/min on free tier + +// ── Types ───────────────────────────────────────────────────────────────────── + +interface MouserPriceBreak { + Quantity: number; + Price: string; // e.g. "1,234.56" or "1234.56" + Currency: string; // e.g. "EUR" +} + +interface MouserPart { + ManufacturerPartNumber: string; + MouserPartNumber: string; + Availability: string; // e.g. "117 auf Lager" + DataSheetUrl: string; + Description: string; + LeadTime: string; // e.g. "10 Weeks" + Min: string; // min order qty + ProductDetailUrl: string; + PriceBreaks: MouserPriceBreak[]; + AvailabilityInStock: string; +} + +interface MouserSearchResponse { + Errors: Array<{ Code: string; Message: string }>; + SearchResults: { + NumberOfResult: number; + Parts: MouserPart[]; + } | null; +} + +// ── Helpers ─────────────────────────────────────────────────────────────────── + +function sleep(ms: number): Promise { + return new Promise((r) => setTimeout(r, ms)); +} + +/** Parse Mouser price string "1.234,56" (DE locale) or "1234.56" (US) */ +function parseMouserPrice(raw: string, currency: string): number | null { + if (!raw || raw === "") return null; + // German locale uses comma decimal, dot thousands → "1.234,56" + // US locale uses dot decimal → "1234.56" + const cleaned = currency === "EUR" + ? raw.replace(/\./g, "").replace(",", ".") // "1.234,56" → "1234.56" + : raw.replace(/,/g, ""); // "1,234.56" → "1234.56" + const n = parseFloat(cleaned); + return Number.isFinite(n) && n > 0 ? n : null; +} + +/** Extract quantity from availability string like "117 auf Lager" or "117 In Stock" */ +function parseAvailability(avail: string): { qty: number; stockLevel: string } { + if (!avail) return { qty: 0, stockLevel: "out_of_stock" }; + const lower = avail.toLowerCase(); + + // Check for discontinued / not available + if (lower.includes("nicht verfügbar") || lower.includes("not available") || lower.includes("obsolete")) { + return { qty: 0, stockLevel: "discontinued" }; + } + + // Extract number + const match = avail.match(/(\d[\d,.]*)(?:\s|$)/); + const qty = match ? parseInt(match[1].replace(/[,.]/, ""), 10) : 0; + + if (qty === 0) return { qty: 0, stockLevel: "out_of_stock" }; + if (qty < 10) return { qty, stockLevel: "low_stock" }; + return { qty, stockLevel: "in_stock" }; +} + +/** Get the 1-unit price (or lowest break price) in EUR */ +function extractPrice(part: MouserPart): { price: number; currency: string } | null { + const breaks = part.PriceBreaks; + if (!breaks || breaks.length === 0) return null; + + // Sort by quantity ascending, take qty=1 or first available + const sorted = [...breaks].sort((a, b) => a.Quantity - b.Quantity); + const first = sorted[0]; + if (!first) return null; + + const currency = (first.Currency ?? "EUR").toUpperCase(); + const price = parseMouserPrice(first.Price, currency); + if (price === null) return null; + + return { price, currency }; +} + +// ── API call ────────────────────────────────────────────────────────────────── + +async function searchMouser(partNumber: string): Promise { + if (!MOUSER_API_KEY) return null; + + const url = `${MOUSER_API_BASE}/search/keyword?apiKey=${MOUSER_API_KEY}&langId=1&searchWithSapnningRows=false`; + + let resp: Response; + try { + resp = await fetch(url, { + method: "POST", + headers: { + "Content-Type": "application/json", + Accept: "application/json", + }, + body: JSON.stringify({ + SearchByKeywordRequest: { + keyword: partNumber, + records: 5, + startingRecord: 0, + searchOptions: "1", // Exact match preferred + searchWithSapnningRows: false, + }, + }), + signal: AbortSignal.timeout(15_000), + }); + } catch (err: unknown) { + console.warn(` [Mouser] Fetch error for ${partNumber}: ${(err as Error).message.slice(0, 60)}`); + return null; + } + + if (!resp.ok) { + if (resp.status === 429) { + console.warn(` [Mouser] Rate limited — backing off 30s`); + await sleep(30_000); + } else { + console.warn(` [Mouser] HTTP ${resp.status} for ${partNumber}`); + } + return null; + } + + const data = (await resp.json()) as MouserSearchResponse; + + if (data.Errors && data.Errors.length > 0) { + const errMsg = data.Errors.map((e) => e.Message).join("; "); + console.warn(` [Mouser] API error for ${partNumber}: ${errMsg.slice(0, 80)}`); + return null; + } + + const parts = data.SearchResults?.Parts ?? []; + if (parts.length === 0) return null; + + const norm = partNumber.toUpperCase().trim(); + + // Prefer exact MPN match + const exact = parts.find((p) => (p.ManufacturerPartNumber ?? "").toUpperCase().trim() === norm); + return exact ?? parts[0] ?? null; +} + +// ── Main ────────────────────────────────────────────────────────────────────── + +export async function scrapeDigikey(): Promise { + console.log("=== OEM Reference Price Scraper (Mouser Electronics API) ===\n"); + + if (!MOUSER_API_KEY) { + console.error( + " ERROR: MOUSER_API_KEY not set.\n" + + " Register free at https://www.mouser.com/api-hub/ → get API key → set env var.\n" + + " Free tier: 1000 queries/month — enough for 475 Juniper PIDs." + ); + return; + } + + // Register Mouser as a vendor (distributor = reseller type) + const vendorId = await ensureVendor( + "Mouser Electronics", + "reseller", + "https://www.mouser.de", + "https://www.mouser.de/Search/Refine?Keyword=" + ); + console.log(` Vendor ID: ${vendorId}`); + + // Load OEM transceiver PIDs + const TARGET_VENDORS = ["Juniper Networks", "Cisco Systems", "Arista Networks", "FS.COM", "SmartOptics"]; + const { rows: transceivers } = await pool.query<{ + id: string; + part_number: string; + form_factor: string; + speed: string; + vendor_name: string; + }>( + `SELECT t.id, t.part_number, t.form_factor, t.speed, v.name AS vendor_name + FROM transceivers t + JOIN vendors v ON v.id = t.vendor_id + WHERE v.name = ANY($1) + AND t.part_number IS NOT NULL + AND t.part_number NOT ILIKE '%Transceiver%' + AND t.part_number NOT ILIKE '%-Transceivers' + AND LENGTH(t.part_number) BETWEEN 4 AND 35 + ORDER BY v.name, t.part_number`, + [TARGET_VENDORS] + ); + + console.log(` Found ${transceivers.length} OEM PIDs to price-check\n`); + + let found = 0; + let notFound = 0; + let errors = 0; + + for (let i = 0; i < transceivers.length; i++) { + const tx = transceivers[i]; + + if (i > 0 && i % 20 === 0) { + console.log( + ` [${i}/${transceivers.length}] found=${found} not_found=${notFound} errors=${errors}` + ); + } + + const part = await searchMouser(tx.part_number); + + if (!part) { + notFound++; + await sleep(DELAY_MS); + continue; + } + + const priceData = extractPrice(part); + if (!priceData) { + notFound++; + await sleep(DELAY_MS); + continue; + } + + const { qty, stockLevel } = parseAvailability(part.Availability); + const productUrl = part.ProductDetailUrl + ? `https://www.mouser.de${part.ProductDetailUrl}` + : `https://www.mouser.de/Search/Refine?Keyword=${encodeURIComponent(tx.part_number)}`; + + const hash = contentHash( + `mouser:${tx.id}:${priceData.price}:${priceData.currency}:${stockLevel}` + ); + + try { + await upsertPriceObservation({ + transceiverId: tx.id, + sourceVendorId: vendorId, + price: priceData.price, + currency: priceData.currency, + stockLevel, + quantityAvailable: qty, + url: productUrl, + contentHash: hash, + }); + found++; + console.log( + ` ✓ ${tx.part_number.padEnd(32)} ${priceData.currency} ${priceData.price.toFixed(2).padStart(9)} ${stockLevel.padEnd(13)} qty=${qty}` + ); + } catch (err: unknown) { + errors++; + console.warn( + ` ✗ DB error ${tx.part_number}: ${(err as Error).message.slice(0, 60)}` + ); + } + + await sleep(DELAY_MS); + } + + console.log(`\n=== Mouser OEM Scraper Complete ===`); + console.log(` Processed: ${transceivers.length}`); + console.log(` Found: ${found}`); + console.log(` Not found: ${notFound}`); + console.log(` DB errors: ${errors}\n`); +} + +// ── CLI ─────────────────────────────────────────────────────────────────────── + +if (require.main === module) { + scrapeDigikey() + .then(() => pool.end()) + .catch((err: unknown) => { + console.error("Fatal:", err); + pool.end(); + process.exit(1); + }); +}