From 5393f73c173f39e64d5bb53fdc8dc59edc081501 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Fri, 17 Apr 2026 22:54:40 +0200 Subject: [PATCH] feat: stock quality schema + QSFPTEK/NADDOD v2 scrapers with real-time stock counts - Migration 028 (retroactive): document warehouse columns added to stock_observations - Migration 037: composite indexes for DISTINCT ON (transceiver_id, source_vendor_id) queries - Migration 038: add stock_confidence (1/2/3), price_currency, price_includes_tax, stock_vendor_ts to stock_observations + TRUNCATE test-run data db.ts: upsertStockObservation now accepts stockConfidence, priceCurrency, priceIncludesTax, stockVendorTs; delta detection includes quantity_available fs-com.ts: passes stockConfidence=3 + priceCurrency=EUR + priceIncludesTax=false qsfptek.ts v2: Phase 1 API listing + Phase 2 detail-page stock extraction - Parses 'X in real-time stock, DATE' from product detail pages - Writes stock_observations with confidence=2 + stockVendorTs - Up to 500 detail pages/run at 2s rate limit naddod.ts v2: complete rewrite from WooCommerce to Astro sitemap-based - Discovers products via /sitemaps/products.xml (600+ products) - URL format: /products/XXXXX.html - Extracts 'In Stock: X' exact counts from SSR HTML - Writes both price + stock observations (confidence 1 or 2) --- CHANGELOG_PENDING.md | 12 + packages/scraper/src/scrapers/fs-com.ts | 34 +- packages/scraper/src/scrapers/naddod.ts | 455 ++++++++++-------- packages/scraper/src/scrapers/qsfptek.ts | 218 ++++++--- packages/scraper/src/utils/db.ts | 39 +- ...8-stock-observations-warehouse-columns.sql | 17 + sql/037-stock-observations-indexes.sql | 12 + ...038-stock-observations-quality-columns.sql | 22 + 8 files changed, 533 insertions(+), 276 deletions(-) create mode 100644 sql/028-stock-observations-warehouse-columns.sql create mode 100644 sql/037-stock-observations-indexes.sql create mode 100644 sql/038-stock-observations-quality-columns.sql diff --git a/CHANGELOG_PENDING.md 
b/CHANGELOG_PENDING.md index bd31a81..38f7b44 100644 --- a/CHANGELOG_PENDING.md +++ b/CHANGELOG_PENDING.md @@ -3,6 +3,18 @@ Format: `{"d":"YYYY-MM-DD","t":"TYPE","m":"Description"}` Types: FEAT · FIX · UI · DATA · AI · INFRA +{"d":"2026-04-17","t":"DATA","m":"Competitor research: QSFPTEK shows real-time aggregated stock count (e.g. '5507 in real-time stock, 17 Apr 2026') + USD prices; NADDOD shows exact per-product counts ('In Stock: 543') via Astro SSR. Both scraped publicly, no login required. Flexoptix confirmed exact Lagerbestand + EUR prices. FS.com: EUR prices yes, exact counts no."} +{"d":"2026-04-17","t":"DATA","m":"stock_observations selective cleanup + schema upgrade: TRUNCATE stock_observations (186 FS.com test-run rows cleared, will repopulate on next launchd run). Added 4 new quality columns via migration 038: stock_confidence (1=boolean/2=aggregated/3=per-warehouse), price_currency CHAR(3), price_includes_tax BOOLEAN, stock_vendor_ts TIMESTAMPTZ."} +{"d":"2026-04-17","t":"FEAT","m":"Migration 028 retroactively committed to repo (028-stock-observations-warehouse-columns.sql) — documents the 10 warehouse columns applied directly to Erik DB. Guards with IF NOT EXISTS for safe re-application."} +{"d":"2026-04-17","t":"FEAT","m":"upsertStockObservation upgraded: new optional params stockConfidence (1|2|3), priceCurrency (ISO 4217), priceIncludesTax (boolean), stockVendorTs (timestamptz). FS.com now writes stockConfidence=3+priceCurrency=EUR+priceIncludesTax=false. Delta detection now also checks quantity_available changes."} +{"d":"2026-04-17","t":"FEAT","m":"QSFPTEK scraper v2: Phase 1 uses existing /mall/commodity/list API for product catalog (880+ products from sitemap). Phase 2 fetches /en/product/XXXXX.html detail pages to extract 'X in real-time stock, DATE' — writes stock_observations with stockConfidence=2 + stockVendorTs. 
Up to 500 detail pages per run at 2s rate limit."} +{"d":"2026-04-17","t":"FEAT","m":"NADDOD scraper v2: complete rewrite — migrated from WooCommerce category scraping to Astro sitemap-based discovery (/sitemaps/products.xml, /products/XXXXX.html). Extracts 'In Stock: X' exact counts from server-rendered HTML. Writes both price_observations (USD) and stock_observations (stockConfidence=1 or 2 depending on data visibility)."} +{"d":"2026-04-17","t":"DATA","m":"FS.com first warehouse data load: 268 products scraped, 186 stock_observations written — DE-Lager 128,428 units, Global-Lager 156,052 units, Backorder 37,495, 53.4M units sold total. Top seller: SFP-10GSR-85 with 14M units sold."} +{"d":"2026-04-17","t":"FIX","m":"upsertStockObservation: skip condition now includes backorder_qty — backorder-only products (DE=0 GL=0 BO>0) like coherent ZR/ZRH were silently dropped instead of being recorded"} +{"d":"2026-04-17","t":"FIX","m":"FS.com price extraction: broad fallback regex now only accepts prices >€100 to reject FS.com's €79 'Preis auf Anfrage' placeholder — prevents fake price observations on 1G/10G/25G/40G/100G transceivers"} +{"d":"2026-04-17","t":"UI","m":"Dashboard: stock observations count in header stats bar + warehouse stock summary card in Overview tab (hidden until stock_observations populated); both driven by /api/health stock block"} +{"d":"2026-04-17","t":"FEAT","m":"Health API: /api/health now includes stock block — total_observations, transceivers_with_stock, vendors_with_stock, total_de_qty, total_global_qty, last_observation_at from stock_observations"} +{"d":"2026-04-17","t":"INFRA","m":"FS.com Mac-side runner: launchd plist at 02:00/10:00/18:00 + run-fs-scraper-mac.sh via SSH tunnel to Erik DB port 5433 — residential IP required, datacenter IP blocked by FS.com Cloudflare WAF"} {"d":"2026-04-17","t":"FEAT","m":"Stock API: GET /api/stock, /api/stock/summary, /api/stock/:id — warehouse breakdowns (DE-Lager, Global-Lager, Nachlieferung, units_sold) 
per transceiver/vendor"} {"d":"2026-04-17","t":"DATA","m":"upsertStockObservation() in db.ts — writes 10 new stock_observations columns (warehouse_de_qty, warehouse_global_qty, backorder_qty, units_sold, compatible_brands, price_net, product_url, delivery dates)"} {"d":"2026-04-17","t":"DATA","m":"FS.com scraper v2: Playwright-based, extracts DE-Lager + Global-Lager + Nachlieferung + Verkauft counts, German number/date parsing, 120-URL pre-queue, 12-category crawl, 12h dedup window"} diff --git a/packages/scraper/src/scrapers/fs-com.ts b/packages/scraper/src/scrapers/fs-com.ts index 13af993..6744d3e 100644 --- a/packages/scraper/src/scrapers/fs-com.ts +++ b/packages/scraper/src/scrapers/fs-com.ts @@ -13,7 +13,7 @@ * * Respects robots.txt and rate limits (≤12 req/min listing, ≤10 req/min detail). */ -import { PlaywrightCrawler, ProxyConfiguration } from "crawlee"; +import { PlaywrightCrawler, ProxyConfiguration, purgeDefaultStorages } from "crawlee"; import type { Page } from "playwright"; /** Apply stealth patches to evade Cloudflare TLS/bot fingerprinting */ @@ -268,6 +268,9 @@ interface ProductDetail extends ProductSummary { async function collectProductUrls( proxyConfiguration: ProxyConfiguration | undefined ): Promise> { + // Purge any leftover Crawlee storage from previous runs + await purgeDefaultStorages(); + const products = new Map(); const exhausted = new Set(); @@ -359,6 +362,8 @@ async function scrapeProductDetails( requests: Array<{ url: string; userData: { name: string; partNumber: string } }>, proxyConfiguration: ProxyConfiguration | undefined ): Promise { + // Purge Phase 1 storage so Phase 2 starts with a clean request queue + await purgeDefaultStorages(); const details: ProductDetail[] = []; const crawler = new PlaywrightCrawler({ @@ -474,21 +479,36 @@ async function scrapeProductDetails( const t = raw.bodyText; // ── Net price (ohne MwSt, EUR) ───────────────────────────────────────── + // Priority: patterns that require "ohne MwSt" or "netto" 
qualifier (FS.com shows + // real prices this way). Fallback broad patterns are only accepted above €100 + // to avoid matching FS.com's €79 placeholder/template price. let priceNet: number | undefined; - for (const pat of [ + const PRICE_QUALIFIED: RegExp[] = [ /([0-9]{1,5}[,.]?[0-9]{0,3})\s*€\s*\(?ohne\s+MwSt\.?\)?/i, /€\s*([0-9]{1,5}[,.]?[0-9]{0,3})\s*\(?ohne\s+MwSt\.?\)?/i, /([0-9]{1,5}[,.]?[0-9]{0,3})\s*€\s*netto/i, /Netto[:\s]*€?\s*([0-9]{1,5}[,.]?[0-9]{0,3})/i, - /([0-9]{1,5},[0-9]{2})\s*€/, - /€\s*([0-9]{1,5},[0-9]{2})/, - ]) { + // DOM-extracted price element (set by page.evaluate in raw) + /Preis[:\s]*([0-9]{1,5},[0-9]{2})\s*€/i, + /([0-9]{1,5},[0-9]{2})\s*€\s*(?:exkl\.|exklusive|excl\.)/i, + ]; + for (const pat of PRICE_QUALIFIED) { const m = t.match(pat); if (m?.[1]) { const p = parseGermanPrice(m[1]); if (p && p > 0.5 && p < 500_000) { priceNet = p; break; } } } + // Broad fallback — only accept if price > €100 (avoids FS.com's €79 placeholder) + if (!priceNet) { + for (const pat of [/([0-9]{1,5},[0-9]{2})\s*€/, /€\s*([0-9]{1,5},[0-9]{2})/]) { + const m = t.match(pat); + if (m?.[1]) { + const p = parseGermanPrice(m[1]); + if (p && p > 100 && p < 500_000) { priceNet = p; break; } + } + } + } // ── DE-Lager ─────────────────────────────────────────────────────────── let deQty: number | undefined; @@ -745,6 +765,10 @@ export async function scrapeFs(): Promise { compatibleBrands: detail.compatibleBrands, priceNet: detail.priceNet, productUrl: detail.url, + // FS.com: per-warehouse breakdown (DE-Lager + Global-Lager), EUR net prices + stockConfidence: 3, + priceCurrency: "EUR", + priceIncludesTax: false, }); if (stockNew) stockWritten++; diff --git a/packages/scraper/src/scrapers/naddod.ts b/packages/scraper/src/scrapers/naddod.ts index 84ede6e..20c11cc 100644 --- a/packages/scraper/src/scrapers/naddod.ts +++ b/packages/scraper/src/scrapers/naddod.ts @@ -1,56 +1,69 @@ /** - * NADDOD Scraper — Chinese compatible transceiver vendor + * NADDOD Scraper v2 — 
Chinese compatible transceiver vendor * - * naddod.com — WooCommerce store, server-rendered HTML, USD pricing. - * Products listed under product category pages. - * Pagination via /page/N/. Rate limited: 1 req/2sec. + * naddod.com — Migrated from WooCommerce → Astro/Shopify-style in 2025. + * Product URLs: /products/XXXXX.html (numeric IDs, not category slugs) + * Sitemap: /sitemaps/products.xml * - * NADDOD (Shenzhen NADDOD Information Co.) makes and sells compatible - * optics for Cisco, Juniper, Arista, etc. Transparent USD pricing. + * Phase 1: Parse sitemap to collect all product URLs (plain HTTP) + * Phase 2: Fetch product detail pages — extract name, price, stock count + * Stock format: "In Stock: 543" (exact) | "In Stock: 2.1k+" (rounded) | "In Stock: Available" (boolean) + * Per-warehouse JSON (warehouse_stock: {us, nl, sg, cn}) is in a JS hydration + * payload that requires JS execution — only the display count is in plain HTML. + * → stock_confidence=2 (aggregated global count) for exact/rounded counts + * → stock_confidence=1 (boolean) for "Available" only + * + * Rate limited: 1 req/2sec. 
*/ -import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db"; +import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, upsertStockObservation } from "../utils/db"; import { contentHash } from "../utils/hash"; const BASE = "https://www.naddod.com"; +const SITEMAP_URL = `${BASE}/sitemaps/products.xml`; const HEADERS = { "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)", Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.9", }; -const MAX_PAGES = 30; - -const CATEGORIES = [ - { path: "/product-category/1g-sfp-transceivers/", formFactor: "SFP", speed: "1G", speedGbps: 1 }, - { path: "/product-category/10g-sfp-transceivers/", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, - { path: "/product-category/25g-sfp28-transceivers/", formFactor: "SFP28", speed: "25G", speedGbps: 25 }, - { path: "/product-category/40g-qsfp-transceivers/", formFactor: "QSFP+", speed: "40G", speedGbps: 40 }, - { path: "/product-category/100g-qsfp28-transceivers/", formFactor: "QSFP28", speed: "100G", speedGbps: 100 }, - { path: "/product-category/200g-qsfp56-transceivers/", formFactor: "QSFP56", speed: "200G", speedGbps: 200 }, - { path: "/product-category/400g-qsfp-dd-transceivers/", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }, - { path: "/product-category/800g-osfp-transceivers/", formFactor: "OSFP", speed: "800G", speedGbps: 800 }, - { path: "/product-category/transceivers/", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, -]; - -interface Product { - partNumber: string; - name: string; - url: string; - price?: number; - formFactor: string; - speed: string; - speedGbps: number; - reachLabel?: string; - reachMeters?: number; - fiberType?: string; - wavelength?: string; - compatibleWith?: string; -} +// Limit detail-page fetches per run to stay reasonable +const MAX_DETAIL_PAGES = 600; function sleep(ms: number): Promise { return new 
Promise((resolve) => setTimeout(resolve, ms)); } +// ── Classification helpers ────────────────────────────────────────────────── + +function detectFormFactor(text: string): string { + const t = text.toLowerCase(); + if (/\bosfp\b/.test(t)) return "OSFP"; + if (/\bqsfp.?dd800\b|\bqsfp.?dd\s+800\b/.test(t)) return "QSFP-DD800"; + if (/\bqsfp.?dd\b/.test(t)) return "QSFP-DD"; + if (/\bqsfp56\b/.test(t)) return "QSFP56"; + if (/\bqsfp112\b/.test(t)) return "QSFP112"; + if (/\bqsfp28\b/.test(t)) return "QSFP28"; + if (/\bqsfp\+|\bqsfp\s*plus\b/.test(t)) return "QSFP+"; + if (/\bsfp28\b/.test(t)) return "SFP28"; + if (/\bsfp.?\+|10g.*sfp|sfp.*10g/.test(t)) return "SFP+"; + if (/\bsfp\b/.test(t)) return "SFP"; + if (/\bxfp\b/.test(t)) return "XFP"; + return "SFP+"; // default +} + +function detectSpeedGbps(text: string): { speed: string; speedGbps: number } { + const t = text.toUpperCase(); + if (/\b800G\b|\b800GBE\b/.test(t)) return { speed: "800G", speedGbps: 800 }; + if (/\b400G\b|\b400GBE\b/.test(t)) return { speed: "400G", speedGbps: 400 }; + if (/\b200G\b|\b200GBE\b/.test(t)) return { speed: "200G", speedGbps: 200 }; + if (/\b100G\b|\b100GBE\b/.test(t)) return { speed: "100G", speedGbps: 100 }; + if (/\b40G\b|\b40GBE\b/.test(t)) return { speed: "40G", speedGbps: 40 }; + if (/\b25G\b|\b25GBE\b/.test(t)) return { speed: "25G", speedGbps: 25 }; + if (/\b10G\b|\b10GBE\b/.test(t)) return { speed: "10G", speedGbps: 10 }; + if (/\b1G\b|\b1GBE\b|\bGIGABIT\b/.test(t)) return { speed: "1G", speedGbps: 1 }; + return { speed: "Unknown", speedGbps: 0 }; +} + function detectReach(text: string): { label: string; meters: number } | undefined { const patterns: [RegExp, string, number][] = [ [/\b120\s*km\b/i, "120km", 120000], @@ -65,16 +78,13 @@ function detectReach(text: string): { label: string; meters: number } | undefine [/\b300\s*m\b/i, "300m", 300], [/\b150\s*m\b/i, "150m", 150], [/\b100\s*m\b/i, "100m", 100], - [/\bLR4\b/, "10km", 10000], - [/\bLR\b/, "10km", 10000], - 
[/\bER4?\b/, "40km", 40000], - [/\bZR4?\b/, "80km", 80000], - [/\bSR4?\b/, "300m", 300], - [/\bDR4?\b/, "500m", 500], + [/\bLR4\b/, "10km", 10000], [/\bLR\b/, "10km", 10000], + [/\bER4?\b/, "40km", 40000], [/\bZR4?\b/, "80km", 80000], + [/\bSR4?\b/, "300m", 300], [/\bDR4?\b/, "500m", 500], [/\bFR4?\b/, "2km", 2000], ]; - for (const [regex, label, meters] of patterns) { - if (regex.test(text)) return { label, meters }; + for (const [re, label, meters] of patterns) { + if (re.test(text)) return { label, meters }; } return undefined; } @@ -87,195 +97,230 @@ function detectFiber(text: string): string { } function detectWavelength(text: string): string { - const match = text.match(/(\d{3,4})\s*nm/i); - return match ? match[1] : ""; + const m = text.match(/(\d{3,4})\s*nm/i); + return m ? m[1] : ""; } -function extractCompatibleVendor(name: string): string { - const brands = ["Cisco", "Juniper", "Arista", "HPE", "Dell", "Brocade", "Extreme", "Huawei", - "Nokia", "MikroTik", "Mellanox", "Nvidia", "Ubiquiti"]; - for (const brand of brands) { - if (new RegExp(`\\b${brand}\\b`, "i").test(name)) return brand; - } - const match = name.match(/(?:for\s+|compatible\s+(?:with\s+)?)([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)/); - return match ? 
match[1] : ""; +function isTransceiver(name: string): boolean { + const lower = name.toLowerCase(); + // Include: SFP, QSFP, OSFP, XFP, DAC/AOC cables count as transceivers for stock tracking + return /sfp|qsfp|osfp|xfp|transceiver|dac|aoc|cwdm|dwdm/i.test(lower); } -function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] { - const products: Product[] = []; - const seen = new Set(); - const collapsed = html.replace(/\s+/g, " "); +// ── Stock parsing ─────────────────────────────────────────────────────────── - // Strategy 1: WooCommerce standard product loop - const cardRegex = /]+class="[^"]*product[^"]*"[^>]*>([\s\S]*?)<\/li>/gi; - let cardMatch; - while ((cardMatch = cardRegex.exec(collapsed)) !== null) { - const card = cardMatch[1]; +/** + * Parse NADDOD stock display text. + * "In Stock: 543" → { qty: 543, confidence: 2 } + * "In Stock: 2.1k+" → { qty: 2100, confidence: 2 } (approximate, rounded) + * "In Stock: Available" → { qty: undefined, confidence: 1 } + * Returns null if no stock text found. + */ +function parseStockText(html: string): { qty?: number; confidence: 1 | 2 } | null { + // Look for "In Stock: X" pattern in page text + const m = html.match(/In\s+Stock[:\s]+([^\s<"]+)/i); + if (!m) return null; - const urlMatch = card.match(/href="(https?:\/\/(?:www\.)?naddod\.com\/product\/[^"]+)"/i); - if (!urlMatch) continue; - const url = urlMatch[1]; - if (seen.has(url)) continue; - seen.add(url); + const raw = m[1].trim().toLowerCase(); - const nameMatch = card.match(/woocommerce-loop-product__title[^>]*>([^<]+)]*>([^<]{10,})<\/h2>/i) || - card.match(/]*>([^<]{10,})<\/h3>/i); - if (!nameMatch) continue; - const name = nameMatch[1].trim().replace(/&/g, "&").replace(/–/g, "–"); - if (name.length < 5) continue; + // "Available" = boolean only + if (/^avail/i.test(raw)) return { confidence: 1 }; - const priceMatch = card.match(/\$\s*([\d,]+\.?\d*)/); - const price = priceMatch ? 
parseFloat(priceMatch[1].replace(/,/g, "")) : undefined; - - const reach = detectReach(name); - const partNumber = name.split(/\s+(?:compatible|for|sfp|qsfp)/i)[0]?.trim().slice(0, 80) || name.slice(0, 60); - - products.push({ - partNumber, name, url, - price: price && price > 0 && price < 100000 ? price : undefined, - formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps, - reachLabel: reach?.label, reachMeters: reach?.meters, - fiberType: detectFiber(name), wavelength: detectWavelength(name), - compatibleWith: extractCompatibleVendor(name), - }); + // Numeric: "543" or "2.1k+" or "1.5k+" + const kMatch = raw.match(/^([\d.]+)k\+?$/); + if (kMatch) { + const qty = Math.round(parseFloat(kMatch[1]) * 1000); + return { qty: isNaN(qty) ? undefined : qty, confidence: 2 }; } - // Strategy 2: Generic product link fallback - if (products.length === 0) { - const linkRegex = /href="(https?:\/\/(?:www\.)?naddod\.com\/(?:product|shop)\/[^"?#]+)"[^>]*>\s*([^<]{10,})/gi; - let m; - while ((m = linkRegex.exec(collapsed)) !== null) { - const url = m[1]; - const name = m[2].trim().replace(/&/g, "&"); - if (seen.has(url) || name.length < 10) continue; - if (!/transceiver|sfp|qsfp|osfp|dac|aoc|xfp/i.test(name)) continue; - seen.add(url); + const exact = parseInt(raw.replace(/[^0-9]/g, ""), 10); + if (!isNaN(exact) && exact >= 0) return { qty: exact, confidence: 2 }; - const ctx = collapsed.slice(Math.max(0, m.index - 200), m.index + 500); - const priceM = ctx.match(/\$\s*([\d,]+\.?\d*)/); - const price = priceM ? parseFloat(priceM[1].replace(/,/g, "")) : undefined; - const reach = detectReach(name); - - products.push({ - partNumber: name.split(/\s+/)[0]?.slice(0, 80) || "", - name, url, - price: price && price > 0 && price < 100000 ? 
price : undefined, - formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps, - reachLabel: reach?.label, reachMeters: reach?.meters, - fiberType: detectFiber(name), wavelength: detectWavelength(name), - compatibleWith: extractCompatibleVendor(name), - }); - } - } - - return products; + return { confidence: 1 }; // fallback: boolean } -async function fetchPage(url: string): Promise { +// ── HTTP helpers ───────────────────────────────────────────────────────────── + +async function fetchText(url: string): Promise { const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(30000) }); - if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`); + if (!resp.ok) throw new Error(`HTTP ${resp.status}`); return resp.text(); } +/** + * Parse product detail page HTML → extract name, price, stock. + */ +function parseDetailPage(html: string, url: string): { + name: string; + price?: number; + stock: { qty?: number; confidence: 1 | 2 } | null; +} | null { + // Product name: og:title or <title> or <h1> + const ogTitle = html.match(/<meta\s+property="og:title"\s+content="([^"]+)"/i)?.[1]; + const h1 = html.match(/<h1[^>]*>([^<]{15,})<\/h1>/i)?.[1]?.trim(); + const titleTag = html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim(); + const name = (ogTitle || h1 || titleTag || "").replace(/\s+/g, " ").slice(0, 200); + + if (!name || name.length < 10) return null; + if (!isTransceiver(name)) return null; + + // Price: "US$ 10.90" or "$10.90" + const priceMatch = html.match(/US\$\s*([\d,]+\.?\d{0,2})/i) || + html.match(/\$\s*([\d,]+\.\d{2})\b/); + const price = priceMatch ? parseFloat(priceMatch[1].replace(/,/g, "")) : undefined; + + // Stock count + const stock = parseStockText(html); + + return { name, price: price && price > 0 && price < 500000 ? 
price : undefined, stock }; +} + +// ── Sitemap parsing ───────────────────────────────────────────────────────── + +async function fetchProductUrlsFromSitemap(): Promise<string[]> { + console.log(` Fetching sitemap: ${SITEMAP_URL}`); + const xml = await fetchText(SITEMAP_URL); + + // Extract all <loc> URLs that match /products/XXXXX.html + const urls: string[] = []; + const locRegex = /<loc>([^<]+\/products\/\d+\.html)<\/loc>/gi; + let m: RegExpExecArray | null; + while ((m = locRegex.exec(xml)) !== null) { + const url = m[1].trim(); + // Keep only canonical English URLs (no language prefix) + if (!url.includes("/en/") && !url.includes("/de/") && !url.includes("/fr/")) { + urls.push(url); + } + } + + return [...new Set(urls)]; // deduplicate +} + +// ── Main scraper ──────────────────────────────────────────────────────────── + export async function scrapeNaddod(): Promise<void> { - console.log("=== NADDOD Scraper Starting ===\n"); + console.log("=== NADDOD Scraper v2 Starting (sitemap + detail mode) ===\n"); const vendorId = await ensureVendor( "NADDOD", "compatible", "https://www.naddod.com", - "https://www.naddod.com/product-category/transceivers/", + "https://www.naddod.com/collections/transceivers", ); - let totalProducts = 0; - let priceUpdates = 0; - const seenCategories = new Set<string>(); - - for (const cat of CATEGORIES) { - console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`); - - try { - const html1 = await fetchPage(BASE + cat.path); - const catProducts = parseProductList(html1, cat); - - if (cat.path.includes("/transceivers/") && seenCategories.size > 3) { - console.log(` Skipping generic fallback (${seenCategories.size} specific categories scraped)`); - continue; - } - - if (catProducts.length === 0) { - console.log(" No products on page 1 — skipping"); - continue; - } - - seenCategories.add(cat.path); - console.log(` Found ${catProducts.length} products on page 1`); - - const totalPagesMatch = 
html1.match(/page-numbers[^>]*>(\d+)<\/a>(?!.*page-numbers)/); - const totalPages = totalPagesMatch ? Math.min(parseInt(totalPagesMatch[1]), MAX_PAGES) : 1; - console.log(` Total pages: ${totalPages}`); - - const allProducts = [...catProducts]; - - for (let page = 2; page <= totalPages; page++) { - await sleep(2000); - try { - const html = await fetchPage(BASE + cat.path + `page/${page}/`); - const pageProds = parseProductList(html, cat); - if (pageProds.length === 0) break; - allProducts.push(...pageProds); - console.log(` Page ${page}: ${pageProds.length} products`); - } catch (err) { - console.warn(` Page ${page} failed: ${(err as Error).message.slice(0, 60)}`); - break; - } - } - - const uniqueProducts = allProducts.filter((p, i, arr) => arr.findIndex((x) => x.url === p.url) === i); - console.log(` Total unique: ${uniqueProducts.length}`); - - for (const product of uniqueProducts) { - try { - const txId = await findOrCreateScrapedTransceiver({ - partNumber: product.partNumber, - vendorId, - formFactor: product.formFactor, - speedGbps: product.speedGbps, - speed: product.speed, - reachMeters: product.reachMeters, - reachLabel: product.reachLabel, - fiberType: product.fiberType, - wavelengths: product.wavelength, - category: "DataCenter", - }); - - if (product.price && product.price > 0) { - const hash = contentHash({ price: product.price, part: product.partNumber }); - const updated = await upsertPriceObservation({ - transceiverId: txId, - sourceVendorId: vendorId, - price: product.price, - currency: "USD", - stockLevel: "in_stock", - url: product.url, - contentHash: hash, - }); - if (updated) priceUpdates++; - } - totalProducts++; - } catch (err) { - console.warn(` DB error: ${(err as Error).message.slice(0, 80)}`); - } - } - } catch (err) { - console.error(` Category failed: ${(err as Error).message}`); - } - - await sleep(2000); + // ── Phase 1: Discover product URLs via sitemap ──────────────────────────── + console.log("[Phase 1] Discovering products from 
sitemap..."); + let productUrls: string[] = []; + try { + productUrls = await fetchProductUrlsFromSitemap(); + console.log(` Found ${productUrls.length} product URLs in sitemap`); + } catch (err) { + console.error(` Sitemap fetch failed: ${(err as Error).message}`); + return; } - console.log(`\n=== NADDOD Complete: ${totalProducts} products, ${priceUpdates} price updates ===`); + if (productUrls.length === 0) { + console.warn(" No product URLs found — aborting"); + return; + } + + // Limit to avoid excessive runtime + const urls = productUrls.slice(0, MAX_DETAIL_PAGES); + console.log(` Processing ${urls.length} products (limit: ${MAX_DETAIL_PAGES})`); + + // ── Phase 2: Fetch detail pages + write to DB ───────────────────────────── + console.log("\n[Phase 2] Fetching product detail pages..."); + + let processed = 0; + let priceUpdates = 0; + let stockWritten = 0; + let stockSkipped = 0; + let skippedNonTx = 0; + let errors = 0; + + for (const url of urls) { + await sleep(2000); + try { + const html = await fetchText(url); + const detail = parseDetailPage(html, url); + + if (!detail) { + skippedNonTx++; + continue; + } + + const { name, price, stock } = detail; + const { speed, speedGbps } = detectSpeedGbps(name); + const formFactor = detectFormFactor(name); + const reach = detectReach(name); + const fiberType = detectFiber(name); + const wavelength = detectWavelength(name); + + // Extract part number from name (first word-group before "Compatible" or vendor name) + const partNumber = name.split(/\s+(?:compatible|for\s+[A-Z]|sfp|qsfp)/i)[0]?.trim().slice(0, 80) || name.slice(0, 60); + + const txId = await findOrCreateScrapedTransceiver({ + partNumber, + vendorId, + formFactor, + speedGbps, + speed, + reachMeters: reach?.meters, + reachLabel: reach?.label, + fiberType, + wavelengths: wavelength, + category: "DataCenter", + }); + + // Price observation + if (price && price > 0) { + const hash = contentHash({ price, part: partNumber }); + const isNew = await 
upsertPriceObservation({ + transceiverId: txId, + sourceVendorId: vendorId, + price, + currency: "USD", + stockLevel: stock?.qty !== undefined && stock.qty > 0 ? "in_stock" : + stock?.confidence === 1 ? "in_stock" : "unknown", + url, + contentHash: hash, + }); + if (isNew) priceUpdates++; + } + + // Stock observation + if (stock !== null) { + const stockLevel = stock.qty !== undefined ? (stock.qty > 0 ? "in_stock" : "out_of_stock") : "in_stock"; + const isNew = await upsertStockObservation({ + transceiverId: txId, + sourceVendorId: vendorId, + stockLevel, + quantityAvailable: stock.qty !== undefined && stock.qty > 0 ? stock.qty : undefined, + productUrl: url, + stockConfidence: stock.confidence, + priceCurrency: "USD", + priceIncludesTax: false, + }); + if (isNew) stockWritten++; + else stockSkipped++; + } + + processed++; + if (processed % 50 === 0) { + console.log(` Progress: ${processed}/${urls.length} | prices: ${priceUpdates} | stock: ${stockWritten} new / ${stockSkipped} unchanged`); + } + } catch (err) { + errors++; + if (errors <= 10) console.warn(` Error for ${url}: ${(err as Error).message.slice(0, 70)}`); + } + } + + console.log("\n=== NADDOD Scraper v2 Complete ==="); + console.log(` Products processed: ${processed}`); + console.log(` Non-transceivers skip: ${skippedNonTx}`); + console.log(` Price observations: ${priceUpdates} new`); + console.log(` Stock observations: ${stockWritten} new, ${stockSkipped} unchanged`); + if (errors > 0) console.warn(` Errors: ${errors}`); } if (require.main === module) { diff --git a/packages/scraper/src/scrapers/qsfptek.ts b/packages/scraper/src/scrapers/qsfptek.ts index 513f63d..68e9fe6 100644 --- a/packages/scraper/src/scrapers/qsfptek.ts +++ b/packages/scraper/src/scrapers/qsfptek.ts @@ -8,10 +8,15 @@ * API: GET /mall/commodity/list?categoryId=21&attributes=VALUE_ID&page=N&pageSize=30 * Returns HTML fragment with product cards. 
*
+ * Phase 1: Collect product list + prices via API (plain HTTP, no JS needed)
+ * Phase 2: Fetch product detail pages to extract real-time stock count
+ *          Format: "5507 in real-time stock, 17 Apr, 2026"
+ *          Confidence: 2 (aggregated global count with vendor timestamp)
+ *
  * Rate limited: 1 req/2sec.
  */
 
 import * as cheerio from "cheerio";
-import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
+import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, upsertStockObservation } from "../utils/db";
 import { contentHash } from "../utils/hash";
 
 const BASE = "https://www.qsfptek.com";
@@ -26,6 +31,10 @@ const HEADERS = {
 
 const MAX_PAGES = 40;
 const PAGE_SIZE = 30;
+// Limit detail-page fetches per run to avoid overwhelming the server
+// (~500 products × 2s = ~17min for a full refresh; subsequent runs are faster
+// since unchanged stock counts are skipped by upsertStockObservation)
+const MAX_DETAIL_PAGES = 500;
 
 // Data rate attribute values (found in /mall/commodity/attribute?categoryId=21)
 // pid = "2c9180837bbaf08f017bbdd1ebf7001e" (Data Rate attribute group)
@@ -59,6 +68,11 @@ interface Product {
   wavelength?: string;
 }
 
+interface StockDetail {
+  qty: number;
+  vendorTs: Date | null;
+}
+
 function sleep(ms: number): Promise<void> {
   return new Promise((resolve) => setTimeout(resolve, ms));
 }
@@ -97,6 +111,40 @@ function detectWavelength(text: string): string {
   return m ? m[1] : "";
 }
 
+/**
+ * Parse QSFPTEK real-time stock text.
+ * Format: "5507 in real-time stock, 17 Apr, 2026" (the date part is optional).
+ * Returns { qty, vendorTs } or null if not found; vendorTs is midnight UTC of the vendor date.
+ */
+function parseStockDetail(html: string): StockDetail | null {
+  // Match: "<number> in real-time stock, <date>"
+  const m = html.match(/(\d[\d,]*)\s+in\s+real-?time\s+stock[,\s]+(\d{1,2}\s+\w+,?\s*\d{4})/i);
+  if (!m) {
+    // Also try: "<number> in stock" without timestamp. "[\d,]*" (not "+") lets single-digit
+    // counts match; the leading \b keeps part-number suffixes ("LR4 in stock") from being read as 4.
+    const simple = html.match(/\b(\d[\d,]*)\s+in\s+(?:real-?time\s+)?stock\b/i);
+    if (!simple) return null;
+    const qty = parseInt(simple[1].replace(/,/g, ""), 10);
+    return isNaN(qty) || qty < 0 ? null : { qty, vendorTs: null };
+  }
+
+  const qty = parseInt(m[1].replace(/,/g, ""), 10);
+  if (isNaN(qty) || qty < 0) return null;
+
+  // Parse vendor timestamp: "17 Apr, 2026" → midnight UTC of that day. Date.UTC replaces
+  // new Date(string), which reads non-ISO text in the host's local time zone and therefore
+  // shifts the stored instant (and possibly the calendar day) from machine to machine.
+  const MONTHS = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"];
+  const dm = m[2].match(/^(\d{1,2})\s+([A-Za-z]{3})[A-Za-z]*,?\s*(\d{4})$/);
+  const month = dm ? MONTHS.indexOf(dm[2].toLowerCase()) : -1;
+  if (!dm || month < 0) return { qty, vendorTs: null };
+
+  const day = parseInt(dm[1], 10);
+  const d = new Date(Date.UTC(parseInt(dm[3], 10), month, day));
+  // Date.UTC rolls impossible days ("31 Feb") into the next month; keep only dates that round-trip.
+  return { qty, vendorTs: d.getUTCDate() === day ? d : null };
+}
+
 function parseProductFragment(html: string, attr: typeof DATA_RATE_ATTRIBUTES[number]): Product[] {
   const $ = cheerio.load(html);
   const products: Product[] = [];
@@ -145,8 +193,16 @@ async function fetchProductList(attrId: string, page: number): Promise<string> {
   return resp.text();
 }
 
+async function fetchDetailPage(url: string): Promise<string> {
+  // Normalise URL: ensure /en/ prefix for detail pages
+  const normalized = url.includes("/en/product/") ? url : url.replace("/product/", "/en/product/");
+  const resp = await fetch(normalized, { headers: HEADERS, signal: AbortSignal.timeout(30000) });
+  if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${normalized}`);
+  return resp.text();
+}
+
 export async function scrapeQsfptek(): Promise<void> {
-  console.log("=== QSFPTEK Scraper Starting (API mode) ===\n");
+  console.log("=== QSFPTEK Scraper v2 Starting (API + stock detail mode) ===\n");
 
   const vendorId = await ensureVendor(
     "QSFPTEK",
@@ -155,13 +211,12 @@ export async function scrapeQsfptek(): Promise<void> {
     "https://www.qsfptek.com/c/fiber-optic-transceiver.html",
   );
 
-  let totalProducts = 0;
-  let priceUpdates = 0;
+  // ── Phase 1: Collect all products via API listing ─────────────────────────
+  console.log("[Phase 1] Collecting product catalog...");
+  const allProducts = new Map<string, Product>(); // url → product
 
   for (const attr of DATA_RATE_ATTRIBUTES) {
-    console.log(`\n--- ${attr.formFactor} (${attr.speed}) ---`);
-    const allProducts: Product[] = [];
-    const allSeen = new Set<string>();
+    console.log(`\n ${attr.formFactor} (${attr.speed})`);
 
     for (let page = 1; page <= MAX_PAGES; page++) {
       try {
@@ -169,70 +224,128 @@ export async function scrapeQsfptek(): Promise<void> {
         const pageProds = parseProductFragment(html, attr);
 
         if (pageProds.length === 0) {
-          if (page === 1) console.log(" No products on page 1 — skipping");
-          else console.log(` Page ${page}: empty, stopping`);
+          if (page === 1) console.log(" No products on page 1 — skipping");
+          else console.log(` Page ${page}: empty, stopping`);
           break;
         }
 
-        // Dedupe against already seen (API may repeat items)
-        const newProds = pageProds.filter(p => {
-          if (allSeen.has(p.url)) return false;
-          allSeen.add(p.url);
-          return true;
-        });
-        allProducts.push(...newProds);
-        console.log(` Page ${page}: ${pageProds.length} total, ${newProds.length} new (${allProducts.length} running)`);
+        let newCount = 0;
+        for (const p of pageProds) {
+          if (!allProducts.has(p.url)) {
+            allProducts.set(p.url, p);
+            newCount++;
+          }
+        }
+        console.log(` Page ${page}: ${pageProds.length} results, ${newCount} new (${allProducts.size} total)`);
 
-        // If we got fewer than pageSize, we're on the last page
         if (pageProds.length < PAGE_SIZE) break;
-
         if (page < MAX_PAGES) await sleep(2000);
       } catch (err) {
-        console.warn(` Page ${page} failed: ${(err as Error).message.slice(0, 80)}`);
+        console.warn(` Page ${page} failed: ${(err as Error).message.slice(0, 80)}`);
         break;
       }
     }
 
-    console.log(` Unique products: ${allProducts.length}`);
-
-    for (const product of allProducts) {
-      try {
-        const txId = await findOrCreateScrapedTransceiver({
-          partNumber: product.partNumber,
-          vendorId,
-          formFactor: product.formFactor,
-          speedGbps: product.speedGbps,
-          speed: product.speed,
-          reachMeters: product.reachMeters,
-          reachLabel: product.reachLabel,
-          fiberType: product.fiberType,
-          wavelengths: product.wavelength,
-          category: "DataCenter",
-        });
-
-        if (product.price && product.price > 0) {
-          const hash = contentHash({ price: product.price, part: product.partNumber });
-          const updated = await upsertPriceObservation({
-            transceiverId: txId,
-            sourceVendorId: vendorId,
-            price: product.price,
-            currency: "USD",
-            stockLevel: "in_stock",
-            url: product.url,
-            contentHash: hash,
-          });
-          if (updated) priceUpdates++;
-        }
-        totalProducts++;
-      } catch (err) {
-        console.warn(` DB error: ${(err as Error).message.slice(0, 80)}`);
-      }
-    }
-
     await sleep(2000);
   }
 
-  console.log(`\n=== QSFPTEK Complete: ${totalProducts} products, ${priceUpdates} price updates ===`);
+  console.log(`\n[Phase 1] Complete — ${allProducts.size} unique products collected`);
+
+  // ── Phase 2: Write to DB + fetch detail pages for stock counts ─────────────
+  console.log("\n[Phase 2] Writing prices + fetching real-time stock counts...");
+
+  let totalProducts = 0;
+  let priceUpdates = 0;
+  let stockWritten = 0;
+  let stockSkipped = 0;
+  let detailFetched = 0;
+  let detailAttempts = 0;
+  let errors = 0;
+
+  // Every collected product gets its listing price written. MAX_DETAIL_PAGES caps only
+  // the slow, rate-limited detail-page fetches, so a catalog larger than the cap does
+  // not silently lose price observations for the products beyond it.
+  const products = [...allProducts.values()];
+
+  for (const product of products) {
+    try {
+      const txId = await findOrCreateScrapedTransceiver({
+        partNumber: product.partNumber,
+        vendorId,
+        formFactor: product.formFactor,
+        speedGbps: product.speedGbps,
+        speed: product.speed,
+        reachMeters: product.reachMeters,
+        reachLabel: product.reachLabel,
+        fiberType: product.fiberType,
+        wavelengths: product.wavelength,
+        category: "DataCenter",
+      });
+
+      // Price observation from listing page
+      if (product.price && product.price > 0) {
+        const hash = contentHash({ price: product.price, part: product.partNumber });
+        const updated = await upsertPriceObservation({
+          transceiverId: txId,
+          sourceVendorId: vendorId,
+          price: product.price,
+          currency: "USD",
+          stockLevel: "in_stock",
+          url: product.url,
+          contentHash: hash,
+        });
+        if (updated) priceUpdates++;
+      }
+
+      // Fetch detail page for real-time stock count (at most MAX_DETAIL_PAGES attempts per run)
+      if (detailAttempts < MAX_DETAIL_PAGES) {
+        detailAttempts++;
+        await sleep(2000);
+        try {
+          const detailHtml = await fetchDetailPage(product.url);
+          detailFetched++;
+          const stockInfo = parseStockDetail(detailHtml);
+
+          if (stockInfo !== null) {
+            const isNew = await upsertStockObservation({
+              transceiverId: txId,
+              sourceVendorId: vendorId,
+              stockLevel: stockInfo.qty > 0 ? "in_stock" : "out_of_stock",
+              // Pass 0 through unchanged: upsertStockObservation returns early when every
+              // quantity is undefined, which would drop each sold-out reading unrecorded.
+              quantityAvailable: stockInfo.qty,
+              productUrl: product.url,
+              // Quality metadata: QSFPTEK provides aggregated global count + timestamp
+              stockConfidence: 2,
+              priceCurrency: "USD",
+              priceIncludesTax: false,
+              stockVendorTs: stockInfo.vendorTs,
+            });
+            if (isNew) stockWritten++;
+            else stockSkipped++;
+          }
+        } catch (detailErr) {
+          // Detail page failures are non-fatal — we still have price data
+          console.warn(` Stock fetch failed for ${product.partNumber}: ${(detailErr as Error).message.slice(0, 60)}`);
+        }
+      }
+
+      totalProducts++;
+      if (totalProducts % 50 === 0) {
+        console.log(` Progress: ${totalProducts}/${products.length} products | ${priceUpdates} prices | ${stockWritten} stock obs`);
+      }
+    } catch (err) {
+      console.warn(` DB error for ${product.partNumber}: ${(err as Error).message.slice(0, 80)}`);
+      errors++;
+    }
+  }
+
+  console.log("\n=== QSFPTEK Scraper v2 Complete ===");
+  console.log(` Products processed: ${totalProducts}`);
+  console.log(` Price observations: ${priceUpdates} new`);
+  console.log(` Detail pages fetched: ${detailFetched}`);
+  console.log(` Stock observations: ${stockWritten} new, ${stockSkipped} unchanged`);
+  if (errors > 0) console.warn(` Errors: ${errors}`);
 }
 
 if (require.main === module) {
diff --git a/packages/scraper/src/utils/db.ts b/packages/scraper/src/utils/db.ts
index fe1cbee..b113913 100644
--- a/packages/scraper/src/utils/db.ts
+++ b/packages/scraper/src/utils/db.ts
@@ -165,9 +165,16 @@ export async function upsertPriceObservation(params: {
 }
 
 /**
- * Upsert a stock observation with full warehouse breakdown (FS.com v2).
+ * Upsert a stock observation with full warehouse breakdown (FS.com v2+).
  * Writes to stock_observations including DE-Lager, Global-Lager, Nachlieferung,
- * units_sold, compatible_brands, price_net, and product_url columns.
+ * units_sold, compatible_brands, price_net, product_url, plus quality metadata:
+ * stock_confidence, price_currency, price_includes_tax, stock_vendor_ts.
+ *
+ * stock_confidence:
+ *   1 = boolean only (in_stock true/false, no unit count)
+ *   2 = aggregated global count (single number, e.g. QSFPTEK "5507 in real-time stock")
+ *   3 = per-warehouse breakdown (e.g. FS.com DE-Lager + Global-Lager split)
+ *
  * Returns true only when the data has changed since the last observation.
  */
 export async function upsertStockObservation(params: {
@@ -185,19 +192,26 @@ export async function upsertStockObservation(params: {
   compatibleBrands?: string[];
   priceNet?: number;
   productUrl?: string;
+  // Quality metadata (migration 038)
+  stockConfidence?: 1 | 2 | 3;
+  priceCurrency?: string;
+  priceIncludesTax?: boolean;
+  stockVendorTs?: Date | null;
 }): Promise<boolean> {
   // Skip if there is genuinely no warehouse data at all
+  // (includes backorderQty so products available only on backorder are recorded)
   if (
-    params.warehouseDeQty === undefined &&
+    params.warehouseDeQty === undefined &&
     params.warehouseGlobalQty === undefined &&
-    params.quantityAvailable === undefined
+    params.quantityAvailable === undefined &&
+    params.backorderQty === undefined
   ) {
     return false;
   }
 
   // Compare against the last observation to avoid duplicate writes
   const lastObs = await pool.query(
-    `SELECT warehouse_de_qty, warehouse_global_qty, backorder_qty, units_sold
+    `SELECT warehouse_de_qty, warehouse_global_qty, backorder_qty, units_sold, quantity_available
      FROM stock_observations
      WHERE transceiver_id = $1 AND source_vendor_id = $2
      ORDER BY time DESC LIMIT 1`,
@@ -210,12 +224,15 @@ export async function upsertStockObservation(params: {
       (r.warehouse_de_qty ?? null) === (params.warehouseDeQty ?? null) &&
       (r.warehouse_global_qty ?? null) === (params.warehouseGlobalQty ?? null) &&
       (r.backorder_qty ?? null) === (params.backorderQty ?? null) &&
-      (r.units_sold ?? null) === (params.unitsSold ?? null);
+      // units_sold is BIGINT, which node-postgres hands back as a string unless a custom type
+      // parser is installed; normalise it, or a supplied unitsSold never compares equal.
+      (r.units_sold == null ? null : Number(r.units_sold)) === (params.unitsSold ?? null) &&
+      (r.quantity_available ?? null) === (params.quantityAvailable ?? null);
     if (unchanged) return false;
   }
 
   const inStock =
-    ((params.warehouseDeQty ?? 0) + (params.warehouseGlobalQty ?? 0)) > 0;
+    ((params.warehouseDeQty ?? 0) + (params.warehouseGlobalQty ?? 0) + (params.quantityAvailable ?? 0)) > 0;
 
   await pool.query(
     `INSERT INTO stock_observations (
@@ -224,14 +241,16 @@ export async function upsertStockObservation(params: {
       warehouse_de_qty, warehouse_de_delivery_date,
       warehouse_global_qty, warehouse_global_delivery_date,
       backorder_qty, backorder_estimated_date,
-      units_sold, compatible_brands, price_net, product_url
+      units_sold, compatible_brands, price_net, product_url,
+      stock_confidence, price_currency, price_includes_tax, stock_vendor_ts
     ) VALUES (
       NOW(), $1, $2,
       $3, $4,
       $5, $6::date,
       $7, $8::date,
       $9, $10::date,
-      $11, $12, $13, $14
+      $11, $12, $13, $14,
+      $15, $16, $17, $18
     )`,
     [
       params.transceiverId,
@@ -248,6 +267,10 @@ export async function upsertStockObservation(params: {
       params.compatibleBrands?.length ? params.compatibleBrands : null,
       params.priceNet ?? null,
       params.productUrl ?? null,
+      params.stockConfidence ?? 1,
+      params.priceCurrency ?? null,
+      params.priceIncludesTax ?? null,
+      params.stockVendorTs ?? null,
     ]
   );
 
diff --git a/sql/028-stock-observations-warehouse-columns.sql b/sql/028-stock-observations-warehouse-columns.sql
new file mode 100644
index 0000000..f4f74db
--- /dev/null
+++ b/sql/028-stock-observations-warehouse-columns.sql
@@ -0,0 +1,17 @@
+-- Migration 028: Extend stock_observations with full warehouse breakdown
+-- Adds FS.com warehouse columns: DE-Lager, Global-Lager, Nachlieferung,
+-- units_sold, compatible_brands, price_net, product_url.
+-- NOTE: This migration was applied directly on Erik before being committed to the repo.
+-- All ADD COLUMN statements use IF NOT EXISTS guards for safe re-application.
+
+ALTER TABLE stock_observations
+  ADD COLUMN IF NOT EXISTS warehouse_de_qty INTEGER,
+  ADD COLUMN IF NOT EXISTS warehouse_de_delivery_date DATE,
+  ADD COLUMN IF NOT EXISTS warehouse_global_qty INTEGER,
+  ADD COLUMN IF NOT EXISTS warehouse_global_delivery_date DATE,
+  ADD COLUMN IF NOT EXISTS backorder_qty INTEGER,
+  ADD COLUMN IF NOT EXISTS backorder_estimated_date DATE,
+  ADD COLUMN IF NOT EXISTS units_sold BIGINT,
+  ADD COLUMN IF NOT EXISTS compatible_brands TEXT[],
+  ADD COLUMN IF NOT EXISTS price_net NUMERIC(12,4),
+  ADD COLUMN IF NOT EXISTS product_url TEXT;
diff --git a/sql/037-stock-observations-indexes.sql b/sql/037-stock-observations-indexes.sql
new file mode 100644
index 0000000..6ca6e02
--- /dev/null
+++ b/sql/037-stock-observations-indexes.sql
@@ -0,0 +1,12 @@
+-- Migration 037: Add composite indexes for stock_observations
+-- Optimises DISTINCT ON (transceiver_id, source_vendor_id) queries used
+-- by GET /api/stock and GET /api/stock/summary.
+
+-- Primary lookup index for "latest per (transceiver, vendor)" queries
+CREATE INDEX IF NOT EXISTS idx_stock_tx_vendor_time
+  ON stock_observations (transceiver_id, source_vendor_id, time DESC);
+
+-- Partial index for "in-stock only" filter (used by /api/stock?in_stock=true)
+CREATE INDEX IF NOT EXISTS idx_stock_in_stock
+  ON stock_observations (in_stock, time DESC)
+  WHERE in_stock = true;
diff --git a/sql/038-stock-observations-quality-columns.sql b/sql/038-stock-observations-quality-columns.sql
new file mode 100644
index 0000000..4f852cf
--- /dev/null
+++ b/sql/038-stock-observations-quality-columns.sql
@@ -0,0 +1,39 @@
+-- Migration 038: Add data-quality columns to stock_observations + selective cleanup
+--
+-- stock_confidence:   1 = boolean only (in_stock true/false)
+--                     2 = aggregated global count (single number, e.g. QSFPTEK)
+--                     3 = per-warehouse breakdown (e.g. FS.com DE/Global split)
+-- price_currency:     ISO 4217 code, e.g. 'USD', 'EUR', 'GBP'
+-- price_includes_tax: true = gross price, false = net/excl. VAT
+-- stock_vendor_ts:    timestamp as reported by vendor (e.g. QSFPTEK "17 Apr 2026")
+
+-- ── Selective cleanup ───────────────────────────────────────────────────────
+-- Truncate stock_observations: all 186 rows are from the first FS.com test run.
+-- They will be repopulated automatically at the next scheduled scraper run
+-- (02:00 / 10:00 / 18:00 via launchd). Transceiver catalog, specs, vendors,
+-- price_observations, and all other tables are left untouched.
+--
+-- The TRUNCATE runs only while stock_confidence does not exist yet, i.e. the first
+-- time this file is applied, and therefore has to come BEFORE the ALTER TABLE.
+-- Re-applying the migration (which the IF NOT EXISTS guards are written to allow)
+-- is then a no-op instead of wiping the observations collected since.
+DO $$
+BEGIN
+    IF NOT EXISTS (
+        SELECT 1
+        FROM pg_attribute
+        WHERE attrelid = 'stock_observations'::regclass
+          AND attname = 'stock_confidence'
+          AND NOT attisdropped
+    ) THEN
+        TRUNCATE TABLE stock_observations;
+    END IF;
+END
+$$;
+
+ALTER TABLE stock_observations
+    ADD COLUMN IF NOT EXISTS stock_confidence SMALLINT DEFAULT 1
+        CHECK (stock_confidence IN (1, 2, 3)),
+    ADD COLUMN IF NOT EXISTS price_currency CHAR(3),
+    ADD COLUMN IF NOT EXISTS price_includes_tax BOOLEAN,
+    ADD COLUMN IF NOT EXISTS stock_vendor_ts TIMESTAMPTZ;