/**
 * FS.com Scraper v2 — Warehouse Data, Prices, Product Catalog
 *
 * Phase 1: Category listing pages → collect product URLs (paginated)
 * Phase 2: Product detail pages → extract warehouse breakdown, net price, specs
 * Phase 3: Write to PostgreSQL (price_observations + stock_observations)
 *
 * Uses German locale (www.fs.com/de/) for EUR prices and German warehouse labels:
 *   DE-Lager      → warehouse_de_qty + warehouse_de_delivery_date
 *   Global-Lager  → warehouse_global_qty + warehouse_global_delivery_date
 *   Nachlieferung → backorder_qty + backorder_estimated_date
 *   verkauft      → units_sold
 *
 * Rate limited to ≤12 req/min (listing) and ≤10 req/min (detail).
 * NOTE(review): the original header also claimed robots.txt compliance; this file
 * only fetches robots.txt as a connectivity probe and never parses it — confirm.
 */
import { rmSync } from "node:fs";
import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";
import type { Page } from "playwright";
import { makeCrawleeConfig, crawleeStorageDir } from "../utils/crawlee-config";
import {
  ensureVendor,
  upsertPriceObservation,
  upsertStockObservation,
  findOrCreateScrapedTransceiver,
  pool,
} from "../utils/db";
import { contentHash } from "../utils/hash";
import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";

/**
 * Install init-script patches that hide the most common headless/automation
 * signals (webdriver flag, empty plugin list, missing `window.chrome`, …).
 * The script runs in the browser before any page script on every navigation.
 */
async function applyStealthPatches(page: Page): Promise<void> {
  await page.addInitScript(() => {
    // Remove webdriver flag
    Object.defineProperty(navigator, "webdriver", { get: () => undefined });

    // Mimic Chrome plugins (empty list = detected as bot)
    Object.defineProperty(navigator, "plugins", {
      get: () => ({
        0: {
          name: "Chrome PDF Plugin",
          filename: "internal-pdf-viewer",
          description: "Portable Document Format",
          length: 1,
        },
        length: 1,
        namedItem: (n: string) => null,
        refresh: () => {},
        item: (i: number) => null,
        [Symbol.iterator]: function* () {
          yield (this as any)[0];
        },
      }),
    });

    // Real Chrome languages
    Object.defineProperty(navigator, "languages", {
      get: () => ["de-DE", "de", "en-US", "en"],
    });

    // Non-zero hardware concurrency
    Object.defineProperty(navigator, "hardwareConcurrency", { get: () => 8 });

    // Permissions API: answer "notifications" from Notification.permission,
    // delegate everything else to the native implementation
    const originalQuery = window.navigator.permissions?.query;
    if (originalQuery) {
      (window.navigator.permissions as any).query = (params: any) =>
        params.name === "notifications"
          ? Promise.resolve({ state: Notification.permission } as PermissionStatus)
          : originalQuery.call(navigator.permissions, params);
    }

    // Chrome object (headless detection)
    (window as any).chrome = { runtime: {}, loadTimes: () => {}, csi: () => {}, app: {} };

    // Hide automation-specific properties
    delete (window as any).__playwright;
    delete (window as any).__pwInitScripts;
  });
}

// ── Constants ──────────────────────────────────────────────────────────────────

// FS.com German locale uses www.fs.com/de/ (de.fs.com DNS does not exist)
const BASE_URL = "https://www.fs.com/de";
const MAX_PAGES_PER_CATEGORY = 10;
const MAX_DETAIL_PAGES_PER_RUN = 300;
const STOCK_FRESH_HOURS = 12;

const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
  .split(",")
  .map((u) => u.trim())
  .filter(Boolean);

/** Build a proxy rotation from PROXY_URLS, or `undefined` when none are configured. */
function buildProxyConfiguration(): ProxyConfiguration | undefined {
  if (PROXY_URLS.length === 0) return undefined;
  return new ProxyConfiguration({ proxyUrls: PROXY_URLS });
}

const CATEGORY_URLS = [
  "/c/1g-sfp-81",
  "/c/10g-sfp-63",
  "/c/25g-sfp28-3215",
  "/c/40g-qsfp-1360",
  "/c/100g-qsfp28-sfp-dd-1159",
  "/c/200g-qsfp-dd-qsfp56-3542",
  "/c/400g-osfp-qsfp112-qsfp-dd-3652",
  "/c/800g-osfp-qsfp-dd-4089",
  "/c/1.6t-osfp-5597",
  "/c/400g-coherent-qsfp-dd-4103",
  "/c/10g-cwdm-dwdm-sfp-65",
  "/c/100g-dwdm-qsfp28-3863",
];

const DE_COOKIES = [
  { name: "currency", value: "EUR", domain: ".fs.com", path: "/" },
  { name: "lang", value: "de", domain: ".fs.com", path: "/" },
  { name: "country", value: "DE", domain: ".fs.com", path: "/" },
];

/** Browser launch settings shared by the listing and detail crawlers. */
const LAUNCH_CONTEXT = {
  launchOptions: {
    args: ["--disable-blink-features=AutomationControlled", "--lang=de-DE"],
  },
};

/** Pre-navigation setup shared by both crawlers: stealth patches, DE headers, DE cookies. */
async function prepareGermanPage(page: Page): Promise<void> {
  await applyStealthPatches(page);
  await page.setExtraHTTPHeaders({ "Accept-Language": "de-DE,de;q=0.9" });
  await page.context().addCookies(DE_COOKIES.map((c) => ({ ...c, domain: "www.fs.com" })));
}

/** Best-effort removal of a crawler's storage dir so each run starts with an empty queue. */
function purgeCrawleeStorage(name: string): void {
  try {
    rmSync(crawleeStorageDir(name), { recursive: true, force: true });
  } catch {
    /* ignore — a stale queue is not fatal */
  }
}

// ── German locale parsers ──────────────────────────────────────────────────────

// Keys are the first three letters of the lower-cased month with umlauts folded
// (see parseGermanDate); both German and English abbreviations are accepted.
const GERMAN_MONTHS: Record<string, string> = {
  jan: "01", feb: "02", mär: "03", mar: "03", apr: "04",
  mai: "05", may: "05", jun: "06", jul: "07", aug: "08",
  sep: "09", okt: "10", oct: "10", nov: "11", dez: "12", dec: "12",
};

/**
 * Parse German-formatted quantity string.
 *   "4.895"  → 4895     (period = thousands separator in German)
 *   "210.9K" → 210900
 *   "1.2M"   → 1200000
 *   "1,5K"   → 1500     (decimal comma)
 *
 * @returns the integer quantity, or `undefined` when no number can be read.
 */
function parseGermanQty(text: string): number | undefined {
  const compact = text.replace(/\s/g, "");
  if (!compact) return undefined;

  const abbreviated = compact.match(/^([\d.,]+)([KkMm])$/);
  if (abbreviated) {
    const digits = abbreviated[1] ?? "";
    const multiplier = (abbreviated[2] ?? "").toLowerCase() === "k" ? 1_000 : 1_000_000;
    // With a K/M suffix the separator is a decimal point, not a thousands group.
    // A comma, when present, is the German decimal mark and any dots are grouping.
    const decimal = digits.includes(",")
      ? digits.replace(/\./g, "").replace(",", ".")
      : digits;
    const value = parseFloat(decimal);
    return Number.isNaN(value) ? undefined : Math.round(value * multiplier);
  }

  const whole = parseInt(compact.replace(/[.,]/g, ""), 10);
  return Number.isNaN(whole) ? undefined : whole;
}

/** True when day/month strings denote a calendar-plausible day (1–31) and month (1–12). */
function isPlausibleDayMonth(day: string, month: string): boolean {
  const d = Number(day);
  const m = Number(month);
  return d >= 1 && d <= 31 && m >= 1 && m <= 12;
}

/**
 * Parse German date to ISO "YYYY-MM-DD".
 *   "20 Apr., 2026" → "2026-04-20"
 *   "20.04.2026"    → "2026-04-20"
 *
 * @returns the ISO date, or `undefined` when the text holds no recognisable or
 *          plausible date (out-of-range day/month values are rejected so they
 *          never reach the database).
 */
function parseGermanDate(text: string): string | undefined {
  const numeric = text.match(/(\d{1,2})\.(\d{1,2})\.(\d{4})/);
  if (numeric) {
    const [, d = "", m = "", y = ""] = numeric;
    if (!isPlausibleDayMonth(d, m)) return undefined;
    return `${y}-${m.padStart(2, "0")}-${d.padStart(2, "0")}`;
  }

  const worded = text.match(/(\d{1,2})\.?\s+([A-Za-zÄÖÜäöüß]+)\.?,?\s*(\d{4})/);
  if (!worded) return undefined;
  const [, dayRaw = "", monthWord = "", year = ""] = worded;

  // Fold umlauts so "März" and "Mar" hit the same table key
  const monthKey = monthWord
    .toLowerCase()
    .replace(/ä/g, "a")
    .replace(/ö/g, "o")
    .replace(/ü/g, "u")
    .slice(0, 3);
  const month = GERMAN_MONTHS[monthKey];
  if (!month || !isPlausibleDayMonth(dayRaw, month)) return undefined;
  return `${year}-${month}-${dayRaw.padStart(2, "0")}`;
}

/**
 * Parse German price to EUR float.
 *   "42,50"    → 42.50
 *   "1.063,02" → 1063.02
 *   "1.063"    → 1063     (dot-grouped thousands)
 *   "1,063.02" → 1063.02  (English grouping, last separator is the decimal)
 *
 * @returns a positive number, or `undefined` for empty/unparseable/non-positive input.
 */
function parseGermanPrice(raw: string): number | undefined {
  const cleaned = raw.replace(/[^0-9.,]/g, "");
  if (!cleaned) return undefined;

  const lastComma = cleaned.lastIndexOf(",");
  const lastDot = cleaned.lastIndexOf(".");

  let normalized: string;
  if (lastComma !== -1 && lastDot !== -1) {
    // Both separators present: whichever comes last is the decimal mark
    normalized =
      lastComma > lastDot
        ? cleaned.replace(/\./g, "").replace(",", ".")
        : cleaned.replace(/,/g, "");
  } else if (lastComma !== -1) {
    normalized = cleaned.replace(",", ".");
  } else if (/^[1-9]\d{0,2}(?:\.\d{3})+$/.test(cleaned)) {
    // Dots only, in strict groups of three → German thousands grouping
    normalized = cleaned.replace(/\./g, "");
  } else {
    normalized = cleaned;
  }

  const n = parseFloat(normalized);
  return Number.isNaN(n) || n <= 0 ? undefined : n;
}

// ── Stock level helper ─────────────────────────────────────────────────────────

/**
 * Collapse warehouse quantities into a coarse stock level: more than 100 on hand
 * is "in_stock", 1–100 is "low_stock", nothing on hand but a backorder is
 * "on_request", otherwise "out_of_stock".
 */
function deriveStockLevel(
  deQty?: number,
  globalQty?: number,
  backorderQty?: number
): "in_stock" | "low_stock" | "out_of_stock" | "on_request" {
  const onHand = (deQty ?? 0) + (globalQty ?? 0);
  if (onHand > 100) return "in_stock";
  if (onHand > 0) return "low_stock";
  if ((backorderQty ?? 0) > 0) return "on_request";
  return "out_of_stock";
}

// ── Product classification ─────────────────────────────────────────────────────

/** Guess the transceiver form factor from free text; most specific names are tested first. */
function detectFormFactor(text: string): string | undefined {
  const l = text.toLowerCase();
  if (l.includes("osfp") && !l.includes("qsfp")) return "OSFP";
  if (l.includes("qsfp-dd800") || l.includes("qsfp-dd 800")) return "QSFP-DD800";
  if (l.includes("qsfp-dd")) return "QSFP-DD";
  if (l.includes("qsfp56")) return "QSFP56";
  if (l.includes("qsfp28")) return "QSFP28";
  if (l.includes("qsfp+") || l.includes("qsfp plus")) return "QSFP+";
  if (l.includes("sfp56")) return "SFP56";
  if (l.includes("sfp28")) return "SFP28";
  if (l.includes("sfp+") || l.includes("sfp plus")) return "SFP+";
  if (l.includes("sfp") && !l.includes("qsfp")) return "SFP";
  if (l.includes("cfp2")) return "CFP2";
  if (l.includes("xfp")) return "XFP";
  return undefined;
}

// Ordered fastest-first so "400G" is not mistaken for "40G", "100G" for "10G", etc.
const SPEED_PATTERNS: ReadonlyArray<readonly [RegExp, string, number]> = [
  [/1\.6\s*t/i, "1.6T", 1600],
  [/800\s*g/i, "800G", 800],
  [/400\s*g/i, "400G", 400],
  [/200\s*g/i, "200G", 200],
  [/100\s*g/i, "100G", 100],
  [/50\s*g/i, "50G", 50],
  [/40\s*g/i, "40G", 40],
  [/25\s*g/i, "25G", 25],
  [/10\s*g/i, "10G", 10],
  [/\b1\s*g\b/i, "1G", 1],
];

/** Guess the line rate from free text; returns the label and its value in Gbit/s. */
function detectSpeed(text: string): { speed: string; speedGbps: number } | undefined {
  for (const [re, speed, speedGbps] of SPEED_PATTERNS) {
    if (re.test(text)) return { speed, speedGbps };
  }
  return undefined;
}

/** Extract the first "<n>m" / "<n>km" reach label from free text, e.g. "10km". */
function detectReach(text: string): string | undefined {
  const m = text.match(/(\d+)\s*(m|km)\b/i);
  return m?.[1] && m[2] ? `${m[1]}${m[2].toLowerCase()}` : undefined;
}

// ── Types ──────────────────────────────────────────────────────────────────────

interface ProductSummary {
  url: string;
  name: string;
  partNumber: string;
}

interface ProductDetail extends ProductSummary {
  priceNet?: number;
  deQty?: number;
  deDeliveryDate?: string;
  globalQty?: number;
  globalDeliveryDate?: string;
  backorderQty?: number;
  backorderDate?: string;
  unitsSold?: number;
  compatibleBrands: string[];
  specs: Record<string, string>;
  imageUrl?: string;
  datasheetUrl?: string;
}

// ── Phase 1: Collect product URLs ──────────────────────────────────────────────

/**
 * Visit all category pages (paginated) and return a Map of product URL → summary.
 * Pages are queued in round-robin order (all p1s, then all p2s, …) so an
 * exhausted category is detected before we waste further requests on it.
 */
async function collectProductUrls(
  proxyConfiguration: ProxyConfiguration | undefined
): Promise<Map<string, ProductSummary>> {
  // Purge leftover request queue from previous runs (instance-isolated storage)
  purgeCrawleeStorage("fs-phase1");

  const products = new Map<string, ProductSummary>();
  // Categories whose most recent page yielded no products; later pages are skipped
  const exhausted = new Set<string>();

  // Pre-queue: all page-1s, then all page-2s, …
  const listingRequests = Array.from({ length: MAX_PAGES_PER_CATEGORY }, (_, i) =>
    CATEGORY_URLS.map((cat) => ({
      url: i === 0 ? `${BASE_URL}${cat}` : `${BASE_URL}${cat}?page=${i + 1}`,
      userData: { catPath: cat, pageNum: i + 1 },
      uniqueKey: `listing-${cat}-p${i + 1}`,
    }))
  ).flat();

  const crawler = new PlaywrightCrawler(
    {
      maxConcurrency: 1,
      maxRequestsPerMinute: 12,
      requestHandlerTimeoutSecs: 60,
      headless: true,
      useSessionPool: false,
      ...(proxyConfiguration ? { proxyConfiguration } : {}),
      launchContext: LAUNCH_CONTEXT,
      preNavigationHooks: [async ({ page }) => prepareGermanPage(page)],
      async requestHandler({ page, request, log }) {
        const { catPath, pageNum } = request.userData as { catPath: string; pageNum: number };

        if (exhausted.has(catPath)) {
          log.debug(`[p1] Skip ${catPath} p${pageNum} — exhausted`);
          return;
        }

        // Wait for product links; if none appear, give late-loading content a last chance
        try {
          await page.waitForSelector('a[href*="/products/"]', { timeout: 12000 });
        } catch {
          await page.waitForTimeout(6000);
        }

        // Runs in the browser: must not reference anything from the Node scope
        const found = await page.evaluate(
          (): Array<{ url: string; name: string; partNumber: string }> => {
            const results: Array<{ url: string; name: string; partNumber: string }> = [];
            const seen = new Set<string>();
            for (const link of document.querySelectorAll('a[href*="/products/"]')) {
              const href = link.getAttribute("href") ?? "";
              if (!href.includes("/products/")) continue;
              const absUrl = href.startsWith("http") ? href : `https://www.fs.com${href}`;
              if (seen.has(absUrl)) continue;
              seen.add(absUrl);

              const img = link.querySelector("img");
              const name = (
                img?.getAttribute("alt") ??
                link.getAttribute("title") ??
                link.textContent ??
                ""
              )
                .trim()
                .replace(/\s+/g, " ")
                .slice(0, 200);

              // The numeric product id from the URL is the provisional part number
              const pnMatch = href.match(/\/products\/(\d+)\.html/);
              const partNumber = pnMatch ? `FS-${pnMatch[1]}` : "";
              if (name.length >= 5 && partNumber) results.push({ url: absUrl, name, partNumber });
            }
            return results;
          }
        );

        log.info(`[Listing] ${catPath} p${pageNum}: ${found.length} products`);

        if (found.length === 0) {
          exhausted.add(catPath);
          return;
        }
        for (const p of found) {
          if (!products.has(p.url)) products.set(p.url, p);
        }
      },
    },
    makeCrawleeConfig("fs-phase1")
  );

  await crawler.run(listingRequests);
  console.log(
    `[Phase 1] ${products.size} unique products across ${CATEGORY_URLS.length} categories`
  );
  return products;
}

// ── Phase 2: Scrape product detail pages ──────────────────────────────────────

// A price number that is not the tail of a longer number ("1.063,02" must not be
// read as "063,02"): either dot-grouped thousands with optional ",dd", or a plain
// run of digits with an optional 1–2 digit fraction.
const PRICE_NUM = String.raw`(?<![\d.,])(\d{1,3}(?:\.\d{3})+(?:,\d{1,2})?|\d{1,6}(?:[.,]\d{1,2})?)(?!\d)`;
// Stricter variant that requires the German ",dd" fraction.
const PRICE_NUM_CENTS = String.raw`(?<![\d.,])((?:\d{1,3}(?:\.\d{3})+|\d{1,6}),\d{2})(?!\d)`;

// Patterns that carry a net-price qualifier ("ohne MwSt", "netto", "exkl.") or a
// "Preis:" label. All are matched against the page's visible body text.
const PRICE_QUALIFIED: readonly RegExp[] = [
  new RegExp(String.raw`${PRICE_NUM}\s*€\s*\(?ohne\s+MwSt\.?\)?`, "i"),
  new RegExp(String.raw`€\s*${PRICE_NUM}\s*\(?ohne\s+MwSt\.?\)?`, "i"),
  new RegExp(String.raw`${PRICE_NUM}\s*€\s*netto`, "i"),
  new RegExp(String.raw`Netto[:\s]*€?\s*${PRICE_NUM}`, "i"),
  new RegExp(String.raw`Preis[:\s]*${PRICE_NUM_CENTS}\s*€`, "i"),
  new RegExp(String.raw`${PRICE_NUM_CENTS}\s*€\s*(?:exkl\.|exklusive|excl\.)`, "i"),
];

// Unqualified "<amount> €" / "€ <amount>" patterns, used only as a last resort.
const PRICE_FALLBACK: readonly RegExp[] = [
  new RegExp(String.raw`${PRICE_NUM_CENTS}\s*€`),
  new RegExp(String.raw`€\s*${PRICE_NUM_CENTS}`),
];

/**
 * Find the net EUR price in a detail page's body text.
 * Qualified patterns win; the broad fallback is only accepted above €100 to
 * avoid matching FS.com's €79 placeholder/template price.
 */
function extractNetPrice(text: string): number | undefined {
  for (const pattern of PRICE_QUALIFIED) {
    const captured = text.match(pattern)?.[1];
    if (!captured) continue;
    const price = parseGermanPrice(captured);
    if (price !== undefined && price > 0.5 && price < 500_000) return price;
  }
  for (const pattern of PRICE_FALLBACK) {
    const captured = text.match(pattern)?.[1];
    if (!captured) continue;
    const price = parseGermanPrice(captured);
    if (price !== undefined && price > 100 && price < 500_000) return price;
  }
  return undefined;
}

// Month words may contain umlauts ("März"), which \w alone does not match.
const WORDED_DATE = String.raw`(\d{1,2}\.?\s+[\wÄÖÜäöüß]+\.?,?\s*\d{4})`;
const DATE_AFTER_LIEFERUNG = new RegExp(String.raw`Lieferung[:\s]+${WORDED_DATE}`, "i");
const DATE_AFTER_ERWARTET = new RegExp(String.raw`erwartet[:\s]+${WORDED_DATE}`, "i");
const NUMERIC_DATE = /(\d{1,2}\.\d{1,2}\.\d{4})/;

const DE_QTY_PATTERNS: readonly RegExp[] = [
  /(\d[\d.,KkMm]*)\s*Stk\.\s*im\s*DE[- ]Lager/i,
  /(\d[\d.,KkMm]*)\s*im\s*DE[- ]?Lager/i,
];
const GLOBAL_QTY_PATTERNS: readonly RegExp[] = [
  /(\d[\d.,KkMm]*)\s*Stk\.\s*im\s*Global[- ]Lager/i,
  /(\d[\d.,KkMm]*)\s*im\s*Global[- ]?(?:Lager|Warehouse)/i,
  /(\d[\d.,KkMm]*)\s*in\s+Global\s+Warehouse/i,
];
const BACKORDER_QTY_PATTERNS: readonly RegExp[] = [
  /(\d[\d.,KkMm]*)\s*(?:Stk\.)?\s*in\s+Nachlieferung/i,
  /Nachlieferung[:\s]*(\d[\d.,KkMm]*)/i,
];

/**
 * Read one stock section (DE-Lager, Global-Lager or Nachlieferung) from body text.
 *
 * The first quantity pattern that matches anywhere in `text` is used. The date
 * is then searched in the 300 characters starting at that match: the labelled
 * patterns in `datePatterns` are tried in order, then any bare dd.mm.yyyy.
 */
function extractWarehouse(
  text: string,
  qtyPatterns: readonly RegExp[],
  datePatterns: readonly RegExp[]
): { qty?: number; date?: string } {
  for (const qtyPattern of qtyPatterns) {
    const hit = qtyPattern.exec(text);
    if (!hit?.[1]) continue;

    const context = text.slice(hit.index, hit.index + 300);
    let dateText: string | undefined;
    for (const datePattern of [...datePatterns, NUMERIC_DATE]) {
      dateText = context.match(datePattern)?.[1];
      if (dateText) break;
    }
    return {
      qty: parseGermanQty(hit[1]),
      date: dateText ? parseGermanDate(dateText) : undefined,
    };
  }
  return {};
}

/**
 * Resolve a scraped href/src against the page it came from.
 * Only http(s) results are kept; `data:` placeholders, `javascript:` links and
 * unparseable values yield `undefined`.
 */
function resolveUrl(candidate: string, pageUrl: string): string | undefined {
  if (!candidate) return undefined;
  try {
    const resolved = new URL(candidate, pageUrl);
    return resolved.protocol === "http:" || resolved.protocol === "https:"
      ? resolved.href
      : undefined;
  } catch {
    return undefined;
  }
}

/**
 * Crawl the given detail pages and return one ProductDetail per page that
 * produced body text. `userData` carries the listing name/part number, used
 * as fallbacks when the detail page offers nothing better.
 */
async function scrapeProductDetails(
  requests: Array<{ url: string; userData: { name: string; partNumber: string } }>,
  proxyConfiguration: ProxyConfiguration | undefined
): Promise<ProductDetail[]> {
  // Purge Phase 2 storage so it starts with a clean request queue
  purgeCrawleeStorage("fs-phase2");

  const details: ProductDetail[] = [];

  const crawler = new PlaywrightCrawler(
    {
      maxConcurrency: 1,
      maxRequestsPerMinute: 10,
      requestHandlerTimeoutSecs: 90,
      headless: true,
      useSessionPool: false,
      ...(proxyConfiguration ? { proxyConfiguration } : {}),
      launchContext: LAUNCH_CONTEXT,
      preNavigationHooks: [async ({ page }) => prepareGermanPage(page)],
      async requestHandler({ page, request, log }) {
        const { name: listingName, partNumber: listingPn } = request.userData as {
          name: string;
          partNumber: string;
        };
        const url = request.url;

        try {
          await page.waitForSelector(
            'h1, .product-detail, [class*="product-info"], [class*="product-main"]',
            { timeout: 12000 }
          );
        } catch {
          await page.waitForTimeout(7000);
        }

        // Runs in the browser: must not reference anything from the Node scope
        const raw = await page.evaluate(
          (): {
            bodyText: string;
            specs: Record<string, string>;
            brands: string[];
            imageUrl: string;
            datasheetUrl: string;
            h1: string;
          } => {
            const bodyText = (document.body?.innerText ?? "").replace(/\n{3,}/g, "\n\n");

            // Spec tables: first cell is the key, second the value
            const specs: Record<string, string> = {};
            const SEL = [
              ".product-param tr",
              ".product-specs tr",
              ".param-table tr",
              ".specifications tr",
              ".detail-param tr",
              ".prod-spec-list tr",
              '[class*="specification"] tr',
              '[class*="param"] tr',
              ".tab-content tr",
              ".product-info-table tr",
              ".tech-param tr",
            ].join(", ");
            document.querySelectorAll(SEL).forEach((row) => {
              const cells = row.querySelectorAll("td, th");
              if (cells.length >= 2) {
                const k = (cells[0]?.textContent ?? "").trim().replace(/\s+/g, " ");
                const v = (cells[1]?.textContent ?? "").trim().replace(/\s+/g, " ");
                if (k && v && k.length < 80 && !/^[-\s]+$/.test(k)) specs[k] = v;
              }
            });
            // Definition lists: <dt> key immediately followed by <dd> value
            document.querySelectorAll("dt").forEach((dt) => {
              const dd = dt.nextElementSibling;
              if (dd?.tagName === "DD") {
                const k = (dt.textContent ?? "").trim();
                const v = (dd.textContent ?? "").trim();
                if (k && v && k.length < 80) specs[k] = v;
              }
            });

            const brands: string[] = [];
            const brandContainer = document.querySelector(
              '[class*="compatible"], [class*="brand-list"], [class*="compatibility"], ' +
                '[class*="apply-brand"], [id*="brand"], [id*="compatible"]'
            );
            if (brandContainer) {
              brandContainer.querySelectorAll("button, a, span, li").forEach((el) => {
                const t = (el.textContent ?? "").trim();
                if (t && t.length > 1 && t.length < 50 && !/^\d+$/.test(t)) brands.push(t);
              });
            }
            if (brands.length === 0) {
              // No brand widget found: fall back to a "Kompatibel mit …:" text list
              const bodyTxt = document.body?.innerText ?? "";
              const listed = bodyTxt.match(/[Kk]ompatibel\s+mit[^:]*:\s*([\s\S]{0,600})/)?.[1];
              if (listed) {
                listed.split(/[,;\n]/).forEach((s) => {
                  const b = s.trim();
                  if (b.length > 1 && b.length < 50 && !/^\d/.test(b)) brands.push(b);
                });
              }
            }

            const imgEl = document.querySelector<HTMLImageElement>(
              ".product-image img, .prod-img img, .product-gallery img, " +
                '[class*="main-image"] img, [class*="primary-image"] img, ' +
                ".slick-current img, .product__image img"
            );
            // `.src` is always a string, so an empty value or a data: placeholder
            // must be tested explicitly before falling back to data-src
            const directSrc = imgEl?.src ?? "";
            const imageUrl =
              directSrc && !directSrc.startsWith("data:")
                ? directSrc
                : imgEl?.getAttribute("data-src") ?? "";

            const dsEl = document.querySelector<HTMLAnchorElement>(
              'a[href*="datasheet"], a[href*=".pdf"], a[download][href*=".pdf"]'
            );
            const datasheetUrl = dsEl?.href || dsEl?.getAttribute("href") || "";

            const h1 = document.querySelector("h1")?.textContent?.trim() ?? "";
            return { bodyText, specs, brands, imageUrl, datasheetUrl, h1 };
          }
        );

        if (!raw.bodyText) {
          log.warning(`No text: ${url}`);
          return;
        }
        const t = raw.bodyText;

        // ── Net price (ohne MwSt, EUR) ─────────────────────────────────────────
        const priceNet = extractNetPrice(t);

        // ── DE-Lager / Global-Lager / Nachlieferung ────────────────────────────
        const de = extractWarehouse(t, DE_QTY_PATTERNS, [
          DATE_AFTER_LIEFERUNG,
          DATE_AFTER_ERWARTET,
        ]);
        const global = extractWarehouse(t, GLOBAL_QTY_PATTERNS, [
          DATE_AFTER_LIEFERUNG,
          DATE_AFTER_ERWARTET,
        ]);
        // Backorders are labelled "erwartet" first, so that keyword takes priority here
        const backorder = extractWarehouse(t, BACKORDER_QTY_PATTERNS, [
          DATE_AFTER_ERWARTET,
          DATE_AFTER_LIEFERUNG,
        ]);

        // ── Units sold ─────────────────────────────────────────────────────────
        const soldText = (
          t.match(/(\d[\d.,KkMm]*)\s*(?:[Mm]al\s+)?[Vv]erkauft/) ??
          t.match(/([\d.,KkMm]+)\+?\s*sold/i)
        )?.[1];
        const unitsSold = soldText ? parseGermanQty(soldText) : undefined;

        // ── Part number refinement ─────────────────────────────────────────────
        // NOTE(review): when this replaces the listing's "FS-<id>" value, the
        // freshness filter in scrapeFs (which compares listing part numbers to
        // stored ones) presumably stops matching this product — confirm.
        const refinedPn = t.match(
          /(?:Part\s+Number|Teilenummer|Artikelnummer|P\/N)[:\s]+([A-Z0-9][-A-Z0-9./]{3,40})/i
        )?.[1];
        const partNumber = refinedPn ? refinedPn.trim() : listingPn;

        const compatibleBrands = [...new Set(raw.brands)]
          .filter((b) => b.length > 1)
          .slice(0, 30);

        log.info(
          `${partNumber}: €${priceNet?.toFixed(2) ?? "?"} | ` +
            `DE=${de.qty ?? "-"} GL=${global.qty ?? "-"} BO=${backorder.qty ?? "-"} ` +
            `sold=${unitsSold ?? "-"} brands=${compatibleBrands.length}`
        );

        details.push({
          url,
          name: raw.h1 || listingName,
          partNumber,
          priceNet,
          deQty: de.qty,
          deDeliveryDate: de.date,
          globalQty: global.qty,
          globalDeliveryDate: global.date,
          backorderQty: backorder.qty,
          backorderDate: backorder.date,
          unitsSold,
          compatibleBrands,
          specs: raw.specs,
          imageUrl: resolveUrl(raw.imageUrl, url),
          datasheetUrl: resolveUrl(raw.datasheetUrl, url),
        });
      },
    },
    makeCrawleeConfig("fs-phase2")
  );

  await crawler.run(requests);
  return details;
}

// ── Main export ────────────────────────────────────────────────────────────────

/**
 * Flatten an error and up to two levels of its `cause` chain into one string of
 * names, messages and codes. Node's fetch reports network failures as
 * "fetch failed" with the real code (e.g. ENOTFOUND) on `cause`, and
 * AbortSignal.timeout() rejects with `name === "TimeoutError"`, so the message
 * alone is not enough to classify a failure.
 */
function describeError(err: unknown): string {
  const parts: string[] = [];
  let current: unknown = err;
  for (let depth = 0; depth < 3 && typeof current === "object" && current !== null; depth++) {
    const { name, message, code, cause } = current as {
      name?: unknown;
      message?: unknown;
      code?: unknown;
      cause?: unknown;
    };
    for (const part of [name, message, code]) {
      if (typeof part === "string" && part) parts.push(part);
    }
    current = cause;
  }
  return parts.length > 0 ? parts.join(": ") : String(err);
}

const CONNECTIVITY_FAILURE_MARKERS = ["ENOTFOUND", "ECONNREFUSED", "ERR_EMPTY", "TimeoutError"];

/**
 * Run the full scrape: connectivity probe → Phase 1 (listing) → freshness
 * filter → Phase 2 (detail) → Phase 3 (database writes).
 *
 * Returns early, without throwing, when FS.com is unreachable or blocks this
 * IP, when no products are discovered, or when every product already has
 * fresh stock data. Per-product database errors are logged and counted but do
 * not abort the run.
 */
export async function scrapeFs(): Promise<void> {
  console.log("=== FS.com Scraper v2 Starting ===\n");

  // ── Quick connectivity check — exit early on datacenter IPs that block FS.com ─
  try {
    const probe = await fetch("https://www.fs.com/robots.txt", {
      signal: AbortSignal.timeout(8000),
      headers: { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" },
    });
    if (!probe.ok && probe.status !== 403) throw new Error(`HTTP ${probe.status}`);
    if (probe.status === 403) {
      const body = (await probe.text()).slice(0, 200);
      if (body.includes("Request blocked") || body.includes("ERROR")) {
        console.warn(
          "[FS.com] Server IP is blocked by FS.com — skipping. Run via run-fs-scraper-mac.sh on a residential IP."
        );
        return;
      }
    }
  } catch (err: unknown) {
    const msg = describeError(err);
    if (CONNECTIVITY_FAILURE_MARKERS.some((marker) => msg.includes(marker))) {
      console.warn(
        `[FS.com] Connectivity check failed (${msg.slice(0, 60)}) — skipping. This scraper requires a residential IP.`
      );
      return;
    }
    // For other errors (e.g. unexpected HTTP status), proceed anyway
  }

  const proxyConfiguration = buildProxyConfiguration();
  const vendorId = await ensureVendor(
    "FS.COM",
    "compatible",
    "https://www.fs.com",
    "https://www.fs.com/de/c/optical-transceivers-9"
  );
  console.log(`Vendor ID: ${vendorId}`);

  // ── Phase 1: Discover product URLs ─────────────────────────────────────────
  console.log("\n[Phase 1] Collecting product URLs from category listing pages…");
  const productMap = await collectProductUrls(proxyConfiguration);

  if (productMap.size === 0) {
    console.warn("[Phase 1] No products discovered — check selectors or proxy.");
    return;
  }

  // ── Filter: skip products with fresh stock data ─────────────────────────────
  const allPartNumbers = [...productMap.values()].map((p) => p.partNumber).filter(Boolean);
  const freshlyScraped = new Set<string>();

  if (allPartNumbers.length > 0) {
    // STOCK_FRESH_HOURS is a module constant, not user input, so interpolating it is safe
    const freshResult = await pool.query(
      `SELECT DISTINCT t.part_number
       FROM transceivers t
       JOIN stock_observations so ON so.transceiver_id = t.id
       WHERE so.source_vendor_id = $1
         AND so.time > NOW() - INTERVAL '${STOCK_FRESH_HOURS} hours'
         AND t.part_number = ANY($2)`,
      [vendorId, allPartNumbers]
    );
    for (const row of freshResult.rows) freshlyScraped.add(row.part_number as string);
  }

  const staleUrls = [...productMap.entries()]
    .filter(([, summary]) => !freshlyScraped.has(summary.partNumber))
    .map(([url]) => url);
  const urlsToScrape = staleUrls.slice(0, MAX_DETAIL_PAGES_PER_RUN);
  const deferredCount = staleUrls.length - urlsToScrape.length;

  console.log(`\n[Phase 2] Scraping ${urlsToScrape.length} detail pages`);
  console.log(
    `  (${productMap.size - staleUrls.length} skipped — data ≤${STOCK_FRESH_HOURS}h fresh)`
  );
  if (deferredCount > 0) {
    console.log(
      `  (${deferredCount} deferred — per-run cap of ${MAX_DETAIL_PAGES_PER_RUN} detail pages)`
    );
  }

  if (urlsToScrape.length === 0) {
    console.log("[Phase 2] All products have fresh stock data — nothing to scrape.");
    return;
  }

  // ── Phase 2: Scrape detail pages ────────────────────────────────────────────
  const detailRequests = urlsToScrape.map((url) => {
    const s = productMap.get(url);
    return {
      url,
      userData: { name: s?.name ?? "FS.com Product", partNumber: s?.partNumber ?? "" },
    };
  });

  const details = await scrapeProductDetails(detailRequests, proxyConfiguration);
  console.log(`[Phase 2] Complete — ${details.length} pages scraped.`);

  // ── Phase 3: Write to database ─────────────────────────────────────────────
  console.log("\n[Phase 3] Writing to database…");

  let priceWritten = 0;
  let stockWritten = 0;
  let specsUpdated = 0;
  let errors = 0;

  for (const detail of details) {
    try {
      const ff = detectFormFactor(detail.name);
      const speedInfo = detectSpeed(detail.name);
      const reach = detectReach(detail.name);
      const parsed = parseSpecTable(detail.specs);

      const transceiverId = await findOrCreateScrapedTransceiver({
        partNumber: detail.partNumber,
        vendorId,
        formFactor: ff,
        speedGbps: speedInfo?.speedGbps,
        speed: speedInfo?.speed,
        reachLabel: reach ?? parsed.reachLabel,
        reachMeters: parsed.reachMeters,
        fiberType: parsed.fiberType,
        wavelengths: parsed.wavelengths,
        imageUrl: detail.imageUrl,
        category: "DataCenter",
      });

      const stockLevel = deriveStockLevel(detail.deQty, detail.globalQty, detail.backorderQty);
      const totalQty = (detail.deQty ?? 0) + (detail.globalQty ?? 0);
      const quantityAvailable = totalQty > 0 ? totalQty : undefined;

      if (detail.priceNet !== undefined && detail.priceNet > 0) {
        // Hash over price + on-hand quantities
        const hash = contentHash({
          p: detail.priceNet,
          de: detail.deQty ?? 0,
          gl: detail.globalQty ?? 0,
        });
        const isNew = await upsertPriceObservation({
          transceiverId,
          sourceVendorId: vendorId,
          price: detail.priceNet,
          currency: "EUR",
          stockLevel,
          quantityAvailable,
          url: detail.url,
          contentHash: hash,
        });
        if (isNew) priceWritten++;
      }

      const stockNew = await upsertStockObservation({
        transceiverId,
        sourceVendorId: vendorId,
        stockLevel,
        quantityAvailable,
        warehouseDeQty: detail.deQty,
        warehouseDeDeliveryDate: detail.deDeliveryDate ?? null,
        warehouseGlobalQty: detail.globalQty,
        warehouseGlobalDeliveryDate: detail.globalDeliveryDate ?? null,
        backorderQty: detail.backorderQty,
        backorderEstimatedDate: detail.backorderDate ?? null,
        unitsSold: detail.unitsSold,
        compatibleBrands: detail.compatibleBrands,
        priceNet: detail.priceNet,
        productUrl: detail.url,
        // FS.com: per-warehouse breakdown (DE-Lager + Global-Lager), EUR net prices
        stockConfidence: 3,
        priceCurrency: "EUR",
        priceIncludesTax: false,
      });
      if (stockNew) stockWritten++;

      if (Object.keys(detail.specs).length > 0) {
        const updated = await updateVerifiedSpecs({
          transceiverId,
          fiberType: parsed.fiberType,
          connector: parsed.connector,
          wavelengths: parsed.wavelengths,
          reachMeters: parsed.reachMeters,
          reachLabel: parsed.reachLabel,
          powerConsumptionW: parsed.powerConsumptionW,
          tempRange: parsed.tempRange,
          modulation: parsed.modulation,
          domSupport: parsed.domSupport,
          imageUrl: detail.imageUrl,
          datasheetUrl: detail.datasheetUrl,
          source: "fs.com",
        });
        if (updated) specsUpdated++;
      }
    } catch (err: unknown) {
      // One bad product must not abort the batch: log, count, continue
      const reason = err instanceof Error ? err.message : String(err);
      console.error(`  ✗ ${detail.partNumber}: ${reason}`);
      errors++;
    }
  }

  console.log("\n=== FS.com Scraper v2 Complete ===");
  console.log(`  Products discovered:  ${productMap.size}`);
  console.log(`  Detail pages scraped: ${details.length}`);
  console.log(`  Price observations:   ${priceWritten} new`);
  console.log(`  Stock observations:   ${stockWritten} new`);
  console.log(`  Specs verified:       ${specsUpdated}`);
  if (errors > 0) console.warn(`  DB errors:            ${errors}`);
}

// Run directly (not imported): scrape, then close the DB pool either way.
if (require.main === module) {
  scrapeFs()
    .then(() => pool.end())
    .catch(async (err) => {
      console.error("Fatal:", err);
      try {
        // Let the pool shut down before the process is killed
        await pool.end();
      } finally {
        process.exit(1);
      }
    });
}