/**
 * FS.com Scraper — Prices, Stock, Product Catalog
 *
 * FS.com renders products client-side (JS), so we use PlaywrightCrawler.
 * Categories: /c/optical-transceivers-9
 *
 * Rate limiting: both crawlers run with a concurrency of 1; listing pages are
 * capped at 15 requests/minute and product detail pages at 10 requests/minute.
 *
 * NOTE(review): nothing in this file enforces robots.txt — confirm that it is
 * handled elsewhere (or enable it on the crawler) before relying on it.
 */
import { PlaywrightCrawler } from "crawlee";
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash";
import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";

const BASE_URL = "https://www.fs.com";

const CATEGORY_URLS = [
  "/c/1g-sfp-81",
  "/c/10g-sfp-63",
  "/c/25g-sfp28-3215",
  "/c/40g-qsfp-1360",
  "/c/100g-qsfp28-sfp-dd-1159",
  "/c/200g-qsfp-dd-qsfp56-3542",
  "/c/400g-osfp-qsfp112-qsfp-dd-3652",
  "/c/800g-osfp-qsfp-dd-4089",
  "/c/1.6t-osfp-5597",
  "/c/400g-coherent-qsfp-dd-4103",
  "/c/10g-cwdm-dwdm-sfp-65",
  "/c/100g-dwdm-qsfp28-3863",
];

/** Chromium flags shared by the listing crawler and the detail-page crawler. */
const BROWSER_ARGS = ["--disable-blink-features=AutomationControlled", "--lang=en-US"];

/** Upper bound on product detail pages crawled per run (to avoid rate limiting). */
const MAX_SPEC_PAGES_PER_RUN = 200;

/** Whatever `ensureVendor` resolves to; its concrete type is declared in ../utils/db. */
type VendorId = Awaited<ReturnType<typeof ensureVendor>>;

/**
 * Whatever `findOrCreateScrapedTransceiver` resolves to.
 * NOTE(review): assumed to be the same value as `transceivers.id` — confirm in ../utils/db.
 */
type TransceiverId = Awaited<ReturnType<typeof findOrCreateScrapedTransceiver>>;

/** A normalized product parsed from a category listing page. */
interface FsProduct {
  partNumber: string;
  name: string;
  price: number;
  currency: string;
  stockLevel: string;
  quantity?: number;
  url: string;
  formFactor?: string;
  speedGbps?: number;
  speed?: string;
  reachLabel?: string;
}

/** Raw strings pulled out of one listing card inside the browser. */
interface ListingCard {
  name: string;
  href: string;
  price: string;
  stock: string;
  partNumber: string;
}

/** Raw data pulled out of one product detail page inside the browser. */
interface SpecPageData {
  specs: Record<string, string>;
  imageUrl: string;
  datasheetUrl: string;
}

/**
 * Ordered speed patterns; the first match wins, so larger rates come first.
 * The `(?<![\d.])` guard stops a pattern from matching the tail of a longer
 * number (e.g. "1.25G" must not be read as "25G").
 */
const SPEED_PATTERNS: ReadonlyArray<readonly [RegExp, string, number]> = [
  [/(?<![\d.])1\.6\s*t/i, "1.6T", 1600],
  [/(?<![\d.])800\s*g/i, "800G", 800],
  [/(?<![\d.])400\s*g/i, "400G", 400],
  [/(?<![\d.])200\s*g/i, "200G", 200],
  [/(?<![\d.])100\s*g/i, "100G", 100],
  [/(?<![\d.])50\s*g/i, "50G", 50],
  [/(?<![\d.])40\s*g/i, "40G", 40],
  [/(?<![\d.])25\s*g/i, "25G", 25],
  [/(?<![\d.])10\s*g/i, "10G", 10],
  [/(?<![\d.])1\s*g\b/i, "1G", 1],
  // Gigabit modules are usually named "1000BASE-SX/LX/T" rather than "1G".
  [/(?<![\d.])1000\s*base/i, "1G", 1],
];

/** Reach such as "300m", "10 km" or "1.5km"; the decimal part is optional. */
const REACH_PATTERN = /(\d+(?:\.\d+)?)\s*(km|m)\b/i;

/**
 * Guesses the transceiver form factor from free text (typically the product name).
 *
 * Checks run from most to least specific because the names nest
 * ("qsfp-dd" contains "sfp-dd", "osfp"/"qsfp" contain "sfp").
 *
 * @param text Product name or description.
 * @returns The form factor label, or `undefined` when none is recognized.
 */
function detectFormFactor(text: string): string | undefined {
  const lower = text.toLowerCase();
  const has = (needle: string): boolean => lower.includes(needle);

  if (has("osfp") && !has("qsfp")) return "OSFP";
  if (has("qsfp-dd800") || has("qsfp-dd 800")) return "QSFP-DD800";
  if (has("qsfp-dd")) return "QSFP-DD";
  if (has("qsfp112")) return "QSFP112";
  if (has("qsfp56")) return "QSFP56";
  if (has("qsfp28")) return "QSFP28";
  if (has("qsfp+") || has("qsfp plus")) return "QSFP+";
  // Every "qsfp-dd" name has already returned above, so this is a real SFP-DD.
  if (has("sfp-dd")) return "SFP-DD";
  if (has("sfp56")) return "SFP56";
  if (has("sfp28")) return "SFP28";
  if (has("sfp+") || has("sfp plus")) return "SFP+";
  if (has("sfp") && !has("qsfp")) return "SFP";
  if (has("cfp2")) return "CFP2";
  if (has("xfp")) return "XFP";
  return undefined;
}

/**
 * Guesses the line rate from free text (typically the product name).
 *
 * @param text Product name or description.
 * @returns The speed label and its value in Gbps, or `undefined` when no pattern matches.
 */
function detectSpeed(text: string): { speed: string; speedGbps: number } | undefined {
  for (const [pattern, speed, speedGbps] of SPEED_PATTERNS) {
    if (pattern.test(text)) return { speed, speedGbps };
  }
  return undefined;
}

/**
 * Extracts the first reach figure ("550m", "10km", "1.5km") from free text.
 *
 * @param text Product name or description.
 * @returns The reach with whitespace removed and a lower-cased unit, or `undefined`.
 */
function detectReach(text: string): string | undefined {
  const match = REACH_PATTERN.exec(text);
  if (!match) return undefined;
  return `${match[1]}${match[2].toLowerCase()}`;
}

/**
 * Resolves an href scraped from fs.com against {@link BASE_URL}.
 *
 * Handles absolute, root-relative, path-relative and protocol-relative
 * ("//host/path") hrefs; plain string concatenation mangles the last two.
 *
 * @returns The absolute http(s) URL, or `undefined` for empty, unparsable or
 *          non-http(s) input (e.g. `data:` placeholders of lazy-loaded images).
 */
function toAbsoluteUrl(href: string): string | undefined {
  if (!href) return undefined;
  try {
    const resolved = new URL(href, BASE_URL);
    const isWeb = resolved.protocol === "http:" || resolved.protocol === "https:";
    return isWeb ? resolved.toString() : undefined;
  } catch {
    return undefined;
  }
}

/** Renders an unknown thrown value as a log-friendly message. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Collects listing cards from a category page.
 *
 * Runs INSIDE the browser via `page.evaluate`, so it must stay self-contained:
 * it may not reference anything from this module's scope.
 */
function extractListingCards(): ListingCard[] {
  const results: ListingCard[] = [];

  // Strategy 1: Parse .category__grid__item cards (2026 Vue.js DOM)
  const gridItems = document.querySelectorAll(".category__grid__item");
  for (const item of gridItems) {
    const link = item.querySelector('a[href*="/products/"]') as HTMLAnchorElement | null;
    if (!link) continue;

    const img = item.querySelector("img");
    const priceEl = item.querySelector(".grid__price");
    const allText = item.textContent || "";

    const name = img?.getAttribute("alt")?.trim() || link.textContent?.trim() || "";
    const href = link.getAttribute("href") || "";
    const price = priceEl?.textContent?.trim() || "";

    // Extract stock from text like "1914 in Global Warehouse"
    const stockMatch = allText.match(/(\d+)\s+in\s+(?:Global\s+)?Warehouse/i);
    const stock = stockMatch ? stockMatch[1] + " in stock" : "";

    // Extract FS product ID from URL
    const pnMatch = href.match(/products\/(\d+)\.html/);
    const partNumber = pnMatch ? `FS-${pnMatch[1]}` : "";

    if (name && href) {
      results.push({ name, href, price, stock, partNumber });
    }
  }

  // Strategy 2: Fallback — look for product links with prices nearby
  if (results.length === 0) {
    const productLinks = document.querySelectorAll('a[href*="/products/"], a[href*="/product/"]');
    for (const link of productLinks) {
      const el = link as HTMLAnchorElement;
      const name = el.textContent?.trim() || "";
      const href = el.getAttribute("href") || "";
      if (!name || name.length < 5 || !href) continue;

      const container =
        el.closest('[class*="product"]') ||
        el.closest('[class*="item"]') ||
        el.closest("li") ||
        el.parentElement?.parentElement;

      let price = "";
      let stock = "";
      if (container) {
        const priceEl = container.querySelector('[class*="price"]');
        price = priceEl?.textContent?.trim() || "";
        const stockEl = container.querySelector('[class*="stock"], [class*="avail"]');
        stock = stockEl?.textContent?.trim() || "";
      }

      // Last path segment without ".html" and without any query string.
      const pn = href.split("/").pop()?.replace(".html", "")?.replace(/\?.*/, "") || "";
      results.push({ name, href, price, stock, partNumber: pn });
    }
  }

  return results;
}

/**
 * Collects the spec table, main image and datasheet link from a product detail page.
 *
 * Runs INSIDE the browser via `page.evaluate`, so it must stay self-contained:
 * it may not reference anything from this module's scope.
 */
function extractSpecData(): SpecPageData {
  const specs: Record<string, string> = {};

  // fs.com uses various spec table formats
  const rows = document.querySelectorAll(
    ".product-param tr, .product-specs tr, table.param-table tr, " +
      ".specifications tr, .detail-param tr, .prod-spec-list tr, " +
      '[class*="specification"] tr, [class*="param"] tr'
  );
  for (const row of rows) {
    const cells = row.querySelectorAll("td, th");
    if (cells.length >= 2) {
      const key = (cells[0]?.textContent || "").trim();
      const val = (cells[1]?.textContent || "").trim();
      if (key && val && key.length < 100) specs[key] = val;
    }
  }

  // Also try dl/dt/dd pattern
  const dts = document.querySelectorAll("dt, .spec-label, .param-label");
  for (const dt of dts) {
    const dd = dt.nextElementSibling;
    if (
      dd &&
      (dd.tagName === "DD" || dd.classList.contains("spec-value") || dd.classList.contains("param-value"))
    ) {
      const key = (dt.textContent || "").trim();
      const val = (dd.textContent || "").trim();
      if (key && val) specs[key] = val;
    }
  }

  // Extract image
  const img = document.querySelector(
    '.product-image img, .prod-img img, [class*="gallery"] img, .product-detail img'
  );
  const imageUrl = img?.getAttribute("src") || "";

  // Extract datasheet link
  const dsLink = document.querySelector('a[href*="datasheet"], a[href*=".pdf"]');
  const datasheetUrl = dsLink?.getAttribute("href") || "";

  return { specs, imageUrl, datasheetUrl };
}

/**
 * Converts a raw listing card into an {@link FsProduct}.
 *
 * @returns `undefined` when the card has no name, no price text, or a price
 *          that `parsePrice` does not turn into a positive number.
 */
function toFsProduct(card: ListingCard): FsProduct | undefined {
  if (!card.name || !card.price) return undefined;

  const { price, currency } = parsePrice(card.price);
  if (!(price > 0)) return undefined;

  const speedInfo = detectSpeed(card.name);
  return {
    partNumber: card.partNumber || card.name.slice(0, 50),
    name: card.name,
    price,
    currency,
    stockLevel: card.stock ? parseStockLevel(card.stock) : "on_request",
    quantity: card.stock ? parseQuantity(card.stock) : undefined,
    // Fall back to the historical prefixing if the href cannot be resolved.
    url: toAbsoluteUrl(card.href) ?? `${BASE_URL}${card.href}`,
    formFactor: detectFormFactor(card.name),
    speedGbps: speedInfo?.speedGbps,
    speed: speedInfo?.speed,
    reachLabel: detectReach(card.name),
  };
}

/** Phase 1a: crawls every category listing page and returns the parsed products. */
async function crawlCategoryListings(): Promise<{ products: FsProduct[]; pagesScraped: number }> {
  const products: FsProduct[] = [];
  let pagesScraped = 0;

  const crawler = new PlaywrightCrawler({
    maxConcurrency: 1,
    maxRequestsPerMinute: 15,
    requestHandlerTimeoutSecs: 60,
    headless: true,
    launchContext: {
      launchOptions: { args: BROWSER_ARGS },
    },
    preNavigationHooks: [
      async ({ page }) => {
        await page.setExtraHTTPHeaders({
          "Accept-Language": "en-US,en;q=0.9",
        });
        // Pin locale cookies before navigation so prices are requested in USD.
        await page.context().addCookies([
          { name: "currency", value: "USD", domain: ".fs.com", path: "/" },
          { name: "lang", value: "en", domain: ".fs.com", path: "/" },
          { name: "country", value: "US", domain: ".fs.com", path: "/" },
        ]);
      },
    ],
    async requestHandler({ page, request, log }) {
      log.info(`Scraping: ${request.url}`);

      // Wait for Vue.js product grid to render
      await page.waitForTimeout(4000);

      const cards: ListingCard[] = await page.evaluate(extractListingCards);
      for (const card of cards) {
        const product = toFsProduct(card);
        if (product) products.push(product);
      }

      pagesScraped++;
      log.info(` Found ${cards.length} items on page`);
    },
  });

  await crawler.run(CATEGORY_URLS.map((path) => `${BASE_URL}${path}`));
  return { products, pagesScraped };
}

/** Keeps the first product seen for each part number (falling back to the name). */
function dedupeByPartNumber(products: readonly FsProduct[]): Map<string, FsProduct> {
  const unique = new Map<string, FsProduct>();
  for (const product of products) {
    const key = product.partNumber || product.name;
    if (!unique.has(key)) unique.set(key, product);
  }
  return unique;
}

/**
 * Phase 1b: upserts each product and its price observation.
 * A failure on one product is logged and does not stop the rest.
 */
async function persistProducts(products: ReadonlyMap<string, FsProduct>, vendorId: VendorId): Promise<void> {
  let written = 0;
  let skipped = 0;

  for (const p of products.values()) {
    try {
      const transceiverId = await findOrCreateScrapedTransceiver({
        partNumber: p.partNumber,
        vendorId,
        formFactor: p.formFactor,
        speedGbps: p.speedGbps,
        speed: p.speed,
        reachLabel: p.reachLabel,
        category: "DataCenter",
      });

      const hash = contentHash({ price: p.price, stock: p.stockLevel, qty: p.quantity });
      const isNew = await upsertPriceObservation({
        transceiverId,
        sourceVendorId: vendorId,
        price: p.price,
        currency: p.currency,
        stockLevel: p.stockLevel,
        quantityAvailable: p.quantity,
        url: p.url,
        contentHash: hash,
      });

      if (isNew) written++;
      else skipped++;
    } catch (err: unknown) {
      console.error(` Error: ${p.partNumber}:`, errorMessage(err));
    }
  }

  console.log(`\nDatabase: ${written} new, ${skipped} unchanged (${products.size} unique)`);
}

/**
 * Maps transceiver ids to the product URL scraped in this run.
 * Lookups are best-effort: a failed query is logged and that product is skipped.
 */
async function mapTransceiverIdsToUrls(
  products: ReadonlyMap<string, FsProduct>,
  vendorId: VendorId
): Promise<Map<TransceiverId, string>> {
  const productUrls = new Map<TransceiverId, string>();

  for (const p of products.values()) {
    if (!p.url) continue;
    try {
      // Find the transceiver in DB by part number
      const match = await pool.query(
        `SELECT id FROM transceivers WHERE part_number = $1 AND vendor_id = $2`,
        [p.partNumber, vendorId]
      );
      const row = match.rows[0];
      if (row) productUrls.set(row.id, p.url);
    } catch (err: unknown) {
      console.warn(` Lookup failed: ${p.partNumber}:`, errorMessage(err));
    }
  }

  return productUrls;
}

/**
 * Phase 2: crawls product detail pages and stores VERIFIED specs.
 *
 * Only transceivers that (a) belong to this vendor, (b) were seen in this run
 * (so we have a URL) and (c) still lack verified specs are crawled, capped at
 * {@link MAX_SPEC_PAGES_PER_RUN}. The "seen in this run" restriction is applied
 * in SQL *before* the LIMIT; otherwise the limit could select only rows we have
 * no URL for and the crawl would be empty even though work is pending.
 */
async function crawlVerifiedSpecs(products: ReadonlyMap<string, FsProduct>, vendorId: VendorId): Promise<void> {
  console.log("\n=== Phase 2: Scraping product detail pages for verified specs ===");

  const productUrls = await mapTransceiverIdsToUrls(products, vendorId);

  // Get products that need spec verification (enriched_estimated or missing key fields)
  const candidates =
    productUrls.size > 0
      ? (
          await pool.query(
            `SELECT t.id, t.part_number, t.slug
             FROM transceivers t
             WHERE t.vendor_id = $1
               AND t.id = ANY($2)
               AND (t.data_confidence = 'enriched_estimated'
                 OR t.data_confidence = 'unknown'
                 OR t.connector IS NULL OR t.connector = '' OR t.connector = '-'
                 OR t.wavelengths IS NULL OR t.wavelengths = ''
                 OR t.fiber_type IS NULL OR t.fiber_type = '')
             LIMIT $3`,
            [vendorId, [...productUrls.keys()], MAX_SPEC_PAGES_PER_RUN]
          )
        ).rows
      : [];
  console.log(`Products needing spec verification: ${candidates.length}`);

  const specRequests = candidates.flatMap((row) => {
    const url = productUrls.get(row.id);
    return url ? [{ url, userData: { transceiverId: row.id } }] : [];
  });

  if (specRequests.length === 0) {
    console.log("No product URLs available for spec verification this run");
    return;
  }

  let specsUpdated = 0;
  const specCrawler = new PlaywrightCrawler({
    maxConcurrency: 1,
    maxRequestsPerMinute: 10,
    requestHandlerTimeoutSecs: 45,
    headless: true,
    launchContext: {
      launchOptions: { args: BROWSER_ARGS },
    },
    preNavigationHooks: [
      async ({ page }) => {
        await page.context().addCookies([
          { name: "currency", value: "USD", domain: ".fs.com", path: "/" },
          { name: "lang", value: "en", domain: ".fs.com", path: "/" },
        ]);
      },
    ],
    async requestHandler({ page, request, log }) {
      const transceiverId = request.userData?.transceiverId;
      if (!transceiverId) return;

      log.info(`Spec scrape: ${request.url}`);
      await page.waitForTimeout(3000);

      // Extract spec table from product detail page
      const specData: SpecPageData = await page.evaluate(extractSpecData);
      if (Object.keys(specData.specs).length === 0) return;

      const parsed = parseSpecTable(specData.specs);
      const updated = await updateVerifiedSpecs({
        transceiverId,
        fiberType: parsed.fiberType,
        connector: parsed.connector,
        wavelengths: parsed.wavelengths,
        reachMeters: parsed.reachMeters,
        reachLabel: parsed.reachLabel,
        powerConsumptionW: parsed.powerConsumptionW,
        tempRange: parsed.tempRange,
        modulation: parsed.modulation,
        domSupport: parsed.domSupport,
        imageUrl: toAbsoluteUrl(specData.imageUrl),
        datasheetUrl: toAbsoluteUrl(specData.datasheetUrl),
        source: "fs.com",
      });
      if (updated) specsUpdated++;
    },
  });

  console.log(`Crawling ${specRequests.length} product detail pages for specs...`);
  await specCrawler.run(specRequests);
  console.log(`Specs verified: ${specsUpdated} products updated`);
}

/**
 * Runs the full FS.com scrape:
 *  1. crawl category listings, deduplicate, and upsert products + price observations;
 *  2. crawl detail pages of products that still lack verified specs.
 *
 * Per-product database errors are logged and skipped. Errors from
 * `ensureVendor`, the crawlers, or the Phase 2 candidate query propagate to the caller.
 * The shared `pool` is NOT closed here; the CLI entry point below does that.
 */
export async function scrapeFs(): Promise<void> {
  console.log("=== FS.com Scraper Starting ===\n");

  const vendorId = await ensureVendor(
    "FS.COM",
    "compatible",
    "https://www.fs.com",
    "https://www.fs.com/c/optical-transceivers-9"
  );
  console.log(`Vendor ID: ${vendorId}`);

  const { products, pagesScraped } = await crawlCategoryListings();
  console.log(`\nPages scraped: ${pagesScraped}`);
  console.log(`Products found: ${products.length}`);

  // Deduplicate by partNumber
  const uniqueProducts = dedupeByPartNumber(products);

  // Write to database
  await persistProducts(uniqueProducts, vendorId);

  // ═══ PHASE 2: Crawl product detail pages for VERIFIED specs ═══
  await crawlVerifiedSpecs(uniqueProducts, vendorId);

  console.log("=== FS.com Scraper Complete ===\n");
}

// CLI entry point: run the scraper when this file is executed directly.
if (require.main === module) {
  scrapeFs()
    .then(() => pool.end())
    .catch((err) => {
      console.error("Fatal:", err);
      // Best-effort pool shutdown; the process exits immediately with a failure code.
      pool.end();
      process.exit(1);
    });
}