/**
 * ATGBICS Scraper — Prices, Stock, Product Catalog
 *
 * ATGBICS is a UK-based independent compatible optics vendor.
 * Site uses Shopify with client-side rendering, so we use PlaywrightCrawler.
 * Prices are publicly visible in GBP.
 *
 * Categories scraped:
 *   /collections/sfp-transceivers/
 *   /collections/sfp-plus-transceivers/
 *   /collections/sfp28-transceivers/
 *   /collections/qsfp-plus-transceivers/
 *   /collections/qsfp28-transceivers/
 *   /collections/qsfp-dd-transceivers/
 *
 * Politeness: concurrency 1, at most 20 requests/minute, max 50 pages per run.
 * NOTE(review): the original header also promised robots.txt compliance; nothing in
 * this file enforces it — confirm it is handled elsewhere (e.g. makeCrawleeConfig).
 */
import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";
import { makeCrawleeConfig } from "../utils/crawlee-config";
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash";

const BASE_URL = "https://www.atgbics.com";

// SOCKS5 proxy pool — rotate across CT130/131/132 to avoid IP blocks.
// Read from the comma-separated PROXY_URLS env var; blank entries are dropped.
const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
  .split(",")
  .map((u) => u.trim())
  .filter(Boolean);

/**
 * Builds the Crawlee proxy configuration from PROXY_URLS.
 *
 * @returns A ProxyConfiguration, or undefined when no proxies are configured
 *          (the crawler then connects directly).
 */
function buildProxyConfiguration(): ProxyConfiguration | undefined {
  if (PROXY_URLS.length === 0) return undefined;
  return new ProxyConfiguration({ proxyUrls: PROXY_URLS });
}

// Collection (listing) pages used as crawl entry points, relative to BASE_URL.
const CATEGORY_URLS = [
  "/collections/sfp-transceivers/",
  "/collections/sfp-plus-transceivers/",
  "/collections/sfp28-transceivers/",
  "/collections/qsfp-plus-transceivers/",
  "/collections/qsfp28-transceivers/",
  "/collections/qsfp-dd-transceivers/",
];

// Upper bound on pages per run; used both as maxRequestsPerCrawl and as the
// cut-off for enqueueing further pagination links.
const MAX_PAGES = 50;

/** One scraped product, normalised for the database writers below. */
interface AtgbicsProduct {
  partNumber: string;
  name: string;
  price: number;
  currency: string;
  stockLevel: string;
  quantity?: number;
  url: string;
  formFactor?: string;
  speedGbps?: number;
  speed?: string;
  reachLabel?: string;
  fiberType?: string;
}

/**
 * Guesses the transceiver form factor from free text (product name/description).
 *
 * Checks run from most to least specific so that e.g. "QSFP28" is not reported
 * as "SFP28" and "SFP+" is not reported as plain "SFP".
 *
 * @param text Product name and/or description.
 * @returns The form-factor label, or undefined when nothing matches.
 */
function detectFormFactor(text: string): string | undefined {
  const lower = text.toLowerCase();
  if (lower.includes("qsfp-dd") || lower.includes("qsfp dd")) return "QSFP-DD";
  if (lower.includes("qsfp28")) return "QSFP28";
  if (lower.includes("qsfp+") || lower.includes("qsfp plus") || lower.includes("qsfp-plus")) return "QSFP+";
  if (lower.includes("sfp28")) return "SFP28";
  if (lower.includes("sfp+") || lower.includes("sfp plus") || lower.includes("sfp-plus")) return "SFP+";
  if (lower.includes("sfp") && !lower.includes("qsfp")) return "SFP";
  if (lower.includes("xfp")) return "XFP";
  if (lower.includes("cfp2")) return "CFP2";
  if (lower.includes("cfp")) return "CFP";
  return undefined;
}

// Speed patterns, evaluated in order (first match wins).
//  - (?<![\d.]) keeps a rate from matching the tail of a longer number, so
//    "1.25G" is not read as 25G and "2100G" is not read as 100G.
//  - (?!hz) keeps DWDM grid spacings such as "100GHz" / "50GHz" from being read
//    as line rates.
const SPEED_PATTERNS: ReadonlyArray<readonly [RegExp, string, number]> = [
  [/(?<![\d.])800\s*g(?!hz)/i, "800G", 800],
  [/(?<![\d.])400\s*g(?!hz)/i, "400G", 400],
  [/(?<![\d.])200\s*g(?!hz)/i, "200G", 200],
  [/(?<![\d.])100\s*g(?!hz)/i, "100G", 100],
  [/(?<![\d.])50\s*g(?!hz)/i, "50G", 50],
  [/(?<![\d.])40\s*g(?!hz)/i, "40G", 40],
  [/(?<![\d.])25\s*g(?!hz)/i, "25G", 25],
  [/(?<![\d.])10\s*g(?!hz)/i, "10G", 10],
  // 1.25G is the line rate of gigabit optics; report it like 1000BASE-x.
  [/(?<![\d.])1\.25\s*g(?!hz)/i, "1G", 1],
  [/1000\s*base/i, "1G", 1],
  [/(?<![\d.])1\s*g\b/i, "1G", 1],
];

/**
 * Guesses the nominal data rate from free text.
 *
 * @param text Product name and/or description.
 * @returns The speed label (e.g. "10G") with its value in Gbps, or undefined
 *          when no known rate is mentioned.
 */
function detectSpeed(text: string): { speed: string; speedGbps: number } | undefined {
  for (const [re, speed, speedGbps] of SPEED_PATTERNS) {
    if (re.test(text)) return { speed, speedGbps };
  }
  return undefined;
}

/**
 * Extracts the first reach figure ("300m", "10km", "0.5m") from free text or a slug.
 *
 * Wavelengths such as "1310nm" do not match because the unit must directly
 * follow the number. Decimal values are kept whole, so "0.5m" is not truncated
 * to "5m".
 *
 * @param text Product name, description or URL slug.
 * @returns The reach with a lowercase unit, or undefined when none is found.
 */
function detectReach(text: string): string | undefined {
  const match = /(\d+(?:\.\d+)?)\s*(km|m)\b/i.exec(text);
  if (!match) return undefined;
  const [, value = "", unit = ""] = match;
  return `${value}${unit.toLowerCase()}`;
}

// OEM vendor names that ATGBICS appends to the part number in product slugs.
// Multi-word names must be listed in full: a single token such as "palo" can
// never match a "-palo-alto" suffix.
const OEM_VENDOR_SUFFIXES = [
  "nokia", "cisco", "juniper", "arista", "huawei", "hp", "hpe", "dell",
  "extreme", "brocade", "avaya", "netgear", "mikrotik", "ubiquiti",
  "mellanox", "intel", "broadcom", "allied", "allied-telesis", "planet",
  "zyxel", "dlink", "d-link", "foundry", "force10", "enterasys", "optical",
  "palo", "palo-alto", "fortinet", "hitachi", "calix", "ciena", "adtran",
  "ribbon", "sycamore", "rad", "zhone", "infinera", "fujitsu", "nec",
  "ericsson", "alcatel", "lucent",
];

// Matches one or more trailing "-<vendor>" tokens, so compound names such as
// "-alcatel-lucent" are removed completely regardless of list order.
const OEM_VENDOR_SUFFIX_RE = new RegExp(`(?:-(?:${OEM_VENDOR_SUFFIXES.join("|")}))+$`, "i");

/**
 * Extract the real OEM part number from an ATGBICS URL slug.
 *
 * ATGBICS slug format: {oem-part-number}-{vendor}-r-compatible-transceiver-{specs}
 * Examples:
 *   3he16564aa-nokia-r-compatible-transceiver-qsfp-dd-... → 3HE16564AA
 *   jnp-sfp-25g-lr-juniper-r-compatible-...               → JNP-SFP-25G-LR
 *   sfp-10g-sr-cisco-compatible-...                       → SFP-10G-SR
 *
 * @param slug The path segment after "/products/".
 * @returns The uppercased part number. If extraction yields nothing usable
 *          (empty, longer than 40 chars, or still containing "TRANSCEIVER"),
 *          the uppercased slug truncated to 40 characters is returned instead.
 */
function extractOemPartNumber(slug: string): string {
  const result = slug
    // Drop "-compatible" and everything after it, together with the "-r" that
    // precedes it in the slug format above — whether or not "-transceiver" follows.
    .replace(/(?:-r)?-compatible.*$/i, "")
    // Drop the trailing OEM vendor name(s).
    .replace(OEM_VENDOR_SUFFIX_RE, "")
    // OEM part numbers are conventionally uppercase.
    .toUpperCase()
    .trim();

  if (!result || result.length > 40 || result.includes("TRANSCEIVER")) {
    return slug.toUpperCase().slice(0, 40);
  }
  return result;
}

/**
 * Guesses the media type from free text.
 *
 * Single-mode hints are checked first, then multi-mode, then copper/DAC, so a
 * text containing hints for several types is classified by the earliest check.
 *
 * @param text Product name and/or description.
 * @returns "SMF", "MMF", "DAC", or undefined when nothing matches.
 */
function detectFiberType(text: string): string | undefined {
  const lower = text.toLowerCase();
  if (
    lower.includes("single mode") || lower.includes("single-mode") || lower.includes("smf") ||
    lower.includes("-lr") || lower.includes("-er") || lower.includes("-zr")
  ) return "SMF";
  if (
    lower.includes("multi mode") || lower.includes("multi-mode") || lower.includes("mmf") ||
    lower.includes("-sr") || lower.includes("-sx")
  ) return "MMF";
  if (
    lower.includes("dac") || lower.includes("direct attach") || lower.includes("copper") ||
    lower.includes("-t ") || lower.includes("twinax")
  ) return "DAC";
  return undefined;
}

/**
 * Crawls the ATGBICS collection pages, extracts products with prices and stock,
 * de-duplicates them by part number and writes them to the database.
 *
 * Steps:
 *  1. Ensure the ATGBICS vendor row exists.
 *  2. Crawl CATEGORY_URLS (following pagination links) with a single-concurrency
 *     PlaywrightCrawler, collecting products in memory.
 *  3. De-duplicate, then upsert one transceiver + price observation per product.
 *
 * A failure while writing one product is logged and does not abort the run.
 * The database pool is NOT closed here; the caller owns it.
 */
export async function scrapeAtgbics(): Promise<void> {
  console.log("=== ATGBICS Scraper Starting ===\n");

  const vendorId = await ensureVendor(
    "ATGBICS",
    "compatible",
    "https://www.atgbics.com",
    "https://www.atgbics.com/collections/sfp-plus-transceivers/"
  );
  console.log(`Vendor ID: ${vendorId}`);

  const products: AtgbicsProduct[] = [];
  let pagesScraped = 0;

  const proxyConfiguration = buildProxyConfiguration();

  const crawler = new PlaywrightCrawler({
    maxConcurrency: 1,
    maxRequestsPerMinute: 20, // one request every ~3s on average at concurrency 1
    maxRequestsPerCrawl: MAX_PAGES,
    requestHandlerTimeoutSecs: 60,
    headless: true,
    useSessionPool: false, // Disable session pool to avoid SDK_SESSION_POOL_STATE.json crash
    ...(proxyConfiguration ? { proxyConfiguration } : {}),
    launchContext: {
      launchOptions: {
        args: ["--disable-blink-features=AutomationControlled", "--no-sandbox"],
      },
    },

    async requestHandler({ page, request, enqueueLinks, log }) {
      const url = request.url;
      log.info(`Scraping: ${url}`);

      // Give the client-side rendered Shopify grid time to appear.
      await page.waitForTimeout(2000);

      // Collection (listing) page vs. product detail page.
      const isCollection = url.includes("/collections/");

      if (isCollection) {
        // Runs in the browser context: must not reference anything from this module.
        const productData = await page.evaluate(() => {
          const results: Array<{
            name: string;
            href: string;
            price: string;
            stock: string;
            partNumber: string;
          }> = [];

          // Shopify collection page — product cards
          const cards = document.querySelectorAll(
            ".product-item, .grid-product, [class*=\"product-card\"], [class*=\"product-grid\"] li, .collection-grid__item"
          );

          for (const card of cards) {
            const linkEl = card.querySelector("a[href*=\"/products/\"]") as HTMLAnchorElement | null;
            const nameEl = card.querySelector(
              ".product-item__title, .grid-product__title, [class*=\"product-title\"], [class*=\"product-name\"], h2, h3"
            );
            const priceEl = card.querySelector(
              ".product-item__price, .grid-product__price, [class*=\"price\"]:not([class*=\"compare\"]):not([class*=\"was\"])"
            );
            const stockEl = card.querySelector(
              "[class*=\"stock\"], [class*=\"availability\"], [class*=\"badge\"]"
            );

            const href = linkEl?.getAttribute("href") || "";
            const name = nameEl?.textContent?.trim() || linkEl?.textContent?.trim() || "";
            const price = priceEl?.textContent?.trim() || "";
            const stock = stockEl?.textContent?.trim() || "";

            // Raw URL slug: /products/sfp-10g-lr → sfp-10g-lr.
            // The OEM part number is derived from it outside the browser context.
            const slug = href.split("/products/")[1]?.split("?")[0]?.replace(/\/$/, "") || "";

            if (href && name && name.length > 3) {
              results.push({ name, href, price, stock, partNumber: slug });
            }
          }

          // Fallback: grab any /products/ links with adjacent price text
          if (results.length === 0) {
            const allProductLinks = document.querySelectorAll("a[href*=\"/products/\"]");
            const seen = new Set<string>();
            for (const el of allProductLinks) {
              const a = el as HTMLAnchorElement;
              const href = a.getAttribute("href") || "";
              if (seen.has(href)) continue;
              seen.add(href);

              const name = a.textContent?.trim() || "";
              if (!name || name.length < 3) continue;

              const container = a.closest("li") || a.closest("article") || a.parentElement?.parentElement;
              const priceEl = container?.querySelector("[class*=\"price\"]");
              const price = priceEl?.textContent?.trim() || "";
              const slug = href.split("/products/")[1]?.split("?")[0]?.replace(/\/$/, "") || "";

              results.push({ name, href, price, stock: "", partNumber: slug });
            }
          }

          return results;
        });

        log.info(` Found ${productData.length} products on collection page`);

        for (const item of productData) {
          if (!item.href) continue;
          // Listings without a price cannot produce a price observation.
          if (!item.price) continue;

          const fullUrl = item.href.startsWith("http") ? item.href : `${BASE_URL}${item.href}`;
          const { price, currency } = parsePrice(item.price);
          if (!(price > 0)) continue;

          const speedInfo = detectSpeed(item.name);
          // Real OEM part number from the slug (strips the "-compatible-…" tail).
          const realPartNumber = extractOemPartNumber(item.partNumber);
          // Reach from name OR slug (slug often has "120km" even when name doesn't).
          const reachLabel = detectReach(item.name) ?? detectReach(item.partNumber);

          products.push({
            partNumber: realPartNumber || item.name.slice(0, 80),
            name: item.name,
            price,
            // ATGBICS is GBP — parsePrice may default to USD if no symbol on listing
            currency: currency === "USD" ? "GBP" : currency,
            stockLevel: item.stock ? parseStockLevel(item.stock) : "in_stock",
            quantity: item.stock ? parseQuantity(item.stock) : undefined,
            url: fullUrl,
            formFactor: detectFormFactor(item.name),
            speedGbps: speedInfo?.speedGbps,
            speed: speedInfo?.speed,
            reachLabel,
            fiberType: detectFiberType(item.name),
          });
        }

        // Enqueue next page if pagination exists; stop enqueueing at the page cap.
        await enqueueLinks({
          selector: "a[href*=\"?page=\"], a.pagination__next, a[rel=\"next\"], .pagination a[href]",
          transformRequestFunction: (req) => {
            if (pagesScraped >= MAX_PAGES) return false;
            return req;
          },
        });

        pagesScraped++;
      } else {
        // Product detail page — extract precise data.
        // NOTE(review): only pagination links are enqueued above, so this branch
        // appears to run only if such a link points at a non-collection URL —
        // confirm whether product links were meant to be enqueued as well.
        const data = await page.evaluate(() => {
          const title = document.querySelector(
            "h1.product__title, h1.product-title, h1.product_title, h1"
          )?.textContent?.trim() || "";

          // Shopify price — prefer sale price if available
          const salePriceEl = document.querySelector(
            ".price__sale .price-item--sale, .product__price .money, [class*=\"price\"] .money, [data-product-price], .price ins"
          );
          const priceText = salePriceEl?.textContent?.trim() || "";

          // Stock / availability
          const stockEl = document.querySelector(
            ".product__availability, .availability, [class*=\"stock\"], [class*=\"inventory\"], .badge--sold-out, .badge--in-stock"
          );
          const stockText = stockEl?.textContent?.trim() || "";

          // Quantity badge (some Shopify themes show "X in stock")
          const qtyEl = document.querySelector("[class*=\"quantity\"], [class*=\"inventory-count\"]");
          const qtyText = qtyEl?.textContent?.trim() || "";

          // Short description / variant title for reach/fiber info
          const descEl = document.querySelector(
            ".product__description, .product-description, .rte p:first-child, .product__short-description"
          );
          const description = descEl?.textContent?.trim() || "";

          // SKU / part number (Shopify often exposes this)
          const skuEl = document.querySelector(".product__sku, [class*=\"sku\"], [itemprop=\"sku\"]");
          const sku = skuEl?.textContent?.replace(/SKU[:\s]*/i, "").trim() || "";

          return { title, priceText, stockText, qtyText, description, sku };
        });

        const slug = url.split("/products/")[1]?.split("?")[0]?.replace(/\/$/, "") || "";

        // Prefer the Shopify SKU when it looks plausible (3–39 chars), otherwise
        // extract the OEM part number from the slug.
        const partNumber =
          data.sku && data.sku.length > 2 && data.sku.length < 40
            ? data.sku.toUpperCase()
            : extractOemPartNumber(slug);

        const name = data.title || slug;
        const combinedText = `${name} ${data.description}`;
        const { price, currency } = parsePrice(data.priceText);

        if (price > 0) {
          const speedInfo = detectSpeed(combinedText);
          // Reach from title/description first, then fall back to slug (slug often has "120km").
          const reachLabel = detectReach(combinedText) ?? detectReach(slug);

          products.push({
            partNumber,
            name,
            price,
            currency: currency === "USD" ? "GBP" : currency, // ATGBICS prices in GBP
            stockLevel: data.stockText ? parseStockLevel(data.stockText) : "in_stock",
            quantity: data.qtyText ? parseQuantity(data.qtyText) : undefined,
            url,
            formFactor: detectFormFactor(combinedText),
            speedGbps: speedInfo?.speedGbps,
            speed: speedInfo?.speed,
            reachLabel,
            fiberType: detectFiberType(combinedText),
          });
        }

        pagesScraped++;
      }
    },
  }, makeCrawleeConfig("atgbics"));

  const startUrls = CATEGORY_URLS.map((path) => `${BASE_URL}${path}`);
  await crawler.run(startUrls);

  console.log(`\nPages scraped: ${pagesScraped}`);
  console.log(`Products found: ${products.length}`);

  // Deduplicate by part number (falling back to the name). The first entry seen
  // wins, except that an entry stored as "GBP" is replaced by a later one whose
  // currency is something other than "GBP".
  const uniqueProducts = new Map<string, AtgbicsProduct>();
  for (const p of products) {
    const key = p.partNumber || p.name;
    const existing = uniqueProducts.get(key);
    if (!existing || (existing.currency === "GBP" && p.currency !== "GBP")) {
      uniqueProducts.set(key, p);
    }
  }

  // Write to database
  let written = 0;
  let skipped = 0;

  for (const p of uniqueProducts.values()) {
    try {
      const transceiverId = await findOrCreateScrapedTransceiver({
        partNumber: p.partNumber,
        vendorId,
        formFactor: p.formFactor,
        speedGbps: p.speedGbps,
        speed: p.speed,
        reachLabel: p.reachLabel,
        fiberType: p.fiberType,
        category: "DataCenter",
      });

      const hash = contentHash({ price: p.price, stock: p.stockLevel, qty: p.quantity });

      const isNew = await upsertPriceObservation({
        transceiverId,
        sourceVendorId: vendorId,
        price: p.price,
        currency: p.currency,
        stockLevel: p.stockLevel,
        quantityAvailable: p.quantity,
        url: p.url,
        contentHash: hash,
      });

      if (isNew) written++;
      else skipped++;
    } catch (err) {
      // Best effort per product: log and continue with the next one.
      const message = err instanceof Error ? err.message : String(err);
      console.error(` Error: ${p.partNumber}:`, message);
    }
  }

  console.log(`\nDatabase: ${written} new, ${skipped} unchanged (${uniqueProducts.size} unique)`);
  console.log("=== ATGBICS Scraper Complete ===\n");
}

// CLI entry point: run the scraper, then close the DB pool. On failure, log,
// start closing the pool and exit with a non-zero status.
if (require.main === module) {
  scrapeAtgbics()
    .then(() => pool.end())
    .catch((err) => {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    });
}