diff --git a/packages/scraper/src/scrapers/atgbics.ts b/packages/scraper/src/scrapers/atgbics.ts index 32dcc2c..ded5bfe 100644 --- a/packages/scraper/src/scrapers/atgbics.ts +++ b/packages/scraper/src/scrapers/atgbics.ts @@ -2,438 +2,317 @@ * ATGBICS Scraper — Prices, Stock, Product Catalog * * ATGBICS is a UK-based independent compatible optics vendor. - * Site uses Shopify with client-side rendering, so we use PlaywrightCrawler. - * Prices are publicly visible in GBP. + * Site uses Shopify. Prices ARE present in static HTML on collection pages. * - * Categories scraped: - * /collections/sfp-transceivers/ - * /collections/sfp-plus-transceivers/ - * /collections/sfp28-transceivers/ - * /collections/qsfp-plus-transceivers/ - * /collections/qsfp28-transceivers/ - * /collections/qsfp-dd-transceivers/ + * Strategy: + * 1. Fetch each collection page (correct handles discovered 2026-04-18) + * 2. Parse product cards: name (aria-label), handle, price (£X.XX), image + * 3. Paginate via ?page=N until empty + * 4. Upsert to DB * - * Respects: robots.txt, rate limiting (2s between requests, max 50 pages) + * No Playwright required — static HTML contains all needed data. + * Rate limited: 1 req/1 sec. Runs from Mac or Erik (no IP issues with static pages). */ -import { PlaywrightCrawler, ProxyConfiguration } from "crawlee"; -import { makeCrawleeConfig } from "../utils/crawlee-config"; import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db"; -import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash"; +import { contentHash } from "../utils/hash"; const BASE_URL = "https://www.atgbics.com"; +const HEADERS = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-GB,en;q=0.9", + // Force GBP pricing regardless of visitor IP geolocation + Cookie: "cart_currency=GBP", +}; -// SOCKS5 proxy pool — rotate across CT130/131/132 to avoid IP blocks -const PROXY_URLS = (process.env["PROXY_URLS"] ?? "") - .split(",") - .map((u) => u.trim()) - .filter(Boolean); +const MAX_PAGES_PER_CAT = 50; -function buildProxyConfiguration(): ProxyConfiguration | undefined { - if (PROXY_URLS.length === 0) return undefined; - return new ProxyConfiguration({ proxyUrls: PROXY_URLS }); -} - -const CATEGORY_URLS = [ - "/collections/sfp-transceivers/", - "/collections/sfp-plus-transceivers/", - "/collections/sfp28-transceivers/", - "/collections/qsfp-plus-transceivers/", - "/collections/qsfp28-transceivers/", - "/collections/qsfp-dd-transceivers/", +// Correct collection handles discovered 2026-04-18 by fetching /collections/ +// Each collection has static-HTML-rendered prices (£X.XX in price__current span) +const CATEGORIES = [ + // === Core speeds by form factor === + { handle: "compatible-transceivers-sfp-1-25g", formFactor: "SFP", speed: "1G", speedGbps: 1 }, + { handle: "compatible-transceivers-sfp-100m", formFactor: "SFP", speed: "1G", speedGbps: 1 }, + { handle: "compatible-transceiver-sfp-bidi-1-25g", formFactor: "SFP", speed: "1G", speedGbps: 1 }, + { handle: "compatible-transceivers-sfpp-10g", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, + { handle: "compatible-transceivers-sfpp-bidi-10g", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, + { handle: "compatible-transceivers-sfpp-cwdm-10g", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, + { handle: "compatible-transceivers-sfp-dwdm-10g", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, + { handle: "compatible-transceiver-sfp-25g", formFactor: "SFP28", speed: "25G", speedGbps: 25 }, + { handle: "high-speed-sfp-transceivers", formFactor: "QSFP28", speed: "100G", speedGbps: 100 }, + { handle: "high-speed-sfp-transceivers-1", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }, + { handle: "compatible-tansceivers-qsfp-bidi-100gbps", formFactor: "QSFP28", speed: "100G", speedGbps: 100 }, ]; -const MAX_PAGES = 50; - interface AtgbicsProduct { partNumber: string; name: string; price: number; currency: string; stockLevel: string; - quantity?: number; url: string; - formFactor?: string; - speedGbps?: number; - speed?: string; + formFactor: string; + speed: string; + speedGbps: number; reachLabel?: string; + reachMeters?: number; fiberType?: string; + wavelength?: string; + imageUrl?: string; } -function detectFormFactor(text: string): string | undefined { +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +function detectFormFactor(text: string, fallback: string): string { const lower = text.toLowerCase(); if (lower.includes("qsfp-dd") || lower.includes("qsfp dd")) return "QSFP-DD"; + if (lower.includes("qsfp56")) return "QSFP56"; if (lower.includes("qsfp28")) return "QSFP28"; - if (lower.includes("qsfp+") || lower.includes("qsfp plus") || lower.includes("qsfp-plus")) return "QSFP+"; + if (lower.includes("qsfp+") || lower.includes("qsfp plus")) return "QSFP+"; if (lower.includes("sfp28")) return "SFP28"; - if (lower.includes("sfp+") || lower.includes("sfp plus") || lower.includes("sfp-plus")) return "SFP+"; + if (lower.includes("sfp+") || lower.includes("sfp-plus") || lower.includes("sfpplus")) return "SFP+"; if (lower.includes("sfp") && !lower.includes("qsfp")) return "SFP"; if (lower.includes("xfp")) return "XFP"; if (lower.includes("cfp2")) return "CFP2"; if (lower.includes("cfp")) return "CFP"; - return undefined; + return fallback; } -function detectSpeed(text: string): { speed: string; speedGbps: number } | undefined { +function detectSpeed(text: string, fallbackGbps: number): { speed: string; speedGbps: number } { const patterns: [RegExp, string, number][] = [ - [/400\s*g/i, "400G", 400], - [/100\s*g/i, "100G", 100], - [/40\s*g/i, "40G", 40], - [/25\s*g/i, "25G", 25], - [/10\s*g/i, "10G", 10], - [/1000\s*base/i, "1G", 1], + [/800\s*g/i, "800G", 800], [/400\s*g/i, "400G", 400], [/200\s*g/i, "200G", 200], + [/100\s*g/i, "100G", 100], [/40\s*g/i, "40G", 40], [/25\s*g/i, "25G", 25], + [/10\s*g/i, "10G", 10], [/1000\s*base/i, "1G", 1], [/1\.25\s*g/i, "1G", 1], [/1\s*g\b/i, "1G", 1], ]; for (const [re, speed, gbps] of patterns) { if (re.test(text)) return { speed, speedGbps: gbps }; } + return { speed: fallbackGbps + "G", speedGbps: fallbackGbps }; +} + +function detectReach(text: string): { label: string; meters: number } | undefined { + const patterns: [RegExp, string, number][] = [ + [/\b120\s*km\b/i, "120km", 120000], [/\b80\s*km\b/i, "80km", 80000], + [/\b40\s*km\b/i, "40km", 40000], [/\b20\s*km\b/i, "20km", 20000], + [/\b15\s*km\b/i, "15km", 15000], [/\b10\s*km\b/i, "10km", 10000], + [/\b2\s*km\b/i, "2km", 2000], + [/\b550\s*m\b/i, "550m", 550], [/\b500\s*m\b/i, "500m", 500], + [/\b300\s*m\b/i, "300m", 300], [/\b150\s*m\b/i, "150m", 150], + [/\b100\s*m\b/i, "100m", 100], [/\b70\s*m\b/i, "70m", 70], + [/\bLR4?\b/, "10km", 10000], [/\bER4?\b/, "40km", 40000], + [/\bZR4?\b/, "80km", 80000], [/\bSR4?\b/, "300m", 300], + [/\bDR4?\b/, "500m", 500], [/\bFR4?\b/, "2km", 2000], + ]; + for (const [re, label, meters] of patterns) { + if (re.test(text)) return { label, meters }; + } return undefined; } -function detectReach(text: string): string | undefined { - const match = text.match(/(\d+)\s*(m|km)\b/i); - if (match) return `${match[1]}${match[2].toLowerCase()}`; - return undefined; +function detectFiber(text: string): string { + if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text)) return "SMF"; + if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(text)) return "MMF"; + if (/copper|dac|twinax|rj.?45|base-t/i.test(text)) return "Copper"; + if (/aoc|active.?optical/i.test(text)) return "MMF"; + return "SMF"; +} + +function detectWavelength(text: string): string { + const m = text.match(/(\d{3,4})\s*nm/i); + return m ? m[1] : ""; } /** - * Extract the real OEM part number from an ATGBICS URL slug. - * - * ATGBICS slug format: {oem-part-number}-{vendor}-r-compatible-transceiver-{specs} - * Examples: - * 3he16564aa-nokia-r-compatible-transceiver-qsfp-dd-... → 3HE16564AA - * jnp-sfp-25g-lr-juniper-r-compatible-... → JNP-SFP-25G-LR - * sfp-10g-sr-cisco-compatible-... → SFP-10G-SR - * - * Returns the slug uppercased if extraction fails (better than full slug). + * Extract OEM part number from the ATGBICS product name. + * Name format: "{OEM_PN} {Vendor}® Compatible Transceiver {Specs}" + * e.g. "SFP-10G-SR Cisco® Compatible Transceiver SFP+ 10GBase-SR ..." */ -function extractOemPartNumber(slug: string): string { - let pn = slug; - - // Remove "-r-compatible-transceiver-..." and everything after - pn = pn.replace(/-r-compatible(?:-transceiver.*)?$/i, ""); - // Remove "-compatible-transceiver-..." (no "r-") - pn = pn.replace(/-compatible-transceiver.*$/i, ""); - // Remove "-compatible-..." (short form) - pn = pn.replace(/-compatible.*$/i, ""); - - // Remove trailing known OEM vendor names that ATGBICS appends before "-r-compatible" - const oemVendors = [ - "nokia", "cisco", "juniper", "arista", "huawei", "hp", "hpe", "dell", - "extreme", "brocade", "avaya", "netgear", "mikrotik", "ubiquiti", "mellanox", - "intel", "broadcom", "allied", "planet", "zyxel", "dlink", "d-link", - "foundry", "force10", "enterasys", "optical", "palo", "fortinet", "hitachi", - "calix", "calix", "ciena", "adtran", "ribbon", "sycamore", "rad", "zhone", - "infinera", "fujitsu", "nec", "ericsson", "alcatel", "lucent", - ]; - for (const v of oemVendors) { - pn = pn.replace(new RegExp(`-${v}$`, "i"), ""); +function extractPartNumber(name: string): string { + // First token before first space-separated vendor name or ® symbol + const pnMatch = name.match(/^([A-Z0-9][A-Z0-9._\-/+]+)/i); + if (pnMatch && pnMatch[1].length >= 3 && pnMatch[1].length <= 60) { + return pnMatch[1].toUpperCase(); } - - // Final cleanup: normalize to uppercase (OEM part numbers are uppercase) - const result = pn.toUpperCase().trim(); - - // Safety: if result is empty, longer than 40 chars, or still has "TRANSCEIVER", return slug as-is - if (!result || result.length > 40 || result.includes("TRANSCEIVER")) { - return slug.toUpperCase().slice(0, 40); - } - - return result; + return name.split(/\s+/)[0]?.toUpperCase()?.slice(0, 60) || name.slice(0, 60); } -function detectFiberType(text: string): string | undefined { - const lower = text.toLowerCase(); - if (lower.includes("single mode") || lower.includes("single-mode") || lower.includes("smf") || lower.includes("-lr") || lower.includes("-er") || lower.includes("-zr")) return "SMF"; - if (lower.includes("multi mode") || lower.includes("multi-mode") || lower.includes("mmf") || lower.includes("-sr") || lower.includes("-sx")) return "MMF"; - if (lower.includes("dac") || lower.includes("direct attach") || lower.includes("copper") || lower.includes("-t ") || lower.includes("twinax")) return "DAC"; - return undefined; +/** Parse a collection page HTML — returns array of products */ +function parseCategoryPage(html: string, cat: typeof CATEGORIES[number]): AtgbicsProduct[] { + const products: AtgbicsProduct[] = []; + const seen = new Set(); + + // Split by product cards — class="card card--product + const cardParts = html.split(/class="card card--product/); + + for (const card of cardParts.slice(1)) { + // Name from aria-label (full descriptive name) + const nameM = card.match(/aria-label="([^"]{8,})"/); + if (!nameM) continue; + const name = nameM[1].replace(/®/g, "").replace(/\s+/g, " ").trim(); + + // Product handle from href + const hrefM = card.match(/href="\/(?:collections\/[^"]+\/)?products\/([^"?#]+)"/); + if (!hrefM) continue; + const handle = hrefM[1]; + if (seen.has(handle)) continue; + seen.add(handle); + + // Price — £X.XX in price__current (may have newline before £) + const priceM = card.match(/price__current"[^>]*>\s*£([\d,]+(?:\.\d{0,2})?)/s); + const price = priceM ? parseFloat(priceM[1].replace(",", "")) : 0; + if (!price || price <= 0 || price > 100000) continue; + + // Image from data-srcset (first src) + const imgM = card.match(/data-srcset="\/\/(atgbics\.com\/cdn\/shop\/files\/[^"\s]+)/); + const imageUrl = imgM ? `https://${imgM[1].split(" ")[0]}` : undefined; + + const fullText = `${name} ${handle}`; + const speedInfo = detectSpeed(fullText, cat.speedGbps); + const ff = detectFormFactor(fullText, cat.formFactor); + const reach = detectReach(fullText); + const partNumber = extractPartNumber(name); + + products.push({ + partNumber, + name, + price, + currency: "GBP", + stockLevel: "in_stock", // ATGBICS only lists available items + url: `${BASE_URL}/products/${handle}`, + formFactor: ff, + speed: speedInfo.speed, + speedGbps: speedInfo.speedGbps, + reachLabel: reach?.label, + reachMeters: reach?.meters, + fiberType: detectFiber(fullText), + wavelength: detectWavelength(fullText), + imageUrl: imageUrl?.includes("no-image") ? undefined : imageUrl, + }); + } + + return products; +} + +async function fetchPage(url: string): Promise { + const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(20000) }); + if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`); + return resp.text(); +} + +/** Check if a page has pagination links pointing to the next page */ +function hasNextPage(html: string, currentPage: number): boolean { + const nextPage = currentPage + 1; + return html.includes(`page=${nextPage}`) || html.includes(`page%3D${nextPage}`); } export async function scrapeAtgbics(): Promise { - console.log("=== ATGBICS Scraper Starting ===\n"); + console.log("=== ATGBICS Scraper Starting (static HTML, correct collection handles) ===\n"); const vendorId = await ensureVendor( "ATGBICS", "compatible", "https://www.atgbics.com", - "https://www.atgbics.com/collections/sfp-plus-transceivers/" + "https://www.atgbics.com/collections/compatible-transceivers-sfpp-10g", ); - console.log(`Vendor ID: ${vendorId}`); - const products: AtgbicsProduct[] = []; - let pagesScraped = 0; + let totalProducts = 0; + let priceUpdates = 0; + let imageUpdates = 0; + const seenHandles = new Set(); - const proxyConfiguration = buildProxyConfiguration(); - const crawler = new PlaywrightCrawler({ - maxConcurrency: 1, - maxRequestsPerMinute: 20, // ~2s between requests at concurrency 1 - maxRequestsPerCrawl: MAX_PAGES, - requestHandlerTimeoutSecs: 60, - headless: true, - useSessionPool: false, // Disable session pool to avoid SDK_SESSION_POOL_STATE.json crash - ...(proxyConfiguration ? { proxyConfiguration } : {}), - launchContext: { - launchOptions: { - args: ["--disable-blink-features=AutomationControlled", "--no-sandbox"], - }, - }, + for (const cat of CATEGORIES) { + console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.handle}] ---`); + let catTotal = 0; - async requestHandler({ page, request, enqueueLinks, log }) { - const url = request.url; - log.info(`Scraping: ${url}`); + for (let page = 1; page <= MAX_PAGES_PER_CAT; page++) { + const pageUrl = page === 1 + ? `${BASE_URL}/collections/${cat.handle}?sort_by=price-ascending¤cy=GBP` + : `${BASE_URL}/collections/${cat.handle}?sort_by=price-ascending¤cy=GBP&page=${page}`; - // Wait for Shopify product grid to render - await page.waitForTimeout(2000); + try { + const html = await fetchPage(pageUrl); + const pageProducts = parseCategoryPage(html, cat); - // Check if this is a collection (listing) page or a product page - const isCollection = url.includes("/collections/"); + if (pageProducts.length === 0) { + console.log(` Page ${page}: 0 products — stopping`); + break; + } + console.log(` Page ${page}: ${pageProducts.length} products`); - if (isCollection) { - // Extract product links from listing page and enqueue them - const productData = await page.evaluate(() => { - const results: Array<{ - name: string; - href: string; - price: string; - stock: string; - partNumber: string; - }> = []; + for (const product of pageProducts) { + // Skip cross-category duplicates (same product may appear in multiple collections) + const dedupKey = `${product.url}`; + if (seenHandles.has(dedupKey)) continue; + seenHandles.add(dedupKey); - // Shopify collection page — product cards - const cards = document.querySelectorAll( - ".product-item, .grid-product, [class*=\"product-card\"], [class*=\"product-grid\"] li, .collection-grid__item" - ); + try { + const txId = await findOrCreateScrapedTransceiver({ + partNumber: product.partNumber, + vendorId, + formFactor: product.formFactor, + speedGbps: product.speedGbps, + speed: product.speed, + reachMeters: product.reachMeters, + reachLabel: product.reachLabel, + fiberType: product.fiberType, + wavelengths: product.wavelength, + category: "Compatible", + }); - for (const card of cards) { - const linkEl = card.querySelector("a[href*=\"/products/\"]") as HTMLAnchorElement | null; - const nameEl = card.querySelector( - ".product-item__title, .grid-product__title, [class*=\"product-title\"], [class*=\"product-name\"], h2, h3" - ); - const priceEl = card.querySelector( - ".product-item__price, .grid-product__price, [class*=\"price\"]:not([class*=\"compare\"]):not([class*=\"was\"])" - ); - const stockEl = card.querySelector( - "[class*=\"stock\"], [class*=\"availability\"], [class*=\"badge\"]" - ); + const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber })); + const updated = await upsertPriceObservation({ + transceiverId: txId, + sourceVendorId: vendorId, + price: product.price, + currency: product.currency, + stockLevel: product.stockLevel, + url: product.url, + contentHash: hash, + }); + if (updated) priceUpdates++; - const href = linkEl?.getAttribute("href") || ""; - const name = nameEl?.textContent?.trim() || linkEl?.textContent?.trim() || ""; - const price = priceEl?.textContent?.trim() || ""; - const stock = stockEl?.textContent?.trim() || ""; - - // Derive part number from URL slug: /products/sfp-10g-lr → sfp-10g-lr - // Then extract real OEM part number (strips "-r-compatible-transceiver-*") - const slug = href.split("/products/")[1]?.split("?")[0]?.replace(/\/$/, "") || ""; - - if (href && name && name.length > 3) { - results.push({ name, href, price, stock, partNumber: slug }); // OEM extraction done below after page parse + if (product.imageUrl) { + const res = await pool.query( + `UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = true + WHERE id = $2 AND (image_url IS NULL OR image_url = '') + RETURNING id`, + [product.imageUrl, txId], + ); + if (res.rowCount && res.rowCount > 0) imageUpdates++; } - } - // Fallback: grab any /products/ links with adjacent price text - if (results.length === 0) { - const allProductLinks = document.querySelectorAll("a[href*=\"/products/\"]"); - const seen = new Set(); - for (const el of allProductLinks) { - const a = el as HTMLAnchorElement; - const href = a.getAttribute("href") || ""; - if (seen.has(href)) continue; - seen.add(href); - - const name = a.textContent?.trim() || ""; - if (!name || name.length < 3) continue; - - const container = a.closest("li") || a.closest("article") || a.parentElement?.parentElement; - const priceEl = container?.querySelector("[class*=\"price\"]"); - const price = priceEl?.textContent?.trim() || ""; - const slug = href.split("/products/")[1]?.split("?")[0]?.replace(/\/$/, "") || ""; - - results.push({ name, href, price, stock: "", partNumber: slug }); - } - } - - return results; - }); - - log.info(` Found ${productData.length} products on collection page`); - - for (const item of productData) { - if (!item.href) continue; - - const fullUrl = item.href.startsWith("http") ? item.href : `${BASE_URL}${item.href}`; - - // If we already have price data from the listing, store it directly - if (item.price) { - const { price, currency } = parsePrice(item.price); - const speedInfo = detectSpeed(item.name); - // Extract real OEM part number from slug (strips -r-compatible-transceiver-*) - const realPartNumber = extractOemPartNumber(item.partNumber); - // Extract reach from name OR slug (slug often has "120km" even when name doesn't) - const reachLabel = detectReach(item.name) || detectReach(item.partNumber) || undefined; - if (price > 0) { - products.push({ - partNumber: realPartNumber || item.name.slice(0, 80), - name: item.name, - price, - currency: currency === "USD" ? "GBP" : currency, // ATGBICS is GBP — parsePrice may default to USD if no symbol on listing - stockLevel: item.stock ? parseStockLevel(item.stock) : "in_stock", - quantity: item.stock ? parseQuantity(item.stock) : undefined, - url: fullUrl, - formFactor: detectFormFactor(item.name), - speedGbps: speedInfo?.speedGbps, - speed: speedInfo?.speed, - reachLabel, - fiberType: detectFiberType(item.name), - }); - } + totalProducts++; + catTotal++; + } catch (err) { + console.warn(` DB error ${product.partNumber}: ${(err as Error).message.slice(0, 80)}`); } } - // Enqueue next page if pagination exists - await enqueueLinks({ - selector: "a[href*=\"?page=\"], a.pagination__next, a[rel=\"next\"], .pagination a[href]", - transformRequestFunction: (req) => { - if (pagesScraped >= MAX_PAGES) return false; - return req; - }, - }); - - pagesScraped++; - } else { - // Product detail page — extract precise data - const data = await page.evaluate(() => { - const title = document.querySelector( - "h1.product__title, h1.product-title, h1.product_title, h1" - )?.textContent?.trim() || ""; - - // Shopify price — prefer sale price if available - const salePriceEl = document.querySelector( - ".price__sale .price-item--sale, .product__price .money, [class*=\"price\"] .money, [data-product-price], .price ins" - ); - const priceText = salePriceEl?.textContent?.trim() || ""; - - // Stock / availability - const stockEl = document.querySelector( - ".product__availability, .availability, [class*=\"stock\"], [class*=\"inventory\"], .badge--sold-out, .badge--in-stock" - ); - const stockText = stockEl?.textContent?.trim() || ""; - - // Quantity badge (some Shopify themes show "X in stock") - const qtyEl = document.querySelector("[class*=\"quantity\"], [class*=\"inventory-count\"]"); - const qtyText = qtyEl?.textContent?.trim() || ""; - - // Short description / variant title for reach/fiber info - const descEl = document.querySelector( - ".product__description, .product-description, .rte p:first-child, .product__short-description" - ); - const description = descEl?.textContent?.trim() || ""; - - // SKU / part number (Shopify often exposes this) - const skuEl = document.querySelector(".product__sku, [class*=\"sku\"], [itemprop=\"sku\"]"); - const sku = skuEl?.textContent?.replace(/SKU[:\s]*/i, "").trim() || ""; - - return { title, priceText, stockText, qtyText, description, sku }; - }); - - const slug = url.split("/products/")[1]?.split("?")[0]?.replace(/\/$/, "") || ""; - // Prefer Shopify SKU if available, otherwise extract real OEM PN from slug - const partNumber = data.sku && data.sku.length > 2 && data.sku.length < 40 - ? data.sku.toUpperCase() - : extractOemPartNumber(slug); - const name = data.title || slug; - - const combinedText = `${name} ${data.description}`; - const { price, currency } = parsePrice(data.priceText); - - if (price > 0) { - const speedInfo = detectSpeed(combinedText); - // Reach from title/description first, then fall back to slug (slug often has "120km") - const reachLabel = detectReach(combinedText) || detectReach(slug) || undefined; - products.push({ - partNumber, - name, - price, - currency: currency === "USD" ? "GBP" : currency, // ATGBICS prices in GBP - stockLevel: data.stockText ? parseStockLevel(data.stockText) : "in_stock", - quantity: data.qtyText ? parseQuantity(data.qtyText) : undefined, - url, - formFactor: detectFormFactor(combinedText), - speedGbps: speedInfo?.speedGbps, - speed: speedInfo?.speed, - reachLabel, - fiberType: detectFiberType(combinedText), - }); + // Check pagination + if (!hasNextPage(html, page)) { + console.log(` No page ${page + 1} — collection done`); + break; } - - pagesScraped++; + } catch (err) { + console.warn(` Page ${page} error: ${(err as Error).message.slice(0, 80)}`); + break; } - }, - }, makeCrawleeConfig("atgbics")); - const startUrls = CATEGORY_URLS.map((path) => `${BASE_URL}${path}`); - await crawler.run(startUrls); - - console.log(`\nPages scraped: ${pagesScraped}`); - console.log(`Products found: ${products.length}`); - - // Deduplicate by partNumber — prefer product detail page data (more precise) - const uniqueProducts = new Map(); - for (const p of products) { - const key = p.partNumber || p.name; - const existing = uniqueProducts.get(key); - // Keep the entry with a non-GBP-forced currency (i.e., product detail page which has £ symbol) - if (!existing || existing.currency === "GBP" && p.currency !== "GBP") { - uniqueProducts.set(key, p); - } else if (!existing) { - uniqueProducts.set(key, p); + await sleep(1000); } + + console.log(` Category total: ${catTotal} products`); + await sleep(1000); } - // Write to database - let written = 0; - let skipped = 0; - - for (const p of uniqueProducts.values()) { - try { - const transceiverId = await findOrCreateScrapedTransceiver({ - partNumber: p.partNumber, - vendorId, - formFactor: p.formFactor, - speedGbps: p.speedGbps, - speed: p.speed, - reachLabel: p.reachLabel, - fiberType: p.fiberType, - category: "DataCenter", - }); - - const hash = contentHash({ price: p.price, stock: p.stockLevel, qty: p.quantity }); - const isNew = await upsertPriceObservation({ - transceiverId, - sourceVendorId: vendorId, - price: p.price, - currency: p.currency, - stockLevel: p.stockLevel, - quantityAvailable: p.quantity, - url: p.url, - contentHash: hash, - }); - - if (isNew) written++; - else skipped++; - } catch (err) { - console.error(` Error: ${p.partNumber}:`, (err as Error).message); - } - } - - console.log(`\nDatabase: ${written} new, ${skipped} unchanged (${uniqueProducts.size} unique)`); - console.log("=== ATGBICS Scraper Complete ===\n"); + console.log(`\n=== ATGBICS Complete: ${totalProducts} products, ${priceUpdates} price updates, ${imageUpdates} images ===`); } if (require.main === module) { scrapeAtgbics() .then(() => pool.end()) - .catch((err) => { - console.error("Fatal:", err); - pool.end(); - process.exit(1); - }); + .catch((err) => { console.error("Fatal:", err); pool.end(); process.exit(1); }); } diff --git a/scripts/run-atgbics-mac.sh b/scripts/run-atgbics-mac.sh new file mode 100755 index 0000000..3ca4324 --- /dev/null +++ b/scripts/run-atgbics-mac.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +# run-atgbics-mac.sh — Run ATGBICS scraper from Mac (bypasses Erik datacenter IP block) +# +# Uses Shopify JSON API (no Playwright needed). +# Connects to Erik's PostgreSQL via SSH tunnel. +# +# Usage: ./scripts/run-atgbics-mac.sh +# Requirements: SSH access to Erik (root@82.165.222.127) + +set -euo pipefail + +REPO_DIR="$(cd "$(dirname "$0")/.." && pwd)" +TUNNEL_PORT=5434 +TUNNEL_PID="" +LOG="/tmp/atgbics-mac-run.log" + +cleanup() { + if [[ -n "$TUNNEL_PID" ]]; then + echo "Closing SSH tunnel (PID $TUNNEL_PID)..." + kill "$TUNNEL_PID" 2>/dev/null || true + fi +} +trap cleanup EXIT + +echo "=== ATGBICS Mac Runner ===" +echo "Repo: $REPO_DIR" + +# Open SSH tunnel to Erik's PostgreSQL +echo "Opening SSH tunnel to Erik DB (port 5433 → local 5434)..." +ssh -fNL "${TUNNEL_PORT}:127.0.0.1:5433" root@82.165.222.127 +TUNNEL_PID=$(lsof -ti "TCP:${TUNNEL_PORT}" -sTCP:LISTEN 2>/dev/null | head -1) +echo "Tunnel open (listener PID: ${TUNNEL_PID:-unknown})" + +# Wait briefly for tunnel to stabilize +sleep 1 + +# Build scraper if needed +if [[ ! -f "$REPO_DIR/packages/scraper/dist/scrapers/atgbics.js" ]]; then + echo "Building scraper package..." + cd "$REPO_DIR/packages/scraper" && npm run build +fi + +# Run scraper +echo "Running ATGBICS scraper..." +cd "$REPO_DIR" +POSTGRES_HOST=127.0.0.1 \ +POSTGRES_PORT="${TUNNEL_PORT}" \ +POSTGRES_USER=tip \ +POSTGRES_PASSWORD=tip_prod_2026 \ +POSTGRES_DB=transceiver_db \ +node packages/scraper/dist/scrapers/atgbics.js 2>&1 | tee "$LOG" + +echo "" +echo "Log saved to: $LOG" +echo "=== Done ==="