feat: rewrite ATGBICS scraper — static HTML, correct collection handles, GBP cookie

- Replaces Playwright with pure fetch() — static HTML has prices
- Correct collection handles (compatible-transceivers-sfpp-10g etc.)
- Cookie: cart_currency=GBP forces GBP pricing from any geo-IP
- Handles 35+ pages per category × 24 products = 840+ SFP+ products
- No IP-blocking with static HTML (Playwright was the trigger)
- Adds scripts/run-atgbics-mac.sh for Mac-side runner if needed
This commit is contained in:
Rene Fichtmueller 2026-04-18 22:48:29 +02:00
parent 785a6731ab
commit cb5a587d7e
2 changed files with 287 additions and 353 deletions

View File

@ -2,438 +2,317 @@
* ATGBICS Scraper Prices, Stock, Product Catalog * ATGBICS Scraper Prices, Stock, Product Catalog
* *
* ATGBICS is a UK-based independent compatible optics vendor. * ATGBICS is a UK-based independent compatible optics vendor.
* Site uses Shopify with client-side rendering, so we use PlaywrightCrawler. * Site uses Shopify. Prices ARE present in static HTML on collection pages.
* Prices are publicly visible in GBP.
* *
* Categories scraped: * Strategy:
* /collections/sfp-transceivers/ * 1. Fetch each collection page (correct handles discovered 2026-04-18)
* /collections/sfp-plus-transceivers/ * 2. Parse product cards: name (aria-label), handle, price (£X.XX), image
* /collections/sfp28-transceivers/ * 3. Paginate via ?page=N until empty
* /collections/qsfp-plus-transceivers/ * 4. Upsert to DB
* /collections/qsfp28-transceivers/
* /collections/qsfp-dd-transceivers/
* *
* Respects: robots.txt, rate limiting (2s between requests, max 50 pages) * No Playwright required static HTML contains all needed data.
* Rate limited: 1 req/1 sec. Runs from Mac or Erik (no IP issues with static pages).
*/ */
import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";
import { makeCrawleeConfig } from "../utils/crawlee-config";
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db"; import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash"; import { contentHash } from "../utils/hash";
const BASE_URL = "https://www.atgbics.com"; const BASE_URL = "https://www.atgbics.com";
// Request headers passed to fetch() by fetchPage for every page download.
// The User-Agent, Accept and Accept-Language values are browser-style strings
// (desktop Chrome on macOS, en-GB).
const HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-GB,en;q=0.9",
// Force GBP pricing regardless of visitor IP geolocation
Cookie: "cart_currency=GBP",
};
// SOCKS5 proxy pool — rotate across CT130/131/132 to avoid IP blocks const MAX_PAGES_PER_CAT = 50;
const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
.split(",")
.map((u) => u.trim())
.filter(Boolean);
function buildProxyConfiguration(): ProxyConfiguration | undefined { // Correct collection handles discovered 2026-04-18 by fetching /collections/
if (PROXY_URLS.length === 0) return undefined; // Each collection has static-HTML-rendered prices (£X.XX in price__current span)
return new ProxyConfiguration({ proxyUrls: PROXY_URLS }); const CATEGORIES = [
} // === Core speeds by form factor ===
{ handle: "compatible-transceivers-sfp-1-25g", formFactor: "SFP", speed: "1G", speedGbps: 1 },
const CATEGORY_URLS = [ { handle: "compatible-transceivers-sfp-100m", formFactor: "SFP", speed: "1G", speedGbps: 1 },
"/collections/sfp-transceivers/", { handle: "compatible-transceiver-sfp-bidi-1-25g", formFactor: "SFP", speed: "1G", speedGbps: 1 },
"/collections/sfp-plus-transceivers/", { handle: "compatible-transceivers-sfpp-10g", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
"/collections/sfp28-transceivers/", { handle: "compatible-transceivers-sfpp-bidi-10g", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
"/collections/qsfp-plus-transceivers/", { handle: "compatible-transceivers-sfpp-cwdm-10g", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
"/collections/qsfp28-transceivers/", { handle: "compatible-transceivers-sfp-dwdm-10g", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
"/collections/qsfp-dd-transceivers/", { handle: "compatible-transceiver-sfp-25g", formFactor: "SFP28", speed: "25G", speedGbps: 25 },
{ handle: "high-speed-sfp-transceivers", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
{ handle: "high-speed-sfp-transceivers-1", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
{ handle: "compatible-tansceivers-qsfp-bidi-100gbps", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
]; ];
const MAX_PAGES = 50;
interface AtgbicsProduct { interface AtgbicsProduct {
partNumber: string; partNumber: string;
name: string; name: string;
price: number; price: number;
currency: string; currency: string;
stockLevel: string; stockLevel: string;
quantity?: number;
url: string; url: string;
formFactor?: string; formFactor: string;
speedGbps?: number; speed: string;
speed?: string; speedGbps: number;
reachLabel?: string; reachLabel?: string;
reachMeters?: number;
fiberType?: string; fiberType?: string;
wavelength?: string;
imageUrl?: string;
} }
function detectFormFactor(text: string): string | undefined { function sleep(ms: number): Promise<void> {
return new Promise((resolve) => setTimeout(resolve, ms));
}
function detectFormFactor(text: string, fallback: string): string {
const lower = text.toLowerCase(); const lower = text.toLowerCase();
if (lower.includes("qsfp-dd") || lower.includes("qsfp dd")) return "QSFP-DD"; if (lower.includes("qsfp-dd") || lower.includes("qsfp dd")) return "QSFP-DD";
if (lower.includes("qsfp56")) return "QSFP56";
if (lower.includes("qsfp28")) return "QSFP28"; if (lower.includes("qsfp28")) return "QSFP28";
if (lower.includes("qsfp+") || lower.includes("qsfp plus") || lower.includes("qsfp-plus")) return "QSFP+"; if (lower.includes("qsfp+") || lower.includes("qsfp plus")) return "QSFP+";
if (lower.includes("sfp28")) return "SFP28"; if (lower.includes("sfp28")) return "SFP28";
if (lower.includes("sfp+") || lower.includes("sfp plus") || lower.includes("sfp-plus")) return "SFP+"; if (lower.includes("sfp+") || lower.includes("sfp-plus") || lower.includes("sfpplus")) return "SFP+";
if (lower.includes("sfp") && !lower.includes("qsfp")) return "SFP"; if (lower.includes("sfp") && !lower.includes("qsfp")) return "SFP";
if (lower.includes("xfp")) return "XFP"; if (lower.includes("xfp")) return "XFP";
if (lower.includes("cfp2")) return "CFP2"; if (lower.includes("cfp2")) return "CFP2";
if (lower.includes("cfp")) return "CFP"; if (lower.includes("cfp")) return "CFP";
return undefined; return fallback;
} }
function detectSpeed(text: string): { speed: string; speedGbps: number } | undefined { function detectSpeed(text: string, fallbackGbps: number): { speed: string; speedGbps: number } {
const patterns: [RegExp, string, number][] = [ const patterns: [RegExp, string, number][] = [
[/400\s*g/i, "400G", 400], [/800\s*g/i, "800G", 800], [/400\s*g/i, "400G", 400], [/200\s*g/i, "200G", 200],
[/100\s*g/i, "100G", 100], [/100\s*g/i, "100G", 100], [/40\s*g/i, "40G", 40], [/25\s*g/i, "25G", 25],
[/40\s*g/i, "40G", 40], [/10\s*g/i, "10G", 10], [/1000\s*base/i, "1G", 1], [/1\.25\s*g/i, "1G", 1],
[/25\s*g/i, "25G", 25],
[/10\s*g/i, "10G", 10],
[/1000\s*base/i, "1G", 1],
[/1\s*g\b/i, "1G", 1], [/1\s*g\b/i, "1G", 1],
]; ];
for (const [re, speed, gbps] of patterns) { for (const [re, speed, gbps] of patterns) {
if (re.test(text)) return { speed, speedGbps: gbps }; if (re.test(text)) return { speed, speedGbps: gbps };
} }
return { speed: fallbackGbps + "G", speedGbps: fallbackGbps };
}
function detectReach(text: string): { label: string; meters: number } | undefined {
const patterns: [RegExp, string, number][] = [
[/\b120\s*km\b/i, "120km", 120000], [/\b80\s*km\b/i, "80km", 80000],
[/\b40\s*km\b/i, "40km", 40000], [/\b20\s*km\b/i, "20km", 20000],
[/\b15\s*km\b/i, "15km", 15000], [/\b10\s*km\b/i, "10km", 10000],
[/\b2\s*km\b/i, "2km", 2000],
[/\b550\s*m\b/i, "550m", 550], [/\b500\s*m\b/i, "500m", 500],
[/\b300\s*m\b/i, "300m", 300], [/\b150\s*m\b/i, "150m", 150],
[/\b100\s*m\b/i, "100m", 100], [/\b70\s*m\b/i, "70m", 70],
[/\bLR4?\b/, "10km", 10000], [/\bER4?\b/, "40km", 40000],
[/\bZR4?\b/, "80km", 80000], [/\bSR4?\b/, "300m", 300],
[/\bDR4?\b/, "500m", 500], [/\bFR4?\b/, "2km", 2000],
];
for (const [re, label, meters] of patterns) {
if (re.test(text)) return { label, meters };
}
return undefined; return undefined;
} }
function detectReach(text: string): string | undefined { function detectFiber(text: string): string {
const match = text.match(/(\d+)\s*(m|km)\b/i); if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text)) return "SMF";
if (match) return `${match[1]}${match[2].toLowerCase()}`; if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(text)) return "MMF";
return undefined; if (/copper|dac|twinax|rj.?45|base-t/i.test(text)) return "Copper";
if (/aoc|active.?optical/i.test(text)) return "MMF";
return "SMF";
}
function detectWavelength(text: string): string {
const m = text.match(/(\d{3,4})\s*nm/i);
return m ? m[1] : "";
} }
/** /**
* Extract the real OEM part number from an ATGBICS URL slug. * Extract OEM part number from the ATGBICS product name.
* * Name format: "{OEM_PN} {Vendor}® Compatible Transceiver {Specs}"
* ATGBICS slug format: {oem-part-number}-{vendor}-r-compatible-transceiver-{specs} * e.g. "SFP-10G-SR Cisco® Compatible Transceiver SFP+ 10GBase-SR ..."
* Examples:
* 3he16564aa-nokia-r-compatible-transceiver-qsfp-dd-... 3HE16564AA
* jnp-sfp-25g-lr-juniper-r-compatible-... JNP-SFP-25G-LR
* sfp-10g-sr-cisco-compatible-... SFP-10G-SR
*
* Returns the slug uppercased if extraction fails (better than full slug).
*/ */
function extractOemPartNumber(slug: string): string { function extractPartNumber(name: string): string {
let pn = slug; // First token before first space-separated vendor name or ® symbol
const pnMatch = name.match(/^([A-Z0-9][A-Z0-9._\-/+]+)/i);
// Remove "-r-compatible-transceiver-..." and everything after if (pnMatch && pnMatch[1].length >= 3 && pnMatch[1].length <= 60) {
pn = pn.replace(/-r-compatible(?:-transceiver.*)?$/i, ""); return pnMatch[1].toUpperCase();
// Remove "-compatible-transceiver-..." (no "r-")
pn = pn.replace(/-compatible-transceiver.*$/i, "");
// Remove "-compatible-..." (short form)
pn = pn.replace(/-compatible.*$/i, "");
// Remove trailing known OEM vendor names that ATGBICS appends before "-r-compatible"
const oemVendors = [
"nokia", "cisco", "juniper", "arista", "huawei", "hp", "hpe", "dell",
"extreme", "brocade", "avaya", "netgear", "mikrotik", "ubiquiti", "mellanox",
"intel", "broadcom", "allied", "planet", "zyxel", "dlink", "d-link",
"foundry", "force10", "enterasys", "optical", "palo", "fortinet", "hitachi",
"calix", "calix", "ciena", "adtran", "ribbon", "sycamore", "rad", "zhone",
"infinera", "fujitsu", "nec", "ericsson", "alcatel", "lucent",
];
for (const v of oemVendors) {
pn = pn.replace(new RegExp(`-${v}$`, "i"), "");
} }
return name.split(/\s+/)[0]?.toUpperCase()?.slice(0, 60) || name.slice(0, 60);
// Final cleanup: normalize to uppercase (OEM part numbers are uppercase)
const result = pn.toUpperCase().trim();
// Safety: if result is empty, longer than 40 chars, or still has "TRANSCEIVER", return slug as-is
if (!result || result.length > 40 || result.includes("TRANSCEIVER")) {
return slug.toUpperCase().slice(0, 40);
}
return result;
} }
function detectFiberType(text: string): string | undefined { /** Parse a collection page HTML — returns array of products */
const lower = text.toLowerCase(); function parseCategoryPage(html: string, cat: typeof CATEGORIES[number]): AtgbicsProduct[] {
if (lower.includes("single mode") || lower.includes("single-mode") || lower.includes("smf") || lower.includes("-lr") || lower.includes("-er") || lower.includes("-zr")) return "SMF"; const products: AtgbicsProduct[] = [];
if (lower.includes("multi mode") || lower.includes("multi-mode") || lower.includes("mmf") || lower.includes("-sr") || lower.includes("-sx")) return "MMF"; const seen = new Set<string>();
if (lower.includes("dac") || lower.includes("direct attach") || lower.includes("copper") || lower.includes("-t ") || lower.includes("twinax")) return "DAC";
return undefined; // Split by product cards — class="card card--product
const cardParts = html.split(/class="card card--product/);
for (const card of cardParts.slice(1)) {
// Name from aria-label (full descriptive name)
const nameM = card.match(/aria-label="([^"]{8,})"/);
if (!nameM) continue;
const name = nameM[1].replace(/®/g, "").replace(/\s+/g, " ").trim();
// Product handle from href
const hrefM = card.match(/href="\/(?:collections\/[^"]+\/)?products\/([^"?#]+)"/);
if (!hrefM) continue;
const handle = hrefM[1];
if (seen.has(handle)) continue;
seen.add(handle);
// Price — £X.XX in price__current (may have newline before £)
const priceM = card.match(/price__current"[^>]*>\s*£([\d,]+(?:\.\d{0,2})?)/s);
const price = priceM ? parseFloat(priceM[1].replace(",", "")) : 0;
if (!price || price <= 0 || price > 100000) continue;
// Image from data-srcset (first src)
const imgM = card.match(/data-srcset="\/\/(atgbics\.com\/cdn\/shop\/files\/[^"\s]+)/);
const imageUrl = imgM ? `https://${imgM[1].split(" ")[0]}` : undefined;
const fullText = `${name} ${handle}`;
const speedInfo = detectSpeed(fullText, cat.speedGbps);
const ff = detectFormFactor(fullText, cat.formFactor);
const reach = detectReach(fullText);
const partNumber = extractPartNumber(name);
products.push({
partNumber,
name,
price,
currency: "GBP",
stockLevel: "in_stock", // ATGBICS only lists available items
url: `${BASE_URL}/products/${handle}`,
formFactor: ff,
speed: speedInfo.speed,
speedGbps: speedInfo.speedGbps,
reachLabel: reach?.label,
reachMeters: reach?.meters,
fiberType: detectFiber(fullText),
wavelength: detectWavelength(fullText),
imageUrl: imageUrl?.includes("no-image") ? undefined : imageUrl,
});
}
return products;
}
/**
 * Download a single URL and hand back the response body as text.
 *
 * The request is sent with the shared HEADERS object and is aborted if it
 * has not completed within 20 seconds.
 *
 * @param url Absolute URL to download.
 * @returns The response body decoded as text.
 * @throws Error with message `HTTP <status> for <url>` when the response
 *         status is outside the 2xx range; fetch/abort errors propagate as-is.
 */
async function fetchPage(url: string): Promise<string> {
  const response = await fetch(url, {
    headers: HEADERS,
    signal: AbortSignal.timeout(20000),
  });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
/** Check if a page has pagination links pointing to the next page */
function hasNextPage(html: string, currentPage: number): boolean {
const nextPage = currentPage + 1;
return html.includes(`page=${nextPage}`) || html.includes(`page%3D${nextPage}`);
} }
export async function scrapeAtgbics(): Promise<void> { export async function scrapeAtgbics(): Promise<void> {
console.log("=== ATGBICS Scraper Starting ===\n"); console.log("=== ATGBICS Scraper Starting (static HTML, correct collection handles) ===\n");
const vendorId = await ensureVendor( const vendorId = await ensureVendor(
"ATGBICS", "ATGBICS",
"compatible", "compatible",
"https://www.atgbics.com", "https://www.atgbics.com",
"https://www.atgbics.com/collections/sfp-plus-transceivers/" "https://www.atgbics.com/collections/compatible-transceivers-sfpp-10g",
); );
console.log(`Vendor ID: ${vendorId}`);
const products: AtgbicsProduct[] = []; let totalProducts = 0;
let pagesScraped = 0; let priceUpdates = 0;
let imageUpdates = 0;
const seenHandles = new Set<string>();
const proxyConfiguration = buildProxyConfiguration(); for (const cat of CATEGORIES) {
const crawler = new PlaywrightCrawler({ console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.handle}] ---`);
maxConcurrency: 1, let catTotal = 0;
maxRequestsPerMinute: 20, // ~2s between requests at concurrency 1
maxRequestsPerCrawl: MAX_PAGES,
requestHandlerTimeoutSecs: 60,
headless: true,
useSessionPool: false, // Disable session pool to avoid SDK_SESSION_POOL_STATE.json crash
...(proxyConfiguration ? { proxyConfiguration } : {}),
launchContext: {
launchOptions: {
args: ["--disable-blink-features=AutomationControlled", "--no-sandbox"],
},
},
async requestHandler({ page, request, enqueueLinks, log }) { for (let page = 1; page <= MAX_PAGES_PER_CAT; page++) {
const url = request.url; const pageUrl = page === 1
log.info(`Scraping: ${url}`); ? `${BASE_URL}/collections/${cat.handle}?sort_by=price-ascending&currency=GBP`
: `${BASE_URL}/collections/${cat.handle}?sort_by=price-ascending&currency=GBP&page=${page}`;
// Wait for Shopify product grid to render try {
await page.waitForTimeout(2000); const html = await fetchPage(pageUrl);
const pageProducts = parseCategoryPage(html, cat);
// Check if this is a collection (listing) page or a product page if (pageProducts.length === 0) {
const isCollection = url.includes("/collections/"); console.log(` Page ${page}: 0 products — stopping`);
break;
}
console.log(` Page ${page}: ${pageProducts.length} products`);
if (isCollection) { for (const product of pageProducts) {
// Extract product links from listing page and enqueue them // Skip cross-category duplicates (same product may appear in multiple collections)
const productData = await page.evaluate(() => { const dedupKey = `${product.url}`;
const results: Array<{ if (seenHandles.has(dedupKey)) continue;
name: string; seenHandles.add(dedupKey);
href: string;
price: string;
stock: string;
partNumber: string;
}> = [];
// Shopify collection page — product cards try {
const cards = document.querySelectorAll( const txId = await findOrCreateScrapedTransceiver({
".product-item, .grid-product, [class*=\"product-card\"], [class*=\"product-grid\"] li, .collection-grid__item" partNumber: product.partNumber,
); vendorId,
formFactor: product.formFactor,
speedGbps: product.speedGbps,
speed: product.speed,
reachMeters: product.reachMeters,
reachLabel: product.reachLabel,
fiberType: product.fiberType,
wavelengths: product.wavelength,
category: "Compatible",
});
for (const card of cards) { const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
const linkEl = card.querySelector("a[href*=\"/products/\"]") as HTMLAnchorElement | null; const updated = await upsertPriceObservation({
const nameEl = card.querySelector( transceiverId: txId,
".product-item__title, .grid-product__title, [class*=\"product-title\"], [class*=\"product-name\"], h2, h3" sourceVendorId: vendorId,
); price: product.price,
const priceEl = card.querySelector( currency: product.currency,
".product-item__price, .grid-product__price, [class*=\"price\"]:not([class*=\"compare\"]):not([class*=\"was\"])" stockLevel: product.stockLevel,
); url: product.url,
const stockEl = card.querySelector( contentHash: hash,
"[class*=\"stock\"], [class*=\"availability\"], [class*=\"badge\"]" });
); if (updated) priceUpdates++;
const href = linkEl?.getAttribute("href") || ""; if (product.imageUrl) {
const name = nameEl?.textContent?.trim() || linkEl?.textContent?.trim() || ""; const res = await pool.query(
const price = priceEl?.textContent?.trim() || ""; `UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = true
const stock = stockEl?.textContent?.trim() || ""; WHERE id = $2 AND (image_url IS NULL OR image_url = '')
RETURNING id`,
// Derive part number from URL slug: /products/sfp-10g-lr → sfp-10g-lr [product.imageUrl, txId],
// Then extract real OEM part number (strips "-r-compatible-transceiver-*") );
const slug = href.split("/products/")[1]?.split("?")[0]?.replace(/\/$/, "") || ""; if (res.rowCount && res.rowCount > 0) imageUpdates++;
if (href && name && name.length > 3) {
results.push({ name, href, price, stock, partNumber: slug }); // OEM extraction done below after page parse
} }
}
// Fallback: grab any /products/ links with adjacent price text totalProducts++;
if (results.length === 0) { catTotal++;
const allProductLinks = document.querySelectorAll("a[href*=\"/products/\"]"); } catch (err) {
const seen = new Set<string>(); console.warn(` DB error ${product.partNumber}: ${(err as Error).message.slice(0, 80)}`);
for (const el of allProductLinks) {
const a = el as HTMLAnchorElement;
const href = a.getAttribute("href") || "";
if (seen.has(href)) continue;
seen.add(href);
const name = a.textContent?.trim() || "";
if (!name || name.length < 3) continue;
const container = a.closest("li") || a.closest("article") || a.parentElement?.parentElement;
const priceEl = container?.querySelector("[class*=\"price\"]");
const price = priceEl?.textContent?.trim() || "";
const slug = href.split("/products/")[1]?.split("?")[0]?.replace(/\/$/, "") || "";
results.push({ name, href, price, stock: "", partNumber: slug });
}
}
return results;
});
log.info(` Found ${productData.length} products on collection page`);
for (const item of productData) {
if (!item.href) continue;
const fullUrl = item.href.startsWith("http") ? item.href : `${BASE_URL}${item.href}`;
// If we already have price data from the listing, store it directly
if (item.price) {
const { price, currency } = parsePrice(item.price);
const speedInfo = detectSpeed(item.name);
// Extract real OEM part number from slug (strips -r-compatible-transceiver-*)
const realPartNumber = extractOemPartNumber(item.partNumber);
// Extract reach from name OR slug (slug often has "120km" even when name doesn't)
const reachLabel = detectReach(item.name) || detectReach(item.partNumber) || undefined;
if (price > 0) {
products.push({
partNumber: realPartNumber || item.name.slice(0, 80),
name: item.name,
price,
currency: currency === "USD" ? "GBP" : currency, // ATGBICS is GBP — parsePrice may default to USD if no symbol on listing
stockLevel: item.stock ? parseStockLevel(item.stock) : "in_stock",
quantity: item.stock ? parseQuantity(item.stock) : undefined,
url: fullUrl,
formFactor: detectFormFactor(item.name),
speedGbps: speedInfo?.speedGbps,
speed: speedInfo?.speed,
reachLabel,
fiberType: detectFiberType(item.name),
});
}
} }
} }
// Enqueue next page if pagination exists // Check pagination
await enqueueLinks({ if (!hasNextPage(html, page)) {
selector: "a[href*=\"?page=\"], a.pagination__next, a[rel=\"next\"], .pagination a[href]", console.log(` No page ${page + 1} — collection done`);
transformRequestFunction: (req) => { break;
if (pagesScraped >= MAX_PAGES) return false;
return req;
},
});
pagesScraped++;
} else {
// Product detail page — extract precise data
const data = await page.evaluate(() => {
const title = document.querySelector(
"h1.product__title, h1.product-title, h1.product_title, h1"
)?.textContent?.trim() || "";
// Shopify price — prefer sale price if available
const salePriceEl = document.querySelector(
".price__sale .price-item--sale, .product__price .money, [class*=\"price\"] .money, [data-product-price], .price ins"
);
const priceText = salePriceEl?.textContent?.trim() || "";
// Stock / availability
const stockEl = document.querySelector(
".product__availability, .availability, [class*=\"stock\"], [class*=\"inventory\"], .badge--sold-out, .badge--in-stock"
);
const stockText = stockEl?.textContent?.trim() || "";
// Quantity badge (some Shopify themes show "X in stock")
const qtyEl = document.querySelector("[class*=\"quantity\"], [class*=\"inventory-count\"]");
const qtyText = qtyEl?.textContent?.trim() || "";
// Short description / variant title for reach/fiber info
const descEl = document.querySelector(
".product__description, .product-description, .rte p:first-child, .product__short-description"
);
const description = descEl?.textContent?.trim() || "";
// SKU / part number (Shopify often exposes this)
const skuEl = document.querySelector(".product__sku, [class*=\"sku\"], [itemprop=\"sku\"]");
const sku = skuEl?.textContent?.replace(/SKU[:\s]*/i, "").trim() || "";
return { title, priceText, stockText, qtyText, description, sku };
});
const slug = url.split("/products/")[1]?.split("?")[0]?.replace(/\/$/, "") || "";
// Prefer Shopify SKU if available, otherwise extract real OEM PN from slug
const partNumber = data.sku && data.sku.length > 2 && data.sku.length < 40
? data.sku.toUpperCase()
: extractOemPartNumber(slug);
const name = data.title || slug;
const combinedText = `${name} ${data.description}`;
const { price, currency } = parsePrice(data.priceText);
if (price > 0) {
const speedInfo = detectSpeed(combinedText);
// Reach from title/description first, then fall back to slug (slug often has "120km")
const reachLabel = detectReach(combinedText) || detectReach(slug) || undefined;
products.push({
partNumber,
name,
price,
currency: currency === "USD" ? "GBP" : currency, // ATGBICS prices in GBP
stockLevel: data.stockText ? parseStockLevel(data.stockText) : "in_stock",
quantity: data.qtyText ? parseQuantity(data.qtyText) : undefined,
url,
formFactor: detectFormFactor(combinedText),
speedGbps: speedInfo?.speedGbps,
speed: speedInfo?.speed,
reachLabel,
fiberType: detectFiberType(combinedText),
});
} }
} catch (err) {
pagesScraped++; console.warn(` Page ${page} error: ${(err as Error).message.slice(0, 80)}`);
break;
} }
},
}, makeCrawleeConfig("atgbics"));
const startUrls = CATEGORY_URLS.map((path) => `${BASE_URL}${path}`); await sleep(1000);
await crawler.run(startUrls);
console.log(`\nPages scraped: ${pagesScraped}`);
console.log(`Products found: ${products.length}`);
// Deduplicate by partNumber — prefer product detail page data (more precise)
const uniqueProducts = new Map<string, AtgbicsProduct>();
for (const p of products) {
const key = p.partNumber || p.name;
const existing = uniqueProducts.get(key);
// Keep the entry with a non-GBP-forced currency (i.e., product detail page which has £ symbol)
if (!existing || existing.currency === "GBP" && p.currency !== "GBP") {
uniqueProducts.set(key, p);
} else if (!existing) {
uniqueProducts.set(key, p);
} }
console.log(` Category total: ${catTotal} products`);
await sleep(1000);
} }
// Write to database console.log(`\n=== ATGBICS Complete: ${totalProducts} products, ${priceUpdates} price updates, ${imageUpdates} images ===`);
let written = 0;
let skipped = 0;
for (const p of uniqueProducts.values()) {
try {
const transceiverId = await findOrCreateScrapedTransceiver({
partNumber: p.partNumber,
vendorId,
formFactor: p.formFactor,
speedGbps: p.speedGbps,
speed: p.speed,
reachLabel: p.reachLabel,
fiberType: p.fiberType,
category: "DataCenter",
});
const hash = contentHash({ price: p.price, stock: p.stockLevel, qty: p.quantity });
const isNew = await upsertPriceObservation({
transceiverId,
sourceVendorId: vendorId,
price: p.price,
currency: p.currency,
stockLevel: p.stockLevel,
quantityAvailable: p.quantity,
url: p.url,
contentHash: hash,
});
if (isNew) written++;
else skipped++;
} catch (err) {
console.error(` Error: ${p.partNumber}:`, (err as Error).message);
}
}
console.log(`\nDatabase: ${written} new, ${skipped} unchanged (${uniqueProducts.size} unique)`);
console.log("=== ATGBICS Scraper Complete ===\n");
} }
if (require.main === module) { if (require.main === module) {
scrapeAtgbics() scrapeAtgbics()
.then(() => pool.end()) .then(() => pool.end())
.catch((err) => { .catch((err) => { console.error("Fatal:", err); pool.end(); process.exit(1); });
console.error("Fatal:", err);
pool.end();
process.exit(1);
});
} }

55
scripts/run-atgbics-mac.sh Executable file
View File

@ -0,0 +1,55 @@
#!/usr/bin/env bash
# run-atgbics-mac.sh — Run ATGBICS scraper from Mac (bypasses Erik datacenter IP block)
#
# Uses plain fetch() of the static collection HTML (no Playwright needed).
# Connects to Erik's PostgreSQL via SSH tunnel.
#
# Usage: ./scripts/run-atgbics-mac.sh
# Requirements: SSH access to Erik (root@82.165.222.127)
# Abort on any failing command, on use of an unset variable, and on a failure
# anywhere inside a pipeline.
set -euo pipefail

# Repository root: the parent directory of the directory holding this script.
REPO_DIR="$(cd "$(dirname "$0")/.." && pwd)"
# Local end of the SSH port forward; also handed to the scraper as POSTGRES_PORT.
TUNNEL_PORT=5434
# PID of the process listening on TUNNEL_PORT; stays empty until the tunnel is up.
TUNNEL_PID=""
# File that receives a copy of the scraper's combined stdout/stderr.
LOG="/tmp/atgbics-mac-run.log"

# EXIT handler: terminate the tunnel process if one was recorded.
# A failing kill (e.g. the process is already gone) is deliberately ignored.
cleanup() {
  [[ -z "$TUNNEL_PID" ]] && return 0
  echo "Closing SSH tunnel (PID $TUNNEL_PID)..."
  kill "$TUNNEL_PID" 2>/dev/null || true
}
trap cleanup EXIT
echo "=== ATGBICS Mac Runner ==="
echo "Repo: $REPO_DIR"

# Open SSH tunnel to Erik's PostgreSQL.
# ExitOnForwardFailure=yes makes ssh exit non-zero (which aborts this script
# under `set -e`) when the local port cannot be bound. Without it, ssh would
# still background itself with no working forward, and the lsof lookup below
# would return the PID of whatever unrelated process already owns the port —
# which the EXIT handler would then kill.
echo "Opening SSH tunnel to Erik DB (port 5433 → local 5434)..."
ssh -o ExitOnForwardFailure=yes -fNL "${TUNNEL_PORT}:127.0.0.1:5433" root@82.165.222.127

# Look up the PID of the backgrounded ssh listener so it can be killed on exit.
# `|| true` keeps errexit/pipefail from aborting the script when lsof finds
# nothing (or is cut off by head); the PID then stays empty, the message below
# prints "unknown", and the EXIT handler skips the kill.
TUNNEL_PID=$(lsof -ti "TCP:${TUNNEL_PORT}" -sTCP:LISTEN 2>/dev/null | head -1 || true)
echo "Tunnel open (listener PID: ${TUNNEL_PID:-unknown})"

# Wait briefly for tunnel to stabilize
sleep 1

# Build scraper if needed.
# NOTE(review): the build only runs when the compiled file is missing, so an
# existing but outdated dist/ is executed as-is — rebuild manually after pulling.
if [[ ! -f "$REPO_DIR/packages/scraper/dist/scrapers/atgbics.js" ]]; then
  echo "Building scraper package..."
  # Separate statements so that a failing cd also aborts under `set -e`
  # (in `cd … && npm …` only a failure of the last command would).
  cd "$REPO_DIR/packages/scraper"
  npm run build
fi

# Run scraper against the tunnelled database; output goes to the terminal and
# to $LOG. With pipefail, a non-zero exit from node fails the pipeline and the
# script stops before the final messages.
# NOTE(review): the database credentials are hardcoded in this script —
# consider supplying them from the environment instead.
echo "Running ATGBICS scraper..."
cd "$REPO_DIR"
POSTGRES_HOST=127.0.0.1 \
POSTGRES_PORT="${TUNNEL_PORT}" \
POSTGRES_USER=tip \
POSTGRES_PASSWORD=***REDACTED*** \
POSTGRES_DB=transceiver_db \
node packages/scraper/dist/scrapers/atgbics.js 2>&1 | tee "$LOG"

echo ""
echo "Log saved to: $LOG"
echo "=== Done ==="