- Add utils/crawlee-config.ts: makeCrawleeConfig(name) returns a Crawlee Configuration with isolated localDataDirectory per scraper. Uses storageClientOptions (not global CRAWLEE_STORAGE_DIR) so concurrent pg-boss workers in the same process don't race on the shared env var. - Apply makeCrawleeConfig to all 6 Crawlee-based scrapers: optcore (PlaywrightCrawler), atgbics (PlaywrightCrawler), community-issues (CheerioCrawler + RequestQueue), edgecore (CheerioCrawler), ufispace (CheerioCrawler), market-intelligence (CheerioCrawler). - scheduler.ts: add withIsolatedStorage for optcore and market-intel workers (was missing, caused storage-fs path bleed from fs scraper). - ebay-enricher.ts: fix vendor type 'marketplace' -> 'reseller' to satisfy vendors_type_check constraint ['manufacturer','distributor','oem','reseller','compatible'].
440 lines
17 KiB
TypeScript
440 lines
17 KiB
TypeScript
/**
 * ATGBICS Scraper — Prices, Stock, Product Catalog
 *
 * ATGBICS is a UK-based independent compatible optics vendor.
 * Site uses Shopify with client-side rendering, so we use PlaywrightCrawler.
 * Prices are publicly visible in GBP.
 *
 * Categories scraped:
 *   /collections/sfp-transceivers/
 *   /collections/sfp-plus-transceivers/
 *   /collections/sfp28-transceivers/
 *   /collections/qsfp-plus-transceivers/
 *   /collections/qsfp28-transceivers/
 *   /collections/qsfp-dd-transceivers/
 *
 * Respects: robots.txt, rate limiting (max 20 requests/minute, max 50 pages)
 */
|
|
import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";
|
|
import { makeCrawleeConfig } from "../utils/crawlee-config";
|
|
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
|
|
import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash";
|
|
|
|
/** Origin of the ATGBICS storefront; site-relative paths are appended to it. */
const BASE_URL = "https://www.atgbics.com";

// SOCKS5 proxy pool — rotate across CT130/131/132 to avoid IP blocks.
// Read from the comma-separated PROXY_URLS environment variable; entries are
// trimmed and blank entries are dropped, so an unset variable yields [].
const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
  .split(",")
  .reduce<string[]>((urls, entry) => {
    const cleaned = entry.trim();
    if (cleaned) urls.push(cleaned);
    return urls;
  }, []);
|
|
|
|
/**
 * Build the Crawlee proxy configuration for this scraper.
 *
 * @returns a ProxyConfiguration over every URL in PROXY_URLS, or `undefined`
 *          when no proxies are configured (the crawler then connects directly).
 */
function buildProxyConfiguration(): ProxyConfiguration | undefined {
  const hasProxies = PROXY_URLS.length > 0;
  return hasProxies ? new ProxyConfiguration({ proxyUrls: PROXY_URLS }) : undefined;
}
|
|
|
|
/**
 * Collection (listing) paths, relative to BASE_URL, used as the crawl's start
 * URLs — one per transceiver form-factor family.
 */
const CATEGORY_URLS = [
  "/collections/sfp-transceivers/",
  "/collections/sfp-plus-transceivers/",
  "/collections/sfp28-transceivers/",
  "/collections/qsfp-plus-transceivers/",
  "/collections/qsfp28-transceivers/",
  "/collections/qsfp-dd-transceivers/",
];

/** Upper bound on the number of pages processed in a single crawl run. */
const MAX_PAGES = 50;
|
|
|
|
/** One product as scraped from ATGBICS, before it is written to the database. */
interface AtgbicsProduct {
  // --- always present ---
  /** Part number used as the dedup key and for the transceiver lookup. */
  partNumber: string;
  /** Product title as displayed on the page. */
  name: string;
  /** Numeric price; only products with a price above zero are collected. */
  price: number;
  /** Currency code of `price`. */
  currency: string;
  /** Stock level label, e.g. "in_stock". */
  stockLevel: string;
  /** Absolute URL of the product. */
  url: string;

  // --- best-effort attributes derived from the product text ---
  /** Units available, when the page exposes a quantity. */
  quantity?: number;
  /** Form factor label such as "SFP+" or "QSFP28". */
  formFactor?: string;
  /** Line rate in Gbit/s, e.g. 10. */
  speedGbps?: number;
  /** Line rate label, e.g. "10G". */
  speed?: string;
  /** Reach label, e.g. "10km" or "300m". */
  reachLabel?: string;
  /** Media label: "SMF", "MMF" or "DAC". */
  fiberType?: string;
}
|
|
|
|
/**
 * Infer the transceiver form factor from free text (title, description, slug).
 *
 * Matching is case-insensitive substring search. Rules are evaluated in order
 * and the first hit wins, so the more specific families (QSFP-DD, QSFP28, …)
 * are listed before the generic ones they contain as substrings.
 *
 * @param text text to inspect
 * @returns the form factor label, or `undefined` when nothing matches
 *          (a bare "QSFP" without a suffix matches no rule)
 */
function detectFormFactor(text: string): string | undefined {
  const haystack = text.toLowerCase();
  const mentions = (...needles: string[]): boolean =>
    needles.some((needle) => haystack.includes(needle));

  const rules: Array<[string, () => boolean]> = [
    ["QSFP-DD", () => mentions("qsfp-dd", "qsfp dd")],
    ["QSFP28", () => mentions("qsfp28")],
    ["QSFP+", () => mentions("qsfp+", "qsfp plus", "qsfp-plus")],
    ["SFP28", () => mentions("sfp28")],
    ["SFP+", () => mentions("sfp+", "sfp plus", "sfp-plus")],
    // Plain SFP only when the text does not talk about any QSFP variant.
    ["SFP", () => mentions("sfp") && !mentions("qsfp")],
    ["XFP", () => mentions("xfp")],
    ["CFP2", () => mentions("cfp2")],
    ["CFP", () => mentions("cfp")],
  ];

  const hit = rules.find(([, applies]) => applies());
  return hit ? hit[0] : undefined;
}
|
|
|
|
/**
 * Infer the line rate from free text.
 *
 * Tiers are tested from fastest to slowest and the first matching pattern
 * wins; this ordering keeps e.g. "100G" from being reported as "10G".
 *
 * @param text text to inspect (title, description, …)
 * @returns the speed label and its value in Gbit/s, or `undefined` when no
 *          known rate is mentioned
 */
function detectSpeed(text: string): { speed: string; speedGbps: number } | undefined {
  const tiers = [
    { pattern: /400\s*g/i, speed: "400G", speedGbps: 400 },
    { pattern: /100\s*g/i, speed: "100G", speedGbps: 100 },
    { pattern: /40\s*g/i, speed: "40G", speedGbps: 40 },
    { pattern: /25\s*g/i, speed: "25G", speedGbps: 25 },
    { pattern: /10\s*g/i, speed: "10G", speedGbps: 10 },
    // Gigabit is advertised either as "1000BASE-…" or as "1G".
    { pattern: /1000\s*base/i, speed: "1G", speedGbps: 1 },
    { pattern: /1\s*g\b/i, speed: "1G", speedGbps: 1 },
  ];

  const tier = tiers.find(({ pattern }) => pattern.test(text));
  if (tier === undefined) return undefined;
  return { speed: tier.speed, speedGbps: tier.speedGbps };
}
|
|
|
|
/**
 * Extract the first reach/length mention ("10km", "300 m", "0.5m") from text.
 *
 * The number may carry a decimal fraction so that short cable lengths such as
 * "0.5m" or "1.5m" are not truncated to "5m". The unit must end at a word
 * boundary, which keeps wavelengths ("1310nm") and rates ("10Mbps") from
 * being mistaken for a distance.
 *
 * @param text text to inspect (title, description or URL slug)
 * @returns the distance and lower-cased unit joined without a space
 *          (e.g. "10km"), or `undefined` when no distance is present
 */
function detectReach(text: string): string | undefined {
  const match = /(\d+(?:\.\d+)?)\s*(km|m)\b/i.exec(text);
  if (!match) return undefined;

  const [, distance, unit] = match;
  // Both groups always participate in a successful match; the guard only
  // satisfies strict index checking.
  if (distance === undefined || unit === undefined) return undefined;

  return `${distance}${unit.toLowerCase()}`;
}
|
|
|
|
/**
 * OEM vendor tokens that ATGBICS appends to the part number in its URL slugs
 * (just before "-r-compatible" / "-compatible"). All entries consist of
 * letters, digits and hyphens only, so they are safe to join into a regex
 * without escaping.
 */
const OEM_VENDOR_TOKENS = [
  "nokia", "cisco", "juniper", "arista", "huawei", "hp", "hpe", "dell",
  "extreme", "brocade", "avaya", "netgear", "mikrotik", "ubiquiti", "mellanox",
  "intel", "broadcom", "allied", "planet", "zyxel", "dlink", "d-link",
  "foundry", "force10", "enterasys", "optical", "palo", "fortinet", "hitachi",
  "calix", "ciena", "adtran", "ribbon", "sycamore", "rad", "zhone",
  "infinera", "fujitsu", "nec", "ericsson", "alcatel", "lucent",
] as const;

/** Matches one trailing "-<vendor>" token at the very end of a slug fragment. */
const TRAILING_OEM_VENDOR = new RegExp(`-(?:${OEM_VENDOR_TOKENS.join("|")})$`, "i");

/**
 * Extract the real OEM part number from an ATGBICS URL slug.
 *
 * ATGBICS slug format: {oem-part-number}-{vendor}-r-compatible-transceiver-{specs}
 * Examples:
 *   3he16564aa-nokia-r-compatible-transceiver-qsfp-dd-... → 3HE16564AA
 *   jnp-sfp-25g-lr-juniper-r-compatible-...               → JNP-SFP-25G-LR
 *   sfp-10g-sr-cisco-compatible-...                       → SFP-10G-SR
 *
 * Steps:
 *   1. Cut everything from the first "-r-compatible" or "-compatible" onward,
 *      whatever follows it ("-transceiver-…", spec text, or nothing).
 *   2. Strip trailing vendor tokens repeatedly, so multi-token vendors such as
 *      "alcatel-lucent" are removed completely regardless of list order.
 *   3. Upper-case the remainder.
 *
 * @param slug product slug taken from a "/products/{slug}" URL
 * @returns the upper-cased part number; if extraction yields an empty string,
 *          more than 40 characters, or text still containing "TRANSCEIVER",
 *          the upper-cased slug truncated to 40 characters is returned instead
 */
function extractOemPartNumber(slug: string): string {
  // 1. Drop the marketing tail. The optional "r-" covers the "®" that the
  //    site slugifies to "-r-" in front of "compatible".
  let pn = slug.replace(/-(?:r-)?compatible.*$/i, "");

  // 2. Peel vendor tokens off the end until none is left. Each pass shortens
  //    the string, so the loop always terminates.
  let before: string;
  do {
    before = pn;
    pn = pn.replace(TRAILING_OEM_VENDOR, "");
  } while (pn !== before);

  // 3. OEM part numbers are written in upper case.
  const result = pn.toUpperCase().trim();

  // Safety net: an implausible result means the slug did not follow the
  // expected format — fall back to the (bounded) raw slug.
  if (!result || result.length > 40 || result.includes("TRANSCEIVER")) {
    return slug.toUpperCase().slice(0, 40);
  }

  return result;
}
|
|
|
|
/**
 * Infer the transmission medium from free text.
 *
 * Matching is case-insensitive. Rules are evaluated in this order:
 *   1. "-LRM" suffix → "MMF". 10GBASE-LRM is a multimode standard; it must be
 *      tested before the generic "-lr" rule, which would label it "SMF".
 *   2. Single-mode wording or long-reach suffixes (-LR/-ER/-ZR) → "SMF".
 *   3. Multi-mode wording or short-reach suffixes (-SR/-SX) → "MMF".
 *   4. DAC / direct-attach / copper / twinax wording, or a "-T" suffix
 *      (e.g. "GLC-T", "1000BASE-T") → "DAC". Note that this label is also
 *      used for copper RJ45 modules.
 *
 * @param text text to inspect (title, description, …)
 * @returns "SMF", "MMF", "DAC", or `undefined` when nothing matches
 */
function detectFiberType(text: string): string | undefined {
  const lower = text.toLowerCase();
  const mentions = (needles: readonly string[]): boolean =>
    needles.some((needle) => lower.includes(needle));

  if (/-lrm\b/.test(lower)) return "MMF";

  if (mentions(["single mode", "single-mode", "smf", "-lr", "-er", "-zr"])) return "SMF";

  if (mentions(["multi mode", "multi-mode", "mmf", "-sr", "-sx"])) return "MMF";

  // "-t" counts only as a complete suffix: followed by a separator or the end
  // of the text, never by another letter/digit (so "-tx…" does not match).
  const hasCopperSuffix = /-t(?![a-z0-9])/.test(lower);
  if (mentions(["dac", "direct attach", "copper", "twinax"]) || hasCopperSuffix) return "DAC";

  return undefined;
}
|
|
|
|
/**
 * Crawl the ATGBICS storefront and persist the observed prices.
 *
 * Flow:
 *   1. Ensure the "ATGBICS" vendor row exists.
 *   2. Run a single-concurrency PlaywrightCrawler over CATEGORY_URLS. Listing
 *      pages yield one product per card that shows a price, and pagination
 *      links are enqueued until MAX_PAGES pages have been handled. Product
 *      detail pages are parsed more precisely if one reaches the handler.
 *   3. Deduplicate the collected products by part number.
 *   4. For every unique product, find/create the transceiver and upsert a
 *      price observation. A failure for one product is logged and does not
 *      abort the remaining writes.
 *
 * Crawlee storage is isolated per scraper via makeCrawleeConfig("atgbics").
 *
 * @returns resolves once the crawl and all database writes have finished
 */
export async function scrapeAtgbics(): Promise<void> {
  console.log("=== ATGBICS Scraper Starting ===\n");

  const vendorId = await ensureVendor(
    "ATGBICS",
    "compatible",
    "https://www.atgbics.com",
    "https://www.atgbics.com/collections/sfp-plus-transceivers/"
  );
  console.log(`Vendor ID: ${vendorId}`);

  const products: AtgbicsProduct[] = [];
  let pagesScraped = 0;

  const proxyConfiguration = buildProxyConfiguration();
  const crawler = new PlaywrightCrawler({
    maxConcurrency: 1,
    maxRequestsPerMinute: 20, // at most one request every ~3s at concurrency 1
    maxRequestsPerCrawl: MAX_PAGES,
    requestHandlerTimeoutSecs: 60,
    headless: true,
    useSessionPool: false, // Disable session pool to avoid SDK_SESSION_POOL_STATE.json crash
    ...(proxyConfiguration ? { proxyConfiguration } : {}),
    launchContext: {
      launchOptions: {
        args: ["--disable-blink-features=AutomationControlled", "--no-sandbox"],
      },
    },

    async requestHandler({ page, request, enqueueLinks, log }) {
      const url = request.url;
      log.info(`Scraping: ${url}`);

      // Give the client-side Shopify rendering time to populate the page.
      await page.waitForTimeout(2000);

      // Listing page vs. product page. Shopify also serves product pages under
      // "/collections/{collection}/products/{slug}", so a "/products/" segment
      // always means a product page even when "/collections/" is present.
      const isCollection = url.includes("/collections/") && !url.includes("/products/");

      if (isCollection) {
        // Runs inside the browser: must not reference anything from this module.
        const productData = await page.evaluate(() => {
          const results: Array<{
            name: string;
            href: string;
            price: string;
            stock: string;
            partNumber: string;
          }> = [];

          // Shopify collection page — product cards
          const cards = document.querySelectorAll(
            ".product-item, .grid-product, [class*=\"product-card\"], [class*=\"product-grid\"] li, .collection-grid__item"
          );

          for (const card of cards) {
            const linkEl = card.querySelector("a[href*=\"/products/\"]") as HTMLAnchorElement | null;
            const nameEl = card.querySelector(
              ".product-item__title, .grid-product__title, [class*=\"product-title\"], [class*=\"product-name\"], h2, h3"
            );
            const priceEl = card.querySelector(
              ".product-item__price, .grid-product__price, [class*=\"price\"]:not([class*=\"compare\"]):not([class*=\"was\"])"
            );
            const stockEl = card.querySelector(
              "[class*=\"stock\"], [class*=\"availability\"], [class*=\"badge\"]"
            );

            const href = linkEl?.getAttribute("href") || "";
            const name = nameEl?.textContent?.trim() || linkEl?.textContent?.trim() || "";
            const price = priceEl?.textContent?.trim() || "";
            const stock = stockEl?.textContent?.trim() || "";

            // Raw URL slug: /products/sfp-10g-lr → sfp-10g-lr. The OEM part
            // number is extracted from it later, outside the browser context.
            const slug = href.split("/products/")[1]?.split("?")[0]?.replace(/\/$/, "") || "";

            if (href && name && name.length > 3) {
              results.push({ name, href, price, stock, partNumber: slug });
            }
          }

          // Fallback: grab any /products/ links with adjacent price text
          if (results.length === 0) {
            const allProductLinks = document.querySelectorAll("a[href*=\"/products/\"]");
            const seen = new Set<string>();
            for (const el of allProductLinks) {
              const a = el as HTMLAnchorElement;
              const href = a.getAttribute("href") || "";
              if (seen.has(href)) continue;
              seen.add(href);

              const name = a.textContent?.trim() || "";
              if (!name || name.length < 3) continue;

              const container = a.closest("li") || a.closest("article") || a.parentElement?.parentElement;
              const priceEl = container?.querySelector("[class*=\"price\"]");
              const price = priceEl?.textContent?.trim() || "";
              const slug = href.split("/products/")[1]?.split("?")[0]?.replace(/\/$/, "") || "";

              results.push({ name, href, price, stock: "", partNumber: slug });
            }
          }

          return results;
        });

        log.info(`  Found ${productData.length} products on collection page`);

        for (const item of productData) {
          if (!item.href) continue;

          const fullUrl = item.href.startsWith("http") ? item.href : `${BASE_URL}${item.href}`;

          // Only cards that already show a price can be recorded from the listing.
          if (item.price) {
            const { price, currency } = parsePrice(item.price);
            const speedInfo = detectSpeed(item.name);
            // Extract real OEM part number from slug (strips -r-compatible-transceiver-*)
            const realPartNumber = extractOemPartNumber(item.partNumber);
            // Extract reach from name OR slug (slug often has "120km" even when name doesn't)
            const reachLabel = detectReach(item.name) || detectReach(item.partNumber) || undefined;
            if (price > 0) {
              products.push({
                partNumber: realPartNumber || item.name.slice(0, 80),
                name: item.name,
                price,
                currency: currency === "USD" ? "GBP" : currency, // ATGBICS is GBP — parsePrice may default to USD if no symbol on listing
                stockLevel: item.stock ? parseStockLevel(item.stock) : "in_stock",
                quantity: item.stock ? parseQuantity(item.stock) : undefined,
                url: fullUrl,
                formFactor: detectFormFactor(item.name),
                speedGbps: speedInfo?.speedGbps,
                speed: speedInfo?.speed,
                reachLabel,
                fiberType: detectFiberType(item.name),
              });
            }
          }
        }

        // Enqueue next page if pagination exists; stop adding once the page budget is used up.
        await enqueueLinks({
          selector: "a[href*=\"?page=\"], a.pagination__next, a[rel=\"next\"], .pagination a[href]",
          transformRequestFunction: (req) => {
            if (pagesScraped >= MAX_PAGES) return false;
            return req;
          },
        });

        pagesScraped++;
      } else {
        // Product detail page — extract precise data (runs inside the browser).
        const data = await page.evaluate(() => {
          const title = document.querySelector(
            "h1.product__title, h1.product-title, h1.product_title, h1"
          )?.textContent?.trim() || "";

          // Shopify price — prefer sale price if available
          const salePriceEl = document.querySelector(
            ".price__sale .price-item--sale, .product__price .money, [class*=\"price\"] .money, [data-product-price], .price ins"
          );
          const priceText = salePriceEl?.textContent?.trim() || "";

          // Stock / availability
          const stockEl = document.querySelector(
            ".product__availability, .availability, [class*=\"stock\"], [class*=\"inventory\"], .badge--sold-out, .badge--in-stock"
          );
          const stockText = stockEl?.textContent?.trim() || "";

          // Quantity badge (some Shopify themes show "X in stock")
          const qtyEl = document.querySelector("[class*=\"quantity\"], [class*=\"inventory-count\"]");
          const qtyText = qtyEl?.textContent?.trim() || "";

          // Short description / variant title for reach/fiber info
          const descEl = document.querySelector(
            ".product__description, .product-description, .rte p:first-child, .product__short-description"
          );
          const description = descEl?.textContent?.trim() || "";

          // SKU / part number (Shopify often exposes this)
          const skuEl = document.querySelector(".product__sku, [class*=\"sku\"], [itemprop=\"sku\"]");
          const sku = skuEl?.textContent?.replace(/SKU[:\s]*/i, "").trim() || "";

          return { title, priceText, stockText, qtyText, description, sku };
        });

        const slug = url.split("/products/")[1]?.split("?")[0]?.replace(/\/$/, "") || "";
        // Prefer Shopify SKU if available, otherwise extract real OEM PN from slug
        const partNumber = data.sku && data.sku.length > 2 && data.sku.length < 40
          ? data.sku.toUpperCase()
          : extractOemPartNumber(slug);
        const name = data.title || slug;

        const combinedText = `${name} ${data.description}`;
        const { price, currency } = parsePrice(data.priceText);

        if (price > 0) {
          const speedInfo = detectSpeed(combinedText);
          // Reach from title/description first, then fall back to slug (slug often has "120km")
          const reachLabel = detectReach(combinedText) || detectReach(slug) || undefined;
          products.push({
            partNumber,
            name,
            price,
            currency: currency === "USD" ? "GBP" : currency, // ATGBICS prices in GBP
            stockLevel: data.stockText ? parseStockLevel(data.stockText) : "in_stock",
            quantity: data.qtyText ? parseQuantity(data.qtyText) : undefined,
            url,
            formFactor: detectFormFactor(combinedText),
            speedGbps: speedInfo?.speedGbps,
            speed: speedInfo?.speed,
            reachLabel,
            fiberType: detectFiberType(combinedText),
          });
        }

        pagesScraped++;
      }
    },
  }, makeCrawleeConfig("atgbics"));

  const startUrls = CATEGORY_URLS.map((path) => `${BASE_URL}${path}`);
  await crawler.run(startUrls);

  console.log(`\nPages scraped: ${pagesScraped}`);
  console.log(`Products found: ${products.length}`);

  // Deduplicate by partNumber (falling back to the name). The first entry seen
  // for a key is kept; it is replaced only when it carries GBP and a later
  // entry carries a different, explicitly parsed currency.
  // NOTE(review): both code paths above normalise "USD" to "GBP", so currency
  // cannot tell listing data from detail-page data — confirm the intended
  // preference rule.
  const uniqueProducts = new Map<string, AtgbicsProduct>();
  for (const p of products) {
    const key = p.partNumber || p.name;
    const existing = uniqueProducts.get(key);
    if (!existing || (existing.currency === "GBP" && p.currency !== "GBP")) {
      uniqueProducts.set(key, p);
    }
  }

  // Write to database
  let written = 0;
  let skipped = 0;

  for (const p of uniqueProducts.values()) {
    try {
      const transceiverId = await findOrCreateScrapedTransceiver({
        partNumber: p.partNumber,
        vendorId,
        formFactor: p.formFactor,
        speedGbps: p.speedGbps,
        speed: p.speed,
        reachLabel: p.reachLabel,
        fiberType: p.fiberType,
        category: "DataCenter",
      });

      // The hash covers exactly the fields whose change makes an observation "new".
      const hash = contentHash({ price: p.price, stock: p.stockLevel, qty: p.quantity });
      const isNew = await upsertPriceObservation({
        transceiverId,
        sourceVendorId: vendorId,
        price: p.price,
        currency: p.currency,
        stockLevel: p.stockLevel,
        quantityAvailable: p.quantity,
        url: p.url,
        contentHash: hash,
      });

      if (isNew) written++;
      else skipped++;
    } catch (err: unknown) {
      // Best effort per product: log and continue. Anything can be thrown, so
      // narrow before reading `.message`.
      const message = err instanceof Error ? err.message : String(err);
      console.error(`  Error: ${p.partNumber}:`, message);
    }
  }

  console.log(`\nDatabase: ${written} new, ${skipped} unchanged (${uniqueProducts.size} unique)`);
  console.log("=== ATGBICS Scraper Complete ===\n");
}
|
|
|
|
// Direct execution (not imported as a module): run the scraper once, then
// close the database pool. Any failure is reported, the pool is asked to
// close, and the process exits with status 1.
if (require.main === module) {
  void (async () => {
    try {
      await scrapeAtgbics();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  })();
}
|