2026-05-09 17:30:08 +02:00

330 lines
12 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* ATGBICS Scraper — Prices, Stock, Product Catalog
*
* ATGBICS is a UK-based independent compatible optics vendor.
* Site uses Shopify. Uses the /products.json API for reliable, JS-free data access.
*
* Strategy:
* For each collection: GET /collections/{handle}/products.json?limit=250&page=N
* Parse JSON: title, handle, variants[0].price (GBP string), images
* Paginate until response returns < limit products.
*
* Rate limited: 1 req/1 sec. Runs from Mac or Erik.
* Rewritten 2026-05-06: switched from HTML parsing to products.json API after
* Shopify's static HTML stopped rendering per-collection results correctly.
*/
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, markImageVerified, pool } from "../utils/db";
import { contentHash } from "../utils/hash";
/** Storefront origin; collection and product URLs in this file are built from it. */
const BASE_URL = "https://atgbics.com";
/** Request headers sent with every products.json fetch (browser-like User-Agent, JSON Accept). */
const HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
Accept: "application/json",
};
// Page size passed as `limit`; a page shorter than this is treated as the last page.
const LIMIT = 250; // Shopify products.json max per page
// Hard upper bound on the pagination loop for a single collection.
const MAX_PAGES_PER_CAT = 40; // 40 × 250 = 10,000 products per collection
/**
 * Collections for transceiver products — discovered 2026-05-06 via /collections.json
 *
 * Each entry names a Shopify collection handle plus the form factor and speed
 * used as fallbacks when a product's own title/handle does not reveal them.
 * `speed` is only used for logging in this file; `speedGbps` is the value fed
 * to speed detection as the fallback.
 *
 * NOTE(review): the "sfp-100m" collection is labelled 1G / speedGbps 1 — confirm
 * this is intentional for 100 Mbit/s modules.
 */
const CATEGORIES = [
// Core speeds by form factor
{ handle: "compatible-transceivers-sfp-1-25g", formFactor: "SFP", speed: "1G", speedGbps: 1 },
{ handle: "compatible-transceiver-sfp-bidi-1-25g", formFactor: "SFP", speed: "1G", speedGbps: 1 },
{ handle: "compatible-transceivers-sfp-100m", formFactor: "SFP", speed: "1G", speedGbps: 1 },
{ handle: "compatible-transceivers-sfpp-10g", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
{ handle: "compatible-transceivers-sfpp-bidi-10g", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
{ handle: "compatible-transceivers-sfpp-cwdm-10g", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
{ handle: "compatible-transceivers-sfp-dwdm-10g", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
{ handle: "compatible-transceiver-sfp-25g", formFactor: "SFP28", speed: "25G", speedGbps: 25 },
{ handle: "compatible-transceivers-qsfpp-40gbps", formFactor: "QSFP+", speed: "40G", speedGbps: 40 },
{ handle: "compatible-transceivers-qsfp28-100gbps",formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
{ handle: "400gbase-products", formFactor: "QSFP-DD",speed: "400G", speedGbps: 400 },
];
/** Subset of a Shopify variant object as it appears in a products.json response. */
interface ShopifyVariant {
// Price as a decimal string; parsed with parseFloat and stored with currency "GBP".
price: string;
// Not read anywhere in the visible code.
compare_at_price?: string | null;
// Not read anywhere in the visible code (stock level is currently hard-coded).
available?: boolean;
}
/** Subset of a Shopify product object as it appears in a products.json response. */
interface ShopifyProduct {
// Display title; normalised into the stored product name and scanned for specs.
title: string;
// URL slug; used to build the product URL and to de-duplicate across collections.
handle: string;
// Only the first variant's price is used.
variants: ShopifyVariant[];
// Only the first image's src is considered.
images?: Array<{ src: string }>;
// Not read anywhere in the visible code.
tags?: string[];
}
/** Normalised product record produced by parseShopifyProduct and written to the database. */
interface AtgbicsProduct {
// Upper-cased leading token of the product name (see extractPartNumber).
partNumber: string;
// Title with "®" removed and whitespace collapsed.
name: string;
// Numeric price parsed from the first variant.
price: number;
// Set to "GBP" by the parser.
currency: string;
// Set to "in_stock" by the parser.
stockLevel: string;
// `${BASE_URL}/products/${handle}`.
url: string;
// Detected from the product text, else the collection's form factor.
formFactor: string;
// Speed label such as "10G", paired with its numeric value below.
speed: string;
speedGbps: number;
// Reach label and its value in metres; both absent when no reach was detected.
reachLabel?: string;
reachMeters?: number;
// "SMF", "MMF", "Copper", or "" when nothing matched.
fiberType?: string;
// Wavelength text in nm, "N/A" for copper, or "" when nothing matched.
wavelength?: string;
// First image URL unless it contains "no-image".
imageUrl?: string;
}
/**
 * Pause for the given number of milliseconds.
 *
 * @param ms Delay before the returned promise resolves.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Guess the transceiver form factor from free text.
 *
 * Rules are evaluated in order and the first hit wins, so more specific
 * names (e.g. "qsfp28") are listed before the generic ones they contain.
 *
 * @param text Product title/handle text to scan (case-insensitive).
 * @param fallback Value returned when no rule matches.
 * @returns The detected form factor label, or `fallback`.
 */
function detectFormFactor(text: string, fallback: string): string {
  const haystack = text.toLowerCase();
  const hasAny = (...needles: string[]): boolean =>
    needles.some((needle) => haystack.includes(needle));

  const rules: ReadonlyArray<readonly [() => boolean, string]> = [
    [() => hasAny("osfp"), "OSFP"],
    [() => hasAny("qsfp-dd", "qsfp dd"), "QSFP-DD"],
    [() => hasAny("qsfp56"), "QSFP56"],
    [() => hasAny("qsfp28"), "QSFP28"],
    [() => hasAny("qsfp+", "qsfp plus"), "QSFP+"],
    [() => hasAny("sfp28"), "SFP28"],
    [() => hasAny("sfp+", "sfp-plus", "sfpplus"), "SFP+"],
    // Plain "sfp" only counts when the text is not talking about a QSFP variant.
    [() => hasAny("sfp") && !hasAny("qsfp"), "SFP"],
    [() => hasAny("xfp"), "XFP"],
    [() => hasAny("cfp2"), "CFP2"],
    [() => hasAny("cfp"), "CFP"],
  ];

  const hit = rules.find(([applies]) => applies());
  return hit ? hit[1] : fallback;
}
/**
 * Guess the line rate from free text.
 *
 * Patterns are tried from the fastest rate downwards and the first hit wins.
 * The 25G rule refuses to match when the digits are the fractional part of
 * "1.25G" (or its URL-slug spelling "1-25g"); without that guard every
 * 1.25 Gbit/s module would be reported as 25G and the 1.25G rule could never
 * fire for such text.
 *
 * @param text Product title/handle text to scan (case-insensitive).
 * @param fallbackGbps Rate in Gbit/s to report when nothing matches.
 * @returns The speed label (e.g. "10G") and its numeric value in Gbit/s.
 */
function detectSpeed(text: string, fallbackGbps: number): { speed: string; speedGbps: number } {
  const patterns: ReadonlyArray<readonly [RegExp, string, number]> = [
    [/800\s*g/i, "800G", 800],
    [/400\s*g/i, "400G", 400],
    [/200\s*g/i, "200G", 200],
    [/100\s*g/i, "100G", 100],
    [/40\s*g/i, "40G", 40],
    // Skip "25g" when it is the tail of "1.25g" or of a standalone "1-25g".
    [/(?<!1\.|\b1-)25\s*g/i, "25G", 25],
    [/10\s*g/i, "10G", 10],
    [/1000\s*base/i, "1G", 1],
    // 1.25 Gbit/s line rate, written with a dot or (in slugs) a hyphen.
    [/1\.25\s*g|\b1-25\s*g/i, "1G", 1],
    [/1\s*g\b/i, "1G", 1],
  ];
  for (const [pattern, speed, speedGbps] of patterns) {
    if (pattern.test(text)) return { speed, speedGbps };
  }
  return { speed: fallbackGbps + "G", speedGbps: fallbackGbps };
}
/**
 * Extract the optical reach from free text.
 *
 * Resolution order:
 *  1. If the text states a distance *range* ("10-40km", "10 to 40 km"),
 *     no single reach is reported.
 *  2. The first explicit positive distance ("550m", "10 km", "0.5m") is used.
 *  3. Otherwise a nominal reach is implied from an upper-case optic code
 *     (LR, ER, ZR, SR, DR, FR, optionally suffixed with "4").
 *
 * The range separator is an explicit character class (hyphen, en dash,
 * em dash). An empty alternative in that position would let the range test
 * succeed on any multi-digit distance such as "10km" and suppress the result.
 *
 * @param text Product title/handle text to scan.
 * @returns The reach label and its value in metres, or `undefined` when the
 *          reach is a range or cannot be determined.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const rangePattern =
    /\b\d+(?:[.,]\d+)?\s*(?:[-\u2013\u2014]|to|bis)\s*\d+(?:[.,]\d+)?\s*(?:m|km)\b/i;
  if (rangePattern.test(text)) return undefined;

  // Explicit distances: take the first one with a usable positive value.
  for (const match of text.matchAll(/\b(\d+(?:\.\d+)?)\s*(km|m)\b/gi)) {
    const value = parseFloat(match[1]);
    if (!Number.isFinite(value) || value <= 0) continue;
    const unit = match[2].toLowerCase();
    // Sub-metre lengths are clamped to 1 m so the stored value is never 0.
    const meters = unit === "km" ? Math.round(value * 1000) : Math.max(1, Math.round(value));
    return { label: `${value}${unit}`, meters };
  }

  // No explicit distance: fall back to the nominal reach implied by the optic code.
  // These are intentionally case-sensitive so lower-case slug fragments do not match.
  const codePatterns: ReadonlyArray<readonly [RegExp, string, number]> = [
    [/\bLR4?\b/, "10km", 10000],
    [/\bER4?\b/, "40km", 40000],
    [/\bZR4?\b/, "80km", 80000],
    [/\bSR4?\b/, "300m", 300],
    [/\bDR4?\b/, "500m", 500],
    [/\bFR4?\b/, "2km", 2000],
  ];
  for (const [pattern, label, meters] of codePatterns) {
    if (pattern.test(text)) return { label, meters };
  }
  return undefined;
}
/**
 * Classify the transmission medium from free text.
 *
 * Checks run in priority order: single-mode indicators, multi-mode
 * indicators, copper, then active optical cables (reported as MMF).
 *
 * Two-letter optic codes (LX, LR, ER, ZR, DR, FR, SX, SR) are matched with
 * letter-boundary lookarounds, so they are recognised at the very start or
 * end of the text and are not triggered by ordinary words that merely begin
 * with the same letters (e.g. "from", "free", "driver").
 *
 * @param text Product title/handle text to scan (case-insensitive).
 * @returns "SMF", "MMF", "Copper", or "" when nothing matches.
 */
function detectFiber(text: string): string {
  const singleMode =
    /single.?mode|smf|(?<![a-z])(?:lx|lr|er|zr|dr\d?|fr\d?)(?![a-z])|bidi|cwdm|dwdm|psm4|2dr4|2fr4/i;
  if (singleMode.test(text)) return "SMF";

  const multiMode = /multi.?mode|mmf|(?<![a-z])(?:sx|sr)(?![a-z])/i;
  if (multiMode.test(text)) return "MMF";

  if (/copper|dac|twinax|rj.?45|base-t/i.test(text)) return "Copper";
  if (/aoc|active.?optical/i.test(text)) return "MMF";
  return "";
}
/**
 * Determine the operating wavelength (in nm, as text) from free text.
 *
 * Copper products report "N/A". An explicit "<digits>nm" value wins next,
 * with "1311" normalised to "1310". Failing that, the wavelength is implied
 * from an explicit optic family code only, so stray part-number digits are
 * never mistaken for a wavelength.
 *
 * @param text Product title/handle text to scan (case-insensitive).
 * @returns A wavelength string, a comma-separated list for CWDM4, "N/A" for
 *          copper, or "" when nothing can be determined.
 */
function detectWavelength(text: string): string {
  const isCopper = /copper|dac|twinax|base-t|rj.?45/i.test(text);
  if (isCopper) return "N/A";

  const explicit = /(\d{3,4})\s*nm/i.exec(text);
  if (explicit !== null) {
    const digits = explicit[1];
    return digits === "1311" ? "1310" : digits;
  }

  // Family codes, checked in priority order.
  const families: ReadonlyArray<readonly [RegExp, string]> = [
    [/\bCWDM4\b/i, "1271,1291,1311,1331"],
    [/\b(?:SR|SR4|SR8|SRBD|VR|VR4|ESR4|CSR4)\b/i, "850"],
    [/\b(?:DR|DR4|DR8|FR|FR4|FR8|LR|LR4|ER|ER4|PSM4|2DR4|2FR4)\b/i, "1310"],
  ];
  const family = families.find(([code]) => code.test(text));
  return family === undefined ? "" : family[1];
}
/**
 * Pull the OEM part number off the front of an ATGBICS product name.
 *
 * Names follow "{OEM_PN} {Vendor}® Compatible Transceiver {Specs}", e.g.
 * "SFP-10G-SR Cisco® Compatible Transceiver SFP+ 10GBase-SR ...".
 *
 * The leading run of part-number characters is used when it is 3–60
 * characters long. Otherwise the first whitespace-delimited token
 * (upper-cased, capped at 60 characters) is used, and if that is empty the
 * first 60 characters of the name are returned unchanged.
 *
 * @param name Cleaned product name.
 * @returns The upper-cased part number, or a best-effort fallback.
 */
function extractPartNumber(name: string): string {
  const maxLength = 60;

  const leadingRun = /^([A-Z0-9][A-Z0-9._\-/+]+)/i.exec(name);
  const candidate = leadingRun === null ? undefined : leadingRun[1];
  if (candidate !== undefined && candidate.length >= 3 && candidate.length <= maxLength) {
    return candidate.toUpperCase();
  }

  const firstToken = name.split(/\s+/)[0];
  const shortened =
    firstToken === undefined ? undefined : firstToken.toUpperCase().slice(0, maxLength);
  if (shortened) return shortened;
  return name.slice(0, maxLength);
}
/**
 * Convert one Shopify product (from products.json) into an AtgbicsProduct.
 *
 * The payload comes from an external API, so its shape is checked before
 * use: an item without a string title/handle, without a first-variant price,
 * or with an implausible price yields `null` instead of throwing.
 *
 * @param sp Raw product object from the API response.
 * @param cat Collection the product was listed in; supplies the fallback
 *            form factor and speed when the product text does not reveal them.
 * @returns The normalised product, or `null` when the item should be skipped
 *          (malformed, price missing/≤ 0/> 100000, or name shorter than 5 chars).
 */
function parseShopifyProduct(
  sp: ShopifyProduct,
  cat: typeof CATEGORIES[number]
): AtgbicsProduct | null {
  // Reject structurally broken items up front rather than throwing mid-parse.
  if (!sp || typeof sp.title !== "string" || typeof sp.handle !== "string" || sp.handle.length === 0) {
    return null;
  }

  const priceStr = sp.variants?.[0]?.price;
  const price = priceStr ? parseFloat(priceStr) : 0;
  // `!price` also rejects NaN from an unparseable price string.
  if (!price || price <= 0 || price > 100000) return null;

  const name = sp.title.replace(/®/g, "").replace(/\s+/g, " ").trim();
  if (name.length < 5) return null;

  // The handle is appended so spec keywords present only in the slug are seen too.
  const fullText = `${name} ${sp.handle}`;
  const speedInfo = detectSpeed(fullText, cat.speedGbps);
  const ff = detectFormFactor(fullText, cat.formFactor);
  const reach = detectReach(fullText);
  const partNumber = extractPartNumber(name);

  // Image URL — first non-placeholder image
  const imageUrl = sp.images?.find(
    (img) => typeof img?.src === "string" && img.src.length > 0 && !img.src.includes("no-image"),
  )?.src;

  return {
    partNumber,
    name,
    price,
    currency: "GBP",
    // NOTE(review): stock is always reported as "in_stock"; the variant's
    // `available` flag is not consulted. Confirm which stock-level values the
    // database accepts before mapping it.
    stockLevel: "in_stock",
    url: `${BASE_URL}/products/${sp.handle}`,
    formFactor: ff,
    speed: speedInfo.speed,
    speedGbps: speedInfo.speedGbps,
    reachLabel: reach?.label,
    reachMeters: reach?.meters,
    fiberType: detectFiber(fullText),
    wavelength: detectWavelength(fullText),
    imageUrl,
  };
}
/**
 * Fetch one page of a collection's products.json and return its product list.
 *
 * The response body is validated instead of being trusted via a type
 * assertion, so a malformed payload surfaces as an Error here rather than as
 * a failure later in the caller's product loop.
 *
 * @param url Fully-qualified products.json URL (including limit/page query).
 * @returns The `products` array, or an empty array when the key is absent.
 * @throws Error on a non-2xx status, a non-object JSON body, or a `products`
 *         value that is not an array. Network/timeout failures from `fetch`
 *         (20 s abort signal) and JSON parse errors propagate unchanged.
 */
async function fetchProductsJson(url: string): Promise<ShopifyProduct[]> {
  const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(20000) });
  if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`);

  const data: unknown = await resp.json();
  if (typeof data !== "object" || data === null) {
    throw new Error(`Unexpected JSON payload for ${url}`);
  }

  const products = (data as { products?: unknown }).products;
  if (products === undefined || products === null) return [];
  if (!Array.isArray(products)) {
    throw new Error(`Unexpected "products" value for ${url}`);
  }
  return products as ShopifyProduct[];
}
/**
 * Scrape every configured ATGBICS collection and persist the results.
 *
 * For each collection the products.json pages are fetched in order until a
 * page is empty, shorter than LIMIT, fails to load, or MAX_PAGES_PER_CAT is
 * reached. Each product not already seen in an earlier collection is parsed,
 * upserted as a transceiver, given a price observation and, when an image is
 * present, an image record. One second is waited between pages and between
 * collections.
 *
 * Failures are contained per item: a fetch error ends the current collection,
 * and a parse or database error skips the current product. Errors from
 * `ensureVendor` propagate to the caller.
 *
 * @returns A promise that resolves when all collections have been processed.
 */
export async function scrapeAtgbics(): Promise<void> {
  console.log("=== ATGBICS Scraper Starting (products.json API) ===\n");

  // Safe for any thrown value: non-Error rejections must not crash the handler itself.
  const shortMessage = (err: unknown): string =>
    (err instanceof Error ? err.message : String(err)).slice(0, 80);

  const vendorId = await ensureVendor(
    "ATGBICS",
    "compatible",
    "https://atgbics.com",
    "https://atgbics.com/collections/compatible-transceivers-sfpp-10g",
  );

  let totalProducts = 0;
  let priceUpdates = 0;
  let imageUpdates = 0;
  // Handles already processed, so a product listed in several collections is saved once.
  const seenHandles = new Set<string>();

  for (const cat of CATEGORIES) {
    console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.handle}] ---`);
    let catTotal = 0;

    for (let page = 1; page <= MAX_PAGES_PER_CAT; page++) {
      const pageUrl = `${BASE_URL}/collections/${cat.handle}/products.json?limit=${LIMIT}&page=${page}`;
      let shopifyProducts: ShopifyProduct[];
      try {
        shopifyProducts = await fetchProductsJson(pageUrl);
      } catch (err) {
        console.warn(` Page ${page} error: ${shortMessage(err)}`);
        break;
      }

      if (shopifyProducts.length === 0) {
        if (page === 1) console.log(` Empty collection — skipping`);
        else console.log(` Page ${page}: 0 products — done`);
        break;
      }
      console.log(` Page ${page}: ${shopifyProducts.length} products`);

      for (const sp of shopifyProducts) {
        // Ignore entries that are not usable product objects.
        if (!sp || typeof sp.handle !== "string") continue;

        // Skip cross-category duplicates
        if (seenHandles.has(sp.handle)) continue;
        seenHandles.add(sp.handle);

        // A malformed item must only cost us that item, not the whole run.
        let product: ReturnType<typeof parseShopifyProduct>;
        try {
          product = parseShopifyProduct(sp, cat);
        } catch (err) {
          console.warn(` Parse error ${sp.handle}: ${shortMessage(err)}`);
          continue;
        }
        if (!product) continue;

        try {
          const txId = await findOrCreateScrapedTransceiver({
            partNumber: product.partNumber,
            vendorId,
            productUrl: product.url,
            formFactor: product.formFactor,
            speedGbps: product.speedGbps,
            speed: product.speed,
            reachMeters: product.reachMeters,
            reachLabel: product.reachLabel,
            fiberType: product.fiberType,
            wavelengths: product.wavelength,
            category: "Compatible",
          });

          const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
          const updated = await upsertPriceObservation({
            transceiverId: txId,
            sourceVendorId: vendorId,
            price: product.price,
            currency: product.currency,
            stockLevel: product.stockLevel,
            url: product.url,
            contentHash: hash,
          });
          if (updated) priceUpdates++;

          if (product.imageUrl) {
            const updatedImage = await markImageVerified(txId, product.imageUrl);
            if (updatedImage) imageUpdates++;
          }

          totalProducts++;
          catTotal++;
        } catch (err) {
          console.warn(` DB error ${product.partNumber}: ${shortMessage(err)}`);
        }
      }

      // If we got fewer products than the limit, we're on the last page
      if (shopifyProducts.length < LIMIT) break;
      await sleep(1000);
    }

    console.log(` Category total: ${catTotal} new products saved`);
    await sleep(1000);
  }

  console.log(`\n=== ATGBICS Complete: ${totalProducts} products, ${priceUpdates} price updates, ${imageUpdates} images ===`);
}
// CLI entry point: when this module is run directly, scrape once and close
// the database pool. Any failure is logged as fatal and exits with status 1.
if (require.main === module) {
  void (async () => {
    try {
      await scrapeAtgbics();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  })();
}