- Add TIP Proxy Network (packages/proxy-agent): SOCKS5 proxy agent for residential IP bypass of CloudFront WAF blocks - Add /api/proxy/* routes: node registration, heartbeat, load balancing - Add image extraction to Flexoptix catalog scraper (GraphQL small_image) - Add image extraction to Optcore scraper (Playwright gallery img) - Fix Fluxlight price scraping (BigCommerce HTML structure: data-product-price-without-tax) - Add SmartOptics scraper (8 DWDM/coherent products, og:image extraction) - Fix findOrCreateScrapedTransceiver to update image_url for existing records - Add image backfill script (backfill-images.ts): 178 Flexoptix images added - Fix DB connection pool: max 5, idleTimeoutMillis 10s (was unlimited, caused >100 connections) - Add proxy.ts utility for scraper proxy rotation
191 lines
7.1 KiB
TypeScript
191 lines
7.1 KiB
TypeScript
/**
 * SmartOptics Scraper — Premium coherent/DWDM transceiver manufacturer
 *
 * smartoptics.com — WordPress site, no prices (B2B, RFQ model).
 * Scrapes product catalog for specs, images, datasheets.
 * Products listed at /products/optical-transceivers/ → individual /product/SKU/ pages.
 */
|
|
import { pool, findOrCreateScrapedTransceiver, ensureVendor } from "../utils/db";
|
|
|
|
/** Origin of the SmartOptics site; every scraper URL is built on top of it. */
const BASE = "https://smartoptics.com";

/** Address of the optical-transceiver catalog listing. */
const CATALOG_URL = BASE + "/products/optical-transceivers/";

/** Headers attached to each page request: a desktop Chrome User-Agent and an HTML Accept value. */
const HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml",
};
|
|
|
|
/**
 * Pauses the calling async flow.
 *
 * @param ms Delay in milliseconds, handed straight to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
|
|
|
/**
 * Infers the transceiver form factor and nominal line rate from free-form
 * product text (typically the product name).
 *
 * Checks run top to bottom and the first hit wins, so more specific markers
 * (e.g. "qsfp28") must stay ahead of markers they contain (e.g. "sfp").
 *
 * Rate tokens such as "25g" only count when they are not the tail of a longer
 * number: "1.25G" or "4.25G" must not be classified as 25G. Any "800G" token
 * selects the 800G entry, not just the literal "800ge".
 *
 * @param text Product name or description; matched case-insensitively.
 * @returns Form factor, speed label and numeric speed in Gbps. When nothing
 *          matches, the QSFP28 / 100G entry is returned as the default.
 */
function detectFormFactor(text: string): { formFactor: string; speed: string; speedGbps: number } {
  const t = text.toLowerCase();

  // True when "<gbps>g" appears and is not preceded by a digit or a decimal point.
  const hasRate = (gbps: number): boolean => new RegExp(`(?<![\\d.])${gbps}g`).test(t);

  // "sfp-dd800" is a substring of "qsfp-dd800", so this single test covers both spellings.
  if (t.includes("sfp-dd800") || hasRate(800)) return { formFactor: "QSFP-DD", speed: "800G", speedGbps: 800 };
  if (t.includes("qsfp-dd") || (hasRate(400) && t.includes("qsfp"))) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 };
  if (t.includes("qsfp112")) return { formFactor: "QSFP112", speed: "400G", speedGbps: 400 };
  if (t.includes("qsfp56")) return { formFactor: "QSFP56", speed: "200G", speedGbps: 200 };
  // "100ge" is covered by the "100g" rate token.
  if (t.includes("qsfp28") || hasRate(100)) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
  if (t.includes("sfp28") || hasRate(25)) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 };
  if (t.includes("qsfp+") || hasRate(40)) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 };
  if (t.includes("sfp+") || hasRate(10)) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
  if (t.includes("sfp")) return { formFactor: "SFP", speed: "1G", speedGbps: 1 };

  return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
}
|
|
|
|
/**
 * Extracts a reach (link distance) from free-form product text.
 *
 * A kilometre value takes precedence over a metre value. Kilometre values may
 * be fractional ("1.5km" → 1500 m); the metre count is rounded to an integer.
 * Metre values are integers only: a fractional value such as "1.5m" is left
 * undetected rather than being misread as "5m".
 *
 * @param text Product name or description.
 * @returns `{ label, meters }` for the first distance found, or `undefined`
 *          when the text contains no recognisable distance.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const kmMatch = text.match(/(\d+(?:\.\d+)?)\s*km/i);
  if (kmMatch) {
    const km = Number.parseFloat(kmMatch[1]);
    return { label: `${km}km`, meters: Math.round(km * 1000) };
  }

  // The lookbehind stops the match from starting inside a decimal number.
  const mMatch = text.match(/(?<![\d.])(\d+)\s*m\b/i);
  if (mMatch) {
    const m = Number.parseInt(mMatch[1], 10);
    return { label: `${m}m`, meters: m };
  }

  return undefined;
}
|
|
|
|
/**
 * Classifies the fibre type implied by product text.
 *
 * Single-mode hints are consulted first and win over multi-mode hints; text
 * with no hint at all is also treated as single-mode.
 *
 * @param text Product name or description; matched case-insensitively.
 * @returns `"MMF"` only when a multi-mode hint is present and no single-mode
 *          hint is; otherwise `"SMF"`.
 */
function detectFiber(text: string): string {
  const singleModeHint = /dwdm|cwdm|coherent|coh|single.?mode|smf/i;
  const multiModeHint = /multi.?mode|mmf|sr/i;

  const multiModeOnly = !singleModeHint.test(text) && multiModeHint.test(text);
  return multiModeOnly ? "MMF" : "SMF";
}
|
|
|
|
/**
 * Downloads a page and returns its body as text.
 *
 * The request carries the module's shared headers and is aborted after
 * 30 seconds.
 *
 * @param url Absolute URL to fetch.
 * @returns The response body.
 * @throws Error with the message `HTTP <status> for <url>` when the response
 *         status is not OK; fetch/abort errors propagate unchanged.
 */
async function fetchPage(url: string): Promise<string> {
  const response = await fetch(url, {
    headers: HEADERS,
    signal: AbortSignal.timeout(30000),
  });

  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
|
|
|
|
/**
 * Collects the distinct product-page URLs linked from a catalog page.
 *
 * Recognised links point at `/product/...` on smartoptics.com, written either
 * as an absolute URL (http or https, with or without `www.`) or as a
 * site-relative path, inside single- or double-quoted `href` attributes.
 * Query strings and fragments are dropped so that variants of the same link
 * collapse into one entry and do not add a bogus trailing path segment.
 * Every returned URL ends with exactly one trailing slash; relative paths are
 * resolved against `https://smartoptics.com`.
 *
 * @param html Raw HTML of a catalog page.
 * @returns Unique product URLs in order of first appearance.
 */
function extractProductUrls(html: string): string[] {
  const urls = new Set<string>();
  // Group 1 captures the link up to (not including) any "?" or "#".
  const linkPattern = /href=["']((?:https?:\/\/(?:www\.)?smartoptics\.com)?\/product\/[^"'?#]+)[^"']*["']/gi;

  for (const match of html.matchAll(linkPattern)) {
    const raw = match[1];
    if (!raw) continue;
    const absolute = raw.startsWith("/") ? `https://smartoptics.com${raw}` : raw;
    urls.add(absolute.replace(/\/+$/, "") + "/");
  }

  return Array.from(urls);
}
|
|
|
|
/** Everything the scraper extracts from a single product page. */
interface ProductData {
  // --- Identity ---
  /** Upper-cased last path segment of the page URL, or the name with whitespace runs replaced by "-". */
  sku: string;
  /** Product title taken from the page's `<h1>` or its og:title meta tag. */
  name: string;
  /** Address of the scraped product page. */
  url: string;

  // --- Classification derived from the name ---
  /** Form factor label, e.g. "QSFP28". */
  formFactor: string;
  /** Speed label, e.g. "100G". */
  speed: string;
  /** Numeric speed in Gbps matching `speed`. */
  speedGbps: number;
  /** "SMF" or "MMF". */
  fiberType: string;
  /** Whether coherent/DWDM markers were found in the name or the start of the page. */
  coherent: boolean;

  // --- Optional details ---
  /** Product image address, when one was found on the page. */
  imageUrl?: string;
  /** Human-readable reach, e.g. "80km". */
  reachLabel?: string;
  /** Reach expressed in metres. */
  reachMeters?: number;
  /** "DWDM" or "CWDM" when the name mentions one of them. */
  wdmType?: string;
}
|
|
|
|
/**
 * Fetches one product page and derives a `ProductData` record from it.
 *
 * The name comes from the first `<h1>` with plain-text content, falling back
 * to the og:title meta tag; the SKU comes from the last URL path segment.
 * Form factor, speed, reach, fibre type and WDM type are all inferred from the
 * name, while the coherent flag also looks at the first 3000 characters of
 * the page.
 *
 * @param url Product page URL.
 * @returns The extracted record, or `null` when the page has no usable name
 *          or when fetching/parsing throws (the failure is logged as a warning).
 */
async function scrapeProductPage(url: string): Promise<ProductData | null> {
  try {
    const html = await fetchPage(url);

    // Title: prefer the visible heading, then the social-sharing title.
    const titleHit = html.match(/<h1[^>]*>([^<]+)<\/h1>/) ?? html.match(/og:title" content="([^"]+)"/);
    let name = "";
    if (titleHit) {
      name = titleHit[1].trim().replace(/ \| Smartoptics$/, "");
    }
    if (name === "") return null;

    // SKU: last non-empty URL segment, upper-cased; otherwise a hyphenated name.
    const segments = url.split("/").filter(Boolean);
    const slug = segments[segments.length - 1];
    const sku = slug?.toUpperCase() || name.replace(/\s+/g, "-");

    // Image: og:image first, then an uploaded image whose class mentions "product".
    const imageHit =
      html.match(/property="og:image" content="([^"]+)"/) ??
      html.match(/<img[^>]+src="([^"]*wp-content\/uploads[^"]*\.(?:png|jpg|webp))"[^>]* class="[^"]*product/i);
    const imageUrl = imageHit ? imageHit[1] : undefined;

    const { formFactor, speed, speedGbps } = detectFormFactor(name);
    const reach = detectReach(name);
    const coherent = /coherent|coh-t|coh\.|dwdm|dp-qpsk|qpsk|cfp2/i.test(name + html.slice(0, 3000));

    let wdmType: string | undefined;
    if (/dwdm/i.test(name)) {
      wdmType = "DWDM";
    } else if (/cwdm/i.test(name)) {
      wdmType = "CWDM";
    }

    return {
      sku,
      name,
      url,
      imageUrl,
      formFactor,
      speed,
      speedGbps,
      reachLabel: reach?.label,
      reachMeters: reach?.meters,
      fiberType: detectFiber(name),
      coherent,
      wdmType,
    };
  } catch (err) {
    console.warn(` Failed ${url}: ${(err as Error).message}`);
    return null;
  }
}
|
|
|
|
/**
 * Runs the full SmartOptics scrape: registers the vendor, walks up to ten
 * catalog pages to collect product URLs, scrapes each product page and stores
 * the result through `findOrCreateScrapedTransceiver`.
 *
 * No prices are collected. Failures on individual products are logged and
 * skipped so one bad page does not abort the run. A 1.5 s pause follows every
 * catalog and product request, including product requests that failed.
 *
 * @returns Resolves when every discovered product has been processed, or
 *          early when the catalog yields no product URLs.
 */
export async function scrapeSmartOptics(): Promise<void> {
  console.log("=== SmartOptics Scraper Starting ===\n");
  console.log("Note: SmartOptics is B2B — no public prices. Scraping specs + images only.\n");

  const vendorId = await ensureVendor(
    "SmartOptics",
    "manufacturer",
    "https://www.smartoptics.com",
    "https://smartoptics.com/products/optical-transceivers/"
  );

  const productUrls = new Set<string>();
  for (let page = 1; page <= 10; page++) {
    try {
      const url = page === 1 ? CATALOG_URL : `${CATALOG_URL}page/${page}/`;
      const html = await fetchPage(url);
      const urls = extractProductUrls(html);
      if (urls.length === 0) break;
      urls.forEach((u) => productUrls.add(u));
      console.log(` Catalog page ${page}: ${urls.length} products`);
      await sleep(1500);
    } catch (err) {
      // A failure on a later page is treated as the end of pagination; a
      // failure on the very first page is reported because it leaves the run
      // with nothing to scrape.
      if (page === 1) {
        console.warn(` Catalog fetch failed: ${(err as Error).message}`);
      }
      break;
    }
  }

  console.log(`\nTotal product URLs: ${productUrls.size}`);
  if (productUrls.size === 0) {
    console.log("No products found — site may have changed structure");
    return;
  }

  let saved = 0;
  let withImages = 0;

  for (const url of productUrls) {
    const product = await scrapeProductPage(url);

    if (product) {
      try {
        await findOrCreateScrapedTransceiver({
          partNumber: product.sku,
          vendorId,
          formFactor: product.formFactor,
          speedGbps: product.speedGbps,
          speed: product.speed,
          reachMeters: product.reachMeters,
          reachLabel: product.reachLabel,
          fiberType: product.fiberType,
          // Only DWDM parts get the DWDM label; CWDM parts must not be stored as "DWDM-tunable".
          wavelengths: product.wdmType === "DWDM" ? "DWDM-tunable" : undefined,
          category: product.coherent ? "Coherent" : "DataCenter",
          imageUrl: product.imageUrl,
        });
        saved++;
        if (product.imageUrl) withImages++;
        console.log(` ✓ ${product.sku} — ${product.name.slice(0, 60)}`);
      } catch (err) {
        console.warn(` Error saving ${product.sku}: ${(err as Error).message.slice(0, 80)}`);
      }
    }

    // Pause after every product request, successful or not, so that a run of
    // failing pages does not turn into back-to-back requests.
    await sleep(1500);
  }

  console.log(`\n=== SmartOptics Complete: ${saved} products, ${withImages} with images ===`);
}
|
|
|
|
// Script entry point: only runs when this file is executed directly rather
// than imported. Closes the DB pool on success; on failure logs the error,
// starts closing the pool and exits with status 1.
if (require.main === module) {
  const exitOnFatal = (err: unknown): void => {
    console.error("Fatal:", err);
    pool.end();
    process.exit(1);
  };

  scrapeSmartOptics()
    .then(() => pool.end())
    .catch(exitOnFatal);
}
|