Rene Fichtmueller 1026787318 feat: add proxy network, image backfill, and scraper improvements
- Add TIP Proxy Network (packages/proxy-agent): SOCKS5 proxy agent
  for residential IP bypass of CloudFront WAF blocks
- Add /api/proxy/* routes: node registration, heartbeat, load balancing
- Add image extraction to Flexoptix catalog scraper (GraphQL small_image)
- Add image extraction to Optcore scraper (Playwright gallery img)
- Fix Fluxlight price scraping (BigCommerce HTML structure: data-product-price-without-tax)
- Add SmartOptics scraper (8 DWDM/coherent products, og:image extraction)
- Fix findOrCreateScrapedTransceiver to update image_url for existing records
- Add image backfill script (backfill-images.ts): 178 Flexoptix images added
- Fix DB connection pool: max 5, idleTimeoutMillis 10s (was unlimited, caused >100 connections)
- Add proxy.ts utility for scraper proxy rotation
2026-04-03 21:13:03 +02:00

191 lines
7.1 KiB
TypeScript

/**
* SmartOptics Scraper — Premium coherent/DWDM transceiver manufacturer
*
* smartoptics.com — WordPress site, no prices (B2B, RFQ model).
* Scrapes product catalog for specs, images, datasheets.
* Products listed at /products/optical-transceivers/ → individual /product/SKU/ pages.
*/
import { pool, findOrCreateScrapedTransceiver, ensureVendor } from "../utils/db";
// Site origin; the catalog URL below is built from it.
const BASE = "https://smartoptics.com";
// First page of the transceiver catalog. Further pages are requested by the
// scraper as `${CATALOG_URL}page/<n>/`.
const CATALOG_URL = `${BASE}/products/optical-transceivers/`;
// Headers attached to every request made by fetchPage(): a desktop Chrome
// User-Agent string and an Accept header asking for HTML.
const HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
Accept: "text/html,application/xhtml+xml",
};
/**
 * Returns a promise that settles (with no value) once `ms` milliseconds have
 * elapsed. Used to pace consecutive HTTP requests.
 *
 * @param ms Delay in milliseconds, handed straight to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Guesses the transceiver form factor and nominal line rate from free-form
 * product text (case-insensitive substring matching).
 *
 * Rules are evaluated top to bottom and the first hit wins, so explicit form
 * factor tokens are tested before the looser speed-only heuristics:
 *  - any 800G marker ("sfp-dd800", which also covers "qsfp-dd800", or "800g",
 *    which also covers "800ge") wins over a plain "qsfp-dd" token, so
 *    "QSFP-DD 800G" is no longer reported as 400G;
 *  - "qsfp112" / "qsfp56" are tested before the generic "400g + qsfp" rule, so
 *    "400G QSFP112" is no longer reported as QSFP-DD.
 *
 * @param text Product name or description.
 * @returns Form factor label, speed label and speed in Gbit/s. Falls back to
 *          QSFP28 / 100G when nothing matches.
 */
function detectFormFactor(text: string): { formFactor: string; speed: string; speedGbps: number } {
  const t = text.toLowerCase();
  const has = (needle: string): boolean => t.includes(needle);

  if (has("sfp-dd800") || has("800g")) return { formFactor: "QSFP-DD", speed: "800G", speedGbps: 800 };
  if (has("qsfp-dd")) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 };
  if (has("qsfp112")) return { formFactor: "QSFP112", speed: "400G", speedGbps: 400 };
  if (has("qsfp56")) return { formFactor: "QSFP56", speed: "200G", speedGbps: 200 };
  // Speed-only heuristic: a 400G QSFP-family part with no more specific token.
  if (has("400g") && has("qsfp")) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 };
  // "100g" also covers "100ge".
  if (has("qsfp28") || has("100g")) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
  if (has("sfp28") || has("25g")) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 };
  if (has("qsfp+") || has("40g")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 };
  if (has("sfp+") || has("10g")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
  if (has("sfp")) return { formFactor: "SFP", speed: "1G", speedGbps: 1 };

  // Nothing recognised: keep the historical default.
  return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
}
/**
 * Extracts a reach figure such as "10km", "1.5 km" or "300m" from product text.
 *
 * A kilometre value is preferred over a metre value when both appear. Decimal
 * values are parsed as a whole ("1.5km" is 1.5 km, not "5km"); the label keeps
 * the decimal while `meters` is rounded to the nearest integer so the field
 * stays integral, as it was before decimals were supported.
 *
 * @param text Product name or description.
 * @returns The reach label and its length in metres, or `undefined` when no
 *          "<number>km" / "<number>m" token is present.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const kmMatch = text.match(/(\d+(?:\.\d+)?)\s*km/i);
  if (kmMatch) {
    const km = parseFloat(kmMatch[1]);
    return { label: `${km}km`, meters: Math.round(km * 1000) };
  }

  // `\b` after "m" keeps units such as "mm" or "mhz" from matching; "nm" never
  // matches because the digits must be followed directly by "m".
  const mMatch = text.match(/(\d+(?:\.\d+)?)\s*m\b/i);
  if (mMatch) {
    const m = parseFloat(mMatch[1]);
    return { label: `${m}m`, meters: Math.round(m) };
  }

  return undefined;
}
/**
 * Classifies the fibre type implied by product text.
 *
 * Single-mode keywords take precedence; "MMF" is returned only when a
 * multi-mode keyword is present and no single-mode keyword is. Everything else
 * is reported as "SMF", because SmartOptics is almost exclusively SMF/coherent.
 *
 * @param text Product name or description.
 * @returns "SMF" or "MMF".
 */
function detectFiber(text: string): string {
  const singleModeHint = /dwdm|cwdm|coherent|coh|single.?mode|smf/i;
  const multiModeHint = /multi.?mode|mmf|sr/i;

  const multiModeOnly = !singleModeHint.test(text) && multiModeHint.test(text);
  return multiModeOnly ? "MMF" : "SMF";
}
/**
 * Downloads a page and returns its body as text.
 *
 * The request carries the module-level HEADERS and is aborted after
 * 30 seconds.
 *
 * @param url Absolute URL to fetch.
 * @returns The response body.
 * @throws Error with message "HTTP <status> for <url>" when the response
 *         status is not OK; fetch/abort errors propagate unchanged.
 */
async function fetchPage(url: string): Promise<string> {
  const requestOptions = { headers: HEADERS, signal: AbortSignal.timeout(30000) };
  const response = await fetch(url, requestOptions);
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
/**
 * Collects the distinct product-page links found in a catalog page.
 *
 * Only double-quoted absolute hrefs of the form
 * `http(s)://smartoptics.com/product/...` are considered. Each link is
 * normalised so that variants of the same page collapse into one entry:
 * any query string or fragment is dropped and exactly one trailing slash is
 * kept. Without dropping them, a link such as `.../product/x/?ref=nav` would
 * become `.../product/x/?ref=nav/`, which is both a duplicate of the clean
 * link and a source of bogus SKUs for code that reads the last path segment.
 *
 * @param html Raw HTML of a catalog page.
 * @returns Normalised product URLs in first-seen order, without duplicates.
 */
function extractProductUrls(html: string): string[] {
  const hrefPattern = /href="(https?:\/\/smartoptics\.com\/product\/[^"]+)"/gi;
  const seen = new Set<string>();

  for (const match of html.matchAll(hrefPattern)) {
    // Keep only the part before the first "?" or "#".
    const pathOnly = match[1].split(/[?#]/, 1)[0];
    seen.add(pathOnly.replace(/\/$/, "") + "/");
  }

  return Array.from(seen);
}
/**
 * Fields derived from a single product page by scrapeProductPage().
 */
interface ProductData {
// Last path segment of the product URL, upper-cased; falls back to the name
// with whitespace runs replaced by "-".
sku: string;
// Text of the first <h1>, or the og:title value with a trailing
// " | Smartoptics" removed.
name: string;
// URL the product page was fetched from.
url: string;
// og:image value, or the src of a product <img> under wp-content/uploads;
// absent when neither is found.
imageUrl?: string;
// formFactor / speed / speedGbps come from detectFormFactor(name).
formFactor: string;
speed: string;
speedGbps: number;
// reachLabel / reachMeters come from detectReach(name); both are absent when
// the name carries no reach figure.
reachLabel?: string;
reachMeters?: number;
// Result of detectFiber(name).
fiberType: string;
// True when the name or the first 3000 characters of the page match one of
// the coherent/DWDM keywords tested in scrapeProductPage().
coherent: boolean;
// "DWDM" or "CWDM" when the name contains that keyword, otherwise absent.
wdmType?: string;
}
/**
 * Fetches one product page and derives its catalog fields from the markup.
 *
 * The product name is read from the first <h1> (or og:title as a fallback);
 * form factor, speed, reach and fibre type are all inferred from that name.
 * The coherent flag additionally looks at the first 3000 characters of the
 * page.
 *
 * @param url Product page URL.
 * @returns The extracted data, or `null` when the page cannot be fetched or
 *          no product name is found. Failures are logged with console.warn
 *          and never thrown.
 */
async function scrapeProductPage(url: string): Promise<ProductData | null> {
  try {
    const html = await fetchPage(url);

    const titleMatch =
      html.match(/<h1[^>]*>([^<]+)<\/h1>/) ?? html.match(/og:title" content="([^"]+)"/);
    if (!titleMatch) return null;
    const name = titleMatch[1].trim().replace(/ \| Smartoptics$/, "");
    if (!name) return null;

    // SKU is the last non-empty path segment; the name is only a fallback.
    const slug = url.split("/").filter(Boolean).pop();
    const sku = slug?.toUpperCase() || name.replace(/\s+/g, "-");

    const ogImage = html.match(/property="og:image" content="([^"]+)"/);
    const imageMatch =
      ogImage ??
      html.match(/<img[^>]+src="([^"]*wp-content\/uploads[^"]*\.(?:png|jpg|webp))"[^>]* class="[^"]*product/i);
    const imageUrl = imageMatch?.[1];

    let wdmType: string | undefined;
    if (/dwdm/i.test(name)) {
      wdmType = "DWDM";
    } else if (/cwdm/i.test(name)) {
      wdmType = "CWDM";
    }

    const isCoherent = /coherent|coh-t|coh\.|dwdm|dp-qpsk|qpsk|cfp2/i.test(name + html.slice(0, 3000));
    const { formFactor, speed, speedGbps } = detectFormFactor(name);
    const reach = detectReach(name);

    const product: ProductData = {
      sku,
      name,
      url,
      imageUrl,
      formFactor,
      speed,
      speedGbps,
      reachLabel: reach?.label,
      reachMeters: reach?.meters,
      fiberType: detectFiber(name),
      coherent: isCoherent,
      wdmType,
    };
    return product;
  } catch (err) {
    const reason = (err as Error).message;
    console.warn(` Failed ${url}: ${reason}`);
    return null;
  }
}
/**
 * Runs the SmartOptics scrape end to end.
 *
 * Steps:
 *  1. Obtain the vendor id from ensureVendor().
 *  2. Walk up to 10 catalog pages, collecting distinct product URLs. Paging
 *     stops at the first page with no product links or the first page that
 *     fails to load (the failure reason is logged).
 *  3. Scrape every product page and pass the derived fields to
 *     findOrCreateScrapedTransceiver().
 *
 * Requests are spaced 1.5 s apart, including after a product page that failed,
 * so a struggling or rate-limiting server is not hit back-to-back.
 *
 * Per-product failures are logged and skipped; only ensureVendor() errors
 * propagate to the caller. No prices are collected (the site has none).
 */
export async function scrapeSmartOptics(): Promise<void> {
  const MAX_CATALOG_PAGES = 10;
  const REQUEST_DELAY_MS = 1500;
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  console.log("=== SmartOptics Scraper Starting ===\n");
  console.log("Note: SmartOptics is B2B — no public prices. Scraping specs + images only.\n");

  const vendorId = await ensureVendor(
    "SmartOptics",
    "manufacturer",
    "https://www.smartoptics.com",
    CATALOG_URL
  );

  const productUrls = new Set<string>();
  for (let page = 1; page <= MAX_CATALOG_PAGES; page++) {
    const url = page === 1 ? CATALOG_URL : `${CATALOG_URL}page/${page}/`;
    let urls: string[];
    try {
      urls = extractProductUrls(await fetchPage(url));
    } catch (err) {
      // A missing page past the end of the catalog is expected, but a failure
      // on page 1 is not: always say why paging stopped.
      console.warn(` Catalog page ${page} not loaded, stopping pagination: ${describeError(err)}`);
      break;
    }
    if (urls.length === 0) break;
    for (const u of urls) productUrls.add(u);
    console.log(` Catalog page ${page}: ${urls.length} products`);
    await sleep(REQUEST_DELAY_MS);
  }

  console.log(`\nTotal product URLs: ${productUrls.size}`);
  if (productUrls.size === 0) {
    console.log("No products found — site may have changed structure");
    return;
  }

  let saved = 0;
  let withImages = 0;
  for (const url of productUrls) {
    const product = await scrapeProductPage(url);
    if (product) {
      try {
        await findOrCreateScrapedTransceiver({
          partNumber: product.sku,
          vendorId,
          formFactor: product.formFactor,
          speedGbps: product.speedGbps,
          speed: product.speed,
          reachMeters: product.reachMeters,
          reachLabel: product.reachLabel,
          fiberType: product.fiberType,
          // Only DWDM parts get the DWDM label; CWDM parts previously did too.
          wavelengths: product.wdmType === "DWDM" ? "DWDM-tunable" : undefined,
          category: product.coherent ? "Coherent" : "DataCenter",
          imageUrl: product.imageUrl,
        });
        saved++;
        if (product.imageUrl) withImages++;
        console.log(` ${product.sku}: ${product.name.slice(0, 60)}`);
      } catch (err) {
        console.warn(` Error saving ${product.sku}: ${describeError(err).slice(0, 80)}`);
      }
    }
    // Pause after every request, successful or not.
    await sleep(REQUEST_DELAY_MS);
  }

  console.log(`\n=== SmartOptics Complete: ${saved} products, ${withImages} with images ===`);
}
// Command-line entry point: run the scraper only when this file is executed
// directly, close the DB pool afterwards, and exit with status 1 on any error.
if (require.main === module) {
  void (async () => {
    try {
      await scrapeSmartOptics();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  })();
}