Scraper changes:
- fs-com.ts v2: Playwright stealth patches + www.fs.com/de/ URL fix (de.fs.com fails DNS resolution with NXDOMAIN).
  Extracts DE-Lager (German warehouse stock), Global-Lager (global warehouse stock), Nachlieferung (backorder),
  units_sold, compatible_brands, price_net.
  Mac-side runner (run-fs-scraper-mac.sh) via SSH tunnel for residential IP access.
  Fast-fail connectivity check on datacenter IPs that are blocked by Cloudflare.
- smartoptics.ts v2: WooCommerce REST API fallback + 8 catalog categories + relative URL fix.
  Previously found only 8 products; now discovers 18+ with the multi-category crawl.
DB layer:
- db.ts: add upsertStockObservation() — writes 10 new stock_observations columns
  (warehouse_de_qty, warehouse_global_qty, backorder_qty, units_sold, compatible_brands,
  price_net, product_url, delivery dates) with a dedup check.
API:
- routes/stock.ts: GET /api/stock, /api/stock/summary, /api/stock/:id
  Warehouse breakdowns per transceiver/vendor with top-sellers and vendor summary.
- routes/review.ts: equivalence review queue (approve/reject/bulk-approve).
- index.ts: register /api/stock and /api/review routes.
Dashboard:
- index.html: 🏭 Stock tab with stat cards (DE-Lager, Global-Lager, Nachlieferung totals),
  top-sellers table, vendor breakdown, recently-restocked events, part-number lookup.
SQL migrations:
- 034: blog-review-tag, 035: price-observations is_anomalous, 036: transceiver-equivalences.
320 lines
12 KiB
TypeScript
/**
 * SmartOptics Scraper — Premium coherent/DWDM transceiver manufacturer
 *
 * smartoptics.com — WordPress/WooCommerce, no prices (B2B, RFQ only).
 * Scrapes product catalog for specs, images, and datasheets.
 *
 * v2 fixes:
 * - Multi-category crawl (coherent, DWDM, access, SFP, QSFP)
 * - Handles both absolute AND relative product URLs
 * - WooCommerce REST API fallback for complete product list
 * - Up to 10 pagination pages per category
 */
import { pool, findOrCreateScrapedTransceiver, ensureVendor } from "../utils/db";

/** Site origin without a trailing slash; every crawl path and API path is appended to it. */
const BASE = "https://smartoptics.com";

/** Browser-like request headers sent with every HTTP request this scraper makes. */
const HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.9",
};

/**
 * All transceiver-related catalog category pages to crawl.
 *
 * Each entry is a root-relative path that ends in "/": the crawler builds
 * page N of a category by appending `page/N/` directly to the path.
 */
const CATALOG_PAGES = [
  "/products/optical-transceivers/",
  "/products/",
  "/product-category/optical-transceivers/",
  "/product-category/transceivers/",
  "/product-category/sfp/",
  "/product-category/qsfp/",
  "/product-category/coherent/",
  "/product-category/dwdm/",
];

/**
 * Pause for the given number of milliseconds.
 *
 * @param ms Delay in milliseconds.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}

/**
 * Guess form factor and line rate from free text (typically the product name).
 *
 * Matching is case-insensitive and purely substring based. Rules are checked
 * top to bottom and the first hit wins, so the order is significant: more
 * specific tokens come before tokens they contain (e.g. "qsfp28" before
 * "sfp28", "qsfp+" before "sfp+", and every suffixed form before bare "sfp").
 *
 * @param text Product name or similar description.
 * @returns Form factor, speed label and speed in Gbit/s. Falls back to
 *          QSFP28 / 100G when no rule matches.
 */
function detectFormFactor(text: string): { formFactor: string; speed: string; speedGbps: number } {
  const t = text.toLowerCase();
  const has = (needle: string): boolean => t.includes(needle);

  // "800g" also covers "800ge"; checked before the generic QSFP-DD rule so that
  // names such as "QSFP-DD 800G" are not reported as 400G.
  if (has("qsfp-dd800") || has("800g")) return { formFactor: "QSFP-DD", speed: "800G", speedGbps: 800 };
  if (has("qsfp-dd") || (has("400g") && has("qsfp"))) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 };
  if (has("qsfp112")) return { formFactor: "QSFP112", speed: "400G", speedGbps: 400 };
  if (has("qsfp56")) return { formFactor: "QSFP56", speed: "200G", speedGbps: 200 };
  if (has("qsfp28") || has("100ge") || has("100g")) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
  if (has("sfp28") || has("25g")) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 };
  if (has("qsfp+") || has("40g")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 };
  if (has("sfp+") || has("10g")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
  if (has("sfp")) return { formFactor: "SFP", speed: "1G", speedGbps: 1 };

  // Nothing recognisable in the text: use the default.
  return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
}

/**
 * Extract the first reach figure ("10km", "0.5 km", "300m") from free text.
 *
 * A kilometre value takes precedence over a metre value. The metre pattern
 * requires a word boundary after "m", so wavelength figures such as "850nm"
 * are not picked up.
 *
 * @param text Product name or similar description.
 * @returns A display label plus the reach in whole metres, or `undefined`
 *          when the text contains no reach.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const kmMatch = /(\d+(?:\.\d+)?)\s*km/i.exec(text);
  if (kmMatch) {
    const km = parseFloat(kmMatch[1]);
    // Round: binary floating point makes e.g. 1.1 * 1000 === 1100.0000000000002.
    return { label: `${km}km`, meters: Math.round(km * 1000) };
  }

  const mMatch = /(\d+)\s*m\b/i.exec(text);
  if (mMatch) {
    const m = parseInt(mMatch[1], 10);
    return { label: `${m}m`, meters: m };
  }

  return undefined;
}

/**
 * Classify the fibre type from free text (typically the product name).
 *
 * Returns "MMF" when the text mentions multi-mode, "MMF", or a short-reach
 * token ending a word ("SR", "SR4", "SR8", …); everything else is "SMF".
 *
 * @param text Product name or similar description.
 * @returns "MMF" or "SMF".
 */
function detectFiber(text: string): string {
  // `\d{0,2}` lets lane-count suffixes through: a bare `sr\b` cannot match "SR4"
  // because there is no word boundary between "r" and "4".
  if (/multi.?mode|mmf|sr\d{0,2}\b/i.test(text)) return "MMF";
  return "SMF"; // SmartOptics is almost exclusively SMF/coherent
}

/**
 * Download a page as text using the scraper's browser-like headers.
 *
 * @param url Absolute URL to fetch.
 * @returns The response body as a string.
 * @throws Error with message `HTTP <status> for <url>` on a non-2xx response;
 *         also rejects if the request fails or exceeds the 30 s timeout.
 */
async function fetchPage(url: string): Promise<string> {
  const resp = await fetch(url, {
    headers: HEADERS,
    signal: AbortSignal.timeout(30000),
  });
  if (!resp.ok) {
    // Callers look for the status code inside this message (e.g. "404").
    throw new Error(`HTTP ${resp.status} for ${url}`);
  }
  return resp.text();
}

/**
 * Extract all individual product URLs (`/product/<slug>/`) from an HTML page.
 *
 * Three sources are scanned, all limited to double-quoted attributes:
 *   1. absolute hrefs on smartoptics.com (with or without `www.`),
 *   2. root-relative hrefs (`/product/...`), resolved against BASE,
 *   3. WooCommerce `data-permalink` / `data-product-url` attributes.
 * Hrefs that carry a query string or fragment are not matched by (1) and (2).
 *
 * @param html    Raw HTML of a catalog/category page.
 * @param pageUrl URL the HTML was fetched from. Currently not consulted:
 *                relative links are resolved against BASE.
 * @returns De-duplicated, normalised product URLs.
 */
function extractProductUrls(html: string, pageUrl: string): string[] {
  const urls = new Set<string>();

  // Run a global regex over the page and store capture group 1, made absolute
  // by `toAbsolute` and then normalised.
  const collect = (pattern: RegExp, toAbsolute: (raw: string) => string): void => {
    let m: RegExpExecArray | null;
    while ((m = pattern.exec(html)) !== null) {
      urls.add(normalizeProductUrl(toAbsolute(m[1])));
    }
  };

  // Absolute URLs
  collect(/href="(https?:\/\/(?:www\.)?smartoptics\.com\/product\/[^"#?]+)"/gi, (raw) => raw);

  // Root-relative: href="/product/..." only ("/products/..." is a category listing and is not matched)
  collect(/href="(\/product\/[^"#?]+)"/gi, (raw) => `${BASE}${raw}`);

  // WooCommerce data attributes: data-permalink or data-product-url
  collect(/data-(?:permalink|product-url)="([^"]*\/product\/[^"]+)"/gi, (raw) =>
    raw.startsWith("http") ? raw : `${BASE}${raw}`
  );

  // Filter out category pages — only keep individual product URLs
  return Array.from(urls).filter((u) => {
    let path: string;
    try {
      path = new URL(u).pathname;
    } catch {
      // normalizeProductUrl returns its input unchanged when it cannot parse it;
      // drop such entries instead of letting one bad attribute discard the page.
      return false;
    }
    // Must be /product/something — not /products/ (that's a category)
    return path.startsWith("/product/") && path.split("/").filter(Boolean).length >= 2;
  });
}

/**
 * Canonicalise a product URL so equivalent links compare equal.
 *
 * - drops the query string and fragment,
 * - guarantees a trailing slash on the path,
 * - drops a leading `www.` from the host, so the `www.` and bare-host
 *   spellings of the same page de-duplicate to one entry.
 *
 * @param url Absolute URL.
 * @returns The canonical URL, or `url` unchanged if it cannot be parsed.
 */
function normalizeProductUrl(url: string): string {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return url;
  }

  const origin = parsed.origin.replace(/^(https?:\/\/)www\./i, "$1");
  const path = parsed.pathname.endsWith("/") ? parsed.pathname : `${parsed.pathname}/`;
  return `${origin}${path}`;
}

/** Everything scrapeProductPage extracts from a single product detail page. */
interface ProductData {
  /** Part number, upper-cased; taken from the page, else derived from the URL slug or the name. */
  sku: string;
  /** Product title with the trailing "| Smartoptics" / "– Smartoptics" site suffix removed. */
  name: string;
  /** URL of the product page that was scraped. */
  url: string;
  /** Product image (og:image or the main WooCommerce product image), if one was found. */
  imageUrl?: string;
  /** Link to a PDF that the page labels as a datasheet/spec, if one was found. */
  datasheetUrl?: string;
  /** Form factor guessed from the name, e.g. "QSFP28". */
  formFactor: string;
  /** Speed label guessed from the name, e.g. "100G". */
  speed: string;
  /** Speed in Gbit/s matching `speed`. */
  speedGbps: number;
  /** Reach as written for display, e.g. "10km"; absent when the name has no reach. */
  reachLabel?: string;
  /** Reach in metres; absent when the name has no reach. */
  reachMeters?: number;
  /** "MMF" or "SMF". */
  fiberType: string;
  /** True when the name or the start of the page mentions coherent optics. */
  coherent: boolean;
  /** "DWDM" or "CWDM" when the name says so. */
  wdmType?: string;
}

/**
 * Fetch one product detail page and pull out name, SKU, image, datasheet link
 * and the specs that can be inferred from the product name.
 *
 * Never throws: any failure (network error, HTTP error, unparsable page) is
 * logged with console.warn and reported as `null`.
 *
 * @param url Absolute URL of the product page.
 * @returns The extracted data, or `null` when the page could not be fetched
 *          or has no usable product name (fewer than 4 characters).
 */
async function scrapeProductPage(url: string): Promise<ProductData | null> {
  // Resolve an href found in the page against the page URL, so relative and
  // protocol-relative links come out absolute. Unparsable values pass through.
  const toAbsolute = (href: string | undefined): string | undefined => {
    if (!href) return undefined;
    try {
      return new URL(href, url).href;
    } catch {
      return href;
    }
  };

  try {
    const html = await fetchPage(url);

    // Product name — try OG tag first (most reliable), then H1
    const nameMatch =
      html.match(/property="og:title"\s+content="([^"]+)"/) ||
      html.match(/content="([^"]+)"\s+property="og:title"/) ||
      html.match(/<h1[^>]*class="[^"]*(?:product_title|entry-title)[^"]*"[^>]*>([^<]+)<\/h1>/i) ||
      html.match(/<h1[^>]*>([^<]+)<\/h1>/);
    const rawName = nameMatch?.[1]?.trim() ?? "";
    // Strip the site-name suffix that page titles carry.
    const name = rawName
      .replace(/\s*\|\s*Smartoptics\s*$/, "")
      .replace(/\s*–\s*Smartoptics\s*$/, "")
      .trim();
    if (name.length < 4) return null;

    // SKU — try WooCommerce SKU field first
    const skuMatch =
      html.match(/(?:SKU|Artikelnummer)[^<]*<\/[^>]+>\s*<[^>]+>([A-Z0-9][-A-Z0-9./]{2,40})/i) ||
      html.match(/"sku"\s*:\s*"([^"]+)"/) ||
      html.match(/class="sku"[^>]*>([^<]+)</) ||
      html.match(/data-sku="([^"]+)"/);
    // `||` (not `??`) on purpose: an empty string must fall through to the next
    // candidate — first the URL slug with dashes removed, then the name.
    const sku =
      skuMatch?.[1]?.trim().toUpperCase() ||
      url.split("/").filter(Boolean).pop()?.toUpperCase().replace(/-/g, "") ||
      name.slice(0, 30).toUpperCase().replace(/\s+/g, "-");

    // Product image
    const imgMatch =
      html.match(/property="og:image"\s+content="([^"]+)"/) ||
      html.match(/content="([^"]+)"\s+property="og:image"/) ||
      html.match(/<img[^>]+src="([^"]*wp-content\/uploads[^"]*\.(?:png|jpg|webp))"[^>]+class="[^"]*(?:wp-post-image|attachment-shop_single)[^"]*"/i);
    const imageUrl = toAbsolute(imgMatch?.[1]);

    // Datasheet PDF link: first .pdf href that is followed, on the same line,
    // by a datasheet/spec keyword. The capture group is the href itself.
    const dsMatch = html.match(/href="([^"]*\.pdf)"[^>]*>.*?(?:datasheet|datenblatt|spec)/i);
    const datasheetUrl = toAbsolute(dsMatch?.[1]);

    const ff = detectFormFactor(name);
    const reach = detectReach(name);
    const pageText = html.slice(0, 5000); // only check first 5KB for coherent detection
    // NOTE(review): the first 5 KB of a page is usually <head> markup; if site-wide
    // meta text mentions "coherent", every product would be flagged — confirm.
    const coherent = /coherent|coh-t|coh\.|dp-qpsk|qpsk|cfp2/i.test(name + pageText);
    const wdmType = /dwdm/i.test(name) ? "DWDM" : /cwdm/i.test(name) ? "CWDM" : undefined;

    return {
      sku,
      name,
      url,
      imageUrl,
      datasheetUrl,
      ...ff,
      reachLabel: reach?.label,
      reachMeters: reach?.meters,
      fiberType: detectFiber(name),
      coherent,
      wdmType,
    };
  } catch (err) {
    // A thrown value is not guaranteed to be an Error instance.
    const message = err instanceof Error ? err.message : String(err);
    console.warn(` Failed ${url}: ${message.slice(0, 80)}`);
    return null;
  }
}

/**
 * Try the WooCommerce REST API for a complete product list.
 *
 * Best effort: any failure (non-2xx response, non-JSON body, network error,
 * timeout) ends pagination and returns whatever was collected so far, so the
 * caller can fall back to the HTML crawl.
 *
 * NOTE(review): the wc/v3 products endpoint normally requires API credentials
 * and documents `category` as a numeric category ID rather than a slug, so
 * this may always come back empty — confirm against the live site.
 *
 * @returns Normalised product URLs (possibly empty, possibly with duplicates).
 */
async function tryWooCommerceApi(): Promise<string[]> {
  const perPage = 100;
  const maxPages = 20;
  const urls: string[] = [];

  try {
    for (let page = 1; page <= maxPages; page++) {
      const apiUrl = `${BASE}/wp-json/wc/v3/products?per_page=${perPage}&page=${page}&category=optical-transceivers&status=publish`;
      const resp = await fetch(apiUrl, {
        headers: { ...HEADERS, Accept: "application/json" },
        signal: AbortSignal.timeout(10000),
      });
      if (!resp.ok) break;

      // Treat the payload as untrusted: check its shape at runtime instead of
      // asserting a type the server never promised.
      const payload: unknown = await resp.json();
      if (!Array.isArray(payload) || payload.length === 0) break;

      for (const entry of payload) {
        if (typeof entry !== "object" || entry === null) continue;
        const { permalink, slug } = entry as { permalink?: unknown; slug?: unknown };
        if (typeof permalink === "string" && permalink) {
          urls.push(normalizeProductUrl(permalink));
        } else if (typeof slug === "string" && slug) {
          urls.push(normalizeProductUrl(`${BASE}/product/${slug}/`));
        }
      }

      // A short page is the last page.
      if (payload.length < perPage) break;
      await sleep(500);
    }
  } catch {
    // API not accessible — not unusual, fall through to HTML crawl
  }

  return urls;
}

/**
 * Run the full SmartOptics scrape:
 *   1. collect product URLs from the WooCommerce REST API (best effort),
 *   2. crawl every catalog category page (up to 10 pages each) for more URLs,
 *   3. scrape each product page and store it via findOrCreateScrapedTransceiver.
 *
 * Progress and a final summary are written to the console. Failures on
 * individual pages or DB writes are logged and counted, not thrown; a failure
 * in ensureVendor does propagate.
 */
export async function scrapeSmartOptics(): Promise<void> {
  // A thrown value is not guaranteed to be an Error instance.
  const describeError = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  console.log("=== SmartOptics Scraper v2 Starting ===\n");
  console.log("Note: SmartOptics is B2B — no public prices. Scraping specs + catalog only.\n");

  const vendorId = await ensureVendor(
    "SmartOptics",
    "manufacturer",
    "https://www.smartoptics.com",
    "https://smartoptics.com/products/optical-transceivers/"
  );

  const productUrls = new Set<string>();

  // ── Try WooCommerce REST API first (fastest, most complete) ──────────────
  console.log("[1] Trying WooCommerce REST API…");
  const apiUrls = await tryWooCommerceApi();
  if (apiUrls.length > 0) {
    console.log(` API returned ${apiUrls.length} products`);
    for (const u of apiUrls) productUrls.add(u);
  } else {
    console.log(" API not accessible — falling back to HTML crawl");
  }

  // ── HTML catalog crawl (always run to catch any API misses) ───────────────
  console.log("[2] Crawling category pages…");
  for (const catPath of CATALOG_PAGES) {
    const catBase = `${BASE}${catPath}`;
    const seenInCategory = new Set<string>();

    for (let page = 1; page <= 10; page++) {
      const pageUrl = page === 1 ? catBase : `${catBase}page/${page}/`;
      try {
        const html = await fetchPage(pageUrl);
        const found = extractProductUrls(html, pageUrl);
        const fresh = found.filter((u) => !seenInCategory.has(u));
        // Stop on an empty page (category missing or exhausted) and also when a
        // page only repeats earlier results of this category, which happens if
        // the site serves the same listing for every /page/N/.
        if (fresh.length === 0) break;
        for (const u of fresh) {
          seenInCategory.add(u);
          productUrls.add(u);
        }
        console.log(` ${catPath} p${page}: ${found.length} products`);
        await sleep(1200);
      } catch (err) {
        const msg = describeError(err);
        // A 404 just means the category or page does not exist; anything else is worth a warning.
        if (!msg.includes("404")) console.warn(` ${pageUrl}: ${msg.slice(0, 60)}`);
        break;
      }
    }
  }

  console.log(`\nTotal unique product URLs: ${productUrls.size}`);
  if (productUrls.size === 0) {
    console.warn("No products found — SmartOptics site structure may have changed");
    return;
  }

  // ── Scrape individual product pages ───────────────────────────────────────
  console.log("\n[3] Scraping product detail pages…");
  let saved = 0;
  let withImages = 0;
  let failed = 0;
  let saveErrors = 0;
  let firstRequest = true;

  for (const url of productUrls) {
    // Pause between every pair of requests — including after a failed page, so a
    // struggling or rate-limiting server is not hit in a tight loop.
    if (!firstRequest) await sleep(1200);
    firstRequest = false;

    const product = await scrapeProductPage(url);
    if (!product) {
      failed++;
      continue;
    }

    try {
      await findOrCreateScrapedTransceiver({
        partNumber: product.sku,
        vendorId,
        formFactor: product.formFactor,
        speedGbps: product.speedGbps,
        speed: product.speed,
        reachMeters: product.reachMeters,
        reachLabel: product.reachLabel,
        fiberType: product.fiberType,
        // Only DWDM parts are labelled tunable; CWDM parts keep their own label
        // instead of being stored as "DWDM-tunable".
        // NOTE(review): assumes the wavelengths field accepts "CWDM" — confirm.
        wavelengths: product.wdmType === "DWDM" ? "DWDM-tunable" : product.wdmType,
        category: product.coherent ? "Coherent" : "DataCenter",
        imageUrl: product.imageUrl,
      });
      saved++;
      if (product.imageUrl) withImages++;
      console.log(` ✓ ${product.sku.slice(0, 25).padEnd(25)} ${product.name.slice(0, 50)}`);
    } catch (err) {
      saveErrors++;
      console.warn(` ✗ ${product.sku}: ${describeError(err).slice(0, 80)}`);
    }
  }

  console.log(`\n=== SmartOptics v2 Complete ===`);
  console.log(` Products discovered: ${productUrls.size}`);
  console.log(` Saved to DB: ${saved}`);
  console.log(` With images: ${withImages}`);
  if (failed > 0) console.warn(` Failed pages: ${failed}`);
  if (saveErrors > 0) console.warn(` DB save errors: ${saveErrors}`);
}

// Run the scraper when this file is executed directly (not when it is imported).
if (require.main === module) {
  scrapeSmartOptics()
    .then(() => pool.end())
    .catch(async (err: unknown) => {
      console.error("Fatal:", err);
      try {
        // Wait for the pool to shut down before exiting; process.exit() would
        // otherwise cut the shutdown short.
        // NOTE(review): assumes pool.end() returns a promise — confirm in utils/db.
        await pool.end();
      } catch {
        // The pool may already be closed (e.g. the failure came from pool.end() itself).
      }
      process.exit(1);
    });
}