Rene Fichtmueller 5b35b2b8be feat(scraper+api): warehouse stock data pipeline — FS.com v2, SmartOptics v2, Stock API
Scraper changes:
- fs-com.ts v2: Playwright stealth patches + www.fs.com/de/ URL fix (de.fs.com DNS NXDOMAIN).
  Extracts DE-Lager, Global-Lager, Nachlieferung, units_sold, compatible_brands, price_net.
  Mac-side runner (run-fs-scraper-mac.sh) via SSH tunnel for residential IP access.
  Fast-fail connectivity check on datacenter IPs that are blocked by Cloudflare.
- smartoptics.ts v2: WooCommerce REST API fallback + 8 catalog categories + relative URL fix.
  Was finding only 8 products, now discovers 18+ with multi-category crawl.

DB layer:
- db.ts: add upsertStockObservation() — writes 10 new stock_observations columns
  (warehouse_de_qty, warehouse_global_qty, backorder_qty, units_sold, compatible_brands,
  price_net, product_url, delivery dates) with dedup check.

API:
- routes/stock.ts: GET /api/stock, /api/stock/summary, /api/stock/:id
  Warehouse breakdowns per transceiver/vendor with top-sellers and vendor summary.
- routes/review.ts: equivalence review queue (approve/reject/bulk-approve).
- index.ts: register /api/stock and /api/review routes.

Dashboard:
- index.html: 🏭 Stock tab with stat cards (DE-Lager, Global-Lager, Nachlieferung totals),
  top-sellers table, vendor breakdown, recently-restocked events, part-number lookup.

SQL migrations:
- 034: blog-review-tag, 035: price-observations is_anomalous, 036: transceiver-equivalences.
2026-04-17 10:45:59 +02:00

320 lines
12 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* SmartOptics Scraper — Premium coherent/DWDM transceiver manufacturer
*
* smartoptics.com — WordPress/WooCommerce, no prices (B2B, RFQ only).
* Scrapes product catalog for specs, images, and datasheets.
*
* v2 fixes:
* - Multi-category crawl (coherent, DWDM, access, SFP, QSFP)
* - Handles both absolute AND relative product URLs
* - WooCommerce REST API fallback for complete product list
* - Up to 10 pagination pages per category
*/
import { pool, findOrCreateScrapedTransceiver, ensureVendor } from "../utils/db";
/** Site origin; prefixed to every catalog path and to root-relative product links. */
const BASE = "https://smartoptics.com";
/** Browser-like request headers sent with the HTML fetches and (with Accept overridden) the REST API calls. */
const HEADERS = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
};
/**
 * All transceiver-related catalog category pages to crawl.
 * Each entry is a path relative to BASE. The trailing slash matters: the crawler
 * builds pagination URLs by appending `page/N/` directly to the path.
 */
const CATALOG_PAGES = [
"/products/optical-transceivers/",
"/products/",
"/product-category/optical-transceivers/",
"/product-category/transceivers/",
"/product-category/sfp/",
"/product-category/qsfp/",
"/product-category/coherent/",
"/product-category/dwdm/",
];
/**
 * Returns a promise that resolves after `ms` milliseconds.
 * Used to pace requests between page fetches.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Guess form factor and line rate from free text (typically the product name).
 *
 * Matching is case-insensitive and rule order matters: more specific tokens are
 * tested before tokens that are substrings of them or that overlap with them
 * (e.g. "qsfp28" before "sfp28", "qsfp112" before the generic "400g + qsfp" rule).
 *
 * @param text Product name or similar descriptive text.
 * @returns The detected form factor, speed label and speed in Gbps. When no
 *          rule matches, falls back to QSFP28 / 100G.
 */
function detectFormFactor(text: string): { formFactor: string; speed: string; speedGbps: number } {
  const t = text.toLowerCase();
  const has = (token: string): boolean => t.includes(token);

  // "800g" also covers "800ge"; without it "QSFP-DD 800G" was reported as 400G.
  if (has("qsfp-dd800") || has("800g")) return { formFactor: "QSFP-DD", speed: "800G", speedGbps: 800 };
  // Must precede the generic "400g + qsfp" rule, which would otherwise swallow "QSFP112 400G".
  if (has("qsfp112")) return { formFactor: "QSFP112", speed: "400G", speedGbps: 400 };
  if (has("qsfp-dd") || (has("400g") && has("qsfp"))) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 };
  if (has("qsfp56")) return { formFactor: "QSFP56", speed: "200G", speedGbps: 200 };
  if (has("qsfp28") || has("100ge") || has("100g")) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
  if (has("sfp28") || has("25g")) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 };
  if (has("qsfp+") || has("40g")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 };
  if (has("sfp+") || has("10g")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
  if (has("sfp")) return { formFactor: "SFP", speed: "1G", speedGbps: 1 };

  // Nothing recognised: default to the QSFP28 / 100G bucket.
  return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
}
/**
 * Extract a reach figure from free text (typically the product name).
 *
 * A kilometre value ("10km", "0.5 km") is preferred; otherwise a whole-metre
 * value ("300m") is used. Wavelengths such as "850nm" do not match because the
 * digits must be followed directly (optionally after whitespace) by "m" or "km".
 *
 * @param text Product name or similar descriptive text.
 * @returns A display label plus the reach in metres, or `undefined` when no
 *          reach is present in the text.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const kmMatch = /(\d+(?:\.\d+)?)\s*km/i.exec(text);
  if (kmMatch) {
    const km = parseFloat(kmMatch[1]);
    // Round: floating-point multiplication can leave a tiny fractional residue,
    // and a reach in metres is expected to be a whole number.
    return { label: `${km}km`, meters: Math.round(km * 1000) };
  }

  const mMatch = /(\d+)\s*m\b/i.exec(text);
  if (mMatch) {
    const m = parseInt(mMatch[1], 10);
    return { label: `${m}m`, meters: m };
  }

  return undefined;
}
/**
 * Classify the fibre type from free text (typically the product name).
 *
 * Returns "MMF" when the text mentions multimode, "MMF", or a short-reach
 * designation ("SR", "SR4", "SR8", ...); everything else is treated as "SMF".
 *
 * @param text Product name or similar descriptive text.
 * @returns "MMF" or "SMF".
 */
function detectFiber(text: string): string {
  // `sr\d{0,2}\b` rather than `sr\b`: there is no word boundary between "SR"
  // and a following digit, so the old pattern never matched "SR4" / "SR8".
  if (/multi.?mode|mmf|sr\d{0,2}\b/i.test(text)) return "MMF";
  return "SMF"; // SmartOptics is almost exclusively SMF/coherent
}
/**
 * Download a page and return its body as text.
 *
 * The request carries the shared browser-like HEADERS and is aborted after
 * 30 seconds.
 *
 * @param url Absolute URL to fetch.
 * @returns The response body as a string.
 * @throws Error with message `HTTP <status> for <url>` on a non-2xx response;
 *         fetch/abort errors propagate unchanged.
 */
async function fetchPage(url: string): Promise<string> {
  const timeoutSignal = AbortSignal.timeout(30000);
  const response = await fetch(url, { headers: HEADERS, signal: timeoutSignal });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
/**
 * Extract all individual product URLs (/product/xxx/) from an HTML page.
 *
 * Three sources are scanned, in this order:
 *  1. absolute hrefs on smartoptics.com (with or without "www."),
 *  2. root-relative hrefs (/product/...),
 *  3. WooCommerce `data-permalink` / `data-product-url` attributes.
 *
 * Every hit is passed through normalizeProductUrl and de-duplicated. Category
 * listings under /products/ are excluded because only paths starting with
 * /product/ and having at least two segments are kept.
 *
 * @param html    Raw HTML of a catalog page.
 * @param pageUrl URL the HTML was fetched from. Currently unused; kept so the
 *                call signature stays stable.
 * @returns Normalised, unique product URLs in first-seen order.
 */
function extractProductUrls(html: string, pageUrl: string): string[] {
  const urls = new Set<string>();

  // Runs one global regex over the page and stores capture group 1 of each hit.
  const collect = (pattern: RegExp, toAbsolute: (href: string) => string): void => {
    let m: RegExpExecArray | null;
    while ((m = pattern.exec(html)) !== null) {
      urls.add(normalizeProductUrl(toAbsolute(m[1])));
    }
  };

  // Absolute URLs
  collect(/href="(https?:\/\/(?:www\.)?smartoptics\.com\/product\/[^"#?]+)"/gi, (href) => href);
  // Root-relative: href="/product/..."
  collect(/href="(\/product\/[^"#?]+)"/gi, (href) => `${BASE}${href}`);
  // WooCommerce data attributes: data-permalink or data-product-url
  collect(
    /data-(?:permalink|product-url)="([^"]*\/product\/[^"]+)"/gi,
    (href) => (href.startsWith("http") ? href : `${BASE}${href}`),
  );

  // Filter out category pages — only keep individual product URLs
  return Array.from(urls).filter((u) => {
    let path: string;
    try {
      path = new URL(u).pathname;
    } catch {
      // normalizeProductUrl hands back unparseable input unchanged; drop it
      // here instead of letting one bad attribute abort the whole page.
      return false;
    }
    // Must be /product/something — not /products/ (that's a category)
    return path.startsWith("/product/") && path.split("/").filter(Boolean).length >= 2;
  });
}
/**
 * Canonicalise a product URL so that equivalent links compare equal.
 *
 * The result is origin + path only (query string and fragment are dropped)
 * and always ends with a trailing slash.
 *
 * @param url Absolute URL to normalise.
 * @returns The canonical form, or the input unchanged if it cannot be parsed.
 */
function normalizeProductUrl(url: string): string {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return url;
  }
  const pathname = parsed.pathname.endsWith("/") ? parsed.pathname : `${parsed.pathname}/`;
  return parsed.origin + pathname;
}
/**
 * Data extracted from a single SmartOptics product detail page.
 * Produced by scrapeProductPage; the spec fields are derived from the product name.
 */
interface ProductData {
/** Upper-cased SKU found on the page, or a fallback built from the URL slug or the product name. */
sku: string;
/** Product title with the trailing site-name suffix removed. */
name: string;
/** URL of the product page that was scraped. */
url: string;
/** Product image URL (og:image or the main WooCommerce product image), if one was found. */
imageUrl?: string;
/** href of the first PDF link whose following text mentions datasheet/datenblatt/spec, if any. */
datasheetUrl?: string;
/** Form factor as returned by detectFormFactor, e.g. "QSFP28". */
formFactor: string;
/** Speed label as returned by detectFormFactor, e.g. "100G". */
speed: string;
/** Speed in Gbps as returned by detectFormFactor. */
speedGbps: number;
/** Reach label from detectReach (e.g. "10km"); absent when the name carries no reach. */
reachLabel?: string;
/** Reach in metres from detectReach; absent when the name carries no reach. */
reachMeters?: number;
/** Fibre type from detectFiber: "MMF" or "SMF". */
fiberType: string;
/** True when the name or the first 5000 characters of the page match a coherent-optics keyword. */
coherent: boolean;
/** "DWDM" or "CWDM" when the name contains that term; otherwise absent. */
wdmType?: string;
}
/**
 * Fetch one product detail page and extract its catalog data.
 *
 * Name, SKU, image and datasheet link are pulled from the HTML with regexes;
 * form factor, speed, reach, fibre type and WDM type are derived from the
 * product name.
 *
 * @param url Absolute URL of the product page.
 * @returns The extracted data, or `null` when the page cannot be fetched or
 *          no usable product name (at least 4 characters) is found. Errors are
 *          logged as warnings, never thrown.
 */
async function scrapeProductPage(url: string): Promise<ProductData | null> {
  try {
    const html = await fetchPage(url);

    // Product name — try OG tag first (most reliable), then H1
    const nameMatch =
      html.match(/property="og:title"\s+content="([^"]+)"/) ||
      html.match(/content="([^"]+)"\s+property="og:title"/) ||
      html.match(/<h1[^>]*class="[^"]*(?:product_title|entry-title)[^"]*"[^>]*>([^<]+)<\/h1>/i) ||
      html.match(/<h1[^>]*>([^<]+)<\/h1>/);
    const rawName = nameMatch?.[1]?.trim() ?? "";
    // Strip a trailing site-name suffix such as " | Smartoptics" or " – Smartoptics".
    // The separator (pipe, hyphen, en/em dash or its HTML entity) is optional and is
    // removed together with the site name, so no dangling dash is left behind.
    const name = rawName
      .replace(/\s*(?:\||-|\u2013|\u2014|&#8211;|&#8212;|&ndash;|&mdash;)?\s*Smartoptics\s*$/i, "")
      .trim();
    if (!name || name.length < 4) return null;

    // SKU — try WooCommerce SKU field first
    const skuMatch =
      html.match(/(?:SKU|Artikelnummer)[^<]*<\/[^>]+>\s*<[^>]+>([A-Z0-9][-A-Z0-9./]{2,40})/i) ||
      html.match(/"sku"\s*:\s*"([^"]+)"/) ||
      html.match(/class="sku"[^>]*>([^<]+)</) ||
      html.match(/data-sku="([^"]+)"/);
    // Fallbacks: URL slug without hyphens, then a truncated, dash-joined product name.
    const sku =
      skuMatch?.[1]?.trim().toUpperCase() ||
      url.split("/").filter(Boolean).pop()?.toUpperCase().replace(/-/g, "") ||
      name.slice(0, 30).toUpperCase().replace(/\s+/g, "-");

    // Product image
    const imgMatch =
      html.match(/property="og:image"\s+content="([^"]+)"/) ||
      html.match(/content="([^"]+)"\s+property="og:image"/) ||
      html.match(/<img[^>]+src="([^"]*wp-content\/uploads[^"]*\.(?:png|jpg|webp))"[^>]+class="[^"]*(?:wp-post-image|attachment-shop_single)[^"]*"/i);
    const imageUrl = imgMatch?.[1];

    // Datasheet PDF link: first .pdf href followed on the same line by a
    // datasheet/spec keyword. A single non-global match exposes the href as group 1.
    const dsMatch = /href="([^"]*\.pdf)"[^>]*>.*?(?:datasheet|datenblatt|spec)/i.exec(html);
    const datasheetUrl = dsMatch?.[1];

    const ff = detectFormFactor(name);
    const reach = detectReach(name);
    const pageText = html.slice(0, 5000); // only check first 5KB for coherent detection
    const coherent = /coherent|coh-t|coh\.|dp-qpsk|qpsk|cfp2/i.test(name + pageText);
    const wdmType = /dwdm/i.test(name) ? "DWDM" : /cwdm/i.test(name) ? "CWDM" : undefined;

    return {
      sku,
      name,
      url,
      imageUrl,
      datasheetUrl,
      ...ff,
      reachLabel: reach?.label,
      reachMeters: reach?.meters,
      fiberType: detectFiber(name),
      coherent,
      wdmType,
    };
  } catch (err) {
    // Best-effort: one broken page must not stop the whole run.
    const message = err instanceof Error ? err.message : String(err);
    console.warn(` Failed ${url}: ${message.slice(0, 80)}`);
    return null;
  }
}
/**
 * Try the WooCommerce REST API for a complete product list.
 *
 * Requests up to 20 pages of 100 products each and stops at the first non-OK
 * response, empty or non-array payload, or short page. Any thrown error (network,
 * timeout, invalid JSON) ends the attempt quietly; URLs gathered so far are
 * still returned, so an empty array simply means "API not usable".
 *
 * NOTE(review): the wc/v3 products endpoint normally requires authentication and
 * its `category` filter is documented as a numeric ID rather than a slug —
 * confirm this request can succeed against the live site.
 *
 * @returns Normalised product URLs reported by the API (possibly empty).
 */
async function tryWooCommerceApi(): Promise<string[]> {
  const collected: string[] = [];
  let page = 1;
  try {
    while (page <= 20) {
      const endpoint = `${BASE}/wp-json/wc/v3/products?per_page=100&page=${page}&category=optical-transceivers&status=publish`;
      const response = await fetch(endpoint, {
        headers: { ...HEADERS, Accept: "application/json" },
        signal: AbortSignal.timeout(10000),
      });
      if (!response.ok) break;

      const payload = (await response.json()) as Array<{ permalink?: string; slug?: string }>;
      if (!Array.isArray(payload) || payload.length === 0) break;

      for (const { permalink, slug } of payload) {
        if (permalink) {
          collected.push(normalizeProductUrl(permalink));
        } else if (slug) {
          collected.push(normalizeProductUrl(`${BASE}/product/${slug}/`));
        }
      }

      // A short page means this was the last one.
      if (payload.length < 100) break;
      await sleep(500);
      page += 1;
    }
  } catch {
    // API not accessible — not unusual, fall through to HTML crawl
  }
  return collected;
}
/**
 * Run the full SmartOptics scrape: discover product URLs, scrape each product
 * page and store the result via findOrCreateScrapedTransceiver.
 *
 * Steps:
 *  1. Ask the WooCommerce REST API for product URLs (may return nothing).
 *  2. Always crawl the HTML catalog pages as well (up to 10 pages per category)
 *     and merge the results into one de-duplicated set.
 *  3. Scrape every product page and save it, pausing between requests.
 *
 * Per-page and per-save failures are logged and skipped; the function only
 * rejects if ensureVendor (or another call outside the guarded sections) fails.
 */
export async function scrapeSmartOptics(): Promise<void> {
  console.log("=== SmartOptics Scraper v2 Starting ===\n");
  console.log("Note: SmartOptics is B2B — no public prices. Scraping specs + catalog only.\n");

  const vendorId = await ensureVendor(
    "SmartOptics",
    "manufacturer",
    "https://www.smartoptics.com",
    "https://smartoptics.com/products/optical-transceivers/"
  );

  const productUrls = new Set<string>();

  // ── Try WooCommerce REST API first (fastest, most complete) ──────────────
  console.log("[1] Trying WooCommerce REST API…");
  const apiUrls = await tryWooCommerceApi();
  if (apiUrls.length > 0) {
    console.log(` API returned ${apiUrls.length} products`);
    apiUrls.forEach((u) => productUrls.add(u));
  } else {
    console.log(" API not accessible — falling back to HTML crawl");
  }

  // ── HTML catalog crawl (always run to catch any API misses) ───────────────
  console.log("[2] Crawling category pages…");
  for (const catPath of CATALOG_PAGES) {
    const catBase = `${BASE}${catPath}`;
    for (let page = 1; page <= 10; page++) {
      const pageUrl = page === 1 ? catBase : `${catBase}page/${page}/`;
      try {
        const html = await fetchPage(pageUrl);
        const found = extractProductUrls(html, pageUrl);
        // Empty page 1: category doesn't exist; empty later page: no more pages.
        if (found.length === 0) break;
        found.forEach((u) => productUrls.add(u));
        console.log(` ${catPath} p${page}: ${found.length} products`);
        await sleep(1200);
      } catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        // A 404 is the normal "category/page does not exist" signal; stay quiet.
        if (!msg.includes("404")) console.warn(` ${pageUrl}: ${msg.slice(0, 60)}`);
        break;
      }
    }
  }

  console.log(`\nTotal unique product URLs: ${productUrls.size}`);
  if (productUrls.size === 0) {
    console.warn("No products found — SmartOptics site structure may have changed");
    return;
  }

  // ── Scrape individual product pages ───────────────────────────────────────
  console.log("\n[3] Scraping product detail pages…");
  let saved = 0;
  let withImages = 0;
  let failed = 0;

  for (const url of productUrls) {
    const product = await scrapeProductPage(url);
    if (!product) {
      failed++;
    } else {
      try {
        await findOrCreateScrapedTransceiver({
          partNumber: product.sku,
          vendorId,
          formFactor: product.formFactor,
          speedGbps: product.speedGbps,
          speed: product.speed,
          reachMeters: product.reachMeters,
          reachLabel: product.reachLabel,
          fiberType: product.fiberType,
          // Only DWDM parts get the DWDM label; CWDM parts were previously
          // mislabelled as "DWDM-tunable" and are now left unset.
          wavelengths: product.wdmType === "DWDM" ? "DWDM-tunable" : undefined,
          category: product.coherent ? "Coherent" : "DataCenter",
          imageUrl: product.imageUrl,
        });
        saved++;
        if (product.imageUrl) withImages++;
        console.log(`${product.sku.slice(0, 25).padEnd(25)} ${product.name.slice(0, 50)}`);
      } catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        console.warn(`${product.sku}: ${msg.slice(0, 80)}`);
      }
    }
    // Pause after every request, including failed ones, so a run of errors
    // (e.g. rate limiting) does not turn into back-to-back requests.
    await sleep(1200);
  }

  console.log(`\n=== SmartOptics v2 Complete ===`);
  console.log(` Products discovered: ${productUrls.size}`);
  console.log(` Saved to DB: ${saved}`);
  console.log(` With images: ${withImages}`);
  if (failed > 0) console.warn(` Failed pages: ${failed}`);
}
// CLI entry point: run the scraper only when this file is executed directly.
// On success the DB pool is closed; on any failure the error is logged, the
// pool is asked to close, and the process exits with status 1.
if (require.main === module) {
  void (async () => {
    try {
      await scrapeSmartOptics();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  })();
}