transceiver-db/packages/scraper/src/scrapers/flexoptix-catalog.ts
Rene Fichtmueller c6308e93c0 feat: massive scraper expansion + hype cycle engine + lifecycle prediction
New scrapers:
- GBICS.com (BigCommerce, GBP prices, 10 categories, 78 products)
- Juniper HCT (Next.js SSR parser, 475 transceivers with specs/EOL)
- SFPcables.com (Magento store, 16 categories, 78 products)
- Fluxlight (BigCommerce, 6 pages, 118 products)
- Champion ONE (compatible vendor scraper)

Scraper fixes:
- 10Gtek: rewritten to parse HTML spec tables (152 products)
- Flexoptix: fix price extraction from Magento Hyva HTML
- Register all scrapers in CLI (--gbics, --juniper, --sfpcables, etc.)

Hype Cycle Engine enhancements:
- Data-driven enrichment from scraped vendor/price data
- Revenue lifecycle prediction (peak year, decline, revenue index)
- Regional adoption model (NA, China, APAC, Europe, RoW with lag coefficients)
- New API endpoints: /enriched, /lifecycle, /regional/:tech

DB growth: 89 → 1,168 transceivers, 0 → 416 prices, 6 vendors
Qdrant: 1,162 products embedded with nomic-embed-text

Research: Norton-Bass model, standards-to-market timelines, hype signals
2026-03-28 02:30:19 +13:00

387 lines
15 KiB
TypeScript

/**
* Flexoptix Product Catalog Scraper
*
* Scrapes flexoptix.net product catalog for transceiver specs and pricing.
* This is our own data — no restrictions.
*
* Strategy: Use the Magento search/suggest AJAX API, which normally returns
* JSON with product names, URLs, prices, and SKUs (an HTML response is parsed
* as a fallback). We query by form-factor keywords to enumerate the full catalog.
*
* Rate limited: 1 req/sec.
*/
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
import { contentHash } from "../utils/hash";
/** Storefront origin; relative product links are prefixed with it. */
const BASE = "https://www.flexoptix.net";

/** Search-suggest endpoint; `searchProducts` appends the `q` query parameter. */
const SEARCH_URL = BASE + "/en/search/ajax/suggest/";

/** Headers sent with every search request. */
const HEADERS = {
  "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; internal-flexoptix)",
  "Accept": "application/json, text/html",
};
/**
 * Expands a list of search phrases that share one form factor and nominal
 * speed into query descriptors. `speed` is the `<Gbps>G` label of `speedGbps`.
 */
function buildQueries(formFactor: string, speedGbps: number, phrases: string[]) {
  return phrases.map((query) => ({ query, formFactor, speed: `${speedGbps}G`, speedGbps }));
}

// Search queries that cover the full transceiver catalog.
// formFactor / speedGbps are the defaults applied to a hit when nothing more
// specific can be inferred from the product name.
const SEARCH_QUERIES = [
  // By form factor
  ...buildQueries("SFP", 1, ["SFP 1G", "SFP BiDi", "SFP CWDM", "SFP DWDM", "SFP copper"]),
  ...buildQueries("SFP+", 10, [
    "SFP+ 10G",
    "SFP+ SR",
    "SFP+ LR",
    "SFP+ ER",
    "SFP+ ZR",
    "SFP+ BiDi",
    "SFP+ CWDM",
    "SFP+ DWDM",
    "SFP+ DAC",
    "SFP+ AOC",
  ]),
  ...buildQueries("SFP28", 25, [
    "25G SFP28",
    "SFP28 SR",
    "SFP28 LR",
    "SFP28 DWDM",
    "SFP28 DAC",
    "SFP28 AOC",
  ]),
  ...buildQueries("QSFP+", 40, ["QSFP+ 40G", "QSFP+ SR4", "QSFP+ LR4", "QSFP+ DAC", "QSFP+ AOC"]),
  ...buildQueries("QSFP28", 100, [
    "QSFP28 100G",
    "QSFP28 SR4",
    "QSFP28 LR4",
    "QSFP28 ER4",
    "QSFP28 CWDM4",
    "QSFP28 PSM4",
    "QSFP28 DAC",
    "QSFP28 AOC",
  ]),
  ...buildQueries("QSFP56", 200, ["QSFP56 200G"]),
  ...buildQueries("QSFP-DD", 400, [
    "QSFP-DD 400G",
    "QSFP-DD DR4",
    "QSFP-DD FR4",
    "QSFP-DD LR4",
    "QSFP-DD SR4",
    "QSFP-DD ZR",
  ]),
  ...buildQueries("QSFP-DD800", 800, ["QSFP-DD800 800G"]),
  ...buildQueries("OSFP", 400, [
    "OSFP 400G",
    "OSFP SR4",
    "OSFP DR4",
    "OSFP FR4",
    "OSFP LR4",
    "OSFP ZR",
  ]),
  ...buildQueries("OSFP", 800, ["OSFP 800G"]),
  // Generic searches to catch stragglers
  ...buildQueries("SFP+", 10, [
    "transceiver SR",
    "transceiver LR",
    "transceiver ER",
    "transceiver ZR",
  ]),
  ...buildQueries("SFP", 1, ["transceiver BiDi"]),
  ...buildQueries("QSFP-DD", 400, ["coherent 400ZR", "coherent ZR+"]),
];
/**
 * A catalog entry assembled from one search hit, holding the fields the
 * scraper writes to the database.
 */
interface Product {
// Product title as returned by the search.
name: string;
// SKU from the search hit, or a slug derived from the name when no SKU is given.
partNumber: string;
// Absolute product page URL.
url: string;
// Unit price; absent when the hit carried no usable price.
price?: number;
// Currency code for `price`; the scraper only ever sets "EUR".
currency?: string;
// Form factor inferred from the name, else the default of the originating query.
formFactor: string;
// Human-readable speed label produced by `speedLabel` (e.g. "10G").
speed: string;
// Speed in Gbps inferred from the name, else the default of the originating query.
speedGbps: number;
// Reach label / meters from `detectReach`; absent when the name gives no hint.
reachLabel?: string;
reachMeters?: number;
// Result of `detectFiber` on the name ("" when nothing matched).
fiberType?: string;
// Result of `detectWavelength` on the name ("" when nothing matched).
wavelength?: string;
}
/**
 * Returns a promise that settles once a `setTimeout` of `ms` milliseconds fires.
 * Used to pace outgoing requests.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Guesses the optical reach from a product name.
 *
 * Explicit distances ("10 km", "300m") are checked first, longest first; after
 * that the reach is derived from the optic designation (LR, ER, ZR, SR, DR,
 * FR, CWDM4, PSM4). The designation patterns for LR/ER/ZR/SR/DR/FR are
 * case-sensitive.
 *
 * @param text Product name or description.
 * @returns The reach label and distance in meters, or `undefined` when no rule matches.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  // Ordered rules: the first pattern that matches wins.
  const rules: ReadonlyArray<readonly [RegExp, number]> = [
    [/\b120\s*km\b/i, 120000],
    [/\b80\s*km\b/i, 80000],
    [/\b40\s*km\b/i, 40000],
    [/\b20\s*km\b/i, 20000],
    [/\b10\s*km\b/i, 10000],
    [/\b2\s*km\b/i, 2000],
    [/\b500\s*m\b/i, 500],
    [/\b300\s*m\b/i, 300],
    [/\b100\s*m\b/i, 100],
    [/\bLR4\b/, 10000],
    [/\bLR\b/, 10000],
    [/\bER4?\b/, 40000],
    [/\bZR4?\b/, 80000],
    [/\bSR4?\b/, 100],
    [/\bDR4?\b/, 500],
    [/\bFR4?\b/, 2000],
    [/\bCWDM4\b/i, 2000],
    [/\bPSM4\b/i, 500],
  ];

  const hit = rules.find(([pattern]) => pattern.test(text));
  if (hit === undefined) return undefined;

  const meters = hit[1];
  // Every label in the table is just the distance rendered in km (>= 1000 m) or m.
  const label = meters >= 1000 ? `${meters / 1000}km` : `${meters}m`;
  return { label, meters };
}
/**
 * Classifies the medium of a transceiver from its product name.
 *
 * Checks run in priority order: single-mode hints, multi-mode hints, copper,
 * then active optical cable. Short optic codes (LX, LR, ER, ZR, SX, SR) only
 * count when they are not embedded in a longer word, but they may sit at the
 * very start or end of the text (e.g. "SFP+ 10G LR").
 *
 * @param text Product name or description.
 * @returns "SMF", "MMF", "Copper", "AOC", or "" when nothing matches.
 */
function detectFiber(text: string): string {
  // `(?:^|[^a-z])` / `(?:[^a-z]|$)` accept a string edge as a delimiter; a bare
  // `[^a-z]` would demand a real character and miss codes at the edges.
  const singleMode =
    /single.?mode|smf|(?:^|[^a-z])(?:lx|lr|er|zr)(?:[^a-z]|$)|bidi|cwdm|dwdm/i;
  const multiMode = /multi.?mode|mmf|(?:^|[^a-z])(?:sx|sr)(?:[^a-z]|$)/i;

  if (singleMode.test(text)) return "SMF";
  if (multiMode.test(text)) return "MMF";
  if (/copper|dac|twinax|rj45|base-t/i.test(text)) return "Copper";
  if (/aoc|active optical/i.test(text)) return "AOC";
  return "";
}
/**
 * Pulls the first wavelength mentioned as "<3-4 digits> nm" out of a product name.
 *
 * @param text Product name or description.
 * @returns The digits of the first match (without the unit), or "" when none is found.
 */
function detectWavelength(text: string): string {
  const found = /(\d{3,4})\s*nm/i.exec(text);
  return found === null ? "" : found[1];
}
/**
 * Determines the form factor named in a product title.
 *
 * Rules are evaluated top to bottom and the first hit wins, so the more
 * specific names (e.g. "qsfp-dd800", "qsfp28") are listed before the shorter
 * names they contain ("qsfp-dd", "sfp28").
 *
 * @param name Product title.
 * @param defaultFF Form factor to return when the title names none.
 * @returns The detected form factor, or `defaultFF`.
 */
function inferFormFactor(name: string, defaultFF: string): string {
  const haystack = name.toLowerCase();
  const has = (needle: string): boolean => haystack.includes(needle);

  const rules: Array<[() => boolean, string]> = [
    [() => has("osfp224"), "OSFP224"],
    [() => has("osfp112"), "OSFP112"],
    [() => has("osfp") && !has("qsfp"), "OSFP"],
    [() => has("qsfp-dd800"), "QSFP-DD800"],
    [() => has("qsfp-dd"), "QSFP-DD"],
    [() => has("qsfp112"), "QSFP112"],
    [() => has("qsfp56"), "QSFP56"],
    [() => has("qsfp28"), "QSFP28"],
    [() => has("qsfp+") || has("qsfp plus"), "QSFP+"],
    [() => has("sfp56"), "SFP56"],
    [() => has("sfp28"), "SFP28"],
    [() => has("sfp+") || has("sfp plus"), "SFP+"],
    [() => has("cfp2"), "CFP2"],
    [() => has("xfp"), "XFP"],
    // Plain "sfp" as a standalone word, and only when no QSFP variant is mentioned.
    [() => /\bsfp\b/i.test(haystack) && !has("qsfp"), "SFP"],
  ];

  for (const [matches, formFactor] of rules) {
    if (matches()) return formFactor;
  }
  return defaultFF;
}
/** Line rates (in Gbps) that `inferSpeed` is allowed to report. */
const RECOGNIZED_SPEEDS_GBPS = [1600, 800, 400, 200, 100, 50, 40, 25, 10, 1];

/**
 * Determines the line rate named in a product title.
 *
 * Every "<number>G" / "<number>T" token is examined and the highest recognized
 * rate wins (so "1G/10G" yields 10). Compared with a plain `\b25\s*G\b` style
 * match this:
 *  - reads decimals as a whole, so "1.25G" or "4.25G" is no longer mistaken
 *    for 25G (such unrecognized rates fall back to `defaultGbps`);
 *  - accepts the usual suffixes directly after the unit, so "10GBASE-LR",
 *    "100GBASE-LR4", "10GbE" and "10Gbps" are recognized.
 *
 * @param name Product title.
 * @param defaultGbps Rate to return when the title names no recognized rate.
 * @returns The detected rate in Gbps, or `defaultGbps`.
 */
function inferSpeed(name: string, defaultGbps: number): number {
  // (?<!\w)    – token must start at a word boundary (same as the former \b)
  // (?<!\d\.)  – and must not be the fractional part of a decimal ("1.25G")
  // suffix     – optional BASE / b / bps / bit / bE / E glued to the unit
  // (?!\w)     – nothing word-like may follow (rejects "10GHz")
  // Built per call because a /g regex carries lastIndex state.
  const token = /(?<!\w)(?<!\d\.)(\d+(?:\.\d+)?)\s*([GT])(?:BASE|b(?:ps|it|e)?|e)?(?!\w)/gi;

  let best: number | undefined;
  let m: RegExpExecArray | null;
  while ((m = token.exec(name)) !== null) {
    const value = parseFloat(m[1]);
    // Round only the terabit conversion (1.6 * 1000) to absorb float noise.
    const gbps = m[2].toUpperCase() === "T" ? Math.round(value * 1000) : value;
    if (RECOGNIZED_SPEEDS_GBPS.includes(gbps) && (best === undefined || gbps > best)) {
      best = gbps;
    }
  }
  return best ?? defaultGbps;
}
/**
 * Formats a rate in Gbps as a short label: terabit notation ("1.6T") from
 * 1000 Gbps upwards, gigabit notation ("400G") below that.
 */
function speedLabel(gbps: number): string {
  const isTerabit = gbps >= 1000;
  return isTerabit ? `${gbps / 1000}T` : `${gbps}G`;
}
/** One raw hit returned by `searchProducts`, before any spec inference. */
interface SearchResult {
// Product title as reported by the search response.
title: string;
// Product URL as found in the response; the catalog scraper prefixes BASE
// when it does not start with "http".
url: string;
// Price text extracted from the response, if any; the caller parses it with parseFloat.
price?: string;
// SKU from the JSON response, when present.
sku?: string;
}
/**
 * Converts a human-formatted amount ("39,64", "1.234,56", "1,234.56") into a
 * plain decimal string that `parseFloat` reads correctly ("39.64", "1234.56").
 * Returns `undefined` when `raw` is not made up of digits and separators.
 */
function normalizePriceText(raw: string): string | undefined {
  // Drop trailing punctuation picked up from surrounding text ("39.64.").
  const token = raw.trim().replace(/[.,]+$/, "");
  if (!/^[\d.,]*\d$/.test(token)) return undefined;

  const lastDot = token.lastIndexOf(".");
  const lastComma = token.lastIndexOf(",");
  const sepAt = Math.max(lastDot, lastComma);
  if (sepAt < 0) return token; // plain integer

  const sep = token[sepAt];
  const mixed = lastDot >= 0 && lastComma >= 0;
  const occurrences = token.split(sep).length - 1;
  const trailingDigits = token.length - sepAt - 1;
  // The last separator is the decimal mark when both kinds occur, when it is
  // not followed by exactly three digits, or when it is a lone dot (machine
  // format such as "39.640"). Otherwise it is a thousands separator.
  const isDecimal = mixed || trailingDigits !== 3 || (sep === "." && occurrences === 1);

  const intPart = (isDecimal ? token.slice(0, sepAt) : token).replace(/[.,]/g, "");
  const fracPart = isDecimal ? token.slice(sepAt + 1) : "";
  return fracPart ? `${intPart || "0"}.${fracPart}` : intPart;
}

/**
 * Extracts a price from a search item's `price` field, which may be Magento
 * price HTML (data-price-amount="39.64"), text such as "39,64 EUR", or a bare number.
 */
function extractPrice(priceField: unknown): string | undefined {
  if (!priceField) return undefined;
  const s = String(priceField);
  // Machine-readable attribute first (Magento Hyva theme); already dot-decimal.
  const attrMatch = s.match(/data-price-amount="([\d.]+)"/);
  if (attrMatch) return attrMatch[1];
  // Display text with a currency marker, in either order and either locale.
  const textMatch = s.match(/([\d.,]+)\s*(?:EUR|€)/i) ?? s.match(/(?:EUR|€)\s*([\d.,]+)/i);
  if (textMatch) {
    const normalized = normalizePriceText(textMatch[1]);
    if (normalized !== undefined) return normalized;
  }
  // Bare number (possibly locale-formatted); otherwise let parseFloat read a numeric prefix.
  const num = parseFloat(normalizePriceText(s) ?? s);
  if (!isNaN(num) && num > 0) return String(num);
  return undefined;
}

/**
 * Maps one JSON item to a SearchResult; returns `undefined` for anything that
 * is not an object with a non-empty string title (or name) and URL.
 */
function toSearchResult(item: unknown): SearchResult | undefined {
  if (typeof item !== "object" || item === null) return undefined;
  const rec = item as Record<string, unknown>;
  const title = rec.title || rec.name;
  const url = rec.url || rec.product_url;
  if (typeof title !== "string" || typeof url !== "string") return undefined;
  const sku = typeof rec.sku === "string" || typeof rec.sku === "number" ? String(rec.sku) : undefined;
  return { title, url, price: extractPrice(rec.price), sku: sku || undefined };
}

/**
 * Collects results from the JSON shapes the endpoint may use: a top-level
 * array, an object with a `products` array, or an object whose values include
 * arrays of product-like items.
 */
function collectJsonSearchResults(data: unknown): SearchResult[] {
  const results: SearchResult[] = [];
  const addAll = (items: unknown[]): void => {
    for (const item of items) {
      const result = toSearchResult(item);
      if (result) results.push(result);
    }
  };

  if (Array.isArray(data)) {
    addAll(data);
  } else if (typeof data === "object" && data !== null) {
    const rec = data as Record<string, unknown>;
    if (Array.isArray(rec.products)) {
      addAll(rec.products);
    } else {
      for (const value of Object.values(rec)) {
        if (Array.isArray(value)) addAll(value);
      }
    }
  }
  return results;
}

/** Scrapes product links (and a nearby EUR price, if any) out of an HTML response. */
function parseHtmlSearchResults(html: string): SearchResult[] {
  const results: SearchResult[] = [];
  const linkRegex = /href="([^"]*\.html)"[^>]*>([^<]{3,})<\/a>/gi;
  let match: RegExpExecArray | null;
  while ((match = linkRegex.exec(html)) !== null) {
    const href = match[1];
    const title = match[2].trim();
    if (title.length < 5) continue;
    // Heuristic: the price is expected within the 500 characters after the link.
    const context = html.slice(match.index, match.index + 500);
    const priceMatch =
      context.match(/(?:€|EUR)\s*([\d.,]+)/i) || context.match(/([\d.,]+)\s*(?:€|EUR)/i);
    results.push({
      title,
      url: href.startsWith("http") ? href : BASE + href,
      price: priceMatch ? normalizePriceText(priceMatch[1]) : undefined,
    });
  }
  return results;
}

/**
 * Queries the search-suggest endpoint and returns the product hits.
 *
 * The body is parsed as JSON when possible; only when it is not valid JSON is
 * it scanned as HTML. Malformed items inside valid JSON are skipped rather
 * than aborting the whole response. Prices are returned as dot-decimal strings.
 *
 * @param query Search phrase.
 * @returns The hits found; an empty array for a non-2xx response.
 * @throws Whatever `fetch` throws on network failure or on the 15 s timeout.
 */
async function searchProducts(query: string): Promise<SearchResult[]> {
  const url = `${SEARCH_URL}?q=${encodeURIComponent(query)}`;
  const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(15000) });
  if (!resp.ok) return [];
  const text = await resp.text();

  let data: unknown;
  try {
    data = JSON.parse(text);
  } catch {
    // Not JSON — parse as HTML
    return parseHtmlSearchResults(text);
  }
  return collectJsonSearchResults(data);
}
/**
 * Runs the full catalog scrape: issues every query in SEARCH_QUERIES (pausing
 * one second after each), de-duplicates hits by absolute product URL, infers
 * specs from each product name, then writes transceivers and price
 * observations to the database.
 *
 * A failed search or a failed DB write is logged and skipped; it does not
 * abort the run.
 */
export async function scrapeFlexoptixCatalog(): Promise<void> {
  console.log("=== Flexoptix Catalog Scraper Starting ===\n");
  const vendorId = await ensureVendor("Flexoptix", "reseller", "https://www.flexoptix.net", "https://www.flexoptix.net/en/");

  // Caught values are not guaranteed to be Error instances.
  const describeError = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  const allProducts = new Map<string, Product>();
  let priceUpdates = 0;

  for (const sq of SEARCH_QUERIES) {
    console.log(` Searching: "${sq.query}"`);
    try {
      const results = await searchProducts(sq.query);
      let newCount = 0;
      for (const r of results) {
        // Skip non-product results
        if (!r.url || !r.title) continue;
        // Key on the absolute URL so a relative and an absolute link to the
        // same product count as one.
        const url = r.url.startsWith("http") ? r.url : BASE + r.url;
        if (allProducts.has(url)) continue;

        const name = r.title;
        const gbps = inferSpeed(name, sq.speedGbps);
        const reach = detectReach(name);
        const parsedPrice = r.price ? parseFloat(r.price.replace(",", ".")) : undefined;
        // Sanity range; NaN fails both comparisons and is dropped too.
        const price =
          parsedPrice !== undefined && parsedPrice > 0 && parsedPrice < 100000 ? parsedPrice : undefined;

        allProducts.set(url, {
          name,
          partNumber: r.sku || name.replace(/\s+/g, "-").slice(0, 80),
          url,
          price,
          // Only attach a currency to a price that was actually accepted.
          currency: price !== undefined ? "EUR" : undefined,
          formFactor: inferFormFactor(name, sq.formFactor),
          speed: speedLabel(gbps),
          speedGbps: gbps,
          reachLabel: reach?.label,
          reachMeters: reach?.meters,
          fiberType: detectFiber(name),
          wavelength: detectWavelength(name),
        });
        newCount++;
      }
      if (newCount > 0) console.log(` +${newCount} new (${results.length} results)`);
    } catch (err) {
      console.warn(` Search failed: ${describeError(err).slice(0, 60)}`);
    }
    await sleep(1000);
  }

  console.log(`\nTotal unique products: ${allProducts.size}`);
  console.log("Writing to database...\n");

  // Write all products to DB
  for (const product of allProducts.values()) {
    try {
      const txId = await findOrCreateScrapedTransceiver({
        partNumber: product.partNumber,
        vendorId,
        formFactor: product.formFactor,
        speedGbps: product.speedGbps,
        speed: product.speed,
        reachMeters: product.reachMeters,
        reachLabel: product.reachLabel,
        fiberType: product.fiberType,
        wavelengths: product.wavelength,
        category: "DataCenter",
      });
      if (product.price && product.price > 0) {
        const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
        const updated = await upsertPriceObservation({
          transceiverId: txId,
          sourceVendorId: vendorId,
          price: product.price,
          currency: product.currency || "EUR",
          stockLevel: "in_stock",
          url: product.url,
          contentHash: hash,
        });
        if (updated) priceUpdates++;
      }
    } catch (err) {
      console.warn(` DB error: ${describeError(err).slice(0, 80)}`);
    }
  }

  console.log(`\n=== Flexoptix Catalog Complete: ${allProducts.size} products, ${priceUpdates} prices ===`);
}
// CLI entry point: run the scraper when this file is executed directly, then
// close the DB pool. Any failure is reported and the process exits with code 1.
if (require.main === module) {
  void (async () => {
    try {
      await scrapeFlexoptixCatalog();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  })();
}