New scrapers: - GBICS.com (BigCommerce, GBP prices, 10 categories, 78 products) - Juniper HCT (Next.js SSR parser, 475 transceivers with specs/EOL) - SFPcables.com (Magento store, 16 categories, 78 products) - Fluxlight (BigCommerce, 6 pages, 118 products) - Champion ONE (compatible vendor scraper) Scraper fixes: - 10Gtek: rewritten to parse HTML spec tables (152 products) - Flexoptix: fix price extraction from Magento Hyva HTML - Register all scrapers in CLI (--gbics, --juniper, --sfpcables, etc.) Hype Cycle Engine enhancements: - Data-driven enrichment from scraped vendor/price data - Revenue lifecycle prediction (peak year, decline, revenue index) - Regional adoption model (NA, China, APAC, Europe, RoW with lag coefficients) - New API endpoints: /enriched, /lifecycle, /regional/:tech DB growth: 89 → 1,168 transceivers, 0 → 416 prices, 6 vendors Qdrant: 1,162 products embedded with nomic-embed-text Research: Norton-Bass model, standards-to-market timelines, hype signals
387 lines
15 KiB
TypeScript
387 lines
15 KiB
TypeScript
/**
 * Flexoptix Product Catalog Scraper
 *
 * Scrapes the flexoptix.net product catalog for transceiver specs and pricing.
 * This is our own data, so no scraping restrictions apply.
 *
 * Strategy: use the Magento search/suggest AJAX API, which returns JSON
 * with product names, URLs, prices, and SKUs. We query by form-factor
 * keywords to enumerate the full catalog.
 *
 * Rate limited: 1 req/sec.
 */
|
|
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
|
import { contentHash } from "../utils/hash";
|
|
|
|
/** Origin of the Flexoptix shop; also prefixed onto relative product URLs. */
const BASE = "https://www.flexoptix.net";

/** Search-suggest endpoint; the search term is appended as the `q` query parameter. */
const SEARCH_URL = `${BASE}/en/search/ajax/suggest/`;

/** Headers sent with every search request. */
const HEADERS = {
  "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; internal-flexoptix)",
  Accept: "application/json, text/html",
};
|
|
|
|
/** One catalog search plus the defaults used when a result title does not state them. */
interface SearchQuery {
  query: string;
  formFactor: string;
  speed: string;
  speedGbps: number;
}

/** Search terms that share the same default form factor and speed. */
interface SearchQueryGroup {
  formFactor: string;
  speed: string;
  speedGbps: number;
  queries: readonly string[];
}

// Groups are listed in the order the searches are issued. Results are
// de-duplicated by URL downstream, so the first query that returns a product
// decides which defaults apply to it.
const SEARCH_QUERY_GROUPS: readonly SearchQueryGroup[] = [
  // By form factor
  {
    formFactor: "SFP",
    speed: "1G",
    speedGbps: 1,
    queries: ["SFP 1G", "SFP BiDi", "SFP CWDM", "SFP DWDM", "SFP copper"],
  },
  {
    formFactor: "SFP+",
    speed: "10G",
    speedGbps: 10,
    queries: [
      "SFP+ 10G",
      "SFP+ SR",
      "SFP+ LR",
      "SFP+ ER",
      "SFP+ ZR",
      "SFP+ BiDi",
      "SFP+ CWDM",
      "SFP+ DWDM",
      "SFP+ DAC",
      "SFP+ AOC",
    ],
  },
  {
    formFactor: "SFP28",
    speed: "25G",
    speedGbps: 25,
    queries: ["25G SFP28", "SFP28 SR", "SFP28 LR", "SFP28 DWDM", "SFP28 DAC", "SFP28 AOC"],
  },
  {
    formFactor: "QSFP+",
    speed: "40G",
    speedGbps: 40,
    queries: ["QSFP+ 40G", "QSFP+ SR4", "QSFP+ LR4", "QSFP+ DAC", "QSFP+ AOC"],
  },
  {
    formFactor: "QSFP28",
    speed: "100G",
    speedGbps: 100,
    queries: [
      "QSFP28 100G",
      "QSFP28 SR4",
      "QSFP28 LR4",
      "QSFP28 ER4",
      "QSFP28 CWDM4",
      "QSFP28 PSM4",
      "QSFP28 DAC",
      "QSFP28 AOC",
    ],
  },
  { formFactor: "QSFP56", speed: "200G", speedGbps: 200, queries: ["QSFP56 200G"] },
  {
    formFactor: "QSFP-DD",
    speed: "400G",
    speedGbps: 400,
    queries: ["QSFP-DD 400G", "QSFP-DD DR4", "QSFP-DD FR4", "QSFP-DD LR4", "QSFP-DD SR4", "QSFP-DD ZR"],
  },
  { formFactor: "QSFP-DD800", speed: "800G", speedGbps: 800, queries: ["QSFP-DD800 800G"] },
  {
    formFactor: "OSFP",
    speed: "400G",
    speedGbps: 400,
    queries: ["OSFP 400G", "OSFP SR4", "OSFP DR4", "OSFP FR4", "OSFP LR4", "OSFP ZR"],
  },
  { formFactor: "OSFP", speed: "800G", speedGbps: 800, queries: ["OSFP 800G"] },
  // Generic searches to catch stragglers
  {
    formFactor: "SFP+",
    speed: "10G",
    speedGbps: 10,
    queries: ["transceiver SR", "transceiver LR", "transceiver ER", "transceiver ZR"],
  },
  { formFactor: "SFP", speed: "1G", speedGbps: 1, queries: ["transceiver BiDi"] },
  {
    formFactor: "QSFP-DD",
    speed: "400G",
    speedGbps: 400,
    queries: ["coherent 400ZR", "coherent ZR+"],
  },
];

// Search queries that cover the full transceiver catalog
const SEARCH_QUERIES: SearchQuery[] = SEARCH_QUERY_GROUPS.flatMap(({ queries, ...defaults }) =>
  queries.map((query) => ({ query, ...defaults })),
);
|
|
|
|
/** A catalog entry assembled from one search result, ready to be written to the database. */
interface Product {
  /** Product title as returned by the search. */
  name: string;
  /** SKU when the search result carries one, otherwise a slug derived from the title. */
  partNumber: string;
  /** Absolute product page URL. */
  url: string;
  /** Unit price; omitted when no plausible price was found. */
  price?: number;
  /** Currency code accompanying `price`. */
  currency?: string;
  formFactor: string;
  /** Human-readable speed label such as "10G" or "1.6T". */
  speed: string;
  speedGbps: number;
  /** Reach label such as "10km" or "100m". */
  reachLabel?: string;
  reachMeters?: number;
  fiberType?: string;
  /** Wavelength digits taken from the title (the number preceding "nm"). */
  wavelength?: string;
}
|
|
|
|
/**
 * Returns a promise that resolves once a `setTimeout` of `ms` milliseconds fires.
 * Used to pace consecutive requests.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
|
|
|
/**
 * Ordered reach rules; the first matching rule wins.
 *
 * Explicit distances come first so a stated "10km" beats a designator-based
 * guess. Designator rules are case-sensitive so that ordinary words containing
 * "er" or "sr" are not picked up. They accept a non-letter before the
 * designator (so "400ZR" matches, which `\bZR` cannot) and an optional
 * lane-count suffix (so "DR8" or "SR8" match as well as "DR4" or "SR4").
 */
const REACH_PATTERNS: ReadonlyArray<readonly [RegExp, string, number]> = [
  [/\b120\s*km\b/i, "120km", 120000],
  [/\b80\s*km\b/i, "80km", 80000],
  [/\b40\s*km\b/i, "40km", 40000],
  [/\b20\s*km\b/i, "20km", 20000],
  [/\b10\s*km\b/i, "10km", 10000],
  [/\b2\s*km\b/i, "2km", 2000],
  [/\b500\s*m\b/i, "500m", 500],
  [/\b300\s*m\b/i, "300m", 300],
  [/\b100\s*m\b/i, "100m", 100],
  [/(?:^|[^A-Za-z])LR\d{0,2}\b/, "10km", 10000],
  [/(?:^|[^A-Za-z])ER\d{0,2}\b/, "40km", 40000],
  [/(?:^|[^A-Za-z])ZR\d{0,2}\b/, "80km", 80000],
  [/(?:^|[^A-Za-z])SR\d{0,2}\b/, "100m", 100],
  [/(?:^|[^A-Za-z])DR\d{0,2}\b/, "500m", 500],
  [/(?:^|[^A-Za-z])FR\d{0,2}\b/, "2km", 2000],
  [/\bCWDM4\b/i, "2km", 2000],
  [/\bPSM4\b/i, "500m", 500],
];

/**
 * Derives the optical reach from a product title.
 *
 * @param text Product title or description.
 * @returns The reach label and its length in meters, or `undefined` when the
 *   text contains neither a known distance nor a known optic designator.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  for (const [pattern, label, meters] of REACH_PATTERNS) {
    if (pattern.test(text)) return { label, meters };
  }
  return undefined;
}
|
|
|
|
/**
 * Classifies the transmission medium from a product title.
 *
 * Checks run in a fixed order (single-mode, multi-mode, copper, AOC) and the
 * first hit wins. Two-letter optic codes (LX/LR/ER/ZR, SX/SR) must not be
 * embedded in a longer word, but may sit at the very start or end of the text.
 *
 * @param text Product title or description.
 * @returns "SMF", "MMF", "Copper", "AOC", or "" when nothing matches.
 */
function detectFiber(text: string): string {
  const singleMode = /single.?mode|smf|(?:^|[^a-z])(?:lx|lr|er|zr)(?:[^a-z]|$)|bidi|cwdm|dwdm/i;
  const multiMode = /multi.?mode|mmf|(?:^|[^a-z])(?:sx|sr)(?:[^a-z]|$)/i;

  if (singleMode.test(text)) return "SMF";
  if (multiMode.test(text)) return "MMF";
  if (/copper|dac|twinax|rj45|base-t/i.test(text)) return "Copper";
  if (/aoc|active optical/i.test(text)) return "AOC";
  return "";
}
|
|
|
|
/**
 * Extracts the first wavelength mentioned in a product title.
 *
 * @param text Product title or description.
 * @returns The 3–4 digits directly preceding "nm" (e.g. "1310"), or "" when
 *   no wavelength is present.
 */
function detectWavelength(text: string): string {
  return /(\d{3,4})\s*nm/i.exec(text)?.[1] ?? "";
}
|
|
|
|
/**
 * Ordered form-factor rules evaluated against the lower-cased title; the first
 * matching rule wins. More specific names come before their prefixes
 * (e.g. "qsfp-dd800" before "qsfp-dd", "sfp28" before plain "sfp").
 */
const FORM_FACTOR_RULES: ReadonlyArray<readonly [(lower: string) => boolean, string]> = [
  [(s) => s.includes("osfp224"), "OSFP224"],
  [(s) => s.includes("osfp112"), "OSFP112"],
  // A title naming both OSFP and QSFP falls through to the QSFP rules.
  [(s) => s.includes("osfp") && !s.includes("qsfp"), "OSFP"],
  [(s) => s.includes("qsfp-dd800"), "QSFP-DD800"],
  [(s) => s.includes("qsfp-dd"), "QSFP-DD"],
  [(s) => s.includes("qsfp112"), "QSFP112"],
  [(s) => s.includes("qsfp56"), "QSFP56"],
  [(s) => s.includes("qsfp28"), "QSFP28"],
  [(s) => s.includes("qsfp+") || s.includes("qsfp plus"), "QSFP+"],
  [(s) => s.includes("sfp56"), "SFP56"],
  [(s) => s.includes("sfp28"), "SFP28"],
  [(s) => s.includes("sfp+") || s.includes("sfp plus"), "SFP+"],
  [(s) => s.includes("cfp2"), "CFP2"],
  [(s) => s.includes("xfp"), "XFP"],
  [(s) => /\bsfp\b/.test(s) && !s.includes("qsfp"), "SFP"],
];

/**
 * Infers the transceiver form factor from a product title.
 *
 * @param name Product title.
 * @param defaultFF Form factor returned when the title names none of the known ones.
 * @returns The canonical form-factor label, or `defaultFF`.
 */
function inferFormFactor(name: string, defaultFF: string): string {
  const lower = name.toLowerCase();
  for (const [matches, formFactor] of FORM_FACTOR_RULES) {
    if (matches(lower)) return formFactor;
  }
  return defaultFF;
}
|
|
|
|
/**
 * Builds a case-insensitive matcher for one line rate.
 *
 * The number must not continue a longer decimal ("1.25G" is not 25G), and the
 * unit letter may be followed by a common rate suffix ("100GBASE-LR4",
 * "10Gbps", "25GbE", "10GE", "1Gig") rather than only by a word boundary.
 */
function speedPattern(amount: string, unit: "G" | "T"): RegExp {
  return new RegExp(
    `(?<![\\d.])\\b${amount}\\s*${unit}(?=\\b|base|b(?:ps|it|e)?\\b|e\\b|ig)`,
    "i",
  );
}

/** Known line rates, fastest first, so a multi-rate title resolves to its highest rate. */
const SPEED_PATTERNS: ReadonlyArray<readonly [RegExp, number]> = [
  [speedPattern("1\\.6", "T"), 1600],
  [speedPattern("800", "G"), 800],
  [speedPattern("400", "G"), 400],
  [speedPattern("200", "G"), 200],
  [speedPattern("100", "G"), 100],
  [speedPattern("50", "G"), 50],
  [speedPattern("40", "G"), 40],
  [speedPattern("25", "G"), 25],
  [speedPattern("10", "G"), 10],
  [speedPattern("1", "G"), 1],
];

/**
 * Infers the line rate in Gbit/s from a product title.
 *
 * @param name Product title.
 * @param defaultGbps Rate returned when the title states none of the known rates.
 * @returns The highest known rate mentioned in `name`, or `defaultGbps`.
 */
function inferSpeed(name: string, defaultGbps: number): number {
  for (const [pattern, gbps] of SPEED_PATTERNS) {
    if (pattern.test(name)) return gbps;
  }
  return defaultGbps;
}
|
|
|
|
/**
 * Formats a rate in Gbit/s as a short label.
 *
 * @param gbps Line rate in Gbit/s.
 * @returns `"<n>T"` for rates of 1000 and above (value divided by 1000), otherwise `"<n>G"`.
 */
function speedLabel(gbps: number): string {
  return gbps >= 1000 ? `${gbps / 1000}T` : `${gbps}G`;
}
|
|
|
|
/** One product hit extracted from a search response. */
interface SearchResult {
  /** Product title. */
  title: string;
  /** Product page URL; may be relative to the shop origin. */
  url: string;
  /** Price as numeric text, still unparsed. */
  price?: string;
  sku?: string;
}
|
|
|
|
/**
 * Runs one search against the shop's suggest endpoint and returns the product hits.
 *
 * The response body is parsed as JSON when possible; otherwise it is scanned as
 * HTML for product links. The request is aborted after 15 seconds.
 *
 * @param query Search term.
 * @returns The hits found; an empty array when the server answers with a non-2xx status.
 * @throws Whatever `fetch` rejects with (network failure, timeout).
 */
async function searchProducts(query: string): Promise<SearchResult[]> {
  const url = `${SEARCH_URL}?q=${encodeURIComponent(query)}`;
  const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(15000) });
  if (!resp.ok) {
    // Best-effort: a failed query must not stop the crawl, but should be visible.
    console.warn(`    Search returned HTTP ${resp.status} for "${query}"`);
    return [];
  }

  const text = await resp.text();

  // Only the JSON.parse call is guarded, so an unexpected payload shape
  // cannot be mistaken for "not JSON" and re-parsed as HTML.
  let data: unknown;
  try {
    data = JSON.parse(text);
  } catch {
    return parseHtmlResults(text);
  }
  return parseJsonResults(data);
}

/** True for non-null, non-array objects. */
function isJsonObject(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/** Returns the first argument that is a non-empty string. */
function firstNonEmptyString(...candidates: unknown[]): string | undefined {
  for (const candidate of candidates) {
    if (typeof candidate === "string" && candidate !== "") return candidate;
  }
  return undefined;
}

/**
 * Rewrites a price such as "1.234,56", "1,234.56" or "39,64" with "." as the
 * decimal mark and no grouping separators. When both marks occur, the one
 * appearing last is taken to be the decimal mark.
 */
function normalizePriceText(raw: string): string {
  // Drop separators captured from surrounding punctuation ("39,64." at a sentence end).
  const trimmed = raw.replace(/^[.,]+|[.,]+$/g, "");
  const lastComma = trimmed.lastIndexOf(",");
  const lastDot = trimmed.lastIndexOf(".");
  if (lastComma !== -1 && lastDot !== -1) {
    return lastComma > lastDot
      ? trimmed.replace(/\./g, "").replace(",", ".")
      : trimmed.replace(/,/g, "");
  }
  return trimmed.replace(",", ".");
}

/** Finds an amount written next to "€" or "EUR", in either order. */
function findEuroAmount(text: string): string | undefined {
  const match =
    /(?:€|EUR)\s*([\d.,]+)/i.exec(text) ?? /([\d.,]+)\s*(?:€|EUR)/i.exec(text);
  const raw = match?.[1];
  return raw === undefined ? undefined : normalizePriceText(raw);
}

/** Extracts a price from Magento price HTML (data-price-amount="39.64"), euro-labelled text, or a plain number. */
function extractPrice(priceField: unknown): string | undefined {
  if (!priceField) return undefined;
  const s = String(priceField);

  // data-price-amount attribute first (Magento Hyva theme)
  const attrAmount = /data-price-amount="([\d.]+)"/.exec(s)?.[1];
  if (attrAmount !== undefined) return attrAmount;

  // Price text like "39.64 EUR"
  const euroAmount = findEuroAmount(s);
  if (euroAmount !== undefined) return euroAmount;

  // Bare number
  const num = parseFloat(s);
  return num > 0 ? String(num) : undefined;
}

/** Maps one JSON item to a search hit; items lacking a title or URL yield `undefined`. */
function toSearchResult(item: unknown): SearchResult | undefined {
  if (!isJsonObject(item)) return undefined;
  const title = firstNonEmptyString(item.title, item.name);
  const url = firstNonEmptyString(item.url, item.product_url);
  if (title === undefined || url === undefined) return undefined;

  const rawSku = item.sku;
  const sku = typeof rawSku === "string" || typeof rawSku === "number" ? String(rawSku) : undefined;
  return { title, url, price: extractPrice(item.price), sku };
}

/**
 * Collects hits from the JSON shapes handled here: a top-level array, an
 * object with a `products` array, or an object whose values include arrays of
 * product-like items.
 */
function parseJsonResults(data: unknown): SearchResult[] {
  const results: SearchResult[] = [];
  const collect = (items: readonly unknown[]): void => {
    for (const item of items) {
      const hit = toSearchResult(item);
      if (hit) results.push(hit);
    }
  };

  if (Array.isArray(data)) {
    collect(data);
  } else if (isJsonObject(data)) {
    if (Array.isArray(data.products)) {
      collect(data.products);
    } else {
      for (const value of Object.values(data)) {
        if (Array.isArray(value)) collect(value);
      }
    }
  }
  return results;
}

/** Resolves an href against the search page URL; absolute http(s) links pass through unchanged. */
function absolutizeUrl(href: string): string {
  if (href.startsWith("http")) return href;
  try {
    return new URL(href, SEARCH_URL).href;
  } catch {
    return BASE + href;
  }
}

/** Scans an HTML response for links to `.html` product pages and a nearby euro price. */
function parseHtmlResults(html: string): SearchResult[] {
  const results: SearchResult[] = [];
  const linkRegex = /href="([^"]*\.html)"[^>]*>([^<]{3,})<\/a>/gi;

  for (const match of html.matchAll(linkRegex)) {
    const title = (match[2] ?? "").trim();
    if (title.length < 5) continue;

    // Look for a price within the 500 characters starting at the link.
    const start = match.index ?? 0;
    const context = html.slice(start, start + 500);

    results.push({
      title,
      url: absolutizeUrl(match[1] ?? ""),
      price: findEuroAmount(context),
    });
  }
  return results;
}
|
|
|
|
/** Renders any thrown value as text, so logging cannot itself throw on a non-Error value. */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Crawls the Flexoptix catalog and stores the results.
 *
 * Runs every entry of `SEARCH_QUERIES` with a one-second pause after each,
 * de-duplicates hits by absolute product URL, derives specs from each title,
 * then writes one transceiver row per product plus a price observation when a
 * plausible price was found. A failure of a single search or a single DB write
 * is logged and skipped.
 *
 * @throws If the vendor record cannot be ensured.
 */
export async function scrapeFlexoptixCatalog(): Promise<void> {
  console.log("=== Flexoptix Catalog Scraper Starting ===\n");

  const vendorId = await ensureVendor("Flexoptix", "reseller", "https://www.flexoptix.net", "https://www.flexoptix.net/en/");

  const allProducts = new Map<string, Product>();
  let priceUpdates = 0;

  for (const sq of SEARCH_QUERIES) {
    console.log(`  Searching: "${sq.query}"`);

    try {
      const results = await searchProducts(sq.query);
      let newCount = 0;

      for (const r of results) {
        // Skip non-product results
        if (!r.url || !r.title) continue;

        // Key on the absolute URL so relative and absolute forms of the same
        // product page are recognised as one product.
        const url = r.url.startsWith("http") ? r.url : BASE + r.url;
        if (allProducts.has(url)) continue;

        const name = r.title;
        const gbps = inferSpeed(name, sq.speedGbps);
        const reach = detectReach(name);

        // Accept only prices in (0, 100000); anything else is treated as "no price".
        const parsedPrice = r.price ? parseFloat(r.price.replace(",", ".")) : NaN;
        const price = parsedPrice > 0 && parsedPrice < 100000 ? parsedPrice : undefined;

        allProducts.set(url, {
          name,
          partNumber: r.sku || name.replace(/\s+/g, "-").slice(0, 80),
          url,
          price,
          // Currency is only meaningful alongside an accepted price.
          currency: price !== undefined ? "EUR" : undefined,
          formFactor: inferFormFactor(name, sq.formFactor),
          speed: speedLabel(gbps),
          speedGbps: gbps,
          reachLabel: reach?.label,
          reachMeters: reach?.meters,
          fiberType: detectFiber(name),
          wavelength: detectWavelength(name),
        });
        newCount++;
      }

      if (newCount > 0) console.log(`    +${newCount} new (${results.length} results)`);
    } catch (err) {
      console.warn(`    Search failed: ${describeError(err).slice(0, 60)}`);
    }

    await sleep(1000);
  }

  console.log(`\nTotal unique products: ${allProducts.size}`);
  console.log("Writing to database...\n");

  // Write all products to DB
  for (const product of allProducts.values()) {
    try {
      const txId = await findOrCreateScrapedTransceiver({
        partNumber: product.partNumber,
        vendorId,
        formFactor: product.formFactor,
        speedGbps: product.speedGbps,
        speed: product.speed,
        reachMeters: product.reachMeters,
        reachLabel: product.reachLabel,
        fiberType: product.fiberType,
        wavelengths: product.wavelength,
        category: "DataCenter",
      });

      if (product.price && product.price > 0) {
        const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
        const updated = await upsertPriceObservation({
          transceiverId: txId,
          sourceVendorId: vendorId,
          price: product.price,
          currency: product.currency || "EUR",
          stockLevel: "in_stock",
          url: product.url,
          contentHash: hash,
        });
        if (updated) priceUpdates++;
      }
    } catch (err) {
      console.warn(`  DB error: ${describeError(err).slice(0, 80)}`);
    }
  }

  console.log(`\n=== Flexoptix Catalog Complete: ${allProducts.size} products, ${priceUpdates} prices ===`);
}
|
|
|
|
// CLI entry point: run the scraper when this file is executed directly.
if (require.main === module) {
  void (async () => {
    let failed = false;
    try {
      await scrapeFlexoptixCatalog();
    } catch (err) {
      console.error("Fatal:", err);
      failed = true;
    }

    // Close the pool exactly once and wait for it to finish before exiting.
    try {
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      failed = true;
    }

    if (failed) process.exit(1);
  })();
}
|