Rene Fichtmueller c6308e93c0 feat: massive scraper expansion + hype cycle engine + lifecycle prediction
New scrapers:
- GBICS.com (BigCommerce, GBP prices, 10 categories, 78 products)
- Juniper HCT (Next.js SSR parser, 475 transceivers with specs/EOL)
- SFPcables.com (Magento store, 16 categories, 78 products)
- Fluxlight (BigCommerce, 6 pages, 118 products)
- Champion ONE (compatible vendor scraper)

Scraper fixes:
- 10Gtek: rewritten to parse HTML spec tables (152 products)
- Flexoptix: fix price extraction from Magento Hyva HTML
- Register all scrapers in CLI (--gbics, --juniper, --sfpcables, etc.)

Hype Cycle Engine enhancements:
- Data-driven enrichment from scraped vendor/price data
- Revenue lifecycle prediction (peak year, decline, revenue index)
- Regional adoption model (NA, China, APAC, Europe, RoW with lag coefficients)
- New API endpoints: /enriched, /lifecycle, /regional/:tech

DB growth: 89 → 1,168 transceivers, 0 → 416 prices, 6 vendors
Qdrant: 1,162 products embedded with nomic-embed-text

Research: Norton-Bass model, standards-to-market timelines, hype signals
2026-03-28 02:30:19 +13:00

242 lines
8.8 KiB
TypeScript

/**
* Juniper HCT Scraper — OEM Hardware Compatibility Tool
*
* apps.juniper.net/hct — Next.js SSR app with product data embedded in
* self.__next_f.push() payloads. Transceivers category = 100001.
* Rich data: modelNumber, partNumber, distance, speedType, formFactor, EOL status.
* No prices (OEM), but excellent compatibility + spec data.
*/
import { pool, findOrCreateScrapedTransceiver, ensureVendor } from "../utils/db";
/** Root URL of the Juniper Hardware Compatibility Tool; category pages live under it. */
const BASE = "https://apps.juniper.net/hct";

/** HTTP headers sent with every page request. */
const HEADERS = {
  "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
  "Accept": "text/html,application/xhtml+xml",
};

/** HCT categories to scrape; `id` is the path segment appended to `${BASE}/category/`. */
const CATEGORIES = [{ id: 100001, name: "Transceivers" }];
/**
* One transceiver record as extracted from the HCT page payload by parseNextJsData.
* All string fields are "" when the corresponding key is not found in the payload.
*/
interface JuniperTransceiver {
/** Value of the "modelNumber" key; also used as the stored part number by scrapeJuniperHct. */
modelNumber: string;
/** "partNumber" if present, otherwise "oldPartNumber", otherwise the model number. */
partNumber: string;
/** Value of the "description" key. */
description: string;
/** First entry of the "cableType" array (or the plain string value). */
cableType: string;
/** First entry of the "distance" array, falling back to "maxDistanceLabel". */
distance: string;
/** First entry of "speedType": a string, or the first string property of the first object. */
speedType: string;
/** First entry of the "formFactor" array (or the plain string value). */
formFactor: string;
/** Value of the "connectorType" key. */
connectorType: string;
/** Numeric "maxDistanceKm" value; undefined when the key is missing. */
maxDistanceKm?: number;
/** Value of the "maxDistanceLabel" key. */
maxDistanceLabel?: string;
/** True only when the payload contains "isModelEol": true; false when absent. */
isModelEol: boolean;
}
/**
 * Pause for the given number of milliseconds.
 *
 * @param ms Delay before the returned promise resolves.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Derive a canonical speed label and numeric rate from free-form text
 * (an HCT speed value or, as a fallback, a product description).
 *
 * Matching is case-insensitive substring search, highest tier first, so a
 * multi-rate string such as "100G/40G" reports the fastest rate. The 50G tier
 * is included so SFP56-class optics are not reported with a rate of 0 (or
 * mistaken for 1G when the text also contains "1000").
 *
 * @param speedType Text to inspect.
 * @returns `{ speed, speedGbps }`; when no tier matches, `speed` is the input
 *          text (or "Unknown" when empty) and `speedGbps` is 0.
 */
function parseSpeedGbps(speedType: string): { speed: string; speedGbps: number } {
  const lower = speedType.toLowerCase();
  // Descending order matters: "100g" must be tested before "10g", "400g" before "40g".
  const tiers = [800, 400, 200, 100, 50, 40, 25, 10];
  for (const gbps of tiers) {
    if (lower.includes(`${gbps}g`)) return { speed: `${gbps}G`, speedGbps: gbps };
  }
  // 1G is also commonly written as 1000BASE-x.
  if (lower.includes("1g") || lower.includes("1000")) return { speed: "1G", speedGbps: 1 };
  return { speed: speedType || "Unknown", speedGbps: 0 };
}
/**
 * Map a free-form form-factor string onto a canonical label.
 *
 * The QSFP family is resolved first and on its own: every QSFP name contains
 * the substring "SFP" (and "QSFP56" contains "SFP56"), so letting those
 * strings reach the SFP checks mislabels them as SFP variants.
 *
 * @param ff Raw form-factor text; matching is case-insensitive.
 * @returns The canonical label, the input unchanged when nothing matches,
 *          or "SFP" when the input is empty.
 */
function normalizeFormFactor(ff: string): string {
  const upper = ff.toUpperCase().trim();
  if (upper.includes("QSFP")) {
    // QSFP-DD, QSFPDD, QSFP56-DD, QSFP28-DD all denote the double-density cage.
    if (/QSFP\d*-?DD/.test(upper)) return "QSFP-DD";
    if (upper.includes("QSFP112")) return "QSFP112";
    if (upper.includes("QSFP56")) return "QSFP56";
    if (upper.includes("QSFP28")) return "QSFP28";
    // QSFP+ and any unqualified QSFP mention.
    return "QSFP+";
  }
  // OSFP and the CFP family must be tested before the generic SFP checks.
  if (upper.includes("OSFP")) return "OSFP";
  if (upper.includes("CFP2")) return "CFP2";
  if (upper.includes("CFP4")) return "CFP4";
  if (upper.includes("CFP")) return "CFP";
  if (upper.includes("SFP56")) return "SFP56";
  if (upper.includes("SFP28")) return "SFP28";
  if (upper.includes("SFP+")) return "SFP+";
  if (upper.includes("XFP")) return "XFP";
  if (upper.includes("SFP")) return "SFP";
  return ff || "SFP";
}
/**
 * Classify the transmission medium from the cable type and description text.
 *
 * The two inputs are joined and lower-cased, then checked against the rules
 * in order; the first rule that matches decides the result.
 *
 * @returns "SMF", "MMF", "Copper", or "" when no rule matches.
 */
function detectFiber(cableType: string, description: string): string {
  const haystack = `${cableType} ${description}`.toLowerCase();
  const rules: Array<[RegExp, string]> = [
    [/smf|single.?mode/, "SMF"],
    [/mmf|multi.?mode/, "MMF"],
    [/copper|dac|twinax|cat\s*[56]|rj.?45|base-t/, "Copper"],
  ];
  for (const [pattern, medium] of rules) {
    if (pattern.test(haystack)) return medium;
  }
  return "";
}
/**
 * Parse a reach string such as "40 km" or "300m" into a label and a length in metres.
 *
 * Kilometres are tried first, then metres. Both branches round the parsed
 * decimal value, so fractional metre reaches (e.g. a 0.5 m cable) are rounded
 * rather than truncated. A numeric token that does not parse (e.g. a lone ".")
 * is treated as no match instead of producing NaN.
 *
 * @param distance Raw distance text; may be empty.
 * @returns `{ label, meters }`, or undefined when no usable distance is found.
 */
function parseDistance(distance: string): { label: string; meters: number } | undefined {
  if (!distance) return undefined;
  const km = distance.match(/([\d.]+)\s*km/i);
  if (km) {
    const meters = Math.round(parseFloat(km[1]) * 1000);
    if (Number.isFinite(meters)) return { label: `${km[1]}km`, meters };
  }
  const m = distance.match(/([\d.]+)\s*m\b/i);
  if (m) {
    const meters = Math.round(parseFloat(m[1]));
    if (Number.isFinite(meters)) return { label: `${m[1]}m`, meters };
  }
  return undefined;
}
/**
 * Pull the first wavelength (3–4 digits followed by "nm", case-insensitive)
 * out of a description.
 *
 * @returns The digits as a string, or "" when the description has no wavelength.
 */
function detectWavelength(description: string): string {
  const [, nanometres = ""] = description.match(/(\d{3,4})\s*nm/i) ?? [];
  return nanometres;
}
/**
 * Find the end of the JSON object that encloses position `from`.
 *
 * Scans forward, skipping string contents (with backslash escapes) and
 * balancing nested braces, and stops at the first unmatched `}`.
 *
 * @param text  Text to scan.
 * @param from  Start position; must be outside any string literal.
 * @param limit Exclusive upper bound for the scan.
 * @returns Index just past the closing brace, or -1 if none is found before `limit`.
 */
function findEnclosingObjectEnd(text: string, from: number, limit: number): number {
  const stop = Math.min(limit, text.length);
  let depth = 0;
  let inString = false;
  for (let i = from; i < stop; i++) {
    const ch = text.charAt(i);
    if (inString) {
      if (ch === "\\") i++; // skip the escaped character
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') inString = true;
    else if (ch === "{") depth++;
    else if (ch === "}") {
      if (depth === 0) return i + 1;
      depth--;
    }
  }
  return -1;
}

/**
 * Extract transceiver data from the Next.js SSR payload.
 *
 * Data is embedded in self.__next_f.push([...]) calls as escaped JSON (\" instead of ").
 * Strategy: unescape the HTML, require that a categoryDetail array is present, then
 * locate every "modelNumber" key and read the other fields with regexes from the text
 * of the object that key sits in. Only the first occurrence of each model number is kept.
 *
 * The text inspected for one record starts at the nearest "{" before the key and ends
 * at the closing brace of the enclosing object (capped at 2000 characters). Stopping at
 * the object boundary keeps a field that is missing from one record from being filled
 * in with the value of the following record.
 * NOTE(review): this assumes the fields are direct siblings of "modelNumber" in one
 * flat object — confirm against a live payload.
 *
 * @param html Raw HTML of an HCT category page.
 * @returns One entry per distinct model number, in document order; empty when the
 *          categoryDetail marker is missing (a warning is logged).
 */
function parseNextJsData(html: string): JuniperTransceiver[] {
  const transceivers: JuniperTransceiver[] = [];
  // Turn \" into "; a quote that was escaped twice ends up as \" inside its string.
  const unescaped = html.replace(/\\"/g, '"').replace(/\\\\"/g, '\\"');
  // The marker is only a sanity check; the model scan below covers the whole payload.
  if (!unescaped.includes('"categoryDetail":[')) {
    console.log(" Warning: categoryDetail not found in HTML");
    return transceivers;
  }
  const maxChunk = 2000;
  const modelRegex = /"modelNumber"\s*:\s*"([^"]+)"/g;
  const seen = new Set<string>();
  let match: RegExpExecArray | null;
  while ((match = modelRegex.exec(unescaped)) !== null) {
    const modelNumber = match[1];
    if (seen.has(modelNumber)) continue;
    seen.add(modelNumber);

    // Delimit the text belonging to this record.
    const idx = match.index;
    const braceIdx = unescaped.lastIndexOf("{", idx);
    const objStart = braceIdx === -1 ? idx : braceIdx;
    const windowEnd = objStart + maxChunk;
    const objEnd = findEnclosingObjectEnd(unescaped, idx, windowEnd);
    // Fall back to the fixed-size window when the object end cannot be located.
    const chunk = unescaped.slice(objStart, objEnd === -1 ? windowEnd : objEnd);

    const getString = (field: string): string => {
      const m = chunk.match(new RegExp(`"${field}"\\s*:\\s*"([^"]*)"`, "i"));
      return m ? m[1] : "";
    };
    // For array fields like cableType:["SMF"], speedType:[{speed:"100G"}], formFactor:["CFP"]
    const getArrayFirst = (field: string): string => {
      // ["value"] pattern
      const arrM = chunk.match(new RegExp(`"${field}"\\s*:\\s*\\[\\s*"([^"]*)"`, "i"));
      if (arrM) return arrM[1];
      // [{key:"value"}] pattern
      const objM = chunk.match(
        new RegExp(`"${field}"\\s*:\\s*\\[\\s*\\{\\s*"\\w+"\\s*:\\s*"([^"]*)"`, "i"),
      );
      if (objM) return objM[1];
      return getString(field);
    };
    const getBool = (field: string): boolean => {
      const m = chunk.match(new RegExp(`"${field}"\\s*:\\s*(true|false)`, "i"));
      return m ? m[1] === "true" : false;
    };
    const getNum = (field: string): number | undefined => {
      const m = chunk.match(new RegExp(`"${field}"\\s*:\\s*(\\d+(?:\\.\\d+)?)`, "i"));
      return m ? parseFloat(m[1]) : undefined;
    };

    // Distance comes from an array like ["40 km"], else from maxDistanceLabel.
    const distArr = chunk.match(/"distance"\s*:\s*\[\s*"([^"]*)"/i);
    const distance = distArr ? distArr[1] : getString("maxDistanceLabel");

    transceivers.push({
      modelNumber,
      partNumber: getString("partNumber") || getString("oldPartNumber") || modelNumber,
      description: getString("description"),
      cableType: getArrayFirst("cableType"),
      distance,
      speedType: getArrayFirst("speedType"),
      formFactor: getArrayFirst("formFactor"),
      connectorType: getString("connectorType"),
      maxDistanceKm: getNum("maxDistanceKm"),
      maxDistanceLabel: getString("maxDistanceLabel"),
      isModelEol: getBool("isModelEol"),
    });
  }
  return transceivers;
}
/**
 * Download a page and return its body as text.
 *
 * The request carries the shared HEADERS and is aborted after 60 seconds.
 *
 * @param url Absolute URL to fetch.
 * @returns The response body.
 * @throws Error with the HTTP status and URL when the response is not ok.
 */
async function fetchPage(url: string): Promise<string> {
  const timeoutSignal = AbortSignal.timeout(60000);
  const response = await fetch(url, { headers: HEADERS, signal: timeoutSignal });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
/**
 * Scrape every configured HCT category and hand each parsed transceiver to
 * findOrCreateScrapedTransceiver under the "Juniper Networks" vendor.
 *
 * Failures are best-effort: an error on one transceiver or one category is
 * logged and the run continues. A 2-second pause separates consecutive
 * category requests; no pause follows the last one.
 *
 * @returns Resolves once all categories have been processed; progress and the
 *          final count are written to the console.
 */
export async function scrapeJuniperHct(): Promise<void> {
  // A thrown value is not necessarily an Error, so do not assume it has a message.
  const messageOf = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  console.log("=== Juniper HCT Scraper Starting ===\n");
  const vendorId = await ensureVendor("Juniper Networks", "oem", "https://www.juniper.net", "https://apps.juniper.net/hct/");
  let totalProducts = 0;
  for (const [position, cat] of CATEGORIES.entries()) {
    console.log(`\n--- ${cat.name} (category ${cat.id}) ---`);
    try {
      const html = await fetchPage(`${BASE}/category/${cat.id}`);
      console.log(` Fetched ${(html.length / 1024).toFixed(0)}KB`);
      const transceivers = parseNextJsData(html);
      console.log(` Parsed ${transceivers.length} transceivers`);
      for (const tx of transceivers) {
        try {
          // Fall back to the description when the payload carries no speed entry.
          const speedInfo = parseSpeedGbps(tx.speedType || tx.description);
          // Prefer the numeric km value; a missing or zero value defers to the text form.
          const distInfo = tx.maxDistanceKm
            ? { label: `${tx.maxDistanceKm}km`, meters: Math.round(tx.maxDistanceKm * 1000) }
            : parseDistance(tx.distance);
          const formFactor = normalizeFormFactor(tx.formFactor);
          await findOrCreateScrapedTransceiver({
            partNumber: tx.modelNumber, vendorId,
            formFactor, speedGbps: speedInfo.speedGbps,
            speed: speedInfo.speed, reachMeters: distInfo?.meters,
            reachLabel: distInfo?.label,
            fiberType: detectFiber(tx.cableType, tx.description),
            wavelengths: detectWavelength(tx.description),
            category: "DataCenter",
          });
          totalProducts++;
        } catch (err) {
          console.warn(` Error [${tx.modelNumber}]: ${messageOf(err).slice(0, 80)}`);
        }
      }
    } catch (err) {
      console.error(` Category failed: ${messageOf(err)}`);
    }
    // Be polite between requests, but do not stall after the final category.
    if (position < CATEGORIES.length - 1) await sleep(2000);
  }
  console.log(`\n=== Juniper HCT Complete: ${totalProducts} transceivers (no prices - OEM) ===`);
}
// CLI entry point: when this file is executed directly, run the scraper, then
// close the DB pool. On any failure, log it, release the pool, and exit with status 1.
if (require.main === module) {
  void (async () => {
    try {
      await scrapeJuniperHct();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  })();
}