New scrapers: - GBICS.com (BigCommerce, GBP prices, 10 categories, 78 products) - Juniper HCT (Next.js SSR parser, 475 transceivers with specs/EOL) - SFPcables.com (Magento store, 16 categories, 78 products) - Fluxlight (BigCommerce, 6 pages, 118 products) - Champion ONE (compatible vendor scraper) Scraper fixes: - 10Gtek: rewritten to parse HTML spec tables (152 products) - Flexoptix: fix price extraction from Magento Hyva HTML - Register all scrapers in CLI (--gbics, --juniper, --sfpcables, etc.) Hype Cycle Engine enhancements: - Data-driven enrichment from scraped vendor/price data - Revenue lifecycle prediction (peak year, decline, revenue index) - Regional adoption model (NA, China, APAC, Europe, RoW with lag coefficients) - New API endpoints: /enriched, /lifecycle, /regional/:tech DB growth: 89 → 1,168 transceivers, 0 → 416 prices, 6 vendors Qdrant: 1,162 products embedded with nomic-embed-text Research: Norton-Bass model, standards-to-market timelines, hype signals
242 lines
8.8 KiB
TypeScript
242 lines
8.8 KiB
TypeScript
/**
 * Juniper HCT Scraper — OEM Hardware Compatibility Tool
 *
 * apps.juniper.net/hct — Next.js SSR app with product data embedded in
 * self.__next_f.push() payloads. Transceivers category = 100001.
 * Rich data: modelNumber, partNumber, distance, speedType, formFactor, EOL status.
 * No prices (OEM), but excellent compatibility + spec data.
 */
|
|
import { pool, findOrCreateScrapedTransceiver, ensureVendor } from "../utils/db";
|
|
|
|
/** Root URL of the Juniper Hardware Compatibility Tool web app. */
const BASE = "https://apps.juniper.net/hct";

/** HTTP request headers sent with every page fetch made by this scraper. */
const HEADERS = {
  "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
  Accept: "text/html,application/xhtml+xml",
};

/** HCT categories to scrape; `id` is the numeric category identifier used in the category URL. */
const CATEGORIES = [{ id: 100001, name: "Transceivers" }];
|
|
|
|
/**
 * One transceiver record as extracted from the HCT page payload.
 *
 * All string fields are the raw text found in the payload (empty string when
 * the field was not found); no normalization has been applied yet.
 */
interface JuniperTransceiver {
  // --- identity ---
  /** Value of the payload's "modelNumber" field. */
  modelNumber: string;
  /** Value of "partNumber", falling back to "oldPartNumber", then to the model number. */
  partNumber: string;
  /** Free-text product description. */
  description: string;

  // --- physical / optical characteristics ---
  /** First entry of the payload's "formFactor" array. */
  formFactor: string;
  /** First entry of the payload's "speedType" array. */
  speedType: string;
  /** First entry of the payload's "cableType" array. */
  cableType: string;
  /** Value of the payload's "connectorType" field. */
  connectorType: string;

  // --- reach ---
  /** First entry of the "distance" array, or the max-distance label when that array is absent. */
  distance: string;
  /** Numeric "maxDistanceKm" value, when present. */
  maxDistanceKm?: number;
  /** Value of the payload's "maxDistanceLabel" field. */
  maxDistanceLabel?: string;

  // --- lifecycle ---
  /** Value of the payload's "isModelEol" flag (false when the flag is absent). */
  isModelEol: boolean;
}
|
|
|
|
/**
 * Returns a promise that resolves (with no value) once `ms` milliseconds have elapsed.
 *
 * @param ms Delay in milliseconds, passed straight to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
|
|
|
/**
 * Derives a canonical speed label and numeric rate from free text.
 *
 * The text is searched (case-insensitively) for "<rate>g" substrings, highest
 * rate first, so a string that mentions several rates (e.g. "10G/25G")
 * resolves to the fastest one. "1g" or "1000" (as in 1000BASE-x) map to 1G.
 *
 * @param speedType Raw speed text, or any text that may mention a speed.
 * @returns `speed` is the canonical label ("100G", ...) and `speedGbps` the
 *   rate in Gb/s. When nothing is recognised, `speed` is the input text
 *   (or "Unknown" when it is empty) and `speedGbps` is 0.
 */
function parseSpeedGbps(speedType: string): { speed: string; speedGbps: number } {
  const lower = speedType.toLowerCase();

  // Descending order matters: the first hit wins. 50 is included so that
  // 50G parts (the SFP56 form factor) are not reported with speedGbps 0.
  for (const gbps of [800, 400, 200, 100, 50, 40, 25, 10]) {
    if (lower.includes(`${gbps}g`)) {
      return { speed: `${gbps}G`, speedGbps: gbps };
    }
  }

  // 1G is special-cased because it is also commonly written as "1000BASE-…".
  if (lower.includes("1g") || lower.includes("1000")) {
    return { speed: "1G", speedGbps: 1 };
  }

  return { speed: speedType || "Unknown", speedGbps: 0 };
}
|
|
|
|
/**
 * Maps a raw form-factor string onto a canonical form-factor name.
 *
 * Matching is a case-insensitive substring search evaluated in a fixed order,
 * most specific name first, because many names contain each other
 * ("QSFP56" contains "SFP56", "QSFP28" contains "SFP28", "CFP2" contains "CFP").
 *
 * @param ff Raw form-factor text.
 * @returns The canonical name; the input unchanged when nothing matches;
 *   "SFP" when the input is empty.
 */
function normalizeFormFactor(ff: string): string {
  const upper = ff.toUpperCase().trim();

  // Each rule: substrings to look for -> canonical name. Order is significant.
  const rules: ReadonlyArray<readonly [readonly string[], string]> = [
    [["QSFP-DD", "QSFPDD", "QSFP56-DD"], "QSFP-DD"],
    [["QSFP28"], "QSFP28"],
    // QSFP56 / QSFP112 must be tested before the SFP family, otherwise
    // "QSFP56" is reported as "SFP56" and "QSFP112" as plain "SFP".
    [["QSFP56"], "QSFP56"],
    [["QSFP112"], "QSFP112"],
    // Any remaining QSFP text ("QSFP+", "QSFP", "QSFP 40G", ...) is QSFP+.
    [["QSFP"], "QSFP+"],
    [["OSFP"], "OSFP"],
    [["CFP2"], "CFP2"],
    [["CFP4"], "CFP4"],
    [["CFP8"], "CFP8"],
    [["CFP"], "CFP"],
    [["SFP56"], "SFP56"],
    [["SFP28"], "SFP28"],
    [["SFP+"], "SFP+"],
    [["XFP"], "XFP"],
    [["SFP"], "SFP"],
  ];

  for (const [needles, canonical] of rules) {
    if (needles.some((needle) => upper.includes(needle))) {
      return canonical;
    }
  }

  return ff || "SFP";
}
|
|
|
|
/**
 * Classifies the transmission medium from the cable type and description text.
 *
 * Both inputs are combined and lower-cased, then tested against the patterns
 * below in order; the first pattern that matches decides the result.
 *
 * @param cableType Raw cable-type text.
 * @param description Free-text product description.
 * @returns "SMF", "MMF", "Copper", or "" when no pattern matches.
 */
function detectFiber(cableType: string, description: string): string {
  const haystack = `${cableType} ${description}`.toLowerCase();

  const probes: Array<[RegExp, string]> = [
    [/smf|single.?mode/, "SMF"],
    [/mmf|multi.?mode/, "MMF"],
    [/copper|dac|twinax|cat\s*[56]|rj.?45|base-t/, "Copper"],
  ];

  const hit = probes.find(([pattern]) => pattern.test(haystack));
  return hit ? hit[1] : "";
}
|
|
|
|
/**
 * Parses a reach string such as "40 km" or "300m" into a label and metres.
 *
 * A kilometre value is preferred when the text contains one; otherwise a
 * metre value (a number followed by a standalone "m") is used.
 *
 * @param distance Raw distance text.
 * @returns `label` is the number as written plus its unit ("40km", "2.5m") and
 *   `meters` is the reach rounded to the nearest whole metre; `undefined` when
 *   the text is empty or contains no recognisable distance.
 */
function parseDistance(distance: string): { label: string; meters: number } | undefined {
  if (!distance) return undefined;

  // The number must contain at least one digit, so stray punctuation such as
  // "n.a. km" cannot produce a NaN reach.
  const kmMatch = distance.match(/(\d+(?:\.\d+)?|\.\d+)\s*km/i);
  if (kmMatch) {
    return {
      label: `${kmMatch[1]}km`,
      meters: Math.round(parseFloat(kmMatch[1]) * 1000),
    };
  }

  const mMatch = distance.match(/(\d+(?:\.\d+)?|\.\d+)\s*m\b/i);
  if (mMatch) {
    // Round rather than truncate, matching the km branch: "2.5m" is 3, not 2,
    // and "0.5m" is 1 rather than a zero-length reach.
    return {
      label: `${mMatch[1]}m`,
      meters: Math.round(parseFloat(mMatch[1])),
    };
  }

  return undefined;
}
|
|
|
|
/**
 * Pulls the first wavelength mentioned in a description.
 *
 * Looks for three or four digits followed (optionally after whitespace) by
 * "nm", case-insensitively.
 *
 * @param description Free-text product description.
 * @returns The digits of the first match (e.g. "1310"), or "" when none is found.
 */
function detectWavelength(description: string): string {
  const [, nanometres = ""] = description.match(/(\d{3,4})\s*nm/i) ?? [];
  return nanometres;
}
|
|
|
|
/**
|
|
* Extract transceiver data from Next.js SSR payload.
|
|
* Data is embedded in self.__next_f.push([...]) with escaped JSON (\" not ").
|
|
* Strategy: unescape the HTML, find categoryDetail array, parse each object.
|
|
*/
|
|
/**
 * Extracts transceiver records from the HTML of an HCT category page.
 *
 * The product data is JSON embedded in a JavaScript string, so its quotes
 * arrive escaped (\"). The HTML is unescaped, checked for the
 * `"categoryDetail":[` marker, and then scanned for every `"modelNumber"`
 * field. For each distinct model number the text of the surrounding object is
 * isolated and individual fields are pulled out of it with regular
 * expressions (the payload is never JSON-parsed as a whole).
 *
 * Assumption: within each product object, "modelNumber" appears before any
 * nested object, so the nearest preceding "{" is the start of that product.
 *
 * @param html Raw HTML of the category page.
 * @returns One record per distinct model number, in document order; an empty
 *   array (after logging a warning) when the marker is not present.
 */
function parseNextJsData(html: string): JuniperTransceiver[] {
  const transceivers: JuniperTransceiver[] = [];

  // Upper bound on how much text after an object's opening brace is searched.
  const MAX_CHUNK_LENGTH = 2000;

  // First pass turns \" into ". Quotes that were doubly escaped in the HTML
  // (\\\") are left as \\" by that pass; the second pass turns them into \".
  const unescaped = html.replace(/\\"/g, '"').replace(/\\\\"/g, '\\"');

  // The marker is only a sanity check that the payload is present at all.
  if (!unescaped.includes('"categoryDetail":[')) {
    console.log(" Warning: categoryDetail not found in HTML");
    return transceivers;
  }

  // --- field readers; each searches only the text of one product object ---

  /** Value of a `"field":"value"` pair, or "" when absent. */
  const readString = (chunk: string, field: string): string => {
    const m = chunk.match(new RegExp(`"${field}"\\s*:\\s*"([^"]*)"`, "i"));
    return m ? m[1] : "";
  };

  /**
   * First value of an array field. Handles `"field":["value", ...]` and
   * `"field":[{"key":"value"}, ...]`, then falls back to a plain string field.
   */
  const readArrayFirst = (chunk: string, field: string): string => {
    const plain = chunk.match(new RegExp(`"${field}"\\s*:\\s*\\[\\s*"([^"]*)"`, "i"));
    if (plain) return plain[1];
    const nested = chunk.match(
      new RegExp(`"${field}"\\s*:\\s*\\[\\s*\\{\\s*"\\w+"\\s*:\\s*"([^"]*)"`, "i"),
    );
    if (nested) return nested[1];
    return readString(chunk, field);
  };

  /** Value of a `"field":true|false` pair; false when absent. */
  const readBool = (chunk: string, field: string): boolean => {
    const m = chunk.match(new RegExp(`"${field}"\\s*:\\s*(true|false)`, "i"));
    return m ? m[1] === "true" : false;
  };

  /** Value of a non-negative numeric field, or undefined when absent. */
  const readNumber = (chunk: string, field: string): number | undefined => {
    const m = chunk.match(new RegExp(`"${field}"\\s*:\\s*(\\d+(?:\\.\\d+)?)`, "i"));
    return m ? parseFloat(m[1]) : undefined;
  };

  const modelMatches = [...unescaped.matchAll(/"modelNumber"\s*:\s*"([^"]+)"/g)];
  const seen = new Set<string>();

  for (const [i, match] of modelMatches.entries()) {
    const modelNumber = match[1];
    if (seen.has(modelNumber)) continue;
    seen.add(modelNumber);

    const idx = match.index ?? 0;

    // Start of this product: nearest "{" before the modelNumber key. Clamped
    // to 0 so a missing brace cannot turn into a negative slice offset.
    const objStart = Math.max(0, unescaped.lastIndexOf("{", idx));

    // End of this product: the opening brace of the next product, so a field
    // this product lacks (e.g. isModelEol, maxDistanceKm) is never read from
    // its neighbour. Capped at MAX_CHUNK_LENGTH for the last product.
    let objEnd = objStart + MAX_CHUNK_LENGTH;
    const nextIndex = modelMatches[i + 1]?.index;
    if (nextIndex !== undefined) {
      const nextStart = unescaped.lastIndexOf("{", nextIndex);
      if (nextStart > idx) objEnd = Math.min(objEnd, nextStart);
    }

    const chunk = unescaped.slice(objStart, objEnd);

    // Distance is an array like ["40 km"]; fall back to maxDistanceLabel.
    const distArr = chunk.match(/"distance"\s*:\s*\[\s*"([^"]*)"/i);
    const distance = distArr ? distArr[1] : readString(chunk, "maxDistanceLabel");

    transceivers.push({
      modelNumber,
      partNumber:
        readString(chunk, "partNumber") || readString(chunk, "oldPartNumber") || modelNumber,
      description: readString(chunk, "description"),
      cableType: readArrayFirst(chunk, "cableType"),
      distance,
      speedType: readArrayFirst(chunk, "speedType"),
      formFactor: readArrayFirst(chunk, "formFactor"),
      connectorType: readString(chunk, "connectorType"),
      maxDistanceKm: readNumber(chunk, "maxDistanceKm"),
      maxDistanceLabel: readString(chunk, "maxDistanceLabel"),
      isModelEol: readBool(chunk, "isModelEol"),
    });
  }

  return transceivers;
}
|
|
|
|
/**
 * Downloads a page and returns its body as text.
 *
 * The request carries the scraper's HEADERS and is aborted after 60 seconds.
 *
 * @param url Absolute URL to fetch.
 * @returns The response body.
 * @throws Error with the status code and URL when the response is not OK.
 */
async function fetchPage(url: string): Promise<string> {
  const requestOptions = { headers: HEADERS, signal: AbortSignal.timeout(60000) };
  const response = await fetch(url, requestOptions);

  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
|
|
|
|
/**
 * Scrapes every configured HCT category and stores the transceivers found.
 *
 * For each category the page is fetched, parsed with `parseNextJsData`, and
 * each record is normalized (speed, reach, form factor, fiber, wavelength)
 * and passed to `findOrCreateScrapedTransceiver` under the
 * "Juniper Networks" vendor. A failure on one product or one category is
 * logged and does not stop the run. Progress is written to the console.
 *
 * @returns Resolves when all categories have been processed.
 * @throws Whatever `ensureVendor` throws; that call is not guarded.
 */
export async function scrapeJuniperHct(): Promise<void> {
  console.log("=== Juniper HCT Scraper Starting ===\n");

  const vendorId = await ensureVendor(
    "Juniper Networks",
    "oem",
    "https://www.juniper.net",
    "https://apps.juniper.net/hct/",
  );

  // Anything can be thrown, not just Error instances; reading `.message` off a
  // non-Error inside a catch block would throw again and abort the category.
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  let totalProducts = 0;

  for (const [position, cat] of CATEGORIES.entries()) {
    // Politeness delay between category requests; none is needed before the
    // first request or after the last one.
    if (position > 0) await sleep(2000);

    console.log(`\n--- ${cat.name} (category ${cat.id}) ---`);

    try {
      const html = await fetchPage(`${BASE}/category/${cat.id}`);
      console.log(` Fetched ${(html.length / 1024).toFixed(0)}KB`);

      const transceivers = parseNextJsData(html);
      console.log(` Parsed ${transceivers.length} transceivers`);

      for (const tx of transceivers) {
        try {
          // Fall back to the description when the payload has no speed entry.
          const speedInfo = parseSpeedGbps(tx.speedType || tx.description);

          // Prefer the numeric km figure; otherwise parse the distance text.
          const distInfo = tx.maxDistanceKm
            ? { label: `${tx.maxDistanceKm}km`, meters: Math.round(tx.maxDistanceKm * 1000) }
            : parseDistance(tx.distance);

          await findOrCreateScrapedTransceiver({
            partNumber: tx.modelNumber,
            vendorId,
            formFactor: normalizeFormFactor(tx.formFactor),
            speedGbps: speedInfo.speedGbps,
            speed: speedInfo.speed,
            reachMeters: distInfo?.meters,
            reachLabel: distInfo?.label,
            fiberType: detectFiber(tx.cableType, tx.description),
            wavelengths: detectWavelength(tx.description),
            category: "DataCenter",
          });

          totalProducts++;
        } catch (err: unknown) {
          console.warn(` Error [${tx.modelNumber}]: ${describeError(err).slice(0, 80)}`);
        }
      }
    } catch (err: unknown) {
      console.error(` Category failed: ${describeError(err)}`);
    }
  }

  console.log(`\n=== Juniper HCT Complete: ${totalProducts} transceivers (no prices - OEM) ===`);
}
|
|
|
|
// Run the scraper only when this file is executed directly (not when imported).
if (require.main === module) {
  void (async () => {
    try {
      await scrapeJuniperHct();
      // Close the DB pool on success.
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      // Pool shutdown is started but not awaited before the process exits.
      pool.end();
      process.exit(1);
    }
  })();
}
|