- Add utils/crawlee-config.ts: makeCrawleeConfig(name) returns a Crawlee Configuration with isolated localDataDirectory per scraper. Uses storageClientOptions (not global CRAWLEE_STORAGE_DIR) so concurrent pg-boss workers in the same process don't race on the shared env var. - Apply makeCrawleeConfig to all 6 Crawlee-based scrapers: optcore (PlaywrightCrawler), atgbics (PlaywrightCrawler), community-issues (CheerioCrawler + RequestQueue), edgecore (CheerioCrawler), ufispace (CheerioCrawler), market-intelligence (CheerioCrawler). - scheduler.ts: add withIsolatedStorage for optcore and market-intel workers (was missing, caused storage-fs path bleed from fs scraper). - ebay-enricher.ts: fix vendor type 'marketplace' -> 'reseller' to satisfy vendors_type_check constraint ['manufacturer','distributor','oem','reseller','compatible'].
200 lines
7.0 KiB
TypeScript
200 lines
7.0 KiB
TypeScript
/**
 * UfiSpace Product Catalog Scraper
 *
 * Scrapes switch product pages from ufispace.com for specs and compatibility.
 * UfiSpace publishes clean, well-structured product pages.
 *
 * Source: https://www.ufispace.com/products/datacenter-switches
 */
|
|
import { CheerioCrawler } from "crawlee";
|
|
import { makeCrawleeConfig } from "../utils/crawlee-config";
|
|
import { pool, ensureWhiteboxVendor, findOrCreateSwitch } from "../utils/db";
|
|
|
|
/** Origin of the UfiSpace website; every catalog URL below is built from it. */
const BASE_URL = "https://www.ufispace.com";

/** Catalog listing pages used as the starting points of the crawl. */
const PRODUCT_URLS = [
  `${BASE_URL}/products/datacenter-switches`,
  `${BASE_URL}/networking-white-box`,
];
|
|
|
|
/**
 * Extracts port groups such as "32 x 100GbE QSFP28" from free-form spec text.
 *
 * Every occurrence of `<count> x <speed>G[suffix] [form factor]` is counted;
 * repeated mentions of the same group are summed, not de-duplicated.
 *
 * @param specText - Raw text to scan (e.g. the text content of a product page).
 * @returns
 *  - `portsConfig`: port count per `"<speed>G_<FORM FACTOR>"` key; when no form
 *    factor follows the speed, the key is `"<speed>G_<speed>G"`.
 *  - `totalPorts`: sum of all counts found.
 *  - `maxSpeedGbps`: highest per-port speed found, or 0 when nothing matched.
 *  - `formFactors`: distinct upper-cased form factors in order of first appearance.
 */
function extractPortsFromSpec(specText: string): {
  portsConfig: Record<string, number>;
  totalPorts: number;
  maxSpeedGbps: number;
  formFactors: string[];
} {
  // The optional suffix after "G" covers the common spellings GbE, Gbps, Gb/s,
  // Gb and GE, so the form factor that follows is still captured for all of them.
  const portPattern =
    /(\d+)\s*x\s*(\d+)\s*G(?:bE|bps|b\/s|b|E)?\s*(QSFP-DD|QSFP28|QSFP\+|QSFP56|SFP28|SFP\+|SFP56|OSFP|CFP2)?/gi;

  const portsConfig: Record<string, number> = {};
  const seenFormFactors = new Set<string>();
  let totalPorts = 0;
  let maxSpeedGbps = 0;

  for (const match of specText.matchAll(portPattern)) {
    const count = Number.parseInt(match[1] ?? "", 10);
    const speed = Number.parseInt(match[2] ?? "", 10);
    const formFactor = match[3]?.toUpperCase();

    // Without an explicit form factor the speed label doubles as the suffix.
    const suffix = formFactor ?? `${speed}G`;
    const key = `${speed}G_${suffix}`;

    portsConfig[key] = (portsConfig[key] ?? 0) + count;
    totalPorts += count;
    maxSpeedGbps = Math.max(maxSpeedGbps, speed);

    if (formFactor !== undefined) {
      seenFormFactors.add(formFactor);
    }
  }

  return { portsConfig, totalPorts, maxSpeedGbps, formFactors: [...seenFormFactors] };
}
|
|
|
|
/**
 * ASIC detection table, checked in order; the first matching pattern wins.
 * Numbered generations come before the generic "tomahawk" fallback, and the
 * `(?!\d)` guards stop a generation digit from matching the start of a longer
 * number (e.g. "Tomahawk 25.6T" must not be read as "Tomahawk 2").
 *
 * Tomahawk and Trident belong to Broadcom's StrataXGS family; Jericho belongs
 * to StrataDNX.
 */
const ASIC_PATTERNS: ReadonlyArray<{ pattern: RegExp; vendor: string; model: string; series: string }> = [
  { pattern: /tomahawk\s*5(?!\d)/i, vendor: "Broadcom", model: "Tomahawk 5", series: "StrataXGS" },
  { pattern: /tomahawk\s*4(?!\d)/i, vendor: "Broadcom", model: "Tomahawk 4", series: "StrataXGS" },
  { pattern: /tomahawk\s*3(?!\d)/i, vendor: "Broadcom", model: "Tomahawk 3", series: "StrataXGS" },
  { pattern: /tomahawk\s*2(?!\d)/i, vendor: "Broadcom", model: "Tomahawk 2", series: "StrataXGS" },
  { pattern: /tomahawk/i, vendor: "Broadcom", model: "Tomahawk", series: "StrataXGS" },
  { pattern: /trident\s*(?:3(?!\d)|iii)/i, vendor: "Broadcom", model: "Trident III", series: "StrataXGS" },
  { pattern: /jericho\s*2(?!\d)/i, vendor: "Broadcom", model: "Jericho2", series: "StrataDNX" },
  { pattern: /spectrum/i, vendor: "NVIDIA", model: "Spectrum", series: "Spectrum" },
];

/**
 * Guesses the switching ASIC mentioned in a block of text.
 *
 * @param text - Free-form text to search (case-insensitive).
 * @returns The vendor/model/series of the first table entry whose pattern
 *   matches. When nothing matches, falls back to vendor "Broadcom" with model
 *   "Unknown" and an empty series.
 */
function detectAsic(text: string): { vendor: string; model: string; series: string } {
  for (const { pattern, vendor, model, series } of ASIC_PATTERNS) {
    if (pattern.test(text)) {
      return { vendor, model, series };
    }
  }

  return { vendor: "Broadcom", model: "Unknown", series: "" };
}
|
|
|
|
/**
 * Crawls the UfiSpace catalog and stores every switch it can parse.
 *
 * Flow:
 *  1. Ensures the "UfiSpace" vendor row exists.
 *  2. Visits each listing page in `PRODUCT_URLS`, collects links that look like
 *     product pages and enqueues them with a "PRODUCT" label.
 *  3. For each labelled product page, extracts model, ports, ASIC and hardware
 *     details from the page text and passes them to `findOrCreateSwitch`.
 *
 * Pages without a title, a recognisable model number (e.g. S9600-32X) or any
 * parsable port group are skipped. Progress and a created/updated summary are
 * written to the console.
 */
export async function scrapeUfiSpace(): Promise<void> {
  console.log("\n=== UfiSpace Scraper ===\n");

  const vendorId = await ensureWhiteboxVendor("UfiSpace", BASE_URL, {
    isOdm: true,
    ocpMember: true,
    sonicContributor: true,
  });

  // Detail-page requests carry this label. Routing on the label instead of on
  // URL substrings keeps product pages that live under "/products/..." from
  // being mistaken for listing pages (which would skip parsing them entirely).
  const PRODUCT_LABEL = "PRODUCT";

  let created = 0;
  let updated = 0;

  const crawler = new CheerioCrawler({
    maxConcurrency: 2,
    maxRequestsPerMinute: 15,
    requestHandlerTimeoutSecs: 30,

    async requestHandler({ request, $, enqueueLinks }) {
      // Listing pages (the unlabelled start URLs) — enqueue individual products
      if (request.label !== PRODUCT_LABEL) {
        console.log(` Parsing: ${request.url}`);

        // Resolve hrefs against the page actually loaded so relative links
        // ("../product/x", "product/x") become valid absolute URLs.
        const pageUrl = request.loadedUrl ?? request.url;
        const productLinks = new Set<string>();

        $("a").each((_i, el) => {
          const href = $(el).attr("href") ?? "";
          const looksLikeProduct = /\/S9[0-9]+-/i.test(href) || /\/product\//i.test(href);
          if (!looksLikeProduct) return;

          try {
            productLinks.add(new URL(href, pageUrl).href);
          } catch {
            // Malformed href — nothing to crawl.
          }
        });

        console.log(` Found ${productLinks.size} product links`);
        if (productLinks.size > 0) {
          // One batched call instead of one awaited call per link.
          await enqueueLinks({ urls: [...productLinks], label: PRODUCT_LABEL });
        }
        return;
      }

      // Individual product page
      const pageText = $("body").text();
      const title = $("h1, .product-title").first().text().trim();
      if (!title) return;

      // Extract model name (S9600-32X, S9700-53DX, etc.); prefer the title and
      // fall back to the first occurrence anywhere in the page text.
      const modelPattern = /(S\d{4}-\d+[A-Z]*)/i;
      const modelMatch = title.match(modelPattern) ?? pageText.match(modelPattern);
      if (!modelMatch?.[1]) return;

      // The match is case-insensitive, so normalise the casing: this keeps the
      // lookup key stable and lets the series prefix below always be found.
      const model = modelMatch[1].toUpperCase();
      const portInfo = extractPortsFromSpec(pageText);
      const asicInfo = detectAsic(pageText);

      if (portInfo.totalPorts === 0) return;

      const powerMatch = pageText.match(/(?:max|maximum)\s*power[:\s]*(\d+)\s*W/i);
      const cpuMatch = pageText.match(/(Intel\s+(?:Xeon|Atom|Core)[^\n,;]+)/i);
      const ramMatch = pageText.match(/(\d+)\s*GB?\s*(?:DDR[34]|RAM|memory)/i);
      const storageMatch = pageText.match(/(\d+)\s*GB?\s*(?:SSD|eMMC|M\.2)/i);
      const switchCapMatch = pageText.match(/switching\s*capacity[:\s]*([\d.]+)\s*Tb/i);

      // Series is the "Sdddd" prefix of the model.
      const series = /^S\d{4}/.exec(model)?.[0] ?? "";

      // Determine category based on model/series
      let category: "DataCenter" | "Edge" | "SP" = "DataCenter";
      if (model.includes("9510") || pageText.toLowerCase().includes("cell site")) {
        category = "Edge";
      }

      // Looked up before the write purely to classify the result as created
      // vs. updated for the summary counters.
      const existing = await pool.query(
        `SELECT id FROM switches WHERE model = $1 AND vendor_id = $2`,
        [model, vendorId]
      );
      const isNew = existing.rows.length === 0;

      await findOrCreateSwitch({
        model,
        vendorId,
        series,
        category,
        layer: "L3",
        portsConfig: portInfo.portsConfig,
        totalPorts: portInfo.totalPorts,
        maxSpeedGbps: portInfo.maxSpeedGbps,
        switchingCapacityTbps: switchCapMatch?.[1] ? Number.parseFloat(switchCapMatch[1]) : undefined,
        asicVendor: asicInfo.vendor,
        asicModel: asicInfo.model,
        asicSeries: asicInfo.series,
        maxPowerW: powerMatch?.[1] ? Number.parseInt(powerMatch[1], 10) : undefined,
        cpu: cpuMatch?.[1] ? cpuMatch[1].trim() : undefined,
        ramGb: ramMatch?.[1] ? Number.parseInt(ramMatch[1], 10) : undefined,
        storageGb: storageMatch?.[1] ? Number.parseInt(storageMatch[1], 10) : undefined,
        sonicCompatible: true,
        isWhitebox: true,
        onieSupport: true,
        supportedNos: ["SONiC"],
        transceiverFormFactors: portInfo.formFactors,
        catalogUrl: request.url,
        tags: [
          "whitebox",
          "UfiSpace",
          `${portInfo.maxSpeedGbps}G`,
          asicInfo.model,
          ...(category === "Edge" ? ["cell-site", "DCSG"] : []),
        ],
        scrapeSource: "ufispace-catalog",
      });

      if (isNew) {
        created++;
        console.log(` + ${model} (${portInfo.maxSpeedGbps}G, ${asicInfo.vendor} ${asicInfo.model})`);
      } else {
        updated++;
      }
    },

    failedRequestHandler({ request }, error) {
      // Include the reason so failures can be diagnosed from the log alone.
      console.error(` ! Failed: ${request.url} (${error.message})`);
    },
  }, makeCrawleeConfig("ufispace"));

  await crawler.run(PRODUCT_URLS);
  console.log(`\n Created: ${created}, Updated: ${updated}\n`);
}
|