Rene Fichtmueller c7d7456de9 fix: instance-level Crawlee storage isolation + eBay vendor type
- Add utils/crawlee-config.ts: makeCrawleeConfig(name) returns a
  Crawlee Configuration with isolated localDataDirectory per scraper.
  Uses storageClientOptions (not global CRAWLEE_STORAGE_DIR) so
  concurrent pg-boss workers in the same process don't race on
  the shared env var.

- Apply makeCrawleeConfig to all 6 Crawlee-based scrapers:
  optcore (PlaywrightCrawler), atgbics (PlaywrightCrawler),
  community-issues (CheerioCrawler + RequestQueue),
  edgecore (CheerioCrawler), ufispace (CheerioCrawler),
  market-intelligence (CheerioCrawler).

- scheduler.ts: add withIsolatedStorage for optcore and market-intel
  workers (was missing, caused storage-fs path bleed from fs scraper).

- ebay-enricher.ts: fix vendor type 'marketplace' -> 'reseller' to
  satisfy vendors_type_check constraint
  ['manufacturer','distributor','oem','reseller','compatible'].
2026-04-18 01:35:57 +02:00

200 lines
7.0 KiB
TypeScript

/**
* UfiSpace Product Catalog Scraper
*
* Scrapes switch product pages from ufispace.com for specs and compatibility.
* UfiSpace publishes clean, well-structured product pages.
*
* Source: https://www.ufispace.com/products/datacenter-switches
*/
import { CheerioCrawler } from "crawlee";
import { makeCrawleeConfig } from "../utils/crawlee-config";
import { pool, ensureWhiteboxVendor, findOrCreateSwitch } from "../utils/db";
/** Origin of the UfiSpace website; every seed URL below is built from it. */
const BASE_URL = "https://www.ufispace.com";

/** Seed pages handed to `crawler.run()`; product links are discovered from them. */
const PRODUCT_URLS = ["/products/datacenter-switches", "/networking-white-box"].map(
  (path) => `${BASE_URL}${path}`,
);
/**
 * Extracts port groups written as "<count> x <speed>G [form factor]" from
 * free-form specification text.
 *
 * @param specText Raw text to scan (the caller passes the whole page body).
 * @returns
 *  - `portsConfig`: port count per key. The key is `<speed>G_<FORM FACTOR>`,
 *    or `<speed>G_<speed>G` when no form factor follows the speed.
 *  - `totalPorts`: sum of all counted ports.
 *  - `maxSpeedGbps`: highest speed seen, 0 when nothing matched.
 *  - `formFactors`: distinct upper-cased form factors in order of appearance.
 */
function extractPortsFromSpec(specText: string): {
  portsConfig: Record<string, number>;
  totalPorts: number;
  maxSpeedGbps: number;
  formFactors: string[];
} {
  const portsConfig: Record<string, number> = {};
  const formFactors: string[] = [];
  let totalPorts = 0;
  let maxSpeedGbps = 0;

  // Groups: 1 = port count, 2 = speed, 3 = a standalone "B"/"b" right after
  // the "G" (only peeked at through the lookahead, never consumed),
  // 4 = optional transceiver form factor.
  const portPattern =
    /(\d+)\s*x\s*(\d+)\s*G(?:bE|b\/s)?(?=(B\b)?)\s*(QSFP-DD|QSFP28|QSFP\+|QSFP56|SFP28|SFP\+|SFP56|OSFP|CFP2)?/gi;

  let match: RegExpExecArray | null;
  while ((match = portPattern.exec(specText)) !== null) {
    // An upper-case "GB" is a byte quantity ("2 x 16GB DDR4", "1 x 128GB SSD"),
    // not a port group. The regex is case-insensitive, so the case is checked
    // here; lower-case "Gb" (gigabit) and "GBASE-…" are still counted.
    if (match[3] === "B") continue;

    const count = parseInt(match[1], 10);
    const speed = parseInt(match[2], 10);
    const formFactor = match[4]?.toUpperCase();

    const key = `${speed}G_${formFactor ?? `${speed}G`}`;
    portsConfig[key] = (portsConfig[key] || 0) + count;
    totalPorts += count;
    maxSpeedGbps = Math.max(maxSpeedGbps, speed);

    if (formFactor && !formFactors.includes(formFactor)) {
      formFactors.push(formFactor);
    }
  }

  return { portsConfig, totalPorts, maxSpeedGbps, formFactors };
}
/**
 * Identifies the switch ASIC mentioned in a block of text.
 *
 * Patterns are tried in order and the first hit wins, so specific generations
 * ("Tomahawk 4") are listed before the generic family name ("Tomahawk").
 *
 * @param text Free-form text to search (the caller passes the page body).
 * @returns Vendor, model and silicon family of the first matching pattern.
 *          When nothing matches: `{ vendor: "Broadcom", model: "Unknown", series: "" }`.
 */
function detectAsic(text: string): { vendor: string; model: string; series: string } {
  // Tomahawk and Trident belong to Broadcom's StrataXGS family;
  // only the Jericho line is StrataDNX.
  const asicPatterns: Array<{ pattern: RegExp; vendor: string; model: string; series: string }> = [
    { pattern: /tomahawk\s*5/i, vendor: "Broadcom", model: "Tomahawk 5", series: "StrataXGS" },
    { pattern: /tomahawk\s*4/i, vendor: "Broadcom", model: "Tomahawk 4", series: "StrataXGS" },
    { pattern: /tomahawk\s*3/i, vendor: "Broadcom", model: "Tomahawk 3", series: "StrataXGS" },
    { pattern: /tomahawk\s*2/i, vendor: "Broadcom", model: "Tomahawk 2", series: "StrataXGS" },
    { pattern: /tomahawk/i, vendor: "Broadcom", model: "Tomahawk", series: "StrataXGS" },
    { pattern: /trident\s*(3|iii)/i, vendor: "Broadcom", model: "Trident III", series: "StrataXGS" },
    { pattern: /jericho\s*2/i, vendor: "Broadcom", model: "Jericho2", series: "StrataDNX" },
    { pattern: /spectrum/i, vendor: "NVIDIA", model: "Spectrum", series: "Spectrum" },
  ];

  for (const { pattern, vendor, model, series } of asicPatterns) {
    if (pattern.test(text)) {
      return { vendor, model, series };
    }
  }

  // No known ASIC name found in the text.
  return { vendor: "Broadcom", model: "Unknown", series: "" };
}
/**
 * Crawls the UfiSpace catalog starting from `PRODUCT_URLS` and passes every
 * switch it can parse to `findOrCreateSwitch`.
 *
 * Flow per page:
 *  1. If the URL looks like a catalog page, harvest links that look like
 *     product pages and enqueue them.
 *  2. Unless the page is one of the seed URLs, try to parse it as a product
 *     page (title → model → ports). Pages without a title, a model number or
 *     any recognisable port group are skipped silently.
 *
 * The crawler uses the per-scraper Crawlee configuration returned by
 * `makeCrawleeConfig("ufispace")`. Created/updated counts are logged at the end.
 */
export async function scrapeUfiSpace(): Promise<void> {
  console.log("\n=== UfiSpace Scraper ===\n");

  const vendorId = await ensureWhiteboxVendor("UfiSpace", "https://www.ufispace.com", {
    isOdm: true,
    ocpMember: true,
    sonicContributor: true,
  });

  let created = 0;
  let updated = 0;

  const crawler = new CheerioCrawler(
    {
      maxConcurrency: 2,
      maxRequestsPerMinute: 15,
      requestHandlerTimeoutSecs: 30,

      async requestHandler({ request, $, enqueueLinks }) {
        const isSeedPage = PRODUCT_URLS.includes(request.url);
        const looksLikeListing =
          request.url.includes("products/") || request.url.includes("networking-white-box");

        // Catalog-style pages — enqueue individual products.
        if (looksLikeListing) {
          console.log(` Parsing: ${request.url}`);

          // Resolve hrefs against the page that was actually loaded so that
          // document-relative and protocol-relative links become valid URLs.
          const pageUrl = request.loadedUrl ?? request.url;
          const productLinks = new Set<string>();

          $("a").each((_i, el) => {
            const href = $(el).attr("href") || "";
            if (!/\/S9[0-9]+-/i.test(href) && !/\/product\//i.test(href)) return;
            try {
              productLinks.add(new URL(href, pageUrl).href);
            } catch {
              // Malformed href — nothing to enqueue.
            }
          });

          console.log(` Found ${productLinks.size} product links`);
          if (productLinks.size > 0) {
            await enqueueLinks({ urls: Array.from(productLinks) });
          }
        }

        // Seed URLs are pure listings. Every other page is parsed as a product,
        // including detail pages that live under ".../products/..." and would
        // otherwise be mistaken for a listing and never parsed.
        if (isSeedPage) return;

        // Individual product page
        const pageText = $("body").text();
        const title = $("h1, .product-title").first().text().trim();
        if (!title) return;

        // Extract model name (S9600-32X, S9700-53DX, etc.)
        const modelMatch =
          title.match(/(S\d{4}-\d+[A-Z]*)/i) || pageText.match(/(S\d{4}-\d+[A-Z]*)/i);
        if (!modelMatch) return;
        const model = modelMatch[1];

        const portInfo = extractPortsFromSpec(pageText);
        const asicInfo = detectAsic(pageText);
        if (portInfo.totalPorts === 0) return;

        const powerMatch = pageText.match(/(?:max|maximum)\s*power[:\s]*(\d+)\s*W/i);
        const cpuMatch = pageText.match(/(Intel\s+(?:Xeon|Atom|Core)[^\n,;]+)/i);
        const ramMatch = pageText.match(/(\d+)\s*GB?\s*(?:DDR[34]|RAM|memory)/i);
        const storageMatch = pageText.match(/(\d+)\s*GB?\s*(?:SSD|eMMC|M\.2)/i);
        const switchCapMatch = pageText.match(/switching\s*capacity[:\s]*([\d.]+)\s*Tb/i);

        const seriesMatch = model.match(/^(S\d{4})/);
        const series = seriesMatch ? seriesMatch[1] : "";

        // Determine category based on model/series
        let category: "DataCenter" | "Edge" | "SP" = "DataCenter";
        if (model.includes("9510") || pageText.toLowerCase().includes("cell site")) {
          category = "Edge";
        }

        // Looked up before the write so the run can report created vs. updated.
        const existing = await pool.query(
          `SELECT id FROM switches WHERE model = $1 AND vendor_id = $2`,
          [model, vendorId]
        );
        const isNew = existing.rows.length === 0;

        await findOrCreateSwitch({
          model,
          vendorId,
          series,
          category,
          layer: "L3",
          portsConfig: portInfo.portsConfig,
          totalPorts: portInfo.totalPorts,
          maxSpeedGbps: portInfo.maxSpeedGbps,
          switchingCapacityTbps: switchCapMatch ? parseFloat(switchCapMatch[1]) : undefined,
          asicVendor: asicInfo.vendor,
          asicModel: asicInfo.model,
          asicSeries: asicInfo.series,
          maxPowerW: powerMatch ? parseInt(powerMatch[1], 10) : undefined,
          cpu: cpuMatch ? cpuMatch[1].trim() : undefined,
          ramGb: ramMatch ? parseInt(ramMatch[1], 10) : undefined,
          storageGb: storageMatch ? parseInt(storageMatch[1], 10) : undefined,
          sonicCompatible: true,
          isWhitebox: true,
          onieSupport: true,
          supportedNos: ["SONiC"],
          transceiverFormFactors: portInfo.formFactors,
          catalogUrl: request.url,
          tags: [
            "whitebox",
            "UfiSpace",
            `${portInfo.maxSpeedGbps}G`,
            asicInfo.model,
            ...(category === "Edge" ? ["cell-site", "DCSG"] : []),
          ],
          scrapeSource: "ufispace-catalog",
        });

        if (isNew) {
          created++;
          console.log(` + ${model} (${portInfo.maxSpeedGbps}G, ${asicInfo.vendor} ${asicInfo.model})`);
        } else {
          updated++;
        }
      },

      failedRequestHandler({ request }) {
        console.error(` ! Failed: ${request.url}`);
      },
    },
    makeCrawleeConfig("ufispace")
  );

  await crawler.run(PRODUCT_URLS);

  console.log(`\n Created: ${created}, Updated: ${updated}\n`);
}