Rene Fichtmueller 25f1579d29 feat(scraper): add FiberMall/Vcelink/OpticsBay scrapers, fix QSFPTEK API migration
- New scrapers: fibermall.ts (WooCommerce), vcelink.ts (Shopify), opticsbay.ts (WooCommerce)
- QSFPTEK rewritten to use /mall/commodity/list API (old OpenCart /c/*.html paths gone 404)
  - New: attribute-based filtering by data rate (1G/10G/25G/40G/100G/200G/400G/800G)
  - Scrapes HTML fragments, extracts US$ prices and product URLs
- scheduler.ts: +3 queues/schedules/workers (fibermall, vcelink, opticsbay) → 61 total workers
- index-pi.ts: Pi fleet picks up all 3 new scrapers
2026-04-11 19:13:36 +02:00

276 lines
10 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* FiberMall Scraper — Chinese compatible transceiver vendor
*
* fibermall.com — custom PHP/REST shop, USD pricing.
* Large catalog: 1G-400G, SFP/SFP+/QSFP28/QSFP-DD/OSFP.
* Pagination via ?page=N. Rate limited: 1 req/2sec.
*
* FiberMall (Shenzhen FiberMall Technology Co.) offers DAC/AOC + optics,
* transparent USD prices, no login required.
*/
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
import { contentHash } from "../utils/hash";
/** Site origin; category paths and relative product links are appended to it. */
const BASE = "https://www.fibermall.com";
/** Request headers sent with every page fetch: a desktop Chrome user agent, HTML-first Accept, English locale. */
const HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
};
/** Highest `?page=N` value requested per category (the paging loop runs from page 2 up to this number). */
const MAX_PAGES = 30;
/**
 * Category listing pages to crawl. Each entry's formFactor/speed/speedGbps is
 * stamped onto every product parsed from that listing; it is not derived from
 * the product name.
 *
 * NOTE(review): the DAC, AOC and generic "/c/optical-transceivers/" entries all
 * carry a fixed 10G speed — presumably placeholders; confirm against the data model.
 * The generic entry is listed last; scrapeFiberMall skips it once more than
 * three other categories have yielded products.
 */
const CATEGORIES = [
{ path: "/c/1g-sfp-transceiver/", formFactor: "SFP", speed: "1G", speedGbps: 1 },
{ path: "/c/10g-sfp-transceiver/", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
{ path: "/c/25g-sfp28-transceiver/", formFactor: "SFP28",speed: "25G", speedGbps: 25 },
{ path: "/c/40g-qsfp-transceiver/", formFactor: "QSFP+", speed: "40G", speedGbps: 40 },
{ path: "/c/100g-qsfp28-transceiver/", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
{ path: "/c/200g-qsfp56-transceiver/", formFactor: "QSFP56", speed: "200G", speedGbps: 200 },
{ path: "/c/400g-qsfp-dd-transceiver/", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
{ path: "/c/800g-osfp-transceiver/", formFactor: "OSFP", speed: "800G", speedGbps: 800 },
{ path: "/c/dac-cable/", formFactor: "DAC", speed: "10G", speedGbps: 10 },
{ path: "/c/aoc-cable/", formFactor: "AOC", speed: "10G", speedGbps: 10 },
{ path: "/c/optical-transceivers/", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
];
/** One product parsed from a category listing page. */
interface Product {
// Identifier derived from the listing name (not a separately scraped SKU field).
partNumber: string;
// Display name as found in the listing markup, trimmed and partially entity-decoded.
name: string;
// Absolute product URL; used as the de-duplication key.
url: string;
// Listed price; left undefined when no "$" amount was found or it fell outside (0, 100000).
price?: number;
// Copied from the category entry the product was found under.
formFactor: string;
// Copied from the category entry (e.g. "10G").
speed: string;
// Copied from the category entry (numeric Gbps).
speedGbps: number;
// Reach label from detectReach (e.g. "10km"); undefined when nothing matched.
reachLabel?: string;
// Reach in metres from detectReach; undefined when nothing matched.
reachMeters?: number;
// "SMF", "MMF", "Copper", or "" when detectFiber found no hint.
fiberType?: string;
// First 3-4 digit number followed by "nm" in the name, as a string; "" when none.
wavelength?: string;
}
/**
 * Returns a promise that settles after roughly `ms` milliseconds.
 * Used to space out requests between page fetches.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Guesses a product's reach from free text such as a listing name.
 *
 * Rules are evaluated top to bottom and the first hit wins. Explicit
 * distances ("10km", "300m") are case-insensitive and come first; the
 * reach-code fallbacks (LR, SR4, ...) are case-sensitive.
 *
 * @param text Text to inspect.
 * @returns The matched label and its length in metres, or undefined when no rule matches.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const rules: { re: RegExp; label: string; meters: number }[] = [
    // Explicit distances, longest first.
    { re: /\b120\s*km\b/i, label: "120km", meters: 120000 },
    { re: /\b80\s*km\b/i, label: "80km", meters: 80000 },
    { re: /\b40\s*km\b/i, label: "40km", meters: 40000 },
    { re: /\b20\s*km\b/i, label: "20km", meters: 20000 },
    { re: /\b10\s*km\b/i, label: "10km", meters: 10000 },
    { re: /\b2\s*km\b/i, label: "2km", meters: 2000 },
    { re: /\b550\s*m\b/i, label: "550m", meters: 550 },
    { re: /\b500\s*m\b/i, label: "500m", meters: 500 },
    { re: /\b300\s*m\b/i, label: "300m", meters: 300 },
    { re: /\b150\s*m\b/i, label: "150m", meters: 150 },
    { re: /\b100\s*m\b/i, label: "100m", meters: 100 },
    // Reach codes, consulted only when no explicit distance is present.
    { re: /\bLR4\b/, label: "10km", meters: 10000 },
    { re: /\bLR\b/, label: "10km", meters: 10000 },
    { re: /\bER4?\b/, label: "40km", meters: 40000 },
    { re: /\bZR4?\b/, label: "80km", meters: 80000 },
    { re: /\bSR4?\b/, label: "300m", meters: 300 },
    { re: /\bDR4?\b/, label: "500m", meters: 500 },
    { re: /\bFR4?\b/, label: "2km", meters: 2000 },
  ];
  const hit = rules.find((rule) => rule.re.test(text));
  return hit ? { label: hit.label, meters: hit.meters } : undefined;
}
/**
 * Classifies the transmission medium from free text such as a listing name.
 *
 * Checks run in priority order: single-mode hints, then multi-mode hints,
 * then copper hints. All matching is case-insensitive.
 *
 * The two-letter reach codes (LX/LR/ER/ZR, SX/SR) are matched with
 * lookarounds so they count when they are not embedded in a longer word,
 * including at the very start or end of the text (e.g. "SFP-10G-LR").
 * Requiring a real non-letter character on both sides would miss those.
 *
 * @param text Text to inspect.
 * @returns "SMF", "MMF", "Copper", or "" when no hint is found.
 */
function detectFiber(text: string): string {
  if (/single.?mode|smf|(?<![a-z])(?:lx|lr|er|zr)(?![a-z])|bidi|cwdm|dwdm/i.test(text)) return "SMF";
  if (/multi.?mode|mmf|(?<![a-z])(?:sx|sr)(?![a-z])/i.test(text)) return "MMF";
  if (/copper|dac|twinax|rj.?45|base-t/i.test(text)) return "Copper";
  return "";
}
/**
 * Extracts the first wavelength mentioned in the text.
 *
 * A wavelength is a run of 3 or 4 digits followed (optionally after
 * whitespace) by "nm", case-insensitive.
 *
 * @param text Text to inspect.
 * @returns The digits as a string (e.g. "1310"), or "" when none is present.
 */
function detectWavelength(text: string): string {
  const found = /(\d{3,4})\s*nm/i.exec(text);
  if (found === null) {
    return "";
  }
  return found[1];
}
/**
 * Pulls the first "$"-prefixed amount out of an HTML fragment.
 * Returns undefined when no amount is present, it does not parse to a
 * number, or it falls outside the open range (0, 100000).
 */
function extractListedPrice(fragment: string): number | undefined {
  const m = fragment.match(/\$\s*([\d,]+\.?\d*)/);
  if (!m) return undefined;
  const value = parseFloat(m[1].replace(/,/g, ""));
  return value > 0 && value < 100000 ? value : undefined;
}

/**
 * Assembles a Product from a listing name and URL, copying form factor and
 * speed from the category and deriving reach/fiber/wavelength from the name.
 */
function buildListedProduct(
  partNumber: string,
  name: string,
  url: string,
  price: number | undefined,
  cat: typeof CATEGORIES[number],
): Product {
  const reach = detectReach(name);
  return {
    partNumber, name, url, price,
    formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps,
    reachLabel: reach?.label, reachMeters: reach?.meters,
    fiberType: detectFiber(name), wavelength: detectWavelength(name),
  };
}

/**
 * Extracts products from one category listing page using regex heuristics.
 *
 * Strategy 1 looks for product-card containers and reads URL, name and price
 * from each card. Strategy 2 runs only when strategy 1 yields nothing: it
 * scans every fibermall.com link whose text looks like a transceiver/cable
 * name and takes the first "$" amount found near the link's first occurrence.
 *
 * Products are de-duplicated by URL within the page.
 *
 * @param html Raw HTML of the listing page.
 * @param cat  Category entry whose form factor and speed are applied to every product.
 * @returns Products found on the page (possibly empty).
 */
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
  const products: Product[] = [];
  const seen = new Set<string>();
  // Collapse whitespace so the patterns below do not need to deal with newlines.
  const collapsed = html.replace(/\s+/g, " ");

  // Strategy 1: fibermall product card (class="product-item" or similar).
  // The lazy body match stops at the first closing tag, so a card with nested
  // containers is only partially captured.
  for (const m of collapsed.matchAll(/<(?:li|div|article)[^>]+class="[^"]*(?:product-item|product-thumb|goods-item|pro-item)[^"]*"[^>]*>([\s\S]*?)<\/(?:li|div|article)>/gi)) {
    const card = m[1];
    const urlMatch = card.match(/href="(https?:\/\/(?:www\.)?fibermall\.com\/[^"?#]+)"/i) ||
      card.match(/href="(\/[a-z0-9][^"?#]{5,})"/i);
    if (!urlMatch) continue;
    const url = urlMatch[1].startsWith("http") ? urlMatch[1] : BASE + urlMatch[1];
    // NOTE(review): every URL reaching this point contains "fibermall.com"
    // (absolute match or BASE prefix), so the pattern test never rejects
    // anything — confirm whether a product-path-only filter was intended.
    if (seen.has(url) || !/fibermall\.com|\/product|\/p\//i.test(url)) continue;

    const nameMatch = card.match(/<(?:h[23456]|p)[^>]+class="[^"]*(?:name|title)[^"]*"[^>]*>([^<]{8,})<\//i) ||
      card.match(/title="([^"]{10,})"/i) ||
      card.match(/<a[^>]*>([^<]{10,})<\/a>/i);
    if (!nameMatch) continue;
    const name = nameMatch[1].trim().replace(/&amp;/g, "&").replace(/&#\d+;/g, "");
    if (name.length < 5) continue;

    // Mark the URL as seen only once the card has produced a usable product;
    // otherwise a nameless card (e.g. an image-only link) would block a later
    // valid card for the same product.
    seen.add(url);

    // Part number: text before "compatible" / "for <Vendor>", capped at 80 chars.
    const partNumber = name.split(/\s+(?:compatible|for\s+[A-Z])/i)[0]?.trim().slice(0, 80) || name.slice(0, 60);
    products.push(buildListedProduct(partNumber, name, url, extractListedPrice(card), cat));
  }

  // Strategy 2: generic product link scan, used only when no card matched.
  if (products.length === 0) {
    for (const m of collapsed.matchAll(/href="(https?:\/\/(?:www\.)?fibermall\.com\/[^"?#]{10,})"[^>]*>([^<]{10,})</gi)) {
      const url = m[1];
      const name = m[2].trim().replace(/&amp;/g, "&");
      if (seen.has(url) || name.length < 8) continue;
      if (!/transceiver|sfp|qsfp|osfp|dac|aoc|optic/i.test(name)) continue;
      seen.add(url);
      // Look for a price in a window around the first occurrence of the URL.
      const idx = collapsed.indexOf(url);
      const ctx = collapsed.slice(Math.max(0, idx - 300), idx + 600);
      // Here the part number is simply the first whitespace-delimited token of the name.
      const partNumber = name.split(/\s+/)[0]?.slice(0, 80) || "";
      products.push(buildListedProduct(partNumber, name, url, extractListedPrice(ctx), cat));
    }
  }
  return products;
}
/**
 * Downloads a page as text using the shared request headers.
 *
 * The request is aborted after 30 seconds.
 *
 * @param url Absolute URL to fetch.
 * @returns The response body as text.
 * @throws Error with the HTTP status and URL when the response is not OK;
 *         fetch/abort errors propagate unchanged.
 */
async function fetchPage(url: string): Promise<string> {
  const timeoutSignal = AbortSignal.timeout(30000);
  const response = await fetch(url, { headers: HEADERS, signal: timeoutSignal });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
/** Returns a printable message for any thrown value, not only Error instances. */
function describeScrapeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Fetches every listing page of one category and returns its products,
 * de-duplicated by URL (first occurrence wins).
 *
 * A 2-second pause precedes every request after the first, in line with the
 * 1 request / 2 seconds budget. Paging stops at MAX_PAGES, on the first
 * empty page, on the first page that contributes no new URL, or on the
 * first failed page.
 *
 * @throws When the initial category page cannot be fetched.
 */
async function collectCategoryProducts(cat: typeof CATEGORIES[number]): Promise<Product[]> {
  const firstPage = parseProductList(await fetchPage(BASE + cat.path), cat);
  if (firstPage.length === 0) {
    console.log(" No products on page 1 — trying alternate pagination");
    await sleep(2000);
    try {
      // Try ?page=1 format
      firstPage.push(...parseProductList(await fetchPage(BASE + cat.path + "?page=1"), cat));
    } catch (err) {
      // Best effort: a failure here only means the category is reported as empty.
      console.warn(` Alternate page 1 failed: ${describeScrapeError(err).slice(0, 60)}`);
    }
  }
  if (firstPage.length === 0) return [];
  console.log(` Found ${firstPage.length} products on page 1`);

  const byUrl = new Map<string, Product>();
  for (const product of firstPage) {
    if (!byUrl.has(product.url)) byUrl.set(product.url, product);
  }

  for (let page = 2; page <= MAX_PAGES; page++) {
    await sleep(2000);
    try {
      const pageUrl = `${BASE}${cat.path}?page=${page}`;
      const pageProds = parseProductList(await fetchPage(pageUrl), cat);
      if (pageProds.length === 0) break;
      const sizeBefore = byUrl.size;
      for (const product of pageProds) {
        if (!byUrl.has(product.url)) byUrl.set(product.url, product);
      }
      console.log(` Page ${page}: ${pageProds.length} products`);
      // A page made up solely of already-seen URLs means the site is repeating
      // a page for out-of-range page numbers; further requests are pointless.
      if (byUrl.size === sizeBefore) {
        console.log(` Page ${page}: no new products — stopping pagination`);
        break;
      }
    } catch (err) {
      console.warn(` Page ${page} failed: ${describeScrapeError(err).slice(0, 60)}`);
      break;
    }
  }
  return [...byUrl.values()];
}

/**
 * Crawls every FiberMall category, stores each product via
 * findOrCreateScrapedTransceiver and records a USD price observation for
 * products that have a price.
 *
 * Failures are contained: a failing category or a failing product write is
 * logged and the run continues. The generic "/optical-transceivers/" listing
 * is skipped, without issuing a request, once more than three other
 * categories have yielded products. A 2-second pause follows every category
 * that was actually requested, whether it succeeded, failed or came back empty.
 *
 * @returns Resolves when all categories have been processed.
 * @throws When the vendor row cannot be ensured (ensureVendor rejects).
 */
export async function scrapeFiberMall(): Promise<void> {
  console.log("=== FiberMall Scraper Starting ===\n");
  const vendorId = await ensureVendor(
    "FiberMall",
    "compatible",
    "https://www.fibermall.com",
    "https://www.fibermall.com/c/optical-transceivers/",
  );
  let totalProducts = 0;
  let priceUpdates = 0;
  const seenCategories = new Set<string>();

  for (const cat of CATEGORIES) {
    console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`);

    // Skip generic fallback if specific categories already scraped.
    // Decided before any request is made so no fetch is wasted.
    if (cat.path.includes("/optical-transceivers/") && seenCategories.size > 3) {
      console.log(` Skipping generic fallback (${seenCategories.size} categories scraped)`);
      continue;
    }

    try {
      const uniqueProducts = await collectCategoryProducts(cat);
      if (uniqueProducts.length === 0) {
        console.log(" No products found — skipping");
      } else {
        seenCategories.add(cat.path);
        console.log(` Total unique: ${uniqueProducts.length}`);
        for (const product of uniqueProducts) {
          try {
            const txId = await findOrCreateScrapedTransceiver({
              partNumber: product.partNumber,
              vendorId,
              formFactor: product.formFactor,
              speedGbps: product.speedGbps,
              speed: product.speed,
              reachMeters: product.reachMeters,
              reachLabel: product.reachLabel,
              fiberType: product.fiberType,
              wavelengths: product.wavelength,
              category: "DataCenter",
            });
            if (product.price && product.price > 0) {
              const hash = contentHash({ price: product.price, part: product.partNumber });
              const updated = await upsertPriceObservation({
                transceiverId: txId,
                sourceVendorId: vendorId,
                price: product.price,
                currency: "USD",
                // The listing parser does not read availability; every priced product is recorded as in stock.
                stockLevel: "in_stock",
                url: product.url,
                contentHash: hash,
              });
              if (updated) priceUpdates++;
            }
            totalProducts++;
          } catch (err) {
            console.warn(` DB error: ${describeScrapeError(err).slice(0, 80)}`);
          }
        }
      }
    } catch (err) {
      console.error(` Category failed: ${describeScrapeError(err)}`);
    }

    // No `continue` sits between the request and this pause, so the gap
    // between categories is always honoured.
    await sleep(2000);
  }
  console.log(`\n=== FiberMall Complete: ${totalProducts} products, ${priceUpdates} price updates ===`);
}
// Standalone entry point: run the scraper when this file is executed directly
// rather than imported, then close the DB pool.
if (require.main === module) {
  const runStandalone = async (): Promise<void> => {
    try {
      await scrapeFiberMall();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      // pool.end() is started but not waited for; the process exits right away with status 1.
      pool.end();
      process.exit(1);
    }
  };
  void runStandalone();
}