Rene Fichtmueller fd3476f5c4 fix(scraper): FiberMall URL schema + price parser + Flexoptix EUR comma bug
FiberMall:
- Correct /store-XXXXX-name.htm category URLs (was /c/xxx/ → HTTP 404)
- Parser: split on new_proList_mainListLi, price from data-price on
  currency_price span — fix 0.00 false-match from SKU variant items
- Also scrape SKU brand variant links from .sku_item divs
- Result: 3,410 prices now in DB (was 0)

Flexoptix:
- Fix extractPrice regex for EUR thousand-separator format
  (2,921.60 EUR was parsed as 2 EUR)
- Add OSFP224 / 1.6T search queries (4 new, form factor was missing)
- Fix O.138HG2.C.05 stale price 3009.60→2921.60 EUR

Schema: competitor_verified + competitor_verified_at columns
added via ALTER TABLE (were referenced in code but missing in DB)

CHANGELOG: added 6 entries for 2026-04-12
2026-04-12 04:26:35 +02:00

266 lines
10 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* FiberMall Scraper — Chinese compatible transceiver vendor
*
* fibermall.com — custom Vue.js/PHP shop, USD pricing.
* Large catalog: 1G-800G, SFP/SFP+/QSFP28/QSFP-DD/OSFP.
* Rate limited: 1 req/2sec.
*
* URL schema (discovered 2026-04-11):
* Category pages: /store-XXXXX-name.htm
* Product pages: /sale-XXXXXX-name.htm
* Pagination: /store-XXXXX-name.htm?page=N
* Product list: CSS class "new_proList_mainListLi"
*/
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
import { contentHash } from "../utils/hash";
// Site origin; category and product paths are appended to this to form absolute URLs.
const BASE = "https://www.fibermall.com";
// Request headers sent with every page fetch: a desktop Chrome User-Agent,
// HTML-oriented Accept values, and a Referer pointing at the site's own homepage.
const HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
Referer: "https://www.fibermall.com/",
};
// Upper bound on listing pages fetched per category (page 1 plus ?page=2..MAX_PAGES).
const MAX_PAGES = 30;
// Discovered via homepage navigation scrape 2026-04-11
// Format: /store-XXXXX-description.htm
//
// Each entry pairs a category listing path with the form factor and speed that
// are stamped onto every product parsed from that listing.
// NOTE(review): the mixed categories (200G QSFP56/QSFP-DD, 800G QSFP-DD/OSFP, and
// the DAC/AOC/ACC/AEC cables fixed at 10G) each get a single label — confirm that
// this coarse classification is intended.
const CATEGORIES = [
{ path: "/store-17147-sfp-transceivers.htm", formFactor: "SFP", speed: "1G", speedGbps: 1 },
{ path: "/store-17014-10g-sfp.htm", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
{ path: "/store-17012-25g-sfp28.htm", formFactor: "SFP28", speed: "25G", speedGbps: 25 },
{ path: "/store-16652-40g-qsfp.htm", formFactor: "QSFP+", speed: "40G", speedGbps: 40 },
{ path: "/store-16528-100g-qsfp28.htm", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
{ path: "/store-20654-200g-qsfp56-qsfp-dd.htm", formFactor: "QSFP56", speed: "200G", speedGbps: 200 },
{ path: "/store-20656-400g-qsfp-dd.htm", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
{ path: "/store-21972-800g-qsfp-dd-osfp.htm", formFactor: "OSFP", speed: "800G", speedGbps: 800 },
{ path: "/store-16527-dac-aoc-acc-aec-cables.htm", formFactor: "DAC", speed: "10G", speedGbps: 10 },
];
// One product (or SKU variant) parsed from a category listing page.
interface Product {
// Identifier derived from the listing name (the full link text for SKU variants).
partNumber: string;
// Display name taken from the link's title attribute or link text.
name: string;
// Absolute product page URL (BASE + /sale-... path); also used as the de-duplication key.
url: string;
// Listing price; left undefined when no price was found or it fell outside (0, 100000).
price?: number;
// Form factor, speed label and numeric speed copied from the category entry.
formFactor: string;
speed: string;
speedGbps: number;
// Reach detected from the name, as a label (e.g. "10km") and in meters.
reachLabel?: string;
reachMeters?: number;
// "SMF", "MMF", "Copper", or "" when nothing in the name indicated a medium.
fiberType?: string;
// Digits of the first "<number>nm" found in the name, or "" when absent.
wavelength?: string;
}
/**
 * Pause helper used to space out requests.
 *
 * @param ms Delay passed to `setTimeout`, in milliseconds.
 * @returns A promise that resolves (with no value) when the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Guess the optical reach of a product from free-form text such as its name.
 *
 * Rules are tried in order and the first match wins: explicit distances come
 * first (kilometres, then metres), followed by case-sensitive reach codes
 * (LR/ER/ZR/SR/DR/FR, optionally suffixed with "4") mapped to a nominal distance.
 *
 * @param text Text to inspect.
 * @returns The matched reach label and its length in meters, or `undefined`
 *          when no rule matches.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const rules: ReadonlyArray<{ re: RegExp; label: string; meters: number }> = [
    // Explicit distances (distance keywords are case-insensitive).
    { re: /\b120\s*km\b/i, label: "120km", meters: 120000 },
    { re: /\b80\s*km\b/i, label: "80km", meters: 80000 },
    { re: /\b40\s*km\b/i, label: "40km", meters: 40000 },
    { re: /\b20\s*km\b/i, label: "20km", meters: 20000 },
    { re: /\b10\s*km\b/i, label: "10km", meters: 10000 },
    { re: /\b2\s*km\b/i, label: "2km", meters: 2000 },
    { re: /\b550\s*m\b/i, label: "550m", meters: 550 },
    { re: /\b500\s*m\b/i, label: "500m", meters: 500 },
    { re: /\b300\s*m\b/i, label: "300m", meters: 300 },
    { re: /\b150\s*m\b/i, label: "150m", meters: 150 },
    { re: /\b100\s*m\b/i, label: "100m", meters: 100 },
    // Reach codes (upper-case only), used when no explicit distance is present.
    { re: /\bLR4\b/, label: "10km", meters: 10000 },
    { re: /\bLR\b/, label: "10km", meters: 10000 },
    { re: /\bER4?\b/, label: "40km", meters: 40000 },
    { re: /\bZR4?\b/, label: "80km", meters: 80000 },
    { re: /\bSR4?\b/, label: "300m", meters: 300 },
    { re: /\bDR4?\b/, label: "500m", meters: 500 },
    { re: /\bFR4?\b/, label: "2km", meters: 2000 },
  ];

  const hit = rules.find((rule) => rule.re.test(text));
  if (hit === undefined) {
    return undefined;
  }
  return { label: hit.label, meters: hit.meters };
}
/**
 * Classify the transmission medium of a product from free-form text.
 *
 * Checks are ordered SMF → MMF → Copper and the first match wins. The short
 * reach codes (LX/LR/ER/ZR and SX/SR) must not be adjacent to another letter,
 * so "LR4" and "-SR" count while the "er" in "transceiver" does not. The
 * boundary is expressed with lookaround assertions so a code at the very
 * start or end of the text (e.g. "SFP-10G-LR") is recognised as well.
 *
 * @param text Text to inspect (matching is case-insensitive).
 * @returns "SMF", "MMF", "Copper", or "" when nothing matches.
 */
function detectFiber(text: string): string {
  const singleMode = /single.?mode|smf|(?<![a-z])(?:lx|lr|er|zr)(?![a-z])|bidi|cwdm|dwdm/i;
  const multiMode = /multi.?mode|mmf|(?<![a-z])(?:sx|sr)(?![a-z])/i;
  const copper = /copper|dac|twinax|rj.?45|base-t/i;

  if (singleMode.test(text)) return "SMF";
  if (multiMode.test(text)) return "MMF";
  if (copper.test(text)) return "Copper";
  return "";
}
/**
 * Pull the first wavelength mentioned in the text.
 *
 * @param text Text to inspect; looks for 3–4 digits followed by "nm"
 *             (optional whitespace in between, case-insensitive).
 * @returns The digits only (no unit), or "" when no wavelength is present.
 */
function detectWavelength(text: string): string {
  const found = /(\d{3,4})\s*nm/i.exec(text);
  if (found === null) {
    return "";
  }
  return found[1];
}
/**
 * Extract products from the HTML of one FiberMall category listing page.
 *
 * Listing structure the parser relies on:
 *   <li class="new_proList_mainListLi">
 *     <a href="/sale-XXXXXX-name.htm" title="Full Name">...</a>
 *     <span class="currency_price" data-price="12.00">12.00</span>
 *   </li>
 * Each <li> is a product group; SKU variants live in .sku_item elements whose
 * links carry no `title`, so their link text is used as the name instead.
 *
 * @param html Raw page HTML.
 * @param cat  Category entry whose form factor / speed is stamped on every product.
 * @returns Products in document order, de-duplicated by URL within this page.
 */
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
  const results: Product[] = [];
  const knownUrls = new Set<string>();

  // Assembles a Product from a link plus the metadata detected from its name.
  const buildProduct = (
    partNumber: string,
    name: string,
    url: string,
    price: number | undefined,
  ): Product => {
    const reach = detectReach(name);
    return {
      partNumber,
      name,
      url,
      price,
      formFactor: cat.formFactor,
      speed: cat.speed,
      speedGbps: cat.speedGbps,
      reachLabel: reach?.label,
      reachMeters: reach?.meters,
      fiberType: detectFiber(name),
      wavelength: detectWavelength(name),
    };
  };

  // Collapse whitespace so the patterns below never have to span line breaks,
  // then cut the page into one chunk per product group.
  const cards = html
    .replace(/\s+/g, " ")
    .split(/(?=<li class="new_proList_mainListLi")/)
    .filter((chunk) => chunk.includes("new_proList_mainListLi"));

  for (const card of cards) {
    // Prefer the currency_price span; SKU items carry data-price="0.00", so the
    // fallback only accepts values that start with a non-zero digit.
    const priceMatch =
      card.match(/class="currency_price"[^>]*data-price="([\d.]+)"/i) ||
      card.match(/data-price="([1-9][\d]*\.?\d{0,2})"/);
    const rawPrice = priceMatch ? parseFloat(priceMatch[1]) : undefined;
    // Only prices strictly between 0 and 100000 are kept (NaN falls out here too).
    const groupPrice =
      rawPrice !== undefined && rawPrice > 0 && rawPrice < 100000 ? rawPrice : undefined;

    // Main product: the first /sale-... link that has a title attribute.
    const primary = card.match(/href="(\/sale-\d+[^"?#]*\.htm)"[^>]*title="([^"]{8,})"/i);
    if (primary) {
      const url = BASE + primary[1];
      const name = primary[2]
        .trim()
        .replace(/&amp;/g, "&")
        .replace(/&#\d+;/g, "")
        .replace(/\s+/g, " ");
      if (name.length >= 5 && !knownUrls.has(url)) {
        knownUrls.add(url);
        // Part number = text before "compatible" / "for X", capped at 80 chars.
        const partNumber =
          name.split(/\s+(?:compatible|for\s+[A-Z])/i)[0]?.trim().slice(0, 80) || name.slice(0, 60);
        results.push(buildProduct(partNumber, name, url, groupPrice));
      }
    }

    // SKU variants: links inside .sku_item, named by their link text and priced
    // with the group's price.
    const variantLinks = card.matchAll(
      /class="sku_item[^"]*"[^>]*>\s*<a href="(\/sale-\d+[^"?#]*\.htm)"[^>]*>([^<]{5,})<\/a>/gi,
    );
    for (const link of variantLinks) {
      const url = BASE + link[1];
      const name = link[2].trim().replace(/&amp;/g, "&");
      if (name.length < 4 || knownUrls.has(url)) {
        continue;
      }
      knownUrls.add(url);
      results.push(buildProduct(name.slice(0, 80), name, url, groupPrice));
    }
  }

  return results;
}
/**
 * Download a page as text using the shared request headers.
 *
 * @param url Absolute URL to fetch.
 * @returns The response body as a string.
 * @throws Error with the HTTP status when the response is not OK; the request
 *         is also aborted by a 30-second timeout signal.
 */
async function fetchPage(url: string): Promise<string> {
  const response = await fetch(url, {
    headers: HEADERS,
    signal: AbortSignal.timeout(30000),
  });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
/** Best-effort message for any thrown value; non-Error throws must not crash the catch handler. */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Fetch every listing page of one category (page 1, then ?page=2..MAX_PAGES,
 * waiting 2 s before each extra page) and return its products de-duplicated
 * by URL, keeping the first occurrence.
 *
 * Pagination stops at the first page that yields no products or fails to load.
 * A failure on page 1 is thrown to the caller; an empty page 1 returns [].
 */
async function collectCategoryProducts(cat: typeof CATEGORIES[number]): Promise<Product[]> {
  const firstPage = parseProductList(await fetchPage(BASE + cat.path), cat);
  if (firstPage.length === 0) {
    console.log(" No products found — skipping");
    return [];
  }
  console.log(` Found ${firstPage.length} products on page 1`);

  // Map keyed by URL keeps insertion order, giving O(n) first-wins de-duplication.
  const byUrl = new Map<string, Product>();
  const remember = (items: Product[]): void => {
    for (const item of items) {
      if (!byUrl.has(item.url)) byUrl.set(item.url, item);
    }
  };
  remember(firstPage);

  for (let page = 2; page <= MAX_PAGES; page++) {
    await sleep(2000);
    try {
      // FiberMall pagination: ?page=N
      const pageProds = parseProductList(await fetchPage(`${BASE}${cat.path}?page=${page}`), cat);
      if (pageProds.length === 0) break;
      remember(pageProds);
      console.log(` Page ${page}: ${pageProds.length} products`);
    } catch (err) {
      console.warn(` Page ${page} failed: ${describeError(err).slice(0, 60)}`);
      break;
    }
  }

  console.log(` Total unique: ${byUrl.size}`);
  return [...byUrl.values()];
}

/**
 * Scrape all configured FiberMall categories and persist the results.
 *
 * For each category the listing pages are fetched and parsed, every unique
 * product is registered via `findOrCreateScrapedTransceiver`, and products
 * with a positive price get a USD price observation. Failures are logged and
 * contained: a DB error skips that product, a category error skips that
 * category. A 2-second pause follows every category — including empty or
 * failed ones — so consecutive requests stay within the rate limit.
 *
 * @returns Resolves once all categories were processed; totals are logged.
 * @throws Only if `ensureVendor` fails before scraping starts.
 */
export async function scrapeFiberMall(): Promise<void> {
  console.log("=== FiberMall Scraper Starting ===\n");
  const vendorId = await ensureVendor(
    "FiberMall",
    "compatible",
    "https://www.fibermall.com",
    "https://www.fibermall.com/store-16528-100g-qsfp28.htm",
  );

  let totalProducts = 0;
  let priceUpdates = 0;

  for (const cat of CATEGORIES) {
    console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`);
    try {
      const products = await collectCategoryProducts(cat);
      for (const product of products) {
        try {
          const txId = await findOrCreateScrapedTransceiver({
            partNumber: product.partNumber,
            vendorId,
            formFactor: product.formFactor,
            speedGbps: product.speedGbps,
            speed: product.speed,
            reachMeters: product.reachMeters,
            reachLabel: product.reachLabel,
            fiberType: product.fiberType,
            wavelengths: product.wavelength,
            category: "DataCenter",
          });
          if (product.price !== undefined && product.price > 0) {
            const hash = contentHash({ price: product.price, part: product.partNumber });
            const updated = await upsertPriceObservation({
              transceiverId: txId,
              sourceVendorId: vendorId,
              price: product.price,
              currency: "USD",
              stockLevel: "in_stock",
              url: product.url,
              contentHash: hash,
            });
            if (updated) priceUpdates++;
          }
          totalProducts++;
        } catch (err) {
          console.warn(` DB error: ${describeError(err).slice(0, 80)}`);
        }
      }
    } catch (err) {
      console.error(` Category failed: ${describeError(err)}`);
    }
    // Always pause before the next category's first request.
    await sleep(2000);
  }

  console.log(`\n=== FiberMall Complete: ${totalProducts} products, ${priceUpdates} price updates ===`);
}
// CLI entry point: when this file is run directly, scrape once and close the
// DB pool. Any failure is logged as fatal, the pool is closed without waiting,
// and the process exits with status 1.
if (require.main === module) {
  void (async () => {
    try {
      await scrapeFiberMall();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  })();
}