Rene Fichtmueller a8529d166b fix: resolve TS build errors — export backfillImages, add writeRobotExperience
- backfill-images.ts: rename main() → export backfillImages() to match index.ts import
- training-data-writer.ts: add writeRobotExperience export; remove hardcoded Gitea token
- fiber24.ts/fibermall.ts: scraper improvements from previous sessions
- image-downloader.ts/spec-updater.ts: utility updates
- robots/: add verification robots module
2026-05-06 23:39:00 +02:00

282 lines
11 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* FiberMall Scraper — Chinese compatible transceiver vendor
*
* fibermall.com — custom Vue.js/PHP shop, USD pricing.
* Large catalog: 1G-800G, SFP/SFP+/QSFP28/QSFP-DD/OSFP.
* Rate limited: 1 req/2sec.
*
* URL schema (discovered 2026-04-11):
* Category pages: /store-XXXXX-name.htm
* Product pages: /sale-XXXXXX-name.htm
* Pagination: /store-XXXXX-name.htm?page=N
* Product list: CSS class "new_proList_mainListLi"
*/
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, markImageVerified } from "../utils/db";
import { contentHash } from "../utils/hash";
/** Origin of the FiberMall storefront; category and product paths are appended to it. */
const BASE = "https://www.fibermall.com";

/** Upper bound on the number of listing pages requested for a single category. */
const MAX_PAGES = 30;

/** Browser-like request headers sent with every page fetch. */
const HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " +
    "AppleWebKit/537.36 (KHTML, like Gecko) " +
    "Chrome/123.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.9",
  // The shop's own landing page is used as the referer.
  Referer: `${BASE}/`,
};
/**
 * Category listing pages to crawl, with the form factor and speed that are
 * stamped onto every product found under each one.
 *
 * Discovered via homepage navigation scrape 2026-04-11.
 * Path format: /store-XXXXX-description.htm
 */
const CATEGORIES = ((): Array<{ path: string; formFactor: string; speed: string; speedGbps: number }> => {
  // Columns: path, formFactor, speed, speedGbps
  const rows: Array<[string, string, string, number]> = [
    ["/store-17147-sfp-transceivers.htm", "SFP", "1G", 1],
    ["/store-17014-10g-sfp.htm", "SFP+", "10G", 10],
    ["/store-17012-25g-sfp28.htm", "SFP28", "25G", 25],
    ["/store-16652-40g-qsfp.htm", "QSFP+", "40G", 40],
    ["/store-16528-100g-qsfp28.htm", "QSFP28", "100G", 100],
    ["/store-20654-200g-qsfp56-qsfp-dd.htm", "QSFP56", "200G", 200],
    ["/store-20656-400g-qsfp-dd.htm", "QSFP-DD", "400G", 400],
    ["/store-21972-800g-qsfp-dd-osfp.htm", "OSFP", "800G", 800],
    ["/store-16527-dac-aoc-acc-aec-cables.htm", "DAC", "10G", 10],
  ];
  return rows.map(([path, formFactor, speed, speedGbps]) => ({ path, formFactor, speed, speedGbps }));
})();
/** One product (or SKU variant) extracted from a FiberMall category listing page. */
interface Product {
  /** Identifier derived from the listing name by parseProductList. */
  partNumber: string;
  /** Display name taken from the link title (main product) or link text (SKU variant). */
  name: string;
  /** Absolute product page URL. */
  url: string;
  /** Form factor copied from the category entry the product was found under. */
  formFactor: string;
  /** Speed label copied from the category entry, e.g. "100G". */
  speed: string;
  /** Numeric speed in Gbps copied from the category entry. */
  speedGbps: number;

  /** Listing price; left undefined when no usable price was parsed. Stored as USD by the scraper. */
  price?: number;
  /** Product photo URL, when one was found in the listing card. */
  imageUrl?: string;
  /** Reach label from detectReach, e.g. "10km". */
  reachLabel?: string;
  /** Reach in meters from detectReach. */
  reachMeters?: number;
  /** Result of detectFiber: "SMF", "MMF", "Copper", or "" when unknown. */
  fiberType?: string;
  /** Result of detectWavelength: the digits preceding "nm", or "" when absent. */
  wavelength?: string;
}
/**
 * Returns a promise that resolves once a timer of `ms` milliseconds fires.
 * Used to space out requests to the shop.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Guesses the optical reach from free text such as a product name.
 *
 * Explicit distances ("10km", "300 m") are checked first, case-insensitively;
 * if none is present, standard suffix codes (LR, ER, ZR, SR, DR, FR and their
 * "4" variants, upper-case only) are mapped to a nominal reach.
 *
 * @param text Text to inspect.
 * @returns The label and distance in meters of the first matching rule, or
 *          undefined when no rule matches.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  // Rules are evaluated top to bottom; the first hit wins.
  const rules: Array<{ re: RegExp; label: string; meters: number }> = [
    // Explicit distances.
    { re: /\b120\s*km\b/i, label: "120km", meters: 120000 },
    { re: /\b80\s*km\b/i, label: "80km", meters: 80000 },
    { re: /\b40\s*km\b/i, label: "40km", meters: 40000 },
    { re: /\b20\s*km\b/i, label: "20km", meters: 20000 },
    { re: /\b10\s*km\b/i, label: "10km", meters: 10000 },
    { re: /\b2\s*km\b/i, label: "2km", meters: 2000 },
    { re: /\b550\s*m\b/i, label: "550m", meters: 550 },
    { re: /\b500\s*m\b/i, label: "500m", meters: 500 },
    { re: /\b300\s*m\b/i, label: "300m", meters: 300 },
    { re: /\b150\s*m\b/i, label: "150m", meters: 150 },
    { re: /\b100\s*m\b/i, label: "100m", meters: 100 },
    // Suffix codes, used only when no explicit distance was found.
    { re: /\bLR4\b/, label: "10km", meters: 10000 },
    { re: /\bLR\b/, label: "10km", meters: 10000 },
    { re: /\bER4?\b/, label: "40km", meters: 40000 },
    { re: /\bZR4?\b/, label: "80km", meters: 80000 },
    { re: /\bSR4?\b/, label: "300m", meters: 300 },
    { re: /\bDR4?\b/, label: "500m", meters: 500 },
    { re: /\bFR4?\b/, label: "2km", meters: 2000 },
  ];

  const hit = rules.find((rule) => rule.re.test(text));
  if (hit === undefined) {
    return undefined;
  }
  return { label: hit.label, meters: hit.meters };
}
/**
 * Classifies the transmission medium from free text such as a product name.
 *
 * Checks run in order: single-mode indicators, then multi-mode indicators,
 * then copper indicators. All matching is case-insensitive.
 *
 * The short codes (LX/LR/ER/ZR and SX/SR) must not be adjacent to another
 * letter, so "Transceiver" does not count as "ER". Lookarounds are used for
 * that boundary so a code at the very start or end of the text still matches
 * (e.g. "SFP-10G-LR"); a consuming `[^a-z]` on each side would require an
 * extra character there and miss those names.
 *
 * @param text Text to inspect.
 * @returns "SMF", "MMF", "Copper", or "" when nothing matches.
 */
function detectFiber(text: string): string {
  const singleMode = /single.?mode|smf|(?<![a-z])(?:lx|lr|er|zr)(?![a-z])|bidi|cwdm|dwdm/i;
  const multiMode = /multi.?mode|mmf|(?<![a-z])(?:sx|sr)(?![a-z])/i;
  const copper = /copper|dac|twinax|rj.?45|base-t/i;

  if (singleMode.test(text)) return "SMF";
  if (multiMode.test(text)) return "MMF";
  if (copper.test(text)) return "Copper";
  return "";
}
/**
 * Extracts the first wavelength mentioned in the text.
 *
 * @param text Text to inspect.
 * @returns The 3-4 digits that precede "nm" (case-insensitive, optional
 *          whitespace in between), or "" when no such pattern is present.
 */
function detectWavelength(text: string): string {
  const found = /(\d{3,4})\s*nm/i.exec(text);
  if (found === null) {
    return "";
  }
  return found[1];
}
/**
 * Extracts products from the HTML of one FiberMall category listing page.
 *
 * FiberMall HTML structure (SSR, confirmed 2026-04-11):
 *   <li class="new_proList_mainListLi">
 *     <a href="/sale-XXXXXX-name.htm" title="Full Name">...</a>
 *     <span class="currency_price" data-price="12.00">12.00</span>
 *   </li>
 *
 * Each <li> is a product GROUP with SKU variants inside .sku_item divs. The
 * main product link carries a `title` attribute; SKU variant links do not, so
 * their link text is used as the name. Variants inherit the price and image
 * parsed for their group.
 *
 * @param html Raw page HTML.
 * @param cat  Category entry whose formFactor/speed/speedGbps are copied onto
 *             every product.
 * @returns Products in document order, each URL at most once per call.
 */
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
  const cardMarker = "new_proList_mainListLi";
  // Must target the currency_price span specifically — SKU items have data-price="0.00".
  const listingPriceRe = /class="currency_price"[^>]*data-price="([\d.]+)"/i;
  const fallbackPriceRe = /data-price="([1-9][\d]*\.?\d{0,2})"/; // skip 0.00
  // First product photo referenced via src or data-src.
  const photoRe = /(?:src|data-src)="(https:\/\/www\.fibermall\.com\/photo\/[^"]+\.(?:jpg|png|webp))"/i;
  // Main product link: /sale-... href followed by a title attribute.
  const titledLinkRe = /href="(\/sale-\d+[^"?#]*\.htm)"[^>]*title="([^"]{8,})"/i;
  // SKU variant link inside a .sku_item element (no title attribute).
  const skuLinkRe = /class="sku_item[^"]*"[^>]*>\s*<a href="(\/sale-\d+[^"?#]*\.htm)"[^>]*>([^<]{5,})<\/a>/gi;

  const results: Product[] = [];
  const knownUrls = new Set<string>();

  // Collapse whitespace, cut the page in front of every product <li>, and drop
  // any chunk (such as the page preamble) that does not contain a card.
  const cards = html
    .replace(/\s+/g, " ")
    .split(/(?=<li class="new_proList_mainListLi")/)
    .filter((chunk) => chunk.includes(cardMarker));

  for (const card of cards) {
    const priceMatch = card.match(listingPriceRe) ?? card.match(fallbackPriceRe);
    const parsedPrice = priceMatch === null ? undefined : parseFloat(priceMatch[1]);
    // Zero, NaN, negative and implausibly large prices are treated as "no price".
    const price =
      parsedPrice !== undefined && parsedPrice > 0 && parsedPrice < 100000 ? parsedPrice : undefined;

    const photoMatch = card.match(photoRe);
    const imageUrl = photoMatch === null ? undefined : photoMatch[1];

    // Builds a product that shares this card's price, image and category data.
    const toProduct = (partNumber: string, name: string, url: string): Product => {
      const reach = detectReach(name);
      return {
        partNumber,
        name,
        url,
        price,
        imageUrl,
        formFactor: cat.formFactor,
        speed: cat.speed,
        speedGbps: cat.speedGbps,
        reachLabel: reach?.label,
        reachMeters: reach?.meters,
        fiberType: detectFiber(name),
        wavelength: detectWavelength(name),
      };
    };

    const titled = card.match(titledLinkRe);
    if (titled !== null) {
      const url = BASE + titled[1];
      const name = titled[2]
        .trim()
        .replace(/&amp;/g, "&")
        .replace(/&#\d+;/g, "")
        .replace(/\s+/g, " ");
      if (name.length >= 5 && !knownUrls.has(url)) {
        knownUrls.add(url);
        // Part number is the text before " compatible" / " for <Vendor>", capped at 80 chars.
        const head = name.split(/\s+(?:compatible|for\s+[A-Z])/i)[0]?.trim().slice(0, 80);
        results.push(toProduct(head || name.slice(0, 60), name, url));
      }
    }

    // Also collect SKU variant links from .sku_item.
    for (const variant of card.matchAll(skuLinkRe)) {
      const url = BASE + variant[1];
      const name = variant[2].trim().replace(/&amp;/g, "&");
      if (name.length >= 4 && !knownUrls.has(url)) {
        knownUrls.add(url);
        results.push(toProduct(name.slice(0, 80), name, url));
      }
    }
  }

  return results;
}
/**
 * Downloads a page and returns its body as text.
 *
 * The request uses the shared HEADERS and is aborted after 30 seconds.
 *
 * @param url Absolute URL to fetch.
 * @returns The response body.
 * @throws Error with the HTTP status and URL when the response is not OK.
 */
async function fetchPage(url: string): Promise<string> {
  const response = await fetch(url, {
    headers: HEADERS,
    signal: AbortSignal.timeout(30000),
  });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
/**
 * Crawls every configured FiberMall category and stores what it finds.
 *
 * For each entry in CATEGORIES the first listing page is fetched and parsed;
 * further pages (?page=2..MAX_PAGES) are requested until a page is empty,
 * contributes no new product URLs, or fails. Each unique product is then
 * passed to findOrCreateScrapedTransceiver; a price observation is upserted
 * when a price is present and the image is recorded when an image URL is
 * present.
 *
 * Requests are spaced by a 2-second pause, both between pages and between
 * categories. Failures are logged and contained: a failed page ends that
 * category's pagination, a failed product is skipped, and a failed category
 * does not stop the remaining categories.
 */
export async function scrapeFiberMall(): Promise<void> {
  console.log("=== FiberMall Scraper Starting ===\n");

  // Thrown values are not guaranteed to be Error instances; narrow before reading .message.
  const messageOf = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  const vendorId = await ensureVendor(
    "FiberMall",
    "compatible",
    "https://www.fibermall.com",
    "https://www.fibermall.com/store-16528-100g-qsfp28.htm",
  );

  let totalProducts = 0;
  let priceUpdates = 0;
  let imageUpdates = 0;
  let firstCategory = true;

  for (const cat of CATEGORIES) {
    // Pause before every category except the first, so the spacing holds even
    // when the previous category was skipped or failed early.
    if (!firstCategory) await sleep(2000);
    firstCategory = false;

    console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`);
    try {
      const firstPage = parseProductList(await fetchPage(BASE + cat.path), cat);
      if (firstPage.length === 0) {
        console.log(" No products found — skipping");
        continue;
      }
      console.log(` Found ${firstPage.length} products on page 1`);

      // Per-category de-duplication by URL, keeping the first occurrence.
      const seenUrls = new Set<string>();
      const uniqueProducts: Product[] = [];
      const addNew = (batch: Product[]): number => {
        let added = 0;
        for (const item of batch) {
          if (seenUrls.has(item.url)) continue;
          seenUrls.add(item.url);
          uniqueProducts.push(item);
          added++;
        }
        return added;
      };
      addNew(firstPage);

      for (let page = 2; page <= MAX_PAGES; page++) {
        await sleep(2000);
        try {
          // FiberMall pagination: ?page=N
          const pageProds = parseProductList(await fetchPage(`${BASE}${cat.path}?page=${page}`), cat);
          if (pageProds.length === 0) break;
          const added = addNew(pageProds);
          console.log(` Page ${page}: ${pageProds.length} products`);
          // A page with nothing new means the listing is repeating itself
          // (e.g. an out-of-range page number served as an earlier page).
          if (added === 0) {
            console.log(` Page ${page} contained no new products — stopping pagination`);
            break;
          }
        } catch (err) {
          console.warn(` Page ${page} failed: ${messageOf(err).slice(0, 60)}`);
          break;
        }
      }
      console.log(` Total unique: ${uniqueProducts.length}`);

      for (const product of uniqueProducts) {
        try {
          const txId = await findOrCreateScrapedTransceiver({
            partNumber: product.partNumber,
            vendorId,
            productUrl: product.url,
            formFactor: product.formFactor,
            speedGbps: product.speedGbps,
            speed: product.speed,
            reachMeters: product.reachMeters,
            reachLabel: product.reachLabel,
            fiberType: product.fiberType,
            wavelengths: product.wavelength,
            category: "DataCenter",
          });

          if (product.price !== undefined && product.price > 0) {
            const hash = contentHash({ price: product.price, part: product.partNumber });
            const updated = await upsertPriceObservation({
              transceiverId: txId,
              sourceVendorId: vendorId,
              price: product.price,
              currency: "USD",
              stockLevel: "in_stock",
              url: product.url,
              contentHash: hash,
            });
            if (updated) priceUpdates++;
          }

          // Record the image URL when the listing provided one.
          if (product.imageUrl) {
            const updatedImage = await markImageVerified(txId, product.imageUrl);
            if (updatedImage) imageUpdates++;
          }

          totalProducts++;
        } catch (err) {
          console.warn(` DB error: ${messageOf(err).slice(0, 80)}`);
        }
      }
    } catch (err) {
      console.error(` Category failed: ${messageOf(err)}`);
    }
  }

  console.log(`\n=== FiberMall Complete: ${totalProducts} products, ${priceUpdates} price updates, ${imageUpdates} images ===`);
}
// Run the scraper only when this file is executed directly, not when imported.
if (require.main === module) {
  // Success path: close the database pool once scraping has finished.
  const closePool = () => pool.end();

  // Failure path: report, start closing the pool, and exit with a non-zero code.
  const onFatal = (err: unknown) => {
    console.error("Fatal:", err);
    pool.end();
    process.exit(1);
  };

  scrapeFiberMall().then(closePool).catch(onFatal);
}