diff --git a/packages/scraper/src/scrapers/fiber24.ts b/packages/scraper/src/scrapers/fiber24.ts index de2eb97..c5d2404 100644 --- a/packages/scraper/src/scrapers/fiber24.ts +++ b/packages/scraper/src/scrapers/fiber24.ts @@ -2,17 +2,25 @@ * ShopFiber24 Scraper — German compatible transceiver shop * * shop.fiber24.net — EUR prices, FO transceiver category. - * Paginated catalog: /FO-TRANSCEIVER/de?p=N * - * Rate limited: 1 req/2sec. + * Strategy: The catalog page is fully JS-rendered (BMS/JTL shop system). + * Product detail pages DO have static Schema.org microdata with real prices. + * + * Approach: + * 1. Fetch sitemap_0.xml.gz → extract all /de product URLs + * 2. Filter transceiver/optics URLs (keyword match) + * 3. Fetch each product page → parse itemprop microdata (price, sku, image) + * 4. Upsert transceiver + price_observation + image_url + * + * Rate limited: 1 req/1.5 sec. */ import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db"; import { contentHash } from "../utils/hash"; import * as cheerio from "cheerio"; +import * as zlib from "zlib"; const BASE = "https://shop.fiber24.net"; -const CATALOG_PATH = "/FO-TRANSCEIVER/de"; -const MAX_PAGES = 20; +const SITEMAP_URL = "https://shop.fiber24.net/export/sitemap_0.xml.gz"; const HEADERS = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", Accept: "text/html,application/xhtml+xml", @@ -24,6 +32,8 @@ interface Product { name: string; url: string; price?: number; + currency: string; + imageUrl?: string; formFactor: string; speed: string; speedGbps: number; @@ -39,29 +49,28 @@ function sleep(ms: number): Promise { function detectFormFactor(text: string): { formFactor: string; speed: string; speedGbps: number } { const lower = text.toLowerCase(); + if (lower.includes("400g") || lower.includes("qsfp-dd800") || lower.includes("800g")) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }; if (lower.includes("osfp") && !lower.includes("qsfp")) return { formFactor: "OSFP", speed: "400G", speedGbps: 400 }; - if (lower.includes("qsfp-dd")) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }; - if (lower.includes("qsfp28")) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 }; - if (lower.includes("qsfp+") || lower.includes("qsfp plus")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 }; - if (lower.includes("sfp56")) return { formFactor: "SFP56", speed: "50G", speedGbps: 50 }; + if (lower.includes("qsfp-dd") || lower.includes("qsfpdd")) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }; + if (lower.includes("qsfp28") || lower.includes("100g")) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 }; + if (lower.includes("qsfp+") || lower.includes("40g")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 }; + if (lower.includes("sfp56") || lower.includes("50g")) return { formFactor: "SFP56", speed: "50G", speedGbps: 50 }; if (lower.includes("sfp28") || lower.includes("25g")) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 }; - if (lower.includes("sfp+") || lower.includes("10gbase") || lower.includes("10g")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 }; - if (lower.includes("xfp")) return { formFactor: "XFP", speed: "10G", speedGbps: 10 }; - if (lower.includes("1000base") || lower.includes("1g")) return { formFactor: "SFP", speed: "1G", speedGbps: 1 }; - if (lower.includes("sfp") && !lower.includes("qsfp")) return { formFactor: "SFP", speed: "1G", speedGbps: 1 }; + if (lower.includes("10g") || lower.includes("sfp+") || lower.includes("10gbase") || lower.includes("xfp")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 }; + if (lower.includes("1000base") || lower.includes("1g") || lower.includes(" sfp ") || lower.match(/\bsfp\b/)) return { formFactor: "SFP", speed: "1G", speedGbps: 1 }; + if (lower.includes("aoc") || lower.includes("dac")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 }; return { formFactor: "SFP+", speed: "10G", speedGbps: 10 }; } function detectReach(text: string): { label: string; meters: number } | undefined { const patterns: [RegExp, string, number][] = [ - [/\b80\s*km\b/i, "80km", 80000], - [/\b40\s*km\b/i, "40km", 40000], - [/\b20\s*km\b/i, "20km", 20000], - [/\b10\s*km\b/i, "10km", 10000], - [/\b2\s*km\b/i, "2km", 2000], - [/\b550\s*m\b/i, "550m", 550], - [/\b300\s*m\b/i, "300m", 300], - [/\b100\s*m\b/i, "100m", 100], + [/\b80\s*km\b/i, "80km", 80000], [/\b40\s*km\b/i, "40km", 40000], + [/\b20\s*km\b/i, "20km", 20000], [/\b15\s*km\b/i, "15km", 15000], + [/\b10\s*km\b/i, "10km", 10000], [/\b2\s*km\b/i, "2km", 2000], + [/\b550\s*m\b/i, "550m", 550], [/\b500\s*m\b/i, "500m", 500], + [/\b300\s*m\b/i, "300m", 300], [/\b150\s*m\b/i, "150m", 150], + [/\b100\s*m\b/i, "100m", 100], [/\b70\s*m\b/i, "70m", 70], + [/\b30\s*m\b/i, "30m", 30], [/\bLR4\b/, "10km", 10000], [/\bLR\b/, "10km", 10000], [/\bER4?\b/, "40km", 40000], [/\bZR4?\b/, "80km", 80000], [/\bSR4?\b/, "300m", 300], [/\bDR4?\b/, "500m", 500], [/\bFR4?\b/, "2km", 2000], @@ -76,6 +85,7 @@ function detectFiber(text: string): string { if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text)) return "SMF"; if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(text)) return "MMF"; if (/copper|dac|twinax|rj45|base-t/i.test(text)) return "Copper"; + if (/aoc|active.?optical/i.test(text)) return "MMF"; return "SMF"; } @@ -84,87 +94,81 @@ function detectWavelength(text: string): string { return match ? match[1] : ""; } -function parseProductList(html: string): Product[] { - const $ = cheerio.load(html); - const products: Product[] = []; +/** Fetch sitemap GZ → decompress → return XML string */ +async function fetchSitemap(): Promise { + const resp = await fetch(SITEMAP_URL, { headers: HEADERS, signal: AbortSignal.timeout(30000) }); + if (!resp.ok) throw new Error(`Sitemap fetch failed: HTTP ${resp.status}`); + const buf = await resp.arrayBuffer(); + const decompressed = zlib.gunzipSync(Buffer.from(buf)); + return decompressed.toString("utf8"); +} - // Common e-commerce product listing selectors - const cardSelectors = [ - ".product-item", ".item", "li.product", ".product-card", - ".woocommerce-loop-product", "article.product", ".grid-item", - ]; - - let found = false; - for (const sel of cardSelectors) { - if ($(sel).length > 0) { - $(sel).each((_i, el) => { - const nameEl = $(el).find("h2, h3, .product-name, .product-title, .item-title, a").first(); - const name = nameEl.text().trim(); - if (!name || name.length < 5 || !/sfp|qsfp|xfp|transceiver|optic/i.test(name)) return; - - const linkEl = $(el).find("a[href]").first(); - const href = linkEl.attr("href") || ""; - const url = href.startsWith("http") ? href : BASE + href; - - const priceText = $(el).find(".price, .product-price, .price-box, .amount, [data-price]").text(); - const priceMatch = priceText.match(/([\d.,]+)\s*€|€\s*([\d.,]+)/); - let price: number | undefined; - if (priceMatch) { - const raw = (priceMatch[1] || priceMatch[2]).replace(/\./g, "").replace(",", "."); - const parsed = parseFloat(raw); - if (parsed > 0 && parsed < 50000) price = parsed; - } - - const skuText = $(el).find(".sku, [data-sku], .product-sku").text().trim(); - const partNumber = skuText || - name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] || - name.replace(/\s+/g, "-").slice(0, 60); - - const ff = detectFormFactor(name); - const reach = detectReach(name); - - products.push({ - partNumber, - name, - url, - price, - ...ff, - reachLabel: reach?.label, - reachMeters: reach?.meters, - fiberType: detectFiber(name), - wavelength: detectWavelength(name), - }); - }); - if (products.length > 0) { found = true; break; } - } - } - - // Fallback: extract all transceiver-relevant links - if (!found) { - $("a[href]").each((_i, el) => { - const name = $(el).text().trim(); - const href = $(el).attr("href") || ""; - if (name.length < 8 || name.length > 200 || !/sfp|qsfp|transceiver/i.test(name)) return; - const url = href.startsWith("http") ? href : BASE + href; - const ff = detectFormFactor(name); - const reach = detectReach(name); - products.push({ - partNumber: name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] || name.replace(/\s+/g, "-").slice(0, 60), - name, url, ...ff, - reachLabel: reach?.label, reachMeters: reach?.meters, - fiberType: detectFiber(name), wavelength: detectWavelength(name), - }); - }); - } - - const seen = new Set(); - return products.filter((p) => { - if (!p.url || seen.has(p.url)) return false; - seen.add(p.url); - return true; +/** Extract German product URLs from sitemap XML */ +function extractProductUrls(xml: string): string[] { + const all = [...xml.matchAll(/(https?:\/\/shop\.fiber24\.net\/[^<]+)<\/loc>/g)] + .map((m) => m[1]); + // Keep only /de language URLs, filter transceiver/optics/aoc/dac categories + return all.filter((url) => { + if (!url.endsWith("/de")) return false; + const slug = url.split("/").slice(-2, -1)[0].toLowerCase(); + return /sfp|qsfp|osfp|xfp|transceiver|optic|aoc|dac|cwdm|dwdm|bidi|wdm/i.test(slug); }); } +/** Parse product detail page — extracts microdata price, SKU, image */ +function parseProductPage(html: string, url: string): Product | null { + const $ = cheerio.load(html); + + // SKU from URL slug (e.g. https://shop.fiber24.net/F24-CI-SFP-10G-AOC/de → F24-CI-SFP-10G-AOC) + const slug = url.split("/").slice(-2, -1)[0]; + const partNumber = $("[itemprop='sku']").first().text().trim() || slug; + if (!partNumber || partNumber.length < 3) return null; + + // Name + const name = $("h1").first().text().trim() + || $("[itemprop='name']").first().text().trim() + || partNumber; + if (!name || name.length < 5) return null; + + // Price — take the lowest itemprop price (minPrice if available, else first price) + const priceEls = $("[itemprop='price']").map((_i, el) => { + const val = $(el).attr("content") || $(el).text(); + return parseFloat(val.replace(",", ".")); + }).get().filter((p: number) => !isNaN(p) && p > 0 && p < 50000); + const minPriceEl = $("[itemprop='minPrice']").attr("content"); + const price = minPriceEl ? parseFloat(minPriceEl) : (priceEls.length > 0 ? Math.min(...priceEls) : undefined); + + // Currency + const currency = $("[itemprop='priceCurrency']").first().attr("content") || "EUR"; + + // Image — prefer large image + const imageUrl = $("[itemprop='image']").filter((_i, el) => { + const src = $(el).attr("src") || $(el).attr("content") || ""; + return src.includes("/lg/") || src.includes("large"); + }).first().attr("src") + || $("[itemprop='image']").first().attr("src") + || $("[itemprop='image']").first().attr("content") + || undefined; + + const fullText = `${name} ${partNumber}`; + const ff = detectFormFactor(fullText); + const reach = detectReach(fullText); + + return { + partNumber, + name, + url, + price: price && price > 0 ? price : undefined, + currency, + imageUrl: imageUrl && !imageUrl.includes("keinBild") ? imageUrl : undefined, + ...ff, + reachLabel: reach?.label, + reachMeters: reach?.meters, + fiberType: detectFiber(fullText), + wavelength: detectWavelength(fullText), + }; +} + async function fetchPage(url: string): Promise { const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(30000) }); if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`); @@ -172,52 +176,47 @@ async function fetchPage(url: string): Promise { } export async function scrapeFiber24(): Promise { - console.log("=== ShopFiber24 Scraper Starting ===\n"); + console.log("=== ShopFiber24 Scraper Starting (sitemap-based) ===\n"); const vendorId = await ensureVendor( "ShopFiber24", "compatible", "https://shop.fiber24.net", - BASE + CATALOG_PATH, + "https://shop.fiber24.net/FO-TRANSCEIVER/de", ); - let allProducts: Product[] = []; - - for (let page = 1; page <= MAX_PAGES; page++) { - try { - // Try common pagination patterns: ?p=N, ?page=N, /page/N/ - const url = page === 1 - ? BASE + CATALOG_PATH - : `${BASE}${CATALOG_PATH}?p=${page}`; - const html = await fetchPage(url); - const pageProducts = parseProductList(html); - allProducts.push(...pageProducts); - console.log(` Page ${page}: ${pageProducts.length} products`); - if (pageProducts.length === 0) { - console.log(` Empty page ${page}, stopping pagination.`); - break; - } - if (page < MAX_PAGES) await sleep(2000); - } catch (err) { - console.warn(` Page ${page} failed: ${(err as Error).message}`); - break; - } + // Step 1: Fetch sitemap and extract product URLs + console.log(" Fetching sitemap..."); + let productUrls: string[] = []; + try { + const xml = await fetchSitemap(); + productUrls = extractProductUrls(xml); + console.log(` Found ${productUrls.length} transceiver product URLs in sitemap`); + } catch (err) { + console.error(` Sitemap failed: ${(err as Error).message}`); + return; } - const seen = new Set(); - allProducts = allProducts.filter((p) => { - if (seen.has(p.url)) return false; - seen.add(p.url); - return true; - }); - - console.log(`\nTotal unique products: ${allProducts.length}`); + if (productUrls.length === 0) { + console.log(" No product URLs found — aborting"); + return; + } + // Step 2: Scrape each product page let totalProducts = 0; let priceUpdates = 0; + let imageUpdates = 0; - for (const product of allProducts) { + for (let i = 0; i < productUrls.length; i++) { + const url = productUrls[i]; try { + const html = await fetchPage(url); + const product = parseProductPage(html, url); + if (!product) { + console.log(` [${i + 1}/${productUrls.length}] Skip (no data): ${url}`); + continue; + } + const txId = await findOrCreateScrapedTransceiver({ partNumber: product.partNumber, vendorId, @@ -228,7 +227,7 @@ export async function scrapeFiber24(): Promise { reachLabel: product.reachLabel, fiberType: product.fiberType, wavelengths: product.wavelength, - category: "DataCenter", + category: "Compatible", }); if (product.price && product.price > 0) { @@ -237,20 +236,36 @@ export async function scrapeFiber24(): Promise { transceiverId: txId, sourceVendorId: vendorId, price: product.price, - currency: "EUR", - stockLevel: "in_stock", + currency: product.currency, + stockLevel: "unknown", url: product.url, contentHash: hash, }); if (updated) priceUpdates++; } + + // Save image URL to transceivers table if present + if (product.imageUrl) { + await pool.query( + `UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = true + WHERE id = $2 AND (image_url IS NULL OR image_url = '')`, + [product.imageUrl, txId], + ); + imageUpdates++; + } + totalProducts++; + if ((i + 1) % 10 === 0) { + console.log(` Progress: ${i + 1}/${productUrls.length} — ${priceUpdates} prices, ${imageUpdates} images`); + } } catch (err) { - console.warn(` Error saving ${product.partNumber}: ${(err as Error).message.slice(0, 80)}`); + console.warn(` [${i + 1}] Error ${url}: ${(err as Error).message.slice(0, 80)}`); } + + if (i < productUrls.length - 1) await sleep(1500); } - console.log(`\n=== ShopFiber24 Complete: ${totalProducts} products, ${priceUpdates} prices ===`); + console.log(`\n=== ShopFiber24 Complete: ${totalProducts} products, ${priceUpdates} prices, ${imageUpdates} images ===`); } if (require.main === module) { diff --git a/packages/scraper/src/scrapers/fibermall.ts b/packages/scraper/src/scrapers/fibermall.ts index 6e675bc..79e2d62 100644 --- a/packages/scraper/src/scrapers/fibermall.ts +++ b/packages/scraper/src/scrapers/fibermall.ts @@ -43,6 +43,7 @@ interface Product { name: string; url: string; price?: number; + imageUrl?: string; formFactor: string; speed: string; speedGbps: number; @@ -119,6 +120,10 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product card.match(/data-price="([1-9][\d]*\.?\d{0,2})"/); // skip 0.00 const price = priceM ? parseFloat(priceM[1]) : undefined; + // Image URL: first in the card (360x360 product photo) + const imgM = card.match(/(?:src|data-src)="(https:\/\/www\.fibermall\.com\/photo\/[^"]+\.(?:jpg|png|webp))"/i); + const imageUrl = imgM ? imgM[1] : undefined; + // Main product link: first with title attribute const mainLinkM = card.match(/href="(\/sale-\d+[^"?#]*\.htm)"[^>]*title="([^"]{8,})"/i); if (mainLinkM) { @@ -131,6 +136,7 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product products.push({ partNumber, name, url, price: price && price > 0 && price < 100000 ? price : undefined, + imageUrl, formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps, reachLabel: reach?.label, reachMeters: reach?.meters, fiberType: detectFiber(name), wavelength: detectWavelength(name), @@ -149,6 +155,7 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product partNumber: name.slice(0, 80), name, url, price: price && price > 0 && price < 100000 ? price : undefined, + imageUrl, // SKU variants share the same product image formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps, reachLabel: reach?.label, reachMeters: reach?.meters, fiberType: detectFiber(name), wavelength: detectWavelength(name), @@ -177,6 +184,7 @@ export async function scrapeFiberMall(): Promise { let totalProducts = 0; let priceUpdates = 0; + let imageUpdates = 0; const seenCategories = new Set(); for (const cat of CATEGORIES) { @@ -243,6 +251,18 @@ export async function scrapeFiberMall(): Promise { }); if (updated) priceUpdates++; } + + // Save image URL if found and not yet stored + if (product.imageUrl) { + const imgResult = await pool.query( + `UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = true + WHERE id = $2 AND (image_url IS NULL OR image_url = '') + RETURNING id`, + [product.imageUrl, txId], + ); + if (imgResult.rowCount && imgResult.rowCount > 0) imageUpdates++; + } + totalProducts++; } catch (err) { console.warn(` DB error: ${(err as Error).message.slice(0, 80)}`); @@ -255,7 +275,7 @@ export async function scrapeFiberMall(): Promise { await sleep(2000); } - console.log(`\n=== FiberMall Complete: ${totalProducts} products, ${priceUpdates} price updates ===`); + console.log(`\n=== FiberMall Complete: ${totalProducts} products, ${priceUpdates} price updates, ${imageUpdates} images ===`); } if (require.main === module) {