/**
 * ShopFiber24 Scraper — German compatible transceiver shop
 *
 * shop.fiber24.net — EUR prices, FO transceiver category.
 *
 * Strategy: The catalog page is fully JS-rendered (BMS/JTL shop system).
 * Product detail pages DO have static Schema.org microdata with real prices.
 *
 * Approach:
 * 1. Fetch sitemap_0.xml.gz → extract all /de product URLs
 * 2. Filter transceiver/optics URLs (keyword match)
 * 3. Fetch each product page → parse itemprop microdata (price, sku, image)
 * 4. Upsert transceiver + price_observation + image_url
 *
 * Rate limited: 1 req/1.5 sec.
 */

import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
import { contentHash } from "../utils/hash";
import * as cheerio from "cheerio";
import * as zlib from "zlib";

const BASE = "https://shop.fiber24.net";
const SITEMAP_URL = `${BASE}/export/sitemap_0.xml.gz`;

const HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml",
  "Accept-Language": "de-DE,de;q=0.9,en;q=0.8",
};

/** Per-request timeout for both the sitemap and the product pages. */
const REQUEST_TIMEOUT_MS = 30000;

/** Pause between two product page requests (rate limit from the file header). */
const REQUEST_DELAY_MS = 1500;

/** Prices at or above this value are treated as parse errors and dropped. */
const MAX_PLAUSIBLE_PRICE = 50000;

/** Slug keywords that mark a sitemap URL as a transceiver/optics product. */
const PRODUCT_SLUG_PATTERN = /sfp|qsfp|osfp|xfp|transceiver|optic|aoc|dac|cwdm|dwdm|bidi|wdm/i;

/**
 * Reach detection table, evaluated top to bottom; the first match wins.
 * Explicit distances come before the optic-class suffixes (LR, SR, ...),
 * so a stated distance always overrides the class default.
 */
const REACH_PATTERNS: readonly (readonly [RegExp, string, number])[] = [
  [/\b80\s*km\b/i, "80km", 80000],
  [/\b40\s*km\b/i, "40km", 40000],
  [/\b20\s*km\b/i, "20km", 20000],
  [/\b15\s*km\b/i, "15km", 15000],
  [/\b10\s*km\b/i, "10km", 10000],
  [/\b2\s*km\b/i, "2km", 2000],
  [/\b550\s*m\b/i, "550m", 550],
  [/\b500\s*m\b/i, "500m", 500],
  [/\b300\s*m\b/i, "300m", 300],
  [/\b150\s*m\b/i, "150m", 150],
  [/\b100\s*m\b/i, "100m", 100],
  [/\b70\s*m\b/i, "70m", 70],
  [/\b30\s*m\b/i, "30m", 30],
  [/\bLR4\b/, "10km", 10000],
  [/\bLR\b/, "10km", 10000],
  [/\bER4?\b/, "40km", 40000],
  [/\bZR4?\b/, "80km", 80000],
  [/\bSR4?\b/, "300m", 300],
  [/\bDR4?\b/, "500m", 500],
  [/\bFR4?\b/, "2km", 2000],
];

interface Product {
  partNumber: string;
  name: string;
  url: string;
  price?: number;
  currency: string;
  imageUrl?: string;
  formFactor: string;
  speed: string;
  speedGbps: number;
  reachLabel?: string;
  reachMeters?: number;
  fiberType?: string;
  wavelength?: string;
}

/** Resolve after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/** Extract a printable message from anything that can be thrown. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/** Return the second-to-last path segment of a product URL (`…/<slug>/de`), or "" if absent. */
function slugOf(url: string): string {
  const segments = url.split("/");
  return segments[segments.length - 2] ?? "";
}

/**
 * Guess form factor and line rate from free text (product name + part number).
 *
 * Checks run from the most specific / fastest module type down to plain SFP.
 * Text that matches nothing falls back to SFP+ / 10G.
 *
 * @param text Product name and/or part number.
 * @returns Form factor label, speed label and speed in Gbit/s.
 */
function detectFormFactor(text: string): { formFactor: string; speed: string; speedGbps: number } {
  const lower = text.toLowerCase();
  const is800g = lower.includes("800g") || lower.includes("qsfp-dd800");

  // OSFP must be tested before the generic "400g" keyword, otherwise every
  // OSFP 400G module would be reported as QSFP-DD.
  if (lower.includes("osfp") && !lower.includes("qsfp")) {
    return is800g
      ? { formFactor: "OSFP", speed: "800G", speedGbps: 800 }
      : { formFactor: "OSFP", speed: "400G", speedGbps: 400 };
  }
  if (is800g) return { formFactor: "QSFP-DD", speed: "800G", speedGbps: 800 };
  if (lower.includes("400g") || lower.includes("qsfp-dd") || lower.includes("qsfpdd")) {
    return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 };
  }
  // Must precede the SFP56 check: "qsfp56" contains the substring "sfp56".
  if (lower.includes("qsfp56") || lower.includes("200g")) {
    return { formFactor: "QSFP56", speed: "200G", speedGbps: 200 };
  }
  if (lower.includes("qsfp28") || lower.includes("100g")) {
    return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
  }
  if (lower.includes("qsfp+") || lower.includes("40g")) {
    return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 };
  }
  if (lower.includes("sfp56") || lower.includes("50g")) {
    return { formFactor: "SFP56", speed: "50G", speedGbps: 50 };
  }
  if (lower.includes("sfp28") || lower.includes("25g")) {
    return { formFactor: "SFP28", speed: "25G", speedGbps: 25 };
  }
  if (lower.includes("10g") || lower.includes("sfp+") || lower.includes("xfp")) {
    return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
  }
  if (lower.includes("1000base") || lower.includes("1g") || /\bsfp\b/.test(lower)) {
    return { formFactor: "SFP", speed: "1G", speedGbps: 1 };
  }
  // Default for anything unrecognised (including bare AOC/DAC cables).
  return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
}

/**
 * Guess the reach of an optic from free text.
 *
 * @param text Product name and/or part number.
 * @returns Reach label and distance in meters, or `undefined` when no pattern matches.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  for (const [regex, label, meters] of REACH_PATTERNS) {
    if (regex.test(text)) return { label, meters };
  }
  return undefined;
}

/**
 * Guess the medium ("SMF", "MMF" or "Copper") from free text.
 *
 * The two-letter optic codes (LX/LR/ER/ZR, SX/SR) are matched as standalone
 * tokens that may also sit at the very start or end of the text, so a part
 * number ending in "-SR" is recognised. Unrecognised text defaults to "SMF".
 *
 * @param text Product name and/or part number.
 */
function detectFiber(text: string): string {
  if (/single.?mode|smf|(?:^|[^a-z])(?:lx|lr|er|zr)(?:[^a-z]|$)|bidi|cwdm|dwdm/i.test(text)) return "SMF";
  if (/multi.?mode|mmf|(?:^|[^a-z])(?:sx|sr)(?:[^a-z]|$)/i.test(text)) return "MMF";
  if (/copper|dac|twinax|rj45|base-t/i.test(text)) return "Copper";
  if (/aoc|active.?optical/i.test(text)) return "MMF";
  return "SMF";
}

/** Return the first 3–4 digit number followed by "nm" in `text`, or "" if there is none. */
function detectWavelength(text: string): string {
  const match = text.match(/(\d{3,4})\s*nm/i);
  return match ? match[1] : "";
}

/**
 * Parse a price that may be written with a decimal point ("1234.56") or in
 * German notation ("1.234,56 €").
 *
 * Only the first numeric token of the input is considered. The separator that
 * occurs last is taken as the decimal mark when it is a comma; everything
 * before it is stripped of grouping characters.
 *
 * @returns The price, or `undefined` when the input holds no number or the
 *          value is not within (0, MAX_PLAUSIBLE_PRICE).
 */
function parsePrice(raw: string | undefined): number | undefined {
  if (!raw) return undefined;
  const token = raw.match(/-?\d[\d.,]*/)?.[0].replace(/[.,]+$/, "");
  if (!token) return undefined;

  const lastComma = token.lastIndexOf(",");
  const lastDot = token.lastIndexOf(".");
  const normalized =
    lastComma > lastDot
      ? `${token.slice(0, lastComma).replace(/[.,]/g, "")}.${token.slice(lastComma + 1)}`
      : token.replace(/,/g, "");

  const value = parseFloat(normalized);
  return Number.isFinite(value) && value > 0 && value < MAX_PLAUSIBLE_PRICE ? value : undefined;
}

/**
 * Turn a possibly relative URL into an absolute one.
 * Absolute http(s) URLs are returned untouched; if resolution fails the raw
 * value is returned unchanged.
 */
function resolveUrl(raw: string, base: string): string {
  if (/^https?:\/\//i.test(raw)) return raw;
  try {
    return new URL(raw, base).href;
  } catch {
    return raw;
  }
}

/**
 * Download the sitemap and return it as an XML string.
 *
 * The body is gunzipped only when it starts with the gzip magic bytes, so a
 * response that arrives already decompressed is passed through as-is.
 *
 * @throws Error when the server answers with a non-OK HTTP status.
 */
async function fetchSitemap(): Promise<string> {
  const resp = await fetch(SITEMAP_URL, { headers: HEADERS, signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS) });
  if (!resp.ok) throw new Error(`Sitemap fetch failed: HTTP ${resp.status}`);

  const body = Buffer.from(await resp.arrayBuffer());
  const isGzip = body.length >= 2 && body[0] === 0x1f && body[1] === 0x8b;
  return (isGzip ? zlib.gunzipSync(body) : body).toString("utf8");
}

/**
 * Extract German product URLs from sitemap XML.
 *
 * Keeps only URLs that end in `/de` and whose slug contains one of the
 * transceiver/optics keywords. Duplicates are removed; first-seen order is kept.
 */
function extractProductUrls(xml: string): string[] {
  const all = [...xml.matchAll(/(https?:\/\/shop\.fiber24\.net\/[^<\s]+)\s*<\/loc>/g)].map((m) => m[1]);

  const wanted = all.filter((url) => url.endsWith("/de") && PRODUCT_SLUG_PATTERN.test(slugOf(url)));
  return [...new Set(wanted)];
}

/**
 * Parse a product detail page: microdata price, SKU and image, plus
 * attributes guessed from the name and part number.
 *
 * @param html Raw HTML of the product page.
 * @param url  URL the page was fetched from (used for the slug fallback and
 *             to resolve relative image URLs).
 * @returns The parsed product, or `null` when no usable part number (< 3
 *          characters) or name (< 5 characters) could be found.
 */
function parseProductPage(html: string, url: string): Product | null {
  const $ = cheerio.load(html);

  // SKU from microdata, falling back to the URL slug
  // (e.g. https://shop.fiber24.net/F24-CI-SFP-10G-AOC/de → F24-CI-SFP-10G-AOC)
  const partNumber = $("[itemprop='sku']").first().text().trim() || slugOf(url);
  if (!partNumber || partNumber.length < 3) return null;

  // Name
  const name = $("h1").first().text().trim() || $("[itemprop='name']").first().text().trim() || partNumber;
  if (!name || name.length < 5) return null;

  // Price — a valid minPrice wins; otherwise the lowest valid itemprop price.
  const prices = $("[itemprop='price']")
    .toArray()
    .map((el) => parsePrice($(el).attr("content") || $(el).text()))
    .filter((p): p is number => p !== undefined);
  const minPrice = parsePrice($("[itemprop='minPrice']").attr("content"));
  const price = minPrice ?? (prices.length > 0 ? Math.min(...prices) : undefined);

  // Currency
  const currency = $("[itemprop='priceCurrency']").first().attr("content") || "EUR";

  // Image — prefer a large variant; read it from whichever attribute carries it.
  const images = $("[itemprop='image']");
  const largeImage = images
    .filter((_i, el) => {
      const src = $(el).attr("src") || $(el).attr("content") || "";
      return src.includes("/lg/") || src.includes("large");
    })
    .first();
  const rawImageUrl =
    largeImage.attr("src") ||
    largeImage.attr("content") ||
    images.first().attr("src") ||
    images.first().attr("content") ||
    undefined;

  // "keinBild" marks the shop's no-image placeholder; never store it.
  const documentBase = $("base[href]").first().attr("href") || url || BASE;
  const imageUrl =
    rawImageUrl && !rawImageUrl.includes("keinBild") ? resolveUrl(rawImageUrl, documentBase) : undefined;

  const fullText = `${name} ${partNumber}`;
  const reach = detectReach(fullText);

  return {
    partNumber,
    name,
    url,
    price,
    currency,
    imageUrl,
    ...detectFormFactor(fullText),
    reachLabel: reach?.label,
    reachMeters: reach?.meters,
    fiberType: detectFiber(fullText),
    wavelength: detectWavelength(fullText),
  };
}

/**
 * Fetch a page and return its body as text.
 *
 * @throws Error when the server answers with a non-OK HTTP status.
 */
async function fetchPage(url: string): Promise<string> {
  const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS) });
  if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`);
  return resp.text();
}

/**
 * Run the full scrape: sitemap → product pages → database upserts.
 *
 * A sitemap failure or an empty URL list is logged and ends the run without
 * throwing. Failures on individual product pages are logged and skipped.
 */
export async function scrapeFiber24(): Promise<void> {
  console.log("=== ShopFiber24 Scraper Starting (sitemap-based) ===\n");

  const vendorId = await ensureVendor("ShopFiber24", "compatible", BASE, `${BASE}/FO-TRANSCEIVER/de`);

  // Step 1: Fetch sitemap and extract product URLs
  console.log(" Fetching sitemap...");
  let productUrls: string[] = [];
  try {
    const xml = await fetchSitemap();
    productUrls = extractProductUrls(xml);
    console.log(` Found ${productUrls.length} transceiver product URLs in sitemap`);
  } catch (err) {
    console.error(` Sitemap failed: ${errorMessage(err)}`);
    return;
  }

  if (productUrls.length === 0) {
    console.log(" No product URLs found — aborting");
    return;
  }

  // Step 2: Scrape each product page
  let totalProducts = 0;
  let priceUpdates = 0;
  let imageUpdates = 0;

  for (const [i, url] of productUrls.entries()) {
    try {
      const html = await fetchPage(url);
      const product = parseProductPage(html, url);

      if (!product) {
        console.log(` [${i + 1}/${productUrls.length}] Skip (no data): ${url}`);
        continue;
      }

      const txId = await findOrCreateScrapedTransceiver({
        partNumber: product.partNumber,
        vendorId,
        formFactor: product.formFactor,
        speedGbps: product.speedGbps,
        speed: product.speed,
        reachMeters: product.reachMeters,
        reachLabel: product.reachLabel,
        fiberType: product.fiberType,
        wavelengths: product.wavelength,
        category: "Compatible",
      });

      if (product.price !== undefined) {
        const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
        const updated = await upsertPriceObservation({
          transceiverId: txId,
          sourceVendorId: vendorId,
          price: product.price,
          currency: product.currency,
          stockLevel: "on_request",
          url: product.url,
          contentHash: hash,
        });
        if (updated) priceUpdates++;
      }

      // Save image URL to transceivers table if present. The WHERE clause keeps
      // an existing image, so only count rows that were actually written.
      // NOTE(review): relies on pool.query resolving to a result with
      // `rowCount` (node-postgres style) — confirm against ../utils/db.
      if (product.imageUrl) {
        const result = await pool.query(
          `UPDATE transceivers
              SET image_url = $1, image_scraped_at = NOW(), has_image = true
            WHERE id = $2 AND (image_url IS NULL OR image_url = '')`,
          [product.imageUrl, txId],
        );
        if ((result.rowCount ?? 0) > 0) imageUpdates++;
      }

      totalProducts++;
      if ((i + 1) % 10 === 0) {
        console.log(` Progress: ${i + 1}/${productUrls.length} — ${priceUpdates} prices, ${imageUpdates} images`);
      }
    } catch (err) {
      console.warn(` [${i + 1}] Error ${url}: ${errorMessage(err).slice(0, 80)}`);
    }

    // Rate limit: pause between requests, but not after the last one.
    if (i < productUrls.length - 1) await sleep(REQUEST_DELAY_MS);
  }

  console.log(
    `\n=== ShopFiber24 Complete: ${totalProducts} products, ${priceUpdates} prices, ${imageUpdates} images ===`,
  );
}

if (require.main === module) {
  scrapeFiber24()
    .then(() => pool.end())
    .catch(async (err: unknown) => {
      console.error("Fatal:", err);
      // Let the pool finish closing before the process is terminated.
      try {
        await pool.end();
      } catch {
        // Pool already closed or failed to close; exiting anyway.
      }
      process.exit(1);
    });
}