2026-05-10 01:42:24 +02:00

291 lines
11 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* ShopFiber24 Scraper — German compatible transceiver shop
*
* shop.fiber24.net — EUR prices, FO transceiver category.
*
* Strategy: The catalog page is fully JS-rendered (BMS/JTL shop system).
* Product detail pages DO have static Schema.org microdata with real prices.
*
* Approach:
* 1. Fetch sitemap_0.xml.gz → extract all /de product URLs
* 2. Filter transceiver/optics URLs (keyword match)
* 3. Fetch each product page → parse itemprop microdata (price, sku, image)
* 4. Upsert transceiver + price_observation + image_url
*
* Rate limited: 1 req/1.5 sec.
*/
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, markImageVerified } from "../utils/db";
import { contentHash } from "../utils/hash";
import * as cheerio from "cheerio";
import * as zlib from "zlib";
/** Origin of the ShopFiber24 storefront. */
const BASE = "https://shop.fiber24.net";

/** Location of the gzipped sitemap that lists the shop's URLs. */
const SITEMAP_URL = `${BASE}/export/sitemap_0.xml.gz`;

/**
 * Request headers sent with every fetch in this module: a desktop Chrome
 * User-Agent, an HTML Accept header and a German-first Accept-Language.
 */
const HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml",
  "Accept-Language": "de-DE,de;q=0.9,en;q=0.8",
};
/**
 * One product as parsed from a shop detail page by `parseProductPage`.
 * Required fields come first, optional ones after.
 */
interface Product {
  /** Text of the `itemprop="sku"` element, or the URL slug when that is empty. */
  partNumber: string;
  /** Product title: first `<h1>`, else `itemprop="name"`, else the part number. */
  name: string;
  /** Detail-page URL the product was parsed from. */
  url: string;
  /** `content` of the first `itemprop="priceCurrency"` element; "EUR" when absent. */
  currency: string;
  /** Form factor guessed by `detectFormFactor` (e.g. "SFP+", "QSFP28"). */
  formFactor: string;
  /** Speed label guessed by `detectFormFactor` (e.g. "10G"). */
  speed: string;
  /** Numeric counterpart of `speed`, in Gbit/s. */
  speedGbps: number;

  /** First plausible `itemprop="price"` value on the page, if any. */
  price?: number;
  /** Product image URL; omitted when missing or when it contains "keinBild". */
  imageUrl?: string;
  /** Reach label from `detectReach` (e.g. "10km"). */
  reachLabel?: string;
  /** Reach in metres from `detectReach`. */
  reachMeters?: number;
  /** Fiber/media class from `detectFiber`: "SMF", "MMF" or "Copper". */
  fiberType?: string;
  /** Digits found in front of "nm" by `detectWavelength`; empty string when none. */
  wavelength?: string;
}
/**
 * Returns a promise that resolves (with no value) once `ms` milliseconds
 * have elapsed.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Guesses form factor and line rate from free text (product name + part number).
 *
 * Checks run from the fastest rate down, so "QSFP28" is tested before "SFP28"
 * and "QSFP+" before "SFP+"; the first hit wins. Anything unrecognised is
 * reported as SFP+ / 10G, which is also what a bare AOC/DAC cable gets.
 *
 * @param text Text to inspect; matching is case-insensitive.
 * @returns The form factor, a speed label such as "10G" and the same speed in Gbit/s.
 */
function detectFormFactor(text: string): { formFactor: string; speed: string; speedGbps: number } {
  const lower = text.toLowerCase();
  const has = (needle: string): boolean => lower.includes(needle);
  // "<n>g" only counts as a line rate when it is not the start of "<n>ghz":
  // DWDM channel spacings such as "100GHz" or "50GHz" are not speeds.
  const hasRate = (gbps: number): boolean => new RegExp(`${gbps}g(?!hz)`).test(lower);
  // OSFP mentioned without any QSFP variant alongside it.
  const osfpOnly = has("osfp") && !has("qsfp");

  if (hasRate(800) || has("qsfp-dd800")) {
    return { formFactor: has("osfp") ? "OSFP" : "QSFP-DD", speed: "800G", speedGbps: 800 };
  }
  if (hasRate(400)) {
    // 400G ships as QSFP-DD and as OSFP; keep OSFP when the text names it.
    return { formFactor: osfpOnly ? "OSFP" : "QSFP-DD", speed: "400G", speedGbps: 400 };
  }
  if (osfpOnly) return { formFactor: "OSFP", speed: "400G", speedGbps: 400 };
  if (has("qsfp-dd") || has("qsfpdd")) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 };
  if (has("qsfp28") || hasRate(100)) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
  if (has("qsfp+") || hasRate(40)) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 };
  if (has("sfp56") || hasRate(50)) return { formFactor: "SFP56", speed: "50G", speedGbps: 50 };
  if (has("sfp28") || hasRate(25)) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 };
  // XFP modules are reported under the SFP+ / 10G bucket.
  if (hasRate(10) || has("sfp+") || has("xfp")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
  if (has("1000base") || hasRate(1) || /\bsfp\b/.test(lower)) return { formFactor: "SFP", speed: "1G", speedGbps: 1 };

  // Default bucket, also used for AOC/DAC cables that carry no other hint.
  return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
}
/**
 * Extracts a single reach/length from free text.
 *
 * Resolution order:
 *  1. A length range ("1 - 30 m", "1 bis 30 m") yields `undefined`: such
 *     variable-length products need variant-level extraction first.
 *  2. The first explicit positive distance ("10km", "1,5 km", "300 m").
 *  3. A standard optic suffix (LR/ER/ZR/SR/DR/FR, optionally with "4"),
 *     mapped to its nominal reach. Suffixes are matched case-sensitively.
 *
 * @param text Text to inspect, typically product name plus part number.
 * @returns Label (e.g. "10km") and reach in metres, or `undefined` if unknown.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  // Range separators: hyphen, en dash (U+2013), em dash (U+2014), "to", "bis".
  // An empty alternative here would make every multi-digit distance ("10km")
  // look like a range, so each alternative must consume a real separator.
  const rangePattern = /\b\d+(?:[.,]\d+)?\s*(?:[-\u2013\u2014]|to|bis)\s*\d+(?:[.,]\d+)?\s*(?:m|km)\b/i;
  if (rangePattern.test(text)) {
    return undefined;
  }

  for (const hit of text.matchAll(/\b(\d+(?:[.,]\d+)?)\s*(km|m)\b/gi)) {
    const amount = hit[1].replace(",", "."); // decimal comma -> decimal point
    const value = parseFloat(amount);
    if (!Number.isFinite(value) || value <= 0) continue;
    const unit = hit[2].toLowerCase();
    // NOTE(review): metres are rounded to whole numbers, so "0.3m" yields 0.
    const meters = Math.round(unit === "km" ? value * 1000 : value);
    return { label: `${amount}${unit}`, meters };
  }

  // No explicit distance: fall back to the nominal reach of the optic type.
  const suffixReach: [RegExp, string, number][] = [
    [/\bLR4\b/, "10km", 10000], [/\bLR\b/, "10km", 10000],
    [/\bER4?\b/, "40km", 40000], [/\bZR4?\b/, "80km", 80000],
    [/\bSR4?\b/, "300m", 300], [/\bDR4?\b/, "500m", 500], [/\bFR4?\b/, "2km", 2000],
  ];
  for (const [regex, label, meters] of suffixReach) {
    if (regex.test(text)) return { label, meters };
  }
  return undefined;
}
/**
 * Classifies the transmission medium from free text.
 *
 * Checked in order: single-mode hints (incl. LX/LR/ER/ZR, BiDi, CWDM/DWDM),
 * multi-mode hints (incl. SX/SR), copper hints, then AOC (treated as MMF).
 * Text without any hint is reported as "SMF".
 *
 * The two-letter optic codes must not touch another letter, but they may sit
 * at the very start or end of the text (e.g. a part number ending in "-SR").
 *
 * @param text Text to inspect; matching is case-insensitive.
 * @returns "SMF", "MMF" or "Copper".
 */
function detectFiber(text: string): string {
  // Lookarounds instead of `[^a-z]xx[^a-z]`: the consuming form needs a real
  // character on both sides and therefore misses codes at the string edges.
  if (/single.?mode|smf|(?<![a-z])(?:lx|lr|er|zr)(?![a-z])|bidi|cwdm|dwdm/i.test(text)) return "SMF";
  if (/multi.?mode|mmf|(?<![a-z])(?:sx|sr)(?![a-z])/i.test(text)) return "MMF";
  if (/copper|dac|twinax|rj45|base-t/i.test(text)) return "Copper";
  if (/aoc|active.?optical/i.test(text)) return "MMF";
  return "SMF";
}
/**
 * Pulls the first wavelength out of free text: three or four digits followed
 * by optional whitespace and "nm" (case-insensitive).
 *
 * @returns The digits as a string, or an empty string when nothing matches.
 */
function detectWavelength(text: string): string {
  const hit = /(\d{3,4})\s*nm/i.exec(text);
  return hit?.[1] ?? "";
}
/**
 * Downloads the gzipped sitemap and returns its XML as a UTF-8 string.
 *
 * The request is aborted after 30 seconds.
 *
 * @throws Error when the server answers with a non-2xx status.
 */
async function fetchSitemap(): Promise<string> {
  const response = await fetch(SITEMAP_URL, {
    signal: AbortSignal.timeout(30000),
    headers: HEADERS,
  });
  if (!response.ok) {
    throw new Error(`Sitemap fetch failed: HTTP ${response.status}`);
  }
  const compressed = Buffer.from(await response.arrayBuffer());
  return zlib.gunzipSync(compressed).toString("utf8");
}
/**
 * Collects the German transceiver/optics product URLs from sitemap XML.
 *
 * A `<loc>` entry is kept when it points at shop.fiber24.net, ends in "/de",
 * and the path segment right before "/de" contains one of the product
 * keywords (sfp, qsfp, transceiver, aoc, dac, wdm, ...).
 *
 * @param xml Decompressed sitemap document.
 * @returns Matching URLs in document order.
 */
function extractProductUrls(xml: string): string[] {
  const locPattern = /<loc>(https?:\/\/shop\.fiber24\.net\/[^<]+)<\/loc>/g;
  const keywordPattern = /sfp|qsfp|osfp|xfp|transceiver|optic|aoc|dac|cwdm|dwdm|bidi|wdm/i;

  const selected: string[] = [];
  for (const hit of xml.matchAll(locPattern)) {
    const candidate = hit[1];
    if (!candidate.endsWith("/de")) continue;
    // The slug is the second-to-last path segment (the last one is "de").
    const segments = candidate.split("/");
    const slug = segments[segments.length - 2].toLowerCase();
    if (keywordPattern.test(slug)) selected.push(candidate);
  }
  return selected;
}
/**
 * Converts a raw price string to a number.
 *
 * When the leading numeric token carries both "." and ",", the later one is
 * taken as the decimal mark and the other as a thousands separator
 * ("1.234,56" and "1,234.56" both give 1234.56). Every other input is parsed
 * by turning the first comma into a decimal point.
 */
function parsePriceText(raw: string): number {
  const token = /^\s*(\d[\d.,]*\d)/.exec(raw)?.[1] ?? "";
  const lastComma = token.lastIndexOf(",");
  const lastDot = token.lastIndexOf(".");
  if (lastComma !== -1 && lastDot !== -1) {
    const normalized = lastComma > lastDot
      ? token.replace(/\./g, "").replace(",", ".")
      : token.replace(/,/g, "");
    return parseFloat(normalized);
  }
  return parseFloat(raw.replace(",", "."));
}

/**
 * Parses a product detail page using its `itemprop` microdata.
 *
 * @param html Raw HTML of the detail page.
 * @param url  URL the page was fetched from; its second-to-last path segment
 *             is the fallback part number
 *             (e.g. https://shop.fiber24.net/F24-CI-SFP-10G-AOC/de → F24-CI-SFP-10G-AOC).
 * @returns The parsed product, or `null` when the part number is shorter than
 *          3 characters or the name shorter than 5.
 */
function parseProductPage(html: string, url: string): Product | null {
  const $ = cheerio.load(html);

  const slug = url.split("/").slice(-2, -1)[0];
  const partNumber = $("[itemprop='sku']").first().text().trim() || slug;
  if (!partNumber || partNumber.length < 3) return null;

  const name = $("h1").first().text().trim()
    || $("[itemprop='name']").first().text().trim()
    || partNumber;
  if (!name || name.length < 5) return null;

  // Price — take the first product price in DOM order. Later itemprop/minPrice
  // values can belong to related accessories and must not override the main SKU.
  const prices = $("[itemprop='price']")
    .map((_i, el) => parsePriceText($(el).attr("content") || $(el).text()))
    .get()
    .filter((p: number) => !isNaN(p) && p > 0 && p < 50000);
  const price = prices.length > 0 ? prices[0] : undefined;

  const currency = $("[itemprop='priceCurrency']").first().attr("content") || "EUR";

  // Image — prefer a large variant. The URL may sit in `src` (<img>) or in
  // `content` (<meta>/<link>), so both are read for the preferred element too.
  const images = $("[itemprop='image']");
  const largeImage = images.filter((_i, el) => {
    const src = $(el).attr("src") || $(el).attr("content") || "";
    return src.includes("/lg/") || src.includes("large");
  }).first();
  const firstImage = images.first();
  const imageUrl = largeImage.attr("src")
    || largeImage.attr("content")
    || firstImage.attr("src")
    || firstImage.attr("content")
    || undefined;

  // All attribute detection works on the title plus the part number.
  const fullText = `${name} ${partNumber}`;
  const ff = detectFormFactor(fullText);
  const reach = detectReach(fullText);

  return {
    partNumber,
    name,
    url,
    price, // already limited to 0 < price < 50000 by the filter above
    currency,
    // "keinBild" marks the shop's no-image placeholder.
    imageUrl: imageUrl && !imageUrl.includes("keinBild") ? imageUrl : undefined,
    ...ff,
    reachLabel: reach?.label,
    reachMeters: reach?.meters,
    fiberType: detectFiber(fullText),
    wavelength: detectWavelength(fullText),
  };
}
/**
 * Fetches `url` with the shared request headers and a 30-second timeout and
 * returns the response body as text.
 *
 * @throws Error when the server answers with a non-2xx status.
 */
async function fetchPage(url: string): Promise<string> {
  const response = await fetch(url, {
    signal: AbortSignal.timeout(30000),
    headers: HEADERS,
  });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
/**
 * Runs the complete ShopFiber24 scrape.
 *
 * Steps: register the vendor, read the product URLs from the sitemap, then
 * fetch every product page one at a time and store the transceiver, its
 * price observation (when a price was found) and its image URL (when present).
 *
 * A failing sitemap download ends the run; a failing product page is logged
 * and skipped. Consecutive page requests are always 1.5 s apart, including
 * after a skipped or failed page.
 */
export async function scrapeFiber24(): Promise<void> {
  // Thrown values are not guaranteed to be Error instances.
  const messageOf = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  console.log("=== ShopFiber24 Scraper Starting (sitemap-based) ===\n");
  const vendorId = await ensureVendor(
    "ShopFiber24",
    "compatible",
    "https://shop.fiber24.net",
    "https://shop.fiber24.net/FO-TRANSCEIVER/de",
  );

  // Step 1: Fetch sitemap and extract product URLs
  console.log(" Fetching sitemap...");
  let productUrls: string[] = [];
  try {
    const xml = await fetchSitemap();
    productUrls = extractProductUrls(xml);
    console.log(` Found ${productUrls.length} transceiver product URLs in sitemap`);
  } catch (err) {
    console.error(` Sitemap failed: ${messageOf(err)}`);
    return;
  }
  if (productUrls.length === 0) {
    console.log(" No product URLs found — aborting");
    return;
  }

  // Step 2: Scrape each product page
  let totalProducts = 0;
  let priceUpdates = 0;
  let imageUpdates = 0;
  for (let i = 0; i < productUrls.length; i++) {
    // Rate limit: pause before every request except the first. Sleeping at
    // the top of the loop keeps the pause in place when a page is skipped
    // via `continue` or fails with an exception.
    if (i > 0) await sleep(1500);

    const url = productUrls[i];
    try {
      const html = await fetchPage(url);
      const product = parseProductPage(html, url);
      if (!product) {
        console.log(` [${i + 1}/${productUrls.length}] Skip (no data): ${url}`);
        continue;
      }

      const txId = await findOrCreateScrapedTransceiver({
        partNumber: product.partNumber,
        vendorId,
        productUrl: product.url,
        formFactor: product.formFactor,
        speedGbps: product.speedGbps,
        speed: product.speed,
        reachMeters: product.reachMeters,
        reachLabel: product.reachLabel,
        fiberType: product.fiberType,
        wavelengths: product.wavelength,
        category: "Compatible",
      });

      if (product.price && product.price > 0) {
        const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
        const updated = await upsertPriceObservation({
          transceiverId: txId,
          sourceVendorId: vendorId,
          price: product.price,
          currency: product.currency,
          stockLevel: "on_request",
          url: product.url,
          contentHash: hash,
        });
        if (updated) priceUpdates++;
      }

      // Save image URL to transceivers table if present
      if (product.imageUrl) {
        const updatedImage = await markImageVerified(txId, product.imageUrl);
        if (updatedImage) imageUpdates++;
      }

      totalProducts++;
      if ((i + 1) % 10 === 0) {
        console.log(` Progress: ${i + 1}/${productUrls.length} — ${priceUpdates} prices, ${imageUpdates} images`);
      }
    } catch (err) {
      console.warn(` [${i + 1}] Error ${url}: ${messageOf(err).slice(0, 80)}`);
    }
  }

  console.log(`\n=== ShopFiber24 Complete: ${totalProducts} products, ${priceUpdates} prices, ${imageUpdates} images ===`);
}
// Direct invocation (not imported as a module): run the scraper, then close
// the database pool. Any failure is logged and the process exits with code 1.
if (require.main === module) {
  void (async () => {
    try {
      await scrapeFiber24();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  })();
}