2026-05-09 05:13:37 +02:00

981 lines
40 KiB
TypeScript

/**
* FS.com Scraper v2 — Warehouse Data, Prices, Product Catalog
*
* Phase 1: Category listing pages → collect product URLs (paginated)
* Phase 2: Product detail pages → extract warehouse breakdown, net price, specs
* Phase 3: Write to PostgreSQL (price_observations + stock_observations)
*
* Uses German locale (www.fs.com/de/) for EUR prices and German warehouse labels:
* DE-Lager → warehouse_de_qty + warehouse_de_delivery_date
* Global-Lager → warehouse_global_qty + warehouse_global_delivery_date
* Nachlieferung → backorder_qty + backorder_estimated_date
* verkauft → units_sold
*
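* Rate limits: ≤12 req/min on listing pages, ≤10 req/min on detail pages.
* NOTE: robots.txt is only fetched as a connectivity probe in scrapeFs(); this
* file does not parse or enforce its rules.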
*/
import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";
import { makeCrawleeConfig, crawleeStorageDir } from "../utils/crawlee-config";
import { rmSync } from "node:fs";
import type { Page } from "playwright";
/**
 * Register a Playwright init script on `page` that masks several JavaScript-level
 * automation tells (webdriver flag, empty plugin list, languages, hardware
 * concurrency, permissions query, missing `window.chrome`, Playwright globals).
 *
 * @param page - Playwright page the init script is attached to.
 */
async function applyStealthPatches(page: Page): Promise<void> {
  await page.addInitScript(() => {
    // Everything inside this callback runs in the browser page, so it must stay
    // self-contained and must not reference Node-side variables.
    const overrideNavigatorGetter = (prop: string, getter: () => unknown): void => {
      Object.defineProperty(navigator, prop, { get: getter });
    };

    // navigator.webdriver reads as undefined
    overrideNavigatorGetter("webdriver", () => undefined);

    // A single fake PDF plugin (an empty plugin list is a common bot signal)
    overrideNavigatorGetter("plugins", () => ({
      0: {
        name: "Chrome PDF Plugin",
        filename: "internal-pdf-viewer",
        description: "Portable Document Format",
        length: 1,
      },
      length: 1,
      namedItem: (n: string) => null,
      refresh: () => {},
      item: (i: number) => null,
      [Symbol.iterator]: function* () { yield (this as any)[0]; },
    }));

    // German-first language list
    overrideNavigatorGetter("languages", () => ["de-DE", "de", "en-US", "en"]);

    // Report eight logical cores
    overrideNavigatorGetter("hardwareConcurrency", () => 8);

    // Permissions API: answer "notifications" from Notification.permission,
    // delegate every other query to the native implementation.
    const nativeQuery = window.navigator.permissions?.query;
    if (nativeQuery) {
      (window.navigator.permissions as any).query = (params: any) => {
        if (params.name === "notifications") {
          return Promise.resolve({ state: Notification.permission } as PermissionStatus);
        }
        return nativeQuery.call(navigator.permissions, params);
      };
    }

    // Provide a window.chrome object (absent in headless builds)
    (window as any).chrome = { runtime: {}, loadTimes: () => {}, csi: () => {}, app: {} };

    // Drop Playwright-specific globals
    delete (window as any).__playwright;
    delete (window as any).__pwInitScripts;
  });
}
import {
ensureVendor,
upsertPriceObservation,
upsertStockObservation,
findOrCreateScrapedTransceiver,
pool,
} from "../utils/db";
import { contentHash } from "../utils/hash";
import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";
// ── Constants ──────────────────────────────────────────────────────────────────
// FS.com German locale uses www.fs.com/de/ (de.fs.com DNS does not exist)
const BASE_URL = "https://www.fs.com/de";
const MAX_PAGES_PER_CATEGORY = 10;

/**
 * Read a non-negative integer from an environment variable.
 *
 * Falls back to `fallback` when the variable is unset, empty, not numeric, or
 * negative. Without this guard a value such as `FS_MAX_DETAIL_PAGES_PER_RUN=`
 * would become NaN, which makes `slice(0, NaN)` return nothing and breaks the
 * freshness query that consumes STOCK_FRESH_HOURS.
 *
 * @param name - Environment variable name.
 * @param fallback - Value used when the variable is missing or invalid.
 * @returns The parsed integer, or `fallback`.
 */
function readNonNegativeIntEnv(name: string, fallback: number): number {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === "") return fallback;
  const parsed = parseInt(raw, 10);
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback;
}

/** Upper bound on detail pages scraped per run (env FS_MAX_DETAIL_PAGES_PER_RUN, default 300). */
const MAX_DETAIL_PAGES_PER_RUN = readNonNegativeIntEnv("FS_MAX_DETAIL_PAGES_PER_RUN", 300);
/** Stock observations newer than this many hours count as fresh (env FS_STOCK_FRESH_HOURS, default 12). */
const STOCK_FRESH_HOURS = readNonNegativeIntEnv("FS_STOCK_FRESH_HOURS", 12);
/** When TIP_FORCE_REVALIDATE=1 the freshness filter is bypassed. */
const FORCE_REVALIDATE = process.env["TIP_FORCE_REVALIDATE"] === "1";
/** When FS_ONLY_MISSING_IMAGES=1 only products whose DB row lacks a verified image are scraped. */
const ONLY_MISSING_IMAGES = process.env["FS_ONLY_MISSING_IMAGES"] === "1";
/** Comma-separated proxy URLs from env PROXY_URLS; blank entries are dropped. */
const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
  .split(",")
  .map((u) => u.trim())
  .filter(Boolean);
/**
 * Build the Crawlee proxy configuration from PROXY_URLS.
 *
 * @returns A ProxyConfiguration over the configured URLs, or `undefined` when
 *          no proxy URL is configured.
 */
function buildProxyConfiguration(): ProxyConfiguration | undefined {
  return PROXY_URLS.length > 0
    ? new ProxyConfiguration({ proxyUrls: PROXY_URLS })
    : undefined;
}
/**
 * Canonicalise an FS.com product URL so that URLs from the listing pages and
 * URLs stored in the database can be compared as set keys.
 *
 * - collapses a duplicated locale prefix (`/de/de/` → `/de/`)
 * - drops the query string and any `#fragment`
 * - drops a single trailing slash
 *
 * @param url - Absolute product URL.
 * @returns The canonical form of `url`.
 */
function normalizeFsProductUrl(url: string): string {
  return url
    .replace(/^https:\/\/www\.fs\.com\/de\/de\//, "https://www.fs.com/de/")
    // Cut at the first "?" or "#": a fragment-only suffix used to survive normalisation.
    .replace(/[?#].*$/, "")
    .replace(/\/$/, "");
}
/**
 * Category listing paths crawled in Phase 1. Each path is appended to BASE_URL;
 * pages after the first add `?page=N` (see collectProductUrls).
 */
const CATEGORY_URLS = [
"/c/1g-sfp-81",
"/c/10g-sfp-63",
"/c/25g-sfp28-3215",
"/c/40g-qsfp-1360",
"/c/100g-qsfp28-sfp-dd-1159",
"/c/200g-qsfp-dd-qsfp56-3542",
"/c/400g-osfp-qsfp112-qsfp-dd-3652",
"/c/800g-osfp-qsfp-dd-4089",
"/c/1.6t-osfp-5597",
"/c/400g-coherent-qsfp-dd-4103",
"/c/10g-cwdm-dwdm-sfp-65",
"/c/100g-dwdm-qsfp28-3863",
];
/**
 * Cookies (currency=EUR, lang=de, country=DE) that both crawlers add to the
 * browser context in their preNavigationHooks. The hooks override `domain`
 * with "www.fs.com" when adding them.
 * NOTE(review): presumably these pin the site to German locale and EUR prices;
 * how FS.com interprets them cannot be confirmed from this file.
 */
const DE_COOKIES = [
{ name: "currency", value: "EUR", domain: ".fs.com", path: "/" },
{ name: "lang", value: "de", domain: ".fs.com", path: "/" },
{ name: "country", value: "DE", domain: ".fs.com", path: "/" },
];
// ── German locale parsers ──────────────────────────────────────────────────────
/**
 * Three-letter lower-case month abbreviation → two-digit month number.
 * Contains both German (mai, okt, dez) and English (may, oct, dec) forms.
 * parseGermanDate transliterates ä/ö/ü to a/o/u before the lookup, so "mär"
 * is reached through the "mar" key.
 */
const GERMAN_MONTHS: Record<string, string> = {
jan: "01", feb: "02", mär: "03", mar: "03",
apr: "04", mai: "05", may: "05", jun: "06",
jul: "07", aug: "08", sep: "09", okt: "10",
oct: "10", nov: "11", dez: "12", dec: "12",
};
/**
 * Parse a German-formatted quantity string.
 *
 *   "4.895"  → 4895     (period = thousands separator in plain integers)
 *   "210.9K" → 210900   (with a K/M suffix the separator is a decimal point)
 *   "1,5K"   → 1500     (decimal comma also accepted with a suffix)
 *   "1.2M"   → 1200000
 *
 * @param text - Raw quantity text; surrounding and inner whitespace is ignored.
 * @returns The quantity as an integer, or `undefined` when `text` is empty or
 *          not numeric.
 */
function parseGermanQty(text: string): number | undefined {
  const t = text.trim().replace(/\s/g, "");
  if (!t) return undefined;

  const suffixed = t.match(/^([\d.,]+)([KkMm])$/);
  if (suffixed) {
    const digits = suffixed[1] ?? "";
    const multiplier = (suffixed[2] ?? "").toLowerCase() === "k" ? 1_000 : 1_000_000;
    // An abbreviated value carries a fractional mantissa. Previously every "."
    // was stripped as a thousands separator, turning "210.9K" into 2,109,000.
    // With a comma present, the comma is the decimal mark and periods are grouping.
    const mantissa = digits.includes(",")
      ? digits.replace(/\./g, "").replace(",", ".")
      : digits;
    const n = parseFloat(mantissa);
    return isNaN(n) ? undefined : Math.round(n * multiplier);
  }

  // Plain integer: both "." and "," are grouping characters.
  const n = parseInt(t.replace(/\./g, "").replace(/,/g, ""), 10);
  return isNaN(n) ? undefined : n;
}
/**
 * Convert a German date string to ISO "YYYY-MM-DD".
 *
 *   "20.04.2026"    → "2026-04-20"   (numeric form, tried first)
 *   "20 Apr., 2026" → "2026-04-20"   (day, month word, year)
 *
 * @param text - Text containing a date in one of the two forms above.
 * @returns The ISO date, or `undefined` when no date is found or the month
 *          word is not in GERMAN_MONTHS.
 */
function parseGermanDate(text: string): string | undefined {
  const pad2 = (part: string): string => part.padStart(2, "0");

  const dotted = /(\d{1,2})\.(\d{1,2})\.(\d{4})/.exec(text);
  if (dotted) {
    return `${dotted[3]}-${pad2(dotted[2])}-${pad2(dotted[1])}`;
  }

  const worded = /(\d{1,2})\.?\s+([A-Za-zÄÖÜäöüß]+)\.?,?\s*(\d{4})/.exec(text);
  if (!worded) return undefined;

  // Lower-case, transliterate umlauts, and keep the first three letters so
  // "März", "Mär." and "Mar" all resolve to the same table key.
  const monthKey = worded[2]
    .toLowerCase()
    .replace(/ä/g, "a").replace(/ö/g, "o").replace(/ü/g, "u")
    .slice(0, 3);
  const monthNumber = GERMAN_MONTHS[monthKey];
  return monthNumber ? `${worded[3]}-${monthNumber}-${pad2(worded[1])}` : undefined;
}
/**
 * Parse a German-formatted price to a positive float.
 *
 *   "42,50"               → 42.5
 *   "1.063,02"            → 1063.02
 *   "1.063 €"             → 1063     (period groups of three = thousands)
 *   "5,10 € ohne MwSt."   → 5.1      (punctuation from surrounding words is ignored)
 *
 * @param raw - Price text; anything except digits, "." and "," is discarded.
 * @returns The price, or `undefined` when nothing numeric remains or the value
 *          is not greater than zero.
 */
function parseGermanPrice(raw: string): number | undefined {
  const cleaned = raw
    .replace(/[^0-9.,]/g, "")
    // Periods/commas left over from words such as "inkl." or "MwSt." are not
    // part of the number; leading ones used to make parseFloat return NaN.
    .replace(/^[.,]+|[.,]+$/g, "");
  if (!cleaned) return undefined;

  let normalized: string;
  if (cleaned.includes(",")) {
    // German notation: comma is the decimal mark, periods are thousands groups.
    normalized = cleaned.replace(/\./g, "").replace(",", ".");
  } else if (/^\d{1,3}(?:\.\d{3})+$/.test(cleaned)) {
    // "1.063" without decimals is one thousand sixty-three, not 1.063.
    normalized = cleaned.replace(/\./g, "");
  } else {
    normalized = cleaned;
  }

  const n = parseFloat(normalized);
  return isNaN(n) || n <= 0 ? undefined : n;
}
// ── Stock level helper ─────────────────────────────────────────────────────────
/**
 * Classify availability from the per-warehouse quantities.
 *
 * @param deQty - Units in the DE warehouse (missing counts as 0).
 * @param globalQty - Units in the global warehouse (missing counts as 0).
 * @param backorderQty - Units on backorder (missing counts as 0).
 * @returns "in_stock" when DE + global exceeds 100, "low_stock" when it is
 *          between 1 and 100, "on_request" when nothing is on hand but some
 *          units are on backorder, otherwise "out_of_stock".
 */
function deriveStockLevel(
  deQty?: number,
  globalQty?: number,
  backorderQty?: number
): "in_stock" | "low_stock" | "out_of_stock" | "on_request" {
  const onHand = (deQty ?? 0) + (globalQty ?? 0);
  if (onHand > 0) {
    return onHand > 100 ? "in_stock" : "low_stock";
  }
  return (backorderQty ?? 0) > 0 ? "on_request" : "out_of_stock";
}
// ── Product classification ─────────────────────────────────────────────────────
/**
 * Infer the transceiver form factor from free text (case-insensitive).
 *
 * Rules are evaluated top to bottom and the first hit wins, so more specific
 * names (e.g. "qsfp-dd800") are listed before the names they contain.
 *
 * @param text - Product name or description.
 * @returns The form-factor label, or `undefined` when no rule matches.
 */
function detectFormFactor(text: string): string | undefined {
  const haystack = text.toLowerCase();
  const has = (needle: string): boolean => haystack.includes(needle);
  const mentionsQsfp = has("qsfp");

  const rules: ReadonlyArray<readonly [boolean, string]> = [
    [has("osfp") && !mentionsQsfp, "OSFP"],
    [has("qsfp-dd800") || has("qsfp-dd 800"), "QSFP-DD800"],
    [has("qsfp-dd"), "QSFP-DD"],
    [has("qsfp56"), "QSFP56"],
    [has("qsfp28"), "QSFP28"],
    [has("qsfp+") || has("qsfp plus"), "QSFP+"],
    [has("sfp56"), "SFP56"],
    [has("sfp28"), "SFP28"],
    [has("sfp+") || has("sfp plus"), "SFP+"],
    [has("sfp") && !mentionsQsfp, "SFP"],
    [has("cfp2"), "CFP2"],
    [has("xfp"), "XFP"],
  ];

  const firstHit = rules.find(([matched]) => matched);
  return firstHit ? firstHit[1] : undefined;
}
/**
 * Infer the line rate from free text.
 *
 * Candidates are tested from fastest to slowest so that, for example, "100G"
 * is not reported as "10G".
 *
 * @param text - Product name or description.
 * @returns The speed label and its value in Gbps, or `undefined` when no
 *          pattern matches.
 */
function detectSpeed(text: string): { speed: string; speedGbps: number } | undefined {
  const candidates: ReadonlyArray<{ re: RegExp; speed: string; speedGbps: number }> = [
    { re: /1\.6\s*t/i, speed: "1.6T", speedGbps: 1600 },
    { re: /800\s*g/i, speed: "800G", speedGbps: 800 },
    { re: /400\s*g/i, speed: "400G", speedGbps: 400 },
    { re: /200\s*g/i, speed: "200G", speedGbps: 200 },
    { re: /100\s*g/i, speed: "100G", speedGbps: 100 },
    { re: /50\s*g/i, speed: "50G", speedGbps: 50 },
    { re: /40\s*g/i, speed: "40G", speedGbps: 40 },
    { re: /25\s*g/i, speed: "25G", speedGbps: 25 },
    { re: /10\s*g/i, speed: "10G", speedGbps: 10 },
    { re: /\b1\s*g\b/i, speed: "1G", speedGbps: 1 },
  ];

  const hit = candidates.find(({ re }) => re.test(text));
  if (!hit) return undefined;
  return { speed: hit.speed, speedGbps: hit.speedGbps };
}
/**
 * Extract the first distance ("<digits>m" or "<digits>km") from free text.
 *
 * @param text - Product name or description.
 * @returns The distance with a lower-cased unit and no space (e.g. "10km"),
 *          or `undefined` when none is present.
 */
function detectReach(text: string): string | undefined {
  const hit = /(\d+)\s*(m|km)\b/i.exec(text);
  if (hit === null) return undefined;
  const amount = hit[1];
  const unit = hit[2].toLowerCase();
  return `${amount}${unit}`;
}
// ── Types ──────────────────────────────────────────────────────────────────────
/** Minimal product record collected from a category listing page (Phase 1). */
interface ProductSummary {
/** Absolute product page URL. */
url: string;
/** Display name taken from the link's image alt, title, or text (at most 200 characters). */
name: string;
/** In Phase 1 this is "FS-<id>", where <id> is the numeric id in the `/products/<id>.html` URL. */
partNumber: string;
}
/**
 * Everything extracted from one product detail page (Phase 2).
 * `partNumber` may be replaced by a part number found in the page text, and
 * `name` by the page's <h1> when present.
 */
interface ProductDetail extends ProductSummary {
/** Price parsed from the page (written to the DB as EUR, tax excluded); undefined when no plausible price was found. */
priceNet?: number;
/** Quantity reported for "DE-Lager". */
deQty?: number;
/** Delivery date found near the DE-Lager quantity, as ISO "YYYY-MM-DD". */
deDeliveryDate?: string;
/** Quantity reported for "Global-Lager" / "Global Warehouse". */
globalQty?: number;
/** Delivery date found near the Global-Lager quantity, as ISO "YYYY-MM-DD". */
globalDeliveryDate?: string;
/** Quantity reported as "Nachlieferung" (backorder). */
backorderQty?: number;
/** Expected date found near the backorder quantity, as ISO "YYYY-MM-DD". */
backorderDate?: string;
/** Count taken from the "verkauft" / "sold" text. */
unitsSold?: number;
/** De-duplicated compatible-brand labels, capped at 30 entries. */
compatibleBrands: string[];
/** Raw key → value pairs read from specification tables and <dt>/<dd> lists. */
specs: Record<string, string>;
/** Absolute URL of the best-scoring product image, if any. */
imageUrl?: string;
/** Absolute URL of the first datasheet/PDF link, if any. */
datasheetUrl?: string;
}
// ── Phase 1: Collect product URLs ──────────────────────────────────────────────
/**
 * Visit every category listing page (up to MAX_PAGES_PER_CATEGORY pages per
 * category) and return a Map of product URL → summary.
 *
 * All requests are queued up front in round-robin order (all page 1s, then all
 * page 2s, …). When a listing page yields no product links its category is
 * marked exhausted and the handler returns early for that category's later
 * pages. The exhausted check lives in the request handler, so it skips
 * extraction work for those pages; the pages themselves are still requested.
 *
 * @param proxyConfiguration - Optional Crawlee proxy configuration.
 * @returns Map keyed by absolute product URL; the first summary seen for a URL wins.
 * @throws Re-throws any crawler error other than the known post-run ENOENT race
 *         on the request queue directory.
 */
async function collectProductUrls(
  proxyConfiguration: ProxyConfiguration | undefined
): Promise<Map<string, ProductSummary>> {
  // Purge leftover request queue from previous runs (instance-isolated storage)
  const fsPhase1Dir = crawleeStorageDir("fs-phase1");
  try { rmSync(fsPhase1Dir, { recursive: true, force: true }); } catch { /* ignore */ }

  const products = new Map<string, ProductSummary>();
  const exhausted = new Set<string>();

  // Pre-queue: all page-1s, then all page-2s, …
  const listingRequests = Array.from({ length: MAX_PAGES_PER_CATEGORY }, (_, i) =>
    CATEGORY_URLS.map((cat) => ({
      url: i === 0 ? `${BASE_URL}${cat}` : `${BASE_URL}${cat}?page=${i + 1}`,
      userData: { catPath: cat, pageNum: i + 1 },
      uniqueKey: `listing-${cat}-p${i + 1}`,
    }))
  ).flat();

  const crawler = new PlaywrightCrawler({
    maxConcurrency: 1,
    maxRequestsPerMinute: 12,
    requestHandlerTimeoutSecs: 60,
    headless: true,
    useSessionPool: false,
    ...(proxyConfiguration ? { proxyConfiguration } : {}),
    launchContext: {
      launchOptions: {
        args: ["--disable-blink-features=AutomationControlled", "--lang=de-DE"],
      },
    },
    preNavigationHooks: [
      async ({ page }) => {
        await applyStealthPatches(page);
        await page.setExtraHTTPHeaders({ "Accept-Language": "de-DE,de;q=0.9" });
        await page.context().addCookies(DE_COOKIES.map(c => ({ ...c, domain: "www.fs.com" })));
      },
    ],
    async requestHandler({ page, request, log }) {
      const { catPath, pageNum } = request.userData as { catPath: string; pageNum: number };
      if (exhausted.has(catPath)) {
        log.debug(`[p1] Skip ${catPath} p${pageNum} — exhausted`);
        return;
      }

      // Wait for product links; on timeout give late-rendering pages a fixed grace period.
      try {
        await page.waitForSelector('a[href*="/products/"]', { timeout: 12000 });
      } catch {
        await page.waitForTimeout(6000);
      }

      const found = await page.evaluate(
        (): Array<{ url: string; name: string; partNumber: string }> => {
          const results: Array<{ url: string; name: string; partNumber: string }> = [];
          const seen = new Set<string>();
          for (const link of document.querySelectorAll<HTMLAnchorElement>('a[href*="/products/"]')) {
            const href = link.getAttribute("href") ?? "";
            if (!href.includes("/products/")) continue;
            // Prefer the browser-resolved absolute URL: plain concatenation
            // mangled protocol-relative ("//host/…") and path-relative hrefs.
            const absUrl =
              link.href ||
              (href.startsWith("http") ? href : `https://www.fs.com${href}`);
            if (seen.has(absUrl)) continue;
            seen.add(absUrl);
            const img = link.querySelector("img");
            const name = (
              img?.getAttribute("alt") ??
              link.getAttribute("title") ??
              link.textContent ??
              ""
            ).trim().replace(/\s+/g, " ").slice(0, 200);
            // Synthesise a provisional part number from the numeric product id in the URL.
            const pnMatch = href.match(/\/products\/(\d+)\.html/);
            const partNumber = pnMatch ? `FS-${pnMatch[1]}` : "";
            if (name.length >= 5 && partNumber) results.push({ url: absUrl, name, partNumber });
          }
          return results;
        }
      );

      log.info(`[Listing] ${catPath} p${pageNum}: ${found.length} products`);
      if (found.length === 0) {
        exhausted.add(catPath);
      } else {
        for (const p of found) {
          if (!products.has(p.url)) products.set(p.url, p);
        }
      }
    },
  }, makeCrawleeConfig("fs-phase1"));

  try {
    await crawler.run(listingRequests);
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    if (msg.includes("ENOENT") && msg.includes("request_queues")) {
      // Benign Crawlee post-run lock-file race: _isTaskReadyFunction reads a
      // request .json that was already cleaned up after the crawl finished.
      // Crawlee catches + re-throws it internally, which rejects crawler.run().
      // Safe to ignore — all pages were already processed.
      console.warn("[Phase 1] Crawlee post-run ENOENT (benign, ignoring)");
    } else {
      throw err;
    }
  }

  console.log(`[Phase 1] ${products.size} unique products across ${CATEGORY_URLS.length} categories`);
  return products;
}
// ── Phase 2: Scrape product detail pages ──────────────────────────────────────

/**
 * Labelled delivery-date patterns ("Lieferung: 20 Apr., 2026").
 * The month word uses an explicit letter class because JavaScript's `\w`
 * does not match umlauts, which made "Mär."/"März" dates unparseable.
 */
const FS_LABELLED_DATE_PATTERNS = {
  lieferung: /Lieferung[:\s]+([0-9]{1,2}\.?\s+[A-Za-zÄÖÜäöüß]+\.?,?\s*\d{4})/i,
  erwartet: /erwartet[:\s]+([0-9]{1,2}\.?\s+[A-Za-zÄÖÜäöüß]+\.?,?\s*\d{4})/i,
} as const;

/** Unlabelled numeric date ("20.04.2026"), used when no labelled date matches. */
const FS_NUMERIC_DATE_PATTERN = /([0-9]{1,2}\.[0-9]{1,2}\.[0-9]{4})/;

/** Text that marks a price as belonging to a shipping / free-delivery notice. */
const FS_SHIPPING_CONTEXT_PATTERN =
  /versand|shipping|lieferung.*\bab\b|\bab\b.*versand|gratis.*ab|kostenlos/i;

/**
 * Body-text price patterns qualified by a net-price marker. They carry the `g`
 * flag so every occurrence can be inspected with `matchAll`.
 * NOTE(review): the numeric group cannot capture thousands-grouped prices such
 * as "1.063,02" in full — confirm whether this fallback needs to handle them.
 */
const FS_QUALIFIED_PRICE_PATTERNS: readonly RegExp[] = [
  /([0-9]{1,5}[,.]?[0-9]{0,3})\s*€\s*\(?ohne\s+MwSt\.?\)?/gi,
  /€\s*([0-9]{1,5}[,.]?[0-9]{0,3})\s*\(?ohne\s+MwSt\.?\)?/gi,
  /([0-9]{1,5}[,.]?[0-9]{0,3})\s*€\s*netto/gi,
  /Netto[:\s]*€?\s*([0-9]{1,5}[,.]?[0-9]{0,3})/gi,
  /Preis[:\s]*([0-9]{1,5},[0-9]{2})\s*€/gi,
  /([0-9]{1,5},[0-9]{2})\s*€\s*(?:exkl\.|exklusive|excl\.)/gi,
];

/** Find the first delivery date in `ctx`, trying labels in the given order, then a bare numeric date. */
function findFsDeliveryDate(
  ctx: string,
  labelOrder: ReadonlyArray<keyof typeof FS_LABELLED_DATE_PATTERNS>
): string | undefined {
  for (const label of labelOrder) {
    const labelled = ctx.match(FS_LABELLED_DATE_PATTERNS[label]);
    if (labelled?.[1]) return parseGermanDate(labelled[1]);
  }
  const numeric = ctx.match(FS_NUMERIC_DATE_PATTERN);
  return numeric?.[1] ? parseGermanDate(numeric[1]) : undefined;
}

/** Extract a quantity (first matching pattern wins) and the delivery date within the 300 characters that follow it. */
function extractFsStockSection(
  text: string,
  qtyPatterns: readonly RegExp[],
  dateLabelOrder: ReadonlyArray<keyof typeof FS_LABELLED_DATE_PATTERNS>
): { qty: number | undefined; date: string | undefined } {
  let hit: RegExpMatchArray | null = null;
  for (const pattern of qtyPatterns) {
    hit = text.match(pattern);
    if (hit) break;
  }
  if (!hit || !hit[1]) return { qty: undefined, date: undefined };

  const start = hit.index ?? text.indexOf(hit[0]);
  const ctx = text.slice(start, start + 300);
  return { qty: parseGermanQty(hit[1]), date: findFsDeliveryDate(ctx, dateLabelOrder) };
}

/**
 * Determine the net EUR price of a product.
 *
 * Strategy:
 *   1. `priceRaw` extracted from the product's own DOM price element.
 *   2. Body-text patterns qualified by "ohne MwSt" / "netto" etc., skipping any
 *      occurrence that sits in shipping / free-delivery text. Every occurrence
 *      of each pattern is inspected: the site-wide "Gratis Versand ab 79 €
 *      (ohne MwSt.)" notice usually comes first, and stopping at the first
 *      occurrence meant the real price later in the text was never considered.
 *   3. Broad "<number> €" fallback, accepted only above €100 so the €79
 *      free-shipping threshold cannot be mistaken for a price.
 */
function extractFsNetPrice(priceRaw: string, bodyText: string): number | undefined {
  const isPlausible = (p: number | undefined, min: number): p is number =>
    p !== undefined && p > min && p < 500_000;

  // 1. DOM-extracted price string
  if (priceRaw) {
    const p = parseGermanPrice(priceRaw);
    if (isPlausible(p, 0.5)) return p;
  }

  // 2. Qualified body-text patterns with shipping-context exclusion
  for (const pattern of FS_QUALIFIED_PRICE_PATTERNS) {
    for (const m of bodyText.matchAll(pattern)) {
      const captured = m[1];
      if (!captured) continue;
      const at = m.index ?? 0;
      const ctx = bodyText.slice(Math.max(0, at - 200), at + 200);
      if (FS_SHIPPING_CONTEXT_PATTERN.test(ctx)) continue;
      const p = parseGermanPrice(captured);
      if (isPlausible(p, 0.5)) return p;
    }
  }

  // 3. Broad fallback — only accept > €100 (free-shipping threshold is €79)
  for (const pattern of [/([0-9]{1,5},[0-9]{2})\s*€/, /€\s*([0-9]{1,5},[0-9]{2})/]) {
    const m = bodyText.match(pattern);
    if (m?.[1]) {
      const p = parseGermanPrice(m[1]);
      if (isPlausible(p, 100)) return p;
    }
  }
  return undefined;
}

/**
 * Visit each product detail page and extract price, per-warehouse stock,
 * units sold, compatible brands, spec table, image and datasheet URLs.
 *
 * @param requests - Detail page URLs with the listing name and provisional part number as userData.
 * @param proxyConfiguration - Optional Crawlee proxy configuration.
 * @returns One ProductDetail per page that produced body text; pages with no text are skipped.
 * @throws Re-throws any crawler error other than the known post-run ENOENT race
 *         on the request queue directory.
 */
async function scrapeProductDetails(
  requests: Array<{ url: string; userData: { name: string; partNumber: string } }>,
  proxyConfiguration: ProxyConfiguration | undefined
): Promise<ProductDetail[]> {
  // Purge Phase 2 storage so it starts with a clean request queue
  const fsPhase2Dir = crawleeStorageDir("fs-phase2");
  try { rmSync(fsPhase2Dir, { recursive: true, force: true }); } catch { /* ignore */ }

  const details: ProductDetail[] = [];

  const crawler = new PlaywrightCrawler({
    maxConcurrency: 1,
    maxRequestsPerMinute: 10,
    requestHandlerTimeoutSecs: 90,
    headless: true,
    useSessionPool: false,
    ...(proxyConfiguration ? { proxyConfiguration } : {}),
    launchContext: {
      launchOptions: {
        args: ["--disable-blink-features=AutomationControlled", "--lang=de-DE"],
      },
    },
    preNavigationHooks: [
      async ({ page }) => {
        await applyStealthPatches(page);
        await page.setExtraHTTPHeaders({ "Accept-Language": "de-DE,de;q=0.9" });
        await page.context().addCookies(DE_COOKIES.map(c => ({ ...c, domain: "www.fs.com" })));
      },
    ],
    async requestHandler({ page, request, log }) {
      const { name: listingName, partNumber: listingPn } = request.userData as {
        name: string;
        partNumber: string;
      };
      const url = request.url;

      try {
        // Wait for the page structure AND ideally a price element to render
        await page.waitForSelector(
          'h1, .product-detail, [class*="product-info"], [class*="product-main"]',
          { timeout: 12000 }
        );
        // Give JS-rendered price elements a moment to appear after the DOM is ready
        await page.waitForSelector(
          '[class*="price-value"], [class*="product-price"], [class*="prod-price"], [class*="final-price"]',
          { timeout: 5000 }
        ).catch(() => { /* price element optional — proceed with bodyText fallback */ });
      } catch {
        await page.waitForTimeout(7000);
      }

      // Everything inside evaluate() runs in the browser page and must be self-contained.
      const raw = await page.evaluate(
        (): {
          bodyText: string;
          priceRaw: string;
          specs: Record<string, string>;
          brands: string[];
          imageUrl: string;
          datasheetUrl: string;
          h1: string;
        } => {
          const bodyText = (document.body?.innerText ?? "").replace(/\n{3,}/g, "\n\n");

          // ── DOM price extraction (avoids matching site-wide shipping threshold) ──
          // FS.com shows "Gratis Versand ab 79 € (ohne MwSt.)" on every page.
          // bodyText regex matches this and returns 79 for ALL products. We extract
          // the actual product price from its own DOM element, skipping bad parents.
          // ── Net price (ohne MwSt) preferred for B2B comparisons ──────────────
          // FS.com HTML structure (verified 2026-05-06):
          // <div class="no_tax">5,10 € ohne MwSt.</div> ← net price ✓
          // <div class="price">6,07 €</div> ← gross price
          // <div class="standard_price">6,07 €</div> ← gross
          // The .no_tax element contains the B2B net price — prefer it.
          let priceRaw = "";
          const PRICE_SELS = [
            // ── FS.com current structure (2026-05) — net price first ──
            ".no_tax", // "5,10 € ohne MwSt." — net/B2B price
            ".standard_price", // "6,07 €" — gross fallback
            ".price", // "6,07 €" — gross fallback (simple class)
            // ── Legacy / other patterns ───────────────────────────────
            "[class*='price-value']",
            "[class*='product-price']",
            "[class*='prod-price']",
            "[class*='final-price']",
            "[class*='regular-price']",
            "[class*='price-amount']",
            "[data-cy='price']",
            ".price-box",
          ];
          const SKIP_PARENT =
            "[class*='shipping'], [class*='banner'], [class*='delivery'], " +
            "[class*='free-ship'], [class*='cart'], [class*='checkout'], " +
            "[class*='notice'], [class*='promo'], footer, header, nav";
          outer: for (const sel of PRICE_SELS) {
            for (const el of Array.from(document.querySelectorAll<HTMLElement>(sel))) {
              if (el.closest(SKIP_PARENT)) continue;
              const txt = (el.textContent ?? "").replace(/\s+/g, " ").trim();
              // Must contain a digit, currency marker, and be short (<40 chars)
              if (/\d/.test(txt) && txt.length < 40 && /[€$]|EUR/i.test(txt)) {
                priceRaw = txt;
                break outer;
              }
            }
          }

          // Spec tables: first two cells of each row become key → value.
          const specs: Record<string, string> = {};
          const SEL = [
            ".product-param tr", ".product-specs tr", ".param-table tr",
            ".specifications tr", ".detail-param tr", ".prod-spec-list tr",
            '[class*="specification"] tr', '[class*="param"] tr',
            ".tab-content tr", ".product-info-table tr", ".tech-param tr",
          ].join(", ");
          document.querySelectorAll(SEL).forEach((row) => {
            const cells = row.querySelectorAll("td, th");
            if (cells.length >= 2) {
              const k = (cells[0]?.textContent ?? "").trim().replace(/\s+/g, " ");
              const v = (cells[1]?.textContent ?? "").trim().replace(/\s+/g, " ");
              if (k && v && k.length < 80 && !/^[-\s]+$/.test(k)) specs[k] = v;
            }
          });
          // Definition lists: each <dt> paired with the <dd> that directly follows it.
          document.querySelectorAll("dt").forEach((dt) => {
            const dd = dt.nextElementSibling;
            if (dd?.tagName === "DD") {
              const k = (dt.textContent ?? "").trim();
              const v = (dd.textContent ?? "").trim();
              if (k && v && k.length < 80) specs[k] = v;
            }
          });

          // Compatible brands: dedicated container first, "Kompatibel mit …:" text as fallback.
          const brands: string[] = [];
          const brandContainer = document.querySelector<Element>(
            '[class*="compatible"], [class*="brand-list"], [class*="compatibility"], ' +
            '[class*="apply-brand"], [id*="brand"], [id*="compatible"]'
          );
          if (brandContainer) {
            brandContainer.querySelectorAll("button, a, span, li").forEach((el) => {
              const t = (el.textContent ?? "").trim();
              if (t && t.length > 1 && t.length < 50 && !/^\d+$/.test(t)) brands.push(t);
            });
          }
          if (brands.length === 0) {
            const bodyTxt = document.body?.innerText ?? "";
            const m = bodyTxt.match(/[Kk]ompatibel\s+mit[^:]*:\s*([\s\S]{0,600})/);
            if (m) {
              m[1].split(/[,;\n]/).forEach((s) => {
                const b = s.trim();
                if (b.length > 1 && b.length < 50 && !/^\d/.test(b)) brands.push(b);
              });
            }
          }

          // Product image: score candidates by container class, then by pixel area.
          const imageCandidates = Array.from(document.querySelectorAll<HTMLImageElement>(
            ".big_img_box img, img.big_img, .big_img_m_active, .big_img_m, " +
            ".small_img_active img, .product-image img, .prod-img img, .product-gallery img, " +
            '[class*="main-image"] img, [class*="primary-image"] img, ' +
            ".slick-current img, .product__image img"
          )).map((img) => {
            const url =
              img.currentSrc ||
              img.getAttribute("data-src") ||
              img.getAttribute("data-original") ||
              img.getAttribute("data-lazy") ||
              img.getAttribute("src") ||
              "";
            const cls = `${img.className || ""} ${img.parentElement?.className || ""}`;
            const score =
              /big_img_box|big_img|big_img_m_active/.test(cls) ? 100 :
              /small_img_active/.test(cls) ? 50 :
              10;
            return { url, score, w: img.naturalWidth || 0, h: img.naturalHeight || 0 };
          }).filter((candidate) =>
            candidate.url &&
            /resource\.fs\.com/.test(candidate.url) &&
            !/default\.jpg|generalImg|logo|icon|svg/i.test(candidate.url) &&
            (candidate.w === 0 || candidate.w >= 120) &&
            (candidate.h === 0 || candidate.h >= 120)
          ).sort((a, b) => b.score - a.score || (b.w * b.h) - (a.w * a.h));
          const imageUrl = imageCandidates[0]?.url ?? "";

          const dsEl = document.querySelector<HTMLAnchorElement>(
            'a[href*="datasheet"], a[href*=".pdf"], a[download][href*=".pdf"]'
          );
          const datasheetUrl = dsEl?.href ?? dsEl?.getAttribute("href") ?? "";
          const h1 = document.querySelector("h1")?.textContent?.trim() ?? "";
          return { bodyText, priceRaw, specs, brands, imageUrl, datasheetUrl, h1 };
        }
      );

      if (!raw.bodyText) { log.warning(`No text: ${url}`); return; }
      const t = raw.bodyText;

      // ── Net price (ohne MwSt, EUR) ─────────────────────────────────────────
      const priceNet = extractFsNetPrice(raw.priceRaw, t);

      // ── DE-Lager ───────────────────────────────────────────────────────────
      const { qty: deQty, date: deDeliveryDate } = extractFsStockSection(
        t,
        [
          /(\d[\d.,KkMm]*)\s*Stk\.\s*im\s*DE[- ]Lager/i,
          /(\d[\d.,KkMm]*)\s*im\s*DE[- ]?Lager/i,
        ],
        ["lieferung", "erwartet"]
      );

      // ── Global-Lager ───────────────────────────────────────────────────────
      const { qty: globalQty, date: globalDeliveryDate } = extractFsStockSection(
        t,
        [
          /(\d[\d.,KkMm]*)\s*Stk\.\s*im\s*Global[- ]Lager/i,
          /(\d[\d.,KkMm]*)\s*im\s*Global[- ]?(?:Lager|Warehouse)/i,
          /(\d[\d.,KkMm]*)\s*in\s+Global\s+Warehouse/i,
        ],
        ["lieferung", "erwartet"]
      );

      // ── Nachlieferung ──────────────────────────────────────────────────────
      // Backorder text prefers the "erwartet" date over "Lieferung".
      const { qty: backorderQty, date: backorderDate } = extractFsStockSection(
        t,
        [
          /(\d[\d.,KkMm]*)\s*(?:Stk\.)?\s*in\s+Nachlieferung/i,
          /Nachlieferung[:\s]*(\d[\d.,KkMm]*)/i,
        ],
        ["erwartet", "lieferung"]
      );

      // ── Units sold ─────────────────────────────────────────────────────────
      let unitsSold: number | undefined;
      const soldM =
        t.match(/(\d[\d.,KkMm]*)\s*(?:[Mm]al\s+)?[Vv]erkauft/) ??
        t.match(/([\d.,KkMm]+)\+?\s*sold/i);
      if (soldM?.[1]) unitsSold = parseGermanQty(soldM[1]);

      // ── Part number refinement ─────────────────────────────────────────────
      // A part number printed on the page replaces the provisional "FS-<id>".
      // NOTE(review): scrapeFs() checks freshness using the listing part
      // numbers — confirm that refined part numbers still match in the DB.
      let partNumber = listingPn;
      const pnM = t.match(
        /(?:Part\s+Number|Teilenummer|Artikelnummer|P\/N)[:\s]+([A-Z0-9][-A-Z0-9./]{3,40})/i
      );
      if (pnM?.[1]) partNumber = pnM[1].trim();

      // Make protocol-relative and root-relative URLs absolute; reject anything else.
      const resolveUrl = (u: string): string | undefined => {
        if (!u) return undefined;
        if (u.startsWith("//")) return `https:${u}`;
        if (u.startsWith("/")) return `${BASE_URL}${u}`;
        if (u.startsWith("http")) return u;
        return undefined;
      };

      const compatibleBrands = [...new Set(raw.brands)].filter((b) => b.length > 1).slice(0, 30);

      log.info(
        `${partNumber}: €${priceNet?.toFixed(2) ?? "?"} | ` +
        `DE=${deQty ?? "-"} GL=${globalQty ?? "-"} BO=${backorderQty ?? "-"} ` +
        `sold=${unitsSold ?? "-"} brands=${compatibleBrands.length}`
      );

      details.push({
        url,
        name: raw.h1 || listingName,
        partNumber,
        priceNet,
        deQty,
        deDeliveryDate,
        globalQty,
        globalDeliveryDate,
        backorderQty,
        backorderDate,
        unitsSold,
        compatibleBrands,
        specs: raw.specs,
        imageUrl: resolveUrl(raw.imageUrl),
        datasheetUrl: resolveUrl(raw.datasheetUrl),
      });
    },
  }, makeCrawleeConfig("fs-phase2"));

  try {
    await crawler.run(requests);
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    if (msg.includes("ENOENT") && msg.includes("request_queues")) {
      console.warn("[Phase 2] Crawlee post-run ENOENT (benign, ignoring)");
    } else {
      throw err;
    }
  }
  return details;
}
// ── Main export ────────────────────────────────────────────────────────────────

/**
 * Flatten an error and its `cause` chain into one searchable string.
 *
 * Node's fetch rejects with `TypeError: fetch failed` and puts the network
 * code (ENOTFOUND, ECONNREFUSED, …) on `err.cause`; an AbortSignal timeout
 * rejects with an error whose *name* is "TimeoutError". Looking at
 * `err.message` alone therefore never sees those markers.
 */
function describeProbeError(err: unknown): string {
  const parts: string[] = [];
  let current: unknown = err;
  // Bounded walk so a cyclic cause chain cannot loop forever.
  for (let depth = 0; depth < 4 && current != null; depth++) {
    if (typeof current !== "object") {
      parts.push(String(current));
      break;
    }
    const e = current as { name?: unknown; code?: unknown; message?: unknown; cause?: unknown };
    parts.push(
      [e.name, e.code, e.message]
        .filter((v): v is string => typeof v === "string" && v.length > 0)
        .join(" ")
    );
    current = e.cause;
  }
  return parts.filter(Boolean).join(" | ");
}

/**
 * Run the full FS.com scrape: connectivity probe, Phase 1 (listing pages),
 * freshness / missing-image filtering, Phase 2 (detail pages) and Phase 3
 * (database writes).
 *
 * Returns early, without throwing, when the probe indicates the host is
 * unreachable or blocked, when no products are discovered, or when nothing is
 * left to scrape after filtering. Per-product database errors are logged and
 * counted; they do not abort the run.
 */
export async function scrapeFs(): Promise<void> {
  console.log("=== FS.com Scraper v2 Starting ===\n");

  // ── Quick connectivity check — exit early on datacenter IPs that block FS.com ─
  try {
    const probe = await fetch("https://www.fs.com/robots.txt", {
      signal: AbortSignal.timeout(8000),
      headers: { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" },
    });
    if (!probe.ok && probe.status !== 403) throw new Error(`HTTP ${probe.status}`);
    if (probe.status === 403) {
      const body = (await probe.text()).slice(0, 200);
      if (body.includes("Request blocked") || body.includes("ERROR")) {
        console.warn("[FS.com] Server IP is blocked by FS.com — skipping. Run via run-fs-scraper-mac.sh on a residential IP.");
        return;
      }
    }
  } catch (err: unknown) {
    const msg = describeProbeError(err);
    if (msg.includes("ENOTFOUND") || msg.includes("ECONNREFUSED") || msg.includes("ERR_EMPTY") || msg.includes("TimeoutError")) {
      console.warn(`[FS.com] Connectivity check failed (${msg.slice(0, 120)}) — skipping. This scraper requires a residential IP.`);
      return;
    }
    // For other errors (e.g. an unexpected HTTP status), proceed anyway
  }

  const proxyConfiguration = buildProxyConfiguration();
  const vendorId = await ensureVendor(
    "FS.COM",
    "compatible",
    "https://www.fs.com",
    "https://www.fs.com/de/c/optical-transceivers-9"
  );
  console.log(`Vendor ID: ${vendorId}`);

  // ── Phase 1: Discover product URLs ─────────────────────────────────────────
  console.log("\n[Phase 1] Collecting product URLs from category listing pages…");
  const productMap = await collectProductUrls(proxyConfiguration);
  if (productMap.size === 0) {
    console.warn("[Phase 1] No products discovered — check selectors or proxy.");
    return;
  }

  // ── Filter: skip products with fresh stock data ─────────────────────────────
  const allPartNumbers = [...productMap.values()].map((p) => p.partNumber).filter(Boolean);
  const freshlyScraped = new Set<string>();
  if (!FORCE_REVALIDATE && allPartNumbers.length > 0) {
    // The freshness window is passed as a bound parameter instead of being
    // interpolated into the SQL text.
    const freshResult = await pool.query(
      `SELECT DISTINCT t.part_number
       FROM transceivers t
       JOIN stock_observations so ON so.transceiver_id = t.id
       WHERE so.source_vendor_id = $1
       AND so.time > NOW() - ($3::int * INTERVAL '1 hour')
       AND t.part_number = ANY($2)`,
      [vendorId, allPartNumbers, STOCK_FRESH_HOURS]
    );
    for (const row of freshResult.rows) freshlyScraped.add(row.part_number as string);
  }

  // ── Optional filter: only products whose DB row has no verified image ──────
  let missingImageUrls = new Set<string>();
  if (ONLY_MISSING_IMAGES) {
    const missingResult = await pool.query(
      `SELECT DISTINCT product_page_url
       FROM transceivers t
       JOIN vendors v ON v.id = t.vendor_id
       WHERE v.name = 'FS.COM'
       AND COALESCE(t.image_verified, false) = false
       AND product_page_url LIKE '%/products/%'`
    );
    missingImageUrls = new Set(
      missingResult.rows
        .map((row) => normalizeFsProductUrl(row.product_page_url as string))
        .filter(Boolean)
    );
  }

  const urlsToScrape = [...productMap.keys()]
    .filter((url) => !freshlyScraped.has(productMap.get(url)?.partNumber ?? ""))
    .filter((url) => !ONLY_MISSING_IMAGES || missingImageUrls.has(normalizeFsProductUrl(url)))
    .slice(0, MAX_DETAIL_PAGES_PER_RUN);

  console.log(`\n[Phase 2] Scraping ${urlsToScrape.length} detail pages`);
  console.log(
    ONLY_MISSING_IMAGES
      ? ` (${missingImageUrls.size} DB product URLs missing images; ${productMap.size - urlsToScrape.length} skipped by targeted image filter)`
      :
      FORCE_REVALIDATE
        ? ` (${productMap.size - urlsToScrape.length} skipped — max detail cap ${MAX_DETAIL_PAGES_PER_RUN})`
        : ` (${productMap.size - urlsToScrape.length} skipped — data ≤${STOCK_FRESH_HOURS}h fresh)`
  );
  if (urlsToScrape.length === 0) {
    console.log("[Phase 2] All products have fresh stock data — nothing to scrape.");
    return;
  }

  // ── Phase 2: Scrape detail pages ────────────────────────────────────────────
  const detailRequests = urlsToScrape.map((url) => {
    const s = productMap.get(url);
    return { url, userData: { name: s?.name ?? "FS.com Product", partNumber: s?.partNumber ?? "" } };
  });
  const details = await scrapeProductDetails(detailRequests, proxyConfiguration);
  console.log(`[Phase 2] Complete — ${details.length} pages scraped.`);

  // ── Phase 3: Write to database ─────────────────────────────────────────────
  console.log("\n[Phase 3] Writing to database…");
  let priceWritten = 0;
  let stockWritten = 0;
  let specsUpdated = 0;
  let errors = 0;

  // Products are written one at a time; a failure is logged and counted so the
  // remaining products are still processed.
  for (const detail of details) {
    try {
      const ff = detectFormFactor(detail.name);
      const speedInfo = detectSpeed(detail.name);
      const reach = detectReach(detail.name);
      const parsed = parseSpecTable(detail.specs);

      const transceiverId = await findOrCreateScrapedTransceiver({
        partNumber: detail.partNumber,
        vendorId,
        formFactor: ff,
        speedGbps: speedInfo?.speedGbps,
        speed: speedInfo?.speed,
        reachLabel: reach ?? parsed.reachLabel,
        reachMeters: parsed.reachMeters,
        fiberType: parsed.fiberType,
        wavelengths: parsed.wavelengths,
        imageUrl: detail.imageUrl,
        category: "DataCenter",
      });

      const stockLevel = deriveStockLevel(detail.deQty, detail.globalQty, detail.backorderQty);
      const totalQty = (detail.deQty ?? 0) + (detail.globalQty ?? 0);

      // A price observation is only written when a price was found.
      if (detail.priceNet && detail.priceNet > 0) {
        const hash = contentHash({
          p: detail.priceNet,
          de: detail.deQty ?? 0,
          gl: detail.globalQty ?? 0,
        });
        const isNew = await upsertPriceObservation({
          transceiverId,
          sourceVendorId: vendorId,
          price: detail.priceNet,
          currency: "EUR",
          stockLevel,
          quantityAvailable: totalQty > 0 ? totalQty : undefined,
          url: detail.url,
          contentHash: hash,
        });
        if (isNew) priceWritten++;
      }

      const stockNew = await upsertStockObservation({
        transceiverId,
        sourceVendorId: vendorId,
        stockLevel,
        quantityAvailable: totalQty > 0 ? totalQty : undefined,
        warehouseDeQty: detail.deQty,
        warehouseDeDeliveryDate: detail.deDeliveryDate ?? null,
        warehouseGlobalQty: detail.globalQty,
        warehouseGlobalDeliveryDate: detail.globalDeliveryDate ?? null,
        backorderQty: detail.backorderQty,
        backorderEstimatedDate: detail.backorderDate ?? null,
        unitsSold: detail.unitsSold,
        compatibleBrands: detail.compatibleBrands,
        priceNet: detail.priceNet,
        productUrl: detail.url,
        // FS.com: per-warehouse breakdown (DE-Lager + Global-Lager), EUR net prices
        stockConfidence: 3,
        priceCurrency: "EUR",
        priceIncludesTax: false,
      });
      if (stockNew) stockWritten++;

      if (Object.keys(detail.specs).length > 0) {
        const updated = await updateVerifiedSpecs({
          transceiverId,
          fiberType: parsed.fiberType,
          connector: parsed.connector,
          wavelengths: parsed.wavelengths,
          reachMeters: parsed.reachMeters,
          reachLabel: parsed.reachLabel,
          powerConsumptionW: parsed.powerConsumptionW,
          tempRange: parsed.tempRange,
          modulation: parsed.modulation,
          domSupport: parsed.domSupport,
          imageUrl: detail.imageUrl,
          datasheetUrl: detail.datasheetUrl,
          source: "fs.com",
        });
        if (updated) specsUpdated++;
      }
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      console.error(`${detail.partNumber}: ${reason}`);
      errors++;
    }
  }

  console.log("\n=== FS.com Scraper v2 Complete ===");
  console.log(` Products discovered: ${productMap.size}`);
  console.log(` Detail pages scraped: ${details.length}`);
  console.log(` Price observations: ${priceWritten} new`);
  console.log(` Stock observations: ${stockWritten} new`);
  console.log(` Specs verified: ${specsUpdated}`);
  if (errors > 0) console.warn(` DB errors: ${errors}`);
}
if (require.main === module) {
  // Per the original author's analysis, Crawlee's file-system storage can raise
  // an unhandled ENOENT rejection after crawler.run() has resolved: a final
  // internal readiness check tries to read a request .json file that was
  // already removed. Crawling is finished at that point, so the rejection is
  // harmless, but left unhandled it would terminate the process before
  // Phase 2. It is recognised by its message and ignored; every other
  // unhandled rejection is treated as fatal.
  const isBenignCrawleeRace = (reason: unknown): boolean => {
    const text = reason instanceof Error ? reason.message : String(reason);
    return text.includes("ENOENT") && text.includes("request_queues");
  };

  process.on("unhandledRejection", (reason) => {
    if (isBenignCrawleeRace(reason)) return;
    console.error("Unhandled rejection:", reason);
    process.exit(1);
  });

  // Run the scraper, then close the DB pool; on any failure log it, close the
  // pool (not awaited), and exit non-zero.
  void (async () => {
    try {
      await scrapeFs();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  })();
}