981 lines
40 KiB
TypeScript
981 lines
40 KiB
TypeScript
/**
 * FS.com Scraper v2 — Warehouse Data, Prices, Product Catalog
 *
 * Phase 1: Category listing pages → collect product URLs (paginated)
 * Phase 2: Product detail pages → extract warehouse breakdown, net price, specs
 * Phase 3: Write to PostgreSQL (price_observations + stock_observations)
 *
 * Uses German locale (www.fs.com/de/) for EUR prices and German warehouse labels:
 *   DE-Lager      → warehouse_de_qty + warehouse_de_delivery_date
 *   Global-Lager  → warehouse_global_qty + warehouse_global_delivery_date
 *   Nachlieferung → backorder_qty + backorder_estimated_date
 *   verkauft      → units_sold
 *
 * Respects robots.txt and rate limits (≤12 req/min listing, ≤10 req/min detail).
 */
|
|
import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";
|
|
import { makeCrawleeConfig, crawleeStorageDir } from "../utils/crawlee-config";
|
|
import { rmSync } from "node:fs";
|
|
import type { Page } from "playwright";
|
|
|
|
/**
 * Registers an init script on `page` that masks common browser-automation
 * fingerprints (webdriver flag, plugin list, languages, CPU count, permissions
 * query, `window.chrome`, Playwright globals).
 *
 * The callback passed to `addInitScript` runs inside the browser, so it must be
 * self-contained: every helper it needs is declared inside the callback itself.
 *
 * @param page Playwright page to patch.
 */
async function applyStealthPatches(page: Page): Promise<void> {
  await page.addInitScript(() => {
    // In-page helper: replace a navigator property with a getter.
    const overrideNavigator = (prop: string, getter: () => unknown): void => {
      Object.defineProperty(navigator, prop, { get: getter });
    };

    // Remove webdriver flag
    overrideNavigator("webdriver", () => undefined);

    // Mimic Chrome plugins (empty list = detected as bot)
    overrideNavigator("plugins", () => ({
      0: { name: "Chrome PDF Plugin", filename: "internal-pdf-viewer", description: "Portable Document Format", length: 1 },
      length: 1,
      namedItem: (n: string) => null,
      refresh: () => {},
      item: (i: number) => null,
      [Symbol.iterator]: function* () { yield (this as any)[0]; }
    }));

    // Real Chrome languages
    overrideNavigator("languages", () => ["de-DE", "de", "en-US", "en"]);

    // Non-zero hardware concurrency
    overrideNavigator("hardwareConcurrency", () => 8);

    // Permissions API: answer "notifications" from Notification.permission,
    // delegate everything else to the native implementation.
    const nativeQuery = window.navigator.permissions?.query;
    if (nativeQuery) {
      (window.navigator.permissions as any).query = (params: any) => {
        if (params.name === "notifications") {
          return Promise.resolve({ state: Notification.permission } as PermissionStatus);
        }
        return nativeQuery.call(navigator.permissions, params);
      };
    }

    // Chrome object (headless detection)
    (window as any).chrome = { runtime: {}, loadTimes: () => {}, csi: () => {}, app: {} };

    // Hide automation-specific properties
    delete (window as any).__playwright;
    delete (window as any).__pwInitScripts;
  });
}
|
|
import {
|
|
ensureVendor,
|
|
upsertPriceObservation,
|
|
upsertStockObservation,
|
|
findOrCreateScrapedTransceiver,
|
|
pool,
|
|
} from "../utils/db";
|
|
import { contentHash } from "../utils/hash";
|
|
import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";
|
|
|
|
// ── Constants ──────────────────────────────────────────────────────────────────
|
|
|
|
// FS.com German locale uses www.fs.com/de/ (de.fs.com DNS does not exist)
const BASE_URL = "https://www.fs.com/de";

/** Upper bound on listing pages requested per category in Phase 1. */
const MAX_PAGES_PER_CATEGORY = 10;

/**
 * Reads a non-negative integer from the environment.
 *
 * Returns `fallback` when the variable is unset, not numeric, or negative, so a
 * typo in the environment can never turn a limit into `NaN`.
 *
 * @param name     Environment variable name.
 * @param fallback Value used when the variable is missing or invalid.
 */
function readNonNegativeIntEnv(name: string, fallback: number): number {
  const parsed = Number.parseInt(process.env[name] ?? "", 10);
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback;
}

/** Maximum number of product detail pages scraped per run (env: FS_MAX_DETAIL_PAGES_PER_RUN). */
const MAX_DETAIL_PAGES_PER_RUN = readNonNegativeIntEnv("FS_MAX_DETAIL_PAGES_PER_RUN", 300);

/** Stock observations younger than this many hours count as fresh (env: FS_STOCK_FRESH_HOURS). */
const STOCK_FRESH_HOURS = readNonNegativeIntEnv("FS_STOCK_FRESH_HOURS", 12);

/** When TIP_FORCE_REVALIDATE=1 the freshness filter is bypassed. */
const FORCE_REVALIDATE = process.env["TIP_FORCE_REVALIDATE"] === "1";

/** When FS_ONLY_MISSING_IMAGES=1 only products without a verified image are scraped. */
const ONLY_MISSING_IMAGES = process.env["FS_ONLY_MISSING_IMAGES"] === "1";

/** Proxy URLs from the comma-separated PROXY_URLS variable; blank entries are dropped. */
const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
  .split(",")
  .map((u) => u.trim())
  .filter(Boolean);
|
|
|
|
/**
 * Creates the Crawlee proxy configuration from `PROXY_URLS`.
 *
 * @returns A `ProxyConfiguration` over all configured URLs, or `undefined`
 *          when no proxy URL is configured.
 */
function buildProxyConfiguration(): ProxyConfiguration | undefined {
  const hasProxies = PROXY_URLS.length !== 0;
  return hasProxies ? new ProxyConfiguration({ proxyUrls: PROXY_URLS }) : undefined;
}
|
|
|
|
/**
 * Canonicalises an FS.com product URL so URLs from different sources can be
 * compared as strings.
 *
 * - collapses a duplicated locale prefix (`/de/de/` → `/de/`)
 * - drops the query string and/or `#fragment`
 * - drops a single trailing slash
 *
 * @param url Product page URL.
 * @returns The normalised URL.
 */
function normalizeFsProductUrl(url: string): string {
  return url
    .replace(/^https:\/\/www\.fs\.com\/de\/de\//, "https://www.fs.com/de/")
    // Strip from the first "?" or "#": a bare fragment must not defeat matching.
    .replace(/[?#].*$/, "")
    .replace(/\/$/, "");
}
|
|
|
|
/**
 * Category listing paths (relative to `BASE_URL`) crawled in Phase 1.
 * Each slug below is expanded to `/c/<slug>`.
 */
const CATEGORY_URLS = [
  "1g-sfp-81",
  "10g-sfp-63",
  "25g-sfp28-3215",
  "40g-qsfp-1360",
  "100g-qsfp28-sfp-dd-1159",
  "200g-qsfp-dd-qsfp56-3542",
  "400g-osfp-qsfp112-qsfp-dd-3652",
  "800g-osfp-qsfp-dd-4089",
  "1.6t-osfp-5597",
  "400g-coherent-qsfp-dd-4103",
  "10g-cwdm-dwdm-sfp-65",
  "100g-dwdm-qsfp28-3863",
].map((slug) => `/c/${slug}`);
|
|
|
|
/**
 * Locale cookies (currency, language, country) sent with every request.
 * All of them share the same domain and path.
 */
const DE_COOKIES = Object.entries({
  currency: "EUR",
  lang: "de",
  country: "DE",
}).map(([name, value]) => ({ name, value, domain: ".fs.com", path: "/" }));
|
|
|
|
// ── German locale parsers ──────────────────────────────────────────────────────
|
|
|
|
/**
 * Maps a three-letter, lower-case month prefix (German and English spellings)
 * to its two-digit month number. Row N of the alias table is month N + 1.
 */
const GERMAN_MONTHS: Record<string, string> = Object.fromEntries(
  [
    ["jan"],
    ["feb"],
    ["mär", "mar"],
    ["apr"],
    ["mai", "may"],
    ["jun"],
    ["jul"],
    ["aug"],
    ["sep"],
    ["okt", "oct"],
    ["nov"],
    ["dez", "dec"],
  ].flatMap((aliases, monthIndex) =>
    aliases.map((alias): [string, string] => [alias, String(monthIndex + 1).padStart(2, "0")])
  )
);
|
|
|
|
/**
 * Parse a German-formatted quantity string.
 *
 *   "4.895"  → 4895     (plain number: "." and "," are grouping separators)
 *   "210.9K" → 210900   (K/M suffix: the LAST separator is the decimal point)
 *   "1,5k"   → 1500
 *   "1.2M"   → 1200000
 *
 * @param text Raw quantity text; whitespace anywhere is ignored.
 * @returns The quantity as an integer, or `undefined` when no number can be read.
 */
function parseGermanQty(text: string): number | undefined {
  const compact = text.replace(/\s/g, "");
  if (!compact) return undefined;

  const scaled = compact.match(/^([\d.,]+)([KkMm])$/);
  if (scaled) {
    const [, digits = "", suffix = ""] = scaled;
    if (!/\d/.test(digits)) return undefined;

    // With a magnitude suffix the number is abbreviated ("210.9K"), so the
    // last separator is a decimal mark; earlier ones can only be grouping.
    const lastSep = Math.max(digits.lastIndexOf("."), digits.lastIndexOf(","));
    const wholePart = (lastSep === -1 ? digits : digits.slice(0, lastSep)).replace(/[.,]/g, "");
    const fractionPart = lastSep === -1 ? "" : digits.slice(lastSep + 1);
    const value = parseFloat(`${wholePart || "0"}.${fractionPart || "0"}`);
    const factor = suffix.toLowerCase() === "k" ? 1_000 : 1_000_000;
    return Number.isNaN(value) ? undefined : Math.round(value * factor);
  }

  // No suffix: every "." or "," is a thousands separator.
  const n = parseInt(compact.replace(/[.,]/g, ""), 10);
  return Number.isNaN(n) ? undefined : n;
}
|
|
|
|
/**
 * Convert a German date found anywhere in `text` to ISO "YYYY-MM-DD".
 *
 *   "20.04.2026"    → "2026-04-20"   (numeric form is tried first)
 *   "20 Apr., 2026" → "2026-04-20"   (day + month word + year)
 *
 * Month words are lower-cased, umlauts are folded to ASCII, and the first
 * three letters are looked up in `GERMAN_MONTHS`.
 *
 * @returns The ISO date, or `undefined` when no date or no known month is found.
 */
function parseGermanDate(text: string): string | undefined {
  const dotted = /(\d{1,2})\.(\d{1,2})\.(\d{4})/.exec(text);
  if (dotted) {
    const [, dd, mm, yyyy] = dotted;
    return [yyyy, mm.padStart(2, "0"), dd.padStart(2, "0")].join("-");
  }

  const worded = /(\d{1,2})\.?\s+([A-Za-zÄÖÜäöüß]+)\.?,?\s*(\d{4})/.exec(text);
  if (worded === null) return undefined;

  const [, dayDigits, monthWord, year] = worded;
  const umlautToAscii: Record<string, string> = { ä: "a", ö: "o", ü: "u" };
  const monthKey = monthWord
    .toLowerCase()
    .replace(/[äöü]/g, (ch) => umlautToAscii[ch] ?? ch)
    .slice(0, 3);

  const monthNumber = GERMAN_MONTHS[monthKey];
  if (!monthNumber) return undefined;

  return `${year}-${monthNumber}-${dayDigits.padStart(2, "0")}`;
}
|
|
|
|
/**
 * Parse a German-formatted price to a EUR float.
 *
 *   "42,50"              → 42.5
 *   "1.063,02"           → 1063.02
 *   "1.063,5 €"          → 1063.5   (any number of decimals after the comma)
 *   "inkl. MwSt. 6,07 €" → 6.07     (abbreviation dots around the number are ignored)
 *
 * When a comma is present it is the decimal mark and every "." is a thousands
 * separator. Without a comma the digits are passed to `parseFloat` unchanged.
 *
 * @param raw Text containing one price; currency symbols and words are ignored.
 * @returns The price, or `undefined` when nothing parses or the value is ≤ 0.
 */
function parseGermanPrice(raw: string): number | undefined {
  const cleaned = raw
    .replace(/[^0-9.,]/g, "")
    // Dots/commas left over from surrounding text ("MwSt.") are not part of the number.
    .replace(/^[.,]+|[.,]+$/g, "");
  if (!cleaned) return undefined;

  const normalized = cleaned.includes(",")
    ? cleaned.replace(/\./g, "").replace(",", ".")
    : cleaned;

  const value = parseFloat(normalized);
  return Number.isNaN(value) || value <= 0 ? undefined : value;
}
|
|
|
|
// ── Stock level helper ─────────────────────────────────────────────────────────
|
|
|
|
/**
 * Classify availability from warehouse quantities.
 *
 * On-hand stock is DE + global quantity (missing values count as 0):
 *   > 100                  → "in_stock"
 *   1..100                 → "low_stock"
 *   none, but backorder > 0 → "on_request"
 *   otherwise              → "out_of_stock"
 */
function deriveStockLevel(
  deQty?: number,
  globalQty?: number,
  backorderQty?: number
): "in_stock" | "low_stock" | "out_of_stock" | "on_request" {
  const onHand = [deQty, globalQty].reduce<number>((sum, qty) => sum + (qty ?? 0), 0);

  if (onHand > 0) {
    return onHand > 100 ? "in_stock" : "low_stock";
  }
  return (backorderQty ?? 0) > 0 ? "on_request" : "out_of_stock";
}
|
|
|
|
// ── Product classification ─────────────────────────────────────────────────────
|
|
|
|
/**
 * Guess the transceiver form factor from free text (case-insensitive).
 *
 * Rules are evaluated top to bottom and the first hit wins, so the more
 * specific names (e.g. "qsfp-dd800") sit above the names they contain.
 *
 * @returns The form-factor label, or `undefined` when nothing matches.
 */
function detectFormFactor(text: string): string | undefined {
  const haystack = text.toLowerCase();
  const has = (needle: string): boolean => haystack.includes(needle);
  const mentionsQsfp = has("qsfp");

  const rules: ReadonlyArray<readonly [string, boolean]> = [
    ["OSFP", has("osfp") && !mentionsQsfp],
    ["QSFP-DD800", has("qsfp-dd800") || has("qsfp-dd 800")],
    ["QSFP-DD", has("qsfp-dd")],
    ["QSFP56", has("qsfp56")],
    ["QSFP28", has("qsfp28")],
    ["QSFP+", has("qsfp+") || has("qsfp plus")],
    ["SFP56", has("sfp56")],
    ["SFP28", has("sfp28")],
    ["SFP+", has("sfp+") || has("sfp plus")],
    ["SFP", has("sfp") && !mentionsQsfp],
    ["CFP2", has("cfp2")],
    ["XFP", has("xfp")],
  ];

  return rules.find(([, hit]) => hit)?.[0];
}
|
|
|
|
/**
 * Guess the line rate from free text.
 *
 * The table is scanned from the highest rate down and the first pattern that
 * matches wins, which keeps "400G" from being read as "40G".
 *
 * @returns `{ speed, speedGbps }` for the first match, or `undefined`.
 */
function detectSpeed(text: string): { speed: string; speedGbps: number } | undefined {
  const table: ReadonlyArray<{ pattern: RegExp; speed: string; speedGbps: number }> = [
    { pattern: /1\.6\s*t/i, speed: "1.6T", speedGbps: 1600 },
    { pattern: /800\s*g/i, speed: "800G", speedGbps: 800 },
    { pattern: /400\s*g/i, speed: "400G", speedGbps: 400 },
    { pattern: /200\s*g/i, speed: "200G", speedGbps: 200 },
    { pattern: /100\s*g/i, speed: "100G", speedGbps: 100 },
    { pattern: /50\s*g/i, speed: "50G", speedGbps: 50 },
    { pattern: /40\s*g/i, speed: "40G", speedGbps: 40 },
    { pattern: /25\s*g/i, speed: "25G", speedGbps: 25 },
    { pattern: /10\s*g/i, speed: "10G", speedGbps: 10 },
    { pattern: /\b1\s*g\b/i, speed: "1G", speedGbps: 1 },
  ];

  const hit = table.find(({ pattern }) => pattern.test(text));
  return hit === undefined ? undefined : { speed: hit.speed, speedGbps: hit.speedGbps };
}
|
|
|
|
/**
 * Extract the first reach figure ("300m", "10km") from free text.
 *
 * @returns Digits followed by the lower-cased unit, or `undefined` when the
 *          text contains no `<digits> m|km` token.
 */
function detectReach(text: string): string | undefined {
  const hit = /(\d+)\s*(m|km)\b/i.exec(text);
  if (hit === null) return undefined;

  const [, amount, unit] = hit;
  return `${amount}${unit.toLowerCase()}`;
}
|
|
|
|
// ── Types ──────────────────────────────────────────────────────────────────────
|
|
|
|
/** What Phase 1 learns about a product from a category listing page. */
interface ProductSummary {
  /** Product page URL. */
  url: string;
  /** Display name (listing text, later replaced by the detail page <h1> when present). */
  name: string;
  /** Part number; listings derive it as `FS-<numeric id>` from the URL. */
  partNumber: string;
}

/** Everything Phase 2 extracts from a product detail page. */
interface ProductDetail extends ProductSummary {
  // ── Collections (always present, possibly empty) ──
  /** De-duplicated brand names read from the compatibility section. */
  compatibleBrands: string[];
  /** Raw key → value pairs from spec tables and <dt>/<dd> lists. */
  specs: Record<string, string>;

  // ── Pricing ──
  /** Price in EUR as parsed from the page. */
  priceNet?: number;

  // ── Stock: "DE-Lager" ──
  deQty?: number;
  /** ISO date (YYYY-MM-DD). */
  deDeliveryDate?: string;

  // ── Stock: "Global-Lager" ──
  globalQty?: number;
  /** ISO date (YYYY-MM-DD). */
  globalDeliveryDate?: string;

  // ── Stock: "Nachlieferung" (backorder) ──
  backorderQty?: number;
  /** ISO date (YYYY-MM-DD). */
  backorderDate?: string;

  /** Count shown next to "verkauft" / "sold". */
  unitsSold?: number;

  // ── Assets ──
  imageUrl?: string;
  datasheetUrl?: string;
}
|
|
|
|
// ── Phase 1: Collect product URLs ──────────────────────────────────────────────
|
|
|
|
/**
 * Phase 1: visit every category listing page and collect the products linked
 * from it.
 *
 * Requests are queued round-robin (page 1 of every category, then page 2, …).
 * The first page of a category that yields no products marks the category as
 * exhausted, and its remaining queued pages are skipped.
 *
 * @param proxyConfiguration Optional Crawlee proxy configuration.
 * @returns Map of product URL → summary (first occurrence wins).
 * @throws Re-throws any crawler error except the post-run ENOENT on the
 *         request-queue directory, which is logged and ignored.
 */
async function collectProductUrls(
  proxyConfiguration: ProxyConfiguration | undefined
): Promise<Map<string, ProductSummary>> {
  // Purge leftover request queue from previous runs (instance-isolated storage)
  const fsPhase1Dir = crawleeStorageDir("fs-phase1");
  try { rmSync(fsPhase1Dir, { recursive: true, force: true }); } catch { /* ignore */ }

  const products = new Map<string, ProductSummary>();
  const exhausted = new Set<string>();

  // Pre-queue: all page-1s, then all page-2s, …
  const listingRequests = Array.from({ length: MAX_PAGES_PER_CATEGORY }, (_, i) =>
    CATEGORY_URLS.map((cat) => ({
      url: i === 0 ? `${BASE_URL}${cat}` : `${BASE_URL}${cat}?page=${i + 1}`,
      userData: { catPath: cat, pageNum: i + 1 },
      uniqueKey: `listing-${cat}-p${i + 1}`,
    }))
  ).flat();

  const crawler = new PlaywrightCrawler({
    maxConcurrency: 1,
    maxRequestsPerMinute: 12,
    requestHandlerTimeoutSecs: 60,
    headless: true,
    useSessionPool: false,
    ...(proxyConfiguration ? { proxyConfiguration } : {}),
    launchContext: {
      launchOptions: {
        args: ["--disable-blink-features=AutomationControlled", "--lang=de-DE"],
      },
    },
    preNavigationHooks: [
      async ({ page }) => {
        await applyStealthPatches(page);
        await page.setExtraHTTPHeaders({ "Accept-Language": "de-DE,de;q=0.9" });
        await page.context().addCookies(DE_COOKIES.map(c => ({ ...c, domain: "www.fs.com" })));
      },
    ],
    async requestHandler({ page, request, log }) {
      const { catPath, pageNum } = request.userData as { catPath: string; pageNum: number };
      if (exhausted.has(catPath)) {
        log.debug(`[p1] Skip ${catPath} p${pageNum} — exhausted`);
        return;
      }

      // Wait for product links; if none show up, give the page a fixed grace
      // period and evaluate whatever is there.
      try {
        await page.waitForSelector('a[href*="/products/"]', { timeout: 12000 });
      } catch {
        await page.waitForTimeout(6000);
      }

      // Runs in the browser: must not reference anything outside the callback.
      const found = await page.evaluate(
        (): Array<{ url: string; name: string; partNumber: string }> => {
          const results: Array<{ url: string; name: string; partNumber: string }> = [];
          const seen = new Set<string>();
          for (const link of document.querySelectorAll<HTMLAnchorElement>('a[href*="/products/"]')) {
            const href = link.getAttribute("href") ?? "";
            if (!href.includes("/products/")) continue;
            const absUrl = href.startsWith("http") ? href : `https://www.fs.com${href}`;
            if (seen.has(absUrl)) continue;
            const img = link.querySelector("img");
            // `||` rather than `??`: an empty alt/title attribute ("" is not
            // nullish) must fall through to the next name source.
            const name = (
              img?.getAttribute("alt") ||
              link.getAttribute("title") ||
              link.textContent ||
              ""
            ).trim().replace(/\s+/g, " ").slice(0, 200);
            const pnMatch = href.match(/\/products\/(\d+)\.html/);
            const partNumber = pnMatch ? `FS-${pnMatch[1]}` : "";
            if (name.length < 5 || !partNumber) continue;
            // Mark as seen only once accepted, so a nameless image link does
            // not hide a later text link to the same product.
            seen.add(absUrl);
            results.push({ url: absUrl, name, partNumber });
          }
          return results;
        }
      );

      log.info(`[Listing] ${catPath} p${pageNum}: ${found.length} products`);
      if (found.length === 0) {
        exhausted.add(catPath);
      } else {
        for (const p of found) {
          if (!products.has(p.url)) products.set(p.url, p);
        }
      }
    },
  }, makeCrawleeConfig("fs-phase1"));

  try {
    await crawler.run(listingRequests);
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    if (msg.includes("ENOENT") && msg.includes("request_queues")) {
      // Benign Crawlee post-run lock-file race: _isTaskReadyFunction reads a
      // request .json that was already cleaned up after the crawl finished.
      // Crawlee catches + re-throws it internally, which rejects crawler.run().
      // Safe to ignore — all pages were already processed.
      console.warn("[Phase 1] Crawlee post-run ENOENT (benign, ignoring)");
    } else {
      throw err;
    }
  }
  console.log(`[Phase 1] ${products.size} unique products across ${CATEGORY_URLS.length} categories`);
  return products;
}
|
|
|
|
// ── Phase 2: Scrape product detail pages ──────────────────────────────────────
|
|
|
|
/**
 * Labelled delivery-date patterns. The month word uses an explicit letter class
 * because `\w` is ASCII-only and would stop at the "ä" in "März" / "Mär.".
 */
const FS_DATE_AFTER_LIEFERUNG = /Lieferung[:\s]+([0-9]{1,2}\.?\s+[A-Za-zÄÖÜäöüß]+\.?,?\s*\d{4})/i;
const FS_DATE_AFTER_ERWARTET = /erwartet[:\s]+([0-9]{1,2}\.?\s+[A-Za-zÄÖÜäöüß]+\.?,?\s*\d{4})/i;
const FS_DATE_NUMERIC = /([0-9]{1,2}\.[0-9]{1,2}\.[0-9]{4})/;

/**
 * Price patterns that carry a net/"ohne MwSt." qualifier, in priority order.
 * Global flag so every occurrence in the page text can be inspected.
 * NOTE(review): the numeric sub-pattern does not span thousands groups
 * ("1.063,02 €" is captured as "063,02") — left unchanged here, verify.
 */
const FS_QUALIFIED_PRICE_PATTERNS: readonly RegExp[] = [
  /([0-9]{1,5}[,.]?[0-9]{0,3})\s*€\s*\(?ohne\s+MwSt\.?\)?/gi,
  /€\s*([0-9]{1,5}[,.]?[0-9]{0,3})\s*\(?ohne\s+MwSt\.?\)?/gi,
  /([0-9]{1,5}[,.]?[0-9]{0,3})\s*€\s*netto/gi,
  /Netto[:\s]*€?\s*([0-9]{1,5}[,.]?[0-9]{0,3})/gi,
  /Preis[:\s]*([0-9]{1,5},[0-9]{2})\s*€/gi,
  /([0-9]{1,5},[0-9]{2})\s*€\s*(?:exkl\.|exklusive|excl\.)/gi,
];

/** Text that marks a price as a shipping / free-delivery threshold. */
const FS_SHIPPING_CONTEXT = /versand|shipping|lieferung.*\bab\b|\bab\b.*versand|gratis.*ab|kostenlos/i;

/**
 * Finds the delivery date in the text following a stock label.
 * Tries each labelled pattern in the given order, then any dd.mm.yyyy date.
 * The first pattern that matches decides the result.
 */
function findDeliveryDate(ctx: string, labelled: readonly RegExp[]): string | undefined {
  for (const pattern of labelled) {
    const m = ctx.match(pattern);
    if (m?.[1]) return parseGermanDate(m[1]);
  }
  const numeric = ctx.match(FS_DATE_NUMERIC);
  return numeric?.[1] ? parseGermanDate(numeric[1]) : undefined;
}

/**
 * Finds a qualified net price in page text.
 *
 * For every pattern (in priority order) ALL occurrences are checked. Matches
 * within ±200 characters of shipping wording are skipped, and the first
 * remaining value in the plausible range wins. Looking past the first
 * occurrence matters because the shipping banner ("Gratis Versand ab 79 €
 * (ohne MwSt.)") appears on every page.
 */
function findQualifiedNetPrice(text: string): number | undefined {
  for (const pattern of FS_QUALIFIED_PRICE_PATTERNS) {
    for (const m of text.matchAll(pattern)) {
      if (!m[1]) continue;
      const at = m.index ?? 0;
      const ctx = text.slice(Math.max(0, at - 200), at + 200);
      if (FS_SHIPPING_CONTEXT.test(ctx)) continue;
      const p = parseGermanPrice(m[1]);
      if (p && p > 0.5 && p < 500_000) return p;
    }
  }
  return undefined;
}

/**
 * Phase 2: load each product detail page and extract price, warehouse stock,
 * delivery dates, units sold, compatible brands, specs, image and datasheet.
 *
 * @param requests           Detail page URLs with the listing name / part number.
 * @param proxyConfiguration Optional Crawlee proxy configuration.
 * @returns One `ProductDetail` per page that produced any body text.
 * @throws Re-throws any crawler error except the post-run ENOENT on the
 *         request-queue directory, which is logged and ignored.
 */
async function scrapeProductDetails(
  requests: Array<{ url: string; userData: { name: string; partNumber: string } }>,
  proxyConfiguration: ProxyConfiguration | undefined
): Promise<ProductDetail[]> {
  // Purge Phase 2 storage so it starts with a clean request queue
  const fsPhase2Dir = crawleeStorageDir("fs-phase2");
  try { rmSync(fsPhase2Dir, { recursive: true, force: true }); } catch { /* ignore */ }
  const details: ProductDetail[] = [];

  const crawler = new PlaywrightCrawler({
    maxConcurrency: 1,
    maxRequestsPerMinute: 10,
    requestHandlerTimeoutSecs: 90,
    headless: true,
    useSessionPool: false,
    ...(proxyConfiguration ? { proxyConfiguration } : {}),
    launchContext: {
      launchOptions: {
        args: ["--disable-blink-features=AutomationControlled", "--lang=de-DE"],
      },
    },
    preNavigationHooks: [
      async ({ page }) => {
        await applyStealthPatches(page);
        await page.setExtraHTTPHeaders({ "Accept-Language": "de-DE,de;q=0.9" });
        await page.context().addCookies(DE_COOKIES.map(c => ({ ...c, domain: "www.fs.com" })));
      },
    ],
    async requestHandler({ page, request, log }) {
      const { name: listingName, partNumber: listingPn } = request.userData as {
        name: string;
        partNumber: string;
      };
      const url = request.url;

      try {
        // Wait for the page structure AND ideally a price element to render
        await page.waitForSelector(
          'h1, .product-detail, [class*="product-info"], [class*="product-main"]',
          { timeout: 12000 }
        );
        // Give JS-rendered price elements a moment to appear after the DOM is ready
        await page.waitForSelector(
          '[class*="price-value"], [class*="product-price"], [class*="prod-price"], [class*="final-price"]',
          { timeout: 5000 }
        ).catch(() => { /* price element optional — proceed with bodyText fallback */ });
      } catch {
        await page.waitForTimeout(7000);
      }

      // Runs in the browser: must not reference anything outside the callback.
      const raw = await page.evaluate(
        (): {
          bodyText: string;
          priceRaw: string;
          specs: Record<string, string>;
          brands: string[];
          imageUrl: string;
          datasheetUrl: string;
          h1: string;
        } => {
          const bodyText = (document.body?.innerText ?? "").replace(/\n{3,}/g, "\n\n");

          // ── DOM price extraction (avoids matching site-wide shipping threshold) ──
          // FS.com shows "Gratis Versand ab 79 € (ohne MwSt.)" on every page.
          // bodyText regex matches this and returns 79 for ALL products. We extract
          // the actual product price from its own DOM element, skipping bad parents.
          // ── Net price (ohne MwSt) preferred for B2B comparisons ──────────────
          // FS.com HTML structure (verified 2026-05-06):
          //   <div class="no_tax">5,10 € ohne MwSt.</div>   ← net price ✓
          //   <div class="price">6,07 €</div>               ← gross price
          //   <div class="standard_price">6,07 €</div>      ← gross
          // The .no_tax element contains the B2B net price — prefer it.
          let priceRaw = "";
          const PRICE_SELS = [
            // ── FS.com current structure (2026-05) — net price first ──
            ".no_tax",           // "5,10 € ohne MwSt." — net/B2B price
            ".standard_price",   // "6,07 €" — gross fallback
            ".price",            // "6,07 €" — gross fallback (simple class)
            // ── Legacy / other patterns ───────────────────────────────
            "[class*='price-value']",
            "[class*='product-price']",
            "[class*='prod-price']",
            "[class*='final-price']",
            "[class*='regular-price']",
            "[class*='price-amount']",
            "[data-cy='price']",
            ".price-box",
          ];
          const SKIP_PARENT =
            "[class*='shipping'], [class*='banner'], [class*='delivery'], " +
            "[class*='free-ship'], [class*='cart'], [class*='checkout'], " +
            "[class*='notice'], [class*='promo'], footer, header, nav";
          outer: for (const sel of PRICE_SELS) {
            for (const el of Array.from(document.querySelectorAll<HTMLElement>(sel))) {
              if (el.closest(SKIP_PARENT)) continue;
              const txt = (el.textContent ?? "").replace(/\s+/g, " ").trim();
              // Must contain a digit, currency marker, and be short (<40 chars)
              if (/\d/.test(txt) && txt.length < 40 && /[€$]|EUR/i.test(txt)) {
                priceRaw = txt;
                break outer;
              }
            }
          }

          // Spec tables: first two cells of each row become key → value.
          const specs: Record<string, string> = {};
          const SEL = [
            ".product-param tr", ".product-specs tr", ".param-table tr",
            ".specifications tr", ".detail-param tr", ".prod-spec-list tr",
            '[class*="specification"] tr', '[class*="param"] tr',
            ".tab-content tr", ".product-info-table tr", ".tech-param tr",
          ].join(", ");
          document.querySelectorAll(SEL).forEach((row) => {
            const cells = row.querySelectorAll("td, th");
            if (cells.length >= 2) {
              const k = (cells[0]?.textContent ?? "").trim().replace(/\s+/g, " ");
              const v = (cells[1]?.textContent ?? "").trim().replace(/\s+/g, " ");
              if (k && v && k.length < 80 && !/^[-\s]+$/.test(k)) specs[k] = v;
            }
          });
          // Definition lists: <dt> followed directly by <dd>.
          document.querySelectorAll("dt").forEach((dt) => {
            const dd = dt.nextElementSibling;
            if (dd?.tagName === "DD") {
              const k = (dt.textContent ?? "").trim();
              const v = (dd.textContent ?? "").trim();
              if (k && v && k.length < 80) specs[k] = v;
            }
          });

          // Compatible brands: DOM container first, page-text fallback second.
          const brands: string[] = [];
          const brandContainer = document.querySelector<Element>(
            '[class*="compatible"], [class*="brand-list"], [class*="compatibility"], ' +
            '[class*="apply-brand"], [id*="brand"], [id*="compatible"]'
          );
          if (brandContainer) {
            brandContainer.querySelectorAll("button, a, span, li").forEach((el) => {
              const t = (el.textContent ?? "").trim();
              if (t && t.length > 1 && t.length < 50 && !/^\d+$/.test(t)) brands.push(t);
            });
          }
          if (brands.length === 0) {
            const bodyTxt = document.body?.innerText ?? "";
            const m = bodyTxt.match(/[Kk]ompatibel\s+mit[^:]*:\s*([\s\S]{0,600})/);
            if (m) {
              m[1].split(/[,;\n]/).forEach((s) => {
                const b = s.trim();
                if (b.length > 1 && b.length < 50 && !/^\d/.test(b)) brands.push(b);
              });
            }
          }

          // Product image: score candidates by container class, then by pixel area.
          const imageCandidates = Array.from(document.querySelectorAll<HTMLImageElement>(
            ".big_img_box img, img.big_img, .big_img_m_active, .big_img_m, " +
            ".small_img_active img, .product-image img, .prod-img img, .product-gallery img, " +
            '[class*="main-image"] img, [class*="primary-image"] img, ' +
            ".slick-current img, .product__image img"
          )).map((img) => {
            const url =
              img.currentSrc ||
              img.getAttribute("data-src") ||
              img.getAttribute("data-original") ||
              img.getAttribute("data-lazy") ||
              img.getAttribute("src") ||
              "";
            const cls = `${img.className || ""} ${img.parentElement?.className || ""}`;
            const score =
              /big_img_box|big_img|big_img_m_active/.test(cls) ? 100 :
              /small_img_active/.test(cls) ? 50 :
              10;
            return { url, score, w: img.naturalWidth || 0, h: img.naturalHeight || 0 };
          }).filter((candidate) =>
            candidate.url &&
            /resource\.fs\.com/.test(candidate.url) &&
            !/default\.jpg|generalImg|logo|icon|svg/i.test(candidate.url) &&
            (candidate.w === 0 || candidate.w >= 120) &&
            (candidate.h === 0 || candidate.h >= 120)
          ).sort((a, b) => b.score - a.score || (b.w * b.h) - (a.w * a.h));
          const imageUrl = imageCandidates[0]?.url ?? "";

          const dsEl = document.querySelector<HTMLAnchorElement>(
            'a[href*="datasheet"], a[href*=".pdf"], a[download][href*=".pdf"]'
          );
          const datasheetUrl = dsEl?.href ?? dsEl?.getAttribute("href") ?? "";

          const h1 = document.querySelector("h1")?.textContent?.trim() ?? "";
          return { bodyText, priceRaw, specs, brands, imageUrl, datasheetUrl, h1 };
        }
      );

      if (!raw.bodyText) { log.warning(`No text: ${url}`); return; }
      const t = raw.bodyText;

      // ── Net price (ohne MwSt, EUR) ─────────────────────────────────────────
      // Strategy:
      //   1. DOM extraction (priceRaw) — most reliable, avoids shipping-threshold text
      //   2. bodyText qualified patterns (ohne MwSt / netto) with shipping-ctx exclusion
      //   3. Broad bodyText fallback — only >€100 to skip free-shipping threshold
      //
      // Root cause of the €79 bug: FS.com shows "Gratis Versand ab 79 € (ohne MwSt.)"
      // on every page. The qualified regex matched that and returned 79 for every product.
      let priceNet: number | undefined;

      // 1. DOM-extracted price string
      if (raw.priceRaw) {
        const p = parseGermanPrice(raw.priceRaw);
        if (p && p > 0.5 && p < 500_000) priceNet = p;
      }

      // 2. bodyText qualified patterns — every occurrence is checked, so the
      //    shipping banner no longer shadows the real price further down the page
      if (!priceNet) {
        priceNet = findQualifiedNetPrice(t);
      }

      // 3. Broad bodyText fallback — only accept > €100 (free-shipping threshold is €79)
      if (!priceNet) {
        for (const pat of [/([0-9]{1,5},[0-9]{2})\s*€/, /€\s*([0-9]{1,5},[0-9]{2})/]) {
          const m = t.match(pat);
          if (m?.[1]) {
            const p = parseGermanPrice(m[1]);
            if (p && p > 100 && p < 500_000) { priceNet = p; break; }
          }
        }
      }

      // ── DE-Lager ───────────────────────────────────────────────────────────
      let deQty: number | undefined;
      let deDeliveryDate: string | undefined;
      const deM =
        t.match(/(\d[\d.,KkMm]*)\s*Stk\.\s*im\s*DE[- ]Lager/i) ??
        t.match(/(\d[\d.,KkMm]*)\s*im\s*DE[- ]?Lager/i);
      if (deM?.[1]) {
        deQty = parseGermanQty(deM[1]);
        // The delivery date is searched in the 300 characters after the stock label.
        const idx = t.indexOf(deM[0]);
        const ctx = t.slice(idx, idx + 300);
        deDeliveryDate = findDeliveryDate(ctx, [FS_DATE_AFTER_LIEFERUNG, FS_DATE_AFTER_ERWARTET]);
      }

      // ── Global-Lager ───────────────────────────────────────────────────────
      let globalQty: number | undefined;
      let globalDeliveryDate: string | undefined;
      const glM =
        t.match(/(\d[\d.,KkMm]*)\s*Stk\.\s*im\s*Global[- ]Lager/i) ??
        t.match(/(\d[\d.,KkMm]*)\s*im\s*Global[- ]?(?:Lager|Warehouse)/i) ??
        t.match(/(\d[\d.,KkMm]*)\s*in\s+Global\s+Warehouse/i);
      if (glM?.[1]) {
        globalQty = parseGermanQty(glM[1]);
        const idx = t.indexOf(glM[0]);
        const ctx = t.slice(idx, idx + 300);
        globalDeliveryDate = findDeliveryDate(ctx, [FS_DATE_AFTER_LIEFERUNG, FS_DATE_AFTER_ERWARTET]);
      }

      // ── Nachlieferung ──────────────────────────────────────────────────────
      let backorderQty: number | undefined;
      let backorderDate: string | undefined;
      const boM =
        t.match(/(\d[\d.,KkMm]*)\s*(?:Stk\.)?\s*in\s+Nachlieferung/i) ??
        t.match(/Nachlieferung[:\s]*(\d[\d.,KkMm]*)/i);
      if (boM?.[1]) {
        backorderQty = parseGermanQty(boM[1]);
        const idx = t.indexOf(boM[0]);
        const ctx = t.slice(idx, idx + 300);
        // Backorders prefer the "erwartet" (expected) date over "Lieferung".
        backorderDate = findDeliveryDate(ctx, [FS_DATE_AFTER_ERWARTET, FS_DATE_AFTER_LIEFERUNG]);
      }

      // ── Units sold ─────────────────────────────────────────────────────────
      let unitsSold: number | undefined;
      const soldM =
        t.match(/(\d[\d.,KkMm]*)\s*(?:[Mm]al\s+)?[Vv]erkauft/) ??
        t.match(/([\d.,KkMm]+)\+?\s*sold/i);
      if (soldM?.[1]) unitsSold = parseGermanQty(soldM[1]);

      // ── Part number refinement ─────────────────────────────────────────────
      // A part number printed on the page replaces the URL-derived one.
      let partNumber = listingPn;
      const pnM = t.match(
        /(?:Part\s+Number|Teilenummer|Artikelnummer|P\/N)[:\s]+([A-Z0-9][-A-Z0-9./]{3,40})/i
      );
      if (pnM?.[1]) partNumber = pnM[1].trim();

      // Turn protocol-relative and root-relative asset URLs into absolute ones.
      const resolveUrl = (u: string): string | undefined => {
        if (!u) return undefined;
        if (u.startsWith("//")) return `https:${u}`;
        if (u.startsWith("/")) return `${BASE_URL}${u}`;
        if (u.startsWith("http")) return u;
        return undefined;
      };

      const compatibleBrands = [...new Set(raw.brands)].filter((b) => b.length > 1).slice(0, 30);

      log.info(
        `${partNumber}: €${priceNet?.toFixed(2) ?? "?"} | ` +
        `DE=${deQty ?? "-"} GL=${globalQty ?? "-"} BO=${backorderQty ?? "-"} ` +
        `sold=${unitsSold ?? "-"} brands=${compatibleBrands.length}`
      );

      details.push({
        url,
        name: raw.h1 || listingName,
        partNumber,
        priceNet,
        deQty,
        deDeliveryDate,
        globalQty,
        globalDeliveryDate,
        backorderQty,
        backorderDate,
        unitsSold,
        compatibleBrands,
        specs: raw.specs,
        imageUrl: resolveUrl(raw.imageUrl),
        datasheetUrl: resolveUrl(raw.datasheetUrl),
      });
    },
  }, makeCrawleeConfig("fs-phase2"));

  try {
    await crawler.run(requests);
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    if (msg.includes("ENOENT") && msg.includes("request_queues")) {
      // Same post-run request-queue ENOENT that Phase 1 tolerates.
      console.warn("[Phase 2] Crawlee post-run ENOENT (benign, ignoring)");
    } else {
      throw err;
    }
  }
  return details;
}
|
|
|
|
// ── Main export ────────────────────────────────────────────────────────────────
|
|
|
|
export async function scrapeFs(): Promise<void> {
|
|
console.log("=== FS.com Scraper v2 Starting ===\n");
|
|
|
|
// ── Quick connectivity check — exit early on datacenter IPs that block FS.com ─
|
|
try {
|
|
const probe = await fetch("https://www.fs.com/robots.txt", {
|
|
signal: AbortSignal.timeout(8000),
|
|
headers: { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" },
|
|
});
|
|
if (!probe.ok && probe.status !== 403) throw new Error(`HTTP ${probe.status}`);
|
|
if (probe.status === 403) {
|
|
const body = (await probe.text()).slice(0, 200);
|
|
if (body.includes("Request blocked") || body.includes("ERROR")) {
|
|
console.warn("[FS.com] Server IP is blocked by FS.com — skipping. Run via run-fs-scraper-mac.sh on a residential IP.");
|
|
return;
|
|
}
|
|
}
|
|
} catch (err) {
|
|
const msg = (err as Error).message;
|
|
if (msg.includes("ENOTFOUND") || msg.includes("ECONNREFUSED") || msg.includes("ERR_EMPTY") || msg.includes("TimeoutError")) {
|
|
console.warn(`[FS.com] Connectivity check failed (${msg.slice(0, 60)}) — skipping. This scraper requires a residential IP.`);
|
|
return;
|
|
}
|
|
// For other errors (e.g. 200 OK but weird body), proceed anyway
|
|
}
|
|
|
|
const proxyConfiguration = buildProxyConfiguration();
|
|
|
|
const vendorId = await ensureVendor(
|
|
"FS.COM",
|
|
"compatible",
|
|
"https://www.fs.com",
|
|
"https://www.fs.com/de/c/optical-transceivers-9"
|
|
);
|
|
console.log(`Vendor ID: ${vendorId}`);
|
|
|
|
// ── Phase 1: Discover product URLs ─────────────────────────────────────────
|
|
console.log("\n[Phase 1] Collecting product URLs from category listing pages…");
|
|
const productMap = await collectProductUrls(proxyConfiguration);
|
|
|
|
if (productMap.size === 0) {
|
|
console.warn("[Phase 1] No products discovered — check selectors or proxy.");
|
|
return;
|
|
}
|
|
|
|
// ── Filter: skip products with fresh stock data ─────────────────────────────
|
|
const allPartNumbers = [...productMap.values()].map((p) => p.partNumber).filter(Boolean);
|
|
const freshlyScraped = new Set<string>();
|
|
if (!FORCE_REVALIDATE && allPartNumbers.length > 0) {
|
|
const freshResult = await pool.query(
|
|
`SELECT DISTINCT t.part_number
|
|
FROM transceivers t
|
|
JOIN stock_observations so ON so.transceiver_id = t.id
|
|
WHERE so.source_vendor_id = $1
|
|
AND so.time > NOW() - INTERVAL '${STOCK_FRESH_HOURS} hours'
|
|
AND t.part_number = ANY($2)`,
|
|
[vendorId, allPartNumbers]
|
|
);
|
|
for (const row of freshResult.rows) freshlyScraped.add(row.part_number as string);
|
|
}
|
|
|
|
let missingImageUrls = new Set<string>();
|
|
if (ONLY_MISSING_IMAGES) {
|
|
const missingResult = await pool.query(
|
|
`SELECT DISTINCT product_page_url
|
|
FROM transceivers t
|
|
JOIN vendors v ON v.id = t.vendor_id
|
|
WHERE v.name = 'FS.COM'
|
|
AND COALESCE(t.image_verified, false) = false
|
|
AND product_page_url LIKE '%/products/%'`
|
|
);
|
|
missingImageUrls = new Set(
|
|
missingResult.rows
|
|
.map((row) => normalizeFsProductUrl(row.product_page_url as string))
|
|
.filter(Boolean)
|
|
);
|
|
}
|
|
|
|
const urlsToScrape = [...productMap.keys()]
|
|
.filter((url) => !freshlyScraped.has(productMap.get(url)?.partNumber ?? ""))
|
|
.filter((url) => !ONLY_MISSING_IMAGES || missingImageUrls.has(normalizeFsProductUrl(url)))
|
|
.slice(0, MAX_DETAIL_PAGES_PER_RUN);
|
|
|
|
console.log(`\n[Phase 2] Scraping ${urlsToScrape.length} detail pages`);
|
|
console.log(
|
|
ONLY_MISSING_IMAGES
|
|
? ` (${missingImageUrls.size} DB product URLs missing images; ${productMap.size - urlsToScrape.length} skipped by targeted image filter)`
|
|
:
|
|
FORCE_REVALIDATE
|
|
? ` (${productMap.size - urlsToScrape.length} skipped — max detail cap ${MAX_DETAIL_PAGES_PER_RUN})`
|
|
: ` (${productMap.size - urlsToScrape.length} skipped — data ≤${STOCK_FRESH_HOURS}h fresh)`
|
|
);
|
|
|
|
if (urlsToScrape.length === 0) {
|
|
console.log("[Phase 2] All products have fresh stock data — nothing to scrape.");
|
|
return;
|
|
}
|
|
|
|
// ── Phase 2: Scrape detail pages ────────────────────────────────────────────
|
|
const detailRequests = urlsToScrape.map((url) => {
|
|
const s = productMap.get(url);
|
|
return { url, userData: { name: s?.name ?? "FS.com Product", partNumber: s?.partNumber ?? "" } };
|
|
});
|
|
|
|
const details = await scrapeProductDetails(detailRequests, proxyConfiguration);
|
|
console.log(`[Phase 2] Complete — ${details.length} pages scraped.`);
|
|
|
|
// ── Phase 3: Write to database ─────────────────────────────────────────────
|
|
console.log("\n[Phase 3] Writing to database…");
|
|
let priceWritten = 0;
|
|
let stockWritten = 0;
|
|
let specsUpdated = 0;
|
|
let errors = 0;
|
|
|
|
for (const detail of details) {
|
|
try {
|
|
const ff = detectFormFactor(detail.name);
|
|
const speedInfo = detectSpeed(detail.name);
|
|
const reach = detectReach(detail.name);
|
|
const parsed = parseSpecTable(detail.specs);
|
|
|
|
const transceiverId = await findOrCreateScrapedTransceiver({
|
|
partNumber: detail.partNumber,
|
|
vendorId,
|
|
formFactor: ff,
|
|
speedGbps: speedInfo?.speedGbps,
|
|
speed: speedInfo?.speed,
|
|
reachLabel: reach ?? parsed.reachLabel,
|
|
reachMeters: parsed.reachMeters,
|
|
fiberType: parsed.fiberType,
|
|
wavelengths: parsed.wavelengths,
|
|
imageUrl: detail.imageUrl,
|
|
category: "DataCenter",
|
|
});
|
|
|
|
const stockLevel = deriveStockLevel(detail.deQty, detail.globalQty, detail.backorderQty);
|
|
const totalQty = (detail.deQty ?? 0) + (detail.globalQty ?? 0);
|
|
|
|
if (detail.priceNet && detail.priceNet > 0) {
|
|
const hash = contentHash({
|
|
p: detail.priceNet,
|
|
de: detail.deQty ?? 0,
|
|
gl: detail.globalQty ?? 0,
|
|
});
|
|
const isNew = await upsertPriceObservation({
|
|
transceiverId,
|
|
sourceVendorId: vendorId,
|
|
price: detail.priceNet,
|
|
currency: "EUR",
|
|
stockLevel,
|
|
quantityAvailable: totalQty > 0 ? totalQty : undefined,
|
|
url: detail.url,
|
|
contentHash: hash,
|
|
});
|
|
if (isNew) priceWritten++;
|
|
}
|
|
|
|
const stockNew = await upsertStockObservation({
|
|
transceiverId,
|
|
sourceVendorId: vendorId,
|
|
stockLevel,
|
|
quantityAvailable: totalQty > 0 ? totalQty : undefined,
|
|
warehouseDeQty: detail.deQty,
|
|
warehouseDeDeliveryDate: detail.deDeliveryDate ?? null,
|
|
warehouseGlobalQty: detail.globalQty,
|
|
warehouseGlobalDeliveryDate: detail.globalDeliveryDate ?? null,
|
|
backorderQty: detail.backorderQty,
|
|
backorderEstimatedDate: detail.backorderDate ?? null,
|
|
unitsSold: detail.unitsSold,
|
|
compatibleBrands: detail.compatibleBrands,
|
|
priceNet: detail.priceNet,
|
|
productUrl: detail.url,
|
|
// FS.com: per-warehouse breakdown (DE-Lager + Global-Lager), EUR net prices
|
|
stockConfidence: 3,
|
|
priceCurrency: "EUR",
|
|
priceIncludesTax: false,
|
|
});
|
|
if (stockNew) stockWritten++;
|
|
|
|
if (Object.keys(detail.specs).length > 0) {
|
|
const updated = await updateVerifiedSpecs({
|
|
transceiverId,
|
|
fiberType: parsed.fiberType,
|
|
connector: parsed.connector,
|
|
wavelengths: parsed.wavelengths,
|
|
reachMeters: parsed.reachMeters,
|
|
reachLabel: parsed.reachLabel,
|
|
powerConsumptionW: parsed.powerConsumptionW,
|
|
tempRange: parsed.tempRange,
|
|
modulation: parsed.modulation,
|
|
domSupport: parsed.domSupport,
|
|
imageUrl: detail.imageUrl,
|
|
datasheetUrl: detail.datasheetUrl,
|
|
source: "fs.com",
|
|
});
|
|
if (updated) specsUpdated++;
|
|
}
|
|
} catch (err) {
|
|
console.error(` ✗ ${detail.partNumber}: ${(err as Error).message}`);
|
|
errors++;
|
|
}
|
|
}
|
|
|
|
console.log("\n=== FS.com Scraper v2 Complete ===");
|
|
console.log(` Products discovered: ${productMap.size}`);
|
|
console.log(` Detail pages scraped: ${details.length}`);
|
|
console.log(` Price observations: ${priceWritten} new`);
|
|
console.log(` Stock observations: ${stockWritten} new`);
|
|
console.log(` Specs verified: ${specsUpdated}`);
|
|
if (errors > 0) console.warn(` DB errors: ${errors}`);
|
|
}
|
|
|
|
if (require.main === module) {
  // CLI entry point: runs only when this file is executed directly, not when
  // it is imported by another module.
  //
  // Crawlee's FileSystemStorage emits spurious unhandledRejection errors after
  // crawler.run() resolves: the internal task loop schedules one final
  // _isTaskReadyFunction call which tries to read a request .json file that
  // Crawlee already cleaned up during normal processing. This ENOENT is benign
  // (crawling is done), but the default unhandledRejection handler would call
  // process.exit(1) and abort Phase 2. We swallow it here.
  process.on("unhandledRejection", (reason) => {
    const msg = reason instanceof Error ? reason.message : String(reason);
    if (msg.includes("ENOENT") && msg.includes("request_queues")) {
      // Benign Crawlee post-run cleanup race — ignore
      return;
    }
    // All other unhandled rejections are real errors
    console.error("Unhandled rejection:", reason);
    process.exit(1);
  });

  // Run the scraper, then close the DB pool exactly once and wait for it to
  // finish before deciding the exit status. process.exit() terminates the
  // process immediately, so a pool.end() that is not awaited beforehand never
  // gets the chance to complete.
  void (async () => {
    let exitCode = 0;

    try {
      await scrapeFs();
    } catch (err) {
      console.error("Fatal:", err);
      exitCode = 1;
    }

    try {
      await pool.end();
    } catch (err) {
      // A failed pool shutdown is reported like any other fatal error, but we
      // do not retry: ending the same pool a second time cannot succeed.
      console.error("Fatal:", err);
      exitCode = 1;
    }

    // On success let the event loop drain naturally; force a non-zero exit
    // only on failure so that lingering handles cannot mask the error status.
    if (exitCode !== 0) process.exit(exitCode);
  })();
}
|