From 18a9e1346e8afeb479667796fa504d1add2f46ac Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Tue, 21 Apr 2026 06:16:05 +0200 Subject: [PATCH] feat: Playwright image scraper for bot-blocked vendors (Arista/Dell/Edgecore/Fortinet/Extreme) --- packages/scraper/src/scheduler.ts | 17 +- .../src/scrapers/switch-image-playwright.ts | 305 ++++++++++++++++++ 2 files changed, 321 insertions(+), 1 deletion(-) create mode 100644 packages/scraper/src/scrapers/switch-image-playwright.ts diff --git a/packages/scraper/src/scheduler.ts b/packages/scraper/src/scheduler.ts index 20da0bf..1b85c83 100644 --- a/packages/scraper/src/scheduler.ts +++ b/packages/scraper/src/scheduler.ts @@ -101,6 +101,8 @@ export async function registerSchedules(boss: PgBoss): Promise { "scrape:assets:switches", // ── Switch og:image fetcher (daily, after switch-assets) ────────── "scrape:images:switches", + // ── Playwright image fetcher for bot-blocked vendors (every 3d) ─── + "scrape:images:switches:playwright", // ── eBay enrichment (every 6h) ──────────────────────────────────── "enrich:ebay-transceivers", "enrich:ebay-switches", @@ -241,6 +243,9 @@ export async function registerSchedules(boss: PgBoss): Promise { await boss.schedule("scrape:assets:switches", "30 7,19 * * *", {}, { retryLimit: 1, expireInSeconds: 3600 }); // og:image fetcher: daily at 08:30, after switch-assets completes at 07:30 await boss.schedule("scrape:images:switches", "30 8 * * *", {}, { retryLimit: 1, expireInSeconds: 7200 }); + // Playwright image scraper for bot-blocked vendors (Arista/Dell/Edgecore/Fortinet/Extreme) + // Every 3 days at 09:00 — Playwright is slower and heavier than plain HTTP + await boss.schedule("scrape:images:switches:playwright", "0 9 */3 * *", {}, { retryLimit: 1, expireInSeconds: 10800 }); // ══════════════════════════════════════════════════════════════════════ // EBAY ENRICHMENT — every 6h @@ -336,7 +341,8 @@ export async function registerWorkers(boss: PgBoss): Promise { const { 
scrapeUfiSpace } = await import("./scrapers/ufispace"); const { scrapeEdgecore } = await import("./scrapers/edgecore"); const { scrapeSwitchAssets } = await import("./scrapers/switch-assets"); - const { fetchSwitchImages } = await import("./scrapers/switch-image-fetcher"); + const { fetchSwitchImages } = await import("./scrapers/switch-image-fetcher"); + const { fetchSwitchImagesPlaywright } = await import("./scrapers/switch-image-playwright"); const { scrapeFlexoptixCompatibility } = await import("./scrapers/flexoptix-compat"); // ── Prediction signal scrapers ──────────────────────────────────────── const { scrapeSecEdgar } = await import("./scrapers/sec-edgar"); @@ -537,6 +543,15 @@ export async function registerWorkers(boss: PgBoss): Promise { await fetchSwitchImages(); }); + await boss.work("scrape:images:switches:playwright", async () => { + console.log(`[${new Date().toISOString()}] Running: Switch image fetcher (Playwright — bot-blocked vendors)`); + if (!isLoadAcceptable(2.0)) { + console.warn(`[${new Date().toISOString()}] ⚠ Load too high — skipping Playwright image fetch`); + return; + } + await fetchSwitchImagesPlaywright(); + }); + // ── eBay enrichment ─────────────────────────────────────────────────── await boss.work("enrich:ebay-transceivers", async () => { diff --git a/packages/scraper/src/scrapers/switch-image-playwright.ts b/packages/scraper/src/scrapers/switch-image-playwright.ts new file mode 100644 index 0000000..383ea2e --- /dev/null +++ b/packages/scraper/src/scrapers/switch-image-playwright.ts @@ -0,0 +1,305 @@ +/** + * Switch Image Fetcher — Playwright edition for bot-blocked vendors + * + * Vendors that reject plain HTTP bots (403/406) or require JS rendering: + * Arista (HTTP 406), Dell (HTTP 403), Edgecore (HTTP 403), + * Fortinet (JS-rendered), HPE/Aruba (HTTP 403), Extreme Networks (no static URLs) + * + * Strategy: + * 1. Query switches without image_url for JS-blocked vendors + * 2. 
/**
 * Switch Image Fetcher — Playwright edition for bot-blocked vendors
 *
 * Vendors that reject plain HTTP bots (403/406) or require JS rendering:
 *   Arista (HTTP 406), Dell (HTTP 403), Edgecore (HTTP 403),
 *   Fortinet (JS-rendered), HPE/Aruba (HTTP 403), Extreme Networks (no static URLs)
 *
 * Strategy:
 *   1. Query switches without image_url for the vendors listed in URL_BUILDERS
 *   2. Open each product page in headless Chromium (stealth tweaks applied
 *      BEFORE navigation so they affect the document request itself)
 *   3. Extract og:image (or fallback: twitter:image, then the largest
 *      product-looking <img>)
 *   4. Apply the same isGenericImage() filter as the plain HTTP fetcher
 *   5. Write image_url + product_page_url to the switches table
 *
 * Rate limit: maxConcurrency=1, maxRequestsPerMinute=12 (about one request every 5s).
 * Run: npx tsx src/scrapers/switch-image-playwright.ts [--vendor=arista]
 */

import { PlaywrightCrawler } from "crawlee";
import { pool } from "../utils/db";
import { makeCrawleeConfig } from "../utils/crawlee-config";

// ── Stealth settings applied to every page ───────────────────────────────────

const STEALTH_UA =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";

/** Extra request headers sent with every request the page initiates. */
const STEALTH_HEADERS: Record<string, string> = {
  "Accept-Language": "en-US,en;q=0.9",
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
};

// ── Generic marketing image detector (mirrors switch-image-fetcher.ts) ────────

const GENERIC_IMAGE_PATTERNS: RegExp[] = [
  /[-/_]logo[-_.]|\/logos?\//i,
  /cisco[-_]?logo/i,
  /juniper[-_]networks[-_]logo/i,
  /arista[-_]?logo/i,
  /brand[-_]?logo/i,
  /company[-_]?logo/i,
  /\/svg\//i,
  /\.svg(\?|$)/i,
  /naas-homepag/i,
  /al-enterprise.*\/images\/naas/i,
  /og[-_]default/i,
  /default[-_](?:og|social|share|image)/i,
  /site[-_](?:default|image|og)/i,
  /social[-_](?:default|share)/i,
  /twitter[-_]default/i,
  /default[-_]thumbnail/i,
  /\/homepage\//i,
  /hero[-_](?:banner|bg|background|image)/i,
  /banner[-_](?:bg|background)/i,
  /lifestyle/i,
  /stock[-_]?photo/i,
  /placeholder/i,
  /no[-_]?image/i,
  /image[-_]?not[-_]?found/i,
  /\/fallback[/-]/i,
  /missing[-_]image/i,
  /\/press[-_]kit/i,
  /\/media[-_]kit/i,
  // Vendor-specific brand icons
  /open-graph\.gif/i,
  /social[-_]icon/i,
  /favicon/i,
  /og[-_]image[-_][0-9]+x[0-9]+\./i, // e.g. og-image-1200x630 → family-level generic
];

/** Returns true when `url` matches any known logo/placeholder/marketing pattern. */
function isGenericImage(url: string): boolean {
  return GENERIC_IMAGE_PATTERNS.some((re) => re.test(url));
}

/**
 * Resolves a scraped image reference against the page it was found on.
 *
 * og:image values are frequently relative or protocol-relative; storing them
 * verbatim would leave unusable URLs in the database. Anything that does not
 * resolve to an http(s) URL is treated as "no image".
 */
function toAbsoluteHttpUrl(candidate: string | null, baseUrl: string): string | null {
  const trimmed = candidate?.trim();
  if (!trimmed) return null;
  try {
    const resolved = new URL(trimmed, baseUrl);
    return resolved.protocol === "http:" || resolved.protocol === "https:" ? resolved.href : null;
  } catch {
    return null;
  }
}

// ── Product page URL builders ─────────────────────────────────────────────────

/** Lower-cases `model` and replaces every character outside [a-z0-9-] with "-". */
function slugify(model: string): string {
  return model.toLowerCase().replace(/[^a-z0-9-]/g, "-");
}

function buildAristaUrl(model: string): string | null {
  // 7060X6-64PE → /en/products/7060x6-64pe
  return `https://www.arista.com/en/products/${slugify(model)}`;
}

function buildDellUrl(model: string): string | null {
  // PowerSwitch Z9332F-ON → keyword search on the Dell networking-switches listing
  const cleanModel = model.replace(/^PowerSwitch\s+/i, "").trim();
  return `https://www.dell.com/en-us/shop/dell-networking-switches/sc/networking-switches?appliedRefinements=DP_SEARCH_RESULTS_KEYWORDS~${encodeURIComponent(cleanModel)}`;
}

function buildEdgecoreUrl(model: string): string | null {
  return `https://www.edge-core.com/product/${slugify(model)}.html`;
}

function buildFortinetUrl(model: string): string | null {
  // FortiSwitch 424E → fortiswitch-424e (whitespace becomes "-", other punctuation is dropped)
  const slug = model.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
  return `https://www.fortinet.com/products/fortiswitch/${slug}`;
}

function buildHpeArubaUrl(model: string): string | null {
  return `https://www.arubanetworks.com/products/switches/${slugify(model)}/`;
}

function buildExtremeUrl(model: string): string | null {
  // Extreme uses Coveo JS search — X6xx models get their series page,
  // everything else (SLX, 8720, 5520, …) falls back to the switching landing page.
  const series = /^x6\d*/i.exec(model)?.[0]?.toLowerCase();
  if (series) {
    return `https://www.extremenetworks.com/products/switching/${series}-series/`;
  }
  return `https://www.extremenetworks.com/products/switching/`;
}

/** Vendor slug → product page URL builder. The keys double as the default vendor filter. */
const URL_BUILDERS: Record<string, (model: string) => string | null> = {
  arista: buildAristaUrl,
  dell: buildDellUrl,
  edgecore: buildEdgecoreUrl,
  fortinet: buildFortinetUrl,
  "hpe-aruba": buildHpeArubaUrl,
  extreme: buildExtremeUrl,
};

// ── Request data attached to each crawl URL ──────────────────────────────────

interface SwitchCrawlData {
  switchId: string;
  model: string;
  vendorName: string;
  vendorSlug: string;
  productPageUrl: string;
}

/** Row shape returned by the "switches missing an image" query. */
interface SwitchRow {
  id: string;
  model: string;
  vendor_slug: string;
  vendor_name: string;
  product_page_url: string | null;
}

// ── Main scraper ──────────────────────────────────────────────────────────────

/**
 * Crawls vendor product pages with headless Chromium and stores the product
 * image URL for every switch that does not have one yet.
 *
 * @param targetVendorSlug Optional vendor slug to restrict the run to a single
 *   vendor. When omitted, every vendor that has an entry in URL_BUILDERS is processed.
 * @returns Resolves once the crawl has finished; per-switch results are logged
 *   and written to the `switches` table.
 */
export async function fetchSwitchImagesPlaywright(targetVendorSlug?: string): Promise<void> {
  console.log("=== Switch Image Fetcher (Playwright) ===\n");

  // The slug may come straight from the command line, so it is passed as a
  // bound parameter instead of being spliced into the SQL text.
  const vendorSlugs = targetVendorSlug ? [targetVendorSlug] : Object.keys(URL_BUILDERS);

  const { rows } = await pool.query<SwitchRow>(
    `
    SELECT sw.id, sw.model, sw.product_page_url,
           v.slug AS vendor_slug, v.name AS vendor_name
    FROM switches sw
    JOIN vendors v ON v.id = sw.vendor_id
    WHERE (sw.image_url IS NULL OR sw.image_url = '')
      AND v.slug = ANY($1::text[])
    ORDER BY v.slug, sw.model
    `,
    [vendorSlugs],
  );

  if (rows.length === 0) {
    console.log(" All target switches already have images.\n");
    return;
  }

  console.log(` ${rows.length} switches need images (Playwright vendors)\n`);

  const requests: Array<{ url: string; uniqueKey: string; userData: SwitchCrawlData }> = [];

  for (const row of rows) {
    const builder = URL_BUILDERS[row.vendor_slug];
    // `||` on purpose: an empty stored URL counts as missing.
    const productUrl = row.product_page_url || (builder ? builder(row.model) : null);
    if (!productUrl) {
      console.log(` [SKIP] ${row.vendor_name} ${row.model} — no URL`);
      continue;
    }
    requests.push({
      url: productUrl,
      // Several switches can share one URL (e.g. Extreme series/landing pages).
      // Without a per-switch key the request queue would keep only the first
      // of them and silently drop the rest.
      uniqueKey: `${row.id}|${productUrl}`,
      userData: {
        switchId: row.id,
        model: row.model,
        vendorName: row.vendor_name,
        vendorSlug: row.vendor_slug,
        productPageUrl: productUrl,
      },
    });
  }

  if (requests.length === 0) {
    console.log(" Nothing to crawl.\n");
    return;
  }

  let found = 0;
  let missed = 0;
  let errors = 0;

  const crawler = new PlaywrightCrawler(
    {
      maxConcurrency: 1, // one at a time — server-friendly
      maxRequestsPerMinute: 12, // ~5s per request minimum
      requestHandlerTimeoutSecs: 45,
      navigationTimeoutSecs: 30,
      headless: true,
      launchContext: {
        launchOptions: {
          args: [
            "--no-sandbox",
            "--disable-setuid-sandbox",
            "--disable-blink-features=AutomationControlled",
            "--disable-infobars",
            "--window-size=1920,1080",
            // Makes the browser send STEALTH_UA as its User-Agent header.
            // NOTE(review): a context-level user agent configured elsewhere
            // (e.g. by makeCrawleeConfig) would take precedence — confirm.
            `--user-agent=${STEALTH_UA}`,
          ],
        },
      },

      preNavigationHooks: [
        async ({ page }, gotoOptions) => {
          // Headers and navigator overrides must be installed before goto():
          // applied afterwards they miss the document request and any
          // bot-detection script that runs during page load.
          await page.setExtraHTTPHeaders(STEALTH_HEADERS);
          await page.addInitScript((ua) => {
            Object.defineProperty(navigator, "userAgent", { value: ua, configurable: true });
            Object.defineProperty(navigator, "webdriver", { value: false, configurable: true });
          }, STEALTH_UA);

          if (gotoOptions) {
            gotoOptions.waitUntil = "domcontentloaded";
          }
        },
      ],

      async requestHandler({ request, page }) {
        const data = request.userData as SwitchCrawlData;

        // Wait for the page to settle (JS rendering). Best-effort: pages that
        // never go idle are still inspected after the timeout.
        await page.waitForLoadState("networkidle", { timeout: 20_000 }).catch(() => {});

        // Runs inside the browser — must not reference anything from this module.
        const rawImageUrl: string | null = await page.evaluate(() => {
          const ogMeta = document.querySelector<HTMLMetaElement>('meta[property="og:image"]');
          if (ogMeta?.content) return ogMeta.content;

          const twitterMeta = document.querySelector<HTMLMetaElement>('meta[name="twitter:image"]');
          if (twitterMeta?.content) return twitterMeta.content;

          // Fallback: largest product-looking image
          const best = Array.from(document.querySelectorAll("img"))
            .map((img) => ({
              src: img.src || img.getAttribute("data-src") || "",
              width: img.naturalWidth,
              height: img.naturalHeight,
            }))
            .filter(
              ({ src, width, height }) =>
                src.startsWith("http") &&
                /\.(jpg|jpeg|png|webp)/i.test(src) &&
                width > 300 &&
                height > 200 &&
                (src.includes("product") ||
                  src.includes("switch") ||
                  src.includes("router") ||
                  src.includes("hardware")),
            )
            .sort((a, b) => b.width * b.height - a.width * a.height)[0];
          return best?.src ?? null;
        });

        const imageUrl = toAbsoluteHttpUrl(rawImageUrl, page.url());

        if (!imageUrl || isGenericImage(imageUrl)) {
          console.log(
            ` [MISS] ${data.vendorName} ${data.model} — no product image (${(imageUrl ?? rawImageUrl)?.slice(0, 60) ?? "null"})`,
          );
          missed++;
          // Nothing is written on a miss: the URL may be a builder guess that
          // was never confirmed to be this switch's product page.
          return;
        }

        await pool.query(
          `UPDATE switches
              SET image_url = $2,
                  product_page_url = COALESCE(product_page_url, $3),
                  assets_scraped_at = NOW()
            WHERE id = $1`,
          [data.switchId, imageUrl, request.url],
        );
        console.log(` [OK] ${data.vendorName} ${data.model} → ${imageUrl.slice(0, 80)}`);
        found++;
      },

      async failedRequestHandler({ request }) {
        const data = request.userData as SwitchCrawlData;
        console.log(` [FAIL] ${data.vendorName} ${data.model} — ${request.errorMessages?.[0] ?? "unknown error"}`);
        errors++;
      },
    },
    makeCrawleeConfig("switch-images-playwright"),
  );

  await crawler.run(requests);

  console.log(`\n=== Playwright Image Scraper Complete ===`);
  console.log(` Images found: ${found}`);
  console.log(` Missed: ${missed}`);
  if (errors > 0) console.warn(` Errors: ${errors}`);
}

// CLI entry point: `npx tsx src/scrapers/switch-image-playwright.ts [--vendor=<slug>]`
if (require.main === module) {
  const vendor = process.argv.find((a) => a.startsWith("--vendor="))?.split("=")[1];
  fetchSwitchImagesPlaywright(vendor)
    .then(() => pool.end())
    .catch((err: unknown) => {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    });
}