feat: Playwright image scraper for bot-blocked vendors (Arista/Dell/Edgecore/Fortinet/Extreme)

This commit is contained in:
Rene Fichtmueller 2026-04-21 06:16:05 +02:00
parent 653824f23b
commit 18a9e1346e
2 changed files with 321 additions and 1 deletions

View File

@ -101,6 +101,8 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
"scrape:assets:switches",
// ── Switch og:image fetcher (daily, after switch-assets) ──────────
"scrape:images:switches",
// ── Playwright image fetcher for bot-blocked vendors (every 3d) ───
"scrape:images:switches:playwright",
// ── eBay enrichment (every 6h) ────────────────────────────────────
"enrich:ebay-transceivers",
"enrich:ebay-switches",
@ -241,6 +243,9 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
await boss.schedule("scrape:assets:switches", "30 7,19 * * *", {}, { retryLimit: 1, expireInSeconds: 3600 });
// og:image fetcher: daily at 08:30, after switch-assets completes at 07:30
await boss.schedule("scrape:images:switches", "30 8 * * *", {}, { retryLimit: 1, expireInSeconds: 7200 });
// Playwright image scraper for bot-blocked vendors (Arista/Dell/Edgecore/Fortinet/Extreme)
// Every third day of the month (1st, 4th, 7th, …, 31st) at 09:00; the cadence restarts each month — Playwright is slower and heavier than plain HTTP
await boss.schedule("scrape:images:switches:playwright", "0 9 */3 * *", {}, { retryLimit: 1, expireInSeconds: 10800 });
// ══════════════════════════════════════════════════════════════════════
// EBAY ENRICHMENT — every 6h
@ -337,6 +342,7 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
const { scrapeEdgecore } = await import("./scrapers/edgecore");
const { scrapeSwitchAssets } = await import("./scrapers/switch-assets");
const { fetchSwitchImages } = await import("./scrapers/switch-image-fetcher");
const { fetchSwitchImagesPlaywright } = await import("./scrapers/switch-image-playwright");
const { scrapeFlexoptixCompatibility } = await import("./scrapers/flexoptix-compat");
// ── Prediction signal scrapers ────────────────────────────────────────
const { scrapeSecEdgar } = await import("./scrapers/sec-edgar");
@ -537,6 +543,15 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
await fetchSwitchImages();
});
await boss.work("scrape:images:switches:playwright", async () => {
console.log(`[${new Date().toISOString()}] Running: Switch image fetcher (Playwright — bot-blocked vendors)`);
if (!isLoadAcceptable(2.0)) {
console.warn(`[${new Date().toISOString()}] ⚠ Load too high — skipping Playwright image fetch`);
return;
}
await fetchSwitchImagesPlaywright();
});
// ── eBay enrichment ───────────────────────────────────────────────────
await boss.work("enrich:ebay-transceivers", async () => {

View File

@ -0,0 +1,305 @@
/**
* Switch Image Fetcher — Playwright edition for bot-blocked vendors
*
* Vendors that reject plain HTTP bots (403/406) or require JS rendering:
* Arista (HTTP 406), Dell (HTTP 403), Edgecore (HTTP 403),
* Fortinet (JS-rendered), HPE/Aruba (HTTP 403), Extreme Networks (no static URLs)
*
* Strategy:
* 1. Query switches without image_url for JS-blocked vendors
* 2. Open each product page in headless Chromium (stealth mode)
* 3. Extract og:image (or fallback: first large product <img>)
* 4. Apply same isGenericImage() filter as the plain HTTP fetcher
* 5. Write image_url + product_page_url to switches table
*
* Rate limit: maxConcurrency=1, at most 12 requests per minute (~5s between requests).
* Run: npx tsx src/scrapers/switch-image-playwright.ts [--vendor=arista]
*/
import { PlaywrightCrawler } from "crawlee";
import { pool } from "../utils/db";
import { makeCrawleeConfig } from "../utils/crawlee-config";
// ── Stealth user agent ────────────────────────────────────────────────────────
/**
 * Desktop Chrome 131 (macOS) user-agent string that the crawler uses to present
 * itself as a regular browser instead of a headless automation client.
 */
const STEALTH_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
// ── Generic marketing image detector (mirrors switch-image-fetcher.ts) ────────
/**
 * URL patterns that identify images which are NOT a photo of a specific product
 * (logos, vector art, site-wide social-share defaults, banners, placeholders…).
 * All patterns are case-insensitive and are matched against the full image URL.
 */
const GENERIC_IMAGE_PATTERNS: RegExp[] = [
  // Logos
  /[-/_]logo[-_.]|\/logos?\//i,
  /cisco[-_]?logo/i,
  /juniper[-_]networks[-_]logo/i,
  /arista[-_]?logo/i,
  /brand[-_]?logo/i,
  /company[-_]?logo/i,
  // Vector artwork
  /\/svg\//i,
  /\.svg(\?|$)/i,
  // Site-specific generic images
  /naas-homepag/i,
  /al-enterprise.*\/images\/naas/i,
  // Site-wide Open Graph / social-share defaults
  /og[-_]default/i,
  /default[-_](?:og|social|share|image)/i,
  /site[-_](?:default|image|og)/i,
  /social[-_](?:default|share)/i,
  /twitter[-_]default/i,
  /default[-_]thumbnail/i,
  // Homepage, hero, banner and lifestyle imagery
  /\/homepage\//i,
  /hero[-_](?:banner|bg|background|image)/i,
  /banner[-_](?:bg|background)/i,
  /lifestyle/i,
  /stock[-_]?photo/i,
  // Placeholders and "missing image" stand-ins
  /placeholder/i,
  /no[-_]?image/i,
  /image[-_]?not[-_]?found/i,
  /\/fallback[/-]/i,
  /missing[-_]image/i,
  // Press / media kits
  /\/press[-_]kit/i,
  /\/media[-_]kit/i,
  // Vendor-specific brand icons
  /open-graph\.gif/i,
  /social[-_]icon/i,
  /favicon/i,
  /og[-_]image[-_][0-9]+x[0-9]+\./i, // e.g. og-image-1200x630 → family-level generic
];

/**
 * Tells whether an image URL looks like generic marketing artwork rather than
 * a product photo.
 *
 * @param url Image URL to classify.
 * @returns `true` as soon as any entry of `GENERIC_IMAGE_PATTERNS` matches, otherwise `false`.
 */
function isGenericImage(url: string): boolean {
  for (const pattern of GENERIC_IMAGE_PATTERNS) {
    if (pattern.test(url)) {
      return true;
    }
  }
  return false;
}
// ── Product page URL builders ─────────────────────────────────────────────────
/**
 * Guesses the Arista product page for a model, e.g.
 * `7060X6-64PE` → `https://www.arista.com/en/products/7060x6-64pe`.
 *
 * The model is lower-cased and every character outside `[a-z0-9-]` becomes `-`.
 *
 * @param model Switch model name as stored in the database.
 * @returns The guessed product page URL (this builder never returns `null`).
 */
function buildAristaUrl(model: string): string | null {
  const lowered = model.toLowerCase();
  const pathSegment = lowered.replace(/[^a-z0-9-]/g, "-");
  return "https://www.arista.com/en/products/" + pathSegment;
}
/**
 * Builds a Dell networking-switch listing URL filtered by the model keyword,
 * e.g. `PowerSwitch Z9332F-ON` → `…KEYWORDS~Z9332F-ON`.
 *
 * A leading "PowerSwitch " prefix (any case) is dropped and the remainder is
 * trimmed and URL-encoded into the `appliedRefinements` query parameter.
 *
 * @param model Switch model name as stored in the database.
 * @returns The keyword-search URL (this builder never returns `null`).
 */
function buildDellUrl(model: string): string | null {
  // PowerSwitch Z9332F-ON → Z9332F-ON
  const cleanModel = model.replace(/^PowerSwitch\s+/i, "").trim();
  return `https://www.dell.com/en-us/shop/dell-networking-switches/sc/networking-switches?appliedRefinements=DP_SEARCH_RESULTS_KEYWORDS~${encodeURIComponent(cleanModel)}`;
}
/**
 * Guesses the Edgecore product page for a model:
 * `https://www.edge-core.com/product/<slug>.html`, where the slug is the
 * lower-cased model with every character outside `[a-z0-9-]` replaced by `-`.
 *
 * @param model Switch model name as stored in the database.
 * @returns The guessed product page URL (this builder never returns `null`).
 */
function buildEdgecoreUrl(model: string): string | null {
  const pageName = model.toLowerCase().replace(/[^a-z0-9-]/g, "-");
  return "https://www.edge-core.com/product/" + pageName + ".html";
}
/**
 * Guesses the Fortinet FortiSwitch product page for a model, e.g.
 * `FortiSwitch 424E` → `…/products/fortiswitch/fortiswitch-424e`.
 *
 * Unlike the other builders, whitespace runs become a single `-` and any other
 * character outside `[a-z0-9-]` is removed rather than replaced.
 *
 * @param model Switch model name as stored in the database.
 * @returns The guessed product page URL (this builder never returns `null`).
 */
function buildFortinetUrl(model: string): string | null {
  const hyphenated = model.toLowerCase().replace(/\s+/g, "-");
  const pathSegment = hyphenated.replace(/[^a-z0-9-]/g, "");
  return "https://www.fortinet.com/products/fortiswitch/" + pathSegment;
}
/**
 * Guesses the HPE Aruba product page for a model:
 * `https://www.arubanetworks.com/products/switches/<slug>/`, where the slug is
 * the lower-cased model with every character outside `[a-z0-9-]` replaced by `-`.
 *
 * @param model Switch model name as stored in the database.
 * @returns The guessed product page URL (this builder never returns `null`).
 */
function buildHpeArubaUrl(model: string): string | null {
  const lowered = model.toLowerCase();
  const pathSegment = lowered.replace(/[^a-z0-9-]/g, "-");
  return "https://www.arubanetworks.com/products/switches/" + pathSegment + "/";
}
/**
 * Picks the Extreme Networks page to crawl for a model.
 *
 * Extreme uses Coveo JS search, so there is no static per-model URL to build:
 *  - models starting with "x6" (any case) map to their series page, e.g.
 *    `X690-48x` → `…/products/switching/x690-series/`;
 *  - every other model (SLX, 8720, 5520, …) maps to the switching index page.
 *
 * @param model Switch model name as stored in the database.
 * @returns The series page or the switching index URL (this builder never returns `null`).
 */
function buildExtremeUrl(model: string): string | null {
  const switchingIndex = "https://www.extremenetworks.com/products/switching/";
  if (!model.toLowerCase().startsWith("x6")) {
    // The former SLX/8720/5520 branch returned exactly this URL as well, so a
    // single fall-through covers all non-"x6" models.
    return switchingIndex;
  }
  // Series = leading "X" plus its digits, lower-cased (X690-48x → x690).
  const series = /^(X\d+)/i.exec(model)?.[1]?.toLowerCase() ?? "";
  return `${switchingIndex}${series}-series/`;
}
/** Maps a switch model name to the product page to crawl, or `null` when none can be built. */
type VendorUrlBuilder = (m: string) => string | null;

/**
 * Product-page URL builder per vendor slug. The key set also defines which
 * vendors are processed when no explicit vendor is requested.
 */
const URL_BUILDERS: Record<string, VendorUrlBuilder> = {
  "arista": buildAristaUrl,
  "dell": buildDellUrl,
  "edgecore": buildEdgecoreUrl,
  "fortinet": buildFortinetUrl,
  "hpe-aruba": buildHpeArubaUrl,
  "extreme": buildExtremeUrl,
};
// ── Request data attached to each crawl URL ──────────────────────────────────
/** Payload attached as `userData` to every crawl request so the handlers know which switch row a page belongs to. */
interface SwitchCrawlData {
/** `id` of the row in the `switches` table that is updated with the result. */
switchId: string;
/** Switch model name; used for URL building and log output. */
model: string;
/** Vendor display name; used for log output. */
vendorName: string;
/** Vendor slug; the key that selected the URL builder. */
vendorSlug: string;
/** URL that is crawled: the stored `product_page_url` if present, otherwise the builder's guess. */
productPageUrl: string;
}
// ── Main scraper ──────────────────────────────────────────────────────────────
/**
 * Fetches product images for switches of bot-blocked vendors with headless Chromium.
 *
 * Selects every switch whose `image_url` is NULL or empty for the requested
 * vendor(s), opens its product page (the stored `product_page_url`, or the URL
 * guessed by the matching `URL_BUILDERS` entry), extracts `og:image`,
 * `twitter:image` or the largest product-looking `<img>`, rejects generic
 * marketing artwork via `isGenericImage()`, and writes `image_url`,
 * `product_page_url` and `assets_scraped_at` to the `switches` table.
 *
 * @param targetVendorSlug Optional vendor slug to restrict the run to. When
 *   omitted (or empty) every slug that has an entry in `URL_BUILDERS` is processed.
 * @returns Resolves once the crawl has finished. Failed requests are logged by
 *   the failed-request handler and counted in the summary; a failure of the
 *   initial database query rejects the returned promise.
 */
export async function fetchSwitchImagesPlaywright(targetVendorSlug?: string): Promise<void> {
  console.log("=== Switch Image Fetcher (Playwright) ===\n");

  // The slug can come straight from the command line, so it is bound as a query
  // parameter instead of being interpolated into the SQL text.
  const vendorSlugs = targetVendorSlug ? [targetVendorSlug] : Object.keys(URL_BUILDERS);

  const { rows } = await pool.query<{
    id: string;
    model: string;
    vendor_slug: string;
    vendor_name: string;
    product_page_url: string | null;
  }>(
    `
    SELECT sw.id, sw.model, sw.product_page_url,
           v.slug AS vendor_slug, v.name AS vendor_name
    FROM switches sw
    JOIN vendors v ON v.id = sw.vendor_id
    WHERE (sw.image_url IS NULL OR sw.image_url = '')
      AND v.slug = ANY($1::text[])
    ORDER BY v.slug, sw.model
    `,
    [vendorSlugs],
  );

  if (rows.length === 0) {
    console.log(" All target switches already have images.\n");
    return;
  }
  console.log(` ${rows.length} switches need images (Playwright vendors)\n`);

  const requests: Array<{ url: string; uniqueKey: string; userData: SwitchCrawlData }> = [];
  for (const row of rows) {
    const builder = URL_BUILDERS[row.vendor_slug];
    const productUrl = row.product_page_url || (builder ? builder(row.model) : null);
    if (!productUrl) {
      console.log(` [SKIP] ${row.vendor_name} ${row.model} — no URL`);
      continue;
    }
    requests.push({
      url: productUrl,
      // Crawlee de-duplicates requests by uniqueKey, which defaults to the URL.
      // Several switches can share one page (e.g. a series or index page), so
      // the switch id is part of the key to keep one request per switch.
      uniqueKey: `${row.id}|${productUrl}`,
      userData: {
        switchId: row.id,
        model: row.model,
        vendorName: row.vendor_name,
        vendorSlug: row.vendor_slug,
        productPageUrl: productUrl,
      },
    });
  }

  if (requests.length === 0) {
    console.log(" Nothing to crawl.\n");
    return;
  }

  let found = 0;
  let missed = 0;
  let errors = 0;

  const crawler = new PlaywrightCrawler(
    {
      maxConcurrency: 1, // one at a time — server-friendly
      maxRequestsPerMinute: 12, // ~5s per request minimum
      requestHandlerTimeoutSecs: 45,
      navigationTimeoutSecs: 30,
      headless: true,
      launchContext: {
        launchOptions: {
          args: [
            "--no-sandbox",
            "--disable-setuid-sandbox",
            "--disable-blink-features=AutomationControlled",
            "--disable-infobars",
            "--window-size=1920,1080",
          ],
        },
      },
      preNavigationHooks: [
        async ({ page }, gotoOptions) => {
          // Stealth setup has to happen BEFORE navigation: headers set after
          // page.goto() never reach the document request, and navigator
          // overrides applied after load are invisible to bot checks that run
          // while the page loads.
          await page.setExtraHTTPHeaders({
            "User-Agent": STEALTH_UA,
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
          });
          // Keep this callback an anonymous inline arrow without named inner
          // functions: it is serialized and executed inside the browser.
          await page.addInitScript((ua) => {
            Object.defineProperty(navigator, "userAgent", { value: ua, configurable: true });
            Object.defineProperty(navigator, "webdriver", { value: false, configurable: true });
          }, STEALTH_UA);
          if (gotoOptions) {
            gotoOptions.waitUntil = "domcontentloaded";
          }
        },
      ],
      async requestHandler({ request, page }) {
        const data = request.userData as SwitchCrawlData;

        // Best effort: give JS-rendered pages time to settle, but pages that
        // never become network-idle are still inspected after the timeout.
        await page.waitForLoadState("networkidle", { timeout: 20_000 }).catch(() => {});

        // Runs inside the browser; only anonymous inline callbacks are used.
        const rawImageUrl: string | null = await page.evaluate(() => {
          const ogMeta = document.querySelector<HTMLMetaElement>('meta[property="og:image"]');
          if (ogMeta?.content) return ogMeta.content;
          const twitterMeta = document.querySelector<HTMLMetaElement>('meta[name="twitter:image"]');
          if (twitterMeta?.content) return twitterMeta.content;
          // Fallback: largest product-looking image. The same URL is used for
          // filtering and for the result, so a data-src match is not lost.
          const best = Array.from(document.querySelectorAll<HTMLImageElement>("img"))
            .map((img) => ({
              src: img.src || img.getAttribute("data-src") || "",
              width: img.naturalWidth,
              height: img.naturalHeight,
            }))
            .filter((img) =>
              img.src.startsWith("http") &&
              /\.(jpg|jpeg|png|webp)/i.test(img.src) &&
              img.width > 300 &&
              img.height > 200 &&
              (img.src.includes("product") || img.src.includes("switch") || img.src.includes("router") || img.src.includes("hardware")))
            .sort((a, b) => (b.width * b.height) - (a.width * a.height))[0];
          return best?.src ?? null;
        });

        // Meta tags may carry relative or protocol-relative URLs; store an
        // absolute URL so the value is usable outside the page context.
        let imageUrl: string | null = null;
        if (rawImageUrl) {
          try {
            imageUrl = new URL(rawImageUrl, page.url()).href;
          } catch {
            imageUrl = null;
          }
        }

        if (!imageUrl || isGenericImage(imageUrl)) {
          console.log(` [MISS] ${data.vendorName} ${data.model} — no product image (${imageUrl?.slice(0, 60) ?? "null"})`);
          missed++;
          // NOTE(review): the original code intended to "save product_page_url
          // even on miss", but guarded the UPDATE with `!data.productPageUrl`,
          // which is never true here, so nothing was ever written. That
          // no-write behavior is kept: persisting a guessed (possibly 404)
          // URL should be a deliberate decision — confirm before enabling.
          return;
        }

        await pool.query(
          `UPDATE switches
           SET image_url = $2,
               product_page_url = COALESCE(product_page_url, $3),
               assets_scraped_at = NOW()
           WHERE id = $1`,
          [data.switchId, imageUrl, request.url],
        );
        console.log(` [OK] ${data.vendorName} ${data.model} → ${imageUrl.slice(0, 80)}`);
        found++;
      },
      async failedRequestHandler({ request }) {
        const data = request.userData as SwitchCrawlData;
        console.log(` [FAIL] ${data.vendorName} ${data.model} — ${request.errorMessages?.[0] ?? "unknown error"}`);
        errors++;
      },
    },
    makeCrawleeConfig("switch-images-playwright"),
  );

  await crawler.run(requests);

  console.log(`\n=== Playwright Image Scraper Complete ===`);
  console.log(` Images found: ${found}`);
  console.log(` Missed: ${missed}`);
  if (errors > 0) console.warn(` Errors: ${errors}`);
}
// CLI entry point: `npx tsx src/scrapers/switch-image-playwright.ts [--vendor=<slug>]`.
// Runs the scraper once, closes the DB pool on success, and exits with code 1
// (after logging) when anything fails.
if (require.main === module) {
  const vendorFlag = process.argv.find((arg) => arg.startsWith("--vendor="));
  const vendor = vendorFlag?.split("=")[1];

  void (async () => {
    try {
      await fetchSwitchImagesPlaywright(vendor);
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      // Not awaited: the pool shutdown is started and the process exits right away.
      void pool.end();
      process.exit(1);
    }
  })();
}