feat: Playwright image scraper for bot-blocked vendors (Arista/Dell/Edgecore/Fortinet/Extreme)
This commit is contained in:
parent
892da2bcf5
commit
2742141c8b
@ -101,6 +101,8 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
|||||||
"scrape:assets:switches",
|
"scrape:assets:switches",
|
||||||
// ── Switch og:image fetcher (daily, after switch-assets) ──────────
|
// ── Switch og:image fetcher (daily, after switch-assets) ──────────
|
||||||
"scrape:images:switches",
|
"scrape:images:switches",
|
||||||
|
// ── Playwright image fetcher for bot-blocked vendors (every 3d) ───
|
||||||
|
"scrape:images:switches:playwright",
|
||||||
// ── eBay enrichment (every 6h) ────────────────────────────────────
|
// ── eBay enrichment (every 6h) ────────────────────────────────────
|
||||||
"enrich:ebay-transceivers",
|
"enrich:ebay-transceivers",
|
||||||
"enrich:ebay-switches",
|
"enrich:ebay-switches",
|
||||||
@ -241,6 +243,9 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
|||||||
await boss.schedule("scrape:assets:switches", "30 7,19 * * *", {}, { retryLimit: 1, expireInSeconds: 3600 });
|
await boss.schedule("scrape:assets:switches", "30 7,19 * * *", {}, { retryLimit: 1, expireInSeconds: 3600 });
|
||||||
// og:image fetcher: daily at 08:30, after switch-assets completes at 07:30
|
// og:image fetcher: daily at 08:30, after switch-assets completes at 07:30
|
||||||
await boss.schedule("scrape:images:switches", "30 8 * * *", {}, { retryLimit: 1, expireInSeconds: 7200 });
|
await boss.schedule("scrape:images:switches", "30 8 * * *", {}, { retryLimit: 1, expireInSeconds: 7200 });
|
||||||
|
// Playwright image scraper for bot-blocked vendors (Arista/Dell/Edgecore/Fortinet/Extreme)
|
||||||
|
// Every 3 days at 09:00 — Playwright is slower and heavier than plain HTTP
|
||||||
|
await boss.schedule("scrape:images:switches:playwright", "0 9 */3 * *", {}, { retryLimit: 1, expireInSeconds: 10800 });
|
||||||
|
|
||||||
// ══════════════════════════════════════════════════════════════════════
|
// ══════════════════════════════════════════════════════════════════════
|
||||||
// EBAY ENRICHMENT — every 6h
|
// EBAY ENRICHMENT — every 6h
|
||||||
@ -336,7 +341,8 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
|||||||
const { scrapeUfiSpace } = await import("./scrapers/ufispace");
|
const { scrapeUfiSpace } = await import("./scrapers/ufispace");
|
||||||
const { scrapeEdgecore } = await import("./scrapers/edgecore");
|
const { scrapeEdgecore } = await import("./scrapers/edgecore");
|
||||||
const { scrapeSwitchAssets } = await import("./scrapers/switch-assets");
|
const { scrapeSwitchAssets } = await import("./scrapers/switch-assets");
|
||||||
const { fetchSwitchImages } = await import("./scrapers/switch-image-fetcher");
|
const { fetchSwitchImages } = await import("./scrapers/switch-image-fetcher");
|
||||||
|
const { fetchSwitchImagesPlaywright } = await import("./scrapers/switch-image-playwright");
|
||||||
const { scrapeFlexoptixCompatibility } = await import("./scrapers/flexoptix-compat");
|
const { scrapeFlexoptixCompatibility } = await import("./scrapers/flexoptix-compat");
|
||||||
// ── Prediction signal scrapers ────────────────────────────────────────
|
// ── Prediction signal scrapers ────────────────────────────────────────
|
||||||
const { scrapeSecEdgar } = await import("./scrapers/sec-edgar");
|
const { scrapeSecEdgar } = await import("./scrapers/sec-edgar");
|
||||||
@ -537,6 +543,15 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
|||||||
await fetchSwitchImages();
|
await fetchSwitchImages();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
await boss.work("scrape:images:switches:playwright", async () => {
|
||||||
|
console.log(`[${new Date().toISOString()}] Running: Switch image fetcher (Playwright — bot-blocked vendors)`);
|
||||||
|
if (!isLoadAcceptable(2.0)) {
|
||||||
|
console.warn(`[${new Date().toISOString()}] ⚠ Load too high — skipping Playwright image fetch`);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
await fetchSwitchImagesPlaywright();
|
||||||
|
});
|
||||||
|
|
||||||
// ── eBay enrichment ───────────────────────────────────────────────────
|
// ── eBay enrichment ───────────────────────────────────────────────────
|
||||||
|
|
||||||
await boss.work("enrich:ebay-transceivers", async () => {
|
await boss.work("enrich:ebay-transceivers", async () => {
|
||||||
|
|||||||
305
packages/scraper/src/scrapers/switch-image-playwright.ts
Normal file
305
packages/scraper/src/scrapers/switch-image-playwright.ts
Normal file
@ -0,0 +1,305 @@
|
|||||||
|
/**
|
||||||
|
* Switch Image Fetcher — Playwright edition for bot-blocked vendors
|
||||||
|
*
|
||||||
|
* Vendors that reject plain HTTP bots (403/406) or require JS rendering:
|
||||||
|
* Arista (HTTP 406), Dell (HTTP 403), Edgecore (HTTP 403),
|
||||||
|
* Fortinet (JS-rendered), HPE/Aruba (HTTP 403), Extreme Networks (no static URLs)
|
||||||
|
*
|
||||||
|
* Strategy:
|
||||||
|
* 1. Query switches without image_url for JS-blocked vendors
|
||||||
|
* 2. Open each product page in headless Chromium (stealth mode)
|
||||||
|
* 3. Extract og:image (or fallback: first large product <img>)
|
||||||
|
* 4. Apply same isGenericImage() filter as the plain HTTP fetcher
|
||||||
|
* 5. Write image_url + product_page_url to switches table
|
||||||
|
*
|
||||||
|
* Rate limit: maxConcurrency=1, at most 12 requests per minute (~5s between requests).
|
||||||
|
* Run: npx tsx src/scrapers/switch-image-playwright.ts [--vendor=arista]
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { PlaywrightCrawler } from "crawlee";
|
||||||
|
import { pool } from "../utils/db";
|
||||||
|
import { makeCrawleeConfig } from "../utils/crawlee-config";
|
||||||
|
|
||||||
|
// ── Stealth headers injected into every page ─────────────────────────────────
|
||||||
|
|
||||||
|
/** Desktop Chrome (macOS) user-agent string presented to the scraped pages. */
const STEALTH_UA =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
|
||||||
|
|
||||||
|
// ── Generic marketing image detector (mirrors switch-image-fetcher.ts) ────────
|
||||||
|
|
||||||
|
/**
 * URL patterns that mark an image as generic (logo, site-wide default,
 * placeholder, banner, …) rather than a picture of a specific product.
 * All patterns are case-insensitive and are tested against the full URL.
 */
const GENERIC_IMAGE_PATTERNS: RegExp[] = [
  // Logos
  /[-/_]logo[-_.]|\/logos?\//i,
  /cisco[-_]?logo/i,
  /juniper[-_]networks[-_]logo/i,
  /arista[-_]?logo/i,
  /brand[-_]?logo/i,
  /company[-_]?logo/i,
  // SVG assets
  /\/svg\//i,
  /\.svg(\?|$)/i,
  // Specific marketing images
  /naas-homepag/i,
  /al-enterprise.*\/images\/naas/i,
  // Site-wide default / social-share images
  /og[-_]default/i,
  /default[-_](?:og|social|share|image)/i,
  /site[-_](?:default|image|og)/i,
  /social[-_](?:default|share)/i,
  /twitter[-_]default/i,
  /default[-_]thumbnail/i,
  // Homepage, hero and banner artwork
  /\/homepage\//i,
  /hero[-_](?:banner|bg|background|image)/i,
  /banner[-_](?:bg|background)/i,
  /lifestyle/i,
  /stock[-_]?photo/i,
  // Placeholders and "missing image" stand-ins
  /placeholder/i,
  /no[-_]?image/i,
  /image[-_]?not[-_]?found/i,
  /\/fallback[/-]/i,
  /missing[-_]image/i,
  // Press / media kits
  /\/press[-_]kit/i,
  /\/media[-_]kit/i,
  // Vendor-specific brand icons
  /open-graph\.gif/i,
  /social[-_]icon/i,
  /favicon/i,
  /og[-_]image[-_][0-9]+x[0-9]+\./i, // e.g. og-image-1200x630 → family-level generic
];

/**
 * Reports whether an image URL looks like a generic asset.
 *
 * @param url Image URL to classify.
 * @returns `true` when any entry of GENERIC_IMAGE_PATTERNS matches `url`.
 */
function isGenericImage(url: string): boolean {
  for (const pattern of GENERIC_IMAGE_PATTERNS) {
    if (pattern.test(url)) {
      return true;
    }
  }
  return false;
}
|
||||||
|
|
||||||
|
// ── Product page URL builders ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Builds the guessed Arista product-page URL for a switch model.
 *
 * The model is lower-cased and every character outside `[a-z0-9-]` becomes
 * a hyphen, e.g. `7060X6-64PE` → `/en/products/7060x6-64pe`.
 *
 * @param model Switch model name.
 * @returns The product-page URL (this builder never returns `null`).
 */
function buildAristaUrl(model: string): string | null {
  const lowered = model.toLowerCase();
  const pathSegment = lowered.replace(/[^a-z0-9-]/g, "-");
  return "https://www.arista.com/en/products/" + pathSegment;
}
|
||||||
|
|
||||||
|
/**
 * Builds a Dell networking-switch search URL for a switch model.
 *
 * A leading "PowerSwitch " prefix (case-insensitive) is stripped, and the
 * remaining model text is URL-encoded into the keyword refinement, e.g.
 * `PowerSwitch Z9332F-ON` → `…KEYWORDS~Z9332F-ON`.
 *
 * @param model Switch model name, with or without the "PowerSwitch" prefix.
 * @returns The search URL (this builder never returns `null`).
 */
function buildDellUrl(model: string): string | null {
  const cleanModel = model.replace(/^PowerSwitch\s+/i, "").trim();
  // The previously computed lower-case slug was never used; the URL is keyed
  // on the URL-encoded model text only.
  return `https://www.dell.com/en-us/shop/dell-networking-switches/sc/networking-switches?appliedRefinements=DP_SEARCH_RESULTS_KEYWORDS~${encodeURIComponent(cleanModel)}`;
}
|
||||||
|
|
||||||
|
/**
 * Builds the guessed Edgecore product-page URL for a switch model.
 *
 * The model is lower-cased and every character outside `[a-z0-9-]` becomes
 * a hyphen; the result is used as `<slug>.html` under `/product/`.
 *
 * @param model Switch model name.
 * @returns The product-page URL (this builder never returns `null`).
 */
function buildEdgecoreUrl(model: string): string | null {
  const pageName = model.toLowerCase().replace(/[^a-z0-9-]/g, "-");
  return "https://www.edge-core.com/product/" + pageName + ".html";
}
|
||||||
|
|
||||||
|
/**
 * Builds the guessed Fortinet FortiSwitch product-page URL for a model.
 *
 * The model is lower-cased, whitespace runs become a single hyphen, and any
 * remaining character outside `[a-z0-9-]` is dropped, e.g.
 * `FortiSwitch 424E` → `fortiswitch-424e`.
 *
 * @param model Switch model name.
 * @returns The product-page URL (this builder never returns `null`).
 */
function buildFortinetUrl(model: string): string | null {
  const hyphenated = model.toLowerCase().replace(/\s+/g, "-");
  const pathSegment = hyphenated.replace(/[^a-z0-9-]/g, "");
  return "https://www.fortinet.com/products/fortiswitch/" + pathSegment;
}
|
||||||
|
|
||||||
|
/**
 * Builds the guessed HPE/Aruba switch product-page URL for a model.
 *
 * The model is lower-cased and every character outside `[a-z0-9-]` becomes
 * a hyphen; the URL ends with a trailing slash.
 *
 * @param model Switch model name.
 * @returns The product-page URL (this builder never returns `null`).
 */
function buildHpeArubaUrl(model: string): string | null {
  const pathSegment = model.toLowerCase().replace(/[^a-z0-9-]/g, "-");
  return "https://www.arubanetworks.com/products/switches/" + pathSegment + "/";
}
|
||||||
|
|
||||||
|
/**
 * Builds the Extreme Networks page URL to crawl for a switch model.
 *
 * Models starting with "x6" (case-insensitive) map to their series page,
 * e.g. `X690-48x` → `/products/switching/x690-series/`. Every other model —
 * including the SLX, 8720 and 5520 families — maps to the generic switching
 * landing page.
 *
 * @param model Switch model name.
 * @returns The URL to crawl (this builder never returns `null`).
 */
function buildExtremeUrl(model: string): string | null {
  const switchingRoot = "https://www.extremenetworks.com/products/switching/";

  // Extreme uses Coveo JS search, so only the X6xx families get a series page.
  if (!model.toLowerCase().startsWith("x6")) {
    return switchingRoot;
  }

  const seriesMatch = /^(X\d+)/i.exec(model);
  const series = (seriesMatch?.[1] ?? "").toLowerCase();
  return switchingRoot + series + "-series/";
}
|
||||||
|
|
||||||
|
/**
 * Product-page URL builder per vendor slug.
 *
 * The keys double as the default vendor filter of the scraper, so adding an
 * entry here also adds that vendor to the default crawl.
 */
const URL_BUILDERS: Record<string, (m: string) => string | null> = {
  "arista": buildAristaUrl,
  "dell": buildDellUrl,
  "edgecore": buildEdgecoreUrl,
  "fortinet": buildFortinetUrl,
  "hpe-aruba": buildHpeArubaUrl,
  "extreme": buildExtremeUrl,
};
|
||||||
|
|
||||||
|
// ── Request data attached to each crawl URL ──────────────────────────────────
|
||||||
|
|
||||||
|
/** Per-request payload carried in `userData` for each crawled switch. */
interface SwitchCrawlData {
  /** `switches.id` of the row being enriched. */
  switchId: string;
  /** Switch model name, used for URL building and log output. */
  model: string;
  /** Vendor display name, used in log output. */
  vendorName: string;
  /** Vendor slug that selected the URL builder. */
  vendorSlug: string;
  /** URL that is crawled for this switch (stored or builder-generated). */
  productPageUrl: string;
}
|
||||||
|
|
||||||
|
// ── Main scraper ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Fetches product images for switches that have no `image_url`, by opening
 * each product page in headless Chromium and reading its og:image /
 * twitter:image meta tag (or, failing that, the largest product-looking <img>).
 *
 * For every switch with an image that passes the isGenericImage() filter, the
 * `switches` row is updated with `image_url`, `product_page_url` (only when
 * not already set) and `assets_scraped_at`. Misses and failures are logged
 * and counted but do not write to the database.
 *
 * @param targetVendorSlug Optional vendor slug to restrict the run to. When
 *   omitted (or empty) every slug keyed in URL_BUILDERS is processed.
 * @returns Resolves once the crawl has finished and the summary is logged.
 */
export async function fetchSwitchImagesPlaywright(targetVendorSlug?: string): Promise<void> {
  console.log("=== Switch Image Fetcher (Playwright) ===\n");

  // The slug may come straight from the command line, so it is bound as a
  // query parameter instead of being spliced into the SQL text.
  const vendorSlugs = targetVendorSlug ? [targetVendorSlug] : Object.keys(URL_BUILDERS);

  const { rows } = await pool.query<{
    id: string;
    model: string;
    vendor_slug: string;
    vendor_name: string;
    product_page_url: string | null;
  }>(
    `
    SELECT sw.id, sw.model, sw.product_page_url,
           v.slug AS vendor_slug, v.name AS vendor_name
    FROM switches sw
    JOIN vendors v ON v.id = sw.vendor_id
    WHERE (sw.image_url IS NULL OR sw.image_url = '')
      AND v.slug = ANY($1::text[])
    ORDER BY v.slug, sw.model
  `,
    [vendorSlugs],
  );

  if (rows.length === 0) {
    console.log("  All target switches already have images.\n");
    return;
  }

  console.log(`  ${rows.length} switches need images (Playwright vendors)\n`);

  const requests: Array<{ url: string; userData: SwitchCrawlData }> = [];

  for (const row of rows) {
    // Prefer the URL already stored on the row; otherwise guess one per vendor.
    const builder = URL_BUILDERS[row.vendor_slug];
    const productUrl = row.product_page_url || (builder ? builder(row.model) : null);
    if (!productUrl) {
      console.log(`  [SKIP] ${row.vendor_name} ${row.model} — no URL`);
      continue;
    }
    requests.push({
      url: productUrl,
      userData: {
        switchId: row.id,
        model: row.model,
        vendorName: row.vendor_name,
        vendorSlug: row.vendor_slug,
        productPageUrl: productUrl,
      },
    });
  }

  if (requests.length === 0) {
    console.log("  Nothing to crawl.\n");
    return;
  }

  let found = 0;
  let missed = 0;
  let errors = 0;

  const crawler = new PlaywrightCrawler(
    {
      maxConcurrency: 1, // one at a time — server-friendly
      maxRequestsPerMinute: 12, // ~5s per request minimum
      requestHandlerTimeoutSecs: 45,
      navigationTimeoutSecs: 30,
      headless: true,
      launchContext: {
        launchOptions: {
          args: [
            "--no-sandbox",
            "--disable-setuid-sandbox",
            "--disable-blink-features=AutomationControlled",
            "--disable-infobars",
            "--window-size=1920,1080",
          ],
        },
      },

      preNavigationHooks: [
        async ({ page }, gotoOptions) => {
          if (gotoOptions) {
            gotoOptions.waitUntil = "domcontentloaded";
          }

          // Headers and navigator overrides must be installed BEFORE the
          // navigation: set inside requestHandler they would miss the initial
          // document request and run only after the page's own scripts.
          await page.setExtraHTTPHeaders({
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
          });
          await page.addInitScript((ua: string) => {
            Object.defineProperty(navigator, "userAgent", { value: ua, configurable: true });
            Object.defineProperty(navigator, "webdriver", { value: false, configurable: true });
          }, STEALTH_UA);
        },
      ],

      async requestHandler({ request, page }) {
        const data = request.userData as SwitchCrawlData;

        // Wait for page to settle (JS rendering); a timeout here is not fatal,
        // we simply extract from whatever has rendered so far.
        await page.waitForLoadState("networkidle", { timeout: 20_000 }).catch(() => {});

        const imageUrl: string | null = await page.evaluate(() => {
          // Meta tags may hold relative or protocol-relative URLs; resolve
          // them against the document so the stored value is always absolute.
          const toAbsolute = (raw: string): string | null => {
            try {
              return new URL(raw.trim(), document.baseURI).href;
            } catch {
              return null;
            }
          };

          const ogMeta = document.querySelector<HTMLMetaElement>('meta[property="og:image"]');
          const ogUrl = ogMeta?.content ? toAbsolute(ogMeta.content) : null;
          if (ogUrl) return ogUrl;

          const twitterMeta = document.querySelector<HTMLMetaElement>('meta[name="twitter:image"]');
          const twitterUrl = twitterMeta?.content ? toAbsolute(twitterMeta.content) : null;
          if (twitterUrl) return twitterUrl;

          // Fallback: largest product-looking image
          const imgs = Array.from(document.querySelectorAll<HTMLImageElement>("img"));
          const candidate = imgs
            .filter((img) => {
              const src = img.src || img.getAttribute("data-src") || "";
              return src.startsWith("http") &&
                (src.match(/\.(jpg|jpeg|png|webp)/i)) &&
                img.naturalWidth > 300 &&
                img.naturalHeight > 200 &&
                (src.includes("product") || src.includes("switch") || src.includes("router") || src.includes("hardware"));
            })
            .sort((a, b) => (b.naturalWidth * b.naturalHeight) - (a.naturalWidth * a.naturalHeight))[0];
          return candidate?.src ?? null;
        });

        if (!imageUrl || isGenericImage(imageUrl)) {
          console.log(`  [MISS] ${data.vendorName} ${data.model} — no product image (${imageUrl?.slice(0, 60) ?? "null"})`);
          missed++;
          // TODO(review): the original code meant to persist product_page_url /
          // assets_scraped_at on a miss, but guarded it with
          // `!data.productPageUrl`, which is never true (the crawl URL is always
          // set), so nothing was ever written. Misses remain log-only here;
          // decide whether attempts should be recorded before enabling a write.
          return;
        }

        await pool.query(
          `UPDATE switches
           SET image_url = $2,
               product_page_url = COALESCE(product_page_url, $3),
               assets_scraped_at = NOW()
           WHERE id = $1`,
          [data.switchId, imageUrl, request.url],
        );
        console.log(`  [OK] ${data.vendorName} ${data.model} → ${imageUrl.slice(0, 80)}`);
        found++;
      },

      async failedRequestHandler({ request }) {
        const data = request.userData as SwitchCrawlData;
        console.log(`  [FAIL] ${data.vendorName} ${data.model} — ${request.errorMessages?.[0] ?? "unknown error"}`);
        errors++;
      },
    },
    makeCrawleeConfig("switch-images-playwright"),
  );

  await crawler.run(requests);

  console.log(`\n=== Playwright Image Scraper Complete ===`);
  console.log(`  Images found: ${found}`);
  console.log(`  Missed:       ${missed}`);
  if (errors > 0) console.warn(`  Errors:       ${errors}`);
}
|
||||||
|
|
||||||
|
// CLI entry point: runs the fetcher when this file is executed directly,
// optionally restricted to one vendor via `--vendor=<slug>`.
if (require.main === module) {
  const vendorFlag = process.argv.find((arg) => arg.startsWith("--vendor="));
  const vendor = vendorFlag?.split("=")[1];

  void (async () => {
    try {
      await fetchSwitchImagesPlaywright(vendor);
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  })();
}
|
||||||
Loading…
x
Reference in New Issue
Block a user