/**
 * Build the Arista *series* product-page URL for a switch model.
 *
 * Per the original author's note, the og:image lives on Arista series pages
 * rather than on individual model pages, so the model is reduced to its
 * series prefix:
 *
 *   7060X5-32QS      → 7060x5  → /en/products/7060x5-series
 *   7050CX3-32S      → 7050cx3 → /en/products/7050cx3-series
 *   7280R3A-48D5     → 7280r3a → 7280r3 → /en/products/7280r3-series
 *   7020R            → 7020r   → /en/products/7020r-series
 *   DCS-7280R3A-48D5 → same as 7280R3A-48D5 (SKU prefix is ignored)
 *
 * @param model - Switch model name, optionally carrying the "DCS-" SKU prefix
 *   and/or surrounding whitespace.
 * @returns The series page URL, or `null` when the model does not start with
 *   a 3–4 digit platform number followed by an alphanumeric family code and
 *   then either end-of-string or a "-<digit>" port-count suffix.
 */
function buildAristaUrl(model: string): string | null {
  // Normalise before matching: stray whitespace or the "DCS-" orderable-SKU
  // prefix would otherwise defeat the ^-anchored pattern and yield null.
  const normalized = model.trim().replace(/^DCS-/i, "");

  // Series prefix = leading digits + alphanumerics, terminated by end of
  // string or by the "-<digit>" that starts the port-count suffix.
  const prefix = normalized.match(/^(\d{3,4}[A-Z0-9]*?)(?:-\d|$)/i)?.[1];
  if (!prefix) return null;

  // Strip a trailing sub-variant 'a' (R3A → R3, R2A → R2); the original
  // author's note says Arista groups these on the base series page.
  const series = prefix.toLowerCase().replace(/([a-z]\d+)a$/, "$1");

  return `https://www.arista.com/en/products/${series}-series`;
}
productUrl = row.product_page_url || (builder ? builder(row.model) : null); + // For Arista: prefer freshly-built series URL over a stale stored model URL + const builtUrl = builder ? builder(row.model) : null; + const productUrl = row.vendor_slug === "arista" + ? (builtUrl ?? row.product_page_url) // always use fresh series URL for Arista + : (row.product_page_url ?? builtUrl); // other vendors: prefer stored URL if (!productUrl) { console.log(` [SKIP] ${row.vendor_name} ${row.model} — no URL`); continue; } requests.push({ url: productUrl, + // Use switch ID as uniqueKey so Crawlee doesn't deduplicate series-level URLs. + // Multiple models can share the same series page (e.g. 7060x5-series) — each needs its own DB write. + uniqueKey: row.id, userData: { switchId: row.id, model: row.model,