fix(scraper): fix Arista series-level URL builder + bypass Crawlee URL deduplication

- buildAristaUrl() now builds the series-page URL from the model's series prefix
  (7060X5-32QS → 7060x5-series) instead of an individual-model URL, because
  individual model pages lack og:image
- Strip trailing sub-variant 'A' so R3A → R3 series page
- Add uniqueKey: row.id to each request — prevents Crawlee from deduplicating
  models that share the same series URL (e.g. 7060x5-series)
- For Arista: always prefer fresh builder URL over stored product_page_url
  so stale individual-model URLs don't override correct series pages
This commit is contained in:
Rene Fichtmueller 2026-04-21 06:22:41 +02:00
parent 2742141c8b
commit 8e30b49410

View File

@ -69,10 +69,20 @@ function isGenericImage(url: string): boolean {
// ── Product page URL builders ─────────────────────────────────────────────────
/**
 * Build the Arista series-level product page URL for a switch model.
 *
 * The series page is targeted rather than the individual model page because
 * og:image lives on series pages, not on individual model pages.
 *
 * Mapping examples:
 *   7060X5-32QS  → /en/products/7060x5-series
 *   7050CX3-32S  → /en/products/7050cx3-series
 *   7280R3A-48D5 → /en/products/7280r3-series (trailing sub-variant 'A' dropped)
 *   7020R        → /en/products/7020r-series
 *
 * @param model Vendor model string, e.g. "7060X5-32QS". Surrounding whitespace is ignored.
 * @returns The series page URL, or `null` when the model does not start with a
 *          3–4 digit family number followed by either a "-<digit>" suffix or end of string.
 */
function buildAristaUrl(model: string): string | null {
  // Series prefix = leading 3–4 digits plus any following letters/digits, cut at the
  // first "-<digit>" (port-count suffix) or at end of string. The lazy quantifier keeps
  // the prefix as short as possible so it stops at the first qualifying hyphen.
  const leadMatch = /^(\d{3,4}[A-Z0-9]*?)(?:-\d|$)/i.exec(model.trim());
  const prefix = leadMatch?.[1];
  if (!prefix) return null;

  // Strip trailing sub-variant 'a' (R3A → R3, R2A → R2): these are grouped on the
  // base series page.
  const series = prefix.toLowerCase().replace(/([a-z]\d+)a$/, "$1");
  return `https://www.arista.com/en/products/${series}-series`;
}
function buildDellUrl(model: string): string | null {
@ -160,17 +170,24 @@ export async function fetchSwitchImagesPlaywright(targetVendorSlug?: string): Pr
console.log(` ${rows.length} switches need images (Playwright vendors)\n`);
const requests: Array<{ url: string; userData: SwitchCrawlData }> = [];
const requests: Array<{ url: string; uniqueKey: string; userData: SwitchCrawlData }> = [];
for (const row of rows) {
const builder = URL_BUILDERS[row.vendor_slug];
const productUrl = row.product_page_url || (builder ? builder(row.model) : null);
// For Arista: prefer freshly-built series URL over a stale stored model URL
const builtUrl = builder ? builder(row.model) : null;
const productUrl = row.vendor_slug === "arista"
? (builtUrl ?? row.product_page_url) // always use fresh series URL for Arista
: (row.product_page_url ?? builtUrl); // other vendors: prefer stored URL
if (!productUrl) {
console.log(` [SKIP] ${row.vendor_name} ${row.model} — no URL`);
continue;
}
requests.push({
url: productUrl,
// Use switch ID as uniqueKey so Crawlee doesn't deduplicate series-level URLs.
// Multiple models can share the same series page (e.g. 7060x5-series) — each needs its own DB write.
uniqueKey: row.id,
userData: {
switchId: row.id,
model: row.model,