fix(scraper): fix Arista series-level URL builder + bypass Crawlee URL deduplication
- buildAristaUrl() now extracts the series prefix (7060X5-32QS → 7060x5-series) instead of building individual model URLs, which lack og:image
- Strip the trailing sub-variant 'A' so that R3A → R3 series page
- Add uniqueKey: row.id to each request — prevents Crawlee from deduplicating models that share the same series URL (e.g. 7060x5-series)
- For Arista: always prefer the freshly built URL over the stored product_page_url so that stale individual-model URLs don't override the correct series pages
This commit is contained in:
parent
18a9e1346e
commit
87b9416592
@ -69,10 +69,20 @@ function isGenericImage(url: string): boolean {
|
||||
// ── Product page URL builders ─────────────────────────────────────────────────
|
||||
|
||||
/**
 * Build the Arista series-level product page URL for a switch model.
 *
 * The og:image lives on series pages, not on individual model pages, so the
 * model is mapped to its series page rather than to a per-model URL.
 *
 * The series is the alphanumeric prefix before the first "-<digits>"
 * port-count suffix (or the whole model when there is no such suffix):
 *   7060X5-32QS  → 7060x5 → /en/products/7060x5-series
 *   7050CX3-32S  → 7050cx3 → /en/products/7050cx3-series
 *   7280R3A-48D5 → 7280r3a → 7280r3 → /en/products/7280r3-series
 *   7020R        → 7020r → /en/products/7020r-series
 *
 * @param model - Arista model string, e.g. "7060X5-32QS".
 * @returns The series page URL, or `null` when the model does not start with
 *   a 3–4 digit series number followed by either a "-<digit>" suffix or the
 *   end of the string.
 */
function buildAristaUrl(model: string): string | null {
  // Lazy prefix match: stop at the first "-<digit>" (port-count suffix) or at end of string.
  const leadMatch = model.match(/^(\d{3,4}[A-Z0-9]*?)(-\d|$)/i);
  if (!leadMatch) return null;

  // Strip trailing sub-variant 'a' (R3A → R3, R2A → R2) — Arista groups these on the base series page.
  const series = leadMatch[1].toLowerCase().replace(/([a-z]\d+)a$/, "$1");

  return `https://www.arista.com/en/products/${series}-series`;
}
|
||||
|
||||
function buildDellUrl(model: string): string | null {
|
||||
@ -160,17 +170,24 @@ export async function fetchSwitchImagesPlaywright(targetVendorSlug?: string): Pr
|
||||
|
||||
console.log(` ${rows.length} switches need images (Playwright vendors)\n`);
|
||||
|
||||
const requests: Array<{ url: string; userData: SwitchCrawlData }> = [];
|
||||
const requests: Array<{ url: string; uniqueKey: string; userData: SwitchCrawlData }> = [];
|
||||
|
||||
for (const row of rows) {
|
||||
const builder = URL_BUILDERS[row.vendor_slug];
|
||||
const productUrl = row.product_page_url || (builder ? builder(row.model) : null);
|
||||
// For Arista: prefer freshly-built series URL over a stale stored model URL
|
||||
const builtUrl = builder ? builder(row.model) : null;
|
||||
const productUrl = row.vendor_slug === "arista"
|
||||
? (builtUrl ?? row.product_page_url) // always use fresh series URL for Arista
|
||||
: (row.product_page_url ?? builtUrl); // other vendors: prefer stored URL
|
||||
if (!productUrl) {
|
||||
console.log(` [SKIP] ${row.vendor_name} ${row.model} — no URL`);
|
||||
continue;
|
||||
}
|
||||
requests.push({
|
||||
url: productUrl,
|
||||
// Use switch ID as uniqueKey so Crawlee doesn't deduplicate series-level URLs.
|
||||
// Multiple models can share the same series page (e.g. 7060x5-series) — each needs its own DB write.
|
||||
uniqueKey: row.id,
|
||||
userData: {
|
||||
switchId: row.id,
|
||||
model: row.model,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user