fix(scraper): fix Arista series-level URL builder + bypass Crawlee URL deduplication
- buildAristaUrl() now extracts the series prefix (7060X5-32QS → 7060x5-series) instead of building individual model URLs, whose pages lack og:image
- Strip the trailing sub-variant 'A' so that R3A maps to the R3 series page
- Add uniqueKey: row.id to each request — prevents Crawlee from deduplicating models that share the same series URL (e.g. 7060x5-series)
- For Arista: always prefer the fresh builder URL over the stored product_page_url so stale individual-model URLs don't override the correct series pages
This commit is contained in:
parent
18a9e1346e
commit
87b9416592
@ -69,10 +69,20 @@ function isGenericImage(url: string): boolean {
|
|||||||
// ── Product page URL builders ─────────────────────────────────────────────────
|
// ── Product page URL builders ─────────────────────────────────────────────────
|
||||||
|
|
||||||
function buildAristaUrl(model: string): string | null {
|
function buildAristaUrl(model: string): string | null {
|
||||||
// 7060X6-64PE → try series page and individual page
|
// Map model to its Arista series page (og:image lives on series pages, not individual model pages).
|
||||||
// Arista individual model pages: /en/products/<model-lowercase>
|
// Pattern: extract alphanumeric prefix before the first "-<digits>" port-count suffix.
|
||||||
const slug = model.toLowerCase().replace(/[^a-z0-9-]/g, "-");
|
// 7060X5-32QS → 7060x5 → /en/products/7060x5-series
|
||||||
return `https://www.arista.com/en/products/${slug}`;
|
// 7050CX3-32S → 7050cx3 → /en/products/7050cx3-series
|
||||||
|
// 7280R3A-48D5 → 7280r3a → strip trailing sub-variant 'A' → 7280r3 → /en/products/7280r3-series
|
||||||
|
// 7020R → 7020r → /en/products/7020r-series
|
||||||
|
const leadMatch = model.match(/^(\d{3,4}[A-Z0-9]*?)(-\d|$)/i);
|
||||||
|
if (!leadMatch) return null;
|
||||||
|
|
||||||
|
let series = leadMatch[1].toLowerCase();
|
||||||
|
// Strip trailing sub-variant 'a' (R3A → R3, R2A → R2) — Arista groups these on the base series page
|
||||||
|
series = series.replace(/([a-z]\d+)a$/, "$1");
|
||||||
|
|
||||||
|
return `https://www.arista.com/en/products/${series}-series`;
|
||||||
}
|
}
|
||||||
|
|
||||||
function buildDellUrl(model: string): string | null {
|
function buildDellUrl(model: string): string | null {
|
||||||
@ -160,17 +170,24 @@ export async function fetchSwitchImagesPlaywright(targetVendorSlug?: string): Pr
|
|||||||
|
|
||||||
console.log(` ${rows.length} switches need images (Playwright vendors)\n`);
|
console.log(` ${rows.length} switches need images (Playwright vendors)\n`);
|
||||||
|
|
||||||
const requests: Array<{ url: string; userData: SwitchCrawlData }> = [];
|
const requests: Array<{ url: string; uniqueKey: string; userData: SwitchCrawlData }> = [];
|
||||||
|
|
||||||
for (const row of rows) {
|
for (const row of rows) {
|
||||||
const builder = URL_BUILDERS[row.vendor_slug];
|
const builder = URL_BUILDERS[row.vendor_slug];
|
||||||
const productUrl = row.product_page_url || (builder ? builder(row.model) : null);
|
// For Arista: prefer freshly-built series URL over a stale stored model URL
|
||||||
|
const builtUrl = builder ? builder(row.model) : null;
|
||||||
|
const productUrl = row.vendor_slug === "arista"
|
||||||
|
? (builtUrl ?? row.product_page_url) // always use fresh series URL for Arista
|
||||||
|
: (row.product_page_url ?? builtUrl); // other vendors: prefer stored URL
|
||||||
if (!productUrl) {
|
if (!productUrl) {
|
||||||
console.log(` [SKIP] ${row.vendor_name} ${row.model} — no URL`);
|
console.log(` [SKIP] ${row.vendor_name} ${row.model} — no URL`);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
requests.push({
|
requests.push({
|
||||||
url: productUrl,
|
url: productUrl,
|
||||||
|
// Use switch ID as uniqueKey so Crawlee doesn't deduplicate series-level URLs.
|
||||||
|
// Multiple models can share the same series page (e.g. 7060x5-series) — each needs its own DB write.
|
||||||
|
uniqueKey: row.id,
|
||||||
userData: {
|
userData: {
|
||||||
switchId: row.id,
|
switchId: row.id,
|
||||||
model: row.model,
|
model: row.model,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user