diff --git a/packages/scraper/src/scrapers/atgbics.ts b/packages/scraper/src/scrapers/atgbics.ts index 57afb56..e000dd2 100644 --- a/packages/scraper/src/scrapers/atgbics.ts +++ b/packages/scraper/src/scrapers/atgbics.ts @@ -13,7 +13,7 @@ * No Playwright required — static HTML contains all needed data. * Rate limited: 1 req/1 sec. Runs from Mac or Erik (no IP issues with static pages). */ -import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db"; +import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, markImageVerified, pool } from "../utils/db"; import { contentHash } from "../utils/hash"; const BASE_URL = "https://www.atgbics.com"; @@ -198,10 +198,17 @@ async function fetchPage(url: string): Promise { return resp.text(); } -/** Check if a page has pagination links pointing to the next page */ +/** Check if a page has pagination links pointing to the next page. + * Shopify theme embeds all page numbers in the pagination nav; we check for + * a link whose href explicitly contains &page=N (not just page=N anywhere). */ function hasNextPage(html: string, currentPage: number): boolean { const nextPage = currentPage + 1; - return html.includes(`page=${nextPage}`) || html.includes(`page%3D${nextPage}`); + // Look for an actual href with page parameter — avoids matching JavaScript vars + return ( + html.includes(`&page=${nextPage}`) || + html.includes(`?page=${nextPage}`) || + html.includes(`page%3D${nextPage}`) + ); } export async function scrapeAtgbics(): Promise { @@ -223,6 +230,9 @@ export async function scrapeAtgbics(): Promise { console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.handle}] ---`); let catTotal = 0; + // Track page-level seen URLs to detect Shopify wrap-around + const catPageSeen = new Set(); + for (let page = 1; page <= MAX_PAGES_PER_CAT; page++) { const pageUrl = page === 1 ? 
`${BASE_URL}/collections/${cat.handle}?sort_by=price-ascending&currency=GBP` @@ -236,7 +246,17 @@ export async function scrapeAtgbics(): Promise { console.log(` Page ${page}: 0 products — stopping`); break; } - console.log(` Page ${page}: ${pageProducts.length} products`); + + // Detect Shopify catalog wrap-around: if ALL products on this page are already seen + // from a previous page (from this category), Shopify is repeating from page 1. + const newInPage = pageProducts.filter(p => !catPageSeen.has(p.url)); + if (page > 1 && newInPage.length === 0) { + console.log(` Page ${page}: all ${pageProducts.length} already seen — catalog end`); + break; + } + pageProducts.forEach(p => catPageSeen.add(p.url)); + + console.log(` Page ${page}: ${pageProducts.length} products (${newInPage.length} new)`); for (const product of pageProducts) { // Skip cross-category duplicates (same product may appear in multiple collections) @@ -248,6 +268,7 @@ export async function scrapeAtgbics(): Promise { const txId = await findOrCreateScrapedTransceiver({ partNumber: product.partNumber, vendorId, + productUrl: product.url, formFactor: product.formFactor, speedGbps: product.speedGbps, speed: product.speed, @@ -271,13 +292,8 @@ export async function scrapeAtgbics(): Promise { if (updated) priceUpdates++; if (product.imageUrl) { - const res = await pool.query( - `UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = true - WHERE id = $2 AND (image_url IS NULL OR image_url = '') - RETURNING id`, - [product.imageUrl, txId], - ); - if (res.rowCount && res.rowCount > 0) imageUpdates++; + const updatedImage = await markImageVerified(txId, product.imageUrl); + if (updatedImage) imageUpdates++; } totalProducts++; diff --git a/packages/scraper/src/utils/db.ts b/packages/scraper/src/utils/db.ts index ea16329..bc84bd6 100644 --- a/packages/scraper/src/utils/db.ts +++ b/packages/scraper/src/utils/db.ts @@ -40,6 +40,62 @@ export async function checkAndSetFullyVerified(transceiverId: 
string): Promise 0; } +export async function markImageVerified( + transceiverId: string, + imageUrl: string +): Promise<boolean> { + const result = await pool.query( + `UPDATE transceivers + SET image_url = COALESCE(NULLIF(image_url, ''), $2::text), + has_image = true, + image_verified = true, + image_verified_at = COALESCE(image_verified_at, NOW()), + image_verified_url = COALESCE(NULLIF(image_verified_url, ''), $2::text), + image_scraped_at = COALESCE(image_scraped_at, NOW()), + updated_at = NOW() + WHERE id = $1 + AND $2::text IS NOT NULL + AND $2::text != '' + RETURNING id`, + [transceiverId, imageUrl] + ); + await checkAndSetFullyVerified(transceiverId); + return (result.rowCount ?? 0) > 0; +} + +export async function markDetailsVerified(params: { + transceiverId: string; + sourceUrl?: string; +}): Promise<boolean> { + const result = await pool.query( + `UPDATE transceivers + SET product_page_url = COALESCE(NULLIF(product_page_url, ''), NULLIF($2::text, '')), + details_verified = true, + details_verified_at = COALESCE(details_verified_at, NOW()), + details_source_url = COALESCE(NULLIF(details_source_url, ''), NULLIF($2::text, ''), product_page_url), + data_confidence = CASE + WHEN data_confidence IS NULL OR data_confidence IN ('unknown', 'enriched_estimated') + THEN 'scraped_unverified' + ELSE data_confidence + END, + updated_at = NOW() + WHERE id = $1 + AND form_factor IS NOT NULL + AND speed_gbps IS NOT NULL + AND part_number IS NOT NULL + AND part_number != '' + AND reach_label IS NOT NULL + AND reach_label != '' + AND fiber_type IS NOT NULL + AND fiber_type != '' + AND COALESCE(data_confidence, 'unknown') != 'garbage' + RETURNING id`, + [params.transceiverId, params.sourceUrl || null] + ); + await checkAndSetFullyVerified(params.transceiverId); + return (result.rowCount ?? 
0) > 0; +} + // Per-form-factor price bounds [min, max] in USD equivalent const PRICE_BOUNDS: Record<string, [number, number]> = { "SFP": [2, 3000], @@ -100,9 +156,9 @@ export async function upsertPriceObservation(params: { return false; // Reject price outside form-factor bounds } - // Check if price changed via content hash + // Check if price changed via content hash — also check observation age const existing = await pool.query( - `SELECT content_hash FROM price_observations + `SELECT content_hash, time FROM price_observations WHERE transceiver_id = $1 AND source_vendor_id = $2 ORDER BY time DESC LIMIT 1`, [params.transceiverId, params.sourceVendorId] @@ -115,8 +171,13 @@ export async function upsertPriceObservation(params: { ); const isCompetitor = vendorRow.rows[0]?.is_competitor === true; - if (existing.rows.length > 0 && existing.rows[0].content_hash === params.contentHash) { - // Price unchanged — still ensure verified flags are current + // Price unchanged AND observation is fresh (< 7 days old) → skip insertion + const REFRESH_DAYS = 7; + const isStale = !existing.rows.length || + (Date.now() - new Date(existing.rows[0].time).getTime()) > REFRESH_DAYS * 24 * 60 * 60 * 1000; + + if (existing.rows.length > 0 && existing.rows[0].content_hash === params.contentHash && !isStale) { + // Price unchanged and recent — still ensure verified flags are current await pool.query( `UPDATE transceivers SET price_verified = true @@ -278,6 +339,7 @@ export async function upsertStockObservation(params: { export async function findOrCreateScrapedTransceiver(params: { partNumber: string; vendorId: string; + productUrl?: string; formFactor?: string; speedGbps?: number; speed?: string; @@ -295,13 +357,42 @@ export async function findOrCreateScrapedTransceiver(params: { ); if (existing.rows.length > 0) { + await pool.query( + `UPDATE transceivers + SET product_page_url = COALESCE(NULLIF(product_page_url, ''), NULLIF($2, '')), + form_factor = COALESCE(NULLIF(form_factor, ''), $3), + speed_gbps = CASE 
WHEN speed_gbps IS NULL OR speed_gbps = 0 THEN COALESCE($4, speed_gbps) ELSE speed_gbps END, + speed = COALESCE(NULLIF(speed, ''), $5), + reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($6, reach_meters) ELSE reach_meters END, + reach_label = COALESCE(NULLIF(reach_label, ''), $7), + fiber_type = COALESCE(NULLIF(fiber_type, ''), $8), + wavelengths = COALESCE(NULLIF(wavelengths, ''), $9), + category = COALESCE(NULLIF(category, ''), $10), + updated_at = NOW() + WHERE id = $1`, + [ + existing.rows[0].id, + params.productUrl || null, + params.formFactor || null, + params.speedGbps || null, + params.speed || null, + params.reachMeters || null, + params.reachLabel || null, + params.fiberType || null, + params.wavelengths || null, + params.category || null, + ] + ); + // Update image_url, has_image and image_verified if we have a new image for a record without one if (params.imageUrl && !existing.rows[0].image_url) { - await pool.query( - `UPDATE transceivers SET image_url = $1, has_image = true, image_verified = true, updated_at = NOW() WHERE id = $2`, - [params.imageUrl, existing.rows[0].id] - ); - await checkAndSetFullyVerified(existing.rows[0].id); + await markImageVerified(existing.rows[0].id, params.imageUrl); + } + if (params.productUrl) { + await markDetailsVerified({ + transceiverId: existing.rows[0].id, + sourceUrl: params.productUrl, + }); } return existing.rows[0].id; } @@ -309,14 +400,42 @@ export async function findOrCreateScrapedTransceiver(params: { // Create new transceiver entry const slug = `scraped-${params.partNumber.toLowerCase().replace(/[^a-z0-9]+/g, "-")}`; const result = await pool.query( - `INSERT INTO transceivers (slug, part_number, vendor_id, form_factor, speed_gbps, speed, reach_meters, reach_label, fiber_type, wavelengths, category, market_status, image_url, image_verified) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, 'Mainstream', $12, $13) - ON CONFLICT (slug) DO UPDATE SET image_url = 
COALESCE(transceivers.image_url, EXCLUDED.image_url), has_image = COALESCE(transceivers.has_image, EXCLUDED.has_image), image_verified = COALESCE(transceivers.image_verified, EXCLUDED.image_verified), updated_at = NOW() + `INSERT INTO transceivers ( + slug, part_number, vendor_id, product_page_url, form_factor, speed_gbps, + speed, reach_meters, reach_label, fiber_type, wavelengths, category, + market_status, data_confidence, image_url, has_image, image_verified, + image_verified_at, image_verified_url, details_verified, details_verified_at, + details_source_url + ) + VALUES ( + $1, $2, $3, $4, $5, $6, + $7, $8, $9, $10, $11, $12, + 'Mainstream', 'scraped_unverified', $13, $14, $14, + CASE WHEN $14 THEN NOW() ELSE NULL END, $13, $15, + CASE WHEN $15 THEN NOW() ELSE NULL END, $4 + ) + ON CONFLICT (slug) DO UPDATE SET + product_page_url = COALESCE(transceivers.product_page_url, EXCLUDED.product_page_url), + image_url = COALESCE(transceivers.image_url, EXCLUDED.image_url), + has_image = COALESCE(transceivers.has_image, false) OR COALESCE(EXCLUDED.has_image, false), + image_verified = COALESCE(transceivers.image_verified, false) OR COALESCE(EXCLUDED.image_verified, false), + image_verified_at = COALESCE(transceivers.image_verified_at, EXCLUDED.image_verified_at), + image_verified_url = COALESCE(transceivers.image_verified_url, EXCLUDED.image_verified_url), + details_verified = COALESCE(transceivers.details_verified, false) OR COALESCE(EXCLUDED.details_verified, false), + details_verified_at = COALESCE(transceivers.details_verified_at, EXCLUDED.details_verified_at), + details_source_url = COALESCE(transceivers.details_source_url, EXCLUDED.details_source_url), + data_confidence = CASE + WHEN transceivers.data_confidence IS NULL OR transceivers.data_confidence IN ('unknown', 'enriched_estimated') + THEN EXCLUDED.data_confidence + ELSE transceivers.data_confidence + END, + updated_at = NOW() RETURNING id`, [ slug, params.partNumber, params.vendorId, + params.productUrl || 
null, params.formFactor || "SFP", params.speedGbps || 0, params.speed || "Unknown", @@ -326,10 +445,13 @@ export async function findOrCreateScrapedTransceiver(params: { params.wavelengths || "", params.category || "DataCenter", params.imageUrl || null, - params.imageUrl ? true : false, + Boolean(params.imageUrl), + Boolean(params.productUrl && params.reachLabel && params.fiberType), ] ); - return result.rows[0].id; + const id = result.rows[0].id; + await checkAndSetFullyVerified(id); + return id; } export interface SwitchParams {