fix: refresh stale price observations after 7 days + fix ATGBICS pagination wrap-around

- upsertPriceObservation: insert new observation if last one is >7 days old,
  even when price (content_hash) hasn't changed — keeps timeseries data fresh
- ATGBICS: detect Shopify catalog wrap-around by tracking per-category seen URLs;
  stop pagination when all products on a page were already seen in a prior page
- ATGBICS: improve hasNextPage to match &page=N anchored in href params
This commit is contained in:
Rene Fichtmueller 2026-05-06 23:11:15 +02:00
parent d01039734a
commit 5c882c3a46
2 changed files with 163 additions and 25 deletions

View File

@ -13,7 +13,7 @@
* No Playwright required — static HTML contains all needed data.
* Rate limited: 1 req/1 sec. Runs from Mac or Erik (no IP issues with static pages).
*/
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, markImageVerified, pool } from "../utils/db";
import { contentHash } from "../utils/hash";
const BASE_URL = "https://www.atgbics.com";
@ -198,10 +198,17 @@ async function fetchPage(url: string): Promise<string> {
return resp.text();
}
/**
 * Check whether a collection page links to the page after `currentPage`.
 *
 * The Shopify theme embeds page numbers in its pagination nav, so the raw HTML
 * is searched for a `page=<next>` query parameter. A match must:
 *  - start at a query-parameter boundary: `?`, `&`, or the HTML-escaped
 *    `&amp;` (how `&` normally appears inside an href attribute). This keeps
 *    identifiers such as `current_page=2` in inline scripts from matching.
 *    The URL-encoded form `page%3D<next>` is accepted anywhere, as before.
 *  - end at the number, so looking for page 2 does not match `page=20`.
 *
 * @param html - Raw HTML of the current collection page.
 * @param currentPage - 1-based number of the page that `html` came from.
 * @returns `true` when a link to page `currentPage + 1` is present.
 */
function hasNextPage(html: string, currentPage: number): boolean {
  const nextPage = currentPage + 1;
  // (?!\d) pins the match to the exact page number instead of a prefix of it.
  const nextPageLink = new RegExp(
    `(?:(?:[?&]|&amp;)page=|page%3D)${nextPage}(?!\\d)`
  );
  return nextPageLink.test(html);
}
export async function scrapeAtgbics(): Promise<void> {
@ -223,6 +230,9 @@ export async function scrapeAtgbics(): Promise<void> {
console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.handle}] ---`);
let catTotal = 0;
// Track page-level seen URLs to detect Shopify wrap-around
const catPageSeen = new Set<string>();
for (let page = 1; page <= MAX_PAGES_PER_CAT; page++) {
const pageUrl = page === 1
? `${BASE_URL}/collections/${cat.handle}?sort_by=price-ascending&currency=GBP`
@ -236,7 +246,17 @@ export async function scrapeAtgbics(): Promise<void> {
console.log(` Page ${page}: 0 products — stopping`);
break;
}
console.log(` Page ${page}: ${pageProducts.length} products`);
// Detect Shopify catalog wrap-around: if ALL products on this page are already seen
// from a previous page (from this category), Shopify is repeating from page 1.
const newInPage = pageProducts.filter(p => !catPageSeen.has(p.url));
if (page > 1 && newInPage.length === 0) {
console.log(` Page ${page}: all ${pageProducts.length} already seen — catalog end`);
break;
}
pageProducts.forEach(p => catPageSeen.add(p.url));
console.log(` Page ${page}: ${pageProducts.length} products (${newInPage.length} new)`);
for (const product of pageProducts) {
// Skip cross-category duplicates (same product may appear in multiple collections)
@ -248,6 +268,7 @@ export async function scrapeAtgbics(): Promise<void> {
const txId = await findOrCreateScrapedTransceiver({
partNumber: product.partNumber,
vendorId,
productUrl: product.url,
formFactor: product.formFactor,
speedGbps: product.speedGbps,
speed: product.speed,
@ -271,13 +292,8 @@ export async function scrapeAtgbics(): Promise<void> {
if (updated) priceUpdates++;
if (product.imageUrl) {
const res = await pool.query(
`UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = true
WHERE id = $2 AND (image_url IS NULL OR image_url = '')
RETURNING id`,
[product.imageUrl, txId],
);
if (res.rowCount && res.rowCount > 0) imageUpdates++;
const updatedImage = await markImageVerified(txId, product.imageUrl);
if (updatedImage) imageUpdates++;
}
totalProducts++;

View File

@ -40,6 +40,62 @@ export async function checkAndSetFullyVerified(transceiverId: string): Promise<b
return (result.rowCount ?? 0) > 0;
}
/**
 * Store a scraped image URL on a transceiver row and flag its image as verified.
 *
 * Values already present on the row win: `image_url` and `image_verified_url`
 * are only filled when currently NULL/empty, and `image_verified_at` /
 * `image_scraped_at` are only stamped when currently NULL. `has_image` and
 * `image_verified` are set to true and `updated_at` is refreshed on every
 * matching update. The SQL itself refuses to update when the supplied URL is
 * NULL or empty.
 *
 * @param transceiverId - `transceivers.id` of the row to update.
 * @param imageUrl - Image URL found by the scraper.
 * @returns `true` when the UPDATE touched a row, `false` otherwise.
 */
export async function markImageVerified(
  transceiverId: string,
  imageUrl: string
): Promise<boolean> {
  const updateSql = `UPDATE transceivers
SET image_url = COALESCE(NULLIF(image_url, ''), $2::text),
has_image = true,
image_verified = true,
image_verified_at = COALESCE(image_verified_at, NOW()),
image_verified_url = COALESCE(NULLIF(image_verified_url, ''), $2::text),
image_scraped_at = COALESCE(image_scraped_at, NOW()),
updated_at = NOW()
WHERE id = $1
AND $2::text IS NOT NULL
AND $2::text != ''
RETURNING id`;
  const { rowCount } = await pool.query(updateSql, [transceiverId, imageUrl]);

  // The fully-verified check runs whether or not the UPDATE matched a row.
  await checkAndSetFullyVerified(transceiverId);

  return (rowCount ?? 0) > 0;
}
/**
 * Flag a transceiver's details as verified and record where they came from.
 *
 * The UPDATE only applies when the row already has `form_factor`,
 * `speed_gbps`, and non-empty `part_number`, `reach_label` and `fiber_type`,
 * and its `data_confidence` is not 'garbage'. On a match:
 *  - `product_page_url` and `details_source_url` are filled only if currently
 *    NULL/empty (`details_source_url` falls back to the row's
 *    `product_page_url` when no source URL is supplied);
 *  - `details_verified_at` is stamped only if currently NULL;
 *  - `data_confidence` is raised to 'scraped_unverified' when it is NULL,
 *    'unknown' or 'enriched_estimated', and left alone otherwise.
 *
 * @param params.transceiverId - `transceivers.id` of the row to update.
 * @param params.sourceUrl - Optional URL the details were scraped from; an
 *   empty or missing value is sent to the query as NULL.
 * @returns `true` when the UPDATE touched a row, `false` otherwise.
 */
export async function markDetailsVerified(params: {
  transceiverId: string;
  sourceUrl?: string;
}): Promise<boolean> {
  const updateSql = `UPDATE transceivers
SET product_page_url = COALESCE(NULLIF(product_page_url, ''), NULLIF($2::text, '')),
details_verified = true,
details_verified_at = COALESCE(details_verified_at, NOW()),
details_source_url = COALESCE(NULLIF(details_source_url, ''), NULLIF($2::text, ''), product_page_url),
data_confidence = CASE
WHEN data_confidence IS NULL OR data_confidence IN ('unknown', 'enriched_estimated')
THEN 'scraped_unverified'
ELSE data_confidence
END,
updated_at = NOW()
WHERE id = $1
AND form_factor IS NOT NULL
AND speed_gbps IS NOT NULL
AND part_number IS NOT NULL
AND part_number != ''
AND reach_label IS NOT NULL
AND reach_label != ''
AND fiber_type IS NOT NULL
AND fiber_type != ''
AND COALESCE(data_confidence, 'unknown') != 'garbage'
RETURNING id`;
  const queryArgs = [params.transceiverId, params.sourceUrl || null];
  const { rowCount } = await pool.query(updateSql, queryArgs);

  // The fully-verified check runs whether or not the UPDATE matched a row.
  await checkAndSetFullyVerified(params.transceiverId);

  return (rowCount ?? 0) > 0;
}
// Per-form-factor price bounds [min, max] in USD equivalent
const PRICE_BOUNDS: Record<string, [number, number]> = {
"SFP": [2, 3000],
@ -100,9 +156,9 @@ export async function upsertPriceObservation(params: {
return false; // Reject price outside form-factor bounds
}
// Check if price changed via content hash
// Check if price changed via content hash — also check observation age
const existing = await pool.query(
`SELECT content_hash FROM price_observations
`SELECT content_hash, time FROM price_observations
WHERE transceiver_id = $1 AND source_vendor_id = $2
ORDER BY time DESC LIMIT 1`,
[params.transceiverId, params.sourceVendorId]
@ -115,8 +171,13 @@ export async function upsertPriceObservation(params: {
);
const isCompetitor = vendorRow.rows[0]?.is_competitor === true;
if (existing.rows.length > 0 && existing.rows[0].content_hash === params.contentHash) {
// Price unchanged — still ensure verified flags are current
// Price unchanged AND observation is fresh (< 7 days old) → skip insertion
const REFRESH_DAYS = 7;
const isStale = !existing.rows.length ||
(Date.now() - new Date(existing.rows[0].time).getTime()) > REFRESH_DAYS * 24 * 60 * 60 * 1000;
if (existing.rows.length > 0 && existing.rows[0].content_hash === params.contentHash && !isStale) {
// Price unchanged and recent — still ensure verified flags are current
await pool.query(
`UPDATE transceivers SET
price_verified = true
@ -278,6 +339,7 @@ export async function upsertStockObservation(params: {
export async function findOrCreateScrapedTransceiver(params: {
partNumber: string;
vendorId: string;
productUrl?: string;
formFactor?: string;
speedGbps?: number;
speed?: string;
@ -295,13 +357,42 @@ export async function findOrCreateScrapedTransceiver(params: {
);
if (existing.rows.length > 0) {
await pool.query(
`UPDATE transceivers
SET product_page_url = COALESCE(NULLIF(product_page_url, ''), NULLIF($2, '')),
form_factor = COALESCE(NULLIF(form_factor, ''), $3),
speed_gbps = CASE WHEN speed_gbps IS NULL OR speed_gbps = 0 THEN COALESCE($4, speed_gbps) ELSE speed_gbps END,
speed = COALESCE(NULLIF(speed, ''), $5),
reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($6, reach_meters) ELSE reach_meters END,
reach_label = COALESCE(NULLIF(reach_label, ''), $7),
fiber_type = COALESCE(NULLIF(fiber_type, ''), $8),
wavelengths = COALESCE(NULLIF(wavelengths, ''), $9),
category = COALESCE(NULLIF(category, ''), $10),
updated_at = NOW()
WHERE id = $1`,
[
existing.rows[0].id,
params.productUrl || null,
params.formFactor || null,
params.speedGbps || null,
params.speed || null,
params.reachMeters || null,
params.reachLabel || null,
params.fiberType || null,
params.wavelengths || null,
params.category || null,
]
);
// Update image_url, has_image and image_verified if we have a new image for a record without one
if (params.imageUrl && !existing.rows[0].image_url) {
await pool.query(
`UPDATE transceivers SET image_url = $1, has_image = true, image_verified = true, updated_at = NOW() WHERE id = $2`,
[params.imageUrl, existing.rows[0].id]
);
await checkAndSetFullyVerified(existing.rows[0].id);
await markImageVerified(existing.rows[0].id, params.imageUrl);
}
if (params.productUrl) {
await markDetailsVerified({
transceiverId: existing.rows[0].id,
sourceUrl: params.productUrl,
});
}
return existing.rows[0].id;
}
@ -309,14 +400,42 @@ export async function findOrCreateScrapedTransceiver(params: {
// Create new transceiver entry
const slug = `scraped-${params.partNumber.toLowerCase().replace(/[^a-z0-9]+/g, "-")}`;
const result = await pool.query(
`INSERT INTO transceivers (slug, part_number, vendor_id, form_factor, speed_gbps, speed, reach_meters, reach_label, fiber_type, wavelengths, category, market_status, image_url, image_verified)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, 'Mainstream', $12, $13)
ON CONFLICT (slug) DO UPDATE SET image_url = COALESCE(transceivers.image_url, EXCLUDED.image_url), has_image = COALESCE(transceivers.has_image, EXCLUDED.has_image), image_verified = COALESCE(transceivers.image_verified, EXCLUDED.image_verified), updated_at = NOW()
`INSERT INTO transceivers (
slug, part_number, vendor_id, product_page_url, form_factor, speed_gbps,
speed, reach_meters, reach_label, fiber_type, wavelengths, category,
market_status, data_confidence, image_url, has_image, image_verified,
image_verified_at, image_verified_url, details_verified, details_verified_at,
details_source_url
)
VALUES (
$1, $2, $3, $4, $5, $6,
$7, $8, $9, $10, $11, $12,
'Mainstream', 'scraped_unverified', $13, $14, $14,
CASE WHEN $14 THEN NOW() ELSE NULL END, $13, $15,
CASE WHEN $15 THEN NOW() ELSE NULL END, $4
)
ON CONFLICT (slug) DO UPDATE SET
product_page_url = COALESCE(transceivers.product_page_url, EXCLUDED.product_page_url),
image_url = COALESCE(transceivers.image_url, EXCLUDED.image_url),
has_image = COALESCE(transceivers.has_image, false) OR COALESCE(EXCLUDED.has_image, false),
image_verified = COALESCE(transceivers.image_verified, false) OR COALESCE(EXCLUDED.image_verified, false),
image_verified_at = COALESCE(transceivers.image_verified_at, EXCLUDED.image_verified_at),
image_verified_url = COALESCE(transceivers.image_verified_url, EXCLUDED.image_verified_url),
details_verified = COALESCE(transceivers.details_verified, false) OR COALESCE(EXCLUDED.details_verified, false),
details_verified_at = COALESCE(transceivers.details_verified_at, EXCLUDED.details_verified_at),
details_source_url = COALESCE(transceivers.details_source_url, EXCLUDED.details_source_url),
data_confidence = CASE
WHEN transceivers.data_confidence IS NULL OR transceivers.data_confidence IN ('unknown', 'enriched_estimated')
THEN EXCLUDED.data_confidence
ELSE transceivers.data_confidence
END,
updated_at = NOW()
RETURNING id`,
[
slug,
params.partNumber,
params.vendorId,
params.productUrl || null,
params.formFactor || "SFP",
params.speedGbps || 0,
params.speed || "Unknown",
@ -326,10 +445,13 @@ export async function findOrCreateScrapedTransceiver(params: {
params.wavelengths || "",
params.category || "DataCenter",
params.imageUrl || null,
params.imageUrl ? true : false,
Boolean(params.imageUrl),
Boolean(params.productUrl && params.reachLabel && params.fiberType),
]
);
return result.rows[0].id;
const id = result.rows[0].id;
await checkAndSetFullyVerified(id);
return id;
}
export interface SwitchParams {