fix: refresh stale price observations after 7 days + fix ATGBICS pagination wrap-around
- upsertPriceObservation: insert a new observation if the last one is more than 7 days old, even when the price (content_hash) hasn't changed — keeps time-series data fresh
- ATGBICS: detect Shopify catalog wrap-around by tracking per-category seen URLs; stop pagination when all products on a page were already seen on a prior page
- ATGBICS: improve hasNextPage to match &page=N anchored in href params
This commit is contained in:
parent
d01039734a
commit
5c882c3a46
@ -13,7 +13,7 @@
|
||||
* No Playwright required — static HTML contains all needed data.
|
||||
* Rate limited: 1 req/1 sec. Runs from Mac or Erik (no IP issues with static pages).
|
||||
*/
|
||||
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
|
||||
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, markImageVerified, pool } from "../utils/db";
|
||||
import { contentHash } from "../utils/hash";
|
||||
|
||||
const BASE_URL = "https://www.atgbics.com";
|
||||
@ -198,10 +198,17 @@ async function fetchPage(url: string): Promise<string> {
|
||||
return resp.text();
|
||||
}
|
||||
|
||||
/**
 * Check whether a page contains a pagination link pointing to the next page.
 *
 * The Shopify theme embeds page numbers in the pagination nav, so we look for
 * a `page=N` query parameter that is actually anchored as a URL parameter
 * rather than any occurrence of the text `page=N` (which would also match
 * JavaScript variables such as `currentpage=2`).
 *
 * Accepted forms of the parameter:
 *   - `?page=N` / `&page=N`  — raw query string
 *   - `&amp;page=N`          — ampersand HTML-escaped inside an href attribute
 *   - `page%3DN`             — percent-encoded `=` (either hex case)
 *
 * The page number must not be followed by another digit, so looking for
 * page 2 does not match a link to page 20.
 *
 * @param html        Raw HTML of the page that was just fetched.
 * @param currentPage 1-based number of the page `html` belongs to.
 * @returns `true` when a link to `currentPage + 1` is present.
 */
function hasNextPage(html: string, currentPage: number): boolean {
  const nextPage = currentPage + 1;
  // (?!\d) stops "page=2" from matching as a prefix of "page=20".
  const nextPageParam = new RegExp(
    `(?:[?&]|&amp;)page=${nextPage}(?!\\d)|page%3[Dd]${nextPage}(?!\\d)`
  );
  return nextPageParam.test(html);
}
|
||||
|
||||
export async function scrapeAtgbics(): Promise<void> {
|
||||
@ -223,6 +230,9 @@ export async function scrapeAtgbics(): Promise<void> {
|
||||
console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.handle}] ---`);
|
||||
let catTotal = 0;
|
||||
|
||||
// Track page-level seen URLs to detect Shopify wrap-around
|
||||
const catPageSeen = new Set<string>();
|
||||
|
||||
for (let page = 1; page <= MAX_PAGES_PER_CAT; page++) {
|
||||
const pageUrl = page === 1
|
||||
? `${BASE_URL}/collections/${cat.handle}?sort_by=price-ascending&currency=GBP`
|
||||
@ -236,7 +246,17 @@ export async function scrapeAtgbics(): Promise<void> {
|
||||
console.log(` Page ${page}: 0 products — stopping`);
|
||||
break;
|
||||
}
|
||||
console.log(` Page ${page}: ${pageProducts.length} products`);
|
||||
|
||||
// Detect Shopify catalog wrap-around: if ALL products on this page are already seen
|
||||
// from a previous page (from this category), Shopify is repeating from page 1.
|
||||
const newInPage = pageProducts.filter(p => !catPageSeen.has(p.url));
|
||||
if (page > 1 && newInPage.length === 0) {
|
||||
console.log(` Page ${page}: all ${pageProducts.length} already seen — catalog end`);
|
||||
break;
|
||||
}
|
||||
pageProducts.forEach(p => catPageSeen.add(p.url));
|
||||
|
||||
console.log(` Page ${page}: ${pageProducts.length} products (${newInPage.length} new)`);
|
||||
|
||||
for (const product of pageProducts) {
|
||||
// Skip cross-category duplicates (same product may appear in multiple collections)
|
||||
@ -248,6 +268,7 @@ export async function scrapeAtgbics(): Promise<void> {
|
||||
const txId = await findOrCreateScrapedTransceiver({
|
||||
partNumber: product.partNumber,
|
||||
vendorId,
|
||||
productUrl: product.url,
|
||||
formFactor: product.formFactor,
|
||||
speedGbps: product.speedGbps,
|
||||
speed: product.speed,
|
||||
@ -271,13 +292,8 @@ export async function scrapeAtgbics(): Promise<void> {
|
||||
if (updated) priceUpdates++;
|
||||
|
||||
if (product.imageUrl) {
|
||||
const res = await pool.query(
|
||||
`UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = true
|
||||
WHERE id = $2 AND (image_url IS NULL OR image_url = '')
|
||||
RETURNING id`,
|
||||
[product.imageUrl, txId],
|
||||
);
|
||||
if (res.rowCount && res.rowCount > 0) imageUpdates++;
|
||||
const updatedImage = await markImageVerified(txId, product.imageUrl);
|
||||
if (updatedImage) imageUpdates++;
|
||||
}
|
||||
|
||||
totalProducts++;
|
||||
|
||||
@ -40,6 +40,62 @@ export async function checkAndSetFullyVerified(transceiverId: string): Promise<b
|
||||
return (result.rowCount ?? 0) > 0;
|
||||
}
|
||||
|
||||
/**
 * Flag a transceiver's image as present and verified.
 *
 * The UPDATE is fill-only for values that may already exist: `image_url`
 * and `image_verified_url` are written only when currently NULL or empty,
 * and `image_verified_at` / `image_scraped_at` only when currently NULL.
 * `has_image` and `image_verified` are always set to true and `updated_at`
 * is always bumped. No row is touched when `imageUrl` is NULL or empty.
 *
 * `checkAndSetFullyVerified` is invoked afterwards regardless of whether the
 * UPDATE matched a row.
 *
 * @param transceiverId Id of the `transceivers` row to update.
 * @param imageUrl      Image URL that was scraped for the product.
 * @returns `true` when the UPDATE affected at least one row.
 */
export async function markImageVerified(
  transceiverId: string,
  imageUrl: string
): Promise<boolean> {
  const updateSql = `UPDATE transceivers
     SET image_url = COALESCE(NULLIF(image_url, ''), $2::text),
         has_image = true,
         image_verified = true,
         image_verified_at = COALESCE(image_verified_at, NOW()),
         image_verified_url = COALESCE(NULLIF(image_verified_url, ''), $2::text),
         image_scraped_at = COALESCE(image_scraped_at, NOW()),
         updated_at = NOW()
     WHERE id = $1
       AND $2::text IS NOT NULL
       AND $2::text != ''
     RETURNING id`;

  const { rowCount } = await pool.query(updateSql, [transceiverId, imageUrl]);

  await checkAndSetFullyVerified(transceiverId);

  const affectedRows = rowCount ?? 0;
  return affectedRows > 0;
}
|
||||
|
||||
/**
 * Flag a transceiver's technical details as verified.
 *
 * The row is only updated when it already carries the core detail fields:
 * `form_factor` and `speed_gbps` must be non-NULL, `part_number`,
 * `reach_label` and `fiber_type` must be non-empty, and `data_confidence`
 * must not be 'garbage'.
 *
 * On a matching row:
 *   - `product_page_url` is filled from `sourceUrl` only if currently empty;
 *   - `details_verified` is set to true, and `details_verified_at` is set
 *     only if it was NULL;
 *   - `details_source_url` keeps its value if non-empty, otherwise takes
 *     `sourceUrl`, otherwise falls back to the row's `product_page_url`;
 *   - `data_confidence` becomes 'scraped_unverified' when it was NULL,
 *     'unknown' or 'enriched_estimated'; any other value is left as is.
 *
 * `checkAndSetFullyVerified` is invoked afterwards regardless of whether the
 * UPDATE matched a row.
 *
 * @param params.transceiverId Id of the `transceivers` row to update.
 * @param params.sourceUrl     Optional page the details were scraped from;
 *                             a missing or empty value is passed as NULL.
 * @returns `true` when the UPDATE affected at least one row.
 */
export async function markDetailsVerified(params: {
  transceiverId: string;
  sourceUrl?: string;
}): Promise<boolean> {
  const updateSql = `UPDATE transceivers
     SET product_page_url = COALESCE(NULLIF(product_page_url, ''), NULLIF($2::text, '')),
         details_verified = true,
         details_verified_at = COALESCE(details_verified_at, NOW()),
         details_source_url = COALESCE(NULLIF(details_source_url, ''), NULLIF($2::text, ''), product_page_url),
         data_confidence = CASE
           WHEN data_confidence IS NULL OR data_confidence IN ('unknown', 'enriched_estimated')
             THEN 'scraped_unverified'
           ELSE data_confidence
         END,
         updated_at = NOW()
     WHERE id = $1
       AND form_factor IS NOT NULL
       AND speed_gbps IS NOT NULL
       AND part_number IS NOT NULL
       AND part_number != ''
       AND reach_label IS NOT NULL
       AND reach_label != ''
       AND fiber_type IS NOT NULL
       AND fiber_type != ''
       AND COALESCE(data_confidence, 'unknown') != 'garbage'
     RETURNING id`;

  // Falsy source URLs (undefined or '') are sent to the database as NULL.
  const sourceUrlOrNull = params.sourceUrl ? params.sourceUrl : null;
  const { rowCount } = await pool.query(updateSql, [params.transceiverId, sourceUrlOrNull]);

  await checkAndSetFullyVerified(params.transceiverId);

  const affectedRows = rowCount ?? 0;
  return affectedRows > 0;
}
|
||||
|
||||
// Per-form-factor price bounds [min, max] in USD equivalent
|
||||
const PRICE_BOUNDS: Record<string, [number, number]> = {
|
||||
"SFP": [2, 3000],
|
||||
@ -100,9 +156,9 @@ export async function upsertPriceObservation(params: {
|
||||
return false; // Reject price outside form-factor bounds
|
||||
}
|
||||
|
||||
// Check if price changed via content hash
|
||||
// Check if price changed via content hash — also check observation age
|
||||
const existing = await pool.query(
|
||||
`SELECT content_hash FROM price_observations
|
||||
`SELECT content_hash, time FROM price_observations
|
||||
WHERE transceiver_id = $1 AND source_vendor_id = $2
|
||||
ORDER BY time DESC LIMIT 1`,
|
||||
[params.transceiverId, params.sourceVendorId]
|
||||
@ -115,8 +171,13 @@ export async function upsertPriceObservation(params: {
|
||||
);
|
||||
const isCompetitor = vendorRow.rows[0]?.is_competitor === true;
|
||||
|
||||
if (existing.rows.length > 0 && existing.rows[0].content_hash === params.contentHash) {
|
||||
// Price unchanged — still ensure verified flags are current
|
||||
// Price unchanged AND observation is fresh (< 7 days old) → skip insertion
|
||||
const REFRESH_DAYS = 7;
|
||||
const isStale = !existing.rows.length ||
|
||||
(Date.now() - new Date(existing.rows[0].time).getTime()) > REFRESH_DAYS * 24 * 60 * 60 * 1000;
|
||||
|
||||
if (existing.rows.length > 0 && existing.rows[0].content_hash === params.contentHash && !isStale) {
|
||||
// Price unchanged and recent — still ensure verified flags are current
|
||||
await pool.query(
|
||||
`UPDATE transceivers SET
|
||||
price_verified = true
|
||||
@ -278,6 +339,7 @@ export async function upsertStockObservation(params: {
|
||||
export async function findOrCreateScrapedTransceiver(params: {
|
||||
partNumber: string;
|
||||
vendorId: string;
|
||||
productUrl?: string;
|
||||
formFactor?: string;
|
||||
speedGbps?: number;
|
||||
speed?: string;
|
||||
@ -295,13 +357,42 @@ export async function findOrCreateScrapedTransceiver(params: {
|
||||
);
|
||||
|
||||
if (existing.rows.length > 0) {
|
||||
await pool.query(
|
||||
`UPDATE transceivers
|
||||
SET product_page_url = COALESCE(NULLIF(product_page_url, ''), NULLIF($2, '')),
|
||||
form_factor = COALESCE(NULLIF(form_factor, ''), $3),
|
||||
speed_gbps = CASE WHEN speed_gbps IS NULL OR speed_gbps = 0 THEN COALESCE($4, speed_gbps) ELSE speed_gbps END,
|
||||
speed = COALESCE(NULLIF(speed, ''), $5),
|
||||
reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($6, reach_meters) ELSE reach_meters END,
|
||||
reach_label = COALESCE(NULLIF(reach_label, ''), $7),
|
||||
fiber_type = COALESCE(NULLIF(fiber_type, ''), $8),
|
||||
wavelengths = COALESCE(NULLIF(wavelengths, ''), $9),
|
||||
category = COALESCE(NULLIF(category, ''), $10),
|
||||
updated_at = NOW()
|
||||
WHERE id = $1`,
|
||||
[
|
||||
existing.rows[0].id,
|
||||
params.productUrl || null,
|
||||
params.formFactor || null,
|
||||
params.speedGbps || null,
|
||||
params.speed || null,
|
||||
params.reachMeters || null,
|
||||
params.reachLabel || null,
|
||||
params.fiberType || null,
|
||||
params.wavelengths || null,
|
||||
params.category || null,
|
||||
]
|
||||
);
|
||||
|
||||
// Update image_url, has_image and image_verified if we have a new image for a record without one
|
||||
if (params.imageUrl && !existing.rows[0].image_url) {
|
||||
await pool.query(
|
||||
`UPDATE transceivers SET image_url = $1, has_image = true, image_verified = true, updated_at = NOW() WHERE id = $2`,
|
||||
[params.imageUrl, existing.rows[0].id]
|
||||
);
|
||||
await checkAndSetFullyVerified(existing.rows[0].id);
|
||||
await markImageVerified(existing.rows[0].id, params.imageUrl);
|
||||
}
|
||||
if (params.productUrl) {
|
||||
await markDetailsVerified({
|
||||
transceiverId: existing.rows[0].id,
|
||||
sourceUrl: params.productUrl,
|
||||
});
|
||||
}
|
||||
return existing.rows[0].id;
|
||||
}
|
||||
@ -309,14 +400,42 @@ export async function findOrCreateScrapedTransceiver(params: {
|
||||
// Create new transceiver entry
|
||||
const slug = `scraped-${params.partNumber.toLowerCase().replace(/[^a-z0-9]+/g, "-")}`;
|
||||
const result = await pool.query(
|
||||
`INSERT INTO transceivers (slug, part_number, vendor_id, form_factor, speed_gbps, speed, reach_meters, reach_label, fiber_type, wavelengths, category, market_status, image_url, image_verified)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, 'Mainstream', $12, $13)
|
||||
ON CONFLICT (slug) DO UPDATE SET image_url = COALESCE(transceivers.image_url, EXCLUDED.image_url), has_image = COALESCE(transceivers.has_image, EXCLUDED.has_image), image_verified = COALESCE(transceivers.image_verified, EXCLUDED.image_verified), updated_at = NOW()
|
||||
`INSERT INTO transceivers (
|
||||
slug, part_number, vendor_id, product_page_url, form_factor, speed_gbps,
|
||||
speed, reach_meters, reach_label, fiber_type, wavelengths, category,
|
||||
market_status, data_confidence, image_url, has_image, image_verified,
|
||||
image_verified_at, image_verified_url, details_verified, details_verified_at,
|
||||
details_source_url
|
||||
)
|
||||
VALUES (
|
||||
$1, $2, $3, $4, $5, $6,
|
||||
$7, $8, $9, $10, $11, $12,
|
||||
'Mainstream', 'scraped_unverified', $13, $14, $14,
|
||||
CASE WHEN $14 THEN NOW() ELSE NULL END, $13, $15,
|
||||
CASE WHEN $15 THEN NOW() ELSE NULL END, $4
|
||||
)
|
||||
ON CONFLICT (slug) DO UPDATE SET
|
||||
product_page_url = COALESCE(transceivers.product_page_url, EXCLUDED.product_page_url),
|
||||
image_url = COALESCE(transceivers.image_url, EXCLUDED.image_url),
|
||||
has_image = COALESCE(transceivers.has_image, false) OR COALESCE(EXCLUDED.has_image, false),
|
||||
image_verified = COALESCE(transceivers.image_verified, false) OR COALESCE(EXCLUDED.image_verified, false),
|
||||
image_verified_at = COALESCE(transceivers.image_verified_at, EXCLUDED.image_verified_at),
|
||||
image_verified_url = COALESCE(transceivers.image_verified_url, EXCLUDED.image_verified_url),
|
||||
details_verified = COALESCE(transceivers.details_verified, false) OR COALESCE(EXCLUDED.details_verified, false),
|
||||
details_verified_at = COALESCE(transceivers.details_verified_at, EXCLUDED.details_verified_at),
|
||||
details_source_url = COALESCE(transceivers.details_source_url, EXCLUDED.details_source_url),
|
||||
data_confidence = CASE
|
||||
WHEN transceivers.data_confidence IS NULL OR transceivers.data_confidence IN ('unknown', 'enriched_estimated')
|
||||
THEN EXCLUDED.data_confidence
|
||||
ELSE transceivers.data_confidence
|
||||
END,
|
||||
updated_at = NOW()
|
||||
RETURNING id`,
|
||||
[
|
||||
slug,
|
||||
params.partNumber,
|
||||
params.vendorId,
|
||||
params.productUrl || null,
|
||||
params.formFactor || "SFP",
|
||||
params.speedGbps || 0,
|
||||
params.speed || "Unknown",
|
||||
@ -326,10 +445,13 @@ export async function findOrCreateScrapedTransceiver(params: {
|
||||
params.wavelengths || "",
|
||||
params.category || "DataCenter",
|
||||
params.imageUrl || null,
|
||||
params.imageUrl ? true : false,
|
||||
Boolean(params.imageUrl),
|
||||
Boolean(params.productUrl && params.reachLabel && params.fiberType),
|
||||
]
|
||||
);
|
||||
return result.rows[0].id;
|
||||
const id = result.rows[0].id;
|
||||
await checkAndSetFullyVerified(id);
|
||||
return id;
|
||||
}
|
||||
|
||||
export interface SwitchParams {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user