fix: refresh stale price observations after 7 days + fix ATGBICS pagination wrap-around
- upsertPriceObservation: insert a new observation if the last one is >7 days old, even when the price (content_hash) hasn't changed — keeps timeseries data fresh
- ATGBICS: detect Shopify catalog wrap-around by tracking per-category seen URLs; stop pagination when all products on a page were already seen on a prior page
- ATGBICS: improve hasNextPage to match &page=N anchored in href params
This commit is contained in:
parent
d01039734a
commit
5c882c3a46
@ -13,7 +13,7 @@
|
|||||||
* No Playwright required — static HTML contains all needed data.
|
* No Playwright required — static HTML contains all needed data.
|
||||||
* Rate limited: 1 req/1 sec. Runs from Mac or Erik (no IP issues with static pages).
|
* Rate limited: 1 req/1 sec. Runs from Mac or Erik (no IP issues with static pages).
|
||||||
*/
|
*/
|
||||||
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
|
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, markImageVerified, pool } from "../utils/db";
|
||||||
import { contentHash } from "../utils/hash";
|
import { contentHash } from "../utils/hash";
|
||||||
|
|
||||||
const BASE_URL = "https://www.atgbics.com";
|
const BASE_URL = "https://www.atgbics.com";
|
||||||
@ -198,10 +198,17 @@ async function fetchPage(url: string): Promise<string> {
|
|||||||
return resp.text();
|
return resp.text();
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Check if a page has pagination links pointing to the next page */
|
/** Check if a page has pagination links pointing to the next page.
|
||||||
|
* Shopify theme embeds all page numbers in the pagination nav; we check for
|
||||||
|
* a link whose href explicitly contains &page=N (not just page=N anywhere). */
|
||||||
function hasNextPage(html: string, currentPage: number): boolean {
|
function hasNextPage(html: string, currentPage: number): boolean {
|
||||||
const nextPage = currentPage + 1;
|
const nextPage = currentPage + 1;
|
||||||
return html.includes(`page=${nextPage}`) || html.includes(`page%3D${nextPage}`);
|
// Look for an actual <a> href with page parameter — avoids matching JavaScript vars
|
||||||
|
return (
|
||||||
|
html.includes(`&page=${nextPage}`) ||
|
||||||
|
html.includes(`?page=${nextPage}`) ||
|
||||||
|
html.includes(`page%3D${nextPage}`)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function scrapeAtgbics(): Promise<void> {
|
export async function scrapeAtgbics(): Promise<void> {
|
||||||
@ -223,6 +230,9 @@ export async function scrapeAtgbics(): Promise<void> {
|
|||||||
console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.handle}] ---`);
|
console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.handle}] ---`);
|
||||||
let catTotal = 0;
|
let catTotal = 0;
|
||||||
|
|
||||||
|
// Track page-level seen URLs to detect Shopify wrap-around
|
||||||
|
const catPageSeen = new Set<string>();
|
||||||
|
|
||||||
for (let page = 1; page <= MAX_PAGES_PER_CAT; page++) {
|
for (let page = 1; page <= MAX_PAGES_PER_CAT; page++) {
|
||||||
const pageUrl = page === 1
|
const pageUrl = page === 1
|
||||||
? `${BASE_URL}/collections/${cat.handle}?sort_by=price-ascending&currency=GBP`
|
? `${BASE_URL}/collections/${cat.handle}?sort_by=price-ascending&currency=GBP`
|
||||||
@ -236,7 +246,17 @@ export async function scrapeAtgbics(): Promise<void> {
|
|||||||
console.log(` Page ${page}: 0 products — stopping`);
|
console.log(` Page ${page}: 0 products — stopping`);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
console.log(` Page ${page}: ${pageProducts.length} products`);
|
|
||||||
|
// Detect Shopify catalog wrap-around: if ALL products on this page are already seen
|
||||||
|
// from a previous page (from this category), Shopify is repeating from page 1.
|
||||||
|
const newInPage = pageProducts.filter(p => !catPageSeen.has(p.url));
|
||||||
|
if (page > 1 && newInPage.length === 0) {
|
||||||
|
console.log(` Page ${page}: all ${pageProducts.length} already seen — catalog end`);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
pageProducts.forEach(p => catPageSeen.add(p.url));
|
||||||
|
|
||||||
|
console.log(` Page ${page}: ${pageProducts.length} products (${newInPage.length} new)`);
|
||||||
|
|
||||||
for (const product of pageProducts) {
|
for (const product of pageProducts) {
|
||||||
// Skip cross-category duplicates (same product may appear in multiple collections)
|
// Skip cross-category duplicates (same product may appear in multiple collections)
|
||||||
@ -248,6 +268,7 @@ export async function scrapeAtgbics(): Promise<void> {
|
|||||||
const txId = await findOrCreateScrapedTransceiver({
|
const txId = await findOrCreateScrapedTransceiver({
|
||||||
partNumber: product.partNumber,
|
partNumber: product.partNumber,
|
||||||
vendorId,
|
vendorId,
|
||||||
|
productUrl: product.url,
|
||||||
formFactor: product.formFactor,
|
formFactor: product.formFactor,
|
||||||
speedGbps: product.speedGbps,
|
speedGbps: product.speedGbps,
|
||||||
speed: product.speed,
|
speed: product.speed,
|
||||||
@ -271,13 +292,8 @@ export async function scrapeAtgbics(): Promise<void> {
|
|||||||
if (updated) priceUpdates++;
|
if (updated) priceUpdates++;
|
||||||
|
|
||||||
if (product.imageUrl) {
|
if (product.imageUrl) {
|
||||||
const res = await pool.query(
|
const updatedImage = await markImageVerified(txId, product.imageUrl);
|
||||||
`UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = true
|
if (updatedImage) imageUpdates++;
|
||||||
WHERE id = $2 AND (image_url IS NULL OR image_url = '')
|
|
||||||
RETURNING id`,
|
|
||||||
[product.imageUrl, txId],
|
|
||||||
);
|
|
||||||
if (res.rowCount && res.rowCount > 0) imageUpdates++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
totalProducts++;
|
totalProducts++;
|
||||||
|
|||||||
@ -40,6 +40,62 @@ export async function checkAndSetFullyVerified(transceiverId: string): Promise<b
|
|||||||
return (result.rowCount ?? 0) > 0;
|
return (result.rowCount ?? 0) > 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export async function markImageVerified(
|
||||||
|
transceiverId: string,
|
||||||
|
imageUrl: string
|
||||||
|
): Promise<boolean> {
|
||||||
|
const result = await pool.query(
|
||||||
|
`UPDATE transceivers
|
||||||
|
SET image_url = COALESCE(NULLIF(image_url, ''), $2::text),
|
||||||
|
has_image = true,
|
||||||
|
image_verified = true,
|
||||||
|
image_verified_at = COALESCE(image_verified_at, NOW()),
|
||||||
|
image_verified_url = COALESCE(NULLIF(image_verified_url, ''), $2::text),
|
||||||
|
image_scraped_at = COALESCE(image_scraped_at, NOW()),
|
||||||
|
updated_at = NOW()
|
||||||
|
WHERE id = $1
|
||||||
|
AND $2::text IS NOT NULL
|
||||||
|
AND $2::text != ''
|
||||||
|
RETURNING id`,
|
||||||
|
[transceiverId, imageUrl]
|
||||||
|
);
|
||||||
|
await checkAndSetFullyVerified(transceiverId);
|
||||||
|
return (result.rowCount ?? 0) > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function markDetailsVerified(params: {
|
||||||
|
transceiverId: string;
|
||||||
|
sourceUrl?: string;
|
||||||
|
}): Promise<boolean> {
|
||||||
|
const result = await pool.query(
|
||||||
|
`UPDATE transceivers
|
||||||
|
SET product_page_url = COALESCE(NULLIF(product_page_url, ''), NULLIF($2::text, '')),
|
||||||
|
details_verified = true,
|
||||||
|
details_verified_at = COALESCE(details_verified_at, NOW()),
|
||||||
|
details_source_url = COALESCE(NULLIF(details_source_url, ''), NULLIF($2::text, ''), product_page_url),
|
||||||
|
data_confidence = CASE
|
||||||
|
WHEN data_confidence IS NULL OR data_confidence IN ('unknown', 'enriched_estimated')
|
||||||
|
THEN 'scraped_unverified'
|
||||||
|
ELSE data_confidence
|
||||||
|
END,
|
||||||
|
updated_at = NOW()
|
||||||
|
WHERE id = $1
|
||||||
|
AND form_factor IS NOT NULL
|
||||||
|
AND speed_gbps IS NOT NULL
|
||||||
|
AND part_number IS NOT NULL
|
||||||
|
AND part_number != ''
|
||||||
|
AND reach_label IS NOT NULL
|
||||||
|
AND reach_label != ''
|
||||||
|
AND fiber_type IS NOT NULL
|
||||||
|
AND fiber_type != ''
|
||||||
|
AND COALESCE(data_confidence, 'unknown') != 'garbage'
|
||||||
|
RETURNING id`,
|
||||||
|
[params.transceiverId, params.sourceUrl || null]
|
||||||
|
);
|
||||||
|
await checkAndSetFullyVerified(params.transceiverId);
|
||||||
|
return (result.rowCount ?? 0) > 0;
|
||||||
|
}
|
||||||
|
|
||||||
// Per-form-factor price bounds [min, max] in USD equivalent
|
// Per-form-factor price bounds [min, max] in USD equivalent
|
||||||
const PRICE_BOUNDS: Record<string, [number, number]> = {
|
const PRICE_BOUNDS: Record<string, [number, number]> = {
|
||||||
"SFP": [2, 3000],
|
"SFP": [2, 3000],
|
||||||
@ -100,9 +156,9 @@ export async function upsertPriceObservation(params: {
|
|||||||
return false; // Reject price outside form-factor bounds
|
return false; // Reject price outside form-factor bounds
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if price changed via content hash
|
// Check if price changed via content hash — also check observation age
|
||||||
const existing = await pool.query(
|
const existing = await pool.query(
|
||||||
`SELECT content_hash FROM price_observations
|
`SELECT content_hash, time FROM price_observations
|
||||||
WHERE transceiver_id = $1 AND source_vendor_id = $2
|
WHERE transceiver_id = $1 AND source_vendor_id = $2
|
||||||
ORDER BY time DESC LIMIT 1`,
|
ORDER BY time DESC LIMIT 1`,
|
||||||
[params.transceiverId, params.sourceVendorId]
|
[params.transceiverId, params.sourceVendorId]
|
||||||
@ -115,8 +171,13 @@ export async function upsertPriceObservation(params: {
|
|||||||
);
|
);
|
||||||
const isCompetitor = vendorRow.rows[0]?.is_competitor === true;
|
const isCompetitor = vendorRow.rows[0]?.is_competitor === true;
|
||||||
|
|
||||||
if (existing.rows.length > 0 && existing.rows[0].content_hash === params.contentHash) {
|
// Price unchanged AND observation is fresh (< 7 days old) → skip insertion
|
||||||
// Price unchanged — still ensure verified flags are current
|
const REFRESH_DAYS = 7;
|
||||||
|
const isStale = !existing.rows.length ||
|
||||||
|
(Date.now() - new Date(existing.rows[0].time).getTime()) > REFRESH_DAYS * 24 * 60 * 60 * 1000;
|
||||||
|
|
||||||
|
if (existing.rows.length > 0 && existing.rows[0].content_hash === params.contentHash && !isStale) {
|
||||||
|
// Price unchanged and recent — still ensure verified flags are current
|
||||||
await pool.query(
|
await pool.query(
|
||||||
`UPDATE transceivers SET
|
`UPDATE transceivers SET
|
||||||
price_verified = true
|
price_verified = true
|
||||||
@ -278,6 +339,7 @@ export async function upsertStockObservation(params: {
|
|||||||
export async function findOrCreateScrapedTransceiver(params: {
|
export async function findOrCreateScrapedTransceiver(params: {
|
||||||
partNumber: string;
|
partNumber: string;
|
||||||
vendorId: string;
|
vendorId: string;
|
||||||
|
productUrl?: string;
|
||||||
formFactor?: string;
|
formFactor?: string;
|
||||||
speedGbps?: number;
|
speedGbps?: number;
|
||||||
speed?: string;
|
speed?: string;
|
||||||
@ -295,13 +357,42 @@ export async function findOrCreateScrapedTransceiver(params: {
|
|||||||
);
|
);
|
||||||
|
|
||||||
if (existing.rows.length > 0) {
|
if (existing.rows.length > 0) {
|
||||||
|
await pool.query(
|
||||||
|
`UPDATE transceivers
|
||||||
|
SET product_page_url = COALESCE(NULLIF(product_page_url, ''), NULLIF($2, '')),
|
||||||
|
form_factor = COALESCE(NULLIF(form_factor, ''), $3),
|
||||||
|
speed_gbps = CASE WHEN speed_gbps IS NULL OR speed_gbps = 0 THEN COALESCE($4, speed_gbps) ELSE speed_gbps END,
|
||||||
|
speed = COALESCE(NULLIF(speed, ''), $5),
|
||||||
|
reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($6, reach_meters) ELSE reach_meters END,
|
||||||
|
reach_label = COALESCE(NULLIF(reach_label, ''), $7),
|
||||||
|
fiber_type = COALESCE(NULLIF(fiber_type, ''), $8),
|
||||||
|
wavelengths = COALESCE(NULLIF(wavelengths, ''), $9),
|
||||||
|
category = COALESCE(NULLIF(category, ''), $10),
|
||||||
|
updated_at = NOW()
|
||||||
|
WHERE id = $1`,
|
||||||
|
[
|
||||||
|
existing.rows[0].id,
|
||||||
|
params.productUrl || null,
|
||||||
|
params.formFactor || null,
|
||||||
|
params.speedGbps || null,
|
||||||
|
params.speed || null,
|
||||||
|
params.reachMeters || null,
|
||||||
|
params.reachLabel || null,
|
||||||
|
params.fiberType || null,
|
||||||
|
params.wavelengths || null,
|
||||||
|
params.category || null,
|
||||||
|
]
|
||||||
|
);
|
||||||
|
|
||||||
// Update image_url, has_image and image_verified if we have a new image for a record without one
|
// Update image_url, has_image and image_verified if we have a new image for a record without one
|
||||||
if (params.imageUrl && !existing.rows[0].image_url) {
|
if (params.imageUrl && !existing.rows[0].image_url) {
|
||||||
await pool.query(
|
await markImageVerified(existing.rows[0].id, params.imageUrl);
|
||||||
`UPDATE transceivers SET image_url = $1, has_image = true, image_verified = true, updated_at = NOW() WHERE id = $2`,
|
}
|
||||||
[params.imageUrl, existing.rows[0].id]
|
if (params.productUrl) {
|
||||||
);
|
await markDetailsVerified({
|
||||||
await checkAndSetFullyVerified(existing.rows[0].id);
|
transceiverId: existing.rows[0].id,
|
||||||
|
sourceUrl: params.productUrl,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
return existing.rows[0].id;
|
return existing.rows[0].id;
|
||||||
}
|
}
|
||||||
@ -309,14 +400,42 @@ export async function findOrCreateScrapedTransceiver(params: {
|
|||||||
// Create new transceiver entry
|
// Create new transceiver entry
|
||||||
const slug = `scraped-${params.partNumber.toLowerCase().replace(/[^a-z0-9]+/g, "-")}`;
|
const slug = `scraped-${params.partNumber.toLowerCase().replace(/[^a-z0-9]+/g, "-")}`;
|
||||||
const result = await pool.query(
|
const result = await pool.query(
|
||||||
`INSERT INTO transceivers (slug, part_number, vendor_id, form_factor, speed_gbps, speed, reach_meters, reach_label, fiber_type, wavelengths, category, market_status, image_url, image_verified)
|
`INSERT INTO transceivers (
|
||||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, 'Mainstream', $12, $13)
|
slug, part_number, vendor_id, product_page_url, form_factor, speed_gbps,
|
||||||
ON CONFLICT (slug) DO UPDATE SET image_url = COALESCE(transceivers.image_url, EXCLUDED.image_url), has_image = COALESCE(transceivers.has_image, EXCLUDED.has_image), image_verified = COALESCE(transceivers.image_verified, EXCLUDED.image_verified), updated_at = NOW()
|
speed, reach_meters, reach_label, fiber_type, wavelengths, category,
|
||||||
|
market_status, data_confidence, image_url, has_image, image_verified,
|
||||||
|
image_verified_at, image_verified_url, details_verified, details_verified_at,
|
||||||
|
details_source_url
|
||||||
|
)
|
||||||
|
VALUES (
|
||||||
|
$1, $2, $3, $4, $5, $6,
|
||||||
|
$7, $8, $9, $10, $11, $12,
|
||||||
|
'Mainstream', 'scraped_unverified', $13, $14, $14,
|
||||||
|
CASE WHEN $14 THEN NOW() ELSE NULL END, $13, $15,
|
||||||
|
CASE WHEN $15 THEN NOW() ELSE NULL END, $4
|
||||||
|
)
|
||||||
|
ON CONFLICT (slug) DO UPDATE SET
|
||||||
|
product_page_url = COALESCE(transceivers.product_page_url, EXCLUDED.product_page_url),
|
||||||
|
image_url = COALESCE(transceivers.image_url, EXCLUDED.image_url),
|
||||||
|
has_image = COALESCE(transceivers.has_image, false) OR COALESCE(EXCLUDED.has_image, false),
|
||||||
|
image_verified = COALESCE(transceivers.image_verified, false) OR COALESCE(EXCLUDED.image_verified, false),
|
||||||
|
image_verified_at = COALESCE(transceivers.image_verified_at, EXCLUDED.image_verified_at),
|
||||||
|
image_verified_url = COALESCE(transceivers.image_verified_url, EXCLUDED.image_verified_url),
|
||||||
|
details_verified = COALESCE(transceivers.details_verified, false) OR COALESCE(EXCLUDED.details_verified, false),
|
||||||
|
details_verified_at = COALESCE(transceivers.details_verified_at, EXCLUDED.details_verified_at),
|
||||||
|
details_source_url = COALESCE(transceivers.details_source_url, EXCLUDED.details_source_url),
|
||||||
|
data_confidence = CASE
|
||||||
|
WHEN transceivers.data_confidence IS NULL OR transceivers.data_confidence IN ('unknown', 'enriched_estimated')
|
||||||
|
THEN EXCLUDED.data_confidence
|
||||||
|
ELSE transceivers.data_confidence
|
||||||
|
END,
|
||||||
|
updated_at = NOW()
|
||||||
RETURNING id`,
|
RETURNING id`,
|
||||||
[
|
[
|
||||||
slug,
|
slug,
|
||||||
params.partNumber,
|
params.partNumber,
|
||||||
params.vendorId,
|
params.vendorId,
|
||||||
|
params.productUrl || null,
|
||||||
params.formFactor || "SFP",
|
params.formFactor || "SFP",
|
||||||
params.speedGbps || 0,
|
params.speedGbps || 0,
|
||||||
params.speed || "Unknown",
|
params.speed || "Unknown",
|
||||||
@ -326,10 +445,13 @@ export async function findOrCreateScrapedTransceiver(params: {
|
|||||||
params.wavelengths || "",
|
params.wavelengths || "",
|
||||||
params.category || "DataCenter",
|
params.category || "DataCenter",
|
||||||
params.imageUrl || null,
|
params.imageUrl || null,
|
||||||
params.imageUrl ? true : false,
|
Boolean(params.imageUrl),
|
||||||
|
Boolean(params.productUrl && params.reachLabel && params.fiberType),
|
||||||
]
|
]
|
||||||
);
|
);
|
||||||
return result.rows[0].id;
|
const id = result.rows[0].id;
|
||||||
|
await checkAndSetFullyVerified(id);
|
||||||
|
return id;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface SwitchParams {
|
export interface SwitchParams {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user