diff --git a/packages/scraper/src/scrapers/fs-com.ts b/packages/scraper/src/scrapers/fs-com.ts index 6031e11..0778aca 100644 --- a/packages/scraper/src/scrapers/fs-com.ts +++ b/packages/scraper/src/scrapers/fs-com.ts @@ -58,6 +58,7 @@ import { upsertPriceObservation, upsertStockObservation, findOrCreateScrapedTransceiver, + markImageVerified, pool, } from "../utils/db"; import { contentHash } from "../utils/hash"; @@ -73,6 +74,7 @@ const STOCK_FRESH_HOURS = parseInt(process.env["FS_STOCK_FRESH_HOURS"] ?? "12", const FORCE_REVALIDATE = process.env["TIP_FORCE_REVALIDATE"] === "1"; const ONLY_MISSING_IMAGES = process.env["FS_ONLY_MISSING_IMAGES"] === "1"; const DB_DETAIL_ONLY = process.env["FS_DB_DETAIL_ONLY"] === "1"; +const URL_DISCOVERY_ONLY = process.env["FS_URL_DISCOVERY_ONLY"] === "1"; const PROXY_URLS = (process.env["PROXY_URLS"] ?? "") .split(",") @@ -262,6 +264,7 @@ interface ProductSummary { url: string; name: string; partNumber: string; + targetTransceiverId?: string; } interface ProductDetail extends ProductSummary { @@ -394,7 +397,7 @@ async function collectProductUrls( // ── Phase 2: Scrape product detail pages ────────────────────────────────────── async function scrapeProductDetails( - requests: Array<{ url: string; userData: { name: string; partNumber: string } }>, + requests: Array<{ url: string; userData: { name: string; partNumber: string; targetTransceiverId?: string } }>, proxyConfiguration: ProxyConfiguration | undefined ): Promise { // Purge Phase 2 storage so it starts with a clean request queue @@ -425,7 +428,9 @@ async function scrapeProductDetails( const { name: listingName, partNumber: listingPn } = request.userData as { name: string; partNumber: string; + targetTransceiverId?: string; }; + const { targetTransceiverId } = request.userData as { targetTransceiverId?: string }; const url = request.url; try { @@ -732,6 +737,7 @@ async function scrapeProductDetails( specs: raw.specs, imageUrl: resolveUrl(raw.imageUrl), datasheetUrl: resolveUrl(raw.datasheetUrl), + targetTransceiverId, }); }, }, makeCrawleeConfig("fs-phase2")); @@ -789,11 +795,34 @@ export async function scrapeFs(): Promise { // ── Phase 1: Discover product URLs ───────────────────────────────────────── let productMap: Map; - if (DB_DETAIL_ONLY) { + if (URL_DISCOVERY_ONLY) { + console.log("\n[Phase 1] URL discovery mode — probing FS.COM rows without product URLs…"); + const dbRows = await pool.query( + ` + SELECT t.id, t.part_number + FROM transceivers t + JOIN vendors v ON v.id = t.vendor_id + WHERE v.name = 'FS.COM' + AND COALESCE(t.product_page_url, '') = '' + AND t.part_number ~ '^FS-[0-9]+$' + ORDER BY t.part_number + LIMIT $1 + `, + [MAX_DETAIL_PAGES_PER_RUN] + ); + productMap = new Map( + dbRows.rows.map((row) => { + const partNumber = row.part_number as string; + const productId = partNumber.replace(/^FS-/, ""); + const url = `${BASE_URL}/products/${productId}.html`; + return [url, { url, name: partNumber, partNumber, targetTransceiverId: row.id as string }]; + }) + ); + } else if (DB_DETAIL_ONLY) { console.log("\n[Phase 1] DB detail-only mode — using existing FS.COM product URLs with missing verification signals…"); const dbRows = await pool.query( ` - SELECT t.part_number, t.product_page_url + SELECT t.id, t.part_number, t.product_page_url FROM transceivers t JOIN vendors v ON v.id = t.vendor_id WHERE v.name = 'FS.COM' @@ -820,7 +849,7 @@ export async function scrapeFs(): Promise { dbRows.rows.map((row) => { const url = normalizeFsProductUrl(row.product_page_url as string); const partNumber = row.part_number as string; - return [url, { url, name: partNumber, partNumber }]; + return [url, { url, name: partNumber, partNumber, targetTransceiverId: row.id as string }]; }) ); } else { @@ -889,7 +918,14 @@ export async function scrapeFs(): Promise { // ── Phase 2: Scrape detail pages ──────────────────────────────────────────── const detailRequests = urlsToScrape.map((url) => { const s = productMap.get(url); - return { url, userData: { name: s?.name ?? "FS.com Product", partNumber: s?.partNumber ?? "" } }; + return { + url, + userData: { + name: s?.name ?? "FS.com Product", + partNumber: s?.partNumber ?? "", + targetTransceiverId: s?.targetTransceiverId, + }, + }; }); const details = await scrapeProductDetails(detailRequests, proxyConfiguration); @@ -911,20 +947,50 @@ export async function scrapeFs(): Promise { const textForInference = `${detail.name} ${detail.partNumber} ${Object.values(detail.specs).join(" ")}`; const fiberType = parsed.fiberType ?? detectFiberType(textForInference); - const transceiverId = await findOrCreateScrapedTransceiver({ - partNumber: detail.partNumber, - vendorId, - productUrl: detail.url, - formFactor: ff, - speedGbps: speedInfo?.speedGbps, - speed: speedInfo?.speed, - reachLabel: reach ?? parsed.reachLabel, - reachMeters: parsed.reachMeters, - fiberType, - wavelengths: parsed.wavelengths, - imageUrl: detail.imageUrl, - category: "DataCenter", - }); + const transceiverId = detail.targetTransceiverId ?? (await findOrCreateScrapedTransceiver({ + partNumber: detail.partNumber, + vendorId, + productUrl: detail.url, + formFactor: ff, + speedGbps: speedInfo?.speedGbps, + speed: speedInfo?.speed, + reachLabel: reach ?? parsed.reachLabel, + reachMeters: parsed.reachMeters, + fiberType, + wavelengths: parsed.wavelengths, + imageUrl: detail.imageUrl, + category: "DataCenter", + })); + + if (detail.targetTransceiverId) { + await pool.query( + `UPDATE transceivers + SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2), + form_factor = COALESCE(NULLIF(form_factor, ''), $3), + speed_gbps = CASE WHEN speed_gbps IS NULL OR speed_gbps = 0 THEN COALESCE($4, speed_gbps) ELSE speed_gbps END, + speed = COALESCE(NULLIF(speed, ''), $5), + reach_label = COALESCE(NULLIF(reach_label, ''), $6), + reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($7, reach_meters) ELSE reach_meters END, + fiber_type = COALESCE(NULLIF(fiber_type, ''), $8), + wavelengths = COALESCE(NULLIF(wavelengths, ''), $9), + updated_at = NOW() + WHERE id = $1`, + [ + transceiverId, + detail.url, + ff, + speedInfo?.speedGbps ?? null, + speedInfo?.speed ?? null, + reach ?? parsed.reachLabel ?? null, + parsed.reachMeters ?? null, + fiberType ?? null, + parsed.wavelengths ?? null, + ] + ); + if (detail.imageUrl) { + await markImageVerified(transceiverId, detail.imageUrl); + } + } const stockLevel = deriveStockLevel(detail.deQty, detail.globalQty, detail.backorderQty); const totalQty = (detail.deQty ?? 0) + (detail.globalQty ?? 0); @@ -970,14 +1036,18 @@ export async function scrapeFs(): Promise { }); if (stockNew) stockWritten++; - if (Object.keys(detail.specs).length > 0) { + const hasSourceDetails = + Object.keys(detail.specs).length > 0 || + Boolean(fiberType || parsed.connector || parsed.wavelengths || parsed.reachLabel || reach); + + if (hasSourceDetails) { const updated = await updateVerifiedSpecs({ transceiverId, fiberType, connector: parsed.connector, wavelengths: parsed.wavelengths, reachMeters: parsed.reachMeters, - reachLabel: parsed.reachLabel, + reachLabel: reach ?? parsed.reachLabel, powerConsumptionW: parsed.powerConsumptionW, tempRange: parsed.tempRange, modulation: parsed.modulation,