diff --git a/packages/scraper/src/scrapers/ascentoptics.ts b/packages/scraper/src/scrapers/ascentoptics.ts index 4543633..2f93e2b 100644 --- a/packages/scraper/src/scrapers/ascentoptics.ts +++ b/packages/scraper/src/scrapers/ascentoptics.ts @@ -45,6 +45,7 @@ interface Product { partNumber: string; name: string; url: string; + imageUrl?: string; formFactor: string; speed: string; speedGbps: number; @@ -156,11 +157,16 @@ function parseProductTable( const combined = `${rawPart} ${desc}`; const reach = detectReach(combined); + const rawImg = $(cells[0]).find("img").first().attr("src") || $(cells[0]).find("img").first().attr("data-src"); + const imageUrl = rawImg && !/(logo|placeholder|default|no-image|icon|sprite)/i.test(rawImg) + ? (rawImg.startsWith("http") ? rawImg : `${BASE}${rawImg.startsWith("/") ? "" : "/"}${rawImg}`) + : undefined; products.push({ partNumber: rawPart, name: desc || rawPart, url, + imageUrl, formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps, @@ -211,6 +217,7 @@ export async function scrapeAscentOptics(): Promise { await findOrCreateScrapedTransceiver({ partNumber: product.partNumber, vendorId, + productUrl: product.url, formFactor: product.formFactor, speedGbps: product.speedGbps, speed: product.speed, @@ -219,6 +226,7 @@ export async function scrapeAscentOptics(): Promise { fiberType: product.fiberType, wavelengths: product.wavelength, category: "DataCenter", + imageUrl: product.imageUrl, }); totalProducts++; } catch (err) { diff --git a/packages/scraper/src/scrapers/eoptolink.ts b/packages/scraper/src/scrapers/eoptolink.ts index 8c75726..d282e95 100644 --- a/packages/scraper/src/scrapers/eoptolink.ts +++ b/packages/scraper/src/scrapers/eoptolink.ts @@ -57,6 +57,12 @@ function speedFromSlug(slug: string): { speed: string; speedGbps: number } { return { speed: "Unknown", speedGbps: 0 }; } +function speedFromTitleThenSlug(title: string, slug: string): { speed: string; speedGbps: number } { + const titleSpeed = speedFromSlug(title); + if (titleSpeed.speedGbps > 0) 
return titleSpeed; + return speedFromSlug(slug); +} + function formFactorFromText(text: string): string { const t = text.toUpperCase(); if (/\bOSFP\b/.test(t)) return "OSFP"; @@ -124,10 +130,44 @@ interface EoptolinkProduct { speedGbps: number; formFactor: string; fiberType: string; + reachLabel?: string; + reachMeters?: number; + wavelength?: string; + imageUrl?: string; category: string; pageUrl: string; } +function reachFromText(text: string): { label: string; meters: number } | undefined { + const patterns: [RegExp, string, number][] = [ + [/\b120\s*km\b/i, "120km", 120000], + [/\b100\s*km\b/i, "100km", 100000], + [/\b80\s*km\b/i, "80km", 80000], + [/\b40\s*km\b/i, "40km", 40000], + [/\b20\s*km\b/i, "20km", 20000], + [/\b10\s*km\b/i, "10km", 10000], + [/\b2\s*km\b/i, "2km", 2000], + [/\b500\s*m\b/i, "500m", 500], + [/\b300\s*m\b/i, "300m", 300], + [/\b100\s*m\b/i, "100m", 100], + [/\bZR\b/i, "80km", 80000], + [/\bER\b/i, "40km", 40000], + [/\bLR\b/i, "10km", 10000], + [/\bFR\b/i, "2km", 2000], + [/\bDR\b/i, "500m", 500], + [/\bSR\b/i, "300m", 300], + ]; + for (const [regex, label, meters] of patterns) { + if (regex.test(text)) return { label, meters }; + } + return undefined; +} + +function wavelengthFromText(text: string): string { + const match = text.match(/(\d{3,4})\s*nm/i); + return match ? match[1] : ""; +} + function parseProductPage(html: string, pageUrl: string): EoptolinkProduct | null { // Page title const titleMatch = html.match(/([^<]+)/i) || html.match(/<h1[^>]*>([^<]{5,80})</i); @@ -138,13 +178,36 @@ function parseProductPage(html: string, pageUrl: string): EoptolinkProduct | nul const pnRegex = /E[A-Z]{2,5}-\d{2,3}[A-Z0-9]{1,3}(?:-\d{1,3})?(?:-[A-Z0-9]{1,6})*/g; const partNumbers = [...new Set([...(html.matchAll(pnRegex) ?? 
[])].map((m) => m[0].trim()))]; - const slug = pageUrl.split("/").slice(-2).join("-"); - const { speed, speedGbps } = speedFromSlug(slug + " " + pageTitle); - const formFactor = formFactorFromText(pageTitle + " " + slug); - const fiberType = fiberFromText(pageTitle + " " + slug); - const category = categoryFromText(pageTitle + " " + slug); + const slug = pageUrl.split("/").filter(Boolean).slice(-2).join("-"); + const titleEvidence = `${pageTitle} ${slug}`; + const pageEvidence = `${titleEvidence} ${html.replace(/<[^>]+>/g, " ").slice(0, 12000)}`; + const { speed, speedGbps } = speedFromTitleThenSlug(pageTitle, slug); + const formFactor = formFactorFromText(titleEvidence); + const fiberType = fiberFromText(titleEvidence); + const reach = reachFromText(pageEvidence); + const wavelength = wavelengthFromText(pageEvidence); + const category = categoryFromText(titleEvidence); + const rawImage = + html.match(/<meta\s+property="og:image"\s+content="([^"]+)"/i)?.[1] || + html.match(/<img[^>]+src="([^"]+)"/i)?.[1]; + const imageUrl = rawImage && !/(logo|placeholder|default|no-image|icon|sprite)/i.test(rawImage) + ? (rawImage.startsWith("http") ? rawImage : `${BASE}${rawImage.startsWith("/") ? 
"" : "/"}${rawImage}`) + : undefined; - return { pageTitle, partNumbers, speed, speedGbps, formFactor, fiberType, category, pageUrl }; + return { + pageTitle, + partNumbers, + speed, + speedGbps, + formFactor, + fiberType, + reachLabel: reach?.label, + reachMeters: reach?.meters, + wavelength, + imageUrl, + category, + pageUrl, + }; } // ── Main ──────────────────────────────────────────────────────────────────── @@ -193,15 +256,47 @@ export async function scrapeEoptolink(): Promise<void> { for (const partNumber of namesToSeed) { try { - await findOrCreateScrapedTransceiver({ + const txId = await findOrCreateScrapedTransceiver({ partNumber: partNumber.slice(0, 80), vendorId, + productUrl: url, formFactor: product.formFactor, speedGbps: product.speedGbps, speed: product.speed, + reachMeters: product.reachMeters, + reachLabel: product.reachLabel, fiberType: product.fiberType, + wavelengths: product.wavelength, category: product.category, + imageUrl: product.imageUrl, }); + + await pool.query(` + UPDATE transceivers + SET form_factor = $2, + speed_gbps = $3, + speed = $4, + reach_meters = CASE WHEN $5::int IS NOT NULL THEN $5::int ELSE reach_meters END, + reach_label = COALESCE(NULLIF($6::text, ''), reach_label), + fiber_type = COALESCE(NULLIF($7::text, ''), fiber_type), + wavelengths = COALESCE(NULLIF($8::text, ''), wavelengths), + product_page_url = COALESCE(NULLIF($9::text, ''), product_page_url), + updated_at = NOW() + WHERE id = $1 + AND vendor_id = $10 + `, [ + txId, + product.formFactor, + product.speedGbps, + product.speed, + product.reachMeters ?? null, + product.reachLabel ?? 
null, + product.fiberType || null, + product.wavelength || null, + url, + vendorId, + ]); + added++; } catch (dbErr) { // Duplicate or constraint error — expected for re-runs diff --git a/packages/scraper/src/scrapers/fs-com.ts b/packages/scraper/src/scrapers/fs-com.ts index 0778aca..b377c8c 100644 --- a/packages/scraper/src/scrapers/fs-com.ts +++ b/packages/scraper/src/scrapers/fs-com.ts @@ -226,7 +226,7 @@ function detectFormFactor(text: string): string | undefined { function detectSpeed(text: string): { speed: string; speedGbps: number } | undefined { const patterns: [RegExp, string, number][] = [ - [/1\.6\s*t/i, "1.6T", 1600], + [/1[\.,]6\s*t|1600\s*g/i, "1.6T", 1600], [/800\s*g/i, "800G", 800], [/400\s*g/i, "400G", 400], [/200\s*g/i, "200G", 200], @@ -248,6 +248,15 @@ function detectReach(text: string): string | undefined { return m ? `${m[1].replace(/,/g, "")}${m[2].toLowerCase()}` : undefined; } +function reachMetersFromLabel(label?: string): number | undefined { + if (!label) return undefined; + const match = label.match(/^(\d+(?:\.\d+)?)(m|km)$/i); + if (!match) return undefined; + const value = parseFloat(match[1]); + if (!Number.isFinite(value) || value <= 0) return undefined; + return match[2].toLowerCase() === "km" ? Math.round(value * 1000) : Math.round(value); +} + function detectFiberType(text: string): string | undefined { if (/active\s+optical|\baoc\b/i.test(text)) return "AOC"; if (/copper|dac|twinax|direct\s+attach|rj-?45|base-t/i.test(text)) return "Copper"; @@ -946,6 +955,8 @@ export async function scrapeFs(): Promise<void> { const parsed = parseSpecTable(detail.specs); const textForInference = `${detail.name} ${detail.partNumber} ${Object.values(detail.specs).join(" ")}`; const fiberType = parsed.fiberType ?? detectFiberType(textForInference); + const reachLabel = reach ?? parsed.reachLabel; + const reachMeters = parsed.reachMeters ?? reachMetersFromLabel(reachLabel); const transceiverId = detail.targetTransceiverId ?? 
(await findOrCreateScrapedTransceiver({ partNumber: detail.partNumber, @@ -954,8 +965,8 @@ export async function scrapeFs(): Promise<void> { formFactor: ff, speedGbps: speedInfo?.speedGbps, speed: speedInfo?.speed, - reachLabel: reach ?? parsed.reachLabel, - reachMeters: parsed.reachMeters, + reachLabel, + reachMeters, fiberType, wavelengths: parsed.wavelengths, imageUrl: detail.imageUrl, @@ -968,7 +979,12 @@ export async function scrapeFs(): Promise<void> { SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2), form_factor = COALESCE(NULLIF(form_factor, ''), $3), speed_gbps = CASE WHEN speed_gbps IS NULL OR speed_gbps = 0 THEN COALESCE($4, speed_gbps) ELSE speed_gbps END, - speed = COALESCE(NULLIF(speed, ''), $5), + speed = CASE + WHEN $5::text IS NOT NULL + AND (speed IS NULL OR speed = '' OR speed = 'Unknown' OR $4::numeric = speed_gbps) + THEN $5::text + ELSE speed + END, reach_label = COALESCE(NULLIF(reach_label, ''), $6), reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($7, reach_meters) ELSE reach_meters END, fiber_type = COALESCE(NULLIF(fiber_type, ''), $8), @@ -981,8 +997,8 @@ export async function scrapeFs(): Promise<void> { ff, speedInfo?.speedGbps ?? null, speedInfo?.speed ?? null, - reach ?? parsed.reachLabel ?? null, - parsed.reachMeters ?? null, + reachLabel ?? null, + reachMeters ?? null, fiberType ?? null, parsed.wavelengths ?? null, ] @@ -1038,7 +1054,7 @@ export async function scrapeFs(): Promise<void> { const hasSourceDetails = Object.keys(detail.specs).length > 0 || - Boolean(fiberType || parsed.connector || parsed.wavelengths || parsed.reachLabel || reach); + Boolean(fiberType || parsed.connector || parsed.wavelengths || reachLabel); if (hasSourceDetails) { const updated = await updateVerifiedSpecs({ @@ -1046,8 +1062,8 @@ export async function scrapeFs(): Promise<void> { fiberType, connector: parsed.connector, wavelengths: parsed.wavelengths, - reachMeters: parsed.reachMeters, - reachLabel: reach ?? 
parsed.reachLabel, + reachMeters, + reachLabel, powerConsumptionW: parsed.powerConsumptionW, tempRange: parsed.tempRange, modulation: parsed.modulation, diff --git a/packages/scraper/src/scrapers/gaotek.ts b/packages/scraper/src/scrapers/gaotek.ts index e02c83a..a2ad158 100644 --- a/packages/scraper/src/scrapers/gaotek.ts +++ b/packages/scraper/src/scrapers/gaotek.ts @@ -31,6 +31,7 @@ interface Product { reachMeters?: number; fiberType?: string; wavelength?: string; + imageUrl?: string; } function sleep(ms: number): Promise<void> { @@ -116,6 +117,13 @@ function parseProductList(html: string): Product[] { const ff = detectFormFactor(name); const reach = detectReach(name); + const rawImg = + $(el).find("img").first().attr("data-src") || + $(el).find("img").first().attr("data-lazy-src") || + $(el).find("img").first().attr("src"); + const imageUrl = rawImg && !/(logo|placeholder|default|no-image|icon|sprite)/i.test(rawImg) + ? (rawImg.startsWith("http") ? rawImg : BASE + rawImg) + : undefined; products.push({ partNumber, @@ -127,6 +135,7 @@ function parseProductList(html: string): Product[] { reachMeters: reach?.meters, fiberType: detectFiber(name), wavelength: detectWavelength(name), + imageUrl, }); }); @@ -150,11 +159,19 @@ function parseProductList(html: string): Product[] { } const ff = detectFormFactor(name); const reach = detectReach(name); + const rawImg = + $(el).find("img").first().attr("data-src") || + $(el).find("img").first().attr("data-lazy-src") || + $(el).find("img").first().attr("src"); + const imageUrl = rawImg && !/(logo|placeholder|default|no-image|icon|sprite)/i.test(rawImg) + ? (rawImg.startsWith("http") ? 
rawImg : BASE + rawImg) + : undefined; products.push({ partNumber: name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] || name.replace(/\s+/g, "-").slice(0, 60), name, url, price, ...ff, reachLabel: reach?.label, reachMeters: reach?.meters, fiberType: detectFiber(name), wavelength: detectWavelength(name), + imageUrl, }); }); } @@ -224,6 +241,7 @@ export async function scrapeGaoTek(): Promise<void> { const txId = await findOrCreateScrapedTransceiver({ partNumber: product.partNumber, vendorId, + productUrl: product.url, formFactor: product.formFactor, speedGbps: product.speedGbps, speed: product.speed, @@ -232,6 +250,7 @@ export async function scrapeGaoTek(): Promise<void> { fiberType: product.fiberType, wavelengths: product.wavelength, category: "DataCenter", + imageUrl: product.imageUrl, }); if (product.price && product.price > 0) { diff --git a/packages/scraper/src/scrapers/naddod.ts b/packages/scraper/src/scrapers/naddod.ts index 81bfcb3..7316b2d 100644 --- a/packages/scraper/src/scrapers/naddod.ts +++ b/packages/scraper/src/scrapers/naddod.ts @@ -15,7 +15,15 @@ * * Rate limited: 1 req/2sec. 
*/ -import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, upsertStockObservation } from "../utils/db"; +import { + pool, + findOrCreateScrapedTransceiver, + ensureVendor, + upsertPriceObservation, + upsertStockObservation, + markImageVerified, + markDetailsVerified, +} from "../utils/db"; import { contentHash } from "../utils/hash"; import { readFileSync, writeFileSync, existsSync } from "node:fs"; import { join } from "node:path"; @@ -29,7 +37,8 @@ const HEADERS = { }; // Limit detail-page fetches per run to stay reasonable -const MAX_DETAIL_PAGES = 600; +const MAX_DETAIL_PAGES = Math.max(1, Math.min(1000, parseInt(process.env["NADDOD_MAX_DETAIL_PAGES"] || "600", 10) || 600)); +const DB_DETAIL_ONLY = process.env["NADDOD_DB_DETAIL_ONLY"] === "1"; // Cursor file: persists across runs so each run covers the next 600 URLs const CURSOR_FILE = join(process.env["TIP_STORAGE_DIR"] ?? "/opt/tip", "naddod-cursor.json"); @@ -88,6 +97,26 @@ function detectSpeedGbps(text: string): { speed: string; speedGbps: number } { } function detectReach(text: string): { label: string; meters: number } | undefined { + const cableCode = text.match(/\b(?:AOC|DAC|CU|COPPER|MCP|MFS)[A-Z0-9._-]*?(\d+(?:\.\d+)?)M\b/i); + if (cableCode) { + const value = parseFloat(cableCode[1]); + if (Number.isFinite(value) && value > 0 && value <= 500) { + return { label: `${String(value).replace(/\.0$/, "")}m`, meters: Math.round(value) }; + } + } + + const generic = text.match(/\b(\d+(?:\.\d+)?)\s*(km|m)\b/i); + if (generic) { + const value = parseFloat(generic[1]); + const unit = generic[2].toLowerCase(); + if (Number.isFinite(value) && value > 0) { + const meters = unit === "km" ? Math.round(value * 1000) : Math.round(value); + const labelValue = String(value).replace(/\.0$/, ""); + const label = unit === "km" ? 
`${labelValue}km` : `${labelValue}m`; + return { label, meters }; + } + } + const patterns: [RegExp, string, number][] = [ [/\b120\s*km\b/i, "120km", 120000], [/\b80\s*km\b/i, "80km", 80000], @@ -102,8 +131,9 @@ function detectReach(text: string): { label: string; meters: number } | undefine [/\b150\s*m\b/i, "150m", 150], [/\b100\s*m\b/i, "100m", 100], [/\bLR4\b/, "10km", 10000], [/\bLR\b/, "10km", 10000], - [/\bER4?\b/, "40km", 40000], [/\bZR4?\b/, "80km", 80000], - [/\bSR4?\b/, "300m", 300], [/\bDR4?\b/, "500m", 500], + [/\bER4?\b/, "40km", 40000], [/\bZRP?\b|\bZR4?\b/, "80km", 80000], + [/\bSR\d*\b|\bVR\d*\b/, "100m", 100], + [/\bDR4?\b|\bXDR\d*\b/, "500m", 500], [/\bFR4?\b/, "2km", 2000], ]; for (const [re, label, meters] of patterns) { @@ -113,9 +143,10 @@ function detectReach(text: string): { label: string; meters: number } | undefine } function detectFiber(text: string): string { - if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text)) return "SMF"; - if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(text)) return "MMF"; if (/copper|dac|twinax|rj.?45|base-t/i.test(text)) return "Copper"; + if (/aoc|active.?optical|active.?fiber|[^a-z]sr\d*(?![a-z])|[^a-z]vr\d*(?![a-z])|850\s*nm/i.test(text)) return "MMF"; + if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr(?![a-z])|[^a-z]dr\d*(?![a-z])|[^a-z]fr\d*(?![a-z])|xdr\d*|psm|bidi|cwdm|dwdm|1310\s*nm|1550\s*nm/i.test(text)) return "SMF"; + if (/multi.?mode|mmf|[^a-z]sx[^a-z]/i.test(text)) return "MMF"; return ""; } @@ -175,6 +206,7 @@ async function fetchText(url: string): Promise<string> { */ function parseDetailPage(html: string, url: string): { name: string; + imageUrl?: string; price?: number; stock: { qty?: number; confidence: 1 | 2 } | null; } | null { @@ -187,6 +219,20 @@ function parseDetailPage(html: string, url: string): { if (!name || name.length < 10) return null; if (!isTransceiver(name)) return null; + const imageUrl = (() 
=> { + const candidates = [ + html.match(/<meta\s+property="og:image"\s+content="([^"]+)"/i)?.[1], + html.match(/<meta\s+name="twitter:image"\s+content="([^"]+)"/i)?.[1], + html.match(/"image"\s*:\s*"([^"]+)"/i)?.[1], + ].filter(Boolean) as string[]; + + const img = candidates.find((candidate) => + !/(logo|placeholder|default|no-image|icon|sprite)/i.test(candidate) + ); + if (!img) return undefined; + return img.startsWith("http") ? img : `${BASE}${img.startsWith("/") ? "" : "/"}${img}`; + })(); + // Price: prefer LD+JSON offers.price (Astro/Shopify structure, e.g. "price":"731.00") // Fall back to "US$ 10.90" or "$10.90" visible text patterns let price: number | undefined; @@ -220,7 +266,7 @@ function parseDetailPage(html: string, url: string): { // Stock count const stock = parseStockText(html); - return { name, price, stock }; + return { name, imageUrl, price, stock }; } // ── Sitemap parsing ───────────────────────────────────────────────────────── @@ -247,6 +293,25 @@ async function fetchProductUrlsFromSitemap(): Promise<string[]> { return [...new Set(urls)]; // deduplicate } +async function fetchDbTargets(limit: number): Promise<Array<{ url: string; targetTransceiverId: string }>> { + const result = await pool.query<{ id: string; product_page_url: string }>(` + SELECT t.id, t.product_page_url + FROM transceivers t + JOIN vendors v ON v.id = t.vendor_id + WHERE v.name = 'NADDOD' + AND t.product_page_url IS NOT NULL + AND t.product_page_url != '' + AND (t.details_verified = false OR t.image_verified = false) + ORDER BY t.details_verified ASC, t.image_verified ASC, t.updated_at ASC + LIMIT $1 + `, [limit]); + + return result.rows.map((row) => ({ + url: row.product_page_url, + targetTransceiverId: row.id, + })); +} + // ── Main scraper ──────────────────────────────────────────────────────────── export async function scrapeNaddod(): Promise<void> { @@ -262,11 +327,18 @@ export async function scrapeNaddod(): Promise<void> { // ── Phase 1: Discover product URLs via 
sitemap ──────────────────────────── console.log("[Phase 1] Discovering products from sitemap..."); let productUrls: string[] = []; + let targets: Array<{ url: string; targetTransceiverId?: string }> = []; try { - productUrls = await fetchProductUrlsFromSitemap(); - console.log(` Found ${productUrls.length} product URLs in sitemap`); + if (DB_DETAIL_ONLY) { + targets = await fetchDbTargets(MAX_DETAIL_PAGES); + productUrls = targets.map((target) => target.url); + console.log(` DB detail targets: ${productUrls.length}`); + } else { + productUrls = await fetchProductUrlsFromSitemap(); + console.log(` Found ${productUrls.length} product URLs in sitemap`); + } } catch (err) { - console.error(` Sitemap fetch failed: ${(err as Error).message}`); + console.error(` Target discovery failed: ${(err as Error).message}`); return; } @@ -278,16 +350,16 @@ export async function scrapeNaddod(): Promise<void> { // Cursor-based rotation: each run advances by MAX_DETAIL_PAGES so over ~12 runs // (24 hours) we cover all ~7300 products. Wraps around when exhausted. const totalUrls = productUrls.length; - const offset = readCursor() % totalUrls; + const offset = DB_DETAIL_ONLY ? 0 : readCursor() % totalUrls; const endIdx = Math.min(offset + MAX_DETAIL_PAGES, totalUrls); - let urls = productUrls.slice(offset, endIdx); - // Wrap around if we got fewer than MAX_DETAIL_PAGES (hit the end of the list) - if (urls.length < MAX_DETAIL_PAGES && totalUrls > MAX_DETAIL_PAGES) { - const wrap = MAX_DETAIL_PAGES - urls.length; - urls = urls.concat(productUrls.slice(0, wrap)); + let batchTargets: Array<{ url: string; targetTransceiverId?: string }> = + DB_DETAIL_ONLY ? 
targets : productUrls.slice(offset, endIdx).map((url) => ({ url })); + if (!DB_DETAIL_ONLY && batchTargets.length < MAX_DETAIL_PAGES && totalUrls > MAX_DETAIL_PAGES) { + const wrap = MAX_DETAIL_PAGES - batchTargets.length; + batchTargets = batchTargets.concat(productUrls.slice(0, wrap).map((url) => ({ url }))); } - const nextOffset = (offset + MAX_DETAIL_PAGES) % totalUrls; - console.log(` Offset: ${offset}/${totalUrls} → processing ${urls.length} products (next run starts at ${nextOffset})`); + const nextOffset = DB_DETAIL_ONLY ? offset : (offset + MAX_DETAIL_PAGES) % totalUrls; + console.log(` Offset: ${offset}/${totalUrls} → processing ${batchTargets.length} products (next run starts at ${nextOffset})`); // ── Phase 2: Fetch detail pages + write to DB ───────────────────────────── console.log("\n[Phase 2] Fetching product detail pages..."); @@ -299,7 +371,8 @@ export async function scrapeNaddod(): Promise<void> { let skippedNonTx = 0; let errors = 0; - for (const url of urls) { + for (const target of batchTargets) { + const url = target.url; await sleep(2000); try { const html = await fetchText(url); @@ -310,28 +383,64 @@ export async function scrapeNaddod(): Promise<void> { continue; } - const { name, price, stock } = detail; - const { speed, speedGbps } = detectSpeedGbps(name); - const formFactor = detectFormFactor(name); - const reach = detectReach(name); - const fiberType = detectFiber(name); - const wavelength = detectWavelength(name); + const { name, imageUrl, price, stock } = detail; + const evidenceText = `${name} ${html.replace(/<[^>]+>/g, " ").slice(0, 20000)}`; + const { speed, speedGbps } = detectSpeedGbps(evidenceText); + const formFactor = detectFormFactor(evidenceText); + const reach = detectReach(evidenceText); + const fiberType = detectFiber(evidenceText); + const wavelength = detectWavelength(evidenceText); // Extract part number from name (first word-group before "Compatible" or vendor name) const partNumber = 
name.split(/\s+(?:compatible|for\s+[A-Z]|sfp|qsfp)/i)[0]?.trim().slice(0, 80) || name.slice(0, 60); - const txId = await findOrCreateScrapedTransceiver({ - partNumber, - vendorId, - formFactor, - speedGbps, - speed, - reachMeters: reach?.meters, - reachLabel: reach?.label, - fiberType, - wavelengths: wavelength, - category: "DataCenter", - }); + let txId = target.targetTransceiverId; + + if (txId) { + await pool.query(` + UPDATE transceivers + SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2), + form_factor = COALESCE(NULLIF($3::text, ''), form_factor), + speed_gbps = CASE WHEN $4::numeric > 0 THEN $4::numeric ELSE speed_gbps END, + speed = CASE WHEN $4::numeric > 0 THEN $5 ELSE speed END, + reach_meters = CASE WHEN $6::int IS NOT NULL AND $6::int > 0 THEN $6::int ELSE reach_meters END, + reach_label = COALESCE(NULLIF($7::text, ''), reach_label), + fiber_type = COALESCE(NULLIF($8::text, ''), fiber_type), + wavelengths = COALESCE(NULLIF($9::text, ''), wavelengths), + category = COALESCE(NULLIF(category, ''), 'DataCenter'), + updated_at = NOW() + WHERE id = $1 + AND vendor_id = $10 + `, [ + txId, + url, + formFactor, + speedGbps, + speed, + reach?.meters ?? null, + reach?.label ?? 
null, + fiberType || null, + wavelength || null, + vendorId, + ]); + if (imageUrl) await markImageVerified(txId, imageUrl); + await markDetailsVerified({ transceiverId: txId, sourceUrl: url }); + } else { + txId = await findOrCreateScrapedTransceiver({ + partNumber, + vendorId, + productUrl: url, + formFactor, + speedGbps, + speed, + reachMeters: reach?.meters, + reachLabel: reach?.label, + fiberType, + wavelengths: wavelength, + category: "DataCenter", + imageUrl, + }); + } // Price observation if (price && price > 0) { @@ -368,7 +477,7 @@ export async function scrapeNaddod(): Promise<void> { processed++; if (processed % 50 === 0) { - console.log(` Progress: ${processed}/${urls.length} | prices: ${priceUpdates} | stock: ${stockWritten} new / ${stockSkipped} unchanged`); + console.log(` Progress: ${processed}/${batchTargets.length} | prices: ${priceUpdates} | stock: ${stockWritten} new / ${stockSkipped} unchanged`); } } catch (err) { errors++; @@ -377,10 +486,10 @@ export async function scrapeNaddod(): Promise<void> { } // Advance cursor for next run - writeCursor(nextOffset); + if (!DB_DETAIL_ONLY) writeCursor(nextOffset); console.log("\n=== NADDOD Scraper v2 Complete ==="); - console.log(` URL range processed: ${offset}–${offset + urls.length - 1} of ${totalUrls}`); + console.log(` URL range processed: ${offset}–${offset + batchTargets.length - 1} of ${totalUrls}`); console.log(` Products processed: ${processed}`); console.log(` Non-transceivers skip: ${skippedNonTx}`); console.log(` Price observations: ${priceUpdates} new`); diff --git a/sync/CURRENT.md b/sync/CURRENT.md index c44321d..3611535 100644 --- a/sync/CURRENT.md +++ b/sync/CURRENT.md @@ -1,9 +1,102 @@ # Current TIP Sync State -Updated: 2026-05-09 12:16 UTC +Updated: 2026-05-09 13:54 UTC ## Newest Work +- FS.com 1.6T DR8/2FR4 source correction on 2026-05-09: + - operator spotted that FS.com has two distinct 1.6T OSFP variants on the same family: + - `OSFP-DR8-1.6T-FL`: 500m, DR8, SMF + - 
`OSFP-2FR4-1.6T-FL`: 2km, 2FR4, SMF + - confirmed in TIP DB: + - both FS.com variants exist as separate rows + - `OSFP-2FR4-1.6T-FL` had `reach_meters=0` even though the source and row label said `2km` + - `OSFP-DR8-1.6T-FL` had no wavelength, causing the deterministic equivalence worker to reject the otherwise correct 500m Flexoptix match + - live DB correction: + - `OSFP-DR8-1.6T-FL` + - `speed=1.6T` + - `speed_gbps=1600` + - `reach_label=500m` + - `reach_meters=500` + - `fiber_type=SMF` + - `wavelengths=1310` + - `standard_name=1.6T OSFP DR8` + - fully verified remains true + - `OSFP-2FR4-1.6T-FL` + - `speed=1.6T` + - `speed_gbps=1600` + - `reach_label=2km` + - `reach_meters=2000` + - `fiber_type=SMF` + - `wavelengths=1310` + - `standard_name=1.6T OSFP 2FR4` + - fully verified true + - Flexoptix `O.1316T.C.05.M` + - confirmed as `500m`, `SMF`, `1.6T` + - `standard_name=1.6T OSFP DR8` + - equivalence correction: + - approved only `O.1316T.C.05.M` ↔ `OSFP-DR8-1.6T-FL` + - confidence `0.913` + - match basis: form factor, speed, reach, fiber, wavelength and source variant DR8/500m + - `OSFP-2FR4-1.6T-FL` remains separate and is not linked to the 500m DR8 Flexoptix product + - scraper hardening: + - `packages/scraper/src/scrapers/fs-com.ts` + - recognizes German/decimal `1,6T` and `1600G` as `1.6T`/`1600` + - converts reach labels such as `2km` into `reach_meters=2000` + - updates stale `speed` labels when the numeric source speed matches the row + - build: + - `pnpm -C packages/scraper build` passed on Erik + - truth: + - there are definitely two separate FS.com variants + - 500m DR8 is the correct equivalent for Flexoptix `O.1316T.C.05.M` + - 2km FR4 is a separate DB product and must not be collapsed into the 500m match + +- Targeted vendor verification push after equivalence revalidation on 2026-05-09: + - code improved: + - `NADDOD_DB_DETAIL_ONLY=1` mode verifies existing NADDOD rows with source URLs instead of rotating blindly through the full sitemap + - NADDOD 
now extracts `og:image`, source product URLs, reach/fiber/wavelength from page evidence, AOC/DAC cable lengths, and DR/FR/SR/VR/XDR patterns + - GAO Tek now writes product URLs and image evidence + - Ascent Optics now writes product URLs and table image evidence + - Eoptolink now writes product URLs, images, reach/wavelength evidence and corrects over-broad form-factor parsing by preferring title/slug evidence + - live low-load Erik runs: + - GAO Tek static crawl: + - `473` unique products processed + - GAO Tek detail coverage improved from `41` to `126` + - `no_url` dropped to `0` + - Ascent Optics static/API crawl: + - `253` catalog products processed + - image coverage `235/305` + - detail coverage `213/305` + - Eoptolink static crawl: + - `76` product-solution pages inspected + - after parser correction, Eoptolink is `287/287` image and detail verified + - NADDOD targeted DB-detail mode: + - first targeted wave `200` pages + - second wave `300` pages + - closure wave `385` pages + - special-case wave `83` pages + - NADDOD moved from `image=12`, `details=157`, `fully=0/1-ish` to: + - total `748` + - price `744` + - image `742` + - details `659` + - competitor `744` + - fully `659` + - no URL `6` + - global TIP counters after this push: + - price verified `11557` + - image verified `11963` + - details verified `11018` + - fully verified `9794` + - total transceivers `17647` + - health: + - TIP stayed `healthy` + - load status `ok` + - memory used about `13%` + - truth: + - NADDOD is not 100% complete; remaining detail gaps include likely non-transceiver switch/NIC products and a smaller set of parser-special cases + - OEM catalogs like Ascent and Eoptolink do not publish retail prices, so full verification cannot be forced honestly without price evidence + - Immediate full TIP equivalence revalidation on 2026-05-09: - operator requested all open TIP validation to be completed immediately and all product matches checked for true 1:1 equivalence - live preflight: 
diff --git a/sync/history/2026-05-09-fscom-16t-variant-correction-and-vendor-verification.md b/sync/history/2026-05-09-fscom-16t-variant-correction-and-vendor-verification.md new file mode 100644 index 0000000..52cd634 --- /dev/null +++ b/sync/history/2026-05-09-fscom-16t-variant-correction-and-vendor-verification.md @@ -0,0 +1,117 @@ +# FS.com 1.6T Variant Correction + Vendor Verification Push + +Date: 2026-05-09 +Actor: Codex + +## Operator Finding + +The operator spotted a concrete source-truth problem on FS.com: + +- `OSFP-DR8-1.6T-FL` is the 500m DR8 variant. +- `OSFP-2FR4-1.6T-FL` is the 2km 2FR4 variant. +- Flexoptix `O.1316T.C.05.M` is the 500m DR8 product. +- The 2km FR4 variant must be present as its own product and must not be collapsed into the 500m match. + +## Live DB Correction + +Corrected FS.com rows: + +- `OSFP-DR8-1.6T-FL` + - `speed=1.6T` + - `speed_gbps=1600` + - `reach_label=500m` + - `reach_meters=500` + - `fiber_type=SMF` + - `wavelengths=1310` + - `standard_name=1.6T OSFP DR8` + - fully verified + +- `OSFP-2FR4-1.6T-FL` + - `speed=1.6T` + - `speed_gbps=1600` + - `reach_label=2km` + - `reach_meters=2000` + - `fiber_type=SMF` + - `wavelengths=1310` + - `standard_name=1.6T OSFP 2FR4` + - fully verified + +Corrected Flexoptix row: + +- `O.1316T.C.05.M` + - confirmed `500m`, `SMF`, `1.6T` + - `standard_name=1.6T OSFP DR8` + +Corrected equivalence: + +- Approved only `O.1316T.C.05.M` ↔ `OSFP-DR8-1.6T-FL`. +- Confidence: `0.913`. +- Basis: form factor, speed, reach, fiber, wavelength and explicit source variant DR8/500m. +- `OSFP-2FR4-1.6T-FL` remains separate and is not linked to the 500m Flexoptix product. + +## Scraper Hardening + +Updated `packages/scraper/src/scrapers/fs-com.ts`: + +- Detects `1,6T`, `1.6T` and `1600G` as `1.6T`/`1600`. +- Converts labels like `2km` to `reach_meters=2000`. +- Updates stale `speed` strings when the numeric source speed matches the row. 
+ +Remote build on Erik passed: + +```text +pnpm -C packages/scraper build +``` + +## Vendor Verification Work In Same Push + +Updated: + +- `packages/scraper/src/scrapers/naddod.ts` +- `packages/scraper/src/scrapers/gaotek.ts` +- `packages/scraper/src/scrapers/ascentoptics.ts` +- `packages/scraper/src/scrapers/eoptolink.ts` + +Live results: + +- GAO Tek: + - details improved from `41` to `126` + - no-url dropped to `0` +- Ascent Optics: + - image `235/305` + - details `213/305` +- Eoptolink: + - image `287/287` + - details `287/287` +- NADDOD: + - total `748` + - price `744` + - image `742` + - details `659` + - competitor `744` + - fully `659` + - no URL `6` + +Global TIP counters after the push: + +- price verified `11557` +- image verified `11963` +- details verified `11018` +- fully verified `9794` +- total transceivers `17647` + +TIP remained healthy: + +- status `healthy` +- load status `ok` +- memory around `13%` + +## Lesson For TIPLLM + +Variant selectors on vendor pages must be treated as separate products when reach, optical protocol, connector or model changes. + +For FS.com 1.6T OSFP: + +- `DR8 500m` and `2FR4 2km` are distinct SKUs and distinct compatibility candidates. +- A Flexoptix 500m DR8 product must not be matched to a 2km FR4 FS.com product. +- Source pages can expose German decimal text (`1,6T`) and separate net/gross prices; normalize carefully.