fix: improve vendor verification and fscom 1.6t variants

This commit is contained in:
Rene Fichtmueller 2026-05-09 15:56:08 +02:00
parent 7da78a999d
commit b26696f0d1
7 changed files with 514 additions and 57 deletions

View File

@ -45,6 +45,7 @@ interface Product {
partNumber: string; partNumber: string;
name: string; name: string;
url: string; url: string;
imageUrl?: string;
formFactor: string; formFactor: string;
speed: string; speed: string;
speedGbps: number; speedGbps: number;
@ -156,11 +157,16 @@ function parseProductTable(
const combined = `${rawPart} ${desc}`; const combined = `${rawPart} ${desc}`;
const reach = detectReach(combined); const reach = detectReach(combined);
const rawImg = $(cells[0]).find("img").first().attr("src") || $(cells[0]).find("img").first().attr("data-src");
const imageUrl = rawImg && !/(logo|placeholder|default|no-image|icon|sprite)/i.test(rawImg)
? (rawImg.startsWith("http") ? rawImg : BASE + rawImg)
: undefined;
products.push({ products.push({
partNumber: rawPart, partNumber: rawPart,
name: desc || rawPart, name: desc || rawPart,
url, url,
imageUrl,
formFactor: cat.formFactor, formFactor: cat.formFactor,
speed: cat.speed, speed: cat.speed,
speedGbps: cat.speedGbps, speedGbps: cat.speedGbps,
@ -211,6 +217,7 @@ export async function scrapeAscentOptics(): Promise<void> {
await findOrCreateScrapedTransceiver({ await findOrCreateScrapedTransceiver({
partNumber: product.partNumber, partNumber: product.partNumber,
vendorId, vendorId,
productUrl: product.url,
formFactor: product.formFactor, formFactor: product.formFactor,
speedGbps: product.speedGbps, speedGbps: product.speedGbps,
speed: product.speed, speed: product.speed,
@ -219,6 +226,7 @@ export async function scrapeAscentOptics(): Promise<void> {
fiberType: product.fiberType, fiberType: product.fiberType,
wavelengths: product.wavelength, wavelengths: product.wavelength,
category: "DataCenter", category: "DataCenter",
imageUrl: product.imageUrl,
}); });
totalProducts++; totalProducts++;
} catch (err) { } catch (err) {

View File

@ -57,6 +57,12 @@ function speedFromSlug(slug: string): { speed: string; speedGbps: number } {
return { speed: "Unknown", speedGbps: 0 }; return { speed: "Unknown", speedGbps: 0 };
} }
/**
 * Resolves the link speed, preferring the page title over the URL slug.
 *
 * The title is run through `speedFromSlug` first; the slug is only consulted
 * when the title produces no positive `speedGbps`.
 *
 * @param title Page title text.
 * @param slug  Slug derived from the page URL.
 * @returns The speed label and its numeric Gbps value.
 */
function speedFromTitleThenSlug(title: string, slug: string): { speed: string; speedGbps: number } {
  const fromTitle = speedFromSlug(title);
  return fromTitle.speedGbps > 0 ? fromTitle : speedFromSlug(slug);
}
function formFactorFromText(text: string): string { function formFactorFromText(text: string): string {
const t = text.toUpperCase(); const t = text.toUpperCase();
if (/\bOSFP\b/.test(t)) return "OSFP"; if (/\bOSFP\b/.test(t)) return "OSFP";
@ -124,10 +130,44 @@ interface EoptolinkProduct {
speedGbps: number; speedGbps: number;
formFactor: string; formFactor: string;
fiberType: string; fiberType: string;
reachLabel?: string;
reachMeters?: number;
wavelength?: string;
imageUrl?: string;
category: string; category: string;
pageUrl: string; pageUrl: string;
} }
/**
 * Infers the optical reach from free text (title, slug, page body).
 *
 * Patterns are tried in order and the first hit wins, so explicit distances
 * ("10 km", "500m") always take precedence over reach codes (LR, DR, ...).
 *
 * @param text Text to search.
 * @returns The reach label and its value in meters, or `undefined` when
 *          nothing in the text indicates a reach.
 */
function reachFromText(text: string): { label: string; meters: number } | undefined {
  const patterns: [RegExp, string, number][] = [
    // Explicit distances, longest first.
    [/\b120\s*km\b/i, "120km", 120000],
    [/\b100\s*km\b/i, "100km", 100000],
    [/\b80\s*km\b/i, "80km", 80000],
    [/\b40\s*km\b/i, "40km", 40000],
    [/\b20\s*km\b/i, "20km", 20000],
    [/\b10\s*km\b/i, "10km", 10000],
    [/\b2\s*km\b/i, "2km", 2000],
    [/\b500\s*m\b/i, "500m", 500],
    [/\b300\s*m\b/i, "300m", 300],
    [/\b100\s*m\b/i, "100m", 100],
    // Reach codes. The optional digits accept lane-count suffixes (LR4, DR8,
    // FR4, ...): a bare `\bLR\b` cannot match "LR4" because there is no word
    // boundary between the letter and the digit.
    [/\bZR\d*\b/i, "80km", 80000],
    [/\bER\d*\b/i, "40km", 40000],
    [/\bLR\d*\b/i, "10km", 10000],
    [/\bFR\d*\b/i, "2km", 2000],
    [/\bDR\d*\b/i, "500m", 500],
    // SR stays exact: multi-lane SRn variants are generally specified for a
    // shorter reach than 300m, so they are not mapped here.
    [/\bSR\b/i, "300m", 300],
  ];

  const hit = patterns.find(([regex]) => regex.test(text));
  return hit ? { label: hit[1], meters: hit[2] } : undefined;
}
/**
 * Extracts the first wavelength mentioned in the text.
 *
 * Looks for three or four digits followed by an optional gap and "nm"
 * (case-insensitive).
 *
 * @param text Text to search.
 * @returns The digits of the first match (without the unit), or an empty
 *          string when no wavelength is present.
 */
function wavelengthFromText(text: string): string {
  const found = /(\d{3,4})\s*nm/i.exec(text);
  if (found === null) return "";
  return found[1];
}
function parseProductPage(html: string, pageUrl: string): EoptolinkProduct | null { function parseProductPage(html: string, pageUrl: string): EoptolinkProduct | null {
// Page title // Page title
const titleMatch = html.match(/<title>([^<]+)/i) || html.match(/<h1[^>]*>([^<]{5,80})</i); const titleMatch = html.match(/<title>([^<]+)/i) || html.match(/<h1[^>]*>([^<]{5,80})</i);
@ -138,13 +178,36 @@ function parseProductPage(html: string, pageUrl: string): EoptolinkProduct | nul
const pnRegex = /E[A-Z]{2,5}-\d{2,3}[A-Z0-9]{1,3}(?:-\d{1,3})?(?:-[A-Z0-9]{1,6})*/g; const pnRegex = /E[A-Z]{2,5}-\d{2,3}[A-Z0-9]{1,3}(?:-\d{1,3})?(?:-[A-Z0-9]{1,6})*/g;
const partNumbers = [...new Set([...(html.matchAll(pnRegex) ?? [])].map((m) => m[0].trim()))]; const partNumbers = [...new Set([...(html.matchAll(pnRegex) ?? [])].map((m) => m[0].trim()))];
const slug = pageUrl.split("/").slice(-2).join("-"); const slug = pageUrl.split("/").filter(Boolean).slice(-2).join("-");
const { speed, speedGbps } = speedFromSlug(slug + " " + pageTitle); const titleEvidence = `${pageTitle} ${slug}`;
const formFactor = formFactorFromText(pageTitle + " " + slug); const pageEvidence = `${titleEvidence} ${html.replace(/<[^>]+>/g, " ").slice(0, 12000)}`;
const fiberType = fiberFromText(pageTitle + " " + slug); const { speed, speedGbps } = speedFromTitleThenSlug(pageTitle, slug);
const category = categoryFromText(pageTitle + " " + slug); const formFactor = formFactorFromText(titleEvidence);
const fiberType = fiberFromText(titleEvidence);
const reach = reachFromText(pageEvidence);
const wavelength = wavelengthFromText(pageEvidence);
const category = categoryFromText(titleEvidence);
const rawImage =
html.match(/<meta\s+property="og:image"\s+content="([^"]+)"/i)?.[1] ||
html.match(/<img[^>]+src="([^"]+)"/i)?.[1];
const imageUrl = rawImage && !/(logo|placeholder|default|no-image|icon|sprite)/i.test(rawImage)
? (rawImage.startsWith("http") ? rawImage : `${BASE}${rawImage.startsWith("/") ? "" : "/"}${rawImage}`)
: undefined;
return { pageTitle, partNumbers, speed, speedGbps, formFactor, fiberType, category, pageUrl }; return {
pageTitle,
partNumbers,
speed,
speedGbps,
formFactor,
fiberType,
reachLabel: reach?.label,
reachMeters: reach?.meters,
wavelength,
imageUrl,
category,
pageUrl,
};
} }
// ── Main ──────────────────────────────────────────────────────────────────── // ── Main ────────────────────────────────────────────────────────────────────
@ -193,15 +256,47 @@ export async function scrapeEoptolink(): Promise<void> {
for (const partNumber of namesToSeed) { for (const partNumber of namesToSeed) {
try { try {
await findOrCreateScrapedTransceiver({ const txId = await findOrCreateScrapedTransceiver({
partNumber: partNumber.slice(0, 80), partNumber: partNumber.slice(0, 80),
vendorId, vendorId,
productUrl: url,
formFactor: product.formFactor, formFactor: product.formFactor,
speedGbps: product.speedGbps, speedGbps: product.speedGbps,
speed: product.speed, speed: product.speed,
reachMeters: product.reachMeters,
reachLabel: product.reachLabel,
fiberType: product.fiberType, fiberType: product.fiberType,
wavelengths: product.wavelength,
category: product.category, category: product.category,
imageUrl: product.imageUrl,
}); });
await pool.query(`
UPDATE transceivers
SET form_factor = $2,
speed_gbps = $3,
speed = $4,
reach_meters = CASE WHEN $5::int IS NOT NULL THEN $5::int ELSE reach_meters END,
reach_label = COALESCE(NULLIF($6::text, ''), reach_label),
fiber_type = COALESCE(NULLIF($7::text, ''), fiber_type),
wavelengths = COALESCE(NULLIF($8::text, ''), wavelengths),
product_page_url = COALESCE(NULLIF($9::text, ''), product_page_url),
updated_at = NOW()
WHERE id = $1
AND vendor_id = $10
`, [
txId,
product.formFactor,
product.speedGbps,
product.speed,
product.reachMeters ?? null,
product.reachLabel ?? null,
product.fiberType || null,
product.wavelength || null,
url,
vendorId,
]);
added++; added++;
} catch (dbErr) { } catch (dbErr) {
// Duplicate or constraint error — expected for re-runs // Duplicate or constraint error — expected for re-runs

View File

@ -226,7 +226,7 @@ function detectFormFactor(text: string): string | undefined {
function detectSpeed(text: string): { speed: string; speedGbps: number } | undefined { function detectSpeed(text: string): { speed: string; speedGbps: number } | undefined {
const patterns: [RegExp, string, number][] = [ const patterns: [RegExp, string, number][] = [
[/1\.6\s*t/i, "1.6T", 1600], [/1[\.,]6\s*t|1600\s*g/i, "1.6T", 1600],
[/800\s*g/i, "800G", 800], [/800\s*g/i, "800G", 800],
[/400\s*g/i, "400G", 400], [/400\s*g/i, "400G", 400],
[/200\s*g/i, "200G", 200], [/200\s*g/i, "200G", 200],
@ -248,6 +248,15 @@ function detectReach(text: string): string | undefined {
return m ? `${m[1].replace(/,/g, "")}${m[2].toLowerCase()}` : undefined; return m ? `${m[1].replace(/,/g, "")}${m[2].toLowerCase()}` : undefined;
} }
/**
 * Converts a reach label such as "500m", "2km" or "10 km" into whole meters.
 *
 * Surrounding whitespace and a gap between number and unit are tolerated, and
 * the unit is matched case-insensitively.
 *
 * @param label Reach label; may be missing.
 * @returns The reach in meters (rounded), or `undefined` when the label is
 *          missing, not in `<number><m|km>` form, or does not describe a
 *          positive whole-meter distance.
 */
function reachMetersFromLabel(label?: string): number | undefined {
  if (!label) return undefined;
  const match = /^(\d+(?:\.\d+)?)\s*(m|km)$/i.exec(label.trim());
  if (!match) return undefined;
  const value = parseFloat(match[1]);
  if (!Number.isFinite(value) || value <= 0) return undefined;
  const meters = Math.round(match[2].toLowerCase() === "km" ? value * 1000 : value);
  // A sub-meter value rounds to 0, which is indistinguishable from "no reach";
  // report it as unknown instead.
  return meters > 0 ? meters : undefined;
}
function detectFiberType(text: string): string | undefined { function detectFiberType(text: string): string | undefined {
if (/active\s+optical|\baoc\b/i.test(text)) return "AOC"; if (/active\s+optical|\baoc\b/i.test(text)) return "AOC";
if (/copper|dac|twinax|direct\s+attach|rj-?45|base-t/i.test(text)) return "Copper"; if (/copper|dac|twinax|direct\s+attach|rj-?45|base-t/i.test(text)) return "Copper";
@ -946,6 +955,8 @@ export async function scrapeFs(): Promise<void> {
const parsed = parseSpecTable(detail.specs); const parsed = parseSpecTable(detail.specs);
const textForInference = `${detail.name} ${detail.partNumber} ${Object.values(detail.specs).join(" ")}`; const textForInference = `${detail.name} ${detail.partNumber} ${Object.values(detail.specs).join(" ")}`;
const fiberType = parsed.fiberType ?? detectFiberType(textForInference); const fiberType = parsed.fiberType ?? detectFiberType(textForInference);
const reachLabel = reach ?? parsed.reachLabel;
const reachMeters = parsed.reachMeters ?? reachMetersFromLabel(reachLabel);
const transceiverId = detail.targetTransceiverId ?? (await findOrCreateScrapedTransceiver({ const transceiverId = detail.targetTransceiverId ?? (await findOrCreateScrapedTransceiver({
partNumber: detail.partNumber, partNumber: detail.partNumber,
@ -954,8 +965,8 @@ export async function scrapeFs(): Promise<void> {
formFactor: ff, formFactor: ff,
speedGbps: speedInfo?.speedGbps, speedGbps: speedInfo?.speedGbps,
speed: speedInfo?.speed, speed: speedInfo?.speed,
reachLabel: reach ?? parsed.reachLabel, reachLabel,
reachMeters: parsed.reachMeters, reachMeters,
fiberType, fiberType,
wavelengths: parsed.wavelengths, wavelengths: parsed.wavelengths,
imageUrl: detail.imageUrl, imageUrl: detail.imageUrl,
@ -968,7 +979,12 @@ export async function scrapeFs(): Promise<void> {
SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2), SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2),
form_factor = COALESCE(NULLIF(form_factor, ''), $3), form_factor = COALESCE(NULLIF(form_factor, ''), $3),
speed_gbps = CASE WHEN speed_gbps IS NULL OR speed_gbps = 0 THEN COALESCE($4, speed_gbps) ELSE speed_gbps END, speed_gbps = CASE WHEN speed_gbps IS NULL OR speed_gbps = 0 THEN COALESCE($4, speed_gbps) ELSE speed_gbps END,
speed = COALESCE(NULLIF(speed, ''), $5), speed = CASE
WHEN $5::text IS NOT NULL
AND (speed IS NULL OR speed = '' OR speed = 'Unknown' OR $4::numeric = speed_gbps)
THEN $5::text
ELSE speed
END,
reach_label = COALESCE(NULLIF(reach_label, ''), $6), reach_label = COALESCE(NULLIF(reach_label, ''), $6),
reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($7, reach_meters) ELSE reach_meters END, reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($7, reach_meters) ELSE reach_meters END,
fiber_type = COALESCE(NULLIF(fiber_type, ''), $8), fiber_type = COALESCE(NULLIF(fiber_type, ''), $8),
@ -981,8 +997,8 @@ export async function scrapeFs(): Promise<void> {
ff, ff,
speedInfo?.speedGbps ?? null, speedInfo?.speedGbps ?? null,
speedInfo?.speed ?? null, speedInfo?.speed ?? null,
reach ?? parsed.reachLabel ?? null, reachLabel ?? null,
parsed.reachMeters ?? null, reachMeters ?? null,
fiberType ?? null, fiberType ?? null,
parsed.wavelengths ?? null, parsed.wavelengths ?? null,
] ]
@ -1038,7 +1054,7 @@ export async function scrapeFs(): Promise<void> {
const hasSourceDetails = const hasSourceDetails =
Object.keys(detail.specs).length > 0 || Object.keys(detail.specs).length > 0 ||
Boolean(fiberType || parsed.connector || parsed.wavelengths || parsed.reachLabel || reach); Boolean(fiberType || parsed.connector || parsed.wavelengths || reachLabel);
if (hasSourceDetails) { if (hasSourceDetails) {
const updated = await updateVerifiedSpecs({ const updated = await updateVerifiedSpecs({
@ -1046,8 +1062,8 @@ export async function scrapeFs(): Promise<void> {
fiberType, fiberType,
connector: parsed.connector, connector: parsed.connector,
wavelengths: parsed.wavelengths, wavelengths: parsed.wavelengths,
reachMeters: parsed.reachMeters, reachMeters,
reachLabel: reach ?? parsed.reachLabel, reachLabel,
powerConsumptionW: parsed.powerConsumptionW, powerConsumptionW: parsed.powerConsumptionW,
tempRange: parsed.tempRange, tempRange: parsed.tempRange,
modulation: parsed.modulation, modulation: parsed.modulation,

View File

@ -31,6 +31,7 @@ interface Product {
reachMeters?: number; reachMeters?: number;
fiberType?: string; fiberType?: string;
wavelength?: string; wavelength?: string;
imageUrl?: string;
} }
function sleep(ms: number): Promise<void> { function sleep(ms: number): Promise<void> {
@ -116,6 +117,13 @@ function parseProductList(html: string): Product[] {
const ff = detectFormFactor(name); const ff = detectFormFactor(name);
const reach = detectReach(name); const reach = detectReach(name);
const rawImg =
$(el).find("img").first().attr("data-src") ||
$(el).find("img").first().attr("data-lazy-src") ||
$(el).find("img").first().attr("src");
const imageUrl = rawImg && !/(logo|placeholder|default|no-image|icon|sprite)/i.test(rawImg)
? (rawImg.startsWith("http") ? rawImg : BASE + rawImg)
: undefined;
products.push({ products.push({
partNumber, partNumber,
@ -127,6 +135,7 @@ function parseProductList(html: string): Product[] {
reachMeters: reach?.meters, reachMeters: reach?.meters,
fiberType: detectFiber(name), fiberType: detectFiber(name),
wavelength: detectWavelength(name), wavelength: detectWavelength(name),
imageUrl,
}); });
}); });
@ -150,11 +159,19 @@ function parseProductList(html: string): Product[] {
} }
const ff = detectFormFactor(name); const ff = detectFormFactor(name);
const reach = detectReach(name); const reach = detectReach(name);
const rawImg =
$(el).find("img").first().attr("data-src") ||
$(el).find("img").first().attr("data-lazy-src") ||
$(el).find("img").first().attr("src");
const imageUrl = rawImg && !/(logo|placeholder|default|no-image|icon|sprite)/i.test(rawImg)
? (rawImg.startsWith("http") ? rawImg : BASE + rawImg)
: undefined;
products.push({ products.push({
partNumber: name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] || name.replace(/\s+/g, "-").slice(0, 60), partNumber: name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] || name.replace(/\s+/g, "-").slice(0, 60),
name, url, price, ...ff, name, url, price, ...ff,
reachLabel: reach?.label, reachMeters: reach?.meters, reachLabel: reach?.label, reachMeters: reach?.meters,
fiberType: detectFiber(name), wavelength: detectWavelength(name), fiberType: detectFiber(name), wavelength: detectWavelength(name),
imageUrl,
}); });
}); });
} }
@ -224,6 +241,7 @@ export async function scrapeGaoTek(): Promise<void> {
const txId = await findOrCreateScrapedTransceiver({ const txId = await findOrCreateScrapedTransceiver({
partNumber: product.partNumber, partNumber: product.partNumber,
vendorId, vendorId,
productUrl: product.url,
formFactor: product.formFactor, formFactor: product.formFactor,
speedGbps: product.speedGbps, speedGbps: product.speedGbps,
speed: product.speed, speed: product.speed,
@ -232,6 +250,7 @@ export async function scrapeGaoTek(): Promise<void> {
fiberType: product.fiberType, fiberType: product.fiberType,
wavelengths: product.wavelength, wavelengths: product.wavelength,
category: "DataCenter", category: "DataCenter",
imageUrl: product.imageUrl,
}); });
if (product.price && product.price > 0) { if (product.price && product.price > 0) {

View File

@ -15,7 +15,15 @@
* *
* Rate limited: 1 req/2sec. * Rate limited: 1 req/2sec.
*/ */
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, upsertStockObservation } from "../utils/db"; import {
pool,
findOrCreateScrapedTransceiver,
ensureVendor,
upsertPriceObservation,
upsertStockObservation,
markImageVerified,
markDetailsVerified,
} from "../utils/db";
import { contentHash } from "../utils/hash"; import { contentHash } from "../utils/hash";
import { readFileSync, writeFileSync, existsSync } from "node:fs"; import { readFileSync, writeFileSync, existsSync } from "node:fs";
import { join } from "node:path"; import { join } from "node:path";
@ -29,7 +37,8 @@ const HEADERS = {
}; };
// Limit detail-page fetches per run to stay reasonable // Limit detail-page fetches per run to stay reasonable
const MAX_DETAIL_PAGES = 600; const MAX_DETAIL_PAGES = Math.max(1, Math.min(1000, parseInt(process.env["NADDOD_MAX_DETAIL_PAGES"] || "600", 10)));
const DB_DETAIL_ONLY = process.env["NADDOD_DB_DETAIL_ONLY"] === "1";
// Cursor file: persists across runs so each run covers the next 600 URLs // Cursor file: persists across runs so each run covers the next 600 URLs
const CURSOR_FILE = join(process.env["TIP_STORAGE_DIR"] ?? "/opt/tip", "naddod-cursor.json"); const CURSOR_FILE = join(process.env["TIP_STORAGE_DIR"] ?? "/opt/tip", "naddod-cursor.json");
@ -88,6 +97,26 @@ function detectSpeedGbps(text: string): { speed: string; speedGbps: number } {
} }
function detectReach(text: string): { label: string; meters: number } | undefined { function detectReach(text: string): { label: string; meters: number } | undefined {
const cableCode = text.match(/\b(?:AOC|DAC|CU|COPPER|MCP|MFS)[A-Z0-9._-]*?(\d+(?:\.\d+)?)M\b/i);
if (cableCode) {
const value = parseFloat(cableCode[1]);
if (Number.isFinite(value) && value > 0 && value <= 500) {
return { label: `${String(value).replace(/\.0$/, "")}m`, meters: Math.round(value) };
}
}
const generic = text.match(/\b(\d+(?:\.\d+)?)\s*(km|m)\b/i);
if (generic) {
const value = parseFloat(generic[1]);
const unit = generic[2].toLowerCase();
if (Number.isFinite(value) && value > 0) {
const meters = unit === "km" ? Math.round(value * 1000) : Math.round(value);
const labelValue = String(value).replace(/\.0$/, "");
const label = unit === "km" ? `${labelValue}km` : `${labelValue}m`;
return { label, meters };
}
}
const patterns: [RegExp, string, number][] = [ const patterns: [RegExp, string, number][] = [
[/\b120\s*km\b/i, "120km", 120000], [/\b120\s*km\b/i, "120km", 120000],
[/\b80\s*km\b/i, "80km", 80000], [/\b80\s*km\b/i, "80km", 80000],
@ -102,8 +131,9 @@ function detectReach(text: string): { label: string; meters: number } | undefine
[/\b150\s*m\b/i, "150m", 150], [/\b150\s*m\b/i, "150m", 150],
[/\b100\s*m\b/i, "100m", 100], [/\b100\s*m\b/i, "100m", 100],
[/\bLR4\b/, "10km", 10000], [/\bLR\b/, "10km", 10000], [/\bLR4\b/, "10km", 10000], [/\bLR\b/, "10km", 10000],
[/\bER4?\b/, "40km", 40000], [/\bZR4?\b/, "80km", 80000], [/\bER4?\b/, "40km", 40000], [/\bZRP?\b|\bZR4?\b/, "80km", 80000],
[/\bSR4?\b/, "300m", 300], [/\bDR4?\b/, "500m", 500], [/\bSR\d*\b|\bVR\d*\b/, "100m", 100],
[/\bDR4?\b|\bXDR\d*\b/, "500m", 500],
[/\bFR4?\b/, "2km", 2000], [/\bFR4?\b/, "2km", 2000],
]; ];
for (const [re, label, meters] of patterns) { for (const [re, label, meters] of patterns) {
@ -113,9 +143,10 @@ function detectReach(text: string): { label: string; meters: number } | undefine
} }
function detectFiber(text: string): string { function detectFiber(text: string): string {
if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text)) return "SMF";
if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(text)) return "MMF";
if (/copper|dac|twinax|rj.?45|base-t/i.test(text)) return "Copper"; if (/copper|dac|twinax|rj.?45|base-t/i.test(text)) return "Copper";
if (/aoc|active.?optical|active.?fiber|[^a-z]sr\d*[^a-z]?|[^a-z]vr\d*[^a-z]?|850\s*nm/i.test(text)) return "MMF";
if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]?|[^a-z]dr\d*[^a-z]?|[^a-z]fr\d*[^a-z]?|xdr\d*|psm|bidi|cwdm|dwdm|1310\s*nm|1550\s*nm/i.test(text)) return "SMF";
if (/multi.?mode|mmf|[^a-z]sx[^a-z]/i.test(text)) return "MMF";
return ""; return "";
} }
@ -175,6 +206,7 @@ async function fetchText(url: string): Promise<string> {
*/ */
function parseDetailPage(html: string, url: string): { function parseDetailPage(html: string, url: string): {
name: string; name: string;
imageUrl?: string;
price?: number; price?: number;
stock: { qty?: number; confidence: 1 | 2 } | null; stock: { qty?: number; confidence: 1 | 2 } | null;
} | null { } | null {
@ -187,6 +219,20 @@ function parseDetailPage(html: string, url: string): {
if (!name || name.length < 10) return null; if (!name || name.length < 10) return null;
if (!isTransceiver(name)) return null; if (!isTransceiver(name)) return null;
const imageUrl = (() => {
const candidates = [
html.match(/<meta\s+property="og:image"\s+content="([^"]+)"/i)?.[1],
html.match(/<meta\s+name="twitter:image"\s+content="([^"]+)"/i)?.[1],
html.match(/"image"\s*:\s*"([^"]+)"/i)?.[1],
].filter(Boolean) as string[];
const img = candidates.find((candidate) =>
!/(logo|placeholder|default|no-image|icon|sprite)/i.test(candidate)
);
if (!img) return undefined;
return img.startsWith("http") ? img : `${BASE}${img.startsWith("/") ? "" : "/"}${img}`;
})();
// Price: prefer LD+JSON offers.price (Astro/Shopify structure, e.g. "price":"731.00") // Price: prefer LD+JSON offers.price (Astro/Shopify structure, e.g. "price":"731.00")
// Fall back to "US$ 10.90" or "$10.90" visible text patterns // Fall back to "US$ 10.90" or "$10.90" visible text patterns
let price: number | undefined; let price: number | undefined;
@ -220,7 +266,7 @@ function parseDetailPage(html: string, url: string): {
// Stock count // Stock count
const stock = parseStockText(html); const stock = parseStockText(html);
return { name, price, stock }; return { name, imageUrl, price, stock };
} }
// ── Sitemap parsing ───────────────────────────────────────────────────────── // ── Sitemap parsing ─────────────────────────────────────────────────────────
@ -247,6 +293,25 @@ async function fetchProductUrlsFromSitemap(): Promise<string[]> {
return [...new Set(urls)]; // deduplicate return [...new Set(urls)]; // deduplicate
} }
/**
 * Loads NADDOD transceiver rows that still need a detail-page visit.
 *
 * The query selects rows of the vendor named 'NADDOD' that have a non-empty
 * `product_page_url` and whose `details_verified` or `image_verified` flag is
 * false, ordered by `details_verified`, `image_verified` and `updated_at`
 * (all ascending) and capped at `limit` rows.
 *
 * @param limit Maximum number of rows to return.
 * @returns One entry per row: the stored product page URL and the row id.
 */
async function fetchDbTargets(limit: number): Promise<Array<{ url: string; targetTransceiverId: string }>> {
  const { rows } = await pool.query<{ id: string; product_page_url: string }>(`
SELECT t.id, t.product_page_url
FROM transceivers t
JOIN vendors v ON v.id = t.vendor_id
WHERE v.name = 'NADDOD'
AND t.product_page_url IS NOT NULL
AND t.product_page_url != ''
AND (t.details_verified = false OR t.image_verified = false)
ORDER BY t.details_verified ASC, t.image_verified ASC, t.updated_at ASC
LIMIT $1
`, [limit]);

  return rows.map(({ id, product_page_url }) => ({
    url: product_page_url,
    targetTransceiverId: id,
  }));
}
// ── Main scraper ──────────────────────────────────────────────────────────── // ── Main scraper ────────────────────────────────────────────────────────────
export async function scrapeNaddod(): Promise<void> { export async function scrapeNaddod(): Promise<void> {
@ -262,11 +327,18 @@ export async function scrapeNaddod(): Promise<void> {
// ── Phase 1: Discover product URLs via sitemap ──────────────────────────── // ── Phase 1: Discover product URLs via sitemap ────────────────────────────
console.log("[Phase 1] Discovering products from sitemap..."); console.log("[Phase 1] Discovering products from sitemap...");
let productUrls: string[] = []; let productUrls: string[] = [];
let targets: Array<{ url: string; targetTransceiverId?: string }> = [];
try { try {
productUrls = await fetchProductUrlsFromSitemap(); if (DB_DETAIL_ONLY) {
console.log(` Found ${productUrls.length} product URLs in sitemap`); targets = await fetchDbTargets(MAX_DETAIL_PAGES);
productUrls = targets.map((target) => target.url);
console.log(` DB detail targets: ${productUrls.length}`);
} else {
productUrls = await fetchProductUrlsFromSitemap();
console.log(` Found ${productUrls.length} product URLs in sitemap`);
}
} catch (err) { } catch (err) {
console.error(` Sitemap fetch failed: ${(err as Error).message}`); console.error(` Target discovery failed: ${(err as Error).message}`);
return; return;
} }
@ -278,16 +350,16 @@ export async function scrapeNaddod(): Promise<void> {
// Cursor-based rotation: each run advances by MAX_DETAIL_PAGES so over ~12 runs // Cursor-based rotation: each run advances by MAX_DETAIL_PAGES so over ~12 runs
// (24 hours) we cover all ~7300 products. Wraps around when exhausted. // (24 hours) we cover all ~7300 products. Wraps around when exhausted.
const totalUrls = productUrls.length; const totalUrls = productUrls.length;
const offset = readCursor() % totalUrls; const offset = DB_DETAIL_ONLY ? 0 : readCursor() % totalUrls;
const endIdx = Math.min(offset + MAX_DETAIL_PAGES, totalUrls); const endIdx = Math.min(offset + MAX_DETAIL_PAGES, totalUrls);
let urls = productUrls.slice(offset, endIdx); let batchTargets: Array<{ url: string; targetTransceiverId?: string }> =
// Wrap around if we got fewer than MAX_DETAIL_PAGES (hit the end of the list) DB_DETAIL_ONLY ? targets : productUrls.slice(offset, endIdx).map((url) => ({ url }));
if (urls.length < MAX_DETAIL_PAGES && totalUrls > MAX_DETAIL_PAGES) { if (!DB_DETAIL_ONLY && batchTargets.length < MAX_DETAIL_PAGES && totalUrls > MAX_DETAIL_PAGES) {
const wrap = MAX_DETAIL_PAGES - urls.length; const wrap = MAX_DETAIL_PAGES - batchTargets.length;
urls = urls.concat(productUrls.slice(0, wrap)); batchTargets = batchTargets.concat(productUrls.slice(0, wrap).map((url) => ({ url })));
} }
const nextOffset = (offset + MAX_DETAIL_PAGES) % totalUrls; const nextOffset = DB_DETAIL_ONLY ? offset : (offset + MAX_DETAIL_PAGES) % totalUrls;
console.log(` Offset: ${offset}/${totalUrls} → processing ${urls.length} products (next run starts at ${nextOffset})`); console.log(` Offset: ${offset}/${totalUrls} → processing ${batchTargets.length} products (next run starts at ${nextOffset})`);
// ── Phase 2: Fetch detail pages + write to DB ───────────────────────────── // ── Phase 2: Fetch detail pages + write to DB ─────────────────────────────
console.log("\n[Phase 2] Fetching product detail pages..."); console.log("\n[Phase 2] Fetching product detail pages...");
@ -299,7 +371,8 @@ export async function scrapeNaddod(): Promise<void> {
let skippedNonTx = 0; let skippedNonTx = 0;
let errors = 0; let errors = 0;
for (const url of urls) { for (const target of batchTargets) {
const url = target.url;
await sleep(2000); await sleep(2000);
try { try {
const html = await fetchText(url); const html = await fetchText(url);
@ -310,28 +383,64 @@ export async function scrapeNaddod(): Promise<void> {
continue; continue;
} }
const { name, price, stock } = detail; const { name, imageUrl, price, stock } = detail;
const { speed, speedGbps } = detectSpeedGbps(name); const evidenceText = `${name} ${html.replace(/<[^>]+>/g, " ").slice(0, 20000)}`;
const formFactor = detectFormFactor(name); const { speed, speedGbps } = detectSpeedGbps(evidenceText);
const reach = detectReach(name); const formFactor = detectFormFactor(evidenceText);
const fiberType = detectFiber(name); const reach = detectReach(evidenceText);
const wavelength = detectWavelength(name); const fiberType = detectFiber(evidenceText);
const wavelength = detectWavelength(evidenceText);
// Extract part number from name (first word-group before "Compatible" or vendor name) // Extract part number from name (first word-group before "Compatible" or vendor name)
const partNumber = name.split(/\s+(?:compatible|for\s+[A-Z]|sfp|qsfp)/i)[0]?.trim().slice(0, 80) || name.slice(0, 60); const partNumber = name.split(/\s+(?:compatible|for\s+[A-Z]|sfp|qsfp)/i)[0]?.trim().slice(0, 80) || name.slice(0, 60);
const txId = await findOrCreateScrapedTransceiver({ let txId = target.targetTransceiverId;
partNumber,
vendorId, if (txId) {
formFactor, await pool.query(`
speedGbps, UPDATE transceivers
speed, SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2),
reachMeters: reach?.meters, form_factor = COALESCE(NULLIF($3::text, ''), form_factor),
reachLabel: reach?.label, speed_gbps = CASE WHEN $4::numeric > 0 THEN $4::numeric ELSE speed_gbps END,
fiberType, speed = CASE WHEN $4::numeric > 0 THEN $5 ELSE speed END,
wavelengths: wavelength, reach_meters = CASE WHEN $6::int IS NOT NULL AND $6::int > 0 THEN $6::int ELSE reach_meters END,
category: "DataCenter", reach_label = COALESCE(NULLIF($7::text, ''), reach_label),
}); fiber_type = COALESCE(NULLIF($8::text, ''), fiber_type),
wavelengths = COALESCE(NULLIF($9::text, ''), wavelengths),
category = COALESCE(NULLIF(category, ''), 'DataCenter'),
updated_at = NOW()
WHERE id = $1
AND vendor_id = $10
`, [
txId,
url,
formFactor,
speedGbps,
speed,
reach?.meters ?? null,
reach?.label ?? null,
fiberType || null,
wavelength || null,
vendorId,
]);
if (imageUrl) await markImageVerified(txId, imageUrl);
await markDetailsVerified({ transceiverId: txId, sourceUrl: url });
} else {
txId = await findOrCreateScrapedTransceiver({
partNumber,
vendorId,
productUrl: url,
formFactor,
speedGbps,
speed,
reachMeters: reach?.meters,
reachLabel: reach?.label,
fiberType,
wavelengths: wavelength,
category: "DataCenter",
imageUrl,
});
}
// Price observation // Price observation
if (price && price > 0) { if (price && price > 0) {
@ -368,7 +477,7 @@ export async function scrapeNaddod(): Promise<void> {
processed++; processed++;
if (processed % 50 === 0) { if (processed % 50 === 0) {
console.log(` Progress: ${processed}/${urls.length} | prices: ${priceUpdates} | stock: ${stockWritten} new / ${stockSkipped} unchanged`); console.log(` Progress: ${processed}/${batchTargets.length} | prices: ${priceUpdates} | stock: ${stockWritten} new / ${stockSkipped} unchanged`);
} }
} catch (err) { } catch (err) {
errors++; errors++;
@ -377,10 +486,10 @@ export async function scrapeNaddod(): Promise<void> {
} }
// Advance cursor for next run // Advance cursor for next run
writeCursor(nextOffset); if (!DB_DETAIL_ONLY) writeCursor(nextOffset);
console.log("\n=== NADDOD Scraper v2 Complete ==="); console.log("\n=== NADDOD Scraper v2 Complete ===");
console.log(` URL range processed: ${offset}${offset + urls.length - 1} of ${totalUrls}`); console.log(` URL range processed: ${offset}${offset + batchTargets.length - 1} of ${totalUrls}`);
console.log(` Products processed: ${processed}`); console.log(` Products processed: ${processed}`);
console.log(` Non-transceivers skip: ${skippedNonTx}`); console.log(` Non-transceivers skip: ${skippedNonTx}`);
console.log(` Price observations: ${priceUpdates} new`); console.log(` Price observations: ${priceUpdates} new`);

View File

@ -1,9 +1,102 @@
# Current TIP Sync State # Current TIP Sync State
Updated: 2026-05-09 12:16 UTC Updated: 2026-05-09 13:54 UTC
## Newest Work ## Newest Work
- FS.com 1.6T DR8/2FR4 source correction on 2026-05-09:
- operator spotted that FS.com has two distinct 1.6T OSFP variants in the same product family:
- `OSFP-DR8-1.6T-FL`: 500m, DR8, SMF
- `OSFP-2FR4-1.6T-FL`: 2km, 2FR4, SMF
- confirmed in TIP DB:
- both FS.com variants exist as separate rows
- `OSFP-2FR4-1.6T-FL` had `reach_meters=0` even though the source and row label said `2km`
- `OSFP-DR8-1.6T-FL` had no wavelength, causing the deterministic equivalence worker to reject the otherwise correct 500m Flexoptix match
- live DB correction:
- `OSFP-DR8-1.6T-FL`
- `speed=1.6T`
- `speed_gbps=1600`
- `reach_label=500m`
- `reach_meters=500`
- `fiber_type=SMF`
- `wavelengths=1310`
- `standard_name=1.6T OSFP DR8`
- fully verified remains true
- `OSFP-2FR4-1.6T-FL`
- `speed=1.6T`
- `speed_gbps=1600`
- `reach_label=2km`
- `reach_meters=2000`
- `fiber_type=SMF`
- `wavelengths=1310`
- `standard_name=1.6T OSFP 2FR4`
- fully verified true
- Flexoptix `O.1316T.C.05.M`
- confirmed as `500m`, `SMF`, `1.6T`
- `standard_name=1.6T OSFP DR8`
- equivalence correction:
- approved only `O.1316T.C.05.M` ↔ `OSFP-DR8-1.6T-FL`
- confidence `0.913`
- match basis: form factor, speed, reach, fiber, wavelength and source variant DR8/500m
- `OSFP-2FR4-1.6T-FL` remains separate and is not linked to the 500m DR8 Flexoptix product
- scraper hardening:
- `packages/scraper/src/scrapers/fs-com.ts`
- recognizes German/decimal `1,6T` and `1600G` as `1.6T`/`1600`
- converts reach labels such as `2km` into `reach_meters=2000`
- updates stale `speed` labels when the numeric source speed matches the row
- build:
- `pnpm -C packages/scraper build` passed on Erik
- truth:
- there are definitely two separate FS.com variants
- 500m DR8 is the correct equivalent for Flexoptix `O.1316T.C.05.M`
- 2km 2FR4 is a separate DB product and must not be collapsed into the 500m match
- Targeted vendor verification push after equivalence revalidation on 2026-05-09:
- code improved:
- `NADDOD_DB_DETAIL_ONLY=1` mode verifies existing NADDOD rows with source URLs instead of rotating blindly through the full sitemap
- NADDOD now extracts `og:image`, source product URLs, reach/fiber/wavelength from page evidence, AOC/DAC cable lengths, and DR/FR/SR/VR/XDR patterns
- GAO Tek now writes product URLs and image evidence
- Ascent Optics now writes product URLs and table image evidence
- Eoptolink now writes product URLs, images, reach/wavelength evidence and corrects over-broad form-factor parsing by preferring title/slug evidence
- live low-load Erik runs:
- GAO Tek static crawl:
- `473` unique products processed
- GAO Tek detail coverage improved from `41` to `126`
- `no_url` dropped to `0`
- Ascent Optics static/API crawl:
- `253` catalog products processed
- image coverage `235/305`
- detail coverage `213/305`
- Eoptolink static crawl:
- `76` product-solution pages inspected
- after parser correction, Eoptolink is `287/287` image and detail verified
- NADDOD targeted DB-detail mode:
- first targeted wave `200` pages
- second wave `300` pages
- closure wave `385` pages
- special-case wave `83` pages
- NADDOD moved from `image=12`, `details=157` and roughly `fully=0–1` to:
- total `748`
- price `744`
- image `742`
- details `659`
- competitor `744`
- fully `659`
- no URL `6`
- global TIP counters after this push:
- price verified `11557`
- image verified `11963`
- details verified `11018`
- fully verified `9794`
- total transceivers `17647`
- health:
- TIP stayed `healthy`
- load status `ok`
- memory used about `13%`
- truth:
- NADDOD is not 100% complete; the remaining detail gaps likely include non-transceiver switch/NIC products and a smaller set of parser special cases
- OEM catalogs like Ascent and Eoptolink do not publish retail prices, so their rows cannot honestly be marked fully verified without price evidence
- Immediate full TIP equivalence revalidation on 2026-05-09: - Immediate full TIP equivalence revalidation on 2026-05-09:
- operator requested all open TIP validation to be completed immediately and all product matches checked for true 1:1 equivalence - operator requested all open TIP validation to be completed immediately and all product matches checked for true 1:1 equivalence
- live preflight: - live preflight:

View File

@ -0,0 +1,117 @@
# FS.com 1.6T Variant Correction + Vendor Verification Push
Date: 2026-05-09
Actor: Codex
## Operator Finding
The operator spotted a concrete source-truth problem on FS.com:
- `OSFP-DR8-1.6T-FL` is the 500m DR8 variant.
- `OSFP-2FR4-1.6T-FL` is the 2km 2FR4 variant.
- Flexoptix `O.1316T.C.05.M` is the 500m DR8 product.
- The 2km 2FR4 variant must be present as its own product and must not be collapsed into the 500m match.
## Live DB Correction
Corrected FS.com rows:
- `OSFP-DR8-1.6T-FL`
- `speed=1.6T`
- `speed_gbps=1600`
- `reach_label=500m`
- `reach_meters=500`
- `fiber_type=SMF`
- `wavelengths=1310`
- `standard_name=1.6T OSFP DR8`
- fully verified
- `OSFP-2FR4-1.6T-FL`
- `speed=1.6T`
- `speed_gbps=1600`
- `reach_label=2km`
- `reach_meters=2000`
- `fiber_type=SMF`
- `wavelengths=1310`
- `standard_name=1.6T OSFP 2FR4`
- fully verified
Corrected Flexoptix row:
- `O.1316T.C.05.M`
- confirmed `500m`, `SMF`, `1.6T`
- `standard_name=1.6T OSFP DR8`
Corrected equivalence:
- Approved only `O.1316T.C.05.M` ↔ `OSFP-DR8-1.6T-FL`.
- Confidence: `0.913`.
- Basis: form factor, speed, reach, fiber, wavelength and explicit source variant DR8/500m.
- `OSFP-2FR4-1.6T-FL` remains separate and is not linked to the 500m Flexoptix product.
## Scraper Hardening
Updated `packages/scraper/src/scrapers/fs-com.ts`:
- Detects `1,6T`, `1.6T` and `1600G` as `1.6T`/`1600`.
- Converts labels like `2km` to `reach_meters=2000`.
- Updates stale `speed` strings when the numeric source speed matches the row.
Remote build on Erik passed:
```text
pnpm -C packages/scraper build
```
## Vendor Verification Work In Same Push
Updated:
- `packages/scraper/src/scrapers/naddod.ts`
- `packages/scraper/src/scrapers/gaotek.ts`
- `packages/scraper/src/scrapers/ascentoptics.ts`
- `packages/scraper/src/scrapers/eoptolink.ts`
Live results:
- GAO Tek:
- details improved from `41` to `126`
- no-url dropped to `0`
- Ascent Optics:
- image `235/305`
- details `213/305`
- Eoptolink:
- image `287/287`
- details `287/287`
- NADDOD:
- total `748`
- price `744`
- image `742`
- details `659`
- competitor `744`
- fully `659`
- no URL `6`
Global TIP counters after the push:
- price verified `11557`
- image verified `11963`
- details verified `11018`
- fully verified `9794`
- total transceivers `17647`
TIP remained healthy:
- status `healthy`
- load status `ok`
- memory around `13%`
## Lesson For TIPLLM
Each option behind a variant selector on a vendor page must be treated as a separate product when the reach, optical protocol, connector or model changes.
For FS.com 1.6T OSFP:
- `DR8 500m` and `2FR4 2km` are distinct SKUs and distinct compatibility candidates.
- A Flexoptix 500m DR8 product must not be matched to a 2km 2FR4 FS.com product.
- Source pages can expose German decimal text (`1,6T`) and separate net/gross prices; normalize carefully.