fix: improve vendor verification and fscom 1.6t variants

This commit is contained in:
Rene Fichtmueller 2026-05-09 15:56:08 +02:00
parent 7da78a999d
commit b26696f0d1
7 changed files with 514 additions and 57 deletions

View File

@ -45,6 +45,7 @@ interface Product {
partNumber: string;
name: string;
url: string;
imageUrl?: string;
formFactor: string;
speed: string;
speedGbps: number;
@ -156,11 +157,16 @@ function parseProductTable(
const combined = `${rawPart} ${desc}`;
const reach = detectReach(combined);
const rawImg = $(cells[0]).find("img").first().attr("src") || $(cells[0]).find("img").first().attr("data-src");
const imageUrl = rawImg && !/(logo|placeholder|default|no-image|icon|sprite)/i.test(rawImg)
? (rawImg.startsWith("http") ? rawImg : BASE + rawImg)
: undefined;
products.push({
partNumber: rawPart,
name: desc || rawPart,
url,
imageUrl,
formFactor: cat.formFactor,
speed: cat.speed,
speedGbps: cat.speedGbps,
@ -211,6 +217,7 @@ export async function scrapeAscentOptics(): Promise<void> {
await findOrCreateScrapedTransceiver({
partNumber: product.partNumber,
vendorId,
productUrl: product.url,
formFactor: product.formFactor,
speedGbps: product.speedGbps,
speed: product.speed,
@ -219,6 +226,7 @@ export async function scrapeAscentOptics(): Promise<void> {
fiberType: product.fiberType,
wavelengths: product.wavelength,
category: "DataCenter",
imageUrl: product.imageUrl,
});
totalProducts++;
} catch (err) {

View File

@ -57,6 +57,12 @@ function speedFromSlug(slug: string): { speed: string; speedGbps: number } {
return { speed: "Unknown", speedGbps: 0 };
}
/**
 * Resolves the link speed, trusting the page title over the URL slug.
 *
 * @param title Page title, parsed first.
 * @param slug URL slug, parsed only when the title yields no positive `speedGbps`.
 * @returns The title's result when its `speedGbps` is greater than 0, otherwise
 *          whatever `speedFromSlug` returns for the slug.
 */
function speedFromTitleThenSlug(title: string, slug: string): { speed: string; speedGbps: number } {
  const fromTitle = speedFromSlug(title);
  return fromTitle.speedGbps > 0 ? fromTitle : speedFromSlug(slug);
}
function formFactorFromText(text: string): string {
const t = text.toUpperCase();
if (/\bOSFP\b/.test(t)) return "OSFP";
@ -124,10 +130,44 @@ interface EoptolinkProduct {
speedGbps: number;
formFactor: string;
fiberType: string;
reachLabel?: string;
reachMeters?: number;
wavelength?: string;
imageUrl?: string;
category: string;
pageUrl: string;
}
/**
 * Infers the nominal reach of a transceiver from free text.
 *
 * Explicit distances ("10km", "500 m") are checked first. If none is present the
 * reach is derived from the optic code (ZR/ER/LR/FR/DR/SR). The first matching
 * table entry wins, so the order of the table is significant.
 *
 * Optic codes may carry a lane-count suffix and a single leading multiplier digit
 * ("LR4", "DR8", "2FR4"). A bare `\bFR\b` pattern never matches those because the
 * adjacent digits are word characters, so the suffix/prefix is allowed explicitly.
 *
 * @param text Title, slug or page text to inspect (matching is case-insensitive).
 * @returns The reach label with its length in metres, or `undefined` when no
 *          pattern matches.
 */
function reachFromText(text: string): { label: string; meters: number } | undefined {
  const patterns: [RegExp, string, number][] = [
    // Explicit distances.
    [/\b120\s*km\b/i, "120km", 120000],
    [/\b100\s*km\b/i, "100km", 100000],
    [/\b80\s*km\b/i, "80km", 80000],
    [/\b40\s*km\b/i, "40km", 40000],
    [/\b20\s*km\b/i, "20km", 20000],
    [/\b10\s*km\b/i, "10km", 10000],
    [/\b2\s*km\b/i, "2km", 2000],
    [/\b500\s*m\b/i, "500m", 500],
    [/\b300\s*m\b/i, "300m", 300],
    [/\b100\s*m\b/i, "100m", 100],
    // Optic codes, with optional multiplier prefix and lane-count suffix.
    [/\b\d?ZR\d*\b/i, "80km", 80000],
    [/\b\d?ER\d*\b/i, "40km", 40000],
    [/\b\d?LR\d*\b/i, "10km", 10000],
    [/\b\d?FR\d*\b/i, "2km", 2000],
    [/\b\d?DR\d*\b/i, "500m", 500],
    // Lane-suffixed SR variants (SR4, SR8, ...) are mapped to 100m; plain SR keeps 300m.
    [/\b\d?SR\d+\b/i, "100m", 100],
    [/\bSR\b/i, "300m", 300],
  ];
  for (const [regex, label, meters] of patterns) {
    if (regex.test(text)) return { label, meters };
  }
  return undefined;
}
/**
 * Extracts the first wavelength mentioned in the text.
 *
 * @param text Text to search for a 3- or 4-digit number followed by "nm"
 *             (optional whitespace in between, case-insensitive).
 * @returns The digits of the first such number, or "" when none is found.
 */
function wavelengthFromText(text: string): string {
  const found = /(\d{3,4})\s*nm/i.exec(text);
  if (found === null) {
    return "";
  }
  return found[1];
}
function parseProductPage(html: string, pageUrl: string): EoptolinkProduct | null {
// Page title
const titleMatch = html.match(/<title>([^<]+)/i) || html.match(/<h1[^>]*>([^<]{5,80})</i);
@ -138,13 +178,36 @@ function parseProductPage(html: string, pageUrl: string): EoptolinkProduct | nul
const pnRegex = /E[A-Z]{2,5}-\d{2,3}[A-Z0-9]{1,3}(?:-\d{1,3})?(?:-[A-Z0-9]{1,6})*/g;
const partNumbers = [...new Set([...(html.matchAll(pnRegex) ?? [])].map((m) => m[0].trim()))];
const slug = pageUrl.split("/").slice(-2).join("-");
const { speed, speedGbps } = speedFromSlug(slug + " " + pageTitle);
const formFactor = formFactorFromText(pageTitle + " " + slug);
const fiberType = fiberFromText(pageTitle + " " + slug);
const category = categoryFromText(pageTitle + " " + slug);
const slug = pageUrl.split("/").filter(Boolean).slice(-2).join("-");
const titleEvidence = `${pageTitle} ${slug}`;
const pageEvidence = `${titleEvidence} ${html.replace(/<[^>]+>/g, " ").slice(0, 12000)}`;
const { speed, speedGbps } = speedFromTitleThenSlug(pageTitle, slug);
const formFactor = formFactorFromText(titleEvidence);
const fiberType = fiberFromText(titleEvidence);
const reach = reachFromText(pageEvidence);
const wavelength = wavelengthFromText(pageEvidence);
const category = categoryFromText(titleEvidence);
const rawImage =
html.match(/<meta\s+property="og:image"\s+content="([^"]+)"/i)?.[1] ||
html.match(/<img[^>]+src="([^"]+)"/i)?.[1];
const imageUrl = rawImage && !/(logo|placeholder|default|no-image|icon|sprite)/i.test(rawImage)
? (rawImage.startsWith("http") ? rawImage : `${BASE}${rawImage.startsWith("/") ? "" : "/"}${rawImage}`)
: undefined;
return { pageTitle, partNumbers, speed, speedGbps, formFactor, fiberType, category, pageUrl };
return {
pageTitle,
partNumbers,
speed,
speedGbps,
formFactor,
fiberType,
reachLabel: reach?.label,
reachMeters: reach?.meters,
wavelength,
imageUrl,
category,
pageUrl,
};
}
// ── Main ────────────────────────────────────────────────────────────────────
@ -193,15 +256,47 @@ export async function scrapeEoptolink(): Promise<void> {
for (const partNumber of namesToSeed) {
try {
await findOrCreateScrapedTransceiver({
const txId = await findOrCreateScrapedTransceiver({
partNumber: partNumber.slice(0, 80),
vendorId,
productUrl: url,
formFactor: product.formFactor,
speedGbps: product.speedGbps,
speed: product.speed,
reachMeters: product.reachMeters,
reachLabel: product.reachLabel,
fiberType: product.fiberType,
wavelengths: product.wavelength,
category: product.category,
imageUrl: product.imageUrl,
});
await pool.query(`
UPDATE transceivers
SET form_factor = $2,
speed_gbps = $3,
speed = $4,
reach_meters = CASE WHEN $5::int IS NOT NULL THEN $5::int ELSE reach_meters END,
reach_label = COALESCE(NULLIF($6::text, ''), reach_label),
fiber_type = COALESCE(NULLIF($7::text, ''), fiber_type),
wavelengths = COALESCE(NULLIF($8::text, ''), wavelengths),
product_page_url = COALESCE(NULLIF($9::text, ''), product_page_url),
updated_at = NOW()
WHERE id = $1
AND vendor_id = $10
`, [
txId,
product.formFactor,
product.speedGbps,
product.speed,
product.reachMeters ?? null,
product.reachLabel ?? null,
product.fiberType || null,
product.wavelength || null,
url,
vendorId,
]);
added++;
} catch (dbErr) {
// Duplicate or constraint error — expected for re-runs

View File

@ -226,7 +226,7 @@ function detectFormFactor(text: string): string | undefined {
function detectSpeed(text: string): { speed: string; speedGbps: number } | undefined {
const patterns: [RegExp, string, number][] = [
[/1\.6\s*t/i, "1.6T", 1600],
[/1[\.,]6\s*t|1600\s*g/i, "1.6T", 1600],
[/800\s*g/i, "800G", 800],
[/400\s*g/i, "400G", 400],
[/200\s*g/i, "200G", 200],
@ -248,6 +248,15 @@ function detectReach(text: string): string | undefined {
return m ? `${m[1].replace(/,/g, "")}${m[2].toLowerCase()}` : undefined;
}
/**
 * Converts a reach label such as "500m", "2km" or "1.5 km" into whole metres.
 *
 * Leading/trailing whitespace and whitespace between the number and the unit are
 * tolerated, so a label copied verbatim from a spec table ("2 km") still yields a
 * value. Only a dot is accepted as decimal separator; anything else (including
 * comma decimals) is rejected rather than guessed at.
 *
 * @param label Reach label; may be undefined or empty.
 * @returns The reach rounded to whole metres, or `undefined` when the label is
 *          missing, is not of the form `<number><m|km>`, or does not describe a
 *          positive reach (including values that round down to 0 metres).
 */
function reachMetersFromLabel(label?: string): number | undefined {
  if (!label) return undefined;
  const match = /^(\d+(?:\.\d+)?)\s*(km|m)$/i.exec(label.trim());
  if (!match) return undefined;
  const value = Number.parseFloat(match[1]);
  if (!Number.isFinite(value) || value <= 0) return undefined;
  const meters = match[2].toLowerCase() === "km" ? Math.round(value * 1000) : Math.round(value);
  // A result of 0 would contradict the positive-value guard above, so report "unknown" instead.
  return meters > 0 ? meters : undefined;
}
function detectFiberType(text: string): string | undefined {
if (/active\s+optical|\baoc\b/i.test(text)) return "AOC";
if (/copper|dac|twinax|direct\s+attach|rj-?45|base-t/i.test(text)) return "Copper";
@ -946,6 +955,8 @@ export async function scrapeFs(): Promise<void> {
const parsed = parseSpecTable(detail.specs);
const textForInference = `${detail.name} ${detail.partNumber} ${Object.values(detail.specs).join(" ")}`;
const fiberType = parsed.fiberType ?? detectFiberType(textForInference);
const reachLabel = reach ?? parsed.reachLabel;
const reachMeters = parsed.reachMeters ?? reachMetersFromLabel(reachLabel);
const transceiverId = detail.targetTransceiverId ?? (await findOrCreateScrapedTransceiver({
partNumber: detail.partNumber,
@ -954,8 +965,8 @@ export async function scrapeFs(): Promise<void> {
formFactor: ff,
speedGbps: speedInfo?.speedGbps,
speed: speedInfo?.speed,
reachLabel: reach ?? parsed.reachLabel,
reachMeters: parsed.reachMeters,
reachLabel,
reachMeters,
fiberType,
wavelengths: parsed.wavelengths,
imageUrl: detail.imageUrl,
@ -968,7 +979,12 @@ export async function scrapeFs(): Promise<void> {
SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2),
form_factor = COALESCE(NULLIF(form_factor, ''), $3),
speed_gbps = CASE WHEN speed_gbps IS NULL OR speed_gbps = 0 THEN COALESCE($4, speed_gbps) ELSE speed_gbps END,
speed = COALESCE(NULLIF(speed, ''), $5),
speed = CASE
WHEN $5::text IS NOT NULL
AND (speed IS NULL OR speed = '' OR speed = 'Unknown' OR $4::numeric = speed_gbps)
THEN $5::text
ELSE speed
END,
reach_label = COALESCE(NULLIF(reach_label, ''), $6),
reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($7, reach_meters) ELSE reach_meters END,
fiber_type = COALESCE(NULLIF(fiber_type, ''), $8),
@ -981,8 +997,8 @@ export async function scrapeFs(): Promise<void> {
ff,
speedInfo?.speedGbps ?? null,
speedInfo?.speed ?? null,
reach ?? parsed.reachLabel ?? null,
parsed.reachMeters ?? null,
reachLabel ?? null,
reachMeters ?? null,
fiberType ?? null,
parsed.wavelengths ?? null,
]
@ -1038,7 +1054,7 @@ export async function scrapeFs(): Promise<void> {
const hasSourceDetails =
Object.keys(detail.specs).length > 0 ||
Boolean(fiberType || parsed.connector || parsed.wavelengths || parsed.reachLabel || reach);
Boolean(fiberType || parsed.connector || parsed.wavelengths || reachLabel);
if (hasSourceDetails) {
const updated = await updateVerifiedSpecs({
@ -1046,8 +1062,8 @@ export async function scrapeFs(): Promise<void> {
fiberType,
connector: parsed.connector,
wavelengths: parsed.wavelengths,
reachMeters: parsed.reachMeters,
reachLabel: reach ?? parsed.reachLabel,
reachMeters,
reachLabel,
powerConsumptionW: parsed.powerConsumptionW,
tempRange: parsed.tempRange,
modulation: parsed.modulation,

View File

@ -31,6 +31,7 @@ interface Product {
reachMeters?: number;
fiberType?: string;
wavelength?: string;
imageUrl?: string;
}
function sleep(ms: number): Promise<void> {
@ -116,6 +117,13 @@ function parseProductList(html: string): Product[] {
const ff = detectFormFactor(name);
const reach = detectReach(name);
const rawImg =
$(el).find("img").first().attr("data-src") ||
$(el).find("img").first().attr("data-lazy-src") ||
$(el).find("img").first().attr("src");
const imageUrl = rawImg && !/(logo|placeholder|default|no-image|icon|sprite)/i.test(rawImg)
? (rawImg.startsWith("http") ? rawImg : BASE + rawImg)
: undefined;
products.push({
partNumber,
@ -127,6 +135,7 @@ function parseProductList(html: string): Product[] {
reachMeters: reach?.meters,
fiberType: detectFiber(name),
wavelength: detectWavelength(name),
imageUrl,
});
});
@ -150,11 +159,19 @@ function parseProductList(html: string): Product[] {
}
const ff = detectFormFactor(name);
const reach = detectReach(name);
const rawImg =
$(el).find("img").first().attr("data-src") ||
$(el).find("img").first().attr("data-lazy-src") ||
$(el).find("img").first().attr("src");
const imageUrl = rawImg && !/(logo|placeholder|default|no-image|icon|sprite)/i.test(rawImg)
? (rawImg.startsWith("http") ? rawImg : BASE + rawImg)
: undefined;
products.push({
partNumber: name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] || name.replace(/\s+/g, "-").slice(0, 60),
name, url, price, ...ff,
reachLabel: reach?.label, reachMeters: reach?.meters,
fiberType: detectFiber(name), wavelength: detectWavelength(name),
imageUrl,
});
});
}
@ -224,6 +241,7 @@ export async function scrapeGaoTek(): Promise<void> {
const txId = await findOrCreateScrapedTransceiver({
partNumber: product.partNumber,
vendorId,
productUrl: product.url,
formFactor: product.formFactor,
speedGbps: product.speedGbps,
speed: product.speed,
@ -232,6 +250,7 @@ export async function scrapeGaoTek(): Promise<void> {
fiberType: product.fiberType,
wavelengths: product.wavelength,
category: "DataCenter",
imageUrl: product.imageUrl,
});
if (product.price && product.price > 0) {

View File

@ -15,7 +15,15 @@
*
* Rate limited: 1 req/2sec.
*/
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, upsertStockObservation } from "../utils/db";
import {
pool,
findOrCreateScrapedTransceiver,
ensureVendor,
upsertPriceObservation,
upsertStockObservation,
markImageVerified,
markDetailsVerified,
} from "../utils/db";
import { contentHash } from "../utils/hash";
import { readFileSync, writeFileSync, existsSync } from "node:fs";
import { join } from "node:path";
@ -29,7 +37,8 @@ const HEADERS = {
};
// Limit detail-page fetches per run to stay reasonable
const MAX_DETAIL_PAGES = 600;
/**
 * Parses the per-run detail-page cap from its environment value.
 *
 * @param raw Raw value of NADDOD_MAX_DETAIL_PAGES; may be undefined or empty.
 * @returns The parsed integer clamped to 1..1000, or 600 when the value is
 *          missing, empty or not numeric. Without this fallback `parseInt`
 *          returns NaN, which `Math.min`/`Math.max` pass straight through.
 */
function parseMaxDetailPages(raw: string | undefined): number {
  const parsed = Number.parseInt(raw || "600", 10);
  if (!Number.isFinite(parsed)) return 600;
  return Math.max(1, Math.min(1000, parsed));
}
const MAX_DETAIL_PAGES = parseMaxDetailPages(process.env["NADDOD_MAX_DETAIL_PAGES"]);
const DB_DETAIL_ONLY = process.env["NADDOD_DB_DETAIL_ONLY"] === "1";
// Cursor file: persists across runs so each run covers the next MAX_DETAIL_PAGES URLs (600 by default)
const CURSOR_FILE = join(process.env["TIP_STORAGE_DIR"] ?? "/opt/tip", "naddod-cursor.json");
@ -88,6 +97,26 @@ function detectSpeedGbps(text: string): { speed: string; speedGbps: number } {
}
function detectReach(text: string): { label: string; meters: number } | undefined {
const cableCode = text.match(/\b(?:AOC|DAC|CU|COPPER|MCP|MFS)[A-Z0-9._-]*?(\d+(?:\.\d+)?)M\b/i);
if (cableCode) {
const value = parseFloat(cableCode[1]);
if (Number.isFinite(value) && value > 0 && value <= 500) {
return { label: `${String(value).replace(/\.0$/, "")}m`, meters: Math.round(value) };
}
}
const generic = text.match(/\b(\d+(?:\.\d+)?)\s*(km|m)\b/i);
if (generic) {
const value = parseFloat(generic[1]);
const unit = generic[2].toLowerCase();
if (Number.isFinite(value) && value > 0) {
const meters = unit === "km" ? Math.round(value * 1000) : Math.round(value);
const labelValue = String(value).replace(/\.0$/, "");
const label = unit === "km" ? `${labelValue}km` : `${labelValue}m`;
return { label, meters };
}
}
const patterns: [RegExp, string, number][] = [
[/\b120\s*km\b/i, "120km", 120000],
[/\b80\s*km\b/i, "80km", 80000],
@ -102,8 +131,9 @@ function detectReach(text: string): { label: string; meters: number } | undefine
[/\b150\s*m\b/i, "150m", 150],
[/\b100\s*m\b/i, "100m", 100],
[/\bLR4\b/, "10km", 10000], [/\bLR\b/, "10km", 10000],
[/\bER4?\b/, "40km", 40000], [/\bZR4?\b/, "80km", 80000],
[/\bSR4?\b/, "300m", 300], [/\bDR4?\b/, "500m", 500],
[/\bER4?\b/, "40km", 40000], [/\bZRP?\b|\bZR4?\b/, "80km", 80000],
[/\bSR\d*\b|\bVR\d*\b/, "100m", 100],
[/\bDR4?\b|\bXDR\d*\b/, "500m", 500],
[/\bFR4?\b/, "2km", 2000],
];
for (const [re, label, meters] of patterns) {
@ -113,9 +143,10 @@ function detectReach(text: string): { label: string; meters: number } | undefine
}
/**
 * Classifies the cable medium of a product from free text.
 *
 * The checks run top to bottom and the first pattern that matches decides the
 * result. All patterns are case-insensitive.
 *
 * @param text Text to inspect.
 * @returns "SMF", "MMF", "Copper", or "" when no pattern matches.
 */
function detectFiber(text: string): string {
// NOTE(review): the next two checks are narrower forms of the SMF/MMF checks further
// down and run before the Copper/AOC checks — they look like pre-change lines that are
// still present in this listing; confirm which order is intended.
if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text)) return "SMF";
if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(text)) return "MMF";
// Copper markers; `dac` is matched as a plain substring.
if (/copper|dac|twinax|rj.?45|base-t/i.test(text)) return "Copper";
// NOTE(review): the trailing `[^a-z]?` in the sr/vr alternatives is optional, so they
// also match the start of longer words (e.g. " src"); the same applies to the
// zr/dr/fr alternatives on the SMF line below (e.g. " driver", " from"). Confirm this
// is intended, especially when the input is whole-page text.
if (/aoc|active.?optical|active.?fiber|[^a-z]sr\d*[^a-z]?|[^a-z]vr\d*[^a-z]?|850\s*nm/i.test(text)) return "MMF";
if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]?|[^a-z]dr\d*[^a-z]?|[^a-z]fr\d*[^a-z]?|xdr\d*|psm|bidi|cwdm|dwdm|1310\s*nm|1550\s*nm/i.test(text)) return "SMF";
if (/multi.?mode|mmf|[^a-z]sx[^a-z]/i.test(text)) return "MMF";
// Nothing matched: the medium is unknown.
return "";
}
@ -175,6 +206,7 @@ async function fetchText(url: string): Promise<string> {
*/
function parseDetailPage(html: string, url: string): {
name: string;
imageUrl?: string;
price?: number;
stock: { qty?: number; confidence: 1 | 2 } | null;
} | null {
@ -187,6 +219,20 @@ function parseDetailPage(html: string, url: string): {
if (!name || name.length < 10) return null;
if (!isTransceiver(name)) return null;
const imageUrl = (() => {
const candidates = [
html.match(/<meta\s+property="og:image"\s+content="([^"]+)"/i)?.[1],
html.match(/<meta\s+name="twitter:image"\s+content="([^"]+)"/i)?.[1],
html.match(/"image"\s*:\s*"([^"]+)"/i)?.[1],
].filter(Boolean) as string[];
const img = candidates.find((candidate) =>
!/(logo|placeholder|default|no-image|icon|sprite)/i.test(candidate)
);
if (!img) return undefined;
return img.startsWith("http") ? img : `${BASE}${img.startsWith("/") ? "" : "/"}${img}`;
})();
// Price: prefer LD+JSON offers.price (Astro/Shopify structure, e.g. "price":"731.00")
// Fall back to "US$ 10.90" or "$10.90" visible text patterns
let price: number | undefined;
@ -220,7 +266,7 @@ function parseDetailPage(html: string, url: string): {
// Stock count
const stock = parseStockText(html);
return { name, price, stock };
return { name, imageUrl, price, stock };
}
// ── Sitemap parsing ─────────────────────────────────────────────────────────
@ -247,6 +293,25 @@ async function fetchProductUrlsFromSitemap(): Promise<string[]> {
return [...new Set(urls)]; // deduplicate
}
/**
 * Loads NADDOD transceiver rows that have a product page URL stored and are still
 * missing detail or image verification.
 *
 * Rows are ordered by `details_verified`, then `image_verified`, then `updated_at`
 * (all ascending).
 *
 * @param limit Maximum number of rows to return.
 * @returns One entry per row, carrying the stored product page URL and the row id.
 */
async function fetchDbTargets(limit: number): Promise<Array<{ url: string; targetTransceiverId: string }>> {
  const { rows } = await pool.query<{ id: string; product_page_url: string }>(`
SELECT t.id, t.product_page_url
FROM transceivers t
JOIN vendors v ON v.id = t.vendor_id
WHERE v.name = 'NADDOD'
AND t.product_page_url IS NOT NULL
AND t.product_page_url != ''
AND (t.details_verified = false OR t.image_verified = false)
ORDER BY t.details_verified ASC, t.image_verified ASC, t.updated_at ASC
LIMIT $1
`, [limit]);
  return rows.map(({ id, product_page_url }) => ({
    url: product_page_url,
    targetTransceiverId: id,
  }));
}
// ── Main scraper ────────────────────────────────────────────────────────────
export async function scrapeNaddod(): Promise<void> {
@ -262,11 +327,18 @@ export async function scrapeNaddod(): Promise<void> {
// ── Phase 1: Discover product URLs via sitemap ────────────────────────────
console.log("[Phase 1] Discovering products from sitemap...");
let productUrls: string[] = [];
let targets: Array<{ url: string; targetTransceiverId?: string }> = [];
try {
productUrls = await fetchProductUrlsFromSitemap();
console.log(` Found ${productUrls.length} product URLs in sitemap`);
if (DB_DETAIL_ONLY) {
targets = await fetchDbTargets(MAX_DETAIL_PAGES);
productUrls = targets.map((target) => target.url);
console.log(` DB detail targets: ${productUrls.length}`);
} else {
productUrls = await fetchProductUrlsFromSitemap();
console.log(` Found ${productUrls.length} product URLs in sitemap`);
}
} catch (err) {
console.error(` Sitemap fetch failed: ${(err as Error).message}`);
console.error(` Target discovery failed: ${(err as Error).message}`);
return;
}
@ -278,16 +350,16 @@ export async function scrapeNaddod(): Promise<void> {
// Cursor-based rotation: each run advances by MAX_DETAIL_PAGES so over ~12 runs
// (24 hours) we cover all ~7300 products. Wraps around when exhausted.
const totalUrls = productUrls.length;
const offset = readCursor() % totalUrls;
const offset = DB_DETAIL_ONLY ? 0 : readCursor() % totalUrls;
const endIdx = Math.min(offset + MAX_DETAIL_PAGES, totalUrls);
let urls = productUrls.slice(offset, endIdx);
// Wrap around if we got fewer than MAX_DETAIL_PAGES (hit the end of the list)
if (urls.length < MAX_DETAIL_PAGES && totalUrls > MAX_DETAIL_PAGES) {
const wrap = MAX_DETAIL_PAGES - urls.length;
urls = urls.concat(productUrls.slice(0, wrap));
let batchTargets: Array<{ url: string; targetTransceiverId?: string }> =
DB_DETAIL_ONLY ? targets : productUrls.slice(offset, endIdx).map((url) => ({ url }));
if (!DB_DETAIL_ONLY && batchTargets.length < MAX_DETAIL_PAGES && totalUrls > MAX_DETAIL_PAGES) {
const wrap = MAX_DETAIL_PAGES - batchTargets.length;
batchTargets = batchTargets.concat(productUrls.slice(0, wrap).map((url) => ({ url })));
}
const nextOffset = (offset + MAX_DETAIL_PAGES) % totalUrls;
console.log(` Offset: ${offset}/${totalUrls} → processing ${urls.length} products (next run starts at ${nextOffset})`);
const nextOffset = DB_DETAIL_ONLY ? offset : (offset + MAX_DETAIL_PAGES) % totalUrls;
console.log(` Offset: ${offset}/${totalUrls} → processing ${batchTargets.length} products (next run starts at ${nextOffset})`);
// ── Phase 2: Fetch detail pages + write to DB ─────────────────────────────
console.log("\n[Phase 2] Fetching product detail pages...");
@ -299,7 +371,8 @@ export async function scrapeNaddod(): Promise<void> {
let skippedNonTx = 0;
let errors = 0;
for (const url of urls) {
for (const target of batchTargets) {
const url = target.url;
await sleep(2000);
try {
const html = await fetchText(url);
@ -310,28 +383,64 @@ export async function scrapeNaddod(): Promise<void> {
continue;
}
const { name, price, stock } = detail;
const { speed, speedGbps } = detectSpeedGbps(name);
const formFactor = detectFormFactor(name);
const reach = detectReach(name);
const fiberType = detectFiber(name);
const wavelength = detectWavelength(name);
const { name, imageUrl, price, stock } = detail;
const evidenceText = `${name} ${html.replace(/<[^>]+>/g, " ").slice(0, 20000)}`;
const { speed, speedGbps } = detectSpeedGbps(evidenceText);
const formFactor = detectFormFactor(evidenceText);
const reach = detectReach(evidenceText);
const fiberType = detectFiber(evidenceText);
const wavelength = detectWavelength(evidenceText);
// Extract part number from name (first word-group before "Compatible" or vendor name)
const partNumber = name.split(/\s+(?:compatible|for\s+[A-Z]|sfp|qsfp)/i)[0]?.trim().slice(0, 80) || name.slice(0, 60);
const txId = await findOrCreateScrapedTransceiver({
partNumber,
vendorId,
formFactor,
speedGbps,
speed,
reachMeters: reach?.meters,
reachLabel: reach?.label,
fiberType,
wavelengths: wavelength,
category: "DataCenter",
});
let txId = target.targetTransceiverId;
if (txId) {
await pool.query(`
UPDATE transceivers
SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2),
form_factor = COALESCE(NULLIF($3::text, ''), form_factor),
speed_gbps = CASE WHEN $4::numeric > 0 THEN $4::numeric ELSE speed_gbps END,
speed = CASE WHEN $4::numeric > 0 THEN $5 ELSE speed END,
reach_meters = CASE WHEN $6::int IS NOT NULL AND $6::int > 0 THEN $6::int ELSE reach_meters END,
reach_label = COALESCE(NULLIF($7::text, ''), reach_label),
fiber_type = COALESCE(NULLIF($8::text, ''), fiber_type),
wavelengths = COALESCE(NULLIF($9::text, ''), wavelengths),
category = COALESCE(NULLIF(category, ''), 'DataCenter'),
updated_at = NOW()
WHERE id = $1
AND vendor_id = $10
`, [
txId,
url,
formFactor,
speedGbps,
speed,
reach?.meters ?? null,
reach?.label ?? null,
fiberType || null,
wavelength || null,
vendorId,
]);
if (imageUrl) await markImageVerified(txId, imageUrl);
await markDetailsVerified({ transceiverId: txId, sourceUrl: url });
} else {
txId = await findOrCreateScrapedTransceiver({
partNumber,
vendorId,
productUrl: url,
formFactor,
speedGbps,
speed,
reachMeters: reach?.meters,
reachLabel: reach?.label,
fiberType,
wavelengths: wavelength,
category: "DataCenter",
imageUrl,
});
}
// Price observation
if (price && price > 0) {
@ -368,7 +477,7 @@ export async function scrapeNaddod(): Promise<void> {
processed++;
if (processed % 50 === 0) {
console.log(` Progress: ${processed}/${urls.length} | prices: ${priceUpdates} | stock: ${stockWritten} new / ${stockSkipped} unchanged`);
console.log(` Progress: ${processed}/${batchTargets.length} | prices: ${priceUpdates} | stock: ${stockWritten} new / ${stockSkipped} unchanged`);
}
} catch (err) {
errors++;
@ -377,10 +486,10 @@ export async function scrapeNaddod(): Promise<void> {
}
// Advance cursor for next run
writeCursor(nextOffset);
if (!DB_DETAIL_ONLY) writeCursor(nextOffset);
console.log("\n=== NADDOD Scraper v2 Complete ===");
console.log(` URL range processed: ${offset}${offset + urls.length - 1} of ${totalUrls}`);
console.log(` URL range processed: ${offset}${offset + batchTargets.length - 1} of ${totalUrls}`);
console.log(` Products processed: ${processed}`);
console.log(` Non-transceivers skip: ${skippedNonTx}`);
console.log(` Price observations: ${priceUpdates} new`);

View File

@ -1,9 +1,102 @@
# Current TIP Sync State
Updated: 2026-05-09 12:16 UTC
Updated: 2026-05-09 13:54 UTC
## Newest Work
- FS.com 1.6T DR8/2FR4 source correction on 2026-05-09:
- operator spotted that FS.com has two distinct 1.6T OSFP variants on the same family:
- `OSFP-DR8-1.6T-FL`: 500m, DR8, SMF
- `OSFP-2FR4-1.6T-FL`: 2km, 2FR4, SMF
- confirmed in TIP DB:
- both FS.com variants exist as separate rows
- `OSFP-2FR4-1.6T-FL` had `reach_meters=0` even though the source and row label said `2km`
- `OSFP-DR8-1.6T-FL` had no wavelength, causing the deterministic equivalence worker to reject the otherwise correct 500m Flexoptix match
- live DB correction:
- `OSFP-DR8-1.6T-FL`
- `speed=1.6T`
- `speed_gbps=1600`
- `reach_label=500m`
- `reach_meters=500`
- `fiber_type=SMF`
- `wavelengths=1310`
- `standard_name=1.6T OSFP DR8`
- fully verified remains true
- `OSFP-2FR4-1.6T-FL`
- `speed=1.6T`
- `speed_gbps=1600`
- `reach_label=2km`
- `reach_meters=2000`
- `fiber_type=SMF`
- `wavelengths=1310`
- `standard_name=1.6T OSFP 2FR4`
- fully verified true
- Flexoptix `O.1316T.C.05.M`
- confirmed as `500m`, `SMF`, `1.6T`
- `standard_name=1.6T OSFP DR8`
- equivalence correction:
- approved only `O.1316T.C.05.M` ↔ `OSFP-DR8-1.6T-FL`
- confidence `0.913`
- match basis: form factor, speed, reach, fiber, wavelength and source variant DR8/500m
- `OSFP-2FR4-1.6T-FL` remains separate and is not linked to the 500m DR8 Flexoptix product
- scraper hardening:
- `packages/scraper/src/scrapers/fs-com.ts`
- recognizes German/decimal `1,6T` and `1600G` as `1.6T`/`1600`
- converts reach labels such as `2km` into `reach_meters=2000`
- updates stale `speed` labels when the numeric source speed matches the row
- build:
- `pnpm -C packages/scraper build` passed on Erik
- truth:
- there are definitely two separate FS.com variants
- 500m DR8 is the correct equivalent for Flexoptix `O.1316T.C.05.M`
- 2km 2FR4 (`OSFP-2FR4-1.6T-FL`) is a separate DB product and must not be collapsed into the 500m match
- Targeted vendor verification push after equivalence revalidation on 2026-05-09:
- code improved:
- `NADDOD_DB_DETAIL_ONLY=1` mode verifies existing NADDOD rows with source URLs instead of rotating blindly through the full sitemap
- NADDOD now extracts `og:image`, source product URLs, reach/fiber/wavelength from page evidence, AOC/DAC cable lengths, and DR/FR/SR/VR/XDR patterns
- GAO Tek now writes product URLs and image evidence
- Ascent Optics now writes product URLs and table image evidence
- Eoptolink now writes product URLs, images, reach/wavelength evidence and corrects over-broad form-factor parsing by preferring title/slug evidence
- live low-load Erik runs:
- GAO Tek static crawl:
- `473` unique products processed
- GAO Tek detail coverage improved from `41` to `126`
- `no_url` dropped to `0`
- Ascent Optics static/API crawl:
- `253` catalog products processed
- image coverage `235/305`
- detail coverage `213/305`
- Eoptolink static crawl:
- `76` product-solution pages inspected
- after parser correction, Eoptolink is `287/287` image and detail verified
- NADDOD targeted DB-detail mode:
- first targeted wave `200` pages
- second wave `300` pages
- closure wave `385` pages
- special-case wave `83` pages
- NADDOD moved from `image=12`, `details=157`, `fully=0/1-ish` to:
- total `748`
- price `744`
- image `742`
- details `659`
- competitor `744`
- fully `659`
- no URL `6`
- global TIP counters after this push:
- price verified `11557`
- image verified `11963`
- details verified `11018`
- fully verified `9794`
- total transceivers `17647`
- health:
- TIP stayed `healthy`
- load status `ok`
- memory used about `13%`
- truth:
- NADDOD is not 100% complete; remaining detail gaps include likely non-transceiver switch/NIC products and a smaller set of parser-special cases
- OEM catalogs like Ascent and Eoptolink do not publish retail prices, so full verification cannot be forced honestly without price evidence
- Immediate full TIP equivalence revalidation on 2026-05-09:
- operator requested all open TIP validation to be completed immediately and all product matches checked for true 1:1 equivalence
- live preflight:

View File

@ -0,0 +1,117 @@
# FS.com 1.6T Variant Correction + Vendor Verification Push
Date: 2026-05-09
Actor: Codex
## Operator Finding
The operator spotted a concrete source-truth problem on FS.com:
- `OSFP-DR8-1.6T-FL` is the 500m DR8 variant.
- `OSFP-2FR4-1.6T-FL` is the 2km 2FR4 variant.
- Flexoptix `O.1316T.C.05.M` is the 500m DR8 product.
- The 2km 2FR4 variant must be present as its own product and must not be collapsed into the 500m match.
## Live DB Correction
Corrected FS.com rows:
- `OSFP-DR8-1.6T-FL`
- `speed=1.6T`
- `speed_gbps=1600`
- `reach_label=500m`
- `reach_meters=500`
- `fiber_type=SMF`
- `wavelengths=1310`
- `standard_name=1.6T OSFP DR8`
- fully verified
- `OSFP-2FR4-1.6T-FL`
- `speed=1.6T`
- `speed_gbps=1600`
- `reach_label=2km`
- `reach_meters=2000`
- `fiber_type=SMF`
- `wavelengths=1310`
- `standard_name=1.6T OSFP 2FR4`
- fully verified
Corrected Flexoptix row:
- `O.1316T.C.05.M`
- confirmed `500m`, `SMF`, `1.6T`
- `standard_name=1.6T OSFP DR8`
Corrected equivalence:
- Approved only `O.1316T.C.05.M` ↔ `OSFP-DR8-1.6T-FL`.
- Confidence: `0.913`.
- Basis: form factor, speed, reach, fiber, wavelength and explicit source variant DR8/500m.
- `OSFP-2FR4-1.6T-FL` remains separate and is not linked to the 500m Flexoptix product.
## Scraper Hardening
Updated `packages/scraper/src/scrapers/fs-com.ts`:
- Detects `1,6T`, `1.6T` and `1600G` as `1.6T`/`1600`.
- Converts labels like `2km` to `reach_meters=2000`.
- Updates stale `speed` strings when the numeric source speed matches the row.
Remote build on Erik passed:
```text
pnpm -C packages/scraper build
```
## Vendor Verification Work In Same Push
Updated:
- `packages/scraper/src/scrapers/naddod.ts`
- `packages/scraper/src/scrapers/gaotek.ts`
- `packages/scraper/src/scrapers/ascentoptics.ts`
- `packages/scraper/src/scrapers/eoptolink.ts`
Live results:
- GAO Tek:
- details improved from `41` to `126`
- no-url dropped to `0`
- Ascent Optics:
- image `235/305`
- details `213/305`
- Eoptolink:
- image `287/287`
- details `287/287`
- NADDOD:
- total `748`
- price `744`
- image `742`
- details `659`
- competitor `744`
- fully `659`
- no URL `6`
Global TIP counters after the push:
- price verified `11557`
- image verified `11963`
- details verified `11018`
- fully verified `9794`
- total transceivers `17647`
TIP remained healthy:
- status `healthy`
- load status `ok`
- memory around `13%`
## Lesson For TIPLLM
Variant selectors on vendor pages must be treated as separate products when reach, optical protocol, connector or model changes.
For FS.com 1.6T OSFP:
- `DR8 500m` and `2FR4 2km` are distinct SKUs and distinct compatibility candidates.
- A Flexoptix 500m DR8 product must not be matched to a 2km 2FR4 FS.com product.
- Source pages can expose German decimal text (`1,6T`) and separate net/gross prices; normalize carefully.