fix: improve vendor verification and fscom 1.6t variants
This commit is contained in:
parent
7da78a999d
commit
b26696f0d1
@ -45,6 +45,7 @@ interface Product {
|
||||
partNumber: string;
|
||||
name: string;
|
||||
url: string;
|
||||
imageUrl?: string;
|
||||
formFactor: string;
|
||||
speed: string;
|
||||
speedGbps: number;
|
||||
@ -156,11 +157,16 @@ function parseProductTable(
|
||||
|
||||
const combined = `${rawPart} ${desc}`;
|
||||
const reach = detectReach(combined);
|
||||
const rawImg = $(cells[0]).find("img").first().attr("src") || $(cells[0]).find("img").first().attr("data-src");
|
||||
const imageUrl = rawImg && !/(logo|placeholder|default|no-image|icon|sprite)/i.test(rawImg)
|
||||
? (rawImg.startsWith("http") ? rawImg : BASE + rawImg)
|
||||
: undefined;
|
||||
|
||||
products.push({
|
||||
partNumber: rawPart,
|
||||
name: desc || rawPart,
|
||||
url,
|
||||
imageUrl,
|
||||
formFactor: cat.formFactor,
|
||||
speed: cat.speed,
|
||||
speedGbps: cat.speedGbps,
|
||||
@ -211,6 +217,7 @@ export async function scrapeAscentOptics(): Promise<void> {
|
||||
await findOrCreateScrapedTransceiver({
|
||||
partNumber: product.partNumber,
|
||||
vendorId,
|
||||
productUrl: product.url,
|
||||
formFactor: product.formFactor,
|
||||
speedGbps: product.speedGbps,
|
||||
speed: product.speed,
|
||||
@ -219,6 +226,7 @@ export async function scrapeAscentOptics(): Promise<void> {
|
||||
fiberType: product.fiberType,
|
||||
wavelengths: product.wavelength,
|
||||
category: "DataCenter",
|
||||
imageUrl: product.imageUrl,
|
||||
});
|
||||
totalProducts++;
|
||||
} catch (err) {
|
||||
|
||||
@ -57,6 +57,12 @@ function speedFromSlug(slug: string): { speed: string; speedGbps: number } {
|
||||
return { speed: "Unknown", speedGbps: 0 };
|
||||
}
|
||||
|
||||
/**
 * Resolves the link speed for a product page, giving the page title priority.
 *
 * The title is run through `speedFromSlug` first; when that yields a positive
 * `speedGbps` it is returned as-is. Otherwise the URL slug is parsed instead,
 * and whatever `speedFromSlug` returns for the slug is the result.
 *
 * @param title - Page title text.
 * @param slug - Slug derived from the page URL.
 * @returns The speed label and its numeric value in Gbps.
 */
function speedFromTitleThenSlug(title: string, slug: string): { speed: string; speedGbps: number } {
  const fromTitle = speedFromSlug(title);
  return fromTitle.speedGbps > 0 ? fromTitle : speedFromSlug(slug);
}
|
||||
|
||||
function formFactorFromText(text: string): string {
|
||||
const t = text.toUpperCase();
|
||||
if (/\bOSFP\b/.test(t)) return "OSFP";
|
||||
@ -124,10 +130,44 @@ interface EoptolinkProduct {
|
||||
speedGbps: number;
|
||||
formFactor: string;
|
||||
fiberType: string;
|
||||
reachLabel?: string;
|
||||
reachMeters?: number;
|
||||
wavelength?: string;
|
||||
imageUrl?: string;
|
||||
category: string;
|
||||
pageUrl: string;
|
||||
}
|
||||
|
||||
/**
 * Infers the nominal reach of an optic from free text.
 *
 * Explicit distances ("10 km", "500m") are checked first, longest first, so a
 * stated distance always wins over a reach code. If no distance is present the
 * IEEE-style reach code (ZR/ER/LR/FR/DR/SR) is mapped to its typical reach.
 *
 * Reach codes are accepted with an optional single-digit lane-count prefix and
 * an optional numeric suffix, so "LR4", "DR8", "SR4" and "2FR4" are recognised
 * as well as the bare two-letter form.
 *
 * @param text - Title, slug and/or page text to inspect.
 * @returns The reach label and its value in metres, or `undefined` when
 *          nothing in the text indicates a reach.
 */
function reachFromText(text: string): { label: string; meters: number } | undefined {
  const patterns: [RegExp, string, number][] = [
    // Explicit distances, longest first.
    [/\b120\s*km\b/i, "120km", 120000],
    [/\b100\s*km\b/i, "100km", 100000],
    [/\b80\s*km\b/i, "80km", 80000],
    [/\b40\s*km\b/i, "40km", 40000],
    [/\b20\s*km\b/i, "20km", 20000],
    [/\b10\s*km\b/i, "10km", 10000],
    [/\b2\s*km\b/i, "2km", 2000],
    [/\b500\s*m\b/i, "500m", 500],
    [/\b300\s*m\b/i, "300m", 300],
    [/\b100\s*m\b/i, "100m", 100],
    // Reach codes: `\d?` admits a lane-count prefix (2FR4), `\d*` a lane suffix (DR8).
    [/\b\d?ZR\d*\b/i, "80km", 80000],
    [/\b\d?ER\d*\b/i, "40km", 40000],
    [/\b\d?LR\d*\b/i, "10km", 10000],
    [/\b\d?FR\d*\b/i, "2km", 2000],
    [/\b\d?DR\d*\b/i, "500m", 500],
    [/\b\d?SR\d*\b/i, "300m", 300],
  ];
  for (const [regex, label, meters] of patterns) {
    if (regex.test(text)) return { label, meters };
  }
  return undefined;
}
|
||||
|
||||
/**
 * Pulls the first wavelength mentioned in the text.
 *
 * Looks for three or four digits followed (optionally after whitespace) by
 * "nm", case-insensitively.
 *
 * @param text - Text to scan.
 * @returns The digits of the first match (without the unit), or an empty
 *          string when the text contains no wavelength.
 */
function wavelengthFromText(text: string): string {
  const found = /(\d{3,4})\s*nm/i.exec(text);
  if (found === null) {
    return "";
  }
  return found[1];
}
|
||||
|
||||
function parseProductPage(html: string, pageUrl: string): EoptolinkProduct | null {
|
||||
// Page title
|
||||
const titleMatch = html.match(/<title>([^<]+)/i) || html.match(/<h1[^>]*>([^<]{5,80})</i);
|
||||
@ -138,13 +178,36 @@ function parseProductPage(html: string, pageUrl: string): EoptolinkProduct | nul
|
||||
const pnRegex = /E[A-Z]{2,5}-\d{2,3}[A-Z0-9]{1,3}(?:-\d{1,3})?(?:-[A-Z0-9]{1,6})*/g;
|
||||
const partNumbers = [...new Set([...(html.matchAll(pnRegex) ?? [])].map((m) => m[0].trim()))];
|
||||
|
||||
const slug = pageUrl.split("/").slice(-2).join("-");
|
||||
const { speed, speedGbps } = speedFromSlug(slug + " " + pageTitle);
|
||||
const formFactor = formFactorFromText(pageTitle + " " + slug);
|
||||
const fiberType = fiberFromText(pageTitle + " " + slug);
|
||||
const category = categoryFromText(pageTitle + " " + slug);
|
||||
const slug = pageUrl.split("/").filter(Boolean).slice(-2).join("-");
|
||||
const titleEvidence = `${pageTitle} ${slug}`;
|
||||
const pageEvidence = `${titleEvidence} ${html.replace(/<[^>]+>/g, " ").slice(0, 12000)}`;
|
||||
const { speed, speedGbps } = speedFromTitleThenSlug(pageTitle, slug);
|
||||
const formFactor = formFactorFromText(titleEvidence);
|
||||
const fiberType = fiberFromText(titleEvidence);
|
||||
const reach = reachFromText(pageEvidence);
|
||||
const wavelength = wavelengthFromText(pageEvidence);
|
||||
const category = categoryFromText(titleEvidence);
|
||||
const rawImage =
|
||||
html.match(/<meta\s+property="og:image"\s+content="([^"]+)"/i)?.[1] ||
|
||||
html.match(/<img[^>]+src="([^"]+)"/i)?.[1];
|
||||
const imageUrl = rawImage && !/(logo|placeholder|default|no-image|icon|sprite)/i.test(rawImage)
|
||||
? (rawImage.startsWith("http") ? rawImage : `${BASE}${rawImage.startsWith("/") ? "" : "/"}${rawImage}`)
|
||||
: undefined;
|
||||
|
||||
return { pageTitle, partNumbers, speed, speedGbps, formFactor, fiberType, category, pageUrl };
|
||||
return {
|
||||
pageTitle,
|
||||
partNumbers,
|
||||
speed,
|
||||
speedGbps,
|
||||
formFactor,
|
||||
fiberType,
|
||||
reachLabel: reach?.label,
|
||||
reachMeters: reach?.meters,
|
||||
wavelength,
|
||||
imageUrl,
|
||||
category,
|
||||
pageUrl,
|
||||
};
|
||||
}
|
||||
|
||||
// ── Main ────────────────────────────────────────────────────────────────────
|
||||
@ -193,15 +256,47 @@ export async function scrapeEoptolink(): Promise<void> {
|
||||
|
||||
for (const partNumber of namesToSeed) {
|
||||
try {
|
||||
await findOrCreateScrapedTransceiver({
|
||||
const txId = await findOrCreateScrapedTransceiver({
|
||||
partNumber: partNumber.slice(0, 80),
|
||||
vendorId,
|
||||
productUrl: url,
|
||||
formFactor: product.formFactor,
|
||||
speedGbps: product.speedGbps,
|
||||
speed: product.speed,
|
||||
reachMeters: product.reachMeters,
|
||||
reachLabel: product.reachLabel,
|
||||
fiberType: product.fiberType,
|
||||
wavelengths: product.wavelength,
|
||||
category: product.category,
|
||||
imageUrl: product.imageUrl,
|
||||
});
|
||||
|
||||
await pool.query(`
|
||||
UPDATE transceivers
|
||||
SET form_factor = $2,
|
||||
speed_gbps = $3,
|
||||
speed = $4,
|
||||
reach_meters = CASE WHEN $5::int IS NOT NULL THEN $5::int ELSE reach_meters END,
|
||||
reach_label = COALESCE(NULLIF($6::text, ''), reach_label),
|
||||
fiber_type = COALESCE(NULLIF($7::text, ''), fiber_type),
|
||||
wavelengths = COALESCE(NULLIF($8::text, ''), wavelengths),
|
||||
product_page_url = COALESCE(NULLIF($9::text, ''), product_page_url),
|
||||
updated_at = NOW()
|
||||
WHERE id = $1
|
||||
AND vendor_id = $10
|
||||
`, [
|
||||
txId,
|
||||
product.formFactor,
|
||||
product.speedGbps,
|
||||
product.speed,
|
||||
product.reachMeters ?? null,
|
||||
product.reachLabel ?? null,
|
||||
product.fiberType || null,
|
||||
product.wavelength || null,
|
||||
url,
|
||||
vendorId,
|
||||
]);
|
||||
|
||||
added++;
|
||||
} catch (dbErr) {
|
||||
// Duplicate or constraint error — expected for re-runs
|
||||
|
||||
@ -226,7 +226,7 @@ function detectFormFactor(text: string): string | undefined {
|
||||
|
||||
function detectSpeed(text: string): { speed: string; speedGbps: number } | undefined {
|
||||
const patterns: [RegExp, string, number][] = [
|
||||
[/1\.6\s*t/i, "1.6T", 1600],
|
||||
[/1[\.,]6\s*t|1600\s*g/i, "1.6T", 1600],
|
||||
[/800\s*g/i, "800G", 800],
|
||||
[/400\s*g/i, "400G", 400],
|
||||
[/200\s*g/i, "200G", 200],
|
||||
@ -248,6 +248,15 @@ function detectReach(text: string): string | undefined {
|
||||
return m ? `${m[1].replace(/,/g, "")}${m[2].toLowerCase()}` : undefined;
|
||||
}
|
||||
|
||||
/**
 * Converts a reach label such as "500m", "2km" or "1.5 km" into metres.
 *
 * Surrounding whitespace and whitespace between the number and the unit are
 * tolerated, and the unit is matched case-insensitively. Kilometre values are
 * multiplied by 1000; the result is rounded to a whole number of metres.
 *
 * @param label - Reach label, if one is known.
 * @returns The reach in metres, or `undefined` when the label is missing,
 *          not of the form `<number><m|km>`, or does not describe a positive
 *          whole-metre distance.
 */
function reachMetersFromLabel(label?: string): number | undefined {
  if (!label) return undefined;
  const match = label.trim().match(/^(\d+(?:\.\d+)?)\s*(m|km)$/i);
  if (!match) return undefined;
  const value = parseFloat(match[1]);
  if (!Number.isFinite(value) || value <= 0) return undefined;
  const meters = match[2].toLowerCase() === "km" ? Math.round(value * 1000) : Math.round(value);
  // A sub-half-metre value rounds to 0, which would read as "no reach"; report it as unknown instead.
  return meters > 0 ? meters : undefined;
}
|
||||
|
||||
function detectFiberType(text: string): string | undefined {
|
||||
if (/active\s+optical|\baoc\b/i.test(text)) return "AOC";
|
||||
if (/copper|dac|twinax|direct\s+attach|rj-?45|base-t/i.test(text)) return "Copper";
|
||||
@ -946,6 +955,8 @@ export async function scrapeFs(): Promise<void> {
|
||||
const parsed = parseSpecTable(detail.specs);
|
||||
const textForInference = `${detail.name} ${detail.partNumber} ${Object.values(detail.specs).join(" ")}`;
|
||||
const fiberType = parsed.fiberType ?? detectFiberType(textForInference);
|
||||
const reachLabel = reach ?? parsed.reachLabel;
|
||||
const reachMeters = parsed.reachMeters ?? reachMetersFromLabel(reachLabel);
|
||||
|
||||
const transceiverId = detail.targetTransceiverId ?? (await findOrCreateScrapedTransceiver({
|
||||
partNumber: detail.partNumber,
|
||||
@ -954,8 +965,8 @@ export async function scrapeFs(): Promise<void> {
|
||||
formFactor: ff,
|
||||
speedGbps: speedInfo?.speedGbps,
|
||||
speed: speedInfo?.speed,
|
||||
reachLabel: reach ?? parsed.reachLabel,
|
||||
reachMeters: parsed.reachMeters,
|
||||
reachLabel,
|
||||
reachMeters,
|
||||
fiberType,
|
||||
wavelengths: parsed.wavelengths,
|
||||
imageUrl: detail.imageUrl,
|
||||
@ -968,7 +979,12 @@ export async function scrapeFs(): Promise<void> {
|
||||
SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2),
|
||||
form_factor = COALESCE(NULLIF(form_factor, ''), $3),
|
||||
speed_gbps = CASE WHEN speed_gbps IS NULL OR speed_gbps = 0 THEN COALESCE($4, speed_gbps) ELSE speed_gbps END,
|
||||
speed = COALESCE(NULLIF(speed, ''), $5),
|
||||
speed = CASE
|
||||
WHEN $5::text IS NOT NULL
|
||||
AND (speed IS NULL OR speed = '' OR speed = 'Unknown' OR $4::numeric = speed_gbps)
|
||||
THEN $5::text
|
||||
ELSE speed
|
||||
END,
|
||||
reach_label = COALESCE(NULLIF(reach_label, ''), $6),
|
||||
reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($7, reach_meters) ELSE reach_meters END,
|
||||
fiber_type = COALESCE(NULLIF(fiber_type, ''), $8),
|
||||
@ -981,8 +997,8 @@ export async function scrapeFs(): Promise<void> {
|
||||
ff,
|
||||
speedInfo?.speedGbps ?? null,
|
||||
speedInfo?.speed ?? null,
|
||||
reach ?? parsed.reachLabel ?? null,
|
||||
parsed.reachMeters ?? null,
|
||||
reachLabel ?? null,
|
||||
reachMeters ?? null,
|
||||
fiberType ?? null,
|
||||
parsed.wavelengths ?? null,
|
||||
]
|
||||
@ -1038,7 +1054,7 @@ export async function scrapeFs(): Promise<void> {
|
||||
|
||||
const hasSourceDetails =
|
||||
Object.keys(detail.specs).length > 0 ||
|
||||
Boolean(fiberType || parsed.connector || parsed.wavelengths || parsed.reachLabel || reach);
|
||||
Boolean(fiberType || parsed.connector || parsed.wavelengths || reachLabel);
|
||||
|
||||
if (hasSourceDetails) {
|
||||
const updated = await updateVerifiedSpecs({
|
||||
@ -1046,8 +1062,8 @@ export async function scrapeFs(): Promise<void> {
|
||||
fiberType,
|
||||
connector: parsed.connector,
|
||||
wavelengths: parsed.wavelengths,
|
||||
reachMeters: parsed.reachMeters,
|
||||
reachLabel: reach ?? parsed.reachLabel,
|
||||
reachMeters,
|
||||
reachLabel,
|
||||
powerConsumptionW: parsed.powerConsumptionW,
|
||||
tempRange: parsed.tempRange,
|
||||
modulation: parsed.modulation,
|
||||
|
||||
@ -31,6 +31,7 @@ interface Product {
|
||||
reachMeters?: number;
|
||||
fiberType?: string;
|
||||
wavelength?: string;
|
||||
imageUrl?: string;
|
||||
}
|
||||
|
||||
function sleep(ms: number): Promise<void> {
|
||||
@ -116,6 +117,13 @@ function parseProductList(html: string): Product[] {
|
||||
|
||||
const ff = detectFormFactor(name);
|
||||
const reach = detectReach(name);
|
||||
const rawImg =
|
||||
$(el).find("img").first().attr("data-src") ||
|
||||
$(el).find("img").first().attr("data-lazy-src") ||
|
||||
$(el).find("img").first().attr("src");
|
||||
const imageUrl = rawImg && !/(logo|placeholder|default|no-image|icon|sprite)/i.test(rawImg)
|
||||
? (rawImg.startsWith("http") ? rawImg : BASE + rawImg)
|
||||
: undefined;
|
||||
|
||||
products.push({
|
||||
partNumber,
|
||||
@ -127,6 +135,7 @@ function parseProductList(html: string): Product[] {
|
||||
reachMeters: reach?.meters,
|
||||
fiberType: detectFiber(name),
|
||||
wavelength: detectWavelength(name),
|
||||
imageUrl,
|
||||
});
|
||||
});
|
||||
|
||||
@ -150,11 +159,19 @@ function parseProductList(html: string): Product[] {
|
||||
}
|
||||
const ff = detectFormFactor(name);
|
||||
const reach = detectReach(name);
|
||||
const rawImg =
|
||||
$(el).find("img").first().attr("data-src") ||
|
||||
$(el).find("img").first().attr("data-lazy-src") ||
|
||||
$(el).find("img").first().attr("src");
|
||||
const imageUrl = rawImg && !/(logo|placeholder|default|no-image|icon|sprite)/i.test(rawImg)
|
||||
? (rawImg.startsWith("http") ? rawImg : BASE + rawImg)
|
||||
: undefined;
|
||||
products.push({
|
||||
partNumber: name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] || name.replace(/\s+/g, "-").slice(0, 60),
|
||||
name, url, price, ...ff,
|
||||
reachLabel: reach?.label, reachMeters: reach?.meters,
|
||||
fiberType: detectFiber(name), wavelength: detectWavelength(name),
|
||||
imageUrl,
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -224,6 +241,7 @@ export async function scrapeGaoTek(): Promise<void> {
|
||||
const txId = await findOrCreateScrapedTransceiver({
|
||||
partNumber: product.partNumber,
|
||||
vendorId,
|
||||
productUrl: product.url,
|
||||
formFactor: product.formFactor,
|
||||
speedGbps: product.speedGbps,
|
||||
speed: product.speed,
|
||||
@ -232,6 +250,7 @@ export async function scrapeGaoTek(): Promise<void> {
|
||||
fiberType: product.fiberType,
|
||||
wavelengths: product.wavelength,
|
||||
category: "DataCenter",
|
||||
imageUrl: product.imageUrl,
|
||||
});
|
||||
|
||||
if (product.price && product.price > 0) {
|
||||
|
||||
@ -15,7 +15,15 @@
|
||||
*
|
||||
* Rate limited: 1 req/2sec.
|
||||
*/
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, upsertStockObservation } from "../utils/db";
|
||||
import {
|
||||
pool,
|
||||
findOrCreateScrapedTransceiver,
|
||||
ensureVendor,
|
||||
upsertPriceObservation,
|
||||
upsertStockObservation,
|
||||
markImageVerified,
|
||||
markDetailsVerified,
|
||||
} from "../utils/db";
|
||||
import { contentHash } from "../utils/hash";
|
||||
import { readFileSync, writeFileSync, existsSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
@ -29,7 +37,8 @@ const HEADERS = {
|
||||
};
|
||||
|
||||
// Limit detail-page fetches per run to stay reasonable
|
||||
const MAX_DETAIL_PAGES = 600;
|
||||
/**
 * Parses the per-run detail-page limit from its raw environment value.
 *
 * Missing, empty or non-numeric input falls back to 600; numeric input is
 * clamped to the range 1..1000. Without the NaN guard a value such as "abc"
 * would propagate NaN through Math.min/Math.max and become the limit.
 */
function parseDetailPageLimit(raw: string | undefined): number {
  const parsed = parseInt(raw || "600", 10);
  if (Number.isNaN(parsed)) return 600;
  return Math.max(1, Math.min(1000, parsed));
}

const MAX_DETAIL_PAGES = parseDetailPageLimit(process.env["NADDOD_MAX_DETAIL_PAGES"]);
|
||||
const DB_DETAIL_ONLY = process.env["NADDOD_DB_DETAIL_ONLY"] === "1";
|
||||
|
||||
// Cursor file: persists across runs so each run covers the next 600 URLs
|
||||
const CURSOR_FILE = join(process.env["TIP_STORAGE_DIR"] ?? "/opt/tip", "naddod-cursor.json");
|
||||
@ -88,6 +97,26 @@ function detectSpeedGbps(text: string): { speed: string; speedGbps: number } {
|
||||
}
|
||||
|
||||
function detectReach(text: string): { label: string; meters: number } | undefined {
|
||||
const cableCode = text.match(/\b(?:AOC|DAC|CU|COPPER|MCP|MFS)[A-Z0-9._-]*?(\d+(?:\.\d+)?)M\b/i);
|
||||
if (cableCode) {
|
||||
const value = parseFloat(cableCode[1]);
|
||||
if (Number.isFinite(value) && value > 0 && value <= 500) {
|
||||
return { label: `${String(value).replace(/\.0$/, "")}m`, meters: Math.round(value) };
|
||||
}
|
||||
}
|
||||
|
||||
const generic = text.match(/\b(\d+(?:\.\d+)?)\s*(km|m)\b/i);
|
||||
if (generic) {
|
||||
const value = parseFloat(generic[1]);
|
||||
const unit = generic[2].toLowerCase();
|
||||
if (Number.isFinite(value) && value > 0) {
|
||||
const meters = unit === "km" ? Math.round(value * 1000) : Math.round(value);
|
||||
const labelValue = String(value).replace(/\.0$/, "");
|
||||
const label = unit === "km" ? `${labelValue}km` : `${labelValue}m`;
|
||||
return { label, meters };
|
||||
}
|
||||
}
|
||||
|
||||
const patterns: [RegExp, string, number][] = [
|
||||
[/\b120\s*km\b/i, "120km", 120000],
|
||||
[/\b80\s*km\b/i, "80km", 80000],
|
||||
@ -102,8 +131,9 @@ function detectReach(text: string): { label: string; meters: number } | undefine
|
||||
[/\b150\s*m\b/i, "150m", 150],
|
||||
[/\b100\s*m\b/i, "100m", 100],
|
||||
[/\bLR4\b/, "10km", 10000], [/\bLR\b/, "10km", 10000],
|
||||
[/\bER4?\b/, "40km", 40000], [/\bZR4?\b/, "80km", 80000],
|
||||
[/\bSR4?\b/, "300m", 300], [/\bDR4?\b/, "500m", 500],
|
||||
[/\bER4?\b/, "40km", 40000], [/\bZRP?\b|\bZR4?\b/, "80km", 80000],
|
||||
[/\bSR\d*\b|\bVR\d*\b/, "100m", 100],
|
||||
[/\bDR4?\b|\bXDR\d*\b/, "500m", 500],
|
||||
[/\bFR4?\b/, "2km", 2000],
|
||||
];
|
||||
for (const [re, label, meters] of patterns) {
|
||||
@ -113,9 +143,10 @@ function detectReach(text: string): { label: string; meters: number } | undefine
|
||||
}
|
||||
|
||||
/**
 * Classifies the transmission medium from product text.
 *
 * Checks run in priority order and the first hit wins:
 *   1. copper keywords (DAC, twinax, RJ45, BASE-T)      -> "Copper"
 *   2. AOC / SR / VR reach codes or 850nm               -> "MMF"
 *   3. single-mode keywords, LX/LR/ER/ZR/DR/FR/XDR codes,
 *      PSM/BiDi/CWDM/DWDM or 1310nm/1550nm              -> "SMF"
 *   4. remaining multi-mode keywords (multimode, SX)    -> "MMF"
 *
 * Reach codes are matched as standalone tokens: they must not be preceded or
 * followed by a letter, though digits may adjoin them (SR8, 2FR4). This stops
 * ordinary words such as "from", "driver" or "src" from being read as FR/DR/SR,
 * and lets a code at the very start or end of the text match.
 *
 * @param text - Product name and/or page text.
 * @returns "Copper", "MMF", "SMF", or "" when nothing matches.
 */
function detectFiber(text: string): string {
  if (/copper|dac|twinax|rj.?45|base-t/i.test(text)) return "Copper";
  if (/aoc|active.?optical|active.?fiber|(?<![a-z])(?:sr|vr)\d*(?![a-z])|850\s*nm/i.test(text)) return "MMF";
  if (
    /single.?mode|smf|(?<![a-z])(?:lx|lr|er|zr|dr\d*|fr\d*)(?![a-z])|xdr\d*|psm|bidi|cwdm|dwdm|1310\s*nm|1550\s*nm/i.test(
      text,
    )
  ) {
    return "SMF";
  }
  if (/multi.?mode|mmf|(?<![a-z])sx(?![a-z])/i.test(text)) return "MMF";
  return "";
}
|
||||
|
||||
@ -175,6 +206,7 @@ async function fetchText(url: string): Promise<string> {
|
||||
*/
|
||||
function parseDetailPage(html: string, url: string): {
|
||||
name: string;
|
||||
imageUrl?: string;
|
||||
price?: number;
|
||||
stock: { qty?: number; confidence: 1 | 2 } | null;
|
||||
} | null {
|
||||
@ -187,6 +219,20 @@ function parseDetailPage(html: string, url: string): {
|
||||
if (!name || name.length < 10) return null;
|
||||
if (!isTransceiver(name)) return null;
|
||||
|
||||
const imageUrl = (() => {
|
||||
const candidates = [
|
||||
html.match(/<meta\s+property="og:image"\s+content="([^"]+)"/i)?.[1],
|
||||
html.match(/<meta\s+name="twitter:image"\s+content="([^"]+)"/i)?.[1],
|
||||
html.match(/"image"\s*:\s*"([^"]+)"/i)?.[1],
|
||||
].filter(Boolean) as string[];
|
||||
|
||||
const img = candidates.find((candidate) =>
|
||||
!/(logo|placeholder|default|no-image|icon|sprite)/i.test(candidate)
|
||||
);
|
||||
if (!img) return undefined;
|
||||
return img.startsWith("http") ? img : `${BASE}${img.startsWith("/") ? "" : "/"}${img}`;
|
||||
})();
|
||||
|
||||
// Price: prefer LD+JSON offers.price (Astro/Shopify structure, e.g. "price":"731.00")
|
||||
// Fall back to "US$ 10.90" or "$10.90" visible text patterns
|
||||
let price: number | undefined;
|
||||
@ -220,7 +266,7 @@ function parseDetailPage(html: string, url: string): {
|
||||
// Stock count
|
||||
const stock = parseStockText(html);
|
||||
|
||||
return { name, price, stock };
|
||||
return { name, imageUrl, price, stock };
|
||||
}
|
||||
|
||||
// ── Sitemap parsing ─────────────────────────────────────────────────────────
|
||||
@ -247,6 +293,25 @@ async function fetchProductUrlsFromSitemap(): Promise<string[]> {
|
||||
return [...new Set(urls)]; // deduplicate
|
||||
}
|
||||
|
||||
/**
 * Loads existing NADDOD transceiver rows that still need verification.
 *
 * Selects rows of the vendor named 'NADDOD' that have a non-empty
 * `product_page_url` and where `details_verified` or `image_verified` is
 * false, ordered by `details_verified`, `image_verified`, then `updated_at`
 * (all ascending), capped at `limit`.
 *
 * @param limit - Maximum number of rows to return.
 * @returns One `{ url, targetTransceiverId }` entry per selected row.
 */
async function fetchDbTargets(limit: number): Promise<Array<{ url: string; targetTransceiverId: string }>> {
  const { rows } = await pool.query<{ id: string; product_page_url: string }>(`
    SELECT t.id, t.product_page_url
    FROM transceivers t
    JOIN vendors v ON v.id = t.vendor_id
    WHERE v.name = 'NADDOD'
    AND t.product_page_url IS NOT NULL
    AND t.product_page_url != ''
    AND (t.details_verified = false OR t.image_verified = false)
    ORDER BY t.details_verified ASC, t.image_verified ASC, t.updated_at ASC
    LIMIT $1
  `, [limit]);

  const targets: Array<{ url: string; targetTransceiverId: string }> = [];
  for (const { id, product_page_url: productPageUrl } of rows) {
    targets.push({ url: productPageUrl, targetTransceiverId: id });
  }
  return targets;
}
|
||||
|
||||
// ── Main scraper ────────────────────────────────────────────────────────────
|
||||
|
||||
export async function scrapeNaddod(): Promise<void> {
|
||||
@ -262,11 +327,18 @@ export async function scrapeNaddod(): Promise<void> {
|
||||
// ── Phase 1: Discover product URLs via sitemap ────────────────────────────
|
||||
console.log("[Phase 1] Discovering products from sitemap...");
|
||||
let productUrls: string[] = [];
|
||||
let targets: Array<{ url: string; targetTransceiverId?: string }> = [];
|
||||
try {
|
||||
if (DB_DETAIL_ONLY) {
|
||||
targets = await fetchDbTargets(MAX_DETAIL_PAGES);
|
||||
productUrls = targets.map((target) => target.url);
|
||||
console.log(` DB detail targets: ${productUrls.length}`);
|
||||
} else {
|
||||
productUrls = await fetchProductUrlsFromSitemap();
|
||||
console.log(` Found ${productUrls.length} product URLs in sitemap`);
|
||||
}
|
||||
} catch (err) {
|
||||
console.error(` Sitemap fetch failed: ${(err as Error).message}`);
|
||||
console.error(` Target discovery failed: ${(err as Error).message}`);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -278,16 +350,16 @@ export async function scrapeNaddod(): Promise<void> {
|
||||
// Cursor-based rotation: each run advances by MAX_DETAIL_PAGES so over ~12 runs
|
||||
// (24 hours) we cover all ~7300 products. Wraps around when exhausted.
|
||||
const totalUrls = productUrls.length;
|
||||
const offset = readCursor() % totalUrls;
|
||||
const offset = DB_DETAIL_ONLY ? 0 : readCursor() % totalUrls;
|
||||
const endIdx = Math.min(offset + MAX_DETAIL_PAGES, totalUrls);
|
||||
let urls = productUrls.slice(offset, endIdx);
|
||||
// Wrap around if we got fewer than MAX_DETAIL_PAGES (hit the end of the list)
|
||||
if (urls.length < MAX_DETAIL_PAGES && totalUrls > MAX_DETAIL_PAGES) {
|
||||
const wrap = MAX_DETAIL_PAGES - urls.length;
|
||||
urls = urls.concat(productUrls.slice(0, wrap));
|
||||
let batchTargets: Array<{ url: string; targetTransceiverId?: string }> =
|
||||
DB_DETAIL_ONLY ? targets : productUrls.slice(offset, endIdx).map((url) => ({ url }));
|
||||
if (!DB_DETAIL_ONLY && batchTargets.length < MAX_DETAIL_PAGES && totalUrls > MAX_DETAIL_PAGES) {
|
||||
const wrap = MAX_DETAIL_PAGES - batchTargets.length;
|
||||
batchTargets = batchTargets.concat(productUrls.slice(0, wrap).map((url) => ({ url })));
|
||||
}
|
||||
const nextOffset = (offset + MAX_DETAIL_PAGES) % totalUrls;
|
||||
console.log(` Offset: ${offset}/${totalUrls} → processing ${urls.length} products (next run starts at ${nextOffset})`);
|
||||
const nextOffset = DB_DETAIL_ONLY ? offset : (offset + MAX_DETAIL_PAGES) % totalUrls;
|
||||
console.log(` Offset: ${offset}/${totalUrls} → processing ${batchTargets.length} products (next run starts at ${nextOffset})`);
|
||||
|
||||
// ── Phase 2: Fetch detail pages + write to DB ─────────────────────────────
|
||||
console.log("\n[Phase 2] Fetching product detail pages...");
|
||||
@ -299,7 +371,8 @@ export async function scrapeNaddod(): Promise<void> {
|
||||
let skippedNonTx = 0;
|
||||
let errors = 0;
|
||||
|
||||
for (const url of urls) {
|
||||
for (const target of batchTargets) {
|
||||
const url = target.url;
|
||||
await sleep(2000);
|
||||
try {
|
||||
const html = await fetchText(url);
|
||||
@ -310,19 +383,53 @@ export async function scrapeNaddod(): Promise<void> {
|
||||
continue;
|
||||
}
|
||||
|
||||
const { name, price, stock } = detail;
|
||||
const { speed, speedGbps } = detectSpeedGbps(name);
|
||||
const formFactor = detectFormFactor(name);
|
||||
const reach = detectReach(name);
|
||||
const fiberType = detectFiber(name);
|
||||
const wavelength = detectWavelength(name);
|
||||
const { name, imageUrl, price, stock } = detail;
|
||||
const evidenceText = `${name} ${html.replace(/<[^>]+>/g, " ").slice(0, 20000)}`;
|
||||
const { speed, speedGbps } = detectSpeedGbps(evidenceText);
|
||||
const formFactor = detectFormFactor(evidenceText);
|
||||
const reach = detectReach(evidenceText);
|
||||
const fiberType = detectFiber(evidenceText);
|
||||
const wavelength = detectWavelength(evidenceText);
|
||||
|
||||
// Extract part number from name (first word-group before "Compatible" or vendor name)
|
||||
const partNumber = name.split(/\s+(?:compatible|for\s+[A-Z]|sfp|qsfp)/i)[0]?.trim().slice(0, 80) || name.slice(0, 60);
|
||||
|
||||
const txId = await findOrCreateScrapedTransceiver({
|
||||
let txId = target.targetTransceiverId;
|
||||
|
||||
if (txId) {
|
||||
await pool.query(`
|
||||
UPDATE transceivers
|
||||
SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2),
|
||||
form_factor = COALESCE(NULLIF($3::text, ''), form_factor),
|
||||
speed_gbps = CASE WHEN $4::numeric > 0 THEN $4::numeric ELSE speed_gbps END,
|
||||
speed = CASE WHEN $4::numeric > 0 THEN $5 ELSE speed END,
|
||||
reach_meters = CASE WHEN $6::int IS NOT NULL AND $6::int > 0 THEN $6::int ELSE reach_meters END,
|
||||
reach_label = COALESCE(NULLIF($7::text, ''), reach_label),
|
||||
fiber_type = COALESCE(NULLIF($8::text, ''), fiber_type),
|
||||
wavelengths = COALESCE(NULLIF($9::text, ''), wavelengths),
|
||||
category = COALESCE(NULLIF(category, ''), 'DataCenter'),
|
||||
updated_at = NOW()
|
||||
WHERE id = $1
|
||||
AND vendor_id = $10
|
||||
`, [
|
||||
txId,
|
||||
url,
|
||||
formFactor,
|
||||
speedGbps,
|
||||
speed,
|
||||
reach?.meters ?? null,
|
||||
reach?.label ?? null,
|
||||
fiberType || null,
|
||||
wavelength || null,
|
||||
vendorId,
|
||||
]);
|
||||
if (imageUrl) await markImageVerified(txId, imageUrl);
|
||||
await markDetailsVerified({ transceiverId: txId, sourceUrl: url });
|
||||
} else {
|
||||
txId = await findOrCreateScrapedTransceiver({
|
||||
partNumber,
|
||||
vendorId,
|
||||
productUrl: url,
|
||||
formFactor,
|
||||
speedGbps,
|
||||
speed,
|
||||
@ -331,7 +438,9 @@ export async function scrapeNaddod(): Promise<void> {
|
||||
fiberType,
|
||||
wavelengths: wavelength,
|
||||
category: "DataCenter",
|
||||
imageUrl,
|
||||
});
|
||||
}
|
||||
|
||||
// Price observation
|
||||
if (price && price > 0) {
|
||||
@ -368,7 +477,7 @@ export async function scrapeNaddod(): Promise<void> {
|
||||
|
||||
processed++;
|
||||
if (processed % 50 === 0) {
|
||||
console.log(` Progress: ${processed}/${urls.length} | prices: ${priceUpdates} | stock: ${stockWritten} new / ${stockSkipped} unchanged`);
|
||||
console.log(` Progress: ${processed}/${batchTargets.length} | prices: ${priceUpdates} | stock: ${stockWritten} new / ${stockSkipped} unchanged`);
|
||||
}
|
||||
} catch (err) {
|
||||
errors++;
|
||||
@ -377,10 +486,10 @@ export async function scrapeNaddod(): Promise<void> {
|
||||
}
|
||||
|
||||
// Advance cursor for next run
|
||||
writeCursor(nextOffset);
|
||||
if (!DB_DETAIL_ONLY) writeCursor(nextOffset);
|
||||
|
||||
console.log("\n=== NADDOD Scraper v2 Complete ===");
|
||||
console.log(` URL range processed: ${offset}–${offset + urls.length - 1} of ${totalUrls}`);
|
||||
console.log(` URL range processed: ${offset}–${offset + batchTargets.length - 1} of ${totalUrls}`);
|
||||
console.log(` Products processed: ${processed}`);
|
||||
console.log(` Non-transceivers skip: ${skippedNonTx}`);
|
||||
console.log(` Price observations: ${priceUpdates} new`);
|
||||
|
||||
@ -1,9 +1,102 @@
|
||||
# Current TIP Sync State
|
||||
|
||||
Updated: 2026-05-09 12:16 UTC
|
||||
Updated: 2026-05-09 13:54 UTC
|
||||
|
||||
## Newest Work
|
||||
|
||||
- FS.com 1.6T DR8/2FR4 source correction on 2026-05-09:
|
||||
- operator spotted that FS.com has two distinct 1.6T OSFP variants in the same family:
|
||||
- `OSFP-DR8-1.6T-FL`: 500m, DR8, SMF
|
||||
- `OSFP-2FR4-1.6T-FL`: 2km, 2FR4, SMF
|
||||
- confirmed in TIP DB:
|
||||
- both FS.com variants exist as separate rows
|
||||
- `OSFP-2FR4-1.6T-FL` had `reach_meters=0` even though the source and row label said `2km`
|
||||
- `OSFP-DR8-1.6T-FL` had no wavelength, causing the deterministic equivalence worker to reject the otherwise correct 500m Flexoptix match
|
||||
- live DB correction:
|
||||
- `OSFP-DR8-1.6T-FL`
|
||||
- `speed=1.6T`
|
||||
- `speed_gbps=1600`
|
||||
- `reach_label=500m`
|
||||
- `reach_meters=500`
|
||||
- `fiber_type=SMF`
|
||||
- `wavelengths=1310`
|
||||
- `standard_name=1.6T OSFP DR8`
|
||||
- fully verified remains true
|
||||
- `OSFP-2FR4-1.6T-FL`
|
||||
- `speed=1.6T`
|
||||
- `speed_gbps=1600`
|
||||
- `reach_label=2km`
|
||||
- `reach_meters=2000`
|
||||
- `fiber_type=SMF`
|
||||
- `wavelengths=1310`
|
||||
- `standard_name=1.6T OSFP 2FR4`
|
||||
- fully verified true
|
||||
- Flexoptix `O.1316T.C.05.M`
|
||||
- confirmed as `500m`, `SMF`, `1.6T`
|
||||
- `standard_name=1.6T OSFP DR8`
|
||||
- equivalence correction:
|
||||
- approved only `O.1316T.C.05.M` ↔ `OSFP-DR8-1.6T-FL`
|
||||
- confidence `0.913`
|
||||
- match basis: form factor, speed, reach, fiber, wavelength and source variant DR8/500m
|
||||
- `OSFP-2FR4-1.6T-FL` remains separate and is not linked to the 500m DR8 Flexoptix product
|
||||
- scraper hardening:
|
||||
- `packages/scraper/src/scrapers/fs-com.ts`
|
||||
- recognizes German/decimal `1,6T` and `1600G` as `1.6T`/`1600`
|
||||
- converts reach labels such as `2km` into `reach_meters=2000`
|
||||
- updates stale `speed` labels when the numeric source speed matches the row
|
||||
- build:
|
||||
- `pnpm -C packages/scraper build` passed on Erik
|
||||
- truth:
|
||||
- there are definitely two separate FS.com variants
|
||||
- 500m DR8 is the correct equivalent for Flexoptix `O.1316T.C.05.M`
|
||||
- 2km 2FR4 is a separate DB product and must not be collapsed into the 500m match
|
||||
|
||||
- Targeted vendor verification push after equivalence revalidation on 2026-05-09:
|
||||
- code improved:
|
||||
- `NADDOD_DB_DETAIL_ONLY=1` mode verifies existing NADDOD rows with source URLs instead of rotating blindly through the full sitemap
|
||||
- NADDOD now extracts `og:image`, source product URLs, reach/fiber/wavelength from page evidence, AOC/DAC cable lengths, and DR/FR/SR/VR/XDR patterns
|
||||
- GAO Tek now writes product URLs and image evidence
|
||||
- Ascent Optics now writes product URLs and table image evidence
|
||||
- Eoptolink now writes product URLs, images, reach/wavelength evidence and corrects over-broad form-factor parsing by preferring title/slug evidence
|
||||
- live low-load Erik runs:
|
||||
- GAO Tek static crawl:
|
||||
- `473` unique products processed
|
||||
- GAO Tek detail coverage improved from `41` to `126`
|
||||
- `no_url` dropped to `0`
|
||||
- Ascent Optics static/API crawl:
|
||||
- `253` catalog products processed
|
||||
- image coverage `235/305`
|
||||
- detail coverage `213/305`
|
||||
- Eoptolink static crawl:
|
||||
- `76` product-solution pages inspected
|
||||
- after parser correction, Eoptolink is `287/287` image and detail verified
|
||||
- NADDOD targeted DB-detail mode:
|
||||
- first targeted wave `200` pages
|
||||
- second wave `300` pages
|
||||
- closure wave `385` pages
|
||||
- special-case wave `83` pages
|
||||
- NADDOD moved from `image=12`, `details=157`, `fully=0/1-ish` to:
|
||||
- total `748`
|
||||
- price `744`
|
||||
- image `742`
|
||||
- details `659`
|
||||
- competitor `744`
|
||||
- fully `659`
|
||||
- no URL `6`
|
||||
- global TIP counters after this push:
|
||||
- price verified `11557`
|
||||
- image verified `11963`
|
||||
- details verified `11018`
|
||||
- fully verified `9794`
|
||||
- total transceivers `17647`
|
||||
- health:
|
||||
- TIP stayed `healthy`
|
||||
- load status `ok`
|
||||
- memory used about `13%`
|
||||
- truth:
|
||||
- NADDOD is not 100% complete; remaining detail gaps include likely non-transceiver switch/NIC products and a smaller set of parser-special cases
|
||||
- OEM catalogs like Ascent and Eoptolink do not publish retail prices, so full verification cannot be forced honestly without price evidence
|
||||
|
||||
- Immediate full TIP equivalence revalidation on 2026-05-09:
|
||||
- operator requested all open TIP validation to be completed immediately and all product matches checked for true 1:1 equivalence
|
||||
- live preflight:
|
||||
|
||||
@ -0,0 +1,117 @@
|
||||
# FS.com 1.6T Variant Correction + Vendor Verification Push
|
||||
|
||||
Date: 2026-05-09
|
||||
Actor: Codex
|
||||
|
||||
## Operator Finding
|
||||
|
||||
The operator spotted a concrete source-truth problem on FS.com:
|
||||
|
||||
- `OSFP-DR8-1.6T-FL` is the 500m DR8 variant.
|
||||
- `OSFP-2FR4-1.6T-FL` is the 2km 2FR4 variant.
|
||||
- Flexoptix `O.1316T.C.05.M` is the 500m DR8 product.
|
||||
- The 2km 2FR4 variant must be present as its own product and must not be collapsed into the 500m DR8 match.
|
||||
|
||||
## Live DB Correction
|
||||
|
||||
Corrected FS.com rows:
|
||||
|
||||
- `OSFP-DR8-1.6T-FL`
|
||||
- `speed=1.6T`
|
||||
- `speed_gbps=1600`
|
||||
- `reach_label=500m`
|
||||
- `reach_meters=500`
|
||||
- `fiber_type=SMF`
|
||||
- `wavelengths=1310`
|
||||
- `standard_name=1.6T OSFP DR8`
|
||||
- fully verified
|
||||
|
||||
- `OSFP-2FR4-1.6T-FL`
|
||||
- `speed=1.6T`
|
||||
- `speed_gbps=1600`
|
||||
- `reach_label=2km`
|
||||
- `reach_meters=2000`
|
||||
- `fiber_type=SMF`
|
||||
- `wavelengths=1310`
|
||||
- `standard_name=1.6T OSFP 2FR4`
|
||||
- fully verified
|
||||
|
||||
Corrected Flexoptix row:
|
||||
|
||||
- `O.1316T.C.05.M`
|
||||
- confirmed `500m`, `SMF`, `1.6T`
|
||||
- `standard_name=1.6T OSFP DR8`
|
||||
|
||||
Corrected equivalence:
|
||||
|
||||
- Approved only `O.1316T.C.05.M` ↔ `OSFP-DR8-1.6T-FL`.
|
||||
- Confidence: `0.913`.
|
||||
- Basis: form factor, speed, reach, fiber, wavelength and explicit source variant DR8/500m.
|
||||
- `OSFP-2FR4-1.6T-FL` remains separate and is not linked to the 500m Flexoptix product.
|
||||
|
||||
## Scraper Hardening
|
||||
|
||||
Updated `packages/scraper/src/scrapers/fs-com.ts`:
|
||||
|
||||
- Detects `1,6T`, `1.6T` and `1600G` and normalizes them to `speed=1.6T` / `speed_gbps=1600`.
|
||||
- Converts labels like `2km` to `reach_meters=2000`.
|
||||
- Updates stale `speed` strings when the numeric source speed matches the row.
|
||||
|
||||
Remote build on Erik passed:
|
||||
|
||||
```text
|
||||
pnpm -C packages/scraper build
|
||||
```
|
||||
|
||||
## Vendor Verification Work In Same Push
|
||||
|
||||
Updated:
|
||||
|
||||
- `packages/scraper/src/scrapers/naddod.ts`
|
||||
- `packages/scraper/src/scrapers/gaotek.ts`
|
||||
- `packages/scraper/src/scrapers/ascentoptics.ts`
|
||||
- `packages/scraper/src/scrapers/eoptolink.ts`
|
||||
|
||||
Live results:
|
||||
|
||||
- GAO Tek:
|
||||
- details improved from `41` to `126`
|
||||
- no-url dropped to `0`
|
||||
- Ascent Optics:
|
||||
- image `235/305`
|
||||
- details `213/305`
|
||||
- Eoptolink:
|
||||
- image `287/287`
|
||||
- details `287/287`
|
||||
- NADDOD:
|
||||
- total `748`
|
||||
- price `744`
|
||||
- image `742`
|
||||
- details `659`
|
||||
- competitor `744`
|
||||
- fully `659`
|
||||
- no URL `6`
|
||||
|
||||
Global TIP counters after the push:
|
||||
|
||||
- price verified `11557`
|
||||
- image verified `11963`
|
||||
- details verified `11018`
|
||||
- fully verified `9794`
|
||||
- total transceivers `17647`
|
||||
|
||||
TIP remained healthy:
|
||||
|
||||
- status `healthy`
|
||||
- load status `ok`
|
||||
- memory around `13%`
|
||||
|
||||
## Lesson For TIPLLM
|
||||
|
||||
Variant selectors on vendor pages must be treated as separate products when reach, optical protocol, connector or model changes.
|
||||
|
||||
For FS.com 1.6T OSFP:
|
||||
|
||||
- `DR8 500m` and `2FR4 2km` are distinct SKUs and distinct compatibility candidates.
|
||||
- A Flexoptix 500m DR8 product must not be matched to a 2km 2FR4 FS.com product.
|
||||
- Source pages can expose German decimal text (`1,6T`) and separate net/gross prices; normalize carefully.
|
||||
Loading…
x
Reference in New Issue
Block a user