/*
 * Commit notes
 *
 * Problem: all 247 FS.com prices were recorded as €79, which is the
 * free-shipping threshold, not a product price.
 * Root cause: the banner 'Gratis Versand ab 79 € (ohne MwSt.)' was the first
 * text on the page to match the price pattern.
 *
 * - Fix 1: DOM price extraction in page.evaluate, with a skip list of bad parent elements.
 * - Fix 2: qualified bodyText patterns skip matches that sit near shipping keywords.
 * - Fix 3: waitForSelector for the price DOM element before running evaluate.
 * - Fix 4: deleted the 247 invalid €79 observations from the DB.
 *
 * Also included from the previous session:
 * - db.ts: set has_image=true on image writes (fixes 632 desync rows).
 * - spec-updater.ts: fiber type inference, DR/FR/LR/ER/ZR → SMF and SR → MMF.
 *
 * File listing: 195 lines, 6.9 KiB, TypeScript.
 */
/**
 * Update transceiver specs with verified data from vendor product pages.
 * Intended to fill in fields that are currently empty/estimated while preserving vendor-verified data.
 * Marks updated products as 'scraped_unverified' (higher confidence than 'enriched_estimated').
 */

import { pool } from "./db";

/**
 * Specs for a single transceiver as scraped from a vendor product page.
 *
 * Only `transceiverId` and `source` are required; every spec field is optional
 * so that a scraper can report just the values it managed to recognise.
 */
export interface VerifiedSpecs {
  /** Id of the transceiver row these specs belong to. */
  transceiverId: string;
  /** Fiber/media type: SMF, MMF, Copper, AOC. */
  fiberType?: string;
  /** Connector: LC, SC, MPO-12, MPO-16, RJ45, DAC, AOC. */
  connector?: string;
  /** Wavelength text, e.g. "850nm", "1310nm", "1310nm (4λ CWDM)". */
  wavelengths?: string;
  /** Reach as a number of metres. */
  reachMeters?: number;
  /** Human-readable reach, e.g. "10km" or "300m". */
  reachLabel?: string;
  /** Power consumption in watts. */
  powerConsumptionW?: number;
  /** Temperature class: COM, IND. */
  tempRange?: string;
  /** Modulation: NRZ, PAM4. */
  modulation?: string;
  /** Whether DOM/DDM diagnostics are supported. */
  domSupport?: boolean;
  /** URL of the product image. */
  imageUrl?: string;
  /** URL of the datasheet. */
  datasheetUrl?: string;
  /** Origin of the data, e.g. "fs.com", "flexoptix.net". */
  source: string;
}

/**
 * Write scraped specs for one transceiver with a single parameterised UPDATE.
 *
 * A column is only included when `specs` carries a usable value for it:
 * non-empty strings, finite numbers greater than zero, and any `domSupport`
 * value other than `undefined` (so an explicit `false` is written). Writing an
 * image URL also sets `has_image = true`. Whenever at least one column is
 * written, `data_confidence` is set to 'scraped_unverified' and `updated_at`
 * to NOW().
 *
 * NOTE(review): the statement does not look at the values or the
 * `data_confidence` already stored in the row, so every supplied field
 * overwrites what is there. Callers that must preserve vendor-verified data
 * have to filter before calling.
 *
 * @param specs - Scraped values plus the id of the row to update.
 * @returns `false` when there was nothing to write (no query is issued);
 *   `true` once the UPDATE has completed. `true` does not indicate that a row
 *   actually matched `specs.transceiverId`.
 * @throws Whatever `pool.query` rejects with.
 */
export async function updateVerifiedSpecs(specs: VerifiedSpecs): Promise<boolean> {
  const assignments: string[] = [];
  const values: unknown[] = [];

  // Binds `value` to the next positional placeholder and records `column = $n`.
  // The placeholder number is derived from the values array so the two can
  // never drift apart.
  const bind = (column: string, value: unknown): void => {
    values.push(value);
    assignments.push(`${column} = $${values.length}`);
  };

  // Numeric specs are only meaningful when finite and strictly positive.
  const isPositive = (n: number | undefined): n is number =>
    typeof n === "number" && Number.isFinite(n) && n > 0;

  if (specs.fiberType) bind("fiber_type", specs.fiberType);
  if (specs.connector) bind("connector", specs.connector);
  if (specs.wavelengths) bind("wavelengths", specs.wavelengths);
  if (isPositive(specs.reachMeters)) bind("reach_meters", specs.reachMeters);
  if (specs.reachLabel) bind("reach_label", specs.reachLabel);
  if (isPositive(specs.powerConsumptionW)) bind("power_consumption_w", specs.powerConsumptionW);
  if (specs.tempRange) bind("temp_range", specs.tempRange);
  if (specs.modulation) bind("modulation", specs.modulation);
  // `false` is a real answer for DOM support, so only `undefined` is skipped.
  if (specs.domSupport !== undefined) bind("dom_support", specs.domSupport);
  if (specs.imageUrl) {
    bind("image_url", specs.imageUrl);
    // Keep the has_image flag in sync with image_url.
    assignments.push("has_image = true");
  }
  if (specs.datasheetUrl) {
    // NOTE(review): the URL is stored in `datasheet_r2_key`; the column name
    // suggests a storage key rather than a URL. Confirm against the schema.
    bind("datasheet_r2_key", specs.datasheetUrl);
  }

  // Nothing scraped: leave the row (and its confidence/timestamp) untouched.
  if (assignments.length === 0) return false;

  assignments.push("data_confidence = 'scraped_unverified'");
  assignments.push("updated_at = NOW()");

  // The row id is always the last bound parameter.
  values.push(specs.transceiverId);
  await pool.query(
    `UPDATE transceivers SET ${assignments.join(", ")} WHERE id = $${values.length}`,
    values
  );

  return true;
}

/**
 * Parse a spec table from a product page into structured data.
 * Works for fs.com, 10gtek, and similar HTML spec tables.
 *
 * Each row label is lower-cased and trimmed, then matched by substring or
 * equality to decide which spec it describes; the cell text is then matched
 * with regular expressions. Rows are processed in object order, so when
 * several rows describe the same spec the last recognised one wins. Fiber type
 * inferred from a part number / model / SKU row is only used while no fiber
 * type has been set yet.
 *
 * @param specs - Raw table rows, label → cell text.
 * @returns The fields that could be recognised. Unrecognised rows, cells that
 *   are not strings at runtime, and numbers that cannot be parsed are ignored.
 */
export function parseSpecTable(specs: Record<string, string>): Partial<VerifiedSpecs> {
  const result: Partial<VerifiedSpecs> = {};

  for (const [rawKey, rawVal] of Object.entries(specs)) {
    // Defensive: the type promises strings, but a null/undefined cell would
    // otherwise crash on .trim() and discard the whole table.
    if (typeof rawVal !== "string") continue;

    const key = rawKey.toLowerCase().trim();
    const val = rawVal.trim();

    // Fiber type
    if ((key.includes("fiber") && key.includes("type")) || key === "cable type" || key === "media") {
      const fiberType = detectFiberType(val);
      if (fiberType) result.fiberType = fiberType;
    }
    // Also infer fiber type from part-number-style rows when no fiber row has set it yet.
    if (!result.fiberType && (key === "part number" || key === "model" || key === "sku")) {
      const inferred = fiberTypeFromReachCode(val);
      if (inferred) result.fiberType = inferred;
    }

    // Connector
    if (key.includes("connector") || key.includes("interface")) {
      const connector = detectConnector(val);
      if (connector) result.connector = connector;
    }

    // Wavelength
    if (key.includes("wavelength") || key.includes("laser") || key === "tx wavelength") {
      const nm = /([\d.]+)\s*nm/i.exec(val);
      if (nm) result.wavelengths = `${nm[1]}nm`;
      // Multi-wavelength parts keep the full cell text instead of a single figure.
      if (/cwdm|dwdm/i.test(val)) result.wavelengths = val;
    }

    // Reach / distance
    if (key.includes("distance") || key.includes("reach") || key.includes("transmission") || key === "max link length") {
      const reach = parseReach(val);
      if (reach) {
        result.reachMeters = reach.reachMeters;
        result.reachLabel = reach.reachLabel;
      }
    }

    // Power consumption
    if (key.includes("power") && (key.includes("consumption") || key.includes("dissipation") || key.includes("max"))) {
      const watts = firstNumber(val, /([\d.]+)\s*w/i);
      if (watts !== undefined) result.powerConsumptionW = watts;
    }

    // Temperature
    if (key.includes("temperature") || (key.includes("temp") && key.includes("range"))) {
      if (/0.*70|commercial/i.test(val)) result.tempRange = "COM";
      else if (/-40.*85|industrial/i.test(val)) result.tempRange = "IND";
    }

    // DOM / DDM
    if (key.includes("dom") || key.includes("ddm") || key.includes("diagnostic")) {
      // Negations must be checked first: "Not supported" and "Unsupported"
      // both contain "supported", and "No DDM" contains "ddm".
      const negated = /\b(?:no|not|none|unsupported|without|n\/a)\b/i.test(val);
      result.domSupport = !negated && /yes|supported|ddm|dom/i.test(val);
    }

    // Modulation
    if (key.includes("modulation") || key.includes("encoding")) {
      if (/pam4|pam-4/i.test(val)) result.modulation = "PAM4";
      else if (/nrz/i.test(val)) result.modulation = "NRZ";
    }
  }

  return result;
}

/** Maps a fiber/media cell to SMF, MMF, Copper or AOC, falling back to the reach-code heuristic. */
function detectFiberType(val: string): string | undefined {
  if (/single.?mode|smf|os2/i.test(val)) return "SMF";
  if (/multi.?mode|mmf|om[1-5]/i.test(val)) return "MMF";
  if (/copper|cat[56]/i.test(val)) return "Copper";
  if (/aoc|active.optical/i.test(val)) return "AOC";
  return fiberTypeFromReachCode(val);
}

/** Infers fiber type from a standalone reach code: DR/FR/LR/ER/ZR (optional digits) → SMF, SR → MMF. */
function fiberTypeFromReachCode(val: string): string | undefined {
  if (/\b(?:dr|fr|lr|er|zr)\d*\b/i.test(val)) return "SMF";
  if (/\bsr\d*\b/i.test(val)) return "MMF";
  return undefined;
}

/** Maps a connector/interface cell to a canonical connector name. */
function detectConnector(val: string): string | undefined {
  if (/duplex\s*lc|lc\s*duplex|lc\/pc|lc\/upc|lc\/apc|\blc\b/i.test(val)) return "LC";
  if (/sc\/pc|sc\/apc|\bsc\b/i.test(val)) return "SC";
  // Explicit fiber counts are checked before the bare MPO/MTP default, and the
  // MTP spelling is accepted for every count.
  if (/(?:mpo|mtp)-?24/i.test(val)) return "MPO-24";
  if (/(?:mpo|mtp)-?16/i.test(val)) return "MPO-16";
  if (/mpo-?12|mtp-?12|mpo\b|mtp\b/i.test(val)) return "MPO-12";
  if (/rj-?45|copper/i.test(val)) return "RJ45";
  // Word boundaries on both sides so words that merely end in "cs"/"sn"
  // (e.g. "optics") are not mistaken for CS/SN connectors.
  if (/\bcs\b/i.test(val)) return "CS";
  if (/\bsn\b/i.test(val)) return "SN";
  return undefined;
}

/** Extracts reach from a cell; a kilometre figure takes precedence over a metre figure. */
function parseReach(val: string): { reachMeters: number; reachLabel: string } | undefined {
  const km = firstNumber(val, /([\d.]+)\s*km/i);
  if (km !== undefined) {
    const reachMeters = Math.round(km * 1000);
    return { reachMeters, reachLabel: km >= 1 ? `${km}km` : `${reachMeters}m` };
  }
  const meters = firstNumber(val, /([\d.]+)\s*m\b/i);
  if (meters !== undefined) {
    // Whole metres for the numeric field (same rounding as the km branch);
    // the label keeps the scraped figure so "0.5m" does not become "0m".
    return { reachMeters: Math.round(meters), reachLabel: `${meters}m` };
  }
  return undefined;
}

/** Returns capture group 1 of `pattern` parsed as a finite number, or undefined when absent or unparseable. */
function firstNumber(val: string, pattern: RegExp): number | undefined {
  const match = pattern.exec(val);
  const text = match ? match[1] : undefined;
  if (text === undefined) return undefined;
  const parsed = parseFloat(text);
  return Number.isFinite(parsed) ? parsed : undefined;
}
