/**
 * Flexoptix detail-page verifier.
 *
 * Targeted pass for Flexoptix rows that already have product_page_url but are
 * missing price/image/details signals. Uses static product HTML only.
 */

import * as cheerio from "cheerio";
import {
  ensureVendor,
  markDetailsVerified,
  markImageVerified,
  pool,
  upsertPriceObservation,
} from "../utils/db";
import { contentHash } from "../utils/hash";

/** Request headers sent with every product-page fetch. */
const HEADERS = {
  "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; internal-flexoptix-detail)",
  Accept: "text/html,application/xhtml+xml",
};

/** Row limit used when FLEXOPTIX_DETAIL_LIMIT is unset or not a number. */
const DEFAULT_DETAIL_LIMIT = 300;
/** Upper bound applied to FLEXOPTIX_DETAIL_LIMIT. */
const MAX_DETAIL_LIMIT = 1000;
/** Abort timeout for a single product-page request, in milliseconds. */
const FETCH_TIMEOUT_MS = 20000;
/** Pause between consecutive product-page requests, in milliseconds. */
const REQUEST_DELAY_MS = 800;
/** Highest valid Unicode code point; String.fromCodePoint throws above it. */
const MAX_CODE_POINT = 0x10ffff;

/**
 * Connector patterns, tried in order; the first pattern that matches wins.
 *
 * NOTE(review): the LC pattern is not anchored with `\b`, so it can also match
 * "lc" inside an ordinary word. Left unchanged because anchoring it would stop
 * it matching forms such as "2xLC" — confirm against real page text.
 */
const CONNECTOR_PATTERNS = [
  /MTP\/MPO[-\w/]*/i,
  /LC[-\w/]*/i,
  /\bCS\b/i,
  /\bSN\b/i,
  /\bRJ-?45\b/i,
];

/** Fields extracted from one product page by parseDetail. */
type DetailPatch = {
  title: string;
  description: string;
  price?: number;
  currency: string;
  imageUrl?: string;
  reachLabel?: string;
  reachMeters?: number;
  fiberType?: string;
  wavelengths?: string;
  connector?: string;
  standardName?: string;
};

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Decodes one numeric character reference.
 *
 * @param entity The full matched entity text, returned unchanged when the
 *   reference cannot be decoded.
 * @param digits The digits of the reference, without `&#`, `x` or `;`.
 * @param radix 16 for hexadecimal references, 10 for decimal ones.
 */
function decodeCodePoint(entity: string, digits: string, radix: number): string {
  const codePoint = parseInt(digits, radix);
  // String.fromCodePoint throws RangeError outside 0..0x10FFFF; one malformed
  // entity must not abort parsing of the whole page.
  const isValid = Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= MAX_CODE_POINT;
  return isValid ? String.fromCodePoint(codePoint) : entity;
}

/**
 * Decodes numeric character references plus `&amp;`, `&quot;` and `&nbsp;`,
 * collapses every whitespace run to a single space, and trims the result.
 *
 * @param value Raw text; `undefined` is treated as the empty string.
 */
function cleanText(value: string | undefined): string {
  return (value || "")
    .replace(/&#x([0-9a-f]+);/gi, (entity, hex: string) => decodeCodePoint(entity, hex, 16))
    .replace(/&#(\d+);/g, (entity, dec: string) => decodeCodePoint(entity, dec, 10))
    .replace(/&amp;/g, "&")
    .replace(/&quot;/g, '"')
    .replace(/&nbsp;/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}

/** Renders a number without a trailing fractional zero run or dangling dot. */
function formatNumber(value: number): string {
  return Number.isInteger(value)
    ? String(value)
    : String(value).replace(/0+$/, "").replace(/\.$/, "");
}

/**
 * Converts a captured distance and unit into a reach label and a meter count.
 *
 * @returns `undefined` when a capture is missing or the distance is not a
 *   finite positive number.
 */
function buildReach(
  rawValue: string | undefined,
  rawUnit: string | undefined
): { label: string; meters: number } | undefined {
  if (rawValue === undefined || rawUnit === undefined) return undefined;
  const value = parseFloat(rawValue.replace(/,/g, ""));
  if (!Number.isFinite(value) || value <= 0) return undefined;
  const unit = rawUnit.toLowerCase();
  // Sub-meter values are rounded up to 1 so a real reach never becomes 0.
  const meters = unit === "km" ? Math.round(value * 1000) : Math.max(1, Math.round(value));
  return { label: `${formatNumber(value)}${unit}`, meters };
}

/**
 * Extracts a reach distance from free text.
 *
 * Tries, in order: a range ("2-10 km", upper bound used), a distance introduced
 * by "up to" / "max" / "distance", and then the first bare distance. Text that
 * mentions an adapter, converter or "serial to ip" and has no usable distance
 * yields the label "N/A" with 0 meters.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const range = text.match(
    /\b(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*-\s*(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i
  );
  if (range) {
    // Same finite/positive validation as the single-value branch below, so a
    // degenerate range such as "0-0 m" no longer yields a "0m" label.
    const fromRange = buildReach(range[2], range[3]);
    if (fromRange) return fromRange;
  }

  const upTo = text.match(
    /\b(?:up to|max\.?|distance[:\s]*)\s*(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i
  );
  const generic = upTo || text.match(/\b(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i);
  if (generic) {
    const fromValue = buildReach(generic[1], generic[2]);
    if (fromValue) return fromValue;
  }

  if (/\badapter|converter|serial to ip/i.test(text)) return { label: "N/A", meters: 0 };
  return undefined;
}

/**
 * Classifies the medium from keywords in the text: "Copper", "AOC", "SMF",
 * "MMF" or "SMF/MMF". Copper keywords are checked first, then AOC.
 */
function detectFiber(text: string): string | undefined {
  if (/active electrical cable|\baec\b|copper|dac|twinax|rj45|base-t|serial to ip/i.test(text)) {
    return "Copper";
  }
  if (/active optical cable|\baoc\b/i.test(text)) return "AOC";

  const hasSingle = /single.?mode|\bsmf\b|\blx\b|\blr\b|\ber\b|\bzr\b|bidi|cwdm|dwdm/i.test(text);
  const hasMulti = /multi.?mode|\bmmf\b|\bsx\b|\bsr\b/i.test(text);
  if (hasSingle && hasMulti) return "SMF/MMF";
  if (hasSingle) return "SMF";
  if (hasMulti) return "MMF";
  return undefined;
}

/**
 * Collects every distinct 3–4 digit "nm" value in order of first appearance
 * and joins them with "/".
 */
function detectWavelengths(text: string): string | undefined {
  const found = [...text.matchAll(/(?:λ|lambda)?\s*(\d{3,4})\s*nm/gi)]
    .map((match) => match[1])
    .filter((value): value is string => value !== undefined);
  // A Set keeps insertion order, so first-seen ordering is preserved.
  const unique = [...new Set(found)];
  return unique.length ? unique.join("/") : undefined;
}

/** Returns the first recognised connector name, upper-cased ("RJ45" → "RJ-45"). */
function detectConnector(text: string): string | undefined {
  for (const pattern of CONNECTOR_PATTERNS) {
    const match = text.match(pattern);
    if (match) return match[0].toUpperCase().replace("RJ45", "RJ-45");
  }
  return undefined;
}

/** Returns the first "…BASE-…" standard designation found, upper-cased. */
function detectStandard(text: string): string | undefined {
  const match = text.match(
    /\b(?:\d+(?:\.\d+)?[GT]?BASE-[A-Z0-9.+-]+|[A-Z0-9]+GBASE-[A-Z0-9.+-]+)\b/i
  );
  return match ? match[0].toUpperCase() : undefined;
}

/**
 * Parses a product page into a DetailPatch.
 *
 * Title, description, price, currency and image come from meta tags; the
 * optical attributes are detected in the title, the description and the text
 * of `.description-list-item` elements combined.
 */
function parseDetail(html: string): DetailPatch {
  const $ = cheerio.load(html);

  const title = cleanText($('meta[name="title"]').attr("content") || $("title").text());
  const description = cleanText(
    $('meta[name="description"]').attr("content") ||
      $('meta[property="og:description"]').attr("content") ||
      ""
  );
  const combined = `${title} ${description} ${cleanText($(".description-list-item").text())}`;

  const priceText = $('meta[property="product:price:amount"]').attr("content");
  const price = priceText ? parseFloat(priceText) : undefined;
  const currency = $('meta[property="product:price:currency"]').attr("content") || "EUR";
  const imageUrl =
    $('meta[property="og:image"]').attr("content") ||
    $('meta[name="twitter:image"]').attr("content") ||
    undefined;
  const reach = detectReach(combined);

  return {
    title,
    description,
    // NaN, zero, negative and implausibly large prices are all dropped.
    price: price && price > 0 && price < 100000 ? price : undefined,
    currency,
    // Placeholder and logo images are not treated as product images.
    imageUrl: imageUrl && !/placeholder|no-image|logo/i.test(imageUrl) ? imageUrl : undefined,
    reachLabel: reach?.label,
    reachMeters: reach?.meters,
    fiberType: detectFiber(combined),
    wavelengths: detectWavelengths(combined),
    connector: detectConnector(combined),
    standardName: detectStandard(combined),
  };
}

/**
 * Resolves the row limit from FLEXOPTIX_DETAIL_LIMIT, clamped to
 * 1..MAX_DETAIL_LIMIT.
 *
 * A non-numeric value falls back to the default: parseInt would otherwise give
 * NaN, which passes through Math.min/Math.max and reaches the SQL LIMIT.
 */
function resolveLimit(raw: string | undefined): number {
  const parsed = parseInt(raw || String(DEFAULT_DETAIL_LIMIT), 10);
  const requested = Number.isFinite(parsed) ? parsed : DEFAULT_DETAIL_LIMIT;
  return Math.max(1, Math.min(MAX_DETAIL_LIMIT, requested));
}

/**
 * Fetches each selected Flexoptix product page, fills empty detail columns on
 * the transceiver row, and records image, price and details verification.
 *
 * Environment:
 * - FLEXOPTIX_DETAIL_LIMIT: maximum rows to process (default 300, clamped to 1..1000).
 * - FLEXOPTIX_DETAIL_ONLY_MISSING: set to "false" to drop the "missing signals" filter.
 *
 * A failure on one row is logged and counted; it does not stop the run.
 */
async function run(): Promise<void> {
  const vendorId = await ensureVendor(
    "Flexoptix",
    "reseller",
    "https://www.flexoptix.net",
    "https://www.flexoptix.net/en/"
  );
  const limit = resolveLimit(process.env["FLEXOPTIX_DETAIL_LIMIT"]);
  const onlyMissing = process.env["FLEXOPTIX_DETAIL_ONLY_MISSING"] !== "false";

  const rows = await pool.query(
    `
    SELECT t.id, t.part_number, t.product_page_url
    FROM transceivers t
    JOIN vendors v ON v.id = t.vendor_id
    WHERE UPPER(v.name) LIKE '%FLEXOPTIX%'
      AND t.product_page_url IS NOT NULL
      AND t.product_page_url != ''
      AND t.product_page_url LIKE 'https://www.flexoptix.net/%'
      AND t.product_page_url LIKE '%.html%'
      AND (
        $2::boolean = false
        OR t.price_verified = false OR t.price_verified IS NULL
        OR t.image_verified = false OR t.image_verified IS NULL
        OR t.details_verified = false OR t.details_verified IS NULL
        OR t.reach_label IS NULL OR t.reach_label = ''
        OR t.fiber_type IS NULL OR t.fiber_type = ''
        OR t.wavelengths IS NULL OR t.wavelengths = ''
      )
    ORDER BY t.price_verified DESC, t.image_verified DESC, t.details_verified ASC, t.part_number
    LIMIT $1
    `,
    [limit, onlyMissing]
  );

  console.log(`=== Flexoptix detail verifier: ${rows.rows.length} products ===`);

  let fetched = 0;
  let failed = 0;
  let prices = 0;
  let images = 0;
  let details = 0;

  for (const row of rows.rows) {
    try {
      const resp = await fetch(row.product_page_url, {
        headers: HEADERS,
        signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
      });
      if (!resp.ok) {
        failed++;
        console.warn(` ${row.part_number}: HTTP ${resp.status}`);
        continue;
      }

      const html = await resp.text();
      const patch = parseDetail(html);

      // Only fills columns that are currently NULL/empty (or 0 for
      // reach_meters); values already stored are never overwritten.
      await pool.query(
        `
        UPDATE transceivers
        SET
          reach_label = COALESCE(NULLIF(reach_label, ''), $2),
          reach_meters = CASE
            WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($3, reach_meters)
            ELSE reach_meters
          END,
          fiber_type = COALESCE(NULLIF(fiber_type, ''), $4),
          wavelengths = COALESCE(NULLIF(wavelengths, ''), $5),
          connector = COALESCE(NULLIF(connector, ''), $6),
          standard_name = COALESCE(NULLIF(standard_name, ''), $7),
          product_page_url = COALESCE(NULLIF(product_page_url, ''), $8),
          updated_at = NOW()
        WHERE id = $1
        `,
        [
          row.id,
          patch.reachLabel || null,
          patch.reachMeters ?? null,
          patch.fiberType || null,
          patch.wavelengths || null,
          patch.connector || null,
          patch.standardName || null,
          row.product_page_url,
        ]
      );

      if (patch.imageUrl) {
        const marked = await markImageVerified(row.id, patch.imageUrl);
        if (marked) images++;
      }

      if (patch.price) {
        const updated = await upsertPriceObservation({
          transceiverId: row.id,
          sourceVendorId: vendorId,
          price: patch.price,
          currency: patch.currency,
          stockLevel: "in_stock",
          url: row.product_page_url,
          contentHash: contentHash({ price: patch.price, part: row.part_number }),
        });
        if (updated) prices++;
      }

      const verified = await markDetailsVerified({
        transceiverId: row.id,
        sourceUrl: row.product_page_url,
      });
      if (verified) details++;

      fetched++;
      if (fetched % 25 === 0) {
        console.log(` processed ${fetched}/${rows.rows.length}`);
      }
    } catch (error) {
      failed++;
      // Anything can be thrown; reading .message off a non-Error would itself
      // throw here and end the whole run.
      const message = error instanceof Error ? error.message : String(error);
      console.warn(` ${row.part_number}: ${message.slice(0, 100)}`);
    }

    // Fixed pause between requests, applied after failures as well.
    await sleep(REQUEST_DELAY_MS);
  }

  console.log(
    `=== Flexoptix detail verifier complete: fetched=${fetched}, failed=${failed}, newPrices=${prices}, imagesMarked=${images}, detailsMarked=${details} ===`
  );
}

// Run only when executed directly, not when imported as a module.
if (require.main === module) {
  run()
    .then(() => pool.end())
    .catch((error) => {
      console.error("Fatal:", error);
      pool.end();
      process.exit(1);
    });
}