/**
 * Flexoptix detail-page verifier.
 *
 * Targeted pass for Flexoptix rows that already have product_page_url but are
 * missing price/image/details signals. Uses static product HTML only.
 */
|
|
import * as cheerio from "cheerio";
import {
  ensureVendor,
  markDetailsVerified,
  markImageVerified,
  pool,
  upsertPriceObservation,
} from "../utils/db";
import { contentHash } from "../utils/hash";
|
|
|
|
/**
 * Request headers sent with every product-page fetch.
 *
 * The User-Agent identifies this crawler; Accept asks for HTML/XHTML because
 * the page is parsed as static markup.
 */
const HEADERS = {
  "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; internal-flexoptix-detail)",
  Accept: "text/html,application/xhtml+xml",
};
|
|
|
|
/**
 * Signals extracted from one product detail page.
 *
 * `title`, `description` and `currency` are always present (possibly as an
 * empty string or a default); every other field is left undefined when the
 * page did not yield a usable value.
 */
type DetailPatch = {
  /** Cleaned page title. */
  title: string;
  /** Cleaned page description. */
  description: string;
  /** Listed price, only when it parsed to a plausible positive amount. */
  price?: number;
  /** Currency code accompanying the price. */
  currency: string;
  /** Product image URL, omitted for placeholder/logo images. */
  imageUrl?: string;
  /** Human-readable reach, e.g. "10km", or "N/A" for non-optical accessories. */
  reachLabel?: string;
  /** Reach expressed in meters (0 accompanies the "N/A" label). */
  reachMeters?: number;
  /** Medium classification such as "SMF", "MMF", "SMF/MMF", "AOC" or "Copper". */
  fiberType?: string;
  /** Distinct wavelengths in nanometers joined by "/", e.g. "1310/1550". */
  wavelengths?: string;
  /** Upper-cased connector designation, e.g. "LC-DUPLEX" or "RJ-45". */
  connector?: string;
  /** Upper-cased Ethernet standard name, e.g. "10GBASE-SR". */
  standardName?: string;
};
|
|
|
|
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param ms Delay passed to `setTimeout`.
 * @returns A promise that resolves with no value once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
|
|
|
|
/**
 * Decode leftover HTML entities and normalize whitespace.
 *
 * Handles hexadecimal and decimal numeric character references plus the named
 * entities amp, quot and nbsp. All entities are decoded in a single pass so
 * that text produced by decoding one entity is never decoded a second time.
 * Unknown or out-of-range references are left untouched instead of throwing.
 *
 * @param value Raw text; `undefined` is treated as an empty string.
 * @returns The decoded text with whitespace runs collapsed to one space and
 *          both ends trimmed.
 */
function cleanText(value: string | undefined): string {
  // nbsp maps to a plain space so it is folded by the whitespace collapse below.
  const named: Record<string, string> = { amp: "&", quot: '"', nbsp: " " };

  // String.fromCodePoint throws RangeError above U+10FFFF; keep the original
  // entity text in that case rather than aborting the whole page.
  const fromCodePoint = (digits: string, radix: number, fallback: string): string => {
    const codePoint = parseInt(digits, radix);
    const valid = Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff;
    return valid ? String.fromCodePoint(codePoint) : fallback;
  };

  return (value || "")
    .replace(
      /&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|(amp|quot|nbsp));/g,
      (entity: string, hex?: string, dec?: string, name?: string): string => {
        if (hex !== undefined) return fromCodePoint(hex, 16, entity);
        if (dec !== undefined) return fromCodePoint(dec, 10, entity);
        return (name !== undefined ? named[name] : undefined) ?? entity;
      }
    )
    .replace(/\s+/g, " ")
    .trim();
}
|
|
|
|
/**
 * Render a number for use in a reach label.
 *
 * JavaScript's number-to-string conversion already emits the shortest
 * representation, so decimals never carry trailing zeros. Stripping zeros by
 * regex is therefore unnecessary, and it corrupted exponent notation
 * (for example "1.5e-10" became "1.5e-1"), so the value is converted directly.
 *
 * @param value Number to format.
 * @returns The default string form of `value`.
 */
function formatNumber(value: number): string {
  return String(value);
}
|
|
|
|
/**
 * Extract a transmission reach from free text.
 *
 * Candidates are tried in this order:
 *   1. a range such as "2-10km" (the upper bound is used),
 *   2. a value introduced by "up to", "max" or "distance",
 *   3. the first bare "<number> km|m" occurrence.
 * A candidate is accepted only when its number is finite and positive;
 * otherwise the next candidate is tried. If nothing matches but the text
 * mentions an adapter/converter/serial-to-IP device, reach is reported as
 * not applicable.
 *
 * @param text Combined title/description text.
 * @returns `{ label, meters }`, `{ label: "N/A", meters: 0 }` for accessories,
 *          or `undefined` when no reach can be determined.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  // Shared conversion so the range and generic paths apply the same validation.
  const toReach = (
    rawValue: string | undefined,
    rawUnit: string | undefined
  ): { label: string; meters: number } | undefined => {
    if (rawValue === undefined || rawUnit === undefined) return undefined;
    const value = parseFloat(rawValue.replace(/,/g, ""));
    if (!Number.isFinite(value) || value <= 0) return undefined;
    const unit = rawUnit.toLowerCase();
    // Sub-meter values are rounded up to 1 so a real reach never collapses to 0.
    const meters = unit === "km" ? Math.round(value * 1000) : Math.max(1, Math.round(value));
    return { label: `${formatNumber(value)}${unit}`, meters };
  };

  const range = text.match(/\b(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*-\s*(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i);
  const fromRange = range ? toReach(range[2], range[3]) : undefined;
  if (fromRange) return fromRange;

  const upTo = text.match(/\b(?:up to|max\.?|distance[:\s]*)\s*(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i);
  const generic = upTo || text.match(/\b(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i);
  const fromGeneric = generic ? toReach(generic[1], generic[2]) : undefined;
  if (fromGeneric) return fromGeneric;

  if (/\badapter|converter|serial to ip/i.test(text)) return { label: "N/A", meters: 0 };
  return undefined;
}
|
|
|
|
/**
 * Classify the transmission medium described by the text.
 *
 * Copper indicators win over everything else, then active optical cables.
 * Otherwise the text is scanned for single-mode and multi-mode hints. Reach
 * codes may carry a lane-count suffix (SR4, LR4, ER4, ZR4), which is accepted
 * in addition to the bare two-letter form.
 *
 * @param text Combined title/description text.
 * @returns "Copper", "AOC", "SMF", "MMF", "SMF/MMF", or `undefined` when no
 *          hint is present.
 */
function detectFiber(text: string): string | undefined {
  if (/active electrical cable|\baec\b|copper|dac|twinax|rj45|base-t|serial to ip/i.test(text)) return "Copper";
  if (/active optical cable|\baoc\b/i.test(text)) return "AOC";

  // `\d*` lets lane-count variants such as LR4/SR4 match; a plain `\blr\b`
  // cannot, because there is no word boundary between the letters and the digit.
  const hasSingle = /single.?mode|\bsmf\b|\blx\b|\blr\d*\b|\ber\d*\b|\bzr\d*\b|bidi|cwdm|dwdm/i.test(text);
  const hasMulti = /multi.?mode|\bmmf\b|\bsx\b|\bsr\d*\b/i.test(text);

  if (hasSingle && hasMulti) return "SMF/MMF";
  if (hasSingle) return "SMF";
  if (hasMulti) return "MMF";
  return undefined;
}
|
|
|
|
/**
 * Collect the distinct wavelengths mentioned in the text.
 *
 * Any 3-4 digit number followed by "nm" counts. Values are kept in order of
 * first appearance and de-duplicated.
 *
 * @param text Combined title/description text.
 * @returns The values joined with "/", e.g. "1310/1550", or `undefined` when
 *          the text mentions none.
 */
function detectWavelengths(text: string): string | undefined {
  // A Set keeps insertion order, so first-occurrence ordering is preserved.
  const unique = new Set<string>();
  for (const match of text.matchAll(/(?:λ|lambda)?\s*(\d{3,4})\s*nm/gi)) {
    const nanometers = match[1];
    if (nanometers !== undefined) unique.add(nanometers);
  }
  return unique.size > 0 ? [...unique].join("/") : undefined;
}
|
|
|
|
/**
 * Find the connector type named in the text.
 *
 * Patterns are tried in priority order (MTP/MPO, LC, CS, SN, RJ-45) and the
 * first hit wins. Every pattern is anchored on word boundaries so that
 * ordinary words containing the letters "lc" (for example "calculate" or
 * "welcome") are not mistaken for an LC connector.
 *
 * @param text Combined title/description text.
 * @returns The upper-cased connector designation, with "RJ45" normalized to
 *          "RJ-45", or `undefined` when none is found.
 */
function detectConnector(text: string): string | undefined {
  const connectors = [
    /\bMTP\/MPO[-\w/]*/i,
    // "LC" must be a whole token, optionally extended by a "-" or "/" suffix
    // such as "LC-Duplex" or "LC/UPC".
    /\bLC(?:[-/][-\w/]*)?(?!\w)/i,
    /\bCS\b/i,
    /\bSN\b/i,
    /\bRJ-?45\b/i,
  ];

  for (const pattern of connectors) {
    const match = text.match(pattern);
    if (match) return match[0].toUpperCase().replace("RJ45", "RJ-45");
  }
  return undefined;
}
|
|
|
|
/**
 * Find the first Ethernet "BASE-" standard name in the text.
 *
 * Matches forms such as "1000BASE-LX", "10GBASE-SR" or "2.5GBASE-T",
 * case-insensitively. The match must end on a word boundary, so trailing
 * punctuation is not included.
 *
 * @param text Combined title/description text.
 * @returns The upper-cased standard name, or `undefined` when none is present.
 */
function detectStandard(text: string): string | undefined {
  const match = text.match(/\b(?:\d+(?:\.\d+)?[GT]?BASE-[A-Z0-9.+-]+|[A-Z0-9]+GBASE-[A-Z0-9.+-]+)\b/i);
  return match ? match[0].toUpperCase() : undefined;
}
|
|
|
|
/**
 * Extract price, image and specification signals from a product page.
 *
 * Only static markup is inspected: the title and description meta tags (with
 * the `<title>` element and `og:description` as fallbacks), the
 * `product:price:*` meta properties, the `og:image` / `twitter:image` tags,
 * and the text of `.description-list-item` elements. The specification fields
 * are derived by running the text detectors over title + description + list
 * text.
 *
 * @param html Raw HTML of the product detail page.
 * @returns A patch whose optional fields are `undefined` whenever the page did
 *          not provide a usable value.
 */
function parseDetail(html: string): DetailPatch {
  const $ = cheerio.load(html);

  const title = cleanText($('meta[name="title"]').attr("content") || $("title").text());
  const description = cleanText(
    $('meta[name="description"]').attr("content") ||
      $('meta[property="og:description"]').attr("content") ||
      ""
  );
  const combined = `${title} ${description} ${cleanText($(".description-list-item").text())}`;

  const priceText = $('meta[property="product:price:amount"]').attr("content");
  const price = priceText ? parseFloat(priceText) : undefined;
  // The page default is assumed to be EUR when no currency tag is present.
  const currency = $('meta[property="product:price:currency"]').attr("content") || "EUR";

  const imageUrl =
    $('meta[property="og:image"]').attr("content") ||
    $('meta[name="twitter:image"]').attr("content") ||
    undefined;

  const reach = detectReach(combined);

  return {
    title,
    description,
    // Rejects NaN, zero, negative and implausibly large amounts.
    price: price && price > 0 && price < 100000 ? price : undefined,
    currency,
    // Generic site artwork is not evidence of a real product image.
    imageUrl: imageUrl && !/placeholder|no-image|logo/i.test(imageUrl) ? imageUrl : undefined,
    reachLabel: reach?.label,
    reachMeters: reach?.meters,
    fiberType: detectFiber(combined),
    wavelengths: detectWavelengths(combined),
    connector: detectConnector(combined),
    standardName: detectStandard(combined),
  };
}
|
|
|
|
/**
 * Fetch Flexoptix product pages and backfill missing catalog signals.
 *
 * Selects transceiver rows of the Flexoptix vendor that have a flexoptix.net
 * `.html` product URL, fetches each page sequentially, and for every
 * successful fetch:
 *   - fills empty specification columns (existing non-empty values are kept),
 *   - marks the image as verified when a real image URL was found,
 *   - records a price observation when a price was found,
 *   - marks the row's details as verified.
 * Per-row failures are logged and counted; they never abort the run.
 *
 * Environment:
 *   - FLEXOPTIX_DETAIL_LIMIT: maximum rows to process, clamped to 1..1000
 *     (default 300; non-numeric values fall back to the default).
 *   - FLEXOPTIX_DETAIL_ONLY_MISSING: set to "false" to also revisit rows that
 *     are already complete.
 */
async function run(): Promise<void> {
  const vendorId = await ensureVendor("Flexoptix", "reseller", "https://www.flexoptix.net", "https://www.flexoptix.net/en/");

  // A non-numeric value parses to NaN, which would otherwise propagate through
  // Math.min/Math.max and reach the SQL LIMIT parameter.
  const requestedLimit = Number.parseInt(process.env["FLEXOPTIX_DETAIL_LIMIT"] || "300", 10);
  const limit = Math.max(1, Math.min(1000, Number.isNaN(requestedLimit) ? 300 : requestedLimit));
  const onlyMissing = process.env["FLEXOPTIX_DETAIL_ONLY_MISSING"] !== "false";

  const rows = await pool.query(
    `
    SELECT t.id, t.part_number, t.product_page_url
    FROM transceivers t
    JOIN vendors v ON v.id = t.vendor_id
    WHERE UPPER(v.name) LIKE '%FLEXOPTIX%'
      AND t.product_page_url IS NOT NULL
      AND t.product_page_url != ''
      AND t.product_page_url LIKE 'https://www.flexoptix.net/%'
      AND t.product_page_url LIKE '%.html%'
      AND (
        $2::boolean = false
        OR t.price_verified = false OR t.price_verified IS NULL
        OR t.image_verified = false OR t.image_verified IS NULL
        OR t.details_verified = false OR t.details_verified IS NULL
        OR t.reach_label IS NULL OR t.reach_label = ''
        OR t.fiber_type IS NULL OR t.fiber_type = ''
        OR t.wavelengths IS NULL OR t.wavelengths = ''
      )
    ORDER BY t.price_verified DESC, t.image_verified DESC, t.details_verified ASC, t.part_number
    LIMIT $1
    `,
    [limit, onlyMissing]
  );

  console.log(`=== Flexoptix detail verifier: ${rows.rows.length} products ===`);

  let fetched = 0;
  let failed = 0;
  let prices = 0;
  let images = 0;
  let details = 0;

  for (const row of rows.rows) {
    try {
      const resp = await fetch(row.product_page_url, {
        headers: HEADERS,
        signal: AbortSignal.timeout(20000),
      });
      if (!resp.ok) {
        failed++;
        console.warn(`  ${row.part_number}: HTTP ${resp.status}`);
        continue;
      }

      const html = await resp.text();
      const patch = parseDetail(html);

      // Only empty columns are filled; values already present are preserved.
      await pool.query(
        `
        UPDATE transceivers
        SET reach_label = COALESCE(NULLIF(reach_label, ''), $2),
            reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($3, reach_meters) ELSE reach_meters END,
            fiber_type = COALESCE(NULLIF(fiber_type, ''), $4),
            wavelengths = COALESCE(NULLIF(wavelengths, ''), $5),
            connector = COALESCE(NULLIF(connector, ''), $6),
            standard_name = COALESCE(NULLIF(standard_name, ''), $7),
            product_page_url = COALESCE(NULLIF(product_page_url, ''), $8),
            updated_at = NOW()
        WHERE id = $1
        `,
        [
          row.id,
          patch.reachLabel || null,
          patch.reachMeters ?? null,
          patch.fiberType || null,
          patch.wavelengths || null,
          patch.connector || null,
          patch.standardName || null,
          row.product_page_url,
        ]
      );

      if (patch.imageUrl) {
        const marked = await markImageVerified(row.id, patch.imageUrl);
        if (marked) images++;
      }

      if (patch.price) {
        // NOTE(review): stock level is hard-coded; the page's availability is not parsed.
        const updated = await upsertPriceObservation({
          transceiverId: row.id,
          sourceVendorId: vendorId,
          price: patch.price,
          currency: patch.currency,
          stockLevel: "in_stock",
          url: row.product_page_url,
          contentHash: contentHash({ price: patch.price, part: row.part_number }),
        });
        if (updated) prices++;
      }

      const verified = await markDetailsVerified({
        transceiverId: row.id,
        sourceUrl: row.product_page_url,
      });
      if (verified) details++;
      fetched++;

      if (fetched % 25 === 0) {
        console.log(`  processed ${fetched}/${rows.rows.length}`);
      }
    } catch (error: unknown) {
      failed++;
      // Rejections are not guaranteed to be Error instances; reading `.message`
      // off anything else would throw from inside this handler.
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`  ${row.part_number}: ${message.slice(0, 100)}`);
    }

    // Throttle requests to stay polite to the vendor site.
    await sleep(800);
  }

  console.log(`=== Flexoptix detail verifier complete: fetched=${fetched}, failed=${failed}, newPrices=${prices}, imagesMarked=${images}, detailsMarked=${details} ===`);
}
|
|
|
|
// Script entry point: run the verifier only when this file is executed
// directly, then release the database pool. On failure the pool is still
// closed before the process exits with a non-zero status.
if (require.main === module) {
  run()
    .then(() => pool.end())
    .catch(async (error: unknown) => {
      console.error("Fatal:", error);
      try {
        // Wait for the pool to close so process.exit does not cut it short.
        await pool.end();
      } catch {
        // The pool may already be closed (or closing may be what failed);
        // nothing more can be done before exiting.
      }
      process.exit(1);
    });
}
|