transceiver-db/packages/scraper/src/scrapers/flexoptix-detail-pages.ts
2026-05-09 09:36:28 +02:00

276 lines
9.2 KiB
TypeScript

/**
* Flexoptix detail-page verifier.
*
* Targeted pass for Flexoptix rows that already have product_page_url but are
* missing price/image/details signals. Uses static product HTML only.
*/
import * as cheerio from "cheerio";
import {
ensureVendor,
markDetailsVerified,
markImageVerified,
pool,
upsertPriceObservation,
} from "../utils/db";
import { contentHash } from "../utils/hash";
/**
 * Request headers passed to `fetch` for every product-page request made by
 * `run`: a User-Agent string that identifies the bot, and an Accept header
 * asking for HTML/XHTML.
 */
const HEADERS = {
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; internal-flexoptix-detail)",
Accept: "text/html,application/xhtml+xml",
};
/**
 * Values extracted from one product page by `parseDetail`.
 * Optional fields are left undefined when the page gives no usable signal.
 */
type DetailPatch = {
// Cleaned content of <meta name="title">, falling back to the <title> text.
title: string;
// Cleaned meta description (or og:description); empty string when neither exists.
description: string;
// Parsed from <meta property="product:price:amount">; kept only when 0 < price < 100000.
price?: number;
// From <meta property="product:price:currency">; defaults to "EUR".
currency: string;
// og:image (or twitter:image) URL; dropped when it matches placeholder/no-image/logo.
imageUrl?: string;
// Reach as value+unit (e.g. "10km"), or "N/A" for adapter/converter style products.
reachLabel?: string;
// Reach converted to meters; 0 accompanies the "N/A" label.
reachMeters?: number;
// One of "Copper", "AOC", "SMF", "MMF" or "SMF/MMF" as produced by detectFiber.
fiberType?: string;
// Distinct wavelength values in nm, joined with "/".
wavelengths?: string;
// Upper-cased connector token found in the page text (e.g. "RJ-45").
connector?: string;
// Upper-cased "...BASE-..." standard name found in the page text.
standardName?: string;
};
/**
 * Returns a promise that settles after roughly `ms` milliseconds.
 * Used to space out consecutive page requests.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Normalises a text fragment taken from a product page.
 *
 * Decodes numeric character references (hex and decimal) plus the named
 * entities `&amp;`, `&quot;` and `&nbsp;`, collapses every whitespace run to a
 * single space and trims the result.
 *
 * @param value Raw text; `undefined` or empty input yields an empty string.
 * @returns The decoded, whitespace-normalised text.
 */
function cleanText(value: string | undefined): string {
  // String.fromCodePoint throws a RangeError above U+10FFFF. A malformed
  // entity such as "&#x110000;" must not abort parsing of the whole page, so
  // out-of-range references are kept verbatim instead.
  const decodeReference = (entity: string, digits: string, radix: number): string => {
    const codePoint = parseInt(digits, radix);
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
  };

  return (value || "")
    .replace(/&#x([0-9a-f]+);/gi, (entity: string, hex: string) => decodeReference(entity, hex, 16))
    .replace(/&#(\d+);/g, (entity: string, dec: string) => decodeReference(entity, dec, 10))
    .replace(/&amp;/g, "&")
    .replace(/&quot;/g, '"')
    .replace(/&nbsp;/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}
/**
 * Renders a number for use in a reach label.
 * Integers are printed as-is; for other values any trailing zeros and a
 * dangling decimal point are stripped from the default string form.
 */
function formatNumber(value: number): string {
  const text = String(value);
  if (Number.isInteger(value)) {
    return text;
  }
  const withoutTrailingZeros = text.replace(/0+$/, "");
  return withoutTrailingZeros.replace(/\.$/, "");
}
/**
 * Extracts a reach figure from free text.
 *
 * Precedence:
 *   1. a range such as "2-10 km" — the upper bound is used;
 *   2. a value introduced by "up to", "max" or "distance";
 *   3. the first plain "<number> km|m" occurrence;
 *   4. adapter/converter style products get the placeholder "N/A" / 0.
 *
 * @returns The label (value + lower-cased unit) and the reach in meters, or
 *          `undefined` when nothing matches.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  // Kilometers are scaled to meters; meter values are rounded with a floor of 1.
  const toMeters = (amount: number, unit: string): number =>
    unit === "km" ? Math.round(amount * 1000) : Math.max(1, Math.round(amount));
  const parseAmount = (raw: string): number => parseFloat(raw.replace(/,/g, ""));

  const span = /\b(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*-\s*(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i.exec(text);
  if (span !== null) {
    const upperBound = parseAmount(span[2]);
    const spanUnit = span[3].toLowerCase();
    return { label: `${formatNumber(upperBound)}${spanUnit}`, meters: toMeters(upperBound, spanUnit) };
  }

  const single =
    /\b(?:up to|max\.?|distance[:\s]*)\s*(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i.exec(text) ??
    /\b(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i.exec(text);
  if (single !== null) {
    const amount = parseAmount(single[1]);
    const singleUnit = single[2].toLowerCase();
    if (Number.isFinite(amount) && amount > 0) {
      return { label: `${formatNumber(amount)}${singleUnit}`, meters: toMeters(amount, singleUnit) };
    }
  }

  return /\badapter|converter|serial to ip/i.test(text) ? { label: "N/A", meters: 0 } : undefined;
}
/**
 * Classifies the transmission medium from free text.
 *
 * Copper keywords win first, then active optical cables; otherwise the text is
 * scanned for single-mode and multi-mode hints.
 *
 * @returns "Copper", "AOC", "SMF", "MMF", "SMF/MMF" (both kinds of hint
 *          present) or `undefined` when no keyword matches.
 */
function detectFiber(text: string): string | undefined {
  // "rj-?45" accepts both "RJ45" and the hyphenated "RJ-45" spelling, matching
  // what detectConnector recognises for the same connector.
  if (/active electrical cable|\baec\b|copper|dac|twinax|rj-?45|base-t|serial to ip/i.test(text)) return "Copper";
  if (/active optical cable|\baoc\b/i.test(text)) return "AOC";
  const hasSingle = /single.?mode|\bsmf\b|\blx\b|\blr\b|\ber\b|\bzr\b|bidi|cwdm|dwdm/i.test(text);
  const hasMulti = /multi.?mode|\bmmf\b|\bsx\b|\bsr\b/i.test(text);
  if (hasSingle && hasMulti) return "SMF/MMF";
  if (hasSingle) return "SMF";
  if (hasMulti) return "MMF";
  return undefined;
}
/**
 * Collects every 3–4 digit value followed by "nm" (optionally prefixed by
 * "λ" or "lambda") and returns the distinct values, in order of first
 * appearance, joined with "/".
 *
 * @returns e.g. "1310/1550", or `undefined` when no wavelength is found.
 */
function detectWavelengths(text: string): string | undefined {
  const distinct = new Set<string>();
  for (const hit of text.matchAll(/(?:λ|lambda)?\s*(\d{3,4})\s*nm/gi)) {
    distinct.add(hit[1]);
  }
  if (distinct.size === 0) {
    return undefined;
  }
  return [...distinct].join("/");
}
/**
 * Finds the first known connector token in free text.
 *
 * Patterns are tried in priority order (MTP/MPO, LC, CS, SN, RJ-45); the
 * matched text is upper-cased and "RJ45" is normalised to "RJ-45".
 *
 * @returns The connector token, or `undefined` when none is present.
 */
function detectConnector(text: string): string | undefined {
  const connectors = [
    /MTP\/MPO[-\w/]*/i,
    // LC must be a standalone token: without the word boundaries ordinary
    // words such as "Welcome" or "calculated" produced "LCOME" / "LCULATED".
    // A suffix like "-Duplex" or "/UPC" is still captured.
    /\bLC\b[-\w/]*/i,
    /\bCS\b/i,
    /\bSN\b/i,
    /\bRJ-?45\b/i,
  ];
  for (const regex of connectors) {
    const match = text.match(regex);
    if (match) return match[0].toUpperCase().replace("RJ45", "RJ-45");
  }
  return undefined;
}
/**
 * Looks for an Ethernet-style standard name containing "BASE-" (for example
 * "10GBASE-LR" or "1000BASE-T") and returns the first hit in upper case.
 *
 * @returns The upper-cased standard name, or `undefined` when none is found.
 */
function detectStandard(text: string): string | undefined {
  const standardPattern = /\b(?:\d+(?:\.\d+)?[GT]?BASE-[A-Z0-9.+-]+|[A-Z0-9]+GBASE-[A-Z0-9.+-]+)\b/i;
  const found = standardPattern.exec(text);
  if (found === null) {
    return undefined;
  }
  return found[0].toUpperCase();
}
/**
 * Extracts a DetailPatch from the static HTML of a product page.
 *
 * Title, description, price, currency and image come from <meta> tags (with
 * <title> as the title fallback). The reach, fiber, wavelength, connector and
 * standard detectors run over the title, the description and the text of all
 * `.description-list-item` elements combined.
 *
 * @param html Raw HTML of the product page.
 * @returns The extracted values; optional fields are undefined when absent.
 */
function parseDetail(html: string): DetailPatch {
  const $ = cheerio.load(html);
  const metaContent = (selector: string): string | undefined => $(selector).attr("content");

  const title = cleanText(metaContent('meta[name="title"]') || $("title").text());
  const description = cleanText(
    metaContent('meta[name="description"]') || metaContent('meta[property="og:description"]') || ""
  );
  const listText = cleanText($(".description-list-item").text());
  const combined = `${title} ${description} ${listText}`;

  // Price: accept only values strictly between 0 and 100000.
  const rawPrice = metaContent('meta[property="product:price:amount"]');
  const parsedPrice = rawPrice ? parseFloat(rawPrice) : undefined;
  const priceIsPlausible = parsedPrice !== undefined && parsedPrice > 0 && parsedPrice < 100000;

  // Image: prefer og:image, then twitter:image; reject obvious stand-in pictures.
  const candidateImage =
    metaContent('meta[property="og:image"]') || metaContent('meta[name="twitter:image"]') || undefined;
  const imageIsUsable = Boolean(candidateImage) && !/placeholder|no-image|logo/i.test(candidateImage as string);

  const reach = detectReach(combined);

  return {
    title,
    description,
    price: priceIsPlausible ? parsedPrice : undefined,
    currency: metaContent('meta[property="product:price:currency"]') || "EUR",
    imageUrl: imageIsUsable ? candidateImage : undefined,
    reachLabel: reach?.label,
    reachMeters: reach?.meters,
    fiberType: detectFiber(combined),
    wavelengths: detectWavelengths(combined),
    connector: detectConnector(combined),
    standardName: detectStandard(combined),
  };
}
/**
 * Runs one verification pass over Flexoptix product pages.
 *
 * Steps:
 *   1. Ensure the Flexoptix vendor row exists and obtain its id.
 *   2. Select up to `limit` Flexoptix transceivers that have a flexoptix.net
 *      ".html" product URL and (unless disabled) still miss a price, image,
 *      details flag, reach label, fiber type or wavelength.
 *   3. For each row: fetch the page, parse it, fill only the columns that are
 *      currently empty, then record image, price and details verification.
 *   4. Wait 800 ms between requests and print a summary at the end.
 *
 * Environment:
 *   - FLEXOPTIX_DETAIL_LIMIT: row limit, default 300, clamped to 1..1000.
 *   - FLEXOPTIX_DETAIL_ONLY_MISSING: set to "false" to drop the
 *     "missing signals" filter; any other value keeps it.
 *
 * Per-row failures (HTTP errors, timeouts, database errors) are logged and
 * counted; they do not abort the pass.
 */
async function run(): Promise<void> {
  const vendorId = await ensureVendor("Flexoptix", "reseller", "https://www.flexoptix.net", "https://www.flexoptix.net/en/");

  // parseInt yields NaN for a non-numeric override, and NaN would survive the
  // min/max clamp and reach the SQL LIMIT. Fall back to the default instead.
  const requestedLimit = parseInt(process.env["FLEXOPTIX_DETAIL_LIMIT"] || "300", 10);
  const limit = Math.max(1, Math.min(1000, Number.isFinite(requestedLimit) ? requestedLimit : 300));
  const onlyMissing = process.env["FLEXOPTIX_DETAIL_ONLY_MISSING"] !== "false";

  const rows = await pool.query(
    `
SELECT t.id, t.part_number, t.product_page_url
FROM transceivers t
JOIN vendors v ON v.id = t.vendor_id
WHERE UPPER(v.name) LIKE '%FLEXOPTIX%'
AND t.product_page_url IS NOT NULL
AND t.product_page_url != ''
AND t.product_page_url LIKE 'https://www.flexoptix.net/%'
AND t.product_page_url LIKE '%.html%'
AND (
$2::boolean = false
OR t.price_verified = false OR t.price_verified IS NULL
OR t.image_verified = false OR t.image_verified IS NULL
OR t.details_verified = false OR t.details_verified IS NULL
OR t.reach_label IS NULL OR t.reach_label = ''
OR t.fiber_type IS NULL OR t.fiber_type = ''
OR t.wavelengths IS NULL OR t.wavelengths = ''
)
ORDER BY t.price_verified DESC, t.image_verified DESC, t.details_verified ASC, t.part_number
LIMIT $1
`,
    [limit, onlyMissing]
  );
  console.log(`=== Flexoptix detail verifier: ${rows.rows.length} products ===`);

  let fetched = 0;
  let failed = 0;
  let prices = 0;
  let images = 0;
  let details = 0;

  for (const row of rows.rows) {
    try {
      const resp = await fetch(row.product_page_url, {
        headers: HEADERS,
        signal: AbortSignal.timeout(20000),
      });
      if (!resp.ok) {
        failed++;
        console.warn(` ${row.part_number}: HTTP ${resp.status}`);
        continue;
      }
      const html = await resp.text();
      const patch = parseDetail(html);

      // Only fills columns that are currently NULL/empty (or 0 for
      // reach_meters); existing values are never overwritten.
      await pool.query(
        `
UPDATE transceivers
SET reach_label = COALESCE(NULLIF(reach_label, ''), $2),
reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($3, reach_meters) ELSE reach_meters END,
fiber_type = COALESCE(NULLIF(fiber_type, ''), $4),
wavelengths = COALESCE(NULLIF(wavelengths, ''), $5),
connector = COALESCE(NULLIF(connector, ''), $6),
standard_name = COALESCE(NULLIF(standard_name, ''), $7),
product_page_url = COALESCE(NULLIF(product_page_url, ''), $8),
updated_at = NOW()
WHERE id = $1
`,
        [
          row.id,
          patch.reachLabel || null,
          patch.reachMeters ?? null,
          patch.fiberType || null,
          patch.wavelengths || null,
          patch.connector || null,
          patch.standardName || null,
          row.product_page_url,
        ]
      );

      if (patch.imageUrl) {
        const marked = await markImageVerified(row.id, patch.imageUrl);
        if (marked) images++;
      }

      if (patch.price) {
        // NOTE(review): stockLevel is hard-coded; availability is not parsed
        // from the page, so "in_stock" is an assumption — confirm.
        const updated = await upsertPriceObservation({
          transceiverId: row.id,
          sourceVendorId: vendorId,
          price: patch.price,
          currency: patch.currency,
          stockLevel: "in_stock",
          url: row.product_page_url,
          contentHash: contentHash({ price: patch.price, part: row.part_number }),
        });
        if (updated) prices++;
      }

      // NOTE(review): details are marked verified for every page that was
      // fetched successfully, even if the parser extracted nothing — confirm
      // this is intended.
      const verified = await markDetailsVerified({
        transceiverId: row.id,
        sourceUrl: row.product_page_url,
      });
      if (verified) details++;

      fetched++;
      if (fetched % 25 === 0) {
        console.log(` processed ${fetched}/${rows.rows.length}`);
      }
    } catch (error) {
      failed++;
      // Anything can be thrown; do not assume an Error with a string message.
      const message = error instanceof Error ? error.message : String(error);
      console.warn(` ${row.part_number}: ${message.slice(0, 100)}`);
    } finally {
      // In `finally` so the pacing also applies to the `continue` taken on a
      // non-OK HTTP status; otherwise 429/5xx responses are hit at full speed.
      await sleep(800);
    }
  }

  console.log(`=== Flexoptix detail verifier complete: fetched=${fetched}, failed=${failed}, newPrices=${prices}, imagesMarked=${images}, detailsMarked=${details} ===`);
}
// Script entry point: run the verifier only when this file is executed
// directly, not when it is imported by another module.
if (require.main === module) {
  const closePool = () => pool.end();

  // Log the failure, start closing the pool and exit with a non-zero status.
  const abort = (reason: unknown): void => {
    console.error("Fatal:", reason);
    closePool();
    process.exit(1);
  };

  run().then(closePool).catch(abort);
}