fix: enrich flexoptix detail verification

This commit is contained in:
Rene Fichtmueller 2026-05-09 09:36:28 +02:00
parent 5522bb2152
commit 549b4430df
4 changed files with 469 additions and 3 deletions

View File

@ -182,6 +182,18 @@ function detectReach(text: string): { label: string; meters: number } | undefine
for (const [regex, label, meters] of patterns) { for (const [regex, label, meters] of patterns) {
if (regex.test(text)) return { label, meters }; if (regex.test(text)) return { label, meters };
} }
const generic = text.match(/\b(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i);
if (generic) {
const value = parseFloat(generic[1].replace(/,/g, ""));
const unit = generic[2].toLowerCase();
if (Number.isFinite(value) && value > 0) {
const meters = unit === "km" ? Math.round(value * 1000) : Math.max(1, Math.round(value));
const normalized = Number.isInteger(value) ? String(value) : String(value).replace(/0+$/, "").replace(/\.$/, "");
const label = unit === "km" ? `${normalized}km` : `${normalized}m`;
return { label, meters };
}
}
return undefined; return undefined;
} }
@ -194,8 +206,10 @@ function detectFiber(text: string): string {
} }
function detectWavelength(text: string): string { function detectWavelength(text: string): string {
const match = text.match(/(\d{3,4})\s*nm/i); const matches = [...text.matchAll(/(?:λ|lambda)?\s*(\d{3,4})\s*nm/gi)]
if (match) return match[1]; .map((match) => match[1])
.filter((value, index, values) => values.indexOf(value) === index);
if (matches.length) return matches.join("/");
return ""; return "";
} }
@ -561,6 +575,7 @@ export async function scrapeFlexoptixCatalog(): Promise<void> {
const txId = await findOrCreateScrapedTransceiver({ const txId = await findOrCreateScrapedTransceiver({
partNumber: product.partNumber, partNumber: product.partNumber,
vendorId, vendorId,
productUrl: product.url,
formFactor: product.formFactor, formFactor: product.formFactor,
speedGbps: product.speedGbps, speedGbps: product.speedGbps,
speed: product.speed, speed: product.speed,

View File

@ -0,0 +1,275 @@
/**
* Flexoptix detail-page verifier.
*
* Targeted pass for Flexoptix rows that already have product_page_url but are
* missing price/image/details signals. Uses static product HTML only.
*/
import * as cheerio from "cheerio";
import {
ensureVendor,
markDetailsVerified,
markImageVerified,
pool,
upsertPriceObservation,
} from "../utils/db";
import { contentHash } from "../utils/hash";
// Request headers sent with every product-page fetch in run(): a bot-identifying
// User-Agent and an Accept header asking for HTML/XHTML.
const HEADERS = {
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; internal-flexoptix-detail)",
Accept: "text/html,application/xhtml+xml",
};
/**
 * Values extracted from one product page by parseDetail().
 * Optional fields are left undefined when the page gave no usable value.
 */
type DetailPatch = {
// Cleaned page title (meta "title", falling back to the <title> element).
title: string;
// Cleaned meta description (falling back to og:description, then "").
description: string;
// Parsed product:price:amount; only kept when it is > 0 and < 100000.
price?: number;
// product:price:currency, or "EUR" when the meta tag is absent.
currency: string;
// og:image / twitter:image URL; dropped when it looks like a placeholder or logo.
imageUrl?: string;
// Reach label and its value in meters, as produced by detectReach().
reachLabel?: string;
reachMeters?: number;
// Result of detectFiber() on the combined page text.
fiberType?: string;
// Result of detectWavelengths(): distinct nm values joined with "/".
wavelengths?: string;
// Result of detectConnector() on the combined page text.
connector?: string;
// Result of detectStandard(): upper-cased "...BASE-..." token.
standardName?: string;
};
/**
 * Returns a promise that resolves (with no value) after roughly `ms` milliseconds.
 *
 * @param ms Delay handed to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Normalizes a text fragment taken from HTML: decodes numeric character
 * references plus `&amp;`, `&quot;` and `&nbsp;`, collapses whitespace runs to a
 * single space and trims the result.
 *
 * Numeric references that do not denote a valid Unicode code point (for example
 * `&#x110000;`) are left as written instead of throwing a RangeError.
 *
 * @param value Raw text; `undefined` or an empty string yields "".
 * @returns The cleaned, single-line text.
 */
function cleanText(value: string | undefined): string {
  // String.fromCodePoint throws for values above 0x10FFFF, so validate first.
  const decodeReference = (entity: string, digits: string, radix: number): string => {
    const codePoint = parseInt(digits, radix);
    const valid = Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff;
    return valid ? String.fromCodePoint(codePoint) : entity;
  };

  return (value || "")
    .replace(/&#x([0-9a-f]+);/gi, (entity: string, hex: string) => decodeReference(entity, hex, 16))
    .replace(/&#(\d+);/g, (entity: string, dec: string) => decodeReference(entity, dec, 10))
    // Named entities are decoded after numeric ones so "&amp;#65;" is not decoded twice.
    .replace(/&amp;/g, "&")
    .replace(/&quot;/g, '"')
    .replace(/&nbsp;/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}
/**
 * Formats a numeric value for use in a label such as "10km" or "0.5m".
 *
 * JavaScript's number-to-string conversion already yields the shortest
 * representation (no trailing fractional zeros, no trailing "."), so no manual
 * trimming is needed. Stripping trailing zeros by regex would corrupt exponent
 * notation (1e-10 would become "1e-1").
 *
 * @param value Number to format.
 * @returns The canonical string form of `value`.
 */
function formatNumber(value: number): string {
  return String(value);
}
/**
 * Extracts a reach (maximum link distance) from free text.
 *
 * Tried in order:
 *  1. a range such as "0.5-10 km" – the second (upper) value is used;
 *  2. a value introduced by "up to", "max" or "distance";
 *  3. the first plain "<number> m|km" occurrence;
 *  4. texts mentioning an adapter/converter/"serial to ip" yield `N/A` with 0 m.
 *
 * Thousands separators ("1,000 m") are accepted. A candidate is only used when
 * it is finite and greater than zero; this now also applies to ranges, so a
 * degenerate "0-0 km" no longer produces a zero-length reach.
 *
 * @param text Combined title/description text of a product.
 * @returns Label (e.g. "10km") and distance in meters, or `undefined` if none was found.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  // Converts a matched number/unit pair; rejects non-positive or unparsable values.
  const toReach = (rawValue: string, rawUnit: string): { label: string; meters: number } | undefined => {
    const value = parseFloat(rawValue.replace(/,/g, ""));
    if (!Number.isFinite(value) || value <= 0) return undefined;
    const unit = rawUnit.toLowerCase();
    // Meter values are clamped to at least 1 so sub-meter lengths do not round to 0.
    const meters = unit === "km" ? Math.round(value * 1000) : Math.max(1, Math.round(value));
    return { label: `${formatNumber(value)}${unit}`, meters };
  };

  const range = text.match(/\b(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*-\s*(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i);
  if (range) {
    const fromRange = toReach(range[2], range[3]);
    if (fromRange) return fromRange;
  }

  const upTo = text.match(/\b(?:up to|max\.?|distance[:\s]*)\s*(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i);
  const generic = upTo || text.match(/\b(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i);
  if (generic) {
    const fromValue = toReach(generic[1], generic[2]);
    if (fromValue) return fromValue;
  }

  if (/\badapter|converter|serial to ip/i.test(text)) return { label: "N/A", meters: 0 };
  return undefined;
}
/**
 * Classifies the transmission medium from free text.
 *
 * Copper indicators win over everything else, then active optical cables, then
 * single-mode / multi-mode keywords. "RJ-45" (with hyphen) is recognised as
 * copper in addition to "RJ45", matching the spellings detectConnector accepts.
 *
 * @param text Combined title/description text of a product.
 * @returns "Copper", "AOC", "SMF", "MMF", "SMF/MMF", or `undefined` when nothing matches.
 */
function detectFiber(text: string): string | undefined {
  if (/active electrical cable|\baec\b|copper|dac|twinax|rj-?45|base-t|serial to ip/i.test(text)) return "Copper";
  if (/active optical cable|\baoc\b/i.test(text)) return "AOC";

  const hasSingle = /single.?mode|\bsmf\b|\blx\b|\blr\b|\ber\b|\bzr\b|bidi|cwdm|dwdm/i.test(text);
  const hasMulti = /multi.?mode|\bmmf\b|\bsx\b|\bsr\b/i.test(text);
  if (hasSingle && hasMulti) return "SMF/MMF";
  if (hasSingle) return "SMF";
  if (hasMulti) return "MMF";
  return undefined;
}
/**
 * Collects every "<3-4 digits> nm" value found in the text (optionally prefixed
 * by "λ" or "lambda"), removes duplicates while keeping first-seen order, and
 * joins them with "/".
 *
 * @param text Combined title/description text of a product.
 * @returns e.g. "1310/1550", or `undefined` when no wavelength is present.
 */
function detectWavelengths(text: string): string | undefined {
  const distinct = new Set<string>();
  for (const hit of text.matchAll(/(?:λ|lambda)?\s*(\d{3,4})\s*nm/gi)) {
    distinct.add(hit[1]);
  }
  if (distinct.size === 0) {
    return undefined;
  }
  return [...distinct].join("/");
}
/**
 * Finds the connector type mentioned in the text.
 *
 * Patterns are tried in priority order (MTP/MPO, LC, CS, SN, RJ45) and the
 * first hit is returned upper-cased, with "RJ45" normalised to "RJ-45".
 *
 * The LC pattern is anchored so that the letters "lc" inside ordinary words
 * ("welcome", "LCD") are not reported as a connector. It must start at a word
 * boundary or directly after a "<digit>x" multiplier ("2xLC") and may carry a
 * "-" or "/" separated suffix ("LC-Duplex", "LC/UPC").
 *
 * @param text Combined title/description text of a product.
 * @returns The connector label, or `undefined` when none is recognised.
 */
function detectConnector(text: string): string | undefined {
  const connectors = [
    /MTP\/MPO[-\w/]*/i,
    /(?:\b|(?<=\dx))LC(?:[-/][-\w/]*)?\b/i,
    /\bCS\b/i,
    /\bSN\b/i,
    /\bRJ-?45\b/i,
  ];
  for (const regex of connectors) {
    const match = text.match(regex);
    if (match) return match[0].toUpperCase().replace("RJ45", "RJ-45");
  }
  return undefined;
}
/**
 * Looks for an Ethernet-style "...BASE-..." designation (for example
 * "10GBASE-LR" or "1000BASE-T") and returns the first one found, upper-cased.
 *
 * @param text Combined title/description text of a product.
 * @returns The upper-cased standard name, or `undefined` when the text has none.
 */
function detectStandard(text: string): string | undefined {
  const standardPattern = /\b(?:\d+(?:\.\d+)?[GT]?BASE-[A-Z0-9.+-]+|[A-Z0-9]+GBASE-[A-Z0-9.+-]+)\b/i;
  const hit = standardPattern.exec(text);
  if (hit === null) {
    return undefined;
  }
  return hit[0].toUpperCase();
}
/**
 * Parses a static Flexoptix product page into a DetailPatch.
 *
 * Title, description, price, currency and image come from meta tags; reach,
 * fiber type, wavelengths, connector and standard are detected from the title,
 * the description and the text of all `.description-list-item` elements.
 *
 * The list items are joined with a space. Cheerio's `.text()` on a multi-element
 * selection concatenates the items without any separator, which would glue
 * "10 km" to the next item's first word and defeat the word-boundary checks in
 * the detectors.
 *
 * @param html Raw HTML of the product page.
 * @returns Extracted values; optional fields are `undefined` when not found or implausible.
 */
function parseDetail(html: string): DetailPatch {
  const $ = cheerio.load(html);

  const title = cleanText($('meta[name="title"]').attr("content") || $("title").text());
  const description = cleanText(
    $('meta[name="description"]').attr("content") ||
      $('meta[property="og:description"]').attr("content") ||
      ""
  );

  // Read each list item separately so adjacent items stay separated by whitespace.
  const listText = $(".description-list-item")
    .toArray()
    .map((item) => $(item).text())
    .join(" ");
  const combined = `${title} ${description} ${cleanText(listText)}`;

  const priceText = $('meta[property="product:price:amount"]').attr("content");
  const price = priceText ? parseFloat(priceText) : undefined;
  const currency = $('meta[property="product:price:currency"]').attr("content") || "EUR";

  const imageUrl =
    $('meta[property="og:image"]').attr("content") ||
    $('meta[name="twitter:image"]').attr("content") ||
    undefined;

  const reach = detectReach(combined);

  return {
    title,
    description,
    // NaN, zero, negative and absurdly large prices are discarded.
    price: price && price > 0 && price < 100000 ? price : undefined,
    currency,
    // Placeholder and logo images are not evidence of a product image.
    imageUrl: imageUrl && !/placeholder|no-image|logo/i.test(imageUrl) ? imageUrl : undefined,
    reachLabel: reach?.label,
    reachMeters: reach?.meters,
    fiberType: detectFiber(combined),
    wavelengths: detectWavelengths(combined),
    connector: detectConnector(combined),
    standardName: detectStandard(combined),
  };
}
/**
 * Runs one detail-verification pass over Flexoptix product pages.
 *
 * Selects Flexoptix rows that have an `.html` product URL (by default only rows
 * with missing verification flags or missing reach/fiber/wavelength data),
 * fetches each page, fills empty detail columns from the parsed page, and
 * records image, price and details evidence through the db helpers.
 *
 * Environment:
 *  - FLEXOPTIX_DETAIL_LIMIT: maximum rows per run, clamped to 1..1000
 *    (default 300; non-numeric values fall back to the default).
 *  - FLEXOPTIX_DETAIL_ONLY_MISSING: set to "false" to process rows regardless
 *    of their current verification state.
 *
 * Requests are spaced 800 ms apart after every row, including rows that failed
 * with an HTTP error or an exception, so an error response never speeds up the
 * request rate.
 */
async function run(): Promise<void> {
  const vendorId = await ensureVendor("Flexoptix", "reseller", "https://www.flexoptix.net", "https://www.flexoptix.net/en/");

  // A non-numeric env value parses to NaN, which must not reach the SQL LIMIT.
  const requestedLimit = parseInt(process.env["FLEXOPTIX_DETAIL_LIMIT"] || "300", 10);
  const limit = Number.isFinite(requestedLimit) ? Math.max(1, Math.min(1000, requestedLimit)) : 300;
  const onlyMissing = process.env["FLEXOPTIX_DETAIL_ONLY_MISSING"] !== "false";

  const rows = await pool.query(
    `
SELECT t.id, t.part_number, t.product_page_url
FROM transceivers t
JOIN vendors v ON v.id = t.vendor_id
WHERE UPPER(v.name) LIKE '%FLEXOPTIX%'
AND t.product_page_url IS NOT NULL
AND t.product_page_url != ''
AND t.product_page_url LIKE 'https://www.flexoptix.net/%'
AND t.product_page_url LIKE '%.html%'
AND (
$2::boolean = false
OR t.price_verified = false OR t.price_verified IS NULL
OR t.image_verified = false OR t.image_verified IS NULL
OR t.details_verified = false OR t.details_verified IS NULL
OR t.reach_label IS NULL OR t.reach_label = ''
OR t.fiber_type IS NULL OR t.fiber_type = ''
OR t.wavelengths IS NULL OR t.wavelengths = ''
)
ORDER BY t.price_verified DESC, t.image_verified DESC, t.details_verified ASC, t.part_number
LIMIT $1
`,
    [limit, onlyMissing]
  );
  console.log(`=== Flexoptix detail verifier: ${rows.rows.length} products ===`);

  let fetched = 0;
  let failed = 0;
  let prices = 0;
  let images = 0;
  let details = 0;

  for (const row of rows.rows) {
    try {
      const resp = await fetch(row.product_page_url, {
        headers: HEADERS,
        signal: AbortSignal.timeout(20000),
      });
      if (!resp.ok) {
        failed++;
        console.warn(` ${row.part_number}: HTTP ${resp.status}`);
        continue;
      }
      const html = await resp.text();
      const patch = parseDetail(html);

      // Only fill columns that are currently empty; existing values are never overwritten.
      await pool.query(
        `
UPDATE transceivers
SET reach_label = COALESCE(NULLIF(reach_label, ''), $2),
reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($3, reach_meters) ELSE reach_meters END,
fiber_type = COALESCE(NULLIF(fiber_type, ''), $4),
wavelengths = COALESCE(NULLIF(wavelengths, ''), $5),
connector = COALESCE(NULLIF(connector, ''), $6),
standard_name = COALESCE(NULLIF(standard_name, ''), $7),
product_page_url = COALESCE(NULLIF(product_page_url, ''), $8),
updated_at = NOW()
WHERE id = $1
`,
        [
          row.id,
          patch.reachLabel || null,
          patch.reachMeters ?? null,
          patch.fiberType || null,
          patch.wavelengths || null,
          patch.connector || null,
          patch.standardName || null,
          row.product_page_url,
        ]
      );

      if (patch.imageUrl) {
        const marked = await markImageVerified(row.id, patch.imageUrl);
        if (marked) images++;
      }

      if (patch.price) {
        const updated = await upsertPriceObservation({
          transceiverId: row.id,
          sourceVendorId: vendorId,
          price: patch.price,
          currency: patch.currency,
          stockLevel: "in_stock",
          url: row.product_page_url,
          contentHash: contentHash({ price: patch.price, part: row.part_number }),
        });
        if (updated) prices++;
      }

      const verified = await markDetailsVerified({
        transceiverId: row.id,
        sourceUrl: row.product_page_url,
      });
      if (verified) details++;

      fetched++;
      if (fetched % 25 === 0) {
        console.log(` processed ${fetched}/${rows.rows.length}`);
      }
    } catch (error) {
      failed++;
      // Anything can be thrown; only Error instances are guaranteed to carry a message.
      const message = error instanceof Error ? error.message : String(error);
      console.warn(` ${row.part_number}: ${message.slice(0, 100)}`);
    } finally {
      // Runs for success, exception and the HTTP-error `continue` alike.
      await sleep(800);
    }
  }

  console.log(`=== Flexoptix detail verifier complete: fetched=${fetched}, failed=${failed}, newPrices=${prices}, imagesMarked=${images}, detailsMarked=${details} ===`);
}
// Script entry point: only executes when this file is started directly, not
// when it is imported. Closes the pool after a successful run; on failure logs
// the error, starts closing the pool and exits with status 1.
if (require.main === module) {
  const abortWithError = (error: unknown): void => {
    console.error("Fatal:", error);
    pool.end();
    process.exit(1);
  };
  const closePool = () => pool.end();

  run().then(closePool).catch(abortWithError);
}

View File

@ -1,9 +1,87 @@
# Current TIP Sync State # Current TIP Sync State
Updated: 2026-05-09 06:15 UTC Updated: 2026-05-09 07:34 UTC
## Newest Work ## Newest Work
- TIP Flexoptix completion push on 2026-05-09:
- operator said "feuer frei" (German for "fire at will", i.e. permission to proceed) after confirming Flexoptix was not yet complete
- TIPLLM training pool was updated immediately with the truth rule:
- not all Flexoptix products are complete
- active catalog coverage must be separated from historical/extra DB rows
- never claim 100% verification without exact counters and fresh source timestamps
- code improved:
- `packages/scraper/src/scrapers/flexoptix-catalog.ts`
- generic reach parsing now handles values such as `50 m`, `1,000 m`, decimal/range forms
- wavelength parsing now handles multiple `λ... nm` values
- product URL is now passed into `findOrCreateScrapedTransceiver`
- `packages/scraper/src/scrapers/flexoptix-detail-pages.ts`
- new targeted Flexoptix detail-page verifier
- fetches only Flexoptix `.html` product pages with missing price/image/detail fields
- parses static product page metadata:
- title
- description
- `og:image`
- `product:price:amount`
- reach
- fiber type
- wavelengths
- connector
- standard name
- writes only DB evidence from Flexoptix pages, no external AI
- live run results on Erik:
- `pnpm -C packages/scraper build` passed
- improved catalog run completed:
- `Total unique products after GraphQL: 615`
- `Flexoptix Catalog Complete: 615 products, 0 prices`
- details improved from:
- `details_verified: 500`
- `price+image+details: 496`
- `fully_verified: 496`
- after catalog parser improvement:
- `details_verified: 606`
- `price+image+details: 602`
- `fully_verified: 602`
- detail verifier run:
- target: `191` real `.html` product pages
- fetched: `191`
- failed: `0`
- new/updated price observations: `177`
- images marked: `187`
- details marked: `185`
- after detail verifier and explicit BiDi correction:
- total Flexoptix rows: `744`
- HTML product-like rows: `626`
- price verified: `626`
- image verified: `622`
- details verified: `624`
- price+image+details verified: `620`
- fully verified: `620`
- filter/category rows with no verification: `108`
- other non-product/generic rows with no verification: `10`
- manual evidence correction:
- four BiDi SFP products had `1,000 m` in the Flexoptix title
- updated from source evidence:
- `S.B1312.M.DIL`
- `S.B1312.M.DL`
- `S.B1512.M.DIL`
- `S.B1512.M.DL`
- set:
- `reach_label=1000m`
- `reach_meters=1000`
- `fiber_type=MMF`
- `details_verified=true`
- remaining truth:
- active/product-like Flexoptix rows are much closer to complete
- not all `744` Flexoptix rows can honestly be 100% verified because `118` are filter/category/generic/non-product URLs rather than concrete product pages
- remaining HTML product-like gaps observed before SSH became unavailable:
- `4` product-like rows without image verification
- `2` FLEXBOX/accessory-like rows without reach/details
- operational note:
- Erik SSH became unavailable with `connection refused` after the last verification checks
- public TIP HTTPS still responded through Cloudflare
- no further live commands were started after SSH refused
- TIP Flexoptix price truth recheck on 2026-05-09: - TIP Flexoptix price truth recheck on 2026-05-09:
- operator question: - operator question:
- are all Flexoptix prices, images and information present - are all Flexoptix prices, images and information present

View File

@ -0,0 +1,98 @@
# Flexoptix Completion Push
Date: 2026-05-09
## Goal
Push Flexoptix products as far as possible toward complete automated verification without manually approving incomplete data.
## Code Changes
- `packages/scraper/src/scrapers/flexoptix-catalog.ts`
- Added generic reach parsing for values such as `50 m`, `1,000 m`, decimal values, and ranges.
- Improved wavelength parsing for multiple `λ... nm` values.
- Passed `productUrl` into `findOrCreateScrapedTransceiver`.
- `packages/scraper/src/scrapers/flexoptix-detail-pages.ts`
- Added a targeted Flexoptix detail-page verifier.
- Fetches only real Flexoptix `.html` product pages with missing signals.
- Parses static product page evidence:
- title
- description
- `og:image`
- `product:price:amount`
- reach
- fiber type
- wavelengths
- connector
- standard name
## Live Runs
- Built on Erik:
- `pnpm -C packages/scraper build`
- Ran improved Flexoptix catalog scraper:
- `Total unique products after GraphQL: 615`
- `Flexoptix Catalog Complete: 615 products, 0 prices`
- Ran detail-page verifier:
- target: `191`
- fetched: `191`
- failed: `0`
- new/updated price observations: `177`
- images marked: `187`
- details marked: `185`
## Verification Improvement
Before the completion push:
- details verified: `500`
- price + image + details verified: `496`
- fully verified: `496`
After catalog parser improvement:
- details verified: `606`
- price + image + details verified: `602`
- fully verified: `602`
After detail verifier and explicit BiDi correction:
- total Flexoptix rows: `744`
- HTML product-like rows: `626`
- price verified: `626`
- image verified: `622`
- details verified: `624`
- price + image + details verified: `620`
- fully verified: `620`
- filter/category rows with no verification: `108`
- other non-product/generic rows with no verification: `10`
## Source Evidence Correction
Four BiDi SFP products had `1,000 m` in the Flexoptix page title. They were corrected from Flexoptix source evidence:
- `S.B1312.M.DIL`
- `S.B1312.M.DL`
- `S.B1512.M.DIL`
- `S.B1512.M.DL`
Set:
- `reach_label=1000m`
- `reach_meters=1000`
- `fiber_type=MMF`
- `details_verified=true`
## Remaining Truth
Do not claim all `744` Flexoptix rows are complete. The remaining unverified rows are mostly filter/category/generic URLs rather than concrete product pages.
Remaining product-like gaps observed before SSH became unavailable:
- `4` product-like rows without image verification
- `2` FLEXBOX/accessory-like rows without reach/details
## Operational Note
After the last verification checks, SSH to Erik returned `connection refused`. Public TIP HTTPS still responded through Cloudflare. No further live commands were started after SSH refused.