fix: enrich flexoptix detail verification
This commit is contained in:
parent
5522bb2152
commit
549b4430df
@ -182,6 +182,18 @@ function detectReach(text: string): { label: string; meters: number } | undefine
|
||||
for (const [regex, label, meters] of patterns) {
|
||||
if (regex.test(text)) return { label, meters };
|
||||
}
|
||||
|
||||
const generic = text.match(/\b(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i);
|
||||
if (generic) {
|
||||
const value = parseFloat(generic[1].replace(/,/g, ""));
|
||||
const unit = generic[2].toLowerCase();
|
||||
if (Number.isFinite(value) && value > 0) {
|
||||
const meters = unit === "km" ? Math.round(value * 1000) : Math.max(1, Math.round(value));
|
||||
const normalized = Number.isInteger(value) ? String(value) : String(value).replace(/0+$/, "").replace(/\.$/, "");
|
||||
const label = unit === "km" ? `${normalized}km` : `${normalized}m`;
|
||||
return { label, meters };
|
||||
}
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
@ -194,8 +206,10 @@ function detectFiber(text: string): string {
|
||||
}
|
||||
|
||||
/**
 * Extracts every optical wavelength mentioned in `text`.
 *
 * Looks for 3-4 digit numbers followed by `nm` (optionally prefixed by `λ` or
 * `lambda`), drops duplicates while keeping first-seen order, and joins the
 * values with `/` (for example `"1310/1550"`).
 *
 * @param text - Free-form product text to scan.
 * @returns The wavelengths joined by `/`, or an empty string when none are found.
 */
function detectWavelength(text: string): string {
  // A single-match early return used to sit in front of this scan; it returned
  // only the first wavelength and made the multi-value path unreachable.
  const found = [...text.matchAll(/(?:λ|lambda)?\s*(\d{3,4})\s*nm/gi)].map((hit) => hit[1]);
  return [...new Set(found)].join("/");
}
|
||||
|
||||
@ -561,6 +575,7 @@ export async function scrapeFlexoptixCatalog(): Promise<void> {
|
||||
const txId = await findOrCreateScrapedTransceiver({
|
||||
partNumber: product.partNumber,
|
||||
vendorId,
|
||||
productUrl: product.url,
|
||||
formFactor: product.formFactor,
|
||||
speedGbps: product.speedGbps,
|
||||
speed: product.speed,
|
||||
|
||||
275
packages/scraper/src/scrapers/flexoptix-detail-pages.ts
Normal file
275
packages/scraper/src/scrapers/flexoptix-detail-pages.ts
Normal file
@ -0,0 +1,275 @@
|
||||
/**
|
||||
* Flexoptix detail-page verifier.
|
||||
*
|
||||
* Targeted pass for Flexoptix rows that already have product_page_url but are
|
||||
* missing price/image/details signals. Uses static product HTML only.
|
||||
*/
|
||||
import * as cheerio from "cheerio";
|
||||
import {
|
||||
ensureVendor,
|
||||
markDetailsVerified,
|
||||
markImageVerified,
|
||||
pool,
|
||||
upsertPriceObservation,
|
||||
} from "../utils/db";
|
||||
import { contentHash } from "../utils/hash";
|
||||
|
||||
/** User agent string sent with every detail-page request. */
const BOT_USER_AGENT = "Mozilla/5.0 (compatible; TIP-Bot/1.0; internal-flexoptix-detail)";

/** Request headers used when fetching product HTML. */
const HEADERS = {
  "User-Agent": BOT_USER_AGENT,
  Accept: "text/html,application/xhtml+xml",
};
|
||||
|
||||
/**
 * Fields extracted from one product detail page.
 *
 * `title`, `description` and `currency` are always present; every other field
 * is left undefined when the page does not yield a usable value.
 */
type DetailPatch = {
  /** Page title taken from the title meta tag or the `<title>` element. */
  title: string;
  /** Page description taken from the description meta tags. */
  description: string;
  /** Currency code for `price`. */
  currency: string;
  /** Product price, when one was found and accepted. */
  price?: number;
  /** Product image URL, when one was found and accepted. */
  imageUrl?: string;
  /** Reach as a display label, e.g. `10km`. */
  reachLabel?: string;
  /** Reach expressed in meters. */
  reachMeters?: number;
  /** Detected fiber/media type. */
  fiberType?: string;
  /** Detected wavelengths joined by `/`. */
  wavelengths?: string;
  /** Detected connector name. */
  connector?: string;
  /** Detected `...BASE-...` standard name. */
  standardName?: string;
};
|
||||
|
||||
/**
 * Resolves after roughly `ms` milliseconds.
 *
 * @param ms - Delay passed to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Decodes a small set of HTML entities and normalizes whitespace.
 *
 * Handles hexadecimal (`&#x41;`) and decimal (`&#65;`) numeric entities plus
 * the named entities `&amp;`, `&quot;` and `&nbsp;`. Decoding happens in one
 * pass, so text produced by one replacement is never decoded a second time
 * (`&amp;quot;` becomes the literal text `&quot;`). Numeric entities outside
 * the valid Unicode range are left untouched instead of throwing.
 *
 * @param value - Raw text; `undefined` is treated as an empty string.
 * @returns The decoded text with whitespace runs collapsed to one space and
 *          leading/trailing whitespace removed.
 */
function cleanText(value: string | undefined): string {
  const named: Record<string, string> = { amp: "&", quot: '"', nbsp: " " };

  // String.fromCodePoint throws a RangeError above U+10FFFF, so validate first.
  const fromCode = (entity: string, code: number): string =>
    Number.isInteger(code) && code >= 0 && code <= 0x10ffff ? String.fromCodePoint(code) : entity;

  return (value ?? "")
    .replace(
      /&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|(amp|quot|nbsp));/g,
      (entity: string, hex?: string, dec?: string, name?: string): string => {
        if (hex !== undefined) return fromCode(entity, parseInt(hex, 16));
        if (dec !== undefined) return fromCode(entity, parseInt(dec, 10));
        return name !== undefined ? named[name] ?? entity : entity;
      }
    )
    .replace(/\s+/g, " ")
    .trim();
}
|
||||
|
||||
/**
 * Formats a number for use in a reach label (e.g. `10`, `1.5`).
 *
 * JavaScript's number-to-string conversion already yields the shortest form
 * with no trailing fractional zeros, so no post-processing is needed. The
 * previous trailing-zero strip also ate the last digit of exponents
 * (`1e-10` became `"1e-1"`).
 *
 * @param value - Number to format.
 * @returns The default string form of `value`.
 */
function formatNumber(value: number): string {
  return String(value);
}
|
||||
|
||||
/**
 * Derives the reach of a product from free-form text.
 *
 * Checks, in order:
 *  1. a range such as `2-10 km`, where the upper bound is used;
 *  2. a value introduced by `up to`, `max` or `distance`;
 *  3. the first plain `<number> km|m` value;
 *  4. adapter/converter wording, which yields `N/A` with 0 meters.
 *
 * @param text - Combined product text to scan.
 * @returns A display label plus the reach in meters, or `undefined` when
 *          nothing matches.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const parseAmount = (raw: string): number => parseFloat(raw.replace(/,/g, ""));
  // Kilometres are converted to meters; meter values are rounded and never drop below 1.
  const toMeters = (amount: number, unit: string): number =>
    unit === "km" ? Math.round(amount * 1000) : Math.max(1, Math.round(amount));

  const span = text.match(/\b(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*-\s*(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i);
  if (span) {
    const upper = parseAmount(span[2]);
    const spanUnit = span[3].toLowerCase();
    return { label: `${formatNumber(upper)}${spanUnit}`, meters: toMeters(upper, spanUnit) };
  }

  const single =
    text.match(/\b(?:up to|max\.?|distance[:\s]*)\s*(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i) ||
    text.match(/\b(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i);
  if (single) {
    const amount = parseAmount(single[1]);
    const singleUnit = single[2].toLowerCase();
    if (Number.isFinite(amount) && amount > 0) {
      return { label: `${formatNumber(amount)}${singleUnit}`, meters: toMeters(amount, singleUnit) };
    }
  }

  return /\badapter|converter|serial to ip/i.test(text) ? { label: "N/A", meters: 0 } : undefined;
}
|
||||
|
||||
/**
 * Classifies the medium of a product from free-form text.
 *
 * Copper wording wins first, then active optical cable wording. Otherwise the
 * text is checked for single-mode and multi-mode hints.
 *
 * @param text - Combined product text to scan.
 * @returns `"Copper"`, `"AOC"`, `"SMF/MMF"`, `"SMF"`, `"MMF"`, or `undefined`
 *          when no hint is present.
 */
function detectFiber(text: string): string | undefined {
  const copperHints = /active electrical cable|\baec\b|copper|dac|twinax|rj45|base-t|serial to ip/i;
  const aocHints = /active optical cable|\baoc\b/i;
  if (copperHints.test(text)) return "Copper";
  if (aocHints.test(text)) return "AOC";

  const singleMode = /single.?mode|\bsmf\b|\blx\b|\blr\b|\ber\b|\bzr\b|bidi|cwdm|dwdm/i.test(text);
  const multiMode = /multi.?mode|\bmmf\b|\bsx\b|\bsr\b/i.test(text);
  if (!singleMode && !multiMode) return undefined;
  if (singleMode && multiMode) return "SMF/MMF";
  return singleMode ? "SMF" : "MMF";
}
|
||||
|
||||
/**
 * Collects the distinct wavelengths (3-4 digit numbers followed by `nm`)
 * mentioned in `text`, in first-seen order.
 *
 * @param text - Combined product text to scan.
 * @returns The wavelengths joined by `/`, or `undefined` when none are found.
 */
function detectWavelengths(text: string): string | undefined {
  const unique: string[] = [];
  for (const hit of text.matchAll(/(?:λ|lambda)?\s*(\d{3,4})\s*nm/gi)) {
    if (!unique.includes(hit[1])) unique.push(hit[1]);
  }
  return unique.length > 0 ? unique.join("/") : undefined;
}
|
||||
|
||||
/**
 * Finds the connector type mentioned in `text`.
 *
 * Patterns are tried in priority order (MTP/MPO, LC, CS, SN, RJ-45) and the
 * first match wins. The match is upper-cased and `RJ45` is normalized to
 * `RJ-45`.
 *
 * The MTP/MPO and LC patterns are anchored at a word boundary, and `LC` must
 * not be followed directly by a letter or digit. Without this, ordinary words
 * were reported as connectors ("Welcome" produced "LCOME", "LCD" produced
 * "LCD").
 *
 * @param text - Combined product text to scan.
 * @returns The normalized connector name, or `undefined` when none is found.
 */
function detectConnector(text: string): string | undefined {
  const patterns: readonly RegExp[] = [
    /\bMTP\/MPO[-\w/]*/i,
    /\bLC(?![a-z\d])[-\w/]*/i,
    /\bCS\b/i,
    /\bSN\b/i,
    /\bRJ-?45\b/i,
  ];

  for (const pattern of patterns) {
    const hit = pattern.exec(text);
    if (hit) return hit[0].toUpperCase().replace("RJ45", "RJ-45");
  }
  return undefined;
}
|
||||
|
||||
/**
 * Extracts the first `...BASE-...` standard name (e.g. `10GBASE-LR`,
 * `1000BASE-T`) from `text`.
 *
 * @param text - Combined product text to scan.
 * @returns The upper-cased standard name, or `undefined` when none is found.
 */
function detectStandard(text: string): string | undefined {
  const standardPattern = /\b(?:\d+(?:\.\d+)?[GT]?BASE-[A-Z0-9.+-]+|[A-Z0-9]+GBASE-[A-Z0-9.+-]+)\b/i;
  return standardPattern.exec(text)?.[0].toUpperCase();
}
|
||||
|
||||
/**
 * Parses one product page into a {@link DetailPatch}.
 *
 * Reads title, description, price, currency and image from meta tags, then
 * runs the text detectors over the title, the description and the text of
 * every `.description-list-item` element.
 *
 * @param html - Raw HTML of the product page.
 * @returns The extracted fields; optional fields are `undefined` when the page
 *          yields no usable value.
 */
function parseDetail(html: string): DetailPatch {
  const $ = cheerio.load(html);
  const metaContent = (selector: string): string | undefined => $(selector).attr("content");

  const title = cleanText(metaContent('meta[name="title"]') || $("title").text());
  const description = cleanText(
    metaContent('meta[name="description"]') || metaContent('meta[property="og:description"]') || ""
  );

  // Calling .text() on the whole selection concatenates the items with no
  // separator ("10 kmWavelength"), which breaks the \b-anchored detectors.
  // Read each item on its own and join with a space instead.
  const specText = $(".description-list-item")
    .toArray()
    .map((item) => cleanText($(item).text()))
    .join(" ");
  const combined = `${title} ${description} ${specText}`;

  // Only prices strictly between 0 and 100000 are accepted.
  const parsedPrice = parseFloat(metaContent('meta[property="product:price:amount"]') ?? "");
  const price =
    Number.isFinite(parsedPrice) && parsedPrice > 0 && parsedPrice < 100000 ? parsedPrice : undefined;
  const currency = metaContent('meta[property="product:price:currency"]') || "EUR";

  // URLs that look like placeholders or logos are not treated as product images.
  const rawImageUrl = metaContent('meta[property="og:image"]') || metaContent('meta[name="twitter:image"]');
  const imageUrl = rawImageUrl && !/placeholder|no-image|logo/i.test(rawImageUrl) ? rawImageUrl : undefined;

  const reach = detectReach(combined);

  return {
    title,
    description,
    price,
    currency,
    imageUrl,
    reachLabel: reach?.label,
    reachMeters: reach?.meters,
    fiberType: detectFiber(combined),
    wavelengths: detectWavelengths(combined),
    connector: detectConnector(combined),
    standardName: detectStandard(combined),
  };
}
|
||||
|
||||
/**
 * Resolves the row limit from the raw `FLEXOPTIX_DETAIL_LIMIT` value.
 *
 * Falls back to 300 when the value is missing or not a number, and clamps the
 * result to the range 1..1000. A non-numeric value previously produced `NaN`,
 * which was passed to the query as the `LIMIT` parameter.
 */
function resolveDetailLimit(raw: string | undefined): number {
  const parsed = parseInt(raw ?? "", 10);
  if (!Number.isFinite(parsed)) return 300;
  return Math.max(1, Math.min(1000, parsed));
}

/**
 * Runs the Flexoptix detail-page pass.
 *
 * Selects Flexoptix transceiver rows that have a `.html` product URL (by
 * default only rows with a missing verification flag or a missing
 * reach/fiber/wavelength value), fetches each page, fills empty detail columns
 * from the parsed page, and records image, price and details verification
 * through the db helpers.
 *
 * Environment:
 *  - `FLEXOPTIX_DETAIL_LIMIT`: maximum rows to process (default 300, clamped to 1..1000).
 *  - `FLEXOPTIX_DETAIL_ONLY_MISSING`: set to `"false"` to include rows that are not missing anything.
 *
 * Failures on a single row are logged and counted; they do not stop the run.
 */
async function run(): Promise<void> {
  const vendorId = await ensureVendor("Flexoptix", "reseller", "https://www.flexoptix.net", "https://www.flexoptix.net/en/");
  const limit = resolveDetailLimit(process.env["FLEXOPTIX_DETAIL_LIMIT"]);
  const onlyMissing = process.env["FLEXOPTIX_DETAIL_ONLY_MISSING"] !== "false";

  const rows = await pool.query(
    `
    SELECT t.id, t.part_number, t.product_page_url
    FROM transceivers t
    JOIN vendors v ON v.id = t.vendor_id
    WHERE UPPER(v.name) LIKE '%FLEXOPTIX%'
      AND t.product_page_url IS NOT NULL
      AND t.product_page_url != ''
      AND t.product_page_url LIKE 'https://www.flexoptix.net/%'
      AND t.product_page_url LIKE '%.html%'
      AND (
        $2::boolean = false
        OR t.price_verified = false OR t.price_verified IS NULL
        OR t.image_verified = false OR t.image_verified IS NULL
        OR t.details_verified = false OR t.details_verified IS NULL
        OR t.reach_label IS NULL OR t.reach_label = ''
        OR t.fiber_type IS NULL OR t.fiber_type = ''
        OR t.wavelengths IS NULL OR t.wavelengths = ''
      )
    ORDER BY t.price_verified DESC, t.image_verified DESC, t.details_verified ASC, t.part_number
    LIMIT $1
    `,
    [limit, onlyMissing]
  );

  console.log(`=== Flexoptix detail verifier: ${rows.rows.length} products ===`);

  let fetched = 0;
  let failed = 0;
  let prices = 0;
  let images = 0;
  let details = 0;

  for (const row of rows.rows) {
    try {
      const resp = await fetch(row.product_page_url, {
        headers: HEADERS,
        signal: AbortSignal.timeout(20000),
      });
      if (!resp.ok) {
        failed++;
        console.warn(` ${row.part_number}: HTTP ${resp.status}`);
        continue;
      }

      const html = await resp.text();
      const patch = parseDetail(html);

      // Existing non-empty column values are kept; parsed values only fill gaps.
      await pool.query(
        `
        UPDATE transceivers
        SET reach_label = COALESCE(NULLIF(reach_label, ''), $2),
            reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($3, reach_meters) ELSE reach_meters END,
            fiber_type = COALESCE(NULLIF(fiber_type, ''), $4),
            wavelengths = COALESCE(NULLIF(wavelengths, ''), $5),
            connector = COALESCE(NULLIF(connector, ''), $6),
            standard_name = COALESCE(NULLIF(standard_name, ''), $7),
            product_page_url = COALESCE(NULLIF(product_page_url, ''), $8),
            updated_at = NOW()
        WHERE id = $1
        `,
        [
          row.id,
          patch.reachLabel || null,
          patch.reachMeters ?? null,
          patch.fiberType || null,
          patch.wavelengths || null,
          patch.connector || null,
          patch.standardName || null,
          row.product_page_url,
        ]
      );

      if (patch.imageUrl) {
        const marked = await markImageVerified(row.id, patch.imageUrl);
        if (marked) images++;
      }

      if (patch.price) {
        const updated = await upsertPriceObservation({
          transceiverId: row.id,
          sourceVendorId: vendorId,
          price: patch.price,
          currency: patch.currency,
          stockLevel: "in_stock",
          url: row.product_page_url,
          contentHash: contentHash({ price: patch.price, part: row.part_number }),
        });
        if (updated) prices++;
      }

      const verified = await markDetailsVerified({
        transceiverId: row.id,
        sourceUrl: row.product_page_url,
      });
      if (verified) details++;
      fetched++;

      if (fetched % 25 === 0) {
        console.log(` processed ${fetched}/${rows.rows.length}`);
      }
    } catch (error: unknown) {
      failed++;
      // A thrown value is not guaranteed to be an Error instance.
      const message = error instanceof Error ? error.message : String(error);
      console.warn(` ${row.part_number}: ${message.slice(0, 100)}`);
    } finally {
      // Throttle after every request. The delay used to be skipped when a
      // non-OK response hit `continue`, so error responses were followed by an
      // immediate next request.
      await sleep(800);
    }
  }

  console.log(`=== Flexoptix detail verifier complete: fetched=${fetched}, failed=${failed}, newPrices=${prices}, imagesMarked=${images}, detailsMarked=${details} ===`);
}
|
||||
|
||||
// Entry point when this file is executed directly: run the pass, then close
// the pool. On any failure, log it, start closing the pool and exit with code 1.
if (require.main === module) {
  void (async (): Promise<void> => {
    try {
      await run();
      await pool.end();
    } catch (error) {
      console.error("Fatal:", error);
      pool.end();
      process.exit(1);
    }
  })();
}
|
||||
@ -1,9 +1,87 @@
|
||||
# Current TIP Sync State
|
||||
|
||||
Updated: 2026-05-09 06:15 UTC
|
||||
Updated: 2026-05-09 07:34 UTC
|
||||
|
||||
## Newest Work
|
||||
|
||||
- TIP Flexoptix completion push on 2026-05-09:
|
||||
- operator said "feuer frei" (German for "fire at will", i.e. go ahead) after confirming that Flexoptix was not yet complete
|
||||
- TIPLLM training pool was updated immediately with the truth rule:
|
||||
- all Flexoptix products are not complete
|
||||
- active catalog coverage must be separated from historical/extra DB rows
|
||||
- never claim 100% verification without exact counters and fresh source timestamps
|
||||
- code improved:
|
||||
- `packages/scraper/src/scrapers/flexoptix-catalog.ts`
|
||||
- generic reach parsing now handles values such as `50 m`, `1,000 m`, decimal/range forms
|
||||
- wavelength parsing now handles multiple `λ... nm` values
|
||||
- product URL is now passed into `findOrCreateScrapedTransceiver`
|
||||
- `packages/scraper/src/scrapers/flexoptix-detail-pages.ts`
|
||||
- new targeted Flexoptix detail-page verifier
|
||||
- fetches only Flexoptix `.html` product pages with missing price/image/detail fields
|
||||
- parses static product page metadata:
|
||||
- title
|
||||
- description
|
||||
- `og:image`
|
||||
- `product:price:amount`
|
||||
- reach
|
||||
- fiber type
|
||||
- wavelengths
|
||||
- connector
|
||||
- standard name
|
||||
- writes to the DB only evidence taken from Flexoptix pages; no external AI is involved
|
||||
- live run results on Erik:
|
||||
- `pnpm -C packages/scraper build` passed
|
||||
- improved catalog run completed:
|
||||
- `Total unique products after GraphQL: 615`
|
||||
- `Flexoptix Catalog Complete: 615 products, 0 prices`
|
||||
- details improved from:
|
||||
- `details_verified: 500`
|
||||
- `price+image+details: 496`
|
||||
- `fully_verified: 496`
|
||||
- after catalog parser improvement:
|
||||
- `details_verified: 606`
|
||||
- `price+image+details: 602`
|
||||
- `fully_verified: 602`
|
||||
- detail verifier run:
|
||||
- target: `191` real `.html` product pages
|
||||
- fetched: `191`
|
||||
- failed: `0`
|
||||
- new/updated price observations: `177`
|
||||
- images marked: `187`
|
||||
- details marked: `185`
|
||||
- after detail verifier and explicit BiDi correction:
|
||||
- total Flexoptix rows: `744`
|
||||
- HTML product-like rows: `626`
|
||||
- price verified: `626`
|
||||
- image verified: `622`
|
||||
- details verified: `624`
|
||||
- price+image+details verified: `620`
|
||||
- fully verified: `620`
|
||||
- filter/category rows with no verification: `108`
|
||||
- other non-product/generic rows with no verification: `10`
|
||||
- manual evidence correction:
|
||||
- four BiDi SFP products had `1,000 m` in the Flexoptix title
|
||||
- updated from source evidence:
|
||||
- `S.B1312.M.DIL`
|
||||
- `S.B1312.M.DL`
|
||||
- `S.B1512.M.DIL`
|
||||
- `S.B1512.M.DL`
|
||||
- set:
|
||||
- `reach_label=1000m`
|
||||
- `reach_meters=1000`
|
||||
- `fiber_type=MMF`
|
||||
- `details_verified=true`
|
||||
- remaining truth:
|
||||
- active/product-like Flexoptix rows are much closer to complete
|
||||
- not all `744` Flexoptix rows can honestly be 100% verified because `118` are filter/category/generic/non-product URLs rather than concrete product pages
|
||||
- remaining HTML product-like gaps observed before SSH became unavailable:
|
||||
- `4` product-like rows without image verification
|
||||
- `2` FLEXBOX/accessory-like rows without reach/details
|
||||
- operational note:
|
||||
- Erik SSH became unavailable with `connection refused` after the last verification checks
|
||||
- public TIP HTTPS still responded through Cloudflare
|
||||
- no further live commands were started after SSH refused
|
||||
|
||||
- TIP Flexoptix price truth recheck on 2026-05-09:
|
||||
- operator question:
|
||||
- are all Flexoptix prices, images and information present
|
||||
|
||||
98
sync/history/2026-05-09-flexoptix-completion-push.md
Normal file
98
sync/history/2026-05-09-flexoptix-completion-push.md
Normal file
@ -0,0 +1,98 @@
|
||||
# Flexoptix Completion Push
|
||||
|
||||
Date: 2026-05-09
|
||||
|
||||
## Goal
|
||||
|
||||
Push Flexoptix products as far as possible toward complete automated verification without manually approving incomplete data.
|
||||
|
||||
## Code Changes
|
||||
|
||||
- `packages/scraper/src/scrapers/flexoptix-catalog.ts`
|
||||
- Added generic reach parsing for values such as `50 m`, `1,000 m`, decimal values, and ranges.
|
||||
- Improved wavelength parsing for multiple `λ... nm` values.
|
||||
- Passed `productUrl` into `findOrCreateScrapedTransceiver`.
|
||||
|
||||
- `packages/scraper/src/scrapers/flexoptix-detail-pages.ts`
|
||||
- Added a targeted Flexoptix detail-page verifier.
|
||||
- Fetches only real Flexoptix `.html` product pages with missing signals.
|
||||
- Parses static product page evidence:
|
||||
- title
|
||||
- description
|
||||
- `og:image`
|
||||
- `product:price:amount`
|
||||
- reach
|
||||
- fiber type
|
||||
- wavelengths
|
||||
- connector
|
||||
- standard name
|
||||
|
||||
## Live Runs
|
||||
|
||||
- Built on Erik:
|
||||
- `pnpm -C packages/scraper build`
|
||||
- Ran improved Flexoptix catalog scraper:
|
||||
- `Total unique products after GraphQL: 615`
|
||||
- `Flexoptix Catalog Complete: 615 products, 0 prices`
|
||||
- Ran detail-page verifier:
|
||||
- target: `191`
|
||||
- fetched: `191`
|
||||
- failed: `0`
|
||||
- new/updated price observations: `177`
|
||||
- images marked: `187`
|
||||
- details marked: `185`
|
||||
|
||||
## Verification Improvement
|
||||
|
||||
Before the completion push:
|
||||
|
||||
- details verified: `500`
|
||||
- price + image + details verified: `496`
|
||||
- fully verified: `496`
|
||||
|
||||
After catalog parser improvement:
|
||||
|
||||
- details verified: `606`
|
||||
- price + image + details verified: `602`
|
||||
- fully verified: `602`
|
||||
|
||||
After detail verifier and explicit BiDi correction:
|
||||
|
||||
- total Flexoptix rows: `744`
|
||||
- HTML product-like rows: `626`
|
||||
- price verified: `626`
|
||||
- image verified: `622`
|
||||
- details verified: `624`
|
||||
- price + image + details verified: `620`
|
||||
- fully verified: `620`
|
||||
- filter/category rows with no verification: `108`
|
||||
- other non-product/generic rows with no verification: `10`
|
||||
|
||||
## Source Evidence Correction
|
||||
|
||||
Four BiDi SFP products had `1,000 m` in the Flexoptix page title. They were corrected from Flexoptix source evidence:
|
||||
|
||||
- `S.B1312.M.DIL`
|
||||
- `S.B1312.M.DL`
|
||||
- `S.B1512.M.DIL`
|
||||
- `S.B1512.M.DL`
|
||||
|
||||
Set:
|
||||
|
||||
- `reach_label=1000m`
|
||||
- `reach_meters=1000`
|
||||
- `fiber_type=MMF`
|
||||
- `details_verified=true`
|
||||
|
||||
## Remaining Truth
|
||||
|
||||
Do not claim all `744` Flexoptix rows are complete. The remaining unverified rows are mostly filter/category/generic URLs rather than concrete product pages.
|
||||
|
||||
Remaining product-like gaps observed before SSH became unavailable:
|
||||
|
||||
- `4` product-like rows without image verification
|
||||
- `2` FLEXBOX/accessory-like rows without reach/details
|
||||
|
||||
## Operational Note
|
||||
|
||||
After the last verification checks, SSH to Erik returned `connection refused`. Public TIP HTTPS still responded through Cloudflare. No further live commands were started after SSH refused.
|
||||
Loading…
x
Reference in New Issue
Block a user