diff --git a/packages/scraper/src/scrapers/flexoptix-catalog.ts b/packages/scraper/src/scrapers/flexoptix-catalog.ts index 07f65ed..b0b8c07 100644 --- a/packages/scraper/src/scrapers/flexoptix-catalog.ts +++ b/packages/scraper/src/scrapers/flexoptix-catalog.ts @@ -182,6 +182,18 @@ function detectReach(text: string): { label: string; meters: number } | undefine for (const [regex, label, meters] of patterns) { if (regex.test(text)) return { label, meters }; } + + const generic = text.match(/\b(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i); + if (generic) { + const value = parseFloat(generic[1].replace(/,/g, "")); + const unit = generic[2].toLowerCase(); + if (Number.isFinite(value) && value > 0) { + const meters = unit === "km" ? Math.round(value * 1000) : Math.max(1, Math.round(value)); + const normalized = Number.isInteger(value) ? String(value) : String(value).replace(/0+$/, "").replace(/\.$/, ""); + const label = unit === "km" ? `${normalized}km` : `${normalized}m`; + return { label, meters }; + } + } return undefined; } @@ -194,8 +206,10 @@ function detectFiber(text: string): string { } function detectWavelength(text: string): string { - const match = text.match(/(\d{3,4})\s*nm/i); - if (match) return match[1]; + const matches = [...text.matchAll(/(?:λ|lambda)?\s*(\d{3,4})\s*nm/gi)] + .map((match) => match[1]) + .filter((value, index, values) => values.indexOf(value) === index); + if (matches.length) return matches.join("/"); return ""; } @@ -561,6 +575,7 @@ export async function scrapeFlexoptixCatalog(): Promise { const txId = await findOrCreateScrapedTransceiver({ partNumber: product.partNumber, vendorId, + productUrl: product.url, formFactor: product.formFactor, speedGbps: product.speedGbps, speed: product.speed, diff --git a/packages/scraper/src/scrapers/flexoptix-detail-pages.ts b/packages/scraper/src/scrapers/flexoptix-detail-pages.ts new file mode 100644 index 0000000..0ad1d2f --- /dev/null +++ b/packages/scraper/src/scrapers/flexoptix-detail-pages.ts 
@@ -0,0 +1,358 @@
+/**
+ * Flexoptix detail-page verifier.
+ *
+ * Targeted pass for Flexoptix rows that already have product_page_url but are
+ * missing price/image/details signals. Uses static product HTML only.
+ */
+import * as cheerio from "cheerio";
+import {
+  ensureVendor,
+  markDetailsVerified,
+  markImageVerified,
+  pool,
+  upsertPriceObservation,
+} from "../utils/db";
+import { contentHash } from "../utils/hash";
+
+/** Request headers sent with every product-page fetch. */
+const HEADERS = {
+  "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; internal-flexoptix-detail)",
+  Accept: "text/html,application/xhtml+xml",
+};
+
+/** Fields extracted from one static product page by parseDetail. */
+type DetailPatch = {
+  title: string;
+  description: string;
+  price?: number;
+  currency: string;
+  imageUrl?: string;
+  reachLabel?: string;
+  reachMeters?: number;
+  fiberType?: string;
+  wavelengths?: string;
+  connector?: string;
+  standardName?: string;
+};
+
+/** Resolves after ms milliseconds; run() uses it to pace requests. */
+function sleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+/**
+ * Decodes one numeric character reference. String.fromCodePoint throws a
+ * RangeError above U+10FFFF, so an out-of-range reference is left as written.
+ */
+function decodeCodePoint(entity: string, codePoint: number): string {
+  return codePoint >= 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
+}
+
+/**
+ * Decodes numeric character references plus the quot, nbsp and amp named
+ * entities, collapses whitespace runs to one space and trims the result.
+ *
+ * amp is decoded last so double-escaped input is unescaped only once. The
+ * named-entity patterns wrap the name in a group so that a tool which
+ * HTML-unescapes this file cannot turn them into no-op replacements.
+ */
+function cleanText(value: string | undefined): string {
+  return (value || "")
+    .replace(/&#x([0-9a-f]+);/gi, (entity, hex) => decodeCodePoint(entity, parseInt(hex, 16)))
+    .replace(/&#(\d+);/g, (entity, dec) => decodeCodePoint(entity, parseInt(dec, 10)))
+    .replace(/&(?:quot);/g, '"')
+    .replace(/&(?:nbsp);/g, " ")
+    .replace(/&(?:amp);/g, "&")
+    .replace(/\s+/g, " ")
+    .trim();
+}
+
+/** Renders a reach value for a label; String() never emits trailing zeros. */
+function formatNumber(value: number): string {
+  return String(value);
+}
+
+/** Converts a reach to whole meters; meter values are clamped to at least 1. */
+function toMeters(value: number, unit: string): number {
+  return unit === "km" ? Math.round(value * 1000) : Math.max(1, Math.round(value));
+}
+
+/**
+ * Extracts a reach from free text.
+ *
+ * Tried in order: a range such as "2-10 km" (the upper bound is used), an
+ * "up to" / "max" / "distance" phrase, then the first bare number + km|m.
+ * Adapter/converter-like text without a usable distance yields N/A, 0 meters.
+ *
+ * @returns the label and meter count, or undefined when nothing matched.
+ */
+function detectReach(text: string): { label: string; meters: number } | undefined {
+  const range = text.match(/\b(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*-\s*(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i);
+  if (range) {
+    const max = parseFloat(range[2].replace(/,/g, ""));
+    const unit = range[3].toLowerCase();
+    // Same sanity check as the single-value path; a zero upper bound falls through.
+    if (Number.isFinite(max) && max > 0) {
+      return { label: `${formatNumber(max)}${unit}`, meters: toMeters(max, unit) };
+    }
+  }
+
+  const upTo = text.match(/\b(?:up to|max\.?|distance[:\s]*)\s*(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i);
+  const generic = upTo || text.match(/\b(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(km|m)\b/i);
+  if (generic) {
+    const value = parseFloat(generic[1].replace(/,/g, ""));
+    const unit = generic[2].toLowerCase();
+    if (Number.isFinite(value) && value > 0) {
+      return { label: `${formatNumber(value)}${unit}`, meters: toMeters(value, unit) };
+    }
+  }
+
+  if (/\badapter|converter|serial to ip/i.test(text)) return { label: "N/A", meters: 0 };
+  return undefined;
+}
+
+/**
+ * Classifies the medium as Copper, AOC, SMF, MMF or SMF/MMF.
+ *
+ * An explicit single-mode/multi-mode mention outranks hints inferred from
+ * reach codes or BiDi/CWDM/DWDM keywords: multimode BiDi parts exist, so
+ * "bidi" must not turn an explicitly multi-mode product into SMF/MMF.
+ */
+function detectFiber(text: string): string | undefined {
+  if (/active electrical cable|\baec\b|copper|\bdacs?\b|twinax|rj-?45|base-t|serial to ip/i.test(text)) return "Copper";
+  if (/active optical cable|\baoc\b/i.test(text)) return "AOC";
+
+  const explicitSingle = /single.?mode|\bsmf\b/i.test(text);
+  const explicitMulti = /multi.?mode|\bmmf\b/i.test(text);
+  const anyExplicit = explicitSingle || explicitMulti;
+  const hasSingle = anyExplicit ? explicitSingle : /\blx\b|\blr\b|\ber\b|\bzr\b|bidi|cwdm|dwdm/i.test(text);
+  const hasMulti = anyExplicit ? explicitMulti : /\bsx\b|\bsr\b/i.test(text);
+  if (hasSingle && hasMulti) return "SMF/MMF";
+  if (hasSingle) return "SMF";
+  if (hasMulti) return "MMF";
+  return undefined;
+}
+
+/**
+ * Collects every distinct 3-4 digit value followed by "nm", in first-seen
+ * order, joined with "/". The lookbehind stops a longer digit run such as
+ * "12345 nm" from contributing its last four digits.
+ */
+function detectWavelengths(text: string): string | undefined {
+  const values = Array.from(text.matchAll(/(?<!\d)(\d{3,4})\s*nm/gi), (match) => match[1]);
+  const unique = [...new Set(values)];
+  return unique.length ? unique.join("/") : undefined;
+}
+
+/**
+ * Returns the first connector found, upper-cased, checking MTP/MPO, LC, CS,
+ * SN and RJ45 in that order. Word boundaries keep LC from matching inside
+ * ordinary words ("welcome") or longer tokens ("LCD").
+ */
+function detectConnector(text: string): string | undefined {
+  const connectors = [
+    /\bMTP\/MPO[-\w/]*/i,
+    /\bLC\b[-\w/]*/i,
+    /\bCS\b/i,
+    /\bSN\b/i,
+    /\bRJ-?45\b/i,
+  ];
+  for (const regex of connectors) {
+    const match = text.match(regex);
+    if (match) return match[0].toUpperCase().replace("RJ45", "RJ-45");
+  }
+  return undefined;
+}
+
+/** Returns the first BASE-style standard name (e.g. 10GBASE-LR), upper-cased. */
+function detectStandard(text: string): string | undefined {
+  const match = text.match(/\b(?:\d+(?:\.\d+)?[GT]?BASE-[A-Z0-9.+-]+|[A-Z0-9]+GBASE-[A-Z0-9.+-]+)\b/i);
+  return match ? match[0].toUpperCase() : undefined;
+}
+
+/**
+ * Parses one product page into a DetailPatch from its meta tags and the
+ * .description-list-item text.
+ *
+ * Prices outside (0, 100000) and image URLs that look like placeholders or
+ * logos are dropped; currency falls back to EUR when the page declares none.
+ */
+function parseDetail(html: string): DetailPatch {
+  const $ = cheerio.load(html);
+  const title = cleanText($('meta[name="title"]').attr("content") || $("title").text());
+  const description = cleanText(
+    $('meta[name="description"]').attr("content") ||
+      $('meta[property="og:description"]').attr("content") ||
+      ""
+  );
+  const combined = `${title} ${description} ${cleanText($(".description-list-item").text())}`;
+  const priceText = $('meta[property="product:price:amount"]').attr("content");
+  const price = priceText ? parseFloat(priceText) : undefined;
+  const currency = $('meta[property="product:price:currency"]').attr("content") || "EUR";
+  const imageUrl =
+    $('meta[property="og:image"]').attr("content") ||
+    $('meta[name="twitter:image"]').attr("content") ||
+    undefined;
+  const reach = detectReach(combined);
+
+  return {
+    title,
+    description,
+    price: price && price > 0 && price < 100000 ? price : undefined,
+    currency,
+    imageUrl: imageUrl && !/placeholder|no-image|logo/i.test(imageUrl) ? imageUrl : undefined,
+    reachLabel: reach?.label,
+    reachMeters: reach?.meters,
+    fiberType: detectFiber(combined),
+    wavelengths: detectWavelengths(combined),
+    connector: detectConnector(combined),
+    standardName: detectStandard(combined),
+  };
+}
+
+/**
+ * Reads FLEXOPTIX_DETAIL_LIMIT, clamped to 1..1000. A non-numeric value falls
+ * back to 300 instead of passing NaN to the LIMIT parameter.
+ */
+function resolveLimit(): number {
+  const parsed = parseInt(process.env["FLEXOPTIX_DETAIL_LIMIT"] || "300", 10);
+  return Number.isFinite(parsed) ? Math.max(1, Math.min(1000, parsed)) : 300;
+}
+
+/**
+ * Fetches each selected Flexoptix product page, fills empty detail columns,
+ * and records image, price and details evidence. A failure on one row is
+ * logged and counted; it does not stop the run.
+ */
+async function run(): Promise<void> {
+  const vendorId = await ensureVendor("Flexoptix", "reseller", "https://www.flexoptix.net", "https://www.flexoptix.net/en/");
+  const limit = resolveLimit();
+  const onlyMissing = process.env["FLEXOPTIX_DETAIL_ONLY_MISSING"] !== "false";
+
+  const rows = await pool.query(
+    `
+    SELECT t.id, t.part_number, t.product_page_url
+    FROM transceivers t
+    JOIN vendors v ON v.id = t.vendor_id
+    WHERE UPPER(v.name) LIKE '%FLEXOPTIX%'
+      AND t.product_page_url IS NOT NULL
+      AND t.product_page_url != ''
+      AND t.product_page_url LIKE 'https://www.flexoptix.net/%'
+      AND t.product_page_url LIKE '%.html%'
+      AND (
+        $2::boolean = false
+        OR t.price_verified = false OR t.price_verified IS NULL
+        OR t.image_verified = false OR t.image_verified IS NULL
+        OR t.details_verified = false OR t.details_verified IS NULL
+        OR t.reach_label IS NULL OR t.reach_label = ''
+        OR t.fiber_type IS NULL OR t.fiber_type = ''
+        OR t.wavelengths IS NULL OR t.wavelengths = ''
+      )
+    ORDER BY t.price_verified DESC, t.image_verified DESC, t.details_verified ASC, t.part_number
+    LIMIT $1
+    `,
+    [limit, onlyMissing]
+  );
+
+  console.log(`=== Flexoptix detail verifier: ${rows.rows.length} products ===`);
+
+  let fetched = 0;
+  let failed = 0;
+  let prices = 0;
+  let images = 0;
+  let details = 0;
+
+  for (const row of rows.rows) {
+    try {
+      const resp = await fetch(row.product_page_url, {
+        headers: HEADERS,
+        signal: AbortSignal.timeout(20000),
+      });
+      if (!resp.ok) {
+        failed++;
+        console.warn(` ${row.part_number}: HTTP ${resp.status}`);
+        continue;
+      }
+
+      const html = await resp.text();
+      const patch = parseDetail(html);
+
+      // Only empty columns are filled; existing values are never overwritten.
+      await pool.query(
+        `
+        UPDATE transceivers
+        SET reach_label = COALESCE(NULLIF(reach_label, ''), $2),
+            reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($3, reach_meters) ELSE reach_meters END,
+            fiber_type = COALESCE(NULLIF(fiber_type, ''), $4),
+            wavelengths = COALESCE(NULLIF(wavelengths, ''), $5),
+            connector = COALESCE(NULLIF(connector, ''), $6),
+            standard_name = COALESCE(NULLIF(standard_name, ''), $7),
+            product_page_url = COALESCE(NULLIF(product_page_url, ''), $8),
+            updated_at = NOW()
+        WHERE id = $1
+        `,
+        [
+          row.id,
+          patch.reachLabel || null,
+          patch.reachMeters ?? null,
+          patch.fiberType || null,
+          patch.wavelengths || null,
+          patch.connector || null,
+          patch.standardName || null,
+          row.product_page_url,
+        ]
+      );
+
+      if (patch.imageUrl) {
+        const marked = await markImageVerified(row.id, patch.imageUrl);
+        if (marked) images++;
+      }
+
+      if (patch.price) {
+        const updated = await upsertPriceObservation({
+          transceiverId: row.id,
+          sourceVendorId: vendorId,
+          price: patch.price,
+          currency: patch.currency,
+          // NOTE(review): availability is not parsed from the page; confirm this default.
+          stockLevel: "in_stock",
+          url: row.product_page_url,
+          contentHash: contentHash({ price: patch.price, part: row.part_number }),
+        });
+        if (updated) prices++;
+      }
+
+      const verified = await markDetailsVerified({
+        transceiverId: row.id,
+        sourceUrl: row.product_page_url,
+      });
+      if (verified) details++;
+      fetched++;
+
+      if (fetched % 25 === 0) {
+        console.log(` processed ${fetched}/${rows.rows.length}`);
+      }
+    } catch (error: unknown) {
+      failed++;
+      // Thrown values are not guaranteed to be Error instances.
+      const message = error instanceof Error ? error.message : String(error);
+      console.warn(` ${row.part_number}: ${message.slice(0, 100)}`);
+    } finally {
+      // Pace every request, including rows skipped via `continue` on HTTP errors.
+      await sleep(800);
+    }
+  }
+
+  console.log(`=== Flexoptix detail verifier complete: fetched=${fetched}, failed=${failed}, newPrices=${prices}, imagesMarked=${images}, detailsMarked=${details} ===`);
+}
+
+if (require.main === module) {
+  run()
+    .then(() => pool.end())
+    .catch(async (error: unknown) => {
+      console.error("Fatal:", error);
+      // Let the pool close before exiting; ignore a second-close error.
+      await pool.end().catch(() => undefined);
+      process.exit(1);
+    });
+}
diff --git a/sync/CURRENT.md b/sync/CURRENT.md index
06bb1ad..8332aa8 100644 --- a/sync/CURRENT.md +++ b/sync/CURRENT.md @@ -1,9 +1,87 @@ # Current TIP Sync State -Updated: 2026-05-09 06:15 UTC +Updated: 2026-05-09 07:34 UTC ## Newest Work +- TIP Flexoptix completion push on 2026-05-09: + - operator said "feuer frei" after confirming Flexoptix was not yet complete + - TIPLLM training pool was updated immediately with the truth rule: + - all Flexoptix products are not complete + - active catalog coverage must be separated from historical/extra DB rows + - never claim 100% verification without exact counters and fresh source timestamps + - code improved: + - `packages/scraper/src/scrapers/flexoptix-catalog.ts` + - generic reach parsing now handles values such as `50 m`, `1,000 m`, decimal/range forms + - wavelength parsing now handles multiple `λ... nm` values + - product URL is now passed into `findOrCreateScrapedTransceiver` + - `packages/scraper/src/scrapers/flexoptix-detail-pages.ts` + - new targeted Flexoptix detail-page verifier + - fetches only Flexoptix `.html` product pages with missing price/image/detail fields + - parses static product page metadata: + - title + - description + - `og:image` + - `product:price:amount` + - reach + - fiber type + - wavelengths + - connector + - standard name + - writes only DB evidence from Flexoptix pages, no external AI + - live run results on Erik: + - `pnpm -C packages/scraper build` passed + - improved catalog run completed: + - `Total unique products after GraphQL: 615` + - `Flexoptix Catalog Complete: 615 products, 0 prices` + - details improved from: + - `details_verified: 500` + - `price+image+details: 496` + - `fully_verified: 496` + - after catalog parser improvement: + - `details_verified: 606` + - `price+image+details: 602` + - `fully_verified: 602` + - detail verifier run: + - target: `191` real `.html` product pages + - fetched: `191` + - failed: `0` + - new/updated price observations: `177` + - images marked: `187` + - details marked: `185` + - after detail 
verifier and explicit BiDi correction: + - total Flexoptix rows: `744` + - HTML product-like rows: `626` + - price verified: `626` + - image verified: `622` + - details verified: `624` + - price+image+details verified: `620` + - fully verified: `620` + - filter/category rows with no verification: `108` + - other non-product/generic rows with no verification: `10` + - manual evidence correction: + - four BiDi SFP products had `1,000 m` in the Flexoptix title + - updated from source evidence: + - `S.B1312.M.DIL` + - `S.B1312.M.DL` + - `S.B1512.M.DIL` + - `S.B1512.M.DL` + - set: + - `reach_label=1000m` + - `reach_meters=1000` + - `fiber_type=MMF` + - `details_verified=true` + - remaining truth: + - active/product-like Flexoptix rows are much closer to complete + - not all `744` Flexoptix rows can honestly be 100% verified because `118` are filter/category/generic/non-product URLs rather than concrete product pages + - remaining HTML product-like gaps observed before SSH became unavailable: + - `4` product-like rows without image verification + - `2` FLEXBOX/accessory-like rows without reach/details + - operational note: + - Erik SSH became unavailable with `connection refused` after the last verification checks + - public TIP HTTPS still responded through Cloudflare + - no further live commands were started after SSH refused + - TIP Flexoptix price truth recheck on 2026-05-09: - operator question: - are all Flexoptix prices, images and information present diff --git a/sync/history/2026-05-09-flexoptix-completion-push.md b/sync/history/2026-05-09-flexoptix-completion-push.md new file mode 100644 index 0000000..49c9285 --- /dev/null +++ b/sync/history/2026-05-09-flexoptix-completion-push.md @@ -0,0 +1,98 @@ +# Flexoptix Completion Push + +Date: 2026-05-09 + +## Goal + +Push Flexoptix products as far as possible toward complete automated verification without manually approving incomplete data. 
+ +## Code Changes + +- `packages/scraper/src/scrapers/flexoptix-catalog.ts` + - Added generic reach parsing for values such as `50 m`, `1,000 m`, decimal values, and ranges. + - Improved wavelength parsing for multiple `λ... nm` values. + - Passed `productUrl` into `findOrCreateScrapedTransceiver`. + +- `packages/scraper/src/scrapers/flexoptix-detail-pages.ts` + - Added a targeted Flexoptix detail-page verifier. + - Fetches only real Flexoptix `.html` product pages with missing signals. + - Parses static product page evidence: + - title + - description + - `og:image` + - `product:price:amount` + - reach + - fiber type + - wavelengths + - connector + - standard name + +## Live Runs + +- Built on Erik: + - `pnpm -C packages/scraper build` +- Ran improved Flexoptix catalog scraper: + - `Total unique products after GraphQL: 615` + - `Flexoptix Catalog Complete: 615 products, 0 prices` +- Ran detail-page verifier: + - target: `191` + - fetched: `191` + - failed: `0` + - new/updated price observations: `177` + - images marked: `187` + - details marked: `185` + +## Verification Improvement + +Before the completion push: + +- details verified: `500` +- price + image + details verified: `496` +- fully verified: `496` + +After catalog parser improvement: + +- details verified: `606` +- price + image + details verified: `602` +- fully verified: `602` + +After detail verifier and explicit BiDi correction: + +- total Flexoptix rows: `744` +- HTML product-like rows: `626` +- price verified: `626` +- image verified: `622` +- details verified: `624` +- price + image + details verified: `620` +- fully verified: `620` +- filter/category rows with no verification: `108` +- other non-product/generic rows with no verification: `10` + +## Source Evidence Correction + +Four BiDi SFP products had `1,000 m` in the Flexoptix page title. 
They were corrected from Flexoptix source evidence: + +- `S.B1312.M.DIL` +- `S.B1312.M.DL` +- `S.B1512.M.DIL` +- `S.B1512.M.DL` + +Set: + +- `reach_label=1000m` +- `reach_meters=1000` +- `fiber_type=MMF` +- `details_verified=true` + +## Remaining Truth + +Do not claim all `744` Flexoptix rows are complete. The remaining unverified rows are mostly filter/category/generic URLs rather than concrete product pages. + +Remaining product-like gaps observed before SSH became unavailable: + +- `4` product-like rows without image verification +- `2` FLEXBOX/accessory-like rows without reach/details + +## Operational Note + +After the last verification checks, SSH to Erik returned `connection refused`. Public TIP HTTPS still responded through Cloudflare. No further live commands were started after SSH refused.