diff --git a/packages/scraper/src/scrapers/fs-com.ts b/packages/scraper/src/scrapers/fs-com.ts index d00b17b..72532c2 100644 --- a/packages/scraper/src/scrapers/fs-com.ts +++ b/packages/scraper/src/scrapers/fs-com.ts @@ -68,8 +68,10 @@ import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater"; // FS.com German locale uses www.fs.com/de/ (de.fs.com DNS does not exist) const BASE_URL = "https://www.fs.com/de"; const MAX_PAGES_PER_CATEGORY = 10; -const MAX_DETAIL_PAGES_PER_RUN = 300; -const STOCK_FRESH_HOURS = 12; +const MAX_DETAIL_PAGES_PER_RUN = parseInt(process.env["FS_MAX_DETAIL_PAGES_PER_RUN"] ?? "300", 10); +const STOCK_FRESH_HOURS = parseInt(process.env["FS_STOCK_FRESH_HOURS"] ?? "12", 10); +const FORCE_REVALIDATE = process.env["TIP_FORCE_REVALIDATE"] === "1"; +const ONLY_MISSING_IMAGES = process.env["FS_ONLY_MISSING_IMAGES"] === "1"; const PROXY_URLS = (process.env["PROXY_URLS"] ?? "") .split(",") @@ -81,6 +83,12 @@ function buildProxyConfiguration(): ProxyConfiguration | undefined { return new ProxyConfiguration({ proxyUrls: PROXY_URLS }); } +function normalizeFsProductUrl(url: string): string { + return url.replace(/^https:\/\/www\.fs\.com\/de\/de\//, "https://www.fs.com/de/") + .replace(/\?.*$/, "") + .replace(/\/$/, ""); +} + const CATEGORY_URLS = [ "/c/1g-sfp-81", "/c/10g-sfp-63", @@ -524,12 +532,33 @@ async function scrapeProductDetails( } } - const imgEl = document.querySelector( - ".product-image img, .prod-img img, .product-gallery img, " + + const imageCandidates = Array.from(document.querySelectorAll( + ".big_img_box img, img.big_img, .big_img_m_active, .big_img_m, " + + ".small_img_active img, .product-image img, .prod-img img, .product-gallery img, " + '[class*="main-image"] img, [class*="primary-image"] img, ' + ".slick-current img, .product__image img" - ); - const imageUrl = imgEl?.src ?? imgEl?.getAttribute("data-src") ?? 
""; + )).map((img) => { + const url = + img.currentSrc || + img.getAttribute("data-src") || + img.getAttribute("data-original") || + img.getAttribute("data-lazy") || + img.getAttribute("src") || + ""; + const cls = `${img.className || ""} ${img.parentElement?.className || ""}`; + const score = + /big_img_box|big_img|big_img_m_active/.test(cls) ? 100 : + /small_img_active/.test(cls) ? 50 : + 10; + return { url, score, w: img.naturalWidth || 0, h: img.naturalHeight || 0 }; + }).filter((candidate) => + candidate.url && + /resource\.fs\.com/.test(candidate.url) && + !/default\.jpg|generalImg|logo|icon|svg/i.test(candidate.url) && + (candidate.w === 0 || candidate.w >= 120) && + (candidate.h === 0 || candidate.h >= 120) + ).sort((a, b) => b.score - a.score || (b.w * b.h) - (a.w * a.h)); + const imageUrl = imageCandidates[0]?.url ?? ""; const dsEl = document.querySelector( 'a[href*="datasheet"], a[href*=".pdf"], a[download][href*=".pdf"]' @@ -759,7 +788,7 @@ export async function scrapeFs(): Promise { // ── Filter: skip products with fresh stock data ───────────────────────────── const allPartNumbers = [...productMap.values()].map((p) => p.partNumber).filter(Boolean); const freshlyScraped = new Set(); - if (allPartNumbers.length > 0) { + if (!FORCE_REVALIDATE && allPartNumbers.length > 0) { const freshResult = await pool.query( `SELECT DISTINCT t.part_number FROM transceivers t @@ -772,12 +801,37 @@ export async function scrapeFs(): Promise { for (const row of freshResult.rows) freshlyScraped.add(row.part_number as string); } + let missingImageUrls = new Set(); + if (ONLY_MISSING_IMAGES) { + const missingResult = await pool.query( + `SELECT DISTINCT product_page_url + FROM transceivers t + JOIN vendors v ON v.id = t.vendor_id + WHERE v.name = 'FS.COM' + AND COALESCE(t.image_verified, false) = false + AND product_page_url LIKE '%/products/%'` + ); + missingImageUrls = new Set( + missingResult.rows + .map((row) => normalizeFsProductUrl(row.product_page_url as string)) + 
.filter(Boolean) + ); + } + const urlsToScrape = [...productMap.keys()] .filter((url) => !freshlyScraped.has(productMap.get(url)?.partNumber ?? "")) + .filter((url) => !ONLY_MISSING_IMAGES || missingImageUrls.has(normalizeFsProductUrl(url))) .slice(0, MAX_DETAIL_PAGES_PER_RUN); console.log(`\n[Phase 2] Scraping ${urlsToScrape.length} detail pages`); - console.log(` (${productMap.size - urlsToScrape.length} skipped — data ≤${STOCK_FRESH_HOURS}h fresh)`); + console.log( + ONLY_MISSING_IMAGES + ? ` (${missingImageUrls.size} DB product URLs missing images; ${productMap.size - urlsToScrape.length} skipped by targeted image filter)` + : + FORCE_REVALIDATE + ? ` (${productMap.size - urlsToScrape.length} skipped — max detail cap ${MAX_DETAIL_PAGES_PER_RUN})` + : ` (${productMap.size - urlsToScrape.length} skipped — data ≤${STOCK_FRESH_HOURS}h fresh)` + ); if (urlsToScrape.length === 0) { console.log("[Phase 2] All products have fresh stock data — nothing to scrape."); diff --git a/packages/scraper/src/utils/db.ts b/packages/scraper/src/utils/db.ts index bc84bd6..9778925 100644 --- a/packages/scraper/src/utils/db.ts +++ b/packages/scraper/src/utils/db.ts @@ -46,12 +46,18 @@ export async function markImageVerified( ): Promise { const result = await pool.query( `UPDATE transceivers - SET image_url = COALESCE(NULLIF(image_url, ''), $2::text), + SET image_url = CASE + WHEN image_url IS NULL + OR image_url = '' + OR image_url ~* '(placeholder|no-image|no_image|missing|default)' + THEN $2::text + ELSE image_url + END, has_image = true, image_verified = true, - image_verified_at = COALESCE(image_verified_at, NOW()), - image_verified_url = COALESCE(NULLIF(image_verified_url, ''), $2::text), - image_scraped_at = COALESCE(image_scraped_at, NOW()), + image_verified_at = NOW(), + image_verified_url = $2::text, + image_scraped_at = NOW(), updated_at = NOW() WHERE id = $1 AND $2::text IS NOT NULL @@ -178,9 +184,20 @@ export async function upsertPriceObservation(params: { if 
(existing.rows.length > 0 && existing.rows[0].content_hash === params.contentHash && !isStale) { // Price unchanged and recent — still ensure verified flags are current + await pool.query( + `UPDATE price_observations + SET is_verified = true, + verified_at = COALESCE(verified_at, NOW()) + WHERE transceiver_id = $1 + AND source_vendor_id = $2 + AND content_hash = $3 + AND time > NOW() - INTERVAL '${REFRESH_DAYS} days'`, + [params.transceiverId, params.sourceVendorId, params.contentHash] + ); await pool.query( `UPDATE transceivers SET - price_verified = true + price_verified = true, + price_verified_at = COALESCE(price_verified_at, NOW()) ${isCompetitor ? ", competitor_verified = true, competitor_verified_at = COALESCE(competitor_verified_at, NOW())" : ""} WHERE id = $1 AND (price_verified IS NULL OR price_verified = false OR ${isCompetitor ? "competitor_verified IS NULL OR competitor_verified = false" : "false"})`, [params.transceiverId] @@ -190,8 +207,11 @@ export async function upsertPriceObservation(params: { } await pool.query( - `INSERT INTO price_observations (time, transceiver_id, source_vendor_id, price, currency, stock_level, quantity_available, lead_time_days, url, content_hash) - VALUES (NOW(), $1, $2, $3, $4, $5, $6, $7, $8, $9)`, + `INSERT INTO price_observations ( + time, transceiver_id, source_vendor_id, price, currency, stock_level, + quantity_available, lead_time_days, url, content_hash, is_verified, verified_at + ) + VALUES (NOW(), $1, $2, $3, $4, $5, $6, $7, $8, $9, true, NOW())`, [ params.transceiverId, params.sourceVendorId, @@ -210,6 +230,7 @@ export async function upsertPriceObservation(params: { await pool.query( `UPDATE transceivers SET price_verified = true, + price_verified_at = COALESCE(price_verified_at, NOW()), competitor_verified = true, competitor_verified_at = COALESCE(competitor_verified_at, NOW()) WHERE id = $1`, @@ -217,7 +238,11 @@ export async function upsertPriceObservation(params: { ); } else { await pool.query( - `UPDATE 
transceivers SET price_verified = true WHERE id = $1 AND (price_verified IS NULL OR price_verified = false)`, + `UPDATE transceivers + SET price_verified = true, + price_verified_at = COALESCE(price_verified_at, NOW()) + WHERE id = $1 + AND (price_verified IS NULL OR price_verified = false OR price_verified_at IS NULL)`, [params.transceiverId] ); } @@ -384,8 +409,8 @@ export async function findOrCreateScrapedTransceiver(params: { ] ); - // Update image_url, has_image and image_verified if we have a new image for a record without one - if (params.imageUrl && !existing.rows[0].image_url) { + // Re-validate image metadata whenever the scraper sees a current product image. + if (params.imageUrl) { await markImageVerified(existing.rows[0].id, params.imageUrl); } if (params.productUrl) { diff --git a/sync/CURRENT.md b/sync/CURRENT.md index 41cbdc1..25ea39e 100644 --- a/sync/CURRENT.md +++ b/sync/CURRENT.md @@ -1,9 +1,67 @@ # Current TIP Sync State -Updated: 2026-05-09 02:05 UTC +Updated: 2026-05-09 03:15 UTC ## Newest Work +- TIP Flexoptix + FS.com price/image revalidation completed on 2026-05-09: + - live root cause: + - scraper runs had set `transceivers.price_verified`, but `price_observations.is_verified` stayed false + - FS.com product image selector was stale and missed current `.big_img` / `.big_img_m` product images + - code fixed: + - `packages/scraper/src/utils/db.ts` + - new/fresh unchanged price observations now get `is_verified = true` and `verified_at` + - `price_verified_at` is filled in when price verification is confirmed; `COALESCE` keeps an existing timestamp rather than refreshing it + - image verification now refreshes `image_verified_at`, `image_verified_url`, and `image_scraped_at` + - existing records revalidate images whenever current scraper output contains an image URL + - `packages/scraper/src/scrapers/fs-com.ts` + - added `TIP_FORCE_REVALIDATE` + - added `FS_MAX_DETAIL_PAGES_PER_RUN` + - added `FS_ONLY_MISSING_IMAGES` + - updated FS.com image extraction to prefer current `resource.fs.com` product images 
from `.big_img_box`, `img.big_img`, `.big_img_m_active`, `.big_img_m`, `.small_img_active` + - rejects default/logo/general/icon/SVG image URLs + - live runs on Erik: + - `pnpm -C packages/scraper build` passed on `/opt/tip` + - Flexoptix catalog revalidation: + - 615 products processed + - 615 Flexoptix price observations marked verified + - 605 Flexoptix images verified in the run window + - FS.com full force revalidation: + - 270 products discovered + - 270 detail pages scraped + - 0 failed detail requests + - 17 new price observations in first full pass + - 266 FS.com price observations marked verified after first pass + - FS.com targeted missing-image revalidation: + - 99 detail pages scraped + - 0 failed detail requests + - FS.com image-verified products increased from 207 to 299 + - FS.com verified price observations increased to 271 after targeted pass + - final checked counters: + - Flexoptix: + - products: 744 + - product price_verified: 619 + - product image_verified: 615 + - price observation rows: 1288 + - verified price observation rows: 615 + - FS.COM: + - products: 383 + - product price_verified: 379 + - product image_verified: 299 + - price observation rows: 818 + - verified price observation rows: 271 + - operations: + - `tip-scraper-daemon` restarted and is online + - Erik remained stable; final load was about `2.16, 2.22, 2.47` + - CT115 / `tip-scraper` SSH did not respond quickly from this session, so it was not used + - TIPLLM training pool: + - `/tmp/tip-training-data` was recloned from Gitea + - crawler experience was written to: + - `robot-experiences/2026-05-09.jsonl` + - `qa-pairs/robot-control-high.jsonl` + - pushed to Gitea commit: + - `850083f crawl: add flexoptix fs revalidation learning record` + - MAGATAMA dashboard truthfulness / UX hardening on 2026-05-09: - live `api/llm/status` on MAGATAMA now publicly confirms the corrected `magatamallm` lane counts: - `15679` train / collected diff --git 
a/sync/history/2026-05-09-flexoptix-fs-price-image-revalidation.md b/sync/history/2026-05-09-flexoptix-fs-price-image-revalidation.md new file mode 100644 index 0000000..108c691 --- /dev/null +++ b/sync/history/2026-05-09-flexoptix-fs-price-image-revalidation.md @@ -0,0 +1,116 @@ +# 2026-05-09 Flexoptix + FS.com Price/Image Revalidation + +## Request + +Rene reported that many TIP prices, especially Flexoptix prices, were wrong and asked for all Flexoptix and FS.com prices to be fully revalidated and images checked. + +Standing constraints were preserved: + +- TIP crawler/robot planning and extraction feedback stays TIPLLM-only. +- No external AI was used for crawler planning or extraction feedback. +- Erik must not be overloaded. +- Robot/crawler experiences must be written into the Gitea-backed TIPLLM training pool. +- Work status must be written back to `sync/`. + +## Root Cause + +Two concrete issues were found: + +1. `upsertPriceObservation` marked `transceivers.price_verified`, but inserted price rows did not set `price_observations.is_verified` or `verified_at`. +2. FS.com image extraction still used older selectors. Current FS.com product pages expose product images under `.big_img_box`, `img.big_img`, `.big_img_m_active`, `.big_img_m`, and `.small_img_active`, usually from `resource.fs.com/mall/mainImg/...`. + +## Code Changed + +- `packages/scraper/src/utils/db.ts` + - Price observations now set `is_verified = true` and `verified_at` for new observations. + - Fresh unchanged observations are backfilled to verified. + - `price_verified_at` is written together with `price_verified` via `COALESCE`, so an existing timestamp is kept rather than refreshed. + - Image verification now refreshes `image_verified_at`, `image_verified_url`, and `image_scraped_at`. + - Existing transceivers now call `markImageVerified` whenever a scraper provides an image URL. + +- `packages/scraper/src/scrapers/fs-com.ts` + - Added `TIP_FORCE_REVALIDATE`. + - Added `FS_MAX_DETAIL_PAGES_PER_RUN`. + - Added `FS_ONLY_MISSING_IMAGES`. 
+ - Added URL normalization for FS.com product URLs. + - Updated image extraction to prefer current product image DOM and reject default/logo/general/icon/SVG URLs. + +## Live Runs + +All runs were executed sequentially and rate-limited on Erik after CT115 / `tip-scraper` SSH did not respond quickly enough from this session. + +Build: + +```bash +pnpm -C packages/scraper build +``` + +Result: passed on `/opt/tip`. + +Flexoptix: + +- 615 products processed. +- 615 Flexoptix price observation rows marked verified. +- 605 Flexoptix images verified in the run window. + +FS.com full force revalidation: + +- 270 products discovered. +- 270 detail pages scraped. +- 0 failed detail requests. +- 17 new price observations. +- 266 FS.com price observations verified after the pass. + +FS.com targeted missing-image pass: + +- 99 DB product URLs without images matched current category listings. +- 99 detail pages scraped. +- 0 failed detail requests. +- FS.com image-verified products increased from 207 to 299. +- FS.com verified price observations increased to 271. + +## Final Counters + +Flexoptix: + +- products: 744 +- product price_verified: 619 +- product image_verified: 615 +- price observation rows: 1288 +- verified price observation rows: 615 + +FS.COM: + +- products: 383 +- product price_verified: 379 +- product image_verified: 299 +- price observation rows: 818 +- verified price observation rows: 271 + +## Operations + +- `tip-scraper-daemon` restarted and is online. +- `tip-api` remained online. +- Erik remained stable; final load around `2.16, 2.22, 2.47`. +- External dashboard health curl failed once from local DNS resolution, while PM2 and DB checks were healthy. + +## TIPLLM Training Pool + +The local clone `/tmp/tip-training-data` was recreated from Gitea. 
+ +New records were written to: + +- `robot-experiences/2026-05-09.jsonl` +- `qa-pairs/robot-control-high.jsonl` + +Pushed to Gitea: + +```text +850083f crawl: add flexoptix fs revalidation learning record +``` + +## Follow-Up + +- FS.com still has 84 products without `image_verified`; 67 of those had no usable `/products/` URL in the current DB snapshot or were not found in current category listings. +- A future robot wave should specifically reconcile FS.com rows with blank/missing `product_page_url`. +- For future heavy FS.com work, prefer CT115/Proxmox/Pi once SSH reachability is confirmed; Erik should remain the controller or slow emergency runner only.