fix: revalidate flexoptix fs prices and images

This commit is contained in:
Rene Fichtmueller 2026-05-09 05:13:37 +02:00
parent fd29bee5cb
commit ef225c7dc5
4 changed files with 272 additions and 19 deletions

View File

@ -68,8 +68,10 @@ import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";
// FS.com German locale uses www.fs.com/de/ (de.fs.com DNS does not exist)
const BASE_URL = "https://www.fs.com/de";
const MAX_PAGES_PER_CATEGORY = 10;
const MAX_DETAIL_PAGES_PER_RUN = 300;
const STOCK_FRESH_HOURS = 12;
/**
 * Reads a non-negative integer from an environment variable.
 *
 * Falls back to `fallback` when the variable is unset, blank, not a number,
 * or negative. A bare `parseInt` would yield `NaN` for such input, and
 * `NaN` used as a slice bound silently produces an empty list.
 *
 * @param name - Environment variable name.
 * @param fallback - Value returned when the variable is missing or invalid.
 * @returns The parsed integer, or `fallback`.
 */
function readNonNegativeIntEnv(name: string, fallback: number): number {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === "") return fallback;
  const parsed = Number.parseInt(raw, 10);
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback;
}

// Upper bound on the number of detail-page URLs scraped in a single run.
const MAX_DETAIL_PAGES_PER_RUN = readNonNegativeIntEnv("FS_MAX_DETAIL_PAGES_PER_RUN", 300);
// Freshness window, in hours, for skipping products whose stock data is still recent.
const STOCK_FRESH_HOURS = readNonNegativeIntEnv("FS_STOCK_FRESH_HOURS", 12);
const FORCE_REVALIDATE = process.env["TIP_FORCE_REVALIDATE"] === "1";
const ONLY_MISSING_IMAGES = process.env["FS_ONLY_MISSING_IMAGES"] === "1";
const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
.split(",")
@ -81,6 +83,12 @@ function buildProxyConfiguration(): ProxyConfiguration | undefined {
return new ProxyConfiguration({ proxyUrls: PROXY_URLS });
}
/**
 * Canonicalizes an FS.com product URL so that URLs from different sources
 * (category listings vs. stored `product_page_url` values) compare equal.
 *
 * Steps, in order:
 *  1. collapse a duplicated `/de/de/` locale prefix into `/de/`
 *  2. drop the query string and/or `#fragment`
 *  3. remove a single trailing slash
 *
 * @param url - Product URL to normalize.
 * @returns The normalized URL; an empty string stays empty.
 */
function normalizeFsProductUrl(url: string): string {
  return url
    .replace(/^https:\/\/www\.fs\.com\/de\/de\//, "https://www.fs.com/de/")
    // Strip from the first `?` or `#`: a fragment-only URL would otherwise
    // keep its `#...` suffix and never match its query-less counterpart.
    .replace(/[?#].*$/, "")
    // Runs last so a slash exposed by the previous step is removed too.
    .replace(/\/$/, "");
}
const CATEGORY_URLS = [
"/c/1g-sfp-81",
"/c/10g-sfp-63",
@ -524,12 +532,33 @@ async function scrapeProductDetails(
}
}
const imgEl = document.querySelector<HTMLImageElement>(
".product-image img, .prod-img img, .product-gallery img, " +
const imageCandidates = Array.from(document.querySelectorAll<HTMLImageElement>(
".big_img_box img, img.big_img, .big_img_m_active, .big_img_m, " +
".small_img_active img, .product-image img, .prod-img img, .product-gallery img, " +
'[class*="main-image"] img, [class*="primary-image"] img, ' +
".slick-current img, .product__image img"
);
const imageUrl = imgEl?.src ?? imgEl?.getAttribute("data-src") ?? "";
)).map((img) => {
const url =
img.currentSrc ||
img.getAttribute("data-src") ||
img.getAttribute("data-original") ||
img.getAttribute("data-lazy") ||
img.getAttribute("src") ||
"";
const cls = `${img.className || ""} ${img.parentElement?.className || ""}`;
const score =
/big_img_box|big_img|big_img_m_active/.test(cls) ? 100 :
/small_img_active/.test(cls) ? 50 :
10;
return { url, score, w: img.naturalWidth || 0, h: img.naturalHeight || 0 };
}).filter((candidate) =>
candidate.url &&
/resource\.fs\.com/.test(candidate.url) &&
!/default\.jpg|generalImg|logo|icon|svg/i.test(candidate.url) &&
(candidate.w === 0 || candidate.w >= 120) &&
(candidate.h === 0 || candidate.h >= 120)
).sort((a, b) => b.score - a.score || (b.w * b.h) - (a.w * a.h));
const imageUrl = imageCandidates[0]?.url ?? "";
const dsEl = document.querySelector<HTMLAnchorElement>(
'a[href*="datasheet"], a[href*=".pdf"], a[download][href*=".pdf"]'
@ -759,7 +788,7 @@ export async function scrapeFs(): Promise<void> {
// ── Filter: skip products with fresh stock data ─────────────────────────────
const allPartNumbers = [...productMap.values()].map((p) => p.partNumber).filter(Boolean);
const freshlyScraped = new Set<string>();
if (allPartNumbers.length > 0) {
if (!FORCE_REVALIDATE && allPartNumbers.length > 0) {
const freshResult = await pool.query(
`SELECT DISTINCT t.part_number
FROM transceivers t
@ -772,12 +801,37 @@ export async function scrapeFs(): Promise<void> {
for (const row of freshResult.rows) freshlyScraped.add(row.part_number as string);
}
let missingImageUrls = new Set<string>();
if (ONLY_MISSING_IMAGES) {
const missingResult = await pool.query(
`SELECT DISTINCT product_page_url
FROM transceivers t
JOIN vendors v ON v.id = t.vendor_id
WHERE v.name = 'FS.COM'
AND COALESCE(t.image_verified, false) = false
AND product_page_url LIKE '%/products/%'`
);
missingImageUrls = new Set(
missingResult.rows
.map((row) => normalizeFsProductUrl(row.product_page_url as string))
.filter(Boolean)
);
}
const urlsToScrape = [...productMap.keys()]
.filter((url) => !freshlyScraped.has(productMap.get(url)?.partNumber ?? ""))
.filter((url) => !ONLY_MISSING_IMAGES || missingImageUrls.has(normalizeFsProductUrl(url)))
.slice(0, MAX_DETAIL_PAGES_PER_RUN);
console.log(`\n[Phase 2] Scraping ${urlsToScrape.length} detail pages`);
console.log(` (${productMap.size - urlsToScrape.length} skipped — data ≤${STOCK_FRESH_HOURS}h fresh)`);
console.log(
ONLY_MISSING_IMAGES
? ` (${missingImageUrls.size} DB product URLs missing images; ${productMap.size - urlsToScrape.length} skipped by targeted image filter)`
:
FORCE_REVALIDATE
? ` (${productMap.size - urlsToScrape.length} skipped — max detail cap ${MAX_DETAIL_PAGES_PER_RUN})`
: ` (${productMap.size - urlsToScrape.length} skipped — data ≤${STOCK_FRESH_HOURS}h fresh)`
);
if (urlsToScrape.length === 0) {
console.log("[Phase 2] All products have fresh stock data — nothing to scrape.");

View File

@ -46,12 +46,18 @@ export async function markImageVerified(
): Promise<boolean> {
const result = await pool.query(
`UPDATE transceivers
SET image_url = COALESCE(NULLIF(image_url, ''), $2::text),
SET image_url = CASE
WHEN image_url IS NULL
OR image_url = ''
OR image_url ~* '(placeholder|no-image|no_image|missing|default)'
THEN $2::text
ELSE image_url
END,
has_image = true,
image_verified = true,
image_verified_at = COALESCE(image_verified_at, NOW()),
image_verified_url = COALESCE(NULLIF(image_verified_url, ''), $2::text),
image_scraped_at = COALESCE(image_scraped_at, NOW()),
image_verified_at = NOW(),
image_verified_url = $2::text,
image_scraped_at = NOW(),
updated_at = NOW()
WHERE id = $1
AND $2::text IS NOT NULL
@ -178,9 +184,20 @@ export async function upsertPriceObservation(params: {
if (existing.rows.length > 0 && existing.rows[0].content_hash === params.contentHash && !isStale) {
// Price unchanged and recent — still ensure verified flags are current
await pool.query(
`UPDATE price_observations
SET is_verified = true,
verified_at = COALESCE(verified_at, NOW())
WHERE transceiver_id = $1
AND source_vendor_id = $2
AND content_hash = $3
AND time > NOW() - INTERVAL '${REFRESH_DAYS} days'`,
[params.transceiverId, params.sourceVendorId, params.contentHash]
);
await pool.query(
`UPDATE transceivers SET
price_verified = true
price_verified = true,
price_verified_at = COALESCE(price_verified_at, NOW())
${isCompetitor ? ", competitor_verified = true, competitor_verified_at = COALESCE(competitor_verified_at, NOW())" : ""}
WHERE id = $1 AND (price_verified IS NULL OR price_verified = false OR ${isCompetitor ? "competitor_verified IS NULL OR competitor_verified = false" : "false"})`,
[params.transceiverId]
@ -190,8 +207,11 @@ export async function upsertPriceObservation(params: {
}
await pool.query(
`INSERT INTO price_observations (time, transceiver_id, source_vendor_id, price, currency, stock_level, quantity_available, lead_time_days, url, content_hash)
VALUES (NOW(), $1, $2, $3, $4, $5, $6, $7, $8, $9)`,
`INSERT INTO price_observations (
time, transceiver_id, source_vendor_id, price, currency, stock_level,
quantity_available, lead_time_days, url, content_hash, is_verified, verified_at
)
VALUES (NOW(), $1, $2, $3, $4, $5, $6, $7, $8, $9, true, NOW())`,
[
params.transceiverId,
params.sourceVendorId,
@ -210,6 +230,7 @@ export async function upsertPriceObservation(params: {
await pool.query(
`UPDATE transceivers SET
price_verified = true,
price_verified_at = COALESCE(price_verified_at, NOW()),
competitor_verified = true,
competitor_verified_at = COALESCE(competitor_verified_at, NOW())
WHERE id = $1`,
@ -217,7 +238,11 @@ export async function upsertPriceObservation(params: {
);
} else {
await pool.query(
`UPDATE transceivers SET price_verified = true WHERE id = $1 AND (price_verified IS NULL OR price_verified = false)`,
`UPDATE transceivers
SET price_verified = true,
price_verified_at = COALESCE(price_verified_at, NOW())
WHERE id = $1
AND (price_verified IS NULL OR price_verified = false OR price_verified_at IS NULL)`,
[params.transceiverId]
);
}
@ -384,8 +409,8 @@ export async function findOrCreateScrapedTransceiver(params: {
]
);
// Update image_url, has_image and image_verified if we have a new image for a record without one
if (params.imageUrl && !existing.rows[0].image_url) {
// Re-validate image metadata whenever the scraper sees a current product image.
if (params.imageUrl) {
await markImageVerified(existing.rows[0].id, params.imageUrl);
}
if (params.productUrl) {

View File

@ -1,9 +1,67 @@
# Current TIP Sync State
Updated: 2026-05-09 02:05 UTC
Updated: 2026-05-09 03:15 UTC
## Newest Work
- TIP Flexoptix + FS.com price/image revalidation completed on 2026-05-09:
- live root cause:
- scraper runs had set `transceivers.price_verified`, but `price_observations.is_verified` stayed false
- FS.com product image selector was stale and missed current `.big_img` / `.big_img_m` product images
- code fixed:
- `packages/scraper/src/utils/db.ts`
- new/fresh unchanged price observations now get `is_verified = true` and `verified_at`
- `price_verified_at` is set when price verification is first confirmed (an existing timestamp is kept)
- image verification now refreshes `image_verified_at`, `image_verified_url`, and `image_scraped_at`
- existing records revalidate images whenever current scraper output contains an image URL
- `packages/scraper/src/scrapers/fs-com.ts`
- added `TIP_FORCE_REVALIDATE`
- added `FS_MAX_DETAIL_PAGES_PER_RUN`
- added `FS_ONLY_MISSING_IMAGES`
- updated FS.com image extraction to prefer current `resource.fs.com` product images from `.big_img_box`, `img.big_img`, `.big_img_m_active`, `.big_img_m`, `.small_img_active`
- rejects default/logo/general/icon/SVG image URLs
- live runs on Erik:
- `pnpm -C packages/scraper build` passed on `/opt/tip`
- Flexoptix catalog revalidation:
- 615 products processed
- 615 Flexoptix price observations marked verified
- 605 Flexoptix images verified in the run window
- FS.com full force revalidation:
- 270 products discovered
- 270 detail pages scraped
- 0 failed detail requests
- 17 new price observations in first full pass
- 266 FS.com price observations marked verified after first pass
- FS.com targeted missing-image revalidation:
- 99 detail pages scraped
- 0 failed detail requests
- FS.com image-verified products increased from 207 to 299
- FS.com verified price observations increased to 271 after targeted pass
- final checked counters:
- Flexoptix:
- products: 744
- product price_verified: 619
- product image_verified: 615
- price observation rows: 1288
- verified price observation rows: 615
- FS.COM:
- products: 383
- product price_verified: 379
- product image_verified: 299
- price observation rows: 818
- verified price observation rows: 271
- operations:
- `tip-scraper-daemon` restarted and is online
- Erik remained stable; final load was about `2.16, 2.22, 2.47`
- CT115 / `tip-scraper` SSH did not respond quickly from this session, so it was not used
- TIPLLM training pool:
- `/tmp/tip-training-data` was recloned from Gitea
- crawler experience was written to:
- `robot-experiences/2026-05-09.jsonl`
- `qa-pairs/robot-control-high.jsonl`
- pushed to Gitea commit:
- `850083f crawl: add flexoptix fs revalidation learning record`
- MAGATAMA dashboard truthfulness / UX hardening on 2026-05-09:
- live `api/llm/status` on MAGATAMA now publicly confirms the corrected `magatamallm` lane counts:
- `15679` train / collected

View File

@ -0,0 +1,116 @@
# 2026-05-09 Flexoptix + FS.com Price/Image Revalidation
## Request
Rene reported that many TIP prices, especially Flexoptix prices, were wrong and asked for all Flexoptix and FS.com prices to be fully revalidated and images checked.
Standing constraints were preserved:
- TIP crawler/robot planning and extraction feedback stays TIPLLM-only.
- No external AI was used for crawler planning or extraction feedback.
- Erik must not be overloaded.
- Robot/crawler experiences must be written into the Gitea-backed TIPLLM training pool.
- Work status must be written back to `sync/`.
## Root Cause
Two concrete issues were found:
1. `upsertPriceObservation` marked `transceivers.price_verified`, but inserted price rows did not set `price_observations.is_verified` or `verified_at`.
2. FS.com image extraction still used older selectors. Current FS.com product pages expose product images under `.big_img_box`, `img.big_img`, `.big_img_m_active`, `.big_img_m`, and `.small_img_active`, usually from `resource.fs.com/mall/mainImg/...`.
## Code Changed
- `packages/scraper/src/utils/db.ts`
- Price observations now set `is_verified = true` and `verified_at` for new observations.
- Fresh unchanged observations are backfilled to verified.
- `price_verified_at` is now populated when a price is verified; an existing timestamp is kept.
- Image verification now refreshes `image_verified_at`, `image_verified_url`, and `image_scraped_at`.
- Existing transceivers now call `markImageVerified` whenever a scraper provides an image URL.
- `packages/scraper/src/scrapers/fs-com.ts`
- Added `TIP_FORCE_REVALIDATE`.
- Added `FS_MAX_DETAIL_PAGES_PER_RUN`.
- Added `FS_ONLY_MISSING_IMAGES`.
- Added URL normalization for FS.com product URLs.
- Updated image extraction to prefer current product image DOM and reject default/logo/general/icon/SVG URLs.
## Live Runs
All runs were executed sequentially and rate-limited on Erik after CT115 / `tip-scraper` SSH did not respond quickly enough from this session.
Build:
```bash
pnpm -C packages/scraper build
```
Result: passed on `/opt/tip`.
Flexoptix:
- 615 products processed.
- 615 Flexoptix price observation rows marked verified.
- 605 Flexoptix images verified in the run window.
FS.com full force revalidation:
- 270 products discovered.
- 270 detail pages scraped.
- 0 failed detail requests.
- 17 new price observations.
- 266 FS.com price observations verified after the pass.
FS.com targeted missing-image pass:
- 99 DB product URLs without a verified image matched current category listings.
- 99 detail pages scraped.
- 0 failed detail requests.
- FS.com image-verified products increased from 207 to 299.
- FS.com verified price observations increased to 271.
## Final Counters
Flexoptix:
- products: 744
- product price_verified: 619
- product image_verified: 615
- price observation rows: 1288
- verified price observation rows: 615
FS.COM:
- products: 383
- product price_verified: 379
- product image_verified: 299
- price observation rows: 818
- verified price observation rows: 271
## Operations
- `tip-scraper-daemon` restarted and is online.
- `tip-api` remained online.
- Erik remained stable; final load around `2.16, 2.22, 2.47`.
- External dashboard health curl failed once because of a local DNS resolution error, while PM2 and DB checks were healthy.
## TIPLLM Training Pool
The local clone `/tmp/tip-training-data` was recreated from Gitea.
New records were written to:
- `robot-experiences/2026-05-09.jsonl`
- `qa-pairs/robot-control-high.jsonl`
Pushed to Gitea:
```text
850083f crawl: add flexoptix fs revalidation learning record
```
## Follow-Up
- FS.com still has 84 products that are not `image_verified`; 67 of those had no usable `/products/` URL in the current DB snapshot or were not found in current category listings.
- A future robot wave should specifically reconcile FS.com rows with blank/missing `product_page_url`.
- For future heavy FS.com work, prefer CT115/Proxmox/Pi once SSH reachability is confirmed; Erik should remain the controller or slow emergency runner only.