fix: revalidate flexoptix fs prices and images
This commit is contained in:
parent
fd29bee5cb
commit
ef225c7dc5
@ -68,8 +68,10 @@ import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";
|
||||
// FS.com German locale uses www.fs.com/de/ (de.fs.com DNS does not exist)
|
||||
const BASE_URL = "https://www.fs.com/de";
|
||||
const MAX_PAGES_PER_CATEGORY = 10;
|
||||
const MAX_DETAIL_PAGES_PER_RUN = 300;
|
||||
const STOCK_FRESH_HOURS = 12;
|
||||
// Upper bound on detail pages fetched per run; override with FS_MAX_DETAIL_PAGES_PER_RUN.
// An unset, empty, non-numeric or negative value falls back to 300: parseInt would
// otherwise yield NaN (and `slice(0, NaN)` silently selects nothing) or a negative
// end index that drops items from the tail instead of capping the list.
const MAX_DETAIL_PAGES_PER_RUN = (() => {
  const parsed = Number.parseInt(process.env["FS_MAX_DETAIL_PAGES_PER_RUN"] ?? "", 10);
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : 300;
})();
|
||||
// Age in hours below which existing stock data counts as fresh; override with FS_STOCK_FRESH_HOURS.
// An unset, empty, non-numeric or negative value falls back to 12 so a bad env value
// can never put NaN into the freshness window or the log output.
const STOCK_FRESH_HOURS = (() => {
  const parsed = Number.parseInt(process.env["FS_STOCK_FRESH_HOURS"] ?? "", 10);
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : 12;
})();
|
||||
// True only when TIP_FORCE_REVALIDATE is exactly "1". scrapeFs then skips the
// fresh-stock lookup, so no product is excluded as recently scraped.
const FORCE_REVALIDATE = process.env["TIP_FORCE_REVALIDATE"] === "1";
|
||||
// True only when FS_ONLY_MISSING_IMAGES is exactly "1". scrapeFs then keeps only
// detail URLs whose normalized form matches an FS.COM row that is not image_verified.
const ONLY_MISSING_IMAGES = process.env["FS_ONLY_MISSING_IMAGES"] === "1";
|
||||
|
||||
const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
|
||||
.split(",")
|
||||
@ -81,6 +83,12 @@ function buildProxyConfiguration(): ProxyConfiguration | undefined {
|
||||
return new ProxyConfiguration({ proxyUrls: PROXY_URLS });
|
||||
}
|
||||
|
||||
/**
 * Canonicalises an FS.com product URL so that URLs taken from category
 * listings and URLs stored in the database can be compared as plain strings.
 *
 * Applied in order:
 * 1. a doubled German locale prefix (`https://www.fs.com/de/de/`) is collapsed to `/de/`;
 * 2. everything from the first `?` or `#` onwards is removed;
 * 3. a single trailing slash is removed.
 *
 * @param url - Product URL to normalise.
 * @returns The normalised URL; an empty string stays empty.
 */
function normalizeFsProductUrl(url: string): string {
  return url
    .replace(/^https:\/\/www\.fs\.com\/de\/de\//, "https://www.fs.com/de/")
    // Cut at `#` as well as `?`: a bare fragment such as `#reviews` would
    // otherwise survive and make the same product compare as a different URL.
    .replace(/[?#].*$/, "")
    .replace(/\/$/, "");
}
|
||||
|
||||
const CATEGORY_URLS = [
|
||||
"/c/1g-sfp-81",
|
||||
"/c/10g-sfp-63",
|
||||
@ -524,12 +532,33 @@ async function scrapeProductDetails(
|
||||
}
|
||||
}
|
||||
|
||||
const imgEl = document.querySelector<HTMLImageElement>(
|
||||
".product-image img, .prod-img img, .product-gallery img, " +
|
||||
const imageCandidates = Array.from(document.querySelectorAll<HTMLImageElement>(
|
||||
".big_img_box img, img.big_img, .big_img_m_active, .big_img_m, " +
|
||||
".small_img_active img, .product-image img, .prod-img img, .product-gallery img, " +
|
||||
'[class*="main-image"] img, [class*="primary-image"] img, ' +
|
||||
".slick-current img, .product__image img"
|
||||
);
|
||||
const imageUrl = imgEl?.src ?? imgEl?.getAttribute("data-src") ?? "";
|
||||
)).map((img) => {
|
||||
const url =
|
||||
img.currentSrc ||
|
||||
img.getAttribute("data-src") ||
|
||||
img.getAttribute("data-original") ||
|
||||
img.getAttribute("data-lazy") ||
|
||||
img.getAttribute("src") ||
|
||||
"";
|
||||
const cls = `${img.className || ""} ${img.parentElement?.className || ""}`;
|
||||
const score =
|
||||
/big_img_box|big_img|big_img_m_active/.test(cls) ? 100 :
|
||||
/small_img_active/.test(cls) ? 50 :
|
||||
10;
|
||||
return { url, score, w: img.naturalWidth || 0, h: img.naturalHeight || 0 };
|
||||
}).filter((candidate) =>
|
||||
candidate.url &&
|
||||
/resource\.fs\.com/.test(candidate.url) &&
|
||||
!/default\.jpg|generalImg|logo|icon|svg/i.test(candidate.url) &&
|
||||
(candidate.w === 0 || candidate.w >= 120) &&
|
||||
(candidate.h === 0 || candidate.h >= 120)
|
||||
).sort((a, b) => b.score - a.score || (b.w * b.h) - (a.w * a.h));
|
||||
const imageUrl = imageCandidates[0]?.url ?? "";
|
||||
|
||||
const dsEl = document.querySelector<HTMLAnchorElement>(
|
||||
'a[href*="datasheet"], a[href*=".pdf"], a[download][href*=".pdf"]'
|
||||
@ -759,7 +788,7 @@ export async function scrapeFs(): Promise<void> {
|
||||
// ── Filter: skip products with fresh stock data ─────────────────────────────
|
||||
const allPartNumbers = [...productMap.values()].map((p) => p.partNumber).filter(Boolean);
|
||||
const freshlyScraped = new Set<string>();
|
||||
if (allPartNumbers.length > 0) {
|
||||
if (!FORCE_REVALIDATE && allPartNumbers.length > 0) {
|
||||
const freshResult = await pool.query(
|
||||
`SELECT DISTINCT t.part_number
|
||||
FROM transceivers t
|
||||
@ -772,12 +801,37 @@ export async function scrapeFs(): Promise<void> {
|
||||
for (const row of freshResult.rows) freshlyScraped.add(row.part_number as string);
|
||||
}
|
||||
|
||||
let missingImageUrls = new Set<string>();
|
||||
if (ONLY_MISSING_IMAGES) {
|
||||
const missingResult = await pool.query(
|
||||
`SELECT DISTINCT product_page_url
|
||||
FROM transceivers t
|
||||
JOIN vendors v ON v.id = t.vendor_id
|
||||
WHERE v.name = 'FS.COM'
|
||||
AND COALESCE(t.image_verified, false) = false
|
||||
AND product_page_url LIKE '%/products/%'`
|
||||
);
|
||||
missingImageUrls = new Set(
|
||||
missingResult.rows
|
||||
.map((row) => normalizeFsProductUrl(row.product_page_url as string))
|
||||
.filter(Boolean)
|
||||
);
|
||||
}
|
||||
|
||||
const urlsToScrape = [...productMap.keys()]
|
||||
.filter((url) => !freshlyScraped.has(productMap.get(url)?.partNumber ?? ""))
|
||||
.filter((url) => !ONLY_MISSING_IMAGES || missingImageUrls.has(normalizeFsProductUrl(url)))
|
||||
.slice(0, MAX_DETAIL_PAGES_PER_RUN);
|
||||
|
||||
console.log(`\n[Phase 2] Scraping ${urlsToScrape.length} detail pages`);
|
||||
console.log(` (${productMap.size - urlsToScrape.length} skipped — data ≤${STOCK_FRESH_HOURS}h fresh)`);
|
||||
console.log(
|
||||
ONLY_MISSING_IMAGES
|
||||
? ` (${missingImageUrls.size} DB product URLs missing images; ${productMap.size - urlsToScrape.length} skipped by targeted image filter)`
|
||||
:
|
||||
FORCE_REVALIDATE
|
||||
? ` (${productMap.size - urlsToScrape.length} skipped — max detail cap ${MAX_DETAIL_PAGES_PER_RUN})`
|
||||
: ` (${productMap.size - urlsToScrape.length} skipped — data ≤${STOCK_FRESH_HOURS}h fresh)`
|
||||
);
|
||||
|
||||
if (urlsToScrape.length === 0) {
|
||||
console.log("[Phase 2] All products have fresh stock data — nothing to scrape.");
|
||||
|
||||
@ -46,12 +46,18 @@ export async function markImageVerified(
|
||||
): Promise<boolean> {
|
||||
const result = await pool.query(
|
||||
`UPDATE transceivers
|
||||
SET image_url = COALESCE(NULLIF(image_url, ''), $2::text),
|
||||
SET image_url = CASE
|
||||
WHEN image_url IS NULL
|
||||
OR image_url = ''
|
||||
OR image_url ~* '(placeholder|no-image|no_image|missing|default)'
|
||||
THEN $2::text
|
||||
ELSE image_url
|
||||
END,
|
||||
has_image = true,
|
||||
image_verified = true,
|
||||
image_verified_at = COALESCE(image_verified_at, NOW()),
|
||||
image_verified_url = COALESCE(NULLIF(image_verified_url, ''), $2::text),
|
||||
image_scraped_at = COALESCE(image_scraped_at, NOW()),
|
||||
image_verified_at = NOW(),
|
||||
image_verified_url = $2::text,
|
||||
image_scraped_at = NOW(),
|
||||
updated_at = NOW()
|
||||
WHERE id = $1
|
||||
AND $2::text IS NOT NULL
|
||||
@ -178,9 +184,20 @@ export async function upsertPriceObservation(params: {
|
||||
|
||||
if (existing.rows.length > 0 && existing.rows[0].content_hash === params.contentHash && !isStale) {
|
||||
// Price unchanged and recent — still ensure verified flags are current
|
||||
await pool.query(
|
||||
`UPDATE price_observations
|
||||
SET is_verified = true,
|
||||
verified_at = COALESCE(verified_at, NOW())
|
||||
WHERE transceiver_id = $1
|
||||
AND source_vendor_id = $2
|
||||
AND content_hash = $3
|
||||
AND time > NOW() - INTERVAL '${REFRESH_DAYS} days'`,
|
||||
[params.transceiverId, params.sourceVendorId, params.contentHash]
|
||||
);
|
||||
await pool.query(
|
||||
`UPDATE transceivers SET
|
||||
price_verified = true
|
||||
price_verified = true,
|
||||
price_verified_at = COALESCE(price_verified_at, NOW())
|
||||
${isCompetitor ? ", competitor_verified = true, competitor_verified_at = COALESCE(competitor_verified_at, NOW())" : ""}
|
||||
WHERE id = $1 AND (price_verified IS NULL OR price_verified = false OR ${isCompetitor ? "competitor_verified IS NULL OR competitor_verified = false" : "false"})`,
|
||||
[params.transceiverId]
|
||||
@ -190,8 +207,11 @@ export async function upsertPriceObservation(params: {
|
||||
}
|
||||
|
||||
await pool.query(
|
||||
`INSERT INTO price_observations (time, transceiver_id, source_vendor_id, price, currency, stock_level, quantity_available, lead_time_days, url, content_hash)
|
||||
VALUES (NOW(), $1, $2, $3, $4, $5, $6, $7, $8, $9)`,
|
||||
`INSERT INTO price_observations (
|
||||
time, transceiver_id, source_vendor_id, price, currency, stock_level,
|
||||
quantity_available, lead_time_days, url, content_hash, is_verified, verified_at
|
||||
)
|
||||
VALUES (NOW(), $1, $2, $3, $4, $5, $6, $7, $8, $9, true, NOW())`,
|
||||
[
|
||||
params.transceiverId,
|
||||
params.sourceVendorId,
|
||||
@ -210,6 +230,7 @@ export async function upsertPriceObservation(params: {
|
||||
await pool.query(
|
||||
`UPDATE transceivers SET
|
||||
price_verified = true,
|
||||
price_verified_at = COALESCE(price_verified_at, NOW()),
|
||||
competitor_verified = true,
|
||||
competitor_verified_at = COALESCE(competitor_verified_at, NOW())
|
||||
WHERE id = $1`,
|
||||
@ -217,7 +238,11 @@ export async function upsertPriceObservation(params: {
|
||||
);
|
||||
} else {
|
||||
await pool.query(
|
||||
`UPDATE transceivers SET price_verified = true WHERE id = $1 AND (price_verified IS NULL OR price_verified = false)`,
|
||||
`UPDATE transceivers
|
||||
SET price_verified = true,
|
||||
price_verified_at = COALESCE(price_verified_at, NOW())
|
||||
WHERE id = $1
|
||||
AND (price_verified IS NULL OR price_verified = false OR price_verified_at IS NULL)`,
|
||||
[params.transceiverId]
|
||||
);
|
||||
}
|
||||
@ -384,8 +409,8 @@ export async function findOrCreateScrapedTransceiver(params: {
|
||||
]
|
||||
);
|
||||
|
||||
// Update image_url, has_image and image_verified if we have a new image for a record without one
|
||||
if (params.imageUrl && !existing.rows[0].image_url) {
|
||||
// Re-validate image metadata whenever the scraper sees a current product image.
|
||||
if (params.imageUrl) {
|
||||
await markImageVerified(existing.rows[0].id, params.imageUrl);
|
||||
}
|
||||
if (params.productUrl) {
|
||||
|
||||
@ -1,9 +1,67 @@
|
||||
# Current TIP Sync State
|
||||
|
||||
Updated: 2026-05-09 02:05 UTC
|
||||
Updated: 2026-05-09 03:15 UTC
|
||||
|
||||
## Newest Work
|
||||
|
||||
- TIP Flexoptix + FS.com price/image revalidation completed on 2026-05-09:
|
||||
- live root cause:
|
||||
- scraper runs had set `transceivers.price_verified`, but `price_observations.is_verified` stayed false
|
||||
- FS.com product image selector was stale and missed current `.big_img` / `.big_img_m` product images
|
||||
- code fixed:
|
||||
- `packages/scraper/src/utils/db.ts`
|
||||
- new/fresh unchanged price observations now get `is_verified = true` and `verified_at`
|
||||
- `price_verified_at` is refreshed when price verification is confirmed
|
||||
- image verification now refreshes `image_verified_at`, `image_verified_url`, and `image_scraped_at`
|
||||
- existing records revalidate images whenever current scraper output contains an image URL
|
||||
- `packages/scraper/src/scrapers/fs-com.ts`
|
||||
- added `TIP_FORCE_REVALIDATE`
|
||||
- added `FS_MAX_DETAIL_PAGES_PER_RUN`
|
||||
- added `FS_ONLY_MISSING_IMAGES`
|
||||
- updated FS.com image extraction to prefer current `resource.fs.com` product images from `.big_img_box`, `img.big_img`, `.big_img_m_active`, `.big_img_m`, `.small_img_active`
|
||||
- rejects default/logo/general/icon/SVG image URLs
|
||||
- live runs on Erik:
|
||||
- `pnpm -C packages/scraper build` passed on `/opt/tip`
|
||||
- Flexoptix catalog revalidation:
|
||||
- 615 products processed
|
||||
- 615 Flexoptix price observations marked verified
|
||||
- 605 Flexoptix images verified in the run window
|
||||
- FS.com full force revalidation:
|
||||
- 270 products discovered
|
||||
- 270 detail pages scraped
|
||||
- 0 failed detail requests
|
||||
- 17 new price observations in first full pass
|
||||
- 266 FS.com price observations marked verified after first pass
|
||||
- FS.com targeted missing-image revalidation:
|
||||
- 99 detail pages scraped
|
||||
- 0 failed detail requests
|
||||
- FS.com image-verified products increased from 207 to 299
|
||||
- FS.com verified price observations increased to 271 after targeted pass
|
||||
- final checked counters:
|
||||
- Flexoptix:
|
||||
- products: 744
|
||||
- product price_verified: 619
|
||||
- product image_verified: 615
|
||||
- price observation rows: 1288
|
||||
- verified price observation rows: 615
|
||||
- FS.COM:
|
||||
- products: 383
|
||||
- product price_verified: 379
|
||||
- product image_verified: 299
|
||||
- price observation rows: 818
|
||||
- verified price observation rows: 271
|
||||
- operations:
|
||||
- `tip-scraper-daemon` restarted and is online
|
||||
- Erik remained stable; final load was about `2.16, 2.22, 2.47`
|
||||
- CT115 / `tip-scraper` SSH did not respond quickly from this session, so it was not used
|
||||
- TIPLLM training pool:
|
||||
- `/tmp/tip-training-data` was recloned from Gitea
|
||||
- crawler experience was written to:
|
||||
- `robot-experiences/2026-05-09.jsonl`
|
||||
- `qa-pairs/robot-control-high.jsonl`
|
||||
- pushed to Gitea commit:
|
||||
- `850083f crawl: add flexoptix fs revalidation learning record`
|
||||
|
||||
- MAGATAMA dashboard truthfulness / UX hardening on 2026-05-09:
|
||||
- live `api/llm/status` on MAGATAMA now publicly confirms the corrected `magatamallm` lane counts:
|
||||
- `15679` train / collected
|
||||
|
||||
116
sync/history/2026-05-09-flexoptix-fs-price-image-revalidation.md
Normal file
116
sync/history/2026-05-09-flexoptix-fs-price-image-revalidation.md
Normal file
@ -0,0 +1,116 @@
|
||||
# 2026-05-09 Flexoptix + FS.com Price/Image Revalidation
|
||||
|
||||
## Request
|
||||
|
||||
Rene reported that many TIP prices, especially Flexoptix prices, were wrong and asked for all Flexoptix and FS.com prices to be fully revalidated and images checked.
|
||||
|
||||
Standing constraints were preserved:
|
||||
|
||||
- TIP crawler/robot planning and extraction feedback stays TIPLLM-only.
|
||||
- No external AI was used for crawler planning or extraction feedback.
|
||||
- Erik must not be overloaded.
|
||||
- Robot/crawler experiences must be written into the Gitea-backed TIPLLM training pool.
|
||||
- Work status must be written back to `sync/`.
|
||||
|
||||
## Root Cause
|
||||
|
||||
Two concrete issues were found:
|
||||
|
||||
1. `upsertPriceObservation` marked `transceivers.price_verified`, but inserted price rows did not set `price_observations.is_verified` or `verified_at`.
|
||||
2. FS.com image extraction still used older selectors. Current FS.com product pages expose product images under `.big_img_box`, `img.big_img`, `.big_img_m_active`, `.big_img_m`, and `.small_img_active`, usually from `resource.fs.com/mall/mainImg/...`.
|
||||
|
||||
## Code Changed
|
||||
|
||||
- `packages/scraper/src/utils/db.ts`
|
||||
- Price observations now set `is_verified = true` and `verified_at` for new observations.
|
||||
- Fresh unchanged observations are backfilled to verified.
|
||||
- `price_verified_at` is maintained.
|
||||
- Image verification now refreshes `image_verified_at`, `image_verified_url`, and `image_scraped_at`.
|
||||
- Existing transceivers now call `markImageVerified` whenever a scraper provides an image URL.
|
||||
|
||||
- `packages/scraper/src/scrapers/fs-com.ts`
|
||||
- Added `TIP_FORCE_REVALIDATE`.
|
||||
- Added `FS_MAX_DETAIL_PAGES_PER_RUN`.
|
||||
- Added `FS_ONLY_MISSING_IMAGES`.
|
||||
- Added URL normalization for FS.com product URLs.
|
||||
- Updated image extraction to prefer current product image DOM and reject default/logo/general/icon/SVG URLs.
|
||||
|
||||
## Live Runs
|
||||
|
||||
All runs were executed sequentially and rate-limited on Erik after CT115 / `tip-scraper` SSH did not respond quickly enough from this session.
|
||||
|
||||
Build:
|
||||
|
||||
```bash
|
||||
pnpm -C packages/scraper build
|
||||
```
|
||||
|
||||
Result: passed on `/opt/tip`.
|
||||
|
||||
Flexoptix:
|
||||
|
||||
- 615 products processed.
|
||||
- 615 Flexoptix price observation rows marked verified.
|
||||
- 605 Flexoptix images verified in the run window.
|
||||
|
||||
FS.com full force revalidation:
|
||||
|
||||
- 270 products discovered.
|
||||
- 270 detail pages scraped.
|
||||
- 0 failed detail requests.
|
||||
- 17 new price observations.
|
||||
- 266 FS.com price observations verified after the pass.
|
||||
|
||||
FS.com targeted missing-image pass:
|
||||
|
||||
- 99 DB product URLs without images matched current category listings.
|
||||
- 99 detail pages scraped.
|
||||
- 0 failed detail requests.
|
||||
- FS.com image-verified products increased from 207 to 299.
|
||||
- FS.com verified price observations increased to 271.
|
||||
|
||||
## Final Counters
|
||||
|
||||
Flexoptix:
|
||||
|
||||
- products: 744
|
||||
- product price_verified: 619
|
||||
- product image_verified: 615
|
||||
- price observation rows: 1288
|
||||
- verified price observation rows: 615
|
||||
|
||||
FS.COM:
|
||||
|
||||
- products: 383
|
||||
- product price_verified: 379
|
||||
- product image_verified: 299
|
||||
- price observation rows: 818
|
||||
- verified price observation rows: 271
|
||||
|
||||
## Operations
|
||||
|
||||
- `tip-scraper-daemon` restarted and is online.
|
||||
- `tip-api` remained online.
|
||||
- Erik remained stable; final load around `2.16, 2.22, 2.47`.
|
||||
- The external dashboard health check (curl) failed once because of a local DNS resolution problem; PM2 and DB checks were healthy.
|
||||
|
||||
## TIPLLM Training Pool
|
||||
|
||||
The local clone `/tmp/tip-training-data` was recreated from Gitea.
|
||||
|
||||
New records were written to:
|
||||
|
||||
- `robot-experiences/2026-05-09.jsonl`
|
||||
- `qa-pairs/robot-control-high.jsonl`
|
||||
|
||||
Pushed to Gitea:
|
||||
|
||||
```text
|
||||
850083f crawl: add flexoptix fs revalidation learning record
|
||||
```
|
||||
|
||||
## Follow-Up
|
||||
|
||||
- FS.com still has 84 products without `image_verified`; 67 of those had no usable `/products/` URL in the current DB snapshot or were not found in current category listings.
|
||||
- A future robot wave should specifically reconcile FS.com rows with blank/missing `product_page_url`.
|
||||
- For future heavy FS.com work, prefer CT115/Proxmox/Pi once SSH reachability is confirmed; Erik should remain the controller or slow emergency runner only.
|
||||
Loading…
x
Reference in New Issue
Block a user