From fd3476f5c42a2a1e7a82839df9927f33d8dfce62 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sun, 12 Apr 2026 04:26:35 +0200 Subject: [PATCH] fix(scraper): FiberMall URL schema + price parser + Flexoptix EUR comma bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FiberMall: - Correct /store-XXXXX-name.htm category URLs (was /c/xxx/ → HTTP 404) - Parser: split on new_proList_mainListLi, price from data-price on currency_price span — fix 0.00 false-match from SKU variant items - Also scrape SKU brand variant links from .sku_item divs - Result: 3,410 prices now in DB (was 0) Flexoptix: - Fix extractPrice regex for EUR thousand-separator format (2,921.60 EUR was parsed as 2 EUR) - Add OSFP224 / 1.6T search queries (4 new, form factor was missing) - Fix O.138HG2.C.05 stale price 3009.60→2921.60 EUR Schema: competitor_verified + competitor_verified_at columns added via ALTER TABLE (were referenced in code but missing in DB) CHANGELOG: added 6 entries for 2026-04-12 --- CHANGELOG_PENDING.md | 11 ++ packages/scraper/src/scrapers/fibermall.ts | 138 ++++++++---------- .../scraper/src/scrapers/flexoptix-catalog.ts | 16 +- 3 files changed, 86 insertions(+), 79 deletions(-) diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md index ecc7126..bb811fc 100644 --- a/CHANGELOG_PENDING.md +++ b/CHANGELOG_PENDING.md @@ -5,6 +5,17 @@ Types: FEAT · FIX · UI · DATA · AI · INFRA --- +{"d":"2026-04-12","t":"FIX","m":"FiberMall scraper: URL schema corrected — wrong /c/1g-sfp-transceiver/ paths (HTTP 404) replaced with actual /store-XXXXX-name.htm category URLs discovered via homepage navigation scrape"} +{"d":"2026-04-12","t":"FIX","m":"FiberMall parser: product card split on new_proList_mainListLi (Vue.js SSR), price extracted from data-price on currency_price span — fixed false-match on data-price=0.00 from SKU variant items that appears before real price in each card"} +{"d":"2026-04-12","t":"FIX","m":"FiberMall: also scrapes SKU brand variants from .sku_item divs
within each product group (Cisco/Arista/Juniper compatible versions listed per product)"} +{"d":"2026-04-12","t":"FIX","m":"Flexoptix price parsing: EUR text regex /([\d.]+)\s*EUR/ matched only digits before thousand separator (2,921.60 EUR → 2 EUR) — fixed to /([\d,]+\.?\d*)\s*EUR/ with comma strip; affects all Flexoptix prices >999 EUR"} +{"d":"2026-04-12","t":"FIX","m":"Flexoptix catalog: O.138HG2.C.05 (1.6T OSFP224 2x DR4) price corrected 3009.60→2921.60 EUR (stale since 2026-04-09, Flexoptix.net shows FLEXBOX price 2921.60 via data-price-amount attribute)"} +{"d":"2026-04-12","t":"FEAT","m":"Flexoptix catalog: 4 new search queries added — OSFP224 1.6T, OSFP224, 1.6T DR4, 1.6T transceiver — covers new 1.6T form factor previously missing entirely from catalog scraper"} +{"d":"2026-04-12","t":"FIX","m":"Schema: competitor_verified + competitor_verified_at columns added to transceivers table (ALTER TABLE) — were referenced in db.ts upsertPriceObservation but not in schema, causing price writes to fail silently for all competitor vendors (FiberMall, QSFPTEK etc.)"} +{"d":"2026-04-11","t":"FEAT","m":"Scraper coverage expansion: 3 new scrapers added — FiberMall (fibermall.com, USD), Vcelink (vcelink.com, USD, Shopify), OpticsBay (opticsbay.com, USD, WooCommerce) — all wired into scheduler and Pi fleet"} +{"d":"2026-04-11","t":"FIX","m":"QSFPTEK scraper fully rewritten: site migrated from OpenCart to custom Java/Spring+Vue — old /c/*.html paths 404, now uses /mall/commodity/list API with attribute-based data rate filtering; 8 attribute IDs for 1G/10G/25G/40G/100G/200G/400G/800G"} +{"d":"2026-04-11","t":"INFRA","m":"Scheduler: 61 workers total, 53 cron schedules — FiberMall/Vcelink/OpticsBay added at :03, :07, :57 past even hours"} +{"d":"2026-04-09","t":"FEAT","m":"Price anomaly detection: PRICE_BOUNDS per form-factor in db.ts upsertPriceObservation — prices outside [min,max] USD range silently rejected to prevent garbage data (e.g. 
SFP+ [4, 5000], OSFP224 [200, 60000])"} {"d":"2026-04-09","t":"UI","m":"Dashboard: LLM panel redesigned for light theme readability; LLM model selector added to Blog Engine tab"} {"d":"2026-04-09","t":"INFRA","m":"Pi Starlink proxy-agent: scraper routes selected lightweight scrapers exclusively to Pi worker fleet via SOCKS5 — no Playwright traffic on Pi nodes"} {"d":"2026-04-09","t":"DATA","m":"800G standards deep enrichment: migration 033 — IEEE 802.3df, OIF 800G IA, 800G MSA, OSFP MSA, QSFP-DD800 MSA with links, status, timeline"} diff --git a/packages/scraper/src/scrapers/fibermall.ts b/packages/scraper/src/scrapers/fibermall.ts index 9e8c9b1..6e675bc 100644 --- a/packages/scraper/src/scrapers/fibermall.ts +++ b/packages/scraper/src/scrapers/fibermall.ts @@ -1,37 +1,41 @@ /** * FiberMall Scraper — Chinese compatible transceiver vendor * - * fibermall.com — custom PHP/REST shop, USD pricing. - * Large catalog: 1G–400G, SFP/SFP+/QSFP28/QSFP-DD/OSFP. - * Pagination via ?page=N. Rate limited: 1 req/2sec. + * fibermall.com — custom Vue.js/PHP shop, USD pricing. + * Large catalog: 1G–800G, SFP/SFP+/QSFP28/QSFP-DD/OSFP. + * Rate limited: 1 req/2sec. * - * FiberMall (Shenzhen FiberMall Technology Co.) offers DAC/AOC + optics, - * transparent USD prices, no login required. 
+ * URL schema (discovered 2026-04-11): + * Category pages: /store-XXXXX-name.htm + * Product pages: /sale-XXXXXX-name.htm + * Pagination: /store-XXXXX-name.htm?page=N + * Product list: CSS class "new_proList_mainListLi" */ import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db"; import { contentHash } from "../utils/hash"; const BASE = "https://www.fibermall.com"; const HEADERS = { - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36", Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.9", + Referer: "https://www.fibermall.com/", }; const MAX_PAGES = 30; +// Discovered via homepage navigation scrape 2026-04-11 +// Format: /store-XXXXX-description.htm const CATEGORIES = [ - { path: "/c/1g-sfp-transceiver/", formFactor: "SFP", speed: "1G", speedGbps: 1 }, - { path: "/c/10g-sfp-transceiver/", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, - { path: "/c/25g-sfp28-transceiver/", formFactor: "SFP28",speed: "25G", speedGbps: 25 }, - { path: "/c/40g-qsfp-transceiver/", formFactor: "QSFP+", speed: "40G", speedGbps: 40 }, - { path: "/c/100g-qsfp28-transceiver/", formFactor: "QSFP28", speed: "100G", speedGbps: 100 }, - { path: "/c/200g-qsfp56-transceiver/", formFactor: "QSFP56", speed: "200G", speedGbps: 200 }, - { path: "/c/400g-qsfp-dd-transceiver/", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }, - { path: "/c/800g-osfp-transceiver/", formFactor: "OSFP", speed: "800G", speedGbps: 800 }, - { path: "/c/dac-cable/", formFactor: "DAC", speed: "10G", speedGbps: 10 }, - { path: "/c/aoc-cable/", formFactor: "AOC", speed: "10G", speedGbps: 10 }, - { path: "/c/optical-transceivers/", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, + { path: 
"/store-17147-sfp-transceivers.htm", formFactor: "SFP", speed: "1G", speedGbps: 1 }, + { path: "/store-17014-10g-sfp.htm", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, + { path: "/store-17012-25g-sfp28.htm", formFactor: "SFP28", speed: "25G", speedGbps: 25 }, + { path: "/store-16652-40g-qsfp.htm", formFactor: "QSFP+", speed: "40G", speedGbps: 40 }, + { path: "/store-16528-100g-qsfp28.htm", formFactor: "QSFP28", speed: "100G", speedGbps: 100 }, + { path: "/store-20654-200g-qsfp56-qsfp-dd.htm", formFactor: "QSFP56", speed: "200G", speedGbps: 200 }, + { path: "/store-20656-400g-qsfp-dd.htm", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }, + { path: "/store-21972-800g-qsfp-dd-osfp.htm", formFactor: "OSFP", speed: "800G", speedGbps: 800 }, + { path: "/store-16527-dac-aoc-acc-aec-cables.htm", formFactor: "DAC", speed: "10G", speedGbps: 10 }, ]; interface Product { @@ -93,55 +97,56 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product const seen = new Set(); const collapsed = html.replace(/\s+/g, " "); - // Strategy 1: fibermall product card (class="product-item" or similar) - for (const m of collapsed.matchAll(/<(?:li|div|article)[^>]+class="[^"]*(?:product-item|product-thumb|goods-item|pro-item)[^"]*"[^>]*>([\s\S]*?)<\/(?:li|div|article)>/gi)) { - const card = m[1]; - const urlMatch = card.match(/href="(https?:\/\/(?:www\.)?fibermall\.com\/[^"?#]+)"/i) || - card.match(/href="(\/[a-z0-9][^"?#]{5,})"/i); - if (!urlMatch) continue; - const url = urlMatch[1].startsWith("http") ? urlMatch[1] : BASE + urlMatch[1]; - if (seen.has(url) || !/fibermall\.com|\/product|\/p\//i.test(url)) continue; - seen.add(url); + // FiberMall HTML structure (SSR, confirmed 2026-04-11): + //
  • + // ... + // 12.00 + //
  • + // + // Each
  • is a product GROUP with SKU variants inside .sku_item divs. + // The main product URL has a `title` attribute; sku variants do not. + // Price: data-price="X.XX" on - const nameMatch = card.match(/<(?:h[23456]|p)[^>]+class="[^"]*(?:name|title)[^"]*"[^>]*>([^<]{8,})<\//i) || - card.match(/title="([^"]{10,})"/i) || - card.match(/]*>([^<]{10,})<\/a>/i); - if (!nameMatch) continue; - const name = nameMatch[1].trim().replace(/&/g, "&").replace(/&#\d+;/g, ""); - if (name.length < 5) continue; + // Split by
  • + // Must target specifically currency_price span — SKU items have data-price="0.00" + const priceM = card.match(/class="currency_price"[^>]*data-price="([\d.]+)"/i) || + card.match(/data-price="([1-9][\d]*\.?\d{0,2})"/); // skip 0.00 + const price = priceM ? parseFloat(priceM[1]) : undefined; - products.push({ - partNumber, name, url, - price: price && price > 0 && price < 100000 ? price : undefined, - formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps, - reachLabel: reach?.label, reachMeters: reach?.meters, - fiberType: detectFiber(name), wavelength: detectWavelength(name), - }); - } + // Main product link: first with title attribute + const mainLinkM = card.match(/href="(\/sale-\d+[^"?#]*\.htm)"[^>]*title="([^"]{8,})"/i); + if (mainLinkM) { + const url = BASE + mainLinkM[1]; + const name = mainLinkM[2].trim().replace(/&/g, "&").replace(/&#\d+;/g, "").replace(/\s+/g, " "); + if (!seen.has(url) && name.length >= 5) { + seen.add(url); + const reach = detectReach(name); + const partNumber = name.split(/\s+(?:compatible|for\s+[A-Z])/i)[0]?.trim().slice(0, 80) || name.slice(0, 60); + products.push({ + partNumber, name, url, + price: price && price > 0 && price < 100000 ? 
price : undefined, + formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps, + reachLabel: reach?.label, reachMeters: reach?.meters, + fiberType: detectFiber(name), wavelength: detectWavelength(name), + }); + } + } - // Strategy 2: generic product link scan - if (products.length === 0) { - for (const m of collapsed.matchAll(/href="(https?:\/\/(?:www\.)?fibermall\.com\/[^"?#]{10,})"[^>]*>([^<]{10,})]*>\s*]*>([^<]{5,})<\/a>/gi)) { + const url = BASE + m[1]; const name = m[2].trim().replace(/&/g, "&"); - if (seen.has(url) || name.length < 8) continue; - if (!/transceiver|sfp|qsfp|osfp|dac|aoc|optic/i.test(name)) continue; + if (seen.has(url) || name.length < 4) continue; seen.add(url); - - const idx = collapsed.indexOf(url); - const ctx = collapsed.slice(Math.max(0, idx - 300), idx + 600); - const priceM = ctx.match(/\$\s*([\d,]+\.?\d*)/); - const price = priceM ? parseFloat(priceM[1].replace(/,/g, "")) : undefined; const reach = detectReach(name); - products.push({ - partNumber: name.split(/\s+/)[0]?.slice(0, 80) || "", + partNumber: name.slice(0, 80), name, url, price: price && price > 0 && price < 100000 ? 
price : undefined, formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps, @@ -167,7 +172,7 @@ export async function scrapeFiberMall(): Promise { "FiberMall", "compatible", "https://www.fibermall.com", - "https://www.fibermall.com/c/optical-transceivers/", + "https://www.fibermall.com/store-16528-100g-qsfp28.htm", ); let totalProducts = 0; @@ -181,21 +186,6 @@ export async function scrapeFiberMall(): Promise { const html1 = await fetchPage(BASE + cat.path); const catProducts = parseProductList(html1, cat); - // Skip generic fallback if specific categories already scraped - if (cat.path.includes("/optical-transceivers/") && seenCategories.size > 3) { - console.log(` Skipping generic fallback (${seenCategories.size} categories scraped)`); - continue; - } - - if (catProducts.length === 0) { - console.log(" No products on page 1 — trying alternate pagination"); - // Try ?page=1 format - try { - const html1alt = await fetchPage(BASE + cat.path + "?page=1"); - catProducts.push(...parseProductList(html1alt, cat)); - } catch { /* ignore */ } - } - if (catProducts.length === 0) { console.log(" No products found — skipping"); continue; @@ -209,7 +199,7 @@ export async function scrapeFiberMall(): Promise { for (let page = 2; page <= MAX_PAGES; page++) { await sleep(2000); try { - // Try both pagination formats + // FiberMall pagination: ?page=N const pageUrl = `${BASE}${cat.path}?page=${page}`; const html = await fetchPage(pageUrl); const pageProds = parseProductList(html, cat); diff --git a/packages/scraper/src/scrapers/flexoptix-catalog.ts b/packages/scraper/src/scrapers/flexoptix-catalog.ts index 20a7339..07f65ed 100644 --- a/packages/scraper/src/scrapers/flexoptix-catalog.ts +++ b/packages/scraper/src/scrapers/flexoptix-catalog.ts @@ -72,6 +72,10 @@ const SEARCH_QUERIES = [ { query: "OSFP LR4", formFactor: "OSFP", speed: "400G", speedGbps: 400 }, { query: "OSFP ZR", formFactor: "OSFP", speed: "400G", speedGbps: 400 }, { query: "OSFP 800G", formFactor: 
"OSFP", speed: "800G", speedGbps: 800 }, + { query: "OSFP224 1.6T", formFactor: "OSFP224", speed: "1.6T", speedGbps: 1600 }, + { query: "OSFP224", formFactor: "OSFP224", speed: "1.6T", speedGbps: 1600 }, + { query: "1.6T DR4", formFactor: "OSFP224", speed: "1.6T", speedGbps: 1600 }, + { query: "1.6T transceiver", formFactor: "OSFP224", speed: "1.6T", speedGbps: 1600 }, // Additional granular queries for maximum coverage { query: "SFP+ copper", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, { query: "SFP+ 10GBASE-T", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, @@ -266,11 +270,13 @@ async function searchProducts(query: string): Promise { // Try data-price-amount attribute first (Magento Hyva theme) const attrMatch = s.match(/data-price-amount="([\d.]+)"/); if (attrMatch) return attrMatch[1]; - // Try plain price text like "39.64 EUR" - const textMatch = s.match(/([\d.]+)\s*EUR/i); - if (textMatch) return textMatch[1]; - // Try bare number - const num = parseFloat(s); + // Try plain price text like "2,921.60 EUR" or "39.64 EUR" + // IMPORTANT: must include comma in char class to handle thousand separators + const textMatch = s.match(/([\d,]+\.?\d*)\s*EUR/i); + if (textMatch) return textMatch[1].replace(/,/g, ""); + // Try bare number (strip thousand-separator commas first) + const cleaned = s.replace(/,/g, ""); + const num = parseFloat(cleaned); if (!isNaN(num) && num > 0) return String(num); return undefined; }