From eed599cc2cb2afcab2d76dc6c6e417fe533cf8ba Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sat, 18 Apr 2026 05:27:49 +0200 Subject: [PATCH] fix: 10Gtek scraper now fetches prices from sfpcables.com 10gtek.com main site only exposes technical spec tables with no prices. sfpcables.com is 10Gtek's own retail store and has both Model numbers and USD prices in standard Magento product listings. Changes: - Switch scraping target from www.10gtek.com to sfpcables.com - Parse Model: + US.XX per product block (Magento structure) - XFP fallback: extract part number from title after '|' separator - Add fetchAllPages() with Magento loop-detection via seen-part dedup - Remove QSFP-DD category (not available on sfpcables.com) - Drop XFP-less categories from old 10gtek.com spec-table parser Verified: 10/10 SFP prices, 10/10 SFP+ prices, 4/4 XFP prices on live site. --- packages/scraper/src/scrapers/tenGtek.ts | 227 ++++++++++++++--------- 1 file changed, 141 insertions(+), 86 deletions(-) diff --git a/packages/scraper/src/scrapers/tenGtek.ts b/packages/scraper/src/scrapers/tenGtek.ts index b875c7b..6f4a3e1 100644 --- a/packages/scraper/src/scrapers/tenGtek.ts +++ b/packages/scraper/src/scrapers/tenGtek.ts @@ -1,29 +1,31 @@ /** * 10Gtek.com Scraper — Chinese OEM Transceiver Vendor * - * 10gtek.com is a direct competitor to FS.com at lower price points. - * Uses plain fetch (server-rendered HTML). - * Rate limited: 1 req/2sec. + * 10Gtek's main site (www.10gtek.com) only shows technical spec tables, no prices. + * Prices are available on their retail store: sfpcables.com (same company/brand). + * This scraper targets sfpcables.com which has both part numbers and USD prices. * - * Categories: SFP, SFP+, SFP28, QSFP+, QSFP28, QSFP-DD, OSFP + * Strategy: Paginate each category on sfpcables.com, extract Model + price per product. + * Rate limited: 1 req/2sec between pages. + * + * Categories: SFP, SFP+, SFP28, QSFP+, QSFP28, XFP */ import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db"; import { contentHash, parsePrice } from "../utils/hash"; -const BASE = "https://www.10gtek.com"; +const BASE = "https://www.sfpcables.com"; const HEADERS = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", Accept: "text/html,application/xhtml+xml", }; const CATEGORIES = [ - { path: "/sfp", formFactor: "SFP", speed: "1G", speedGbps: 1 }, - { path: "/10g-sfp+", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, - { path: "/sfp28", formFactor: "SFP28", speed: "25G", speedGbps: 25 }, - { path: "/qsfp", formFactor: "QSFP+", speed: "40G", speedGbps: 40 }, - { path: "/qsfp28", formFactor: "QSFP28", speed: "100G", speedGbps: 100 }, - { path: "/qsfpdd", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }, - { path: "/xfp", formFactor: "XFP", speed: "10G", speedGbps: 10 }, + { slug: "sfp-1-25g-series", formFactor: "SFP", speed: "1G", speedGbps: 1 }, + { slug: "sfp-transceivers", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, + { slug: "sfp28-transceivers", formFactor: "SFP28", speed: "25G", speedGbps: 25 }, + { slug: "qsfp-transceivers", formFactor: "QSFP+", speed: "40G", speedGbps: 40 }, + { slug: "100g-qsfp28-transceivers", formFactor: "QSFP28", speed: "100G", speedGbps: 100 }, + { slug: "xfp-transceivers", formFactor: "XFP", speed: "10G", speedGbps: 10 }, ]; interface Product { @@ -69,94 +71,94 @@ function detectReach(text: string): { label: string; meters: number } | undefine } function detectFiber(text: string): string { - if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text)) return "SMF"; + if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text)) + return "SMF"; if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(text)) return "MMF"; + if (/copper|rj.?45/i.test(text)) return "Copper"; return ""; } /** Strip HTML tags and decode common entities */ function stripHtml(s: string): string { - return s.replace(/<[^>]+>/g, "").replace(/&/g, "&").replace(/</g, "<") - .replace(/>/g, ">").replace(/ /g, " ").replace(/°/g, "°") - .replace(/&#\d+;/g, "").trim(); + return s + .replace(/<[^>]+>/g, "") + .replace(/&/g, "&") + .replace(/</g, "<") + .replace(/>/g, ">") + .replace(/ /g, " ") + .replace(/&#\d+;/g, "") + .trim(); } -function parseDistance(text: string): { label: string; meters: number } | undefined { - const km = text.match(/(\d+)\s*km/i); - if (km) return { label: `${km[1]}km`, meters: parseInt(km[1]) * 1000 }; - const m = text.match(/(\d+)\s*m\b/i); - if (m) return { label: `${m[1]}m`, meters: parseInt(m[1]) }; - return undefined; -} - -function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] { +/** + * Parse product listings from a sfpcables.com category page. + * + * HTML structure per product (Magento): + *
+ *

NAME

+ *
Model: PART_NUMBER
← most categories + * US$X.XX ← appears twice; first = listing price + *
+ * + * XFP exception: no Model: div; part number is in title after "|" + * e.g. "XFP Transceiver 10GBase-SR 850nm, 300M | XFP-10G-MM-SR" + */ +function parseProductsFromPage(html: string, cat: typeof CATEGORIES[number]): Product[] { const products: Product[] = []; - // 10Gtek uses HTML tables with columns: - // Part No. | Spec | Data Rate | Wavelength | Fiber Type | Distance | Optical Comp. | Tx Power | E.R | Rx Sens. | Temp. - // Extract all rows and parse cells - const rowRegex = /]*>([\s\S]*?)<\/tr>/gi; - let rowMatch; - while ((rowMatch = rowRegex.exec(html)) !== null) { - const rowHtml = rowMatch[1]; - // Extract all cell contents - const cellRegex = /]*>([\s\S]*?)<\/td>/gi; - const cells: string[] = []; - let cellMatch; - while ((cellMatch = cellRegex.exec(rowHtml)) !== null) { - cells.push(stripHtml(cellMatch[1])); + // Each product block starts at product-shop, ends before next product-shop or toolbar-bottom + const blockRegex = + /
([\s\S]*?)(?=
)/g; + let blockMatch: RegExpExecArray | null; + + while ((blockMatch = blockRegex.exec(html)) !== null) { + const block = blockMatch[1]; + + // Extract product URL and title from

+ const nameMatch = block.match(/

\s*]+href="([^"]+)"[^>]+title="([^"]+)"/); + if (!nameMatch) continue; + const productUrl = nameMatch[1].trim(); + const rawTitle = stripHtml(nameMatch[2]); + + // Extract part number from
Model: PART
+ const modelMatch = block.match(/
Model:\s*([^<]+)<\/div>/i); + let partNumber = modelMatch ? modelMatch[1].trim() : ""; + + // XFP fallback: parse title after "|" + if (!partNumber && rawTitle.includes("|")) { + const afterPipe = rawTitle.split("|").pop()?.trim() ?? ""; + if (afterPipe.length >= 3 && /^[A-Z0-9]/i.test(afterPipe)) { + partNumber = afterPipe.trim(); + } } - // Need at least 6 columns, first cell must look like a part number (starts with A or contains letters+digits) - if (cells.length < 6) continue; - const partNumber = cells[0]; + // Skip products without a usable part number if (!partNumber || partNumber.length < 3) continue; - // Skip header rows - if (/^Part\s*No/i.test(partNumber) || /^Spec/i.test(partNumber)) continue; - // Part numbers typically start with A (ASF, AXS, AXQ, AQS, etc.) or contain alphanumeric - if (!/^[A-Z][A-Z0-9]/i.test(partNumber)) continue; - const spec = cells[1] || ""; - const dataRate = cells[2] || ""; - const wavelength = cells.length >= 4 ? cells[3] : ""; - const fiberType = cells.length >= 5 ? cells[4] : ""; - const distance = cells.length >= 6 ? cells[5] : ""; - const txPower = cells.length >= 8 ? cells[7] : ""; + // Extract price — first occurrence of US$X.XX in the block + const priceMatch = block.match(/US\$([0-9]+(?:\.[0-9]{1,2})?)<\/span>/); + const price = priceMatch ? parseFloat(priceMatch[1]) : undefined; - // Build descriptive name - const name = `${partNumber} ${spec} ${dataRate}`.trim(); - const reach = parseDistance(distance) || detectReach(spec + " " + distance); - - // Determine fiber type from table cell or spec - let fiber = ""; - if (/SMF|single/i.test(fiberType)) fiber = "SMF"; - else if (/MMF|multi/i.test(fiberType)) fiber = "MMF"; - else if (/CAT|RJ|copper/i.test(fiberType)) fiber = "Copper"; - else fiber = detectFiber(spec); - - // Extract wavelength - const wl = wavelength.replace(/[^0-9]/g, ""); + // Detect reach and fiber type from product title + const reach = detectReach(rawTitle); + const fiber = detectFiber(rawTitle); products.push({ partNumber, - name, - url: `${BASE}${cat.path}#${partNumber}`, + name: rawTitle, + url: productUrl, + price, + currency: price !== undefined ? "USD" : undefined, formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps, reachLabel: reach?.label, reachMeters: reach?.meters, - fiberType: fiber, + fiberType: fiber || undefined, }); } - // Dedupe by part number - const seen = new Set(); - return products.filter((p) => { - if (seen.has(p.partNumber)) return false; - seen.add(p.partNumber); - return true; - }); + return products; } async function fetchPage(url: string): Promise { @@ -165,21 +167,72 @@ async function fetchPage(url: string): Promise { return resp.text(); } -export async function scrape10Gtek(): Promise { - console.log("=== 10Gtek Scraper Starting ===\n"); +/** + * Fetch all pages of a category, stopping when no new part numbers appear. + * Magento loops back to page 1 for out-of-range page numbers, so we detect + * this by checking for duplicate part numbers from previous pages. + */ +async function fetchAllPages(cat: typeof CATEGORIES[number]): Promise { + const MAX_PAGES = 30; + const allProducts: Product[] = []; + const seenPartNumbers = new Set(); - const vendorId = await ensureVendor("10Gtek", "compatible", "https://www.10gtek.com", "https://www.10gtek.com"); + for (let page = 1; page <= MAX_PAGES; page++) { + const url = page === 1 ? `${BASE}/${cat.slug}` : `${BASE}/${cat.slug}?p=${page}`; + + let html: string; + try { + html = await fetchPage(url); + } catch (err) { + console.error(` Fetch error on p${page}: ${(err as Error).message}`); + break; + } + + const pageProducts = parseProductsFromPage(html, cat); + + // Stop if no products found (truly empty page) + if (pageProducts.length === 0) break; + + // Count new (unseen) products — detects Magento catalog wrap-around + let newCount = 0; + for (const p of pageProducts) { + if (!seenPartNumbers.has(p.partNumber)) { + seenPartNumbers.add(p.partNumber); + allProducts.push(p); + newCount++; + } + } + + console.log(` p${page}: ${pageProducts.length} parsed, ${newCount} new`); + + // All products on this page already seen → we've looped back to start + if (newCount === 0) break; + + await sleep(2000); + } + + return allProducts; +} + +export async function scrape10Gtek(): Promise { + console.log("=== 10Gtek Scraper Starting (via sfpcables.com) ===\n"); + + const vendorId = await ensureVendor( + "10Gtek", + "compatible", + "https://www.10gtek.com", + "https://www.sfpcables.com" + ); let totalProducts = 0; let priceUpdates = 0; for (const cat of CATEGORIES) { - console.log(`\n--- ${cat.formFactor} (${cat.speed}) ---`); + console.log(`\n--- ${cat.formFactor} (${cat.speed}) [/${cat.slug}] ---`); try { - const html = await fetchPage(BASE + cat.path); - const catProducts = parseProductList(html, cat); - console.log(` Found ${catProducts.length} products`); + const catProducts = await fetchAllPages(cat); + console.log(` Fetched: ${catProducts.length} unique products`); for (const product of catProducts) { try { @@ -201,7 +254,7 @@ export async function scrape10Gtek(): Promise { transceiverId: txId, sourceVendorId: vendorId, price: product.price, - currency: product.currency || "USD", + currency: product.currency ?? "USD", stockLevel: "in_stock", url: product.url, contentHash: hash, @@ -211,14 +264,12 @@ export async function scrape10Gtek(): Promise { totalProducts++; } catch (err) { - console.warn(` Error: ${(err as Error).message.slice(0, 80)}`); + console.warn(` Error [${product.partNumber}]: ${(err as Error).message.slice(0, 80)}`); } } } catch (err) { console.error(` Category failed: ${(err as Error).message}`); } - - await sleep(2000); } console.log(`\n=== 10Gtek Complete: ${totalProducts} products, ${priceUpdates} prices ===`); @@ -227,5 +278,9 @@ export async function scrape10Gtek(): Promise { if (require.main === module) { scrape10Gtek() .then(() => pool.end()) - .catch((err) => { console.error("Fatal:", err); pool.end(); process.exit(1); }); + .catch((err) => { + console.error("Fatal:", err); + pool.end(); + process.exit(1); + }); }