diff --git a/packages/scraper/src/scrapers/tenGtek.ts b/packages/scraper/src/scrapers/tenGtek.ts index b875c7b..6f4a3e1 100644 --- a/packages/scraper/src/scrapers/tenGtek.ts +++ b/packages/scraper/src/scrapers/tenGtek.ts @@ -1,29 +1,31 @@ /** * 10Gtek.com Scraper — Chinese OEM Transceiver Vendor * - * 10gtek.com is a direct competitor to FS.com at lower price points. - * Uses plain fetch (server-rendered HTML). - * Rate limited: 1 req/2sec. + * 10Gtek's main site (www.10gtek.com) only shows technical spec tables, no prices. + * Prices are available on their retail store: sfpcables.com (same company/brand). + * This scraper targets sfpcables.com which has both part numbers and USD prices. * - * Categories: SFP, SFP+, SFP28, QSFP+, QSFP28, QSFP-DD, OSFP + * Strategy: Paginate each category on sfpcables.com, extract Model + price per product. + * Rate limited: 1 req/2sec between pages. + * + * Categories: SFP, SFP+, SFP28, QSFP+, QSFP28, XFP */ import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db"; import { contentHash, parsePrice } from "../utils/hash"; -const BASE = "https://www.10gtek.com"; +const BASE = "https://www.sfpcables.com"; const HEADERS = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", Accept: "text/html,application/xhtml+xml", }; const CATEGORIES = [ - { path: "/sfp", formFactor: "SFP", speed: "1G", speedGbps: 1 }, - { path: "/10g-sfp+", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, - { path: "/sfp28", formFactor: "SFP28", speed: "25G", speedGbps: 25 }, - { path: "/qsfp", formFactor: "QSFP+", speed: "40G", speedGbps: 40 }, - { path: "/qsfp28", formFactor: "QSFP28", speed: "100G", speedGbps: 100 }, - { path: "/qsfpdd", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }, - { path: "/xfp", formFactor: "XFP", speed: "10G", speedGbps: 10 }, + { slug: "sfp-1-25g-series", formFactor: "SFP", speed: "1G", speedGbps: 1 }, + { slug: "sfp-transceivers", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, + { slug: "sfp28-transceivers", formFactor: "SFP28", speed: "25G", speedGbps: 25 }, + { slug: "qsfp-transceivers", formFactor: "QSFP+", speed: "40G", speedGbps: 40 }, + { slug: "100g-qsfp28-transceivers", formFactor: "QSFP28", speed: "100G", speedGbps: 100 }, + { slug: "xfp-transceivers", formFactor: "XFP", speed: "10G", speedGbps: 10 }, ]; interface Product { @@ -69,94 +71,94 @@ function detectReach(text: string): { label: string; meters: number } | undefine } function detectFiber(text: string): string { - if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text)) return "SMF"; + if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text)) + return "SMF"; if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(text)) return "MMF"; + if (/copper|rj.?45/i.test(text)) return "Copper"; return ""; } /** Strip HTML tags and decode common entities */ function stripHtml(s: string): string { - return s.replace(/<[^>]+>/g, "").replace(/&/g, "&").replace(/</g, "<") - .replace(/>/g, ">").replace(/ /g, " ").replace(/°/g, "°") - .replace(/&#\d+;/g, "").trim(); + return s + .replace(/<[^>]+>/g, "") + .replace(/&/g, "&") + .replace(/</g, "<") + .replace(/>/g, ">") + .replace(/ /g, " ") + .replace(/&#\d+;/g, "") + .trim(); } -function parseDistance(text: string): { label: string; meters: number } | undefined { - const km = text.match(/(\d+)\s*km/i); - if (km) return { label: `${km[1]}km`, meters: parseInt(km[1]) * 1000 }; - const m = text.match(/(\d+)\s*m\b/i); - if (m) return { label: `${m[1]}m`, meters: parseInt(m[1]) }; - return undefined; -} - -function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] { +/** + * Parse product listings from a sfpcables.com category page. + * + * HTML structure per product (Magento): + *
+ *

NAME

+ *
Model: PART_NUMBER
← most categories + * US$X.XX ← appears twice; first = listing price + *
+ * + * XFP exception: no Model: div; part number is in title after "|" + * e.g. "XFP Transceiver 10GBase-SR 850nm, 300M | XFP-10G-MM-SR" + */ +function parseProductsFromPage(html: string, cat: typeof CATEGORIES[number]): Product[] { const products: Product[] = []; - // 10Gtek uses HTML tables with columns: - // Part No. | Spec | Data Rate | Wavelength | Fiber Type | Distance | Optical Comp. | Tx Power | E.R | Rx Sens. | Temp. - // Extract all rows and parse cells - const rowRegex = /]*>([\s\S]*?)<\/tr>/gi; - let rowMatch; - while ((rowMatch = rowRegex.exec(html)) !== null) { - const rowHtml = rowMatch[1]; - // Extract all cell contents - const cellRegex = /]*>([\s\S]*?)<\/td>/gi; - const cells: string[] = []; - let cellMatch; - while ((cellMatch = cellRegex.exec(rowHtml)) !== null) { - cells.push(stripHtml(cellMatch[1])); + // Each product block starts at product-shop, ends before next product-shop or toolbar-bottom + const blockRegex = + /
([\s\S]*?)(?=
)/g; + let blockMatch: RegExpExecArray | null; + + while ((blockMatch = blockRegex.exec(html)) !== null) { + const block = blockMatch[1]; + + // Extract product URL and title from

+ const nameMatch = block.match(/

\s*]+href="([^"]+)"[^>]+title="([^"]+)"/); + if (!nameMatch) continue; + const productUrl = nameMatch[1].trim(); + const rawTitle = stripHtml(nameMatch[2]); + + // Extract part number from
Model: PART
+ const modelMatch = block.match(/
Model:\s*([^<]+)<\/div>/i); + let partNumber = modelMatch ? modelMatch[1].trim() : ""; + + // XFP fallback: parse title after "|" + if (!partNumber && rawTitle.includes("|")) { + const afterPipe = rawTitle.split("|").pop()?.trim() ?? ""; + if (afterPipe.length >= 3 && /^[A-Z0-9]/i.test(afterPipe)) { + partNumber = afterPipe.trim(); + } } - // Need at least 6 columns, first cell must look like a part number (starts with A or contains letters+digits) - if (cells.length < 6) continue; - const partNumber = cells[0]; + // Skip products without a usable part number if (!partNumber || partNumber.length < 3) continue; - // Skip header rows - if (/^Part\s*No/i.test(partNumber) || /^Spec/i.test(partNumber)) continue; - // Part numbers typically start with A (ASF, AXS, AXQ, AQS, etc.) or contain alphanumeric - if (!/^[A-Z][A-Z0-9]/i.test(partNumber)) continue; - const spec = cells[1] || ""; - const dataRate = cells[2] || ""; - const wavelength = cells.length >= 4 ? cells[3] : ""; - const fiberType = cells.length >= 5 ? cells[4] : ""; - const distance = cells.length >= 6 ? cells[5] : ""; - const txPower = cells.length >= 8 ? cells[7] : ""; + // Extract price — first occurrence of US$X.XX in the block + const priceMatch = block.match(/US\$([0-9]+(?:\.[0-9]{1,2})?)<\/span>/); + const price = priceMatch ? parseFloat(priceMatch[1]) : undefined; - // Build descriptive name - const name = `${partNumber} ${spec} ${dataRate}`.trim(); - const reach = parseDistance(distance) || detectReach(spec + " " + distance); - - // Determine fiber type from table cell or spec - let fiber = ""; - if (/SMF|single/i.test(fiberType)) fiber = "SMF"; - else if (/MMF|multi/i.test(fiberType)) fiber = "MMF"; - else if (/CAT|RJ|copper/i.test(fiberType)) fiber = "Copper"; - else fiber = detectFiber(spec); - - // Extract wavelength - const wl = wavelength.replace(/[^0-9]/g, ""); + // Detect reach and fiber type from product title + const reach = detectReach(rawTitle); + const fiber = detectFiber(rawTitle); products.push({ partNumber, - name, - url: `${BASE}${cat.path}#${partNumber}`, + name: rawTitle, + url: productUrl, + price, + currency: price !== undefined ? "USD" : undefined, formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps, reachLabel: reach?.label, reachMeters: reach?.meters, - fiberType: fiber, + fiberType: fiber || undefined, }); } - // Dedupe by part number - const seen = new Set(); - return products.filter((p) => { - if (seen.has(p.partNumber)) return false; - seen.add(p.partNumber); - return true; - }); + return products; } async function fetchPage(url: string): Promise { @@ -165,21 +167,72 @@ async function fetchPage(url: string): Promise { return resp.text(); } -export async function scrape10Gtek(): Promise { - console.log("=== 10Gtek Scraper Starting ===\n"); +/** + * Fetch all pages of a category, stopping when no new part numbers appear. + * Magento loops back to page 1 for out-of-range page numbers, so we detect + * this by checking for duplicate part numbers from previous pages. + */ +async function fetchAllPages(cat: typeof CATEGORIES[number]): Promise { + const MAX_PAGES = 30; + const allProducts: Product[] = []; + const seenPartNumbers = new Set(); - const vendorId = await ensureVendor("10Gtek", "compatible", "https://www.10gtek.com", "https://www.10gtek.com"); + for (let page = 1; page <= MAX_PAGES; page++) { + const url = page === 1 ? `${BASE}/${cat.slug}` : `${BASE}/${cat.slug}?p=${page}`; + + let html: string; + try { + html = await fetchPage(url); + } catch (err) { + console.error(` Fetch error on p${page}: ${(err as Error).message}`); + break; + } + + const pageProducts = parseProductsFromPage(html, cat); + + // Stop if no products found (truly empty page) + if (pageProducts.length === 0) break; + + // Count new (unseen) products — detects Magento catalog wrap-around + let newCount = 0; + for (const p of pageProducts) { + if (!seenPartNumbers.has(p.partNumber)) { + seenPartNumbers.add(p.partNumber); + allProducts.push(p); + newCount++; + } + } + + console.log(` p${page}: ${pageProducts.length} parsed, ${newCount} new`); + + // All products on this page already seen → we've looped back to start + if (newCount === 0) break; + + await sleep(2000); + } + + return allProducts; +} + +export async function scrape10Gtek(): Promise { + console.log("=== 10Gtek Scraper Starting (via sfpcables.com) ===\n"); + + const vendorId = await ensureVendor( + "10Gtek", + "compatible", + "https://www.10gtek.com", + "https://www.sfpcables.com" + ); let totalProducts = 0; let priceUpdates = 0; for (const cat of CATEGORIES) { - console.log(`\n--- ${cat.formFactor} (${cat.speed}) ---`); + console.log(`\n--- ${cat.formFactor} (${cat.speed}) [/${cat.slug}] ---`); try { - const html = await fetchPage(BASE + cat.path); - const catProducts = parseProductList(html, cat); - console.log(` Found ${catProducts.length} products`); + const catProducts = await fetchAllPages(cat); + console.log(` Fetched: ${catProducts.length} unique products`); for (const product of catProducts) { try { @@ -201,7 +254,7 @@ export async function scrape10Gtek(): Promise { transceiverId: txId, sourceVendorId: vendorId, price: product.price, - currency: product.currency || "USD", + currency: product.currency ?? "USD", stockLevel: "in_stock", url: product.url, contentHash: hash, @@ -211,14 +264,12 @@ export async function scrape10Gtek(): Promise { totalProducts++; } catch (err) { - console.warn(` Error: ${(err as Error).message.slice(0, 80)}`); + console.warn(` Error [${product.partNumber}]: ${(err as Error).message.slice(0, 80)}`); } } } catch (err) { console.error(` Category failed: ${(err as Error).message}`); } - - await sleep(2000); } console.log(`\n=== 10Gtek Complete: ${totalProducts} products, ${priceUpdates} prices ===`); @@ -227,5 +278,9 @@ export async function scrape10Gtek(): Promise { if (require.main === module) { scrape10Gtek() .then(() => pool.end()) - .catch((err) => { console.error("Fatal:", err); pool.end(); process.exit(1); }); + .catch((err) => { + console.error("Fatal:", err); + pool.end(); + process.exit(1); + }); }