fix: 10Gtek scraper now fetches prices from sfpcables.com

10gtek.com main site only exposes technical spec tables with no prices.
sfpcables.com is 10Gtek's own retail store and has both Model numbers
and USD prices in standard Magento product listings.

Changes:
- Switch scraping target from www.10gtek.com to sfpcables.com
- Parse "Model: <part>" and the "US$X.XX" price from each product block (Magento structure)
- XFP fallback: extract part number from title after '|' separator
- Add fetchAllPages(): paginate each category and stop when a page yields no new part numbers (Magento serves page 1 again for out-of-range page numbers)
- Remove QSFP-DD category (not available on sfpcables.com)
- Drop the old 10gtek.com spec-table parser (parseProductList, parseDistance), which yielded no prices

Verified: 10/10 SFP prices, 10/10 SFP+ prices, 4/4 XFP prices on live site.
This commit is contained in:
Rene Fichtmueller 2026-04-18 05:27:49 +02:00
parent 582965ecb5
commit eed599cc2c

View File

@ -1,29 +1,31 @@
/** /**
* 10Gtek.com Scraper Chinese OEM Transceiver Vendor * 10Gtek.com Scraper Chinese OEM Transceiver Vendor
* *
* 10gtek.com is a direct competitor to FS.com at lower price points. * 10Gtek's main site (www.10gtek.com) only shows technical spec tables, no prices.
* Uses plain fetch (server-rendered HTML). * Prices are available on their retail store: sfpcables.com (same company/brand).
* Rate limited: 1 req/2sec. * This scraper targets sfpcables.com which has both part numbers and USD prices.
* *
* Categories: SFP, SFP+, SFP28, QSFP+, QSFP28, QSFP-DD, OSFP * Strategy: Paginate each category on sfpcables.com, extract Model + price per product.
* Rate limited: 1 req/2sec between pages.
*
* Categories: SFP, SFP+, SFP28, QSFP+, QSFP28, XFP
*/ */
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db"; import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
import { contentHash, parsePrice } from "../utils/hash"; import { contentHash, parsePrice } from "../utils/hash";
const BASE = "https://www.10gtek.com"; const BASE = "https://www.sfpcables.com";
const HEADERS = { const HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
Accept: "text/html,application/xhtml+xml", Accept: "text/html,application/xhtml+xml",
}; };
const CATEGORIES = [ const CATEGORIES = [
{ path: "/sfp", formFactor: "SFP", speed: "1G", speedGbps: 1 }, { slug: "sfp-1-25g-series", formFactor: "SFP", speed: "1G", speedGbps: 1 },
{ path: "/10g-sfp+", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, { slug: "sfp-transceivers", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
{ path: "/sfp28", formFactor: "SFP28", speed: "25G", speedGbps: 25 }, { slug: "sfp28-transceivers", formFactor: "SFP28", speed: "25G", speedGbps: 25 },
{ path: "/qsfp", formFactor: "QSFP+", speed: "40G", speedGbps: 40 }, { slug: "qsfp-transceivers", formFactor: "QSFP+", speed: "40G", speedGbps: 40 },
{ path: "/qsfp28", formFactor: "QSFP28", speed: "100G", speedGbps: 100 }, { slug: "100g-qsfp28-transceivers", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
{ path: "/qsfpdd", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }, { slug: "xfp-transceivers", formFactor: "XFP", speed: "10G", speedGbps: 10 },
{ path: "/xfp", formFactor: "XFP", speed: "10G", speedGbps: 10 },
]; ];
interface Product { interface Product {
@ -69,94 +71,94 @@ function detectReach(text: string): { label: string; meters: number } | undefine
} }
function detectFiber(text: string): string { function detectFiber(text: string): string {
if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text)) return "SMF"; if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text))
return "SMF";
if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(text)) return "MMF"; if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(text)) return "MMF";
if (/copper|rj.?45/i.test(text)) return "Copper";
return ""; return "";
} }
/** Strip HTML tags and decode common entities */ /** Strip HTML tags and decode common entities */
function stripHtml(s: string): string { function stripHtml(s: string): string {
return s.replace(/<[^>]+>/g, "").replace(/&amp;/g, "&").replace(/&lt;/g, "<") return s
.replace(/&gt;/g, ">").replace(/&nbsp;/g, " ").replace(/&deg;/g, "°") .replace(/<[^>]+>/g, "")
.replace(/&#\d+;/g, "").trim(); .replace(/&amp;/g, "&")
.replace(/&lt;/g, "<")
.replace(/&gt;/g, ">")
.replace(/&nbsp;/g, " ")
.replace(/&#\d+;/g, "")
.trim();
} }
function parseDistance(text: string): { label: string; meters: number } | undefined { /**
const km = text.match(/(\d+)\s*km/i); * Parse product listings from a sfpcables.com category page.
if (km) return { label: `${km[1]}km`, meters: parseInt(km[1]) * 1000 }; *
const m = text.match(/(\d+)\s*m\b/i); * HTML structure per product (Magento):
if (m) return { label: `${m[1]}m`, meters: parseInt(m[1]) }; * <div class="product-shop">
return undefined; * <h2 class="product-name"><a href="URL" title="NAME">NAME</a></h2>
} * <div>Model: PART_NUMBER</div> most categories
* <span class="price">US$X.XX</span> appears twice; first = listing price
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] { * </div>
*
* XFP exception: no Model: div; part number is in title after "|"
* e.g. "XFP Transceiver 10GBase-SR 850nm, 300M | XFP-10G-MM-SR"
*/
function parseProductsFromPage(html: string, cat: typeof CATEGORIES[number]): Product[] {
const products: Product[] = []; const products: Product[] = [];
// 10Gtek uses HTML tables with columns: // Each product block starts at product-shop, ends before next product-shop or toolbar-bottom
// Part No. | Spec | Data Rate | Wavelength | Fiber Type | Distance | Optical Comp. | Tx Power | E.R | Rx Sens. | Temp. const blockRegex =
// Extract all <tr> rows and parse cells /<div class="product-shop">([\s\S]*?)(?=<div class="product-shop"|<div class="toolbar-bottom">)/g;
const rowRegex = /<tr[^>]*>([\s\S]*?)<\/tr>/gi; let blockMatch: RegExpExecArray | null;
let rowMatch;
while ((rowMatch = rowRegex.exec(html)) !== null) { while ((blockMatch = blockRegex.exec(html)) !== null) {
const rowHtml = rowMatch[1]; const block = blockMatch[1];
// Extract all <td> cell contents
const cellRegex = /<td[^>]*>([\s\S]*?)<\/td>/gi; // Extract product URL and title from <h2 class="product-name"><a href="..." title="...">
const cells: string[] = []; const nameMatch = block.match(/<h2 class="product-name">\s*<a[^>]+href="([^"]+)"[^>]+title="([^"]+)"/);
let cellMatch; if (!nameMatch) continue;
while ((cellMatch = cellRegex.exec(rowHtml)) !== null) { const productUrl = nameMatch[1].trim();
cells.push(stripHtml(cellMatch[1])); const rawTitle = stripHtml(nameMatch[2]);
// Extract part number from <div>Model: PART</div>
const modelMatch = block.match(/<div>Model:\s*([^<]+)<\/div>/i);
let partNumber = modelMatch ? modelMatch[1].trim() : "";
// XFP fallback: parse title after "|"
if (!partNumber && rawTitle.includes("|")) {
const afterPipe = rawTitle.split("|").pop()?.trim() ?? "";
if (afterPipe.length >= 3 && /^[A-Z0-9]/i.test(afterPipe)) {
partNumber = afterPipe.trim();
}
} }
// Need at least 6 columns, first cell must look like a part number (starts with A or contains letters+digits) // Skip products without a usable part number
if (cells.length < 6) continue;
const partNumber = cells[0];
if (!partNumber || partNumber.length < 3) continue; if (!partNumber || partNumber.length < 3) continue;
// Skip header rows
if (/^Part\s*No/i.test(partNumber) || /^Spec/i.test(partNumber)) continue;
// Part numbers typically start with A (ASF, AXS, AXQ, AQS, etc.) or contain alphanumeric
if (!/^[A-Z][A-Z0-9]/i.test(partNumber)) continue;
const spec = cells[1] || ""; // Extract price — first occurrence of US$X.XX in the block
const dataRate = cells[2] || ""; const priceMatch = block.match(/<span class="price">US\$([0-9]+(?:\.[0-9]{1,2})?)<\/span>/);
const wavelength = cells.length >= 4 ? cells[3] : ""; const price = priceMatch ? parseFloat(priceMatch[1]) : undefined;
const fiberType = cells.length >= 5 ? cells[4] : "";
const distance = cells.length >= 6 ? cells[5] : "";
const txPower = cells.length >= 8 ? cells[7] : "";
// Build descriptive name // Detect reach and fiber type from product title
const name = `${partNumber} ${spec} ${dataRate}`.trim(); const reach = detectReach(rawTitle);
const reach = parseDistance(distance) || detectReach(spec + " " + distance); const fiber = detectFiber(rawTitle);
// Determine fiber type from table cell or spec
let fiber = "";
if (/SMF|single/i.test(fiberType)) fiber = "SMF";
else if (/MMF|multi/i.test(fiberType)) fiber = "MMF";
else if (/CAT|RJ|copper/i.test(fiberType)) fiber = "Copper";
else fiber = detectFiber(spec);
// Extract wavelength
const wl = wavelength.replace(/[^0-9]/g, "");
products.push({ products.push({
partNumber, partNumber,
name, name: rawTitle,
url: `${BASE}${cat.path}#${partNumber}`, url: productUrl,
price,
currency: price !== undefined ? "USD" : undefined,
formFactor: cat.formFactor, formFactor: cat.formFactor,
speed: cat.speed, speed: cat.speed,
speedGbps: cat.speedGbps, speedGbps: cat.speedGbps,
reachLabel: reach?.label, reachLabel: reach?.label,
reachMeters: reach?.meters, reachMeters: reach?.meters,
fiberType: fiber, fiberType: fiber || undefined,
}); });
} }
// Dedupe by part number return products;
const seen = new Set<string>();
return products.filter((p) => {
if (seen.has(p.partNumber)) return false;
seen.add(p.partNumber);
return true;
});
} }
async function fetchPage(url: string): Promise<string> { async function fetchPage(url: string): Promise<string> {
@ -165,21 +167,72 @@ async function fetchPage(url: string): Promise<string> {
return resp.text(); return resp.text();
} }
export async function scrape10Gtek(): Promise<void> { /**
console.log("=== 10Gtek Scraper Starting ===\n"); * Fetch all pages of a category, stopping when no new part numbers appear.
* Magento loops back to page 1 for out-of-range page numbers, so we detect
* this by checking for duplicate part numbers from previous pages.
*/
async function fetchAllPages(cat: typeof CATEGORIES[number]): Promise<Product[]> {
const MAX_PAGES = 30;
const allProducts: Product[] = [];
const seenPartNumbers = new Set<string>();
const vendorId = await ensureVendor("10Gtek", "compatible", "https://www.10gtek.com", "https://www.10gtek.com"); for (let page = 1; page <= MAX_PAGES; page++) {
const url = page === 1 ? `${BASE}/${cat.slug}` : `${BASE}/${cat.slug}?p=${page}`;
let html: string;
try {
html = await fetchPage(url);
} catch (err) {
console.error(` Fetch error on p${page}: ${(err as Error).message}`);
break;
}
const pageProducts = parseProductsFromPage(html, cat);
// Stop if no products found (truly empty page)
if (pageProducts.length === 0) break;
// Count new (unseen) products — detects Magento catalog wrap-around
let newCount = 0;
for (const p of pageProducts) {
if (!seenPartNumbers.has(p.partNumber)) {
seenPartNumbers.add(p.partNumber);
allProducts.push(p);
newCount++;
}
}
console.log(` p${page}: ${pageProducts.length} parsed, ${newCount} new`);
// All products on this page already seen → we've looped back to start
if (newCount === 0) break;
await sleep(2000);
}
return allProducts;
}
export async function scrape10Gtek(): Promise<void> {
console.log("=== 10Gtek Scraper Starting (via sfpcables.com) ===\n");
const vendorId = await ensureVendor(
"10Gtek",
"compatible",
"https://www.10gtek.com",
"https://www.sfpcables.com"
);
let totalProducts = 0; let totalProducts = 0;
let priceUpdates = 0; let priceUpdates = 0;
for (const cat of CATEGORIES) { for (const cat of CATEGORIES) {
console.log(`\n--- ${cat.formFactor} (${cat.speed}) ---`); console.log(`\n--- ${cat.formFactor} (${cat.speed}) [/${cat.slug}] ---`);
try { try {
const html = await fetchPage(BASE + cat.path); const catProducts = await fetchAllPages(cat);
const catProducts = parseProductList(html, cat); console.log(` Fetched: ${catProducts.length} unique products`);
console.log(` Found ${catProducts.length} products`);
for (const product of catProducts) { for (const product of catProducts) {
try { try {
@ -201,7 +254,7 @@ export async function scrape10Gtek(): Promise<void> {
transceiverId: txId, transceiverId: txId,
sourceVendorId: vendorId, sourceVendorId: vendorId,
price: product.price, price: product.price,
currency: product.currency || "USD", currency: product.currency ?? "USD",
stockLevel: "in_stock", stockLevel: "in_stock",
url: product.url, url: product.url,
contentHash: hash, contentHash: hash,
@ -211,14 +264,12 @@ export async function scrape10Gtek(): Promise<void> {
totalProducts++; totalProducts++;
} catch (err) { } catch (err) {
console.warn(` Error: ${(err as Error).message.slice(0, 80)}`); console.warn(` Error [${product.partNumber}]: ${(err as Error).message.slice(0, 80)}`);
} }
} }
} catch (err) { } catch (err) {
console.error(` Category failed: ${(err as Error).message}`); console.error(` Category failed: ${(err as Error).message}`);
} }
await sleep(2000);
} }
console.log(`\n=== 10Gtek Complete: ${totalProducts} products, ${priceUpdates} prices ===`); console.log(`\n=== 10Gtek Complete: ${totalProducts} products, ${priceUpdates} prices ===`);
@ -227,5 +278,9 @@ export async function scrape10Gtek(): Promise<void> {
if (require.main === module) { if (require.main === module) {
scrape10Gtek() scrape10Gtek()
.then(() => pool.end()) .then(() => pool.end())
.catch((err) => { console.error("Fatal:", err); pool.end(); process.exit(1); }); .catch((err) => {
console.error("Fatal:", err);
pool.end();
process.exit(1);
});
} }