fix: 10Gtek scraper now fetches prices from sfpcables.com

10gtek.com main site only exposes technical spec tables with no prices.
sfpcables.com is 10Gtek's own retail store and has both Model numbers
and USD prices in standard Magento product listings.

Changes:
- Switch scraping target from www.10gtek.com to sfpcables.com
- Parse "Model: <part>" and the "US$X.XX" price from each product block (Magento structure)
- XFP fallback: extract part number from title after '|' separator
- Add fetchAllPages() with Magento loop-detection via seen-part dedup
- Remove QSFP-DD category (not available on sfpcables.com)
- Drop XFP-less categories from old 10gtek.com spec-table parser

Verified: 10/10 SFP prices, 10/10 SFP+ prices, 4/4 XFP prices on live site.
This commit is contained in:
Rene Fichtmueller 2026-04-18 05:27:49 +02:00
parent 2a6ec90ecd
commit fcdd258369

View File

@ -1,29 +1,31 @@
/**
* 10Gtek.com Scraper — Chinese OEM Transceiver Vendor
*
* 10gtek.com is a direct competitor to FS.com at lower price points.
* Uses plain fetch (server-rendered HTML).
* Rate limited: 1 req/2sec.
* 10Gtek's main site (www.10gtek.com) only shows technical spec tables, no prices.
* Prices are available on their retail store: sfpcables.com (same company/brand).
* This scraper targets sfpcables.com which has both part numbers and USD prices.
*
* Categories: SFP, SFP+, SFP28, QSFP+, QSFP28, QSFP-DD, OSFP
* Strategy: Paginate each category on sfpcables.com, extract Model + price per product.
* Rate limited: 1 req/2sec between pages.
*
* Categories: SFP, SFP+, SFP28, QSFP+, QSFP28, XFP
*/
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
import { contentHash, parsePrice } from "../utils/hash";
const BASE = "https://www.10gtek.com";
const BASE = "https://www.sfpcables.com";
const HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
Accept: "text/html,application/xhtml+xml",
};
const CATEGORIES = [
{ path: "/sfp", formFactor: "SFP", speed: "1G", speedGbps: 1 },
{ path: "/10g-sfp+", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
{ path: "/sfp28", formFactor: "SFP28", speed: "25G", speedGbps: 25 },
{ path: "/qsfp", formFactor: "QSFP+", speed: "40G", speedGbps: 40 },
{ path: "/qsfp28", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
{ path: "/qsfpdd", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
{ path: "/xfp", formFactor: "XFP", speed: "10G", speedGbps: 10 },
{ slug: "sfp-1-25g-series", formFactor: "SFP", speed: "1G", speedGbps: 1 },
{ slug: "sfp-transceivers", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
{ slug: "sfp28-transceivers", formFactor: "SFP28", speed: "25G", speedGbps: 25 },
{ slug: "qsfp-transceivers", formFactor: "QSFP+", speed: "40G", speedGbps: 40 },
{ slug: "100g-qsfp28-transceivers", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
{ slug: "xfp-transceivers", formFactor: "XFP", speed: "10G", speedGbps: 10 },
];
interface Product {
@ -69,94 +71,94 @@ function detectReach(text: string): { label: string; meters: number } | undefine
}
/**
 * Classify the fiber/media type implied by a free-text product description.
 *
 * Checks run in priority order (single-mode, then multi-mode, then copper) and
 * the first match wins. Matching is case-insensitive.
 *
 * The two-letter optic codes (LX, LR, ER, ZR, SX, SR) only count when they are
 * not embedded in a longer word. Lookarounds are used instead of consuming a
 * neighbouring character so that a code at the very start or end of the text
 * (e.g. a title ending in "-SR") is still recognised.
 *
 * @param text Product title or spec text to inspect.
 * @returns "SMF", "MMF", "Copper", or "" when no keyword matches.
 */
function detectFiber(text: string): string {
  if (/single.?mode|smf|(?<![a-z])(?:lx|lr|er|zr)(?![a-z])|bidi|cwdm|dwdm/i.test(text)) {
    return "SMF";
  }
  if (/multi.?mode|mmf|(?<![a-z])(?:sx|sr)(?![a-z])/i.test(text)) return "MMF";
  if (/copper|rj.?45/i.test(text)) return "Copper";
  return "";
}
/**
 * Strip HTML tags and decode common character entities.
 *
 * Tags are removed first, then entities are decoded in a single pass so that an
 * already-escaped sequence such as "&amp;lt;" becomes "&lt;" and is not decoded
 * a second time into "<". Numeric entities (decimal "&#124;" and hex "&#x7C;")
 * are decoded to the character they encode; a numeric entity outside the
 * Unicode range is dropped. Unrecognised named entities are left untouched.
 *
 * @param s Raw HTML fragment or attribute value.
 * @returns Plain text with surrounding whitespace trimmed.
 */
function stripHtml(s: string): string {
  const named: Record<string, string> = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    nbsp: " ",
    deg: "°",
  };
  return s
    .replace(/<[^>]+>/g, "")
    .replace(/&(amp|lt|gt|quot|nbsp|deg|#\d+|#[xX][0-9a-fA-F]+);/g, (_match, entity: string) => {
      if (entity[0] !== "#") return named[entity] ?? "";
      const isHex = entity[1] === "x" || entity[1] === "X";
      const codePoint = parseInt(entity.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // String.fromCodePoint throws a RangeError above U+10FFFF; drop such entities instead.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : "";
    })
    .trim();
}
/**
 * Extract a reach distance from free text such as "10km", "0.5 km" or "300M".
 *
 * A kilometre value takes precedence over a metre value when both appear.
 * Decimal values are accepted; the original integer-only pattern read "0.5km"
 * as 5 km because it only captured the digits after the decimal point.
 * The metre count is rounded to a whole number so callers keep receiving integers.
 *
 * @param text Text to search (case-insensitive).
 * @returns The distance as a display label (number as written plus "km" or "m")
 *          and its length in metres, or undefined when no distance is found.
 */
function parseDistance(text: string): { label: string; meters: number } | undefined {
  const km = text.match(/(\d+(?:\.\d+)?)\s*km/i);
  if (km) return { label: `${km[1]}km`, meters: Math.round(parseFloat(km[1]) * 1000) };
  // \b keeps "m" from matching the start of a longer unit or word.
  const m = text.match(/(\d+(?:\.\d+)?)\s*m\b/i);
  if (m) return { label: `${m[1]}m`, meters: Math.round(parseFloat(m[1])) };
  return undefined;
}
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
/**
* Parse product listings from a sfpcables.com category page.
*
* HTML structure per product (Magento):
* <div class="product-shop">
* <h2 class="product-name"><a href="URL" title="NAME">NAME</a></h2>
* <div>Model: PART_NUMBER</div> — present in most categories
* <span class="price">US$X.XX</span> — appears twice; the first is the listing price
* </div>
*
* XFP exception: no Model: div; part number is in title after "|"
* e.g. "XFP Transceiver 10GBase-SR 850nm, 300M | XFP-10G-MM-SR"
*/
function parseProductsFromPage(html: string, cat: typeof CATEGORIES[number]): Product[] {
const products: Product[] = [];
// 10Gtek uses HTML tables with columns:
// Part No. | Spec | Data Rate | Wavelength | Fiber Type | Distance | Optical Comp. | Tx Power | E.R | Rx Sens. | Temp.
// Extract all <tr> rows and parse cells
const rowRegex = /<tr[^>]*>([\s\S]*?)<\/tr>/gi;
let rowMatch;
while ((rowMatch = rowRegex.exec(html)) !== null) {
const rowHtml = rowMatch[1];
// Extract all <td> cell contents
const cellRegex = /<td[^>]*>([\s\S]*?)<\/td>/gi;
const cells: string[] = [];
let cellMatch;
while ((cellMatch = cellRegex.exec(rowHtml)) !== null) {
cells.push(stripHtml(cellMatch[1]));
// Each product block starts at product-shop, ends before next product-shop or toolbar-bottom
const blockRegex =
/<div class="product-shop">([\s\S]*?)(?=<div class="product-shop"|<div class="toolbar-bottom">)/g;
let blockMatch: RegExpExecArray | null;
while ((blockMatch = blockRegex.exec(html)) !== null) {
const block = blockMatch[1];
// Extract product URL and title from <h2 class="product-name"><a href="..." title="...">
const nameMatch = block.match(/<h2 class="product-name">\s*<a[^>]+href="([^"]+)"[^>]+title="([^"]+)"/);
if (!nameMatch) continue;
const productUrl = nameMatch[1].trim();
const rawTitle = stripHtml(nameMatch[2]);
// Extract part number from <div>Model: PART</div>
const modelMatch = block.match(/<div>Model:\s*([^<]+)<\/div>/i);
let partNumber = modelMatch ? modelMatch[1].trim() : "";
// XFP fallback: parse title after "|"
if (!partNumber && rawTitle.includes("|")) {
const afterPipe = rawTitle.split("|").pop()?.trim() ?? "";
if (afterPipe.length >= 3 && /^[A-Z0-9]/i.test(afterPipe)) {
partNumber = afterPipe.trim();
}
}
// Need at least 6 columns, first cell must look like a part number (starts with A or contains letters+digits)
if (cells.length < 6) continue;
const partNumber = cells[0];
// Skip products without a usable part number
if (!partNumber || partNumber.length < 3) continue;
// Skip header rows
if (/^Part\s*No/i.test(partNumber) || /^Spec/i.test(partNumber)) continue;
// Part numbers typically start with A (ASF, AXS, AXQ, AQS, etc.) or contain alphanumeric
if (!/^[A-Z][A-Z0-9]/i.test(partNumber)) continue;
const spec = cells[1] || "";
const dataRate = cells[2] || "";
const wavelength = cells.length >= 4 ? cells[3] : "";
const fiberType = cells.length >= 5 ? cells[4] : "";
const distance = cells.length >= 6 ? cells[5] : "";
const txPower = cells.length >= 8 ? cells[7] : "";
// Extract price — first occurrence of US$X.XX in the block
const priceMatch = block.match(/<span class="price">US\$([0-9]+(?:\.[0-9]{1,2})?)<\/span>/);
const price = priceMatch ? parseFloat(priceMatch[1]) : undefined;
// Build descriptive name
const name = `${partNumber} ${spec} ${dataRate}`.trim();
const reach = parseDistance(distance) || detectReach(spec + " " + distance);
// Determine fiber type from table cell or spec
let fiber = "";
if (/SMF|single/i.test(fiberType)) fiber = "SMF";
else if (/MMF|multi/i.test(fiberType)) fiber = "MMF";
else if (/CAT|RJ|copper/i.test(fiberType)) fiber = "Copper";
else fiber = detectFiber(spec);
// Extract wavelength
const wl = wavelength.replace(/[^0-9]/g, "");
// Detect reach and fiber type from product title
const reach = detectReach(rawTitle);
const fiber = detectFiber(rawTitle);
products.push({
partNumber,
name,
url: `${BASE}${cat.path}#${partNumber}`,
name: rawTitle,
url: productUrl,
price,
currency: price !== undefined ? "USD" : undefined,
formFactor: cat.formFactor,
speed: cat.speed,
speedGbps: cat.speedGbps,
reachLabel: reach?.label,
reachMeters: reach?.meters,
fiberType: fiber,
fiberType: fiber || undefined,
});
}
// Dedupe by part number
const seen = new Set<string>();
return products.filter((p) => {
if (seen.has(p.partNumber)) return false;
seen.add(p.partNumber);
return true;
});
return products;
}
async function fetchPage(url: string): Promise<string> {
@ -165,21 +167,72 @@ async function fetchPage(url: string): Promise<string> {
return resp.text();
}
export async function scrape10Gtek(): Promise<void> {
console.log("=== 10Gtek Scraper Starting ===\n");
/**
* Fetch all pages of a category, stopping when no new part numbers appear.
* Magento loops back to page 1 for out-of-range page numbers, so we detect
* this by checking for duplicate part numbers from previous pages.
*/
async function fetchAllPages(cat: typeof CATEGORIES[number]): Promise<Product[]> {
const MAX_PAGES = 30;
const allProducts: Product[] = [];
const seenPartNumbers = new Set<string>();
const vendorId = await ensureVendor("10Gtek", "compatible", "https://www.10gtek.com", "https://www.10gtek.com");
for (let page = 1; page <= MAX_PAGES; page++) {
const url = page === 1 ? `${BASE}/${cat.slug}` : `${BASE}/${cat.slug}?p=${page}`;
let html: string;
try {
html = await fetchPage(url);
} catch (err) {
console.error(` Fetch error on p${page}: ${(err as Error).message}`);
break;
}
const pageProducts = parseProductsFromPage(html, cat);
// Stop if no products found (truly empty page)
if (pageProducts.length === 0) break;
// Count new (unseen) products — detects Magento catalog wrap-around
let newCount = 0;
for (const p of pageProducts) {
if (!seenPartNumbers.has(p.partNumber)) {
seenPartNumbers.add(p.partNumber);
allProducts.push(p);
newCount++;
}
}
console.log(` p${page}: ${pageProducts.length} parsed, ${newCount} new`);
// All products on this page already seen → we've looped back to start
if (newCount === 0) break;
await sleep(2000);
}
return allProducts;
}
export async function scrape10Gtek(): Promise<void> {
console.log("=== 10Gtek Scraper Starting (via sfpcables.com) ===\n");
const vendorId = await ensureVendor(
"10Gtek",
"compatible",
"https://www.10gtek.com",
"https://www.sfpcables.com"
);
let totalProducts = 0;
let priceUpdates = 0;
for (const cat of CATEGORIES) {
console.log(`\n--- ${cat.formFactor} (${cat.speed}) ---`);
console.log(`\n--- ${cat.formFactor} (${cat.speed}) [/${cat.slug}] ---`);
try {
const html = await fetchPage(BASE + cat.path);
const catProducts = parseProductList(html, cat);
console.log(` Found ${catProducts.length} products`);
const catProducts = await fetchAllPages(cat);
console.log(` Fetched: ${catProducts.length} unique products`);
for (const product of catProducts) {
try {
@ -201,7 +254,7 @@ export async function scrape10Gtek(): Promise<void> {
transceiverId: txId,
sourceVendorId: vendorId,
price: product.price,
currency: product.currency || "USD",
currency: product.currency ?? "USD",
stockLevel: "in_stock",
url: product.url,
contentHash: hash,
@ -211,14 +264,12 @@ export async function scrape10Gtek(): Promise<void> {
totalProducts++;
} catch (err) {
console.warn(` Error: ${(err as Error).message.slice(0, 80)}`);
console.warn(` Error [${product.partNumber}]: ${(err as Error).message.slice(0, 80)}`);
}
}
} catch (err) {
console.error(` Category failed: ${(err as Error).message}`);
}
await sleep(2000);
}
console.log(`\n=== 10Gtek Complete: ${totalProducts} products, ${priceUpdates} prices ===`);
@ -227,5 +278,9 @@ export async function scrape10Gtek(): Promise<void> {
if (require.main === module) {
scrape10Gtek()
.then(() => pool.end())
.catch((err) => { console.error("Fatal:", err); pool.end(); process.exit(1); });
.catch((err) => {
console.error("Fatal:", err);
pool.end();
process.exit(1);
});
}