fix: 10Gtek scraper now fetches prices from sfpcables.com
10gtek.com main site only exposes technical spec tables with no prices. sfpcables.com is 10Gtek's own retail store and has both Model numbers and USD prices in standard Magento product listings.

Changes:
- Switch scraping target from www.10gtek.com to sfpcables.com
- Parse Model: <part> + US$X.XX per product block (Magento structure)
- XFP fallback: extract part number from title after '|' separator
- Add fetchAllPages() with Magento loop-detection via seen-part dedup
- Remove QSFP-DD category (not available on sfpcables.com)
- Drop XFP-less categories from old 10gtek.com spec-table parser

Verified: 10/10 SFP prices, 10/10 SFP+ prices, 4/4 XFP prices on live site.
This commit is contained in:
parent
582965ecb5
commit
eed599cc2c
@ -1,29 +1,31 @@
|
|||||||
/**
|
/**
|
||||||
* 10Gtek.com Scraper — Chinese OEM Transceiver Vendor
|
* 10Gtek.com Scraper — Chinese OEM Transceiver Vendor
|
||||||
*
|
*
|
||||||
* 10gtek.com is a direct competitor to FS.com at lower price points.
|
* 10Gtek's main site (www.10gtek.com) only shows technical spec tables, no prices.
|
||||||
* Uses plain fetch (server-rendered HTML).
|
* Prices are available on their retail store: sfpcables.com (same company/brand).
|
||||||
* Rate limited: 1 req/2sec.
|
* This scraper targets sfpcables.com which has both part numbers and USD prices.
|
||||||
*
|
*
|
||||||
* Categories: SFP, SFP+, SFP28, QSFP+, QSFP28, QSFP-DD, OSFP
|
* Strategy: Paginate each category on sfpcables.com, extract Model + price per product.
|
||||||
|
* Rate limited: 1 req/2sec between pages.
|
||||||
|
*
|
||||||
|
* Categories: SFP, SFP+, SFP28, QSFP+, QSFP28, XFP
|
||||||
*/
|
*/
|
||||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
||||||
import { contentHash, parsePrice } from "../utils/hash";
|
import { contentHash, parsePrice } from "../utils/hash";
|
||||||
|
|
||||||
const BASE = "https://www.10gtek.com";
|
const BASE = "https://www.sfpcables.com";
|
||||||
const HEADERS = {
|
const HEADERS = {
|
||||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||||
Accept: "text/html,application/xhtml+xml",
|
Accept: "text/html,application/xhtml+xml",
|
||||||
};
|
};
|
||||||
|
|
||||||
const CATEGORIES = [
|
const CATEGORIES = [
|
||||||
{ path: "/sfp", formFactor: "SFP", speed: "1G", speedGbps: 1 },
|
{ slug: "sfp-1-25g-series", formFactor: "SFP", speed: "1G", speedGbps: 1 },
|
||||||
{ path: "/10g-sfp+", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
|
{ slug: "sfp-transceivers", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
|
||||||
{ path: "/sfp28", formFactor: "SFP28", speed: "25G", speedGbps: 25 },
|
{ slug: "sfp28-transceivers", formFactor: "SFP28", speed: "25G", speedGbps: 25 },
|
||||||
{ path: "/qsfp", formFactor: "QSFP+", speed: "40G", speedGbps: 40 },
|
{ slug: "qsfp-transceivers", formFactor: "QSFP+", speed: "40G", speedGbps: 40 },
|
||||||
{ path: "/qsfp28", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
|
{ slug: "100g-qsfp28-transceivers", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
|
||||||
{ path: "/qsfpdd", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
|
{ slug: "xfp-transceivers", formFactor: "XFP", speed: "10G", speedGbps: 10 },
|
||||||
{ path: "/xfp", formFactor: "XFP", speed: "10G", speedGbps: 10 },
|
|
||||||
];
|
];
|
||||||
|
|
||||||
interface Product {
|
interface Product {
|
||||||
@ -69,94 +71,94 @@ function detectReach(text: string): { label: string; meters: number } | undefine
|
|||||||
}
|
}
|
||||||
|
|
||||||
function detectFiber(text: string): string {
|
function detectFiber(text: string): string {
|
||||||
if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text)) return "SMF";
|
if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text))
|
||||||
|
return "SMF";
|
||||||
if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(text)) return "MMF";
|
if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(text)) return "MMF";
|
||||||
|
if (/copper|rj.?45/i.test(text)) return "Copper";
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Strip HTML tags and decode common entities */
|
/** Strip HTML tags and decode common entities */
|
||||||
function stripHtml(s: string): string {
|
function stripHtml(s: string): string {
|
||||||
return s.replace(/<[^>]+>/g, "").replace(/&amp;/g, "&").replace(/&lt;/g, "<")
|
return s
|
||||||
.replace(/&gt;/g, ">").replace(/&nbsp;/g, " ").replace(/&deg;/g, "°")
|
.replace(/<[^>]+>/g, "")
|
||||||
.replace(/&#\d+;/g, "").trim();
|
.replace(/&amp;/g, "&")
|
||||||
|
.replace(/&lt;/g, "<")
|
||||||
|
.replace(/&gt;/g, ">")
|
||||||
|
.replace(/&nbsp;/g, " ")
|
||||||
|
.replace(/&#\d+;/g, "")
|
||||||
|
.trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
function parseDistance(text: string): { label: string; meters: number } | undefined {
|
/**
|
||||||
const km = text.match(/(\d+)\s*km/i);
|
* Parse product listings from a sfpcables.com category page.
|
||||||
if (km) return { label: `${km[1]}km`, meters: parseInt(km[1]) * 1000 };
|
*
|
||||||
const m = text.match(/(\d+)\s*m\b/i);
|
* HTML structure per product (Magento):
|
||||||
if (m) return { label: `${m[1]}m`, meters: parseInt(m[1]) };
|
* <div class="product-shop">
|
||||||
return undefined;
|
* <h2 class="product-name"><a href="URL" title="NAME">NAME</a></h2>
|
||||||
}
|
* <div>Model: PART_NUMBER</div> ← most categories
|
||||||
|
* <span class="price">US$X.XX</span> ← appears twice; first = listing price
|
||||||
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
|
* </div>
|
||||||
|
*
|
||||||
|
* XFP exception: no Model: div; part number is in title after "|"
|
||||||
|
* e.g. "XFP Transceiver 10GBase-SR 850nm, 300M | XFP-10G-MM-SR"
|
||||||
|
*/
|
||||||
|
function parseProductsFromPage(html: string, cat: typeof CATEGORIES[number]): Product[] {
|
||||||
const products: Product[] = [];
|
const products: Product[] = [];
|
||||||
|
|
||||||
// 10Gtek uses HTML tables with columns:
|
// Each product block starts at product-shop, ends before next product-shop or toolbar-bottom
|
||||||
// Part No. | Spec | Data Rate | Wavelength | Fiber Type | Distance | Optical Comp. | Tx Power | E.R | Rx Sens. | Temp.
|
const blockRegex =
|
||||||
// Extract all <tr> rows and parse cells
|
/<div class="product-shop">([\s\S]*?)(?=<div class="product-shop"|<div class="toolbar-bottom">)/g;
|
||||||
const rowRegex = /<tr[^>]*>([\s\S]*?)<\/tr>/gi;
|
let blockMatch: RegExpExecArray | null;
|
||||||
let rowMatch;
|
|
||||||
while ((rowMatch = rowRegex.exec(html)) !== null) {
|
while ((blockMatch = blockRegex.exec(html)) !== null) {
|
||||||
const rowHtml = rowMatch[1];
|
const block = blockMatch[1];
|
||||||
// Extract all <td> cell contents
|
|
||||||
const cellRegex = /<td[^>]*>([\s\S]*?)<\/td>/gi;
|
// Extract product URL and title from <h2 class="product-name"><a href="..." title="...">
|
||||||
const cells: string[] = [];
|
const nameMatch = block.match(/<h2 class="product-name">\s*<a[^>]+href="([^"]+)"[^>]+title="([^"]+)"/);
|
||||||
let cellMatch;
|
if (!nameMatch) continue;
|
||||||
while ((cellMatch = cellRegex.exec(rowHtml)) !== null) {
|
const productUrl = nameMatch[1].trim();
|
||||||
cells.push(stripHtml(cellMatch[1]));
|
const rawTitle = stripHtml(nameMatch[2]);
|
||||||
|
|
||||||
|
// Extract part number from <div>Model: PART</div>
|
||||||
|
const modelMatch = block.match(/<div>Model:\s*([^<]+)<\/div>/i);
|
||||||
|
let partNumber = modelMatch ? modelMatch[1].trim() : "";
|
||||||
|
|
||||||
|
// XFP fallback: parse title after "|"
|
||||||
|
if (!partNumber && rawTitle.includes("|")) {
|
||||||
|
const afterPipe = rawTitle.split("|").pop()?.trim() ?? "";
|
||||||
|
if (afterPipe.length >= 3 && /^[A-Z0-9]/i.test(afterPipe)) {
|
||||||
|
partNumber = afterPipe.trim();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Need at least 6 columns, first cell must look like a part number (starts with A or contains letters+digits)
|
// Skip products without a usable part number
|
||||||
if (cells.length < 6) continue;
|
|
||||||
const partNumber = cells[0];
|
|
||||||
if (!partNumber || partNumber.length < 3) continue;
|
if (!partNumber || partNumber.length < 3) continue;
|
||||||
// Skip header rows
|
|
||||||
if (/^Part\s*No/i.test(partNumber) || /^Spec/i.test(partNumber)) continue;
|
|
||||||
// Part numbers typically start with A (ASF, AXS, AXQ, AQS, etc.) or contain alphanumeric
|
|
||||||
if (!/^[A-Z][A-Z0-9]/i.test(partNumber)) continue;
|
|
||||||
|
|
||||||
const spec = cells[1] || "";
|
// Extract price — first occurrence of US$X.XX in the block
|
||||||
const dataRate = cells[2] || "";
|
const priceMatch = block.match(/<span class="price">US\$([0-9]+(?:\.[0-9]{1,2})?)<\/span>/);
|
||||||
const wavelength = cells.length >= 4 ? cells[3] : "";
|
const price = priceMatch ? parseFloat(priceMatch[1]) : undefined;
|
||||||
const fiberType = cells.length >= 5 ? cells[4] : "";
|
|
||||||
const distance = cells.length >= 6 ? cells[5] : "";
|
|
||||||
const txPower = cells.length >= 8 ? cells[7] : "";
|
|
||||||
|
|
||||||
// Build descriptive name
|
// Detect reach and fiber type from product title
|
||||||
const name = `${partNumber} ${spec} ${dataRate}`.trim();
|
const reach = detectReach(rawTitle);
|
||||||
const reach = parseDistance(distance) || detectReach(spec + " " + distance);
|
const fiber = detectFiber(rawTitle);
|
||||||
|
|
||||||
// Determine fiber type from table cell or spec
|
|
||||||
let fiber = "";
|
|
||||||
if (/SMF|single/i.test(fiberType)) fiber = "SMF";
|
|
||||||
else if (/MMF|multi/i.test(fiberType)) fiber = "MMF";
|
|
||||||
else if (/CAT|RJ|copper/i.test(fiberType)) fiber = "Copper";
|
|
||||||
else fiber = detectFiber(spec);
|
|
||||||
|
|
||||||
// Extract wavelength
|
|
||||||
const wl = wavelength.replace(/[^0-9]/g, "");
|
|
||||||
|
|
||||||
products.push({
|
products.push({
|
||||||
partNumber,
|
partNumber,
|
||||||
name,
|
name: rawTitle,
|
||||||
url: `${BASE}${cat.path}#${partNumber}`,
|
url: productUrl,
|
||||||
|
price,
|
||||||
|
currency: price !== undefined ? "USD" : undefined,
|
||||||
formFactor: cat.formFactor,
|
formFactor: cat.formFactor,
|
||||||
speed: cat.speed,
|
speed: cat.speed,
|
||||||
speedGbps: cat.speedGbps,
|
speedGbps: cat.speedGbps,
|
||||||
reachLabel: reach?.label,
|
reachLabel: reach?.label,
|
||||||
reachMeters: reach?.meters,
|
reachMeters: reach?.meters,
|
||||||
fiberType: fiber,
|
fiberType: fiber || undefined,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Dedupe by part number
|
return products;
|
||||||
const seen = new Set<string>();
|
|
||||||
return products.filter((p) => {
|
|
||||||
if (seen.has(p.partNumber)) return false;
|
|
||||||
seen.add(p.partNumber);
|
|
||||||
return true;
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchPage(url: string): Promise<string> {
|
async function fetchPage(url: string): Promise<string> {
|
||||||
@ -165,21 +167,72 @@ async function fetchPage(url: string): Promise<string> {
|
|||||||
return resp.text();
|
return resp.text();
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function scrape10Gtek(): Promise<void> {
|
/**
|
||||||
console.log("=== 10Gtek Scraper Starting ===\n");
|
* Fetch all pages of a category, stopping when no new part numbers appear.
|
||||||
|
* Magento loops back to page 1 for out-of-range page numbers, so we detect
|
||||||
|
* this by checking for duplicate part numbers from previous pages.
|
||||||
|
*/
|
||||||
|
async function fetchAllPages(cat: typeof CATEGORIES[number]): Promise<Product[]> {
|
||||||
|
const MAX_PAGES = 30;
|
||||||
|
const allProducts: Product[] = [];
|
||||||
|
const seenPartNumbers = new Set<string>();
|
||||||
|
|
||||||
const vendorId = await ensureVendor("10Gtek", "compatible", "https://www.10gtek.com", "https://www.10gtek.com");
|
for (let page = 1; page <= MAX_PAGES; page++) {
|
||||||
|
const url = page === 1 ? `${BASE}/${cat.slug}` : `${BASE}/${cat.slug}?p=${page}`;
|
||||||
|
|
||||||
|
let html: string;
|
||||||
|
try {
|
||||||
|
html = await fetchPage(url);
|
||||||
|
} catch (err) {
|
||||||
|
console.error(` Fetch error on p${page}: ${(err as Error).message}`);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
const pageProducts = parseProductsFromPage(html, cat);
|
||||||
|
|
||||||
|
// Stop if no products found (truly empty page)
|
||||||
|
if (pageProducts.length === 0) break;
|
||||||
|
|
||||||
|
// Count new (unseen) products — detects Magento catalog wrap-around
|
||||||
|
let newCount = 0;
|
||||||
|
for (const p of pageProducts) {
|
||||||
|
if (!seenPartNumbers.has(p.partNumber)) {
|
||||||
|
seenPartNumbers.add(p.partNumber);
|
||||||
|
allProducts.push(p);
|
||||||
|
newCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(` p${page}: ${pageProducts.length} parsed, ${newCount} new`);
|
||||||
|
|
||||||
|
// All products on this page already seen → we've looped back to start
|
||||||
|
if (newCount === 0) break;
|
||||||
|
|
||||||
|
await sleep(2000);
|
||||||
|
}
|
||||||
|
|
||||||
|
return allProducts;
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function scrape10Gtek(): Promise<void> {
|
||||||
|
console.log("=== 10Gtek Scraper Starting (via sfpcables.com) ===\n");
|
||||||
|
|
||||||
|
const vendorId = await ensureVendor(
|
||||||
|
"10Gtek",
|
||||||
|
"compatible",
|
||||||
|
"https://www.10gtek.com",
|
||||||
|
"https://www.sfpcables.com"
|
||||||
|
);
|
||||||
|
|
||||||
let totalProducts = 0;
|
let totalProducts = 0;
|
||||||
let priceUpdates = 0;
|
let priceUpdates = 0;
|
||||||
|
|
||||||
for (const cat of CATEGORIES) {
|
for (const cat of CATEGORIES) {
|
||||||
console.log(`\n--- ${cat.formFactor} (${cat.speed}) ---`);
|
console.log(`\n--- ${cat.formFactor} (${cat.speed}) [/${cat.slug}] ---`);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const html = await fetchPage(BASE + cat.path);
|
const catProducts = await fetchAllPages(cat);
|
||||||
const catProducts = parseProductList(html, cat);
|
console.log(` Fetched: ${catProducts.length} unique products`);
|
||||||
console.log(` Found ${catProducts.length} products`);
|
|
||||||
|
|
||||||
for (const product of catProducts) {
|
for (const product of catProducts) {
|
||||||
try {
|
try {
|
||||||
@ -201,7 +254,7 @@ export async function scrape10Gtek(): Promise<void> {
|
|||||||
transceiverId: txId,
|
transceiverId: txId,
|
||||||
sourceVendorId: vendorId,
|
sourceVendorId: vendorId,
|
||||||
price: product.price,
|
price: product.price,
|
||||||
currency: product.currency || "USD",
|
currency: product.currency ?? "USD",
|
||||||
stockLevel: "in_stock",
|
stockLevel: "in_stock",
|
||||||
url: product.url,
|
url: product.url,
|
||||||
contentHash: hash,
|
contentHash: hash,
|
||||||
@ -211,14 +264,12 @@ export async function scrape10Gtek(): Promise<void> {
|
|||||||
|
|
||||||
totalProducts++;
|
totalProducts++;
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.warn(` Error: ${(err as Error).message.slice(0, 80)}`);
|
console.warn(` Error [${product.partNumber}]: ${(err as Error).message.slice(0, 80)}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.error(` Category failed: ${(err as Error).message}`);
|
console.error(` Category failed: ${(err as Error).message}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
await sleep(2000);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(`\n=== 10Gtek Complete: ${totalProducts} products, ${priceUpdates} prices ===`);
|
console.log(`\n=== 10Gtek Complete: ${totalProducts} products, ${priceUpdates} prices ===`);
|
||||||
@ -227,5 +278,9 @@ export async function scrape10Gtek(): Promise<void> {
|
|||||||
if (require.main === module) {
|
if (require.main === module) {
|
||||||
scrape10Gtek()
|
scrape10Gtek()
|
||||||
.then(() => pool.end())
|
.then(() => pool.end())
|
||||||
.catch((err) => { console.error("Fatal:", err); pool.end(); process.exit(1); });
|
.catch((err) => {
|
||||||
|
console.error("Fatal:", err);
|
||||||
|
pool.end();
|
||||||
|
process.exit(1);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user