Rene Fichtmueller 5393f73c17 feat: stock quality schema + QSFPTEK/NADDOD v2 scrapers with real-time stock counts
- Migration 028 (retroactive): document warehouse columns added to stock_observations
- Migration 037: composite indexes for DISTINCT ON (transceiver_id, source_vendor_id) queries
- Migration 038: add stock_confidence (1/2/3), price_currency, price_includes_tax,
  stock_vendor_ts to stock_observations + TRUNCATE test-run data

db.ts: upsertStockObservation now accepts stockConfidence, priceCurrency,
priceIncludesTax, stockVendorTs; delta detection includes quantity_available

fs-com.ts: passes stockConfidence=3 + priceCurrency=EUR + priceIncludesTax=false

qsfptek.ts v2: Phase 1 API listing + Phase 2 detail-page stock extraction
- Parses 'X in real-time stock, DATE' from product detail pages
- Writes stock_observations with confidence=2 + stockVendorTs
- Up to 500 detail pages/run at 2s rate limit

naddod.ts v2: complete rewrite from WooCommerce to Astro sitemap-based
- Discovers products via /sitemaps/products.xml (600+ products)
- URL format: /products/XXXXX.html
- Extracts 'In Stock: X' exact counts from SSR HTML
- Writes both price + stock observations (confidence 1 or 2)
2026-04-17 22:54:40 +02:00

347 lines
13 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* QSFPTEK Scraper — Chinese compatible transceiver vendor
*
* qsfptek.com — migrated to custom Java/Spring + Vue.js in 2025.
* Old OpenCart /c/*.html paths are gone (404).
* Products are served via HTML-fragment API: /mall/commodity/list
*
* API: GET /mall/commodity/list?categoryId=21&attributes=VALUE_ID&page=N&pageSize=30
* Returns HTML fragment with product cards.
*
* Phase 1: Collect product list + prices via API (plain HTTP, no JS needed)
* Phase 2: Fetch product detail pages to extract real-time stock count
* Format: "5507 in real-time stock, 17 Apr, 2026"
* Confidence: 2 (aggregated global count with vendor timestamp)
*
* Rate limited: 1 req/2sec.
*/
import * as cheerio from "cheerio";
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, upsertStockObservation } from "../utils/db";
import { contentHash } from "../utils/hash";
// Site origin. Relative hrefs found in listing fragments are prefixed with it,
// and the listing API URL is built from it.
const BASE = "https://www.qsfptek.com";
// Path of the HTML-fragment listing endpoint; appended to BASE by fetchProductList.
const LIST_API = "/mall/commodity/list";
const CATEGORY_ID = "21"; // Transceivers (top-level, all products)
// Request headers sent with every listing and detail-page fetch.
const HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
Accept: "text/html,application/xhtml+xml",
"Accept-Language": "en-US,en;q=0.9",
};
// Upper bound on listing pages requested per data-rate attribute.
const MAX_PAGES = 40;
// pageSize sent to the listing API; a page that yields fewer parsed products
// than this ends pagination for the current attribute.
const PAGE_SIZE = 30;
// Limit detail-page fetches per run to avoid overwhelming the server
// (~500 products × 2s = ~17min for a full refresh; subsequent runs are faster
// since unchanged stock counts are skipped by upsertStockObservation)
const MAX_DETAIL_PAGES = 500;
// Data rate attribute values (found in /mall/commodity/attribute?categoryId=21)
// pid = "2c9180837bbaf08f017bbdd1ebf7001e" (Data Rate attribute group)
//
// Each entry drives one listing crawl: attrId is sent as the `attributes`
// query parameter, and speed / speedGbps / formFactor are copied verbatim onto
// every product parsed from that crawl's pages.
const DATA_RATE_ATTRIBUTES: Array<{
attrId: string;
speed: string;
speedGbps: number;
formFactor: string;
}> = [
{ attrId: "2c9180837bbaf08f017bbde50ea30101", speed: "1G", speedGbps: 1, formFactor: "SFP" },
{ attrId: "2c9180837bbaf08f017bbde50ea30100", speed: "10G", speedGbps: 10, formFactor: "SFP+" },
{ attrId: "2c9180837bbaf08f017bbde50ea300ff", speed: "25G", speedGbps: 25, formFactor: "SFP28" },
{ attrId: "2c9180837bbaf08f017bbde50ea300fe", speed: "40G", speedGbps: 40, formFactor: "QSFP+" },
{ attrId: "2c9180837bbaf08f017bbde50ea300fd", speed: "100G", speedGbps: 100, formFactor: "QSFP28" },
{ attrId: "2c98491f8f4b8e55018f94aa8c5d48ff", speed: "200G", speedGbps: 200, formFactor: "QSFP56" },
{ attrId: "2c9180837e2e7f64017e389caf0700c8", speed: "400G", speedGbps: 400, formFactor: "QSFP-DD" },
{ attrId: "2c98491f8e363cbf018e3faa5b6d2f8b", speed: "800G", speedGbps: 800, formFactor: "OSFP" },
];
/** One product card parsed from a listing fragment by parseProductFragment. */
interface Product {
// Leading part of the card title (text before the first split keyword), capped at 80 chars.
partNumber: string;
// Card title with whitespace collapsed.
name: string;
// Absolute product URL; also used as the de-duplication key across listing pages.
url: string;
// Price parsed from a "US$ <amount>" string on the card; absent when not found or out of range.
price?: number;
// formFactor / speed / speedGbps come from the data-rate attribute being crawled, not from the card.
formFactor: string;
speed: string;
speedGbps: number;
// Reach inferred from the title by detectReach (label such as "10km" plus the value in metres).
reachLabel?: string;
reachMeters?: number;
// "SMF", "MMF", "Copper" or "" as returned by detectFiber on the title.
fiberType?: string;
// Digits preceding "nm" in the title, or "" when none (see detectWavelength).
wavelength?: string;
}
/** Stock figure extracted from a product detail page by parseStockDetail. */
interface StockDetail {
// Unit count parsed from the "<n> in (real-time) stock" text, thousands separators removed.
qty: number;
// Date printed next to the stock count, or null when the page shows no parseable date.
vendorTs: Date | null;
}
/**
 * Pause for a fixed delay.
 *
 * @param ms Delay in milliseconds.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Infer the link reach from free-form product text.
 *
 * Rules are evaluated top to bottom and the first hit wins: explicit
 * distances come first (longest first), then the reach-code suffixes
 * (LR, ER, ZR, SR, DR, FR), which are matched case-sensitively.
 *
 * @param text Product title or description.
 * @returns The reach label and its value in metres, or undefined when no rule matches.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const rules: ReadonlyArray<{ re: RegExp; label: string; meters: number }> = [
    // Explicit distances
    { re: /\b120\s*km\b/i, label: "120km", meters: 120000 },
    { re: /\b80\s*km\b/i, label: "80km", meters: 80000 },
    { re: /\b40\s*km\b/i, label: "40km", meters: 40000 },
    { re: /\b20\s*km\b/i, label: "20km", meters: 20000 },
    { re: /\b10\s*km\b/i, label: "10km", meters: 10000 },
    { re: /\b2\s*km\b/i, label: "2km", meters: 2000 },
    { re: /\b550\s*m\b/i, label: "550m", meters: 550 },
    { re: /\b300\s*m\b/i, label: "300m", meters: 300 },
    { re: /\b100\s*m\b/i, label: "100m", meters: 100 },
    // Reach-code suffixes
    { re: /\bLR4\b/, label: "10km", meters: 10000 },
    { re: /\bLR\b/, label: "10km", meters: 10000 },
    { re: /\bER4?\b/, label: "40km", meters: 40000 },
    { re: /\bZR4?\b/, label: "80km", meters: 80000 },
    { re: /\bSR4?\b/, label: "300m", meters: 300 },
    { re: /\bDR4?\b/, label: "500m", meters: 500 },
    { re: /\bFR4?\b/, label: "2km", meters: 2000 },
  ];

  const hit = rules.find((rule) => rule.re.test(text));
  if (hit === undefined) return undefined;
  return { label: hit.label, meters: hit.meters };
}
/**
 * Classify the transmission medium from free-form product text.
 *
 * Checks run in priority order — single-mode, then multi-mode, then copper —
 * and the first matching family wins. Short reach codes (LX/LR/ER/ZR and
 * SX/SR) only count when they are not embedded in a longer word; a code at the
 * very start or end of the text (e.g. "SFP-10G-SR") is recognised as well.
 *
 * @param text Product title or description (matched case-insensitively).
 * @returns "SMF", "MMF", "Copper", or "" when nothing matches.
 */
function detectFiber(text: string): string {
  // (?:^|[^a-z]) / (?:[^a-z]|$) accept a string edge as a delimiter, so a
  // trailing or leading reach code is not missed.
  const singleMode =
    /single.?mode|smf|(?:^|[^a-z])(?:lx|lr|er|zr)(?:[^a-z]|$)|bidi|cwdm|dwdm/i;
  const multiMode = /multi.?mode|mmf|(?:^|[^a-z])(?:sx|sr)(?:[^a-z]|$)/i;
  const copper = /copper|dac|twinax|rj.?45|base-t/i;

  if (singleMode.test(text)) return "SMF";
  if (multiMode.test(text)) return "MMF";
  if (copper.test(text)) return "Copper";
  return "";
}
/**
 * Extract the first wavelength mentioned in the text.
 *
 * @param text Product title or description.
 * @returns The 3–4 digits directly preceding "nm" (case-insensitive, optional
 *          whitespace in between), or "" when no such token exists.
 */
function detectWavelength(text: string): string {
  const found = /(\d{3,4})\s*nm/i.exec(text);
  if (found === null) {
    return "";
  }
  return found[1];
}
/**
 * Convert a vendor date such as "17 Apr, 2026" or "17 April 2026" to a Date.
 *
 * The string is parsed field by field instead of being handed to `new Date()`:
 * non-ISO date strings are implementation-defined there and are interpreted in
 * the host's local time zone, which makes the stored instant depend on where
 * the scraper runs. The result is midnight UTC of the printed calendar day.
 *
 * @returns The parsed date, or null for unknown month names and impossible
 *          days (e.g. "31 Feb").
 */
function parseVendorDate(raw: string): Date | null {
  const monthNames = [
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
  ];
  const parts = /^(\d{1,2})\s+([A-Za-z]{3,9}),?\s*(\d{4})$/.exec(raw.trim());
  if (parts === null) return null;

  const day = Number(parts[1]);
  const monthToken = parts[2].toLowerCase();
  const year = Number(parts[3]);
  // Accept abbreviations ("apr", "sept") as well as full names.
  const month = monthNames.findIndex((name) => name.startsWith(monthToken));
  if (month < 0) return null;

  const ts = new Date(Date.UTC(year, month, day));
  // Date.UTC silently rolls "31 Feb" over into March — treat that as unparseable.
  if (ts.getUTCMonth() !== month || ts.getUTCDate() !== day) return null;
  return ts;
}

/**
 * Parse QSFPTEK real-time stock text.
 * Format: "5507 in real-time stock, 17 Apr, 2026"
 *
 * Falls back to a bare "<number> in stock" / "<number> in real-time stock"
 * phrase when no date follows; in that case `vendorTs` is null. Counts may
 * contain thousands separators and may be a single digit.
 *
 * @param html Raw HTML of a product detail page.
 * @returns `{ qty, vendorTs }`, or null if no stock phrase is found.
 */
function parseStockDetail(html: string): StockDetail | null {
  const toQty = (raw: string): number | null => {
    const value = Number.parseInt(raw.replace(/,/g, ""), 10);
    return Number.isNaN(value) || value < 0 ? null : value;
  };

  // Match: "<number> in real-time stock, <date>"
  const dated = html.match(/(\d[\d,]*)\s+in\s+real-?time\s+stock[,\s]+(\d{1,2}\s+\w+,?\s*\d{4})/i);
  if (dated) {
    const qty = toQty(dated[1]);
    if (qty === null) return null;
    return { qty, vendorTs: parseVendorDate(dated[2]) };
  }

  // Also try: "<number> in stock" without timestamp. Same number pattern as
  // above so that single-digit counts ("5 in stock") are not dropped.
  const simple = html.match(/(\d[\d,]*)\s+in\s+(?:real-?time\s+)?stock\b/i);
  if (simple) {
    const qty = toQty(simple[1]);
    return qty === null ? null : { qty, vendorTs: null };
  }
  return null;
}
/**
 * Parse one HTML fragment returned by the listing API into product records.
 *
 * Every `li` and `div.kuang` element that links to a product page and carries
 * an `h3` title of at least 10 characters becomes one Product. Products are
 * de-duplicated by URL within the fragment. Speed and form factor are taken
 * from `attr` (the data-rate filter the fragment was requested with); reach,
 * fibre type and wavelength are inferred from the title.
 *
 * @param html Listing fragment HTML.
 * @param attr Data-rate attribute entry used for the request.
 * @returns Products found in the fragment, in document order.
 */
function parseProductFragment(html: string, attr: typeof DATA_RATE_ATTRIBUTES[number]): Product[] {
  const $ = cheerio.load(html);
  const products: Product[] = [];
  const seen = new Set<string>();

  // Product cards: li.Hot, li.New, li with class containing product
  $("li, div.kuang").each((_i, el) => {
    const $el = $(el);

    // Product URL. Hrefs containing '/en/product/' also contain '/product/',
    // so a single selector covers both URL shapes.
    const href = $el.find("a[href*='/product/']").first().attr("href");
    if (!href) return;
    const url = href.startsWith("http") ? href : BASE + href;
    if (seen.has(url)) return;

    // Product name from h3
    const name = $el.find("h3.tit, h3").first().text().trim().replace(/\s+/g, " ");
    if (!name || name.length < 10) return;

    // Mark the URL as seen only once the card is accepted; otherwise a
    // title-less element linking to the same product (menu entry, thumbnail
    // wrapper) would suppress the real card that follows it.
    seen.add(url);

    // Price: "US$ 33.90" format
    const priceText = $el.find("*").text();
    const priceMatch = priceText.match(/US\$\s*([\d,]+\.?\d{0,2})/);
    const price = priceMatch ? parseFloat(priceMatch[1].replace(/,/g, "")) : undefined;

    const reach = detectReach(name);

    // NOTE(review): "sfp"/"qsfp" are split keywords, so a title such as
    // "Cisco SFP-10G-SR Compatible ..." yields the part number "Cisco".
    // Confirm this is intended before relying on partNumber as an identifier.
    const partNumber = name.split(/\s+(?:compatible|for\s+[A-Z]|sfp|qsfp)/i)[0]?.trim().slice(0, 80) || name.slice(0, 60);

    products.push({
      partNumber,
      name,
      url,
      // Discard missing, NaN, non-positive and implausibly large prices
      price: price !== undefined && price > 0 && price < 100000 ? price : undefined,
      formFactor: attr.formFactor,
      speed: attr.speed,
      speedGbps: attr.speedGbps,
      reachLabel: reach?.label,
      reachMeters: reach?.meters,
      fiberType: detectFiber(name),
      wavelength: detectWavelength(name),
    });
  });

  return products;
}
/**
 * Download one page of the product listing for a data-rate attribute.
 *
 * @param attrId Attribute value id sent as the `attributes` query parameter.
 * @param page   1-based page number.
 * @returns The response body (an HTML fragment with product cards).
 * @throws Error when the response status is not OK; the request is aborted after 30 s.
 */
async function fetchProductList(attrId: string, page: number): Promise<string> {
  const query = [
    `categoryId=${CATEGORY_ID}`,
    `attributes=${attrId}`,
    `page=${page}`,
    `pageSize=${PAGE_SIZE}`,
  ].join("&");
  const endpoint = `${BASE}${LIST_API}?${query}`;

  const response = await fetch(endpoint, {
    headers: HEADERS,
    signal: AbortSignal.timeout(30000),
  });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${endpoint}`);
}
/**
 * Download a product detail page.
 *
 * URLs that do not already contain "/en/product/" get their first "/product/"
 * segment rewritten to "/en/product/" before the request is made.
 *
 * @param url Product URL as collected from the listing.
 * @returns The page HTML.
 * @throws Error when the response status is not OK; the request is aborted after 30 s.
 */
async function fetchDetailPage(url: string): Promise<string> {
  let target = url;
  if (!target.includes("/en/product/")) {
    target = target.replace("/product/", "/en/product/");
  }

  const response = await fetch(target, {
    headers: HEADERS,
    signal: AbortSignal.timeout(30000),
  });
  if (!response.ok) {
    throw new Error(`HTTP ${response.status} for ${target}`);
  }
  return response.text();
}
/** Length-capped message for log lines; tolerates thrown values that are not Error instances. */
function describeError(err: unknown, maxLength: number): string {
  const message = err instanceof Error ? err.message : String(err);
  return message.slice(0, maxLength);
}

/**
 * Run one full QSFPTEK scrape.
 *
 * Phase 1 walks the listing API once per data-rate attribute (up to MAX_PAGES
 * pages each, 2 s between requests) and collects products keyed by URL.
 * Phase 2 stores every collected product and its listing price, and fetches
 * the detail page for real-time stock for at most MAX_DETAIL_PAGES products
 * per run. Products beyond that budget still get their price recorded; only
 * the detail-page fetch is skipped.
 *
 * Failures are logged and skipped rather than aborting the run: a failed
 * listing page ends that attribute's pagination, a failed detail fetch keeps
 * the price data, and a failed product write is counted in the error total.
 * A failure in ensureVendor is not caught and rejects the returned promise.
 */
export async function scrapeQsfptek(): Promise<void> {
  console.log("=== QSFPTEK Scraper v2 Starting (API + stock detail mode) ===\n");

  // NOTE(review): the last argument is one of the old "/c/*.html" paths that
  // the file header describes as gone (404) — confirm what ensureVendor uses it for.
  const vendorId = await ensureVendor(
    "QSFPTEK",
    "compatible",
    "https://www.qsfptek.com",
    "https://www.qsfptek.com/c/fiber-optic-transceiver.html",
  );

  // ── Phase 1: Collect all products via API listing ─────────────────────────
  console.log("[Phase 1] Collecting product catalog...");
  const allProducts = new Map<string, Product>(); // url → product

  for (const attr of DATA_RATE_ATTRIBUTES) {
    console.log(`\n ${attr.formFactor} (${attr.speed})`);
    for (let page = 1; page <= MAX_PAGES; page++) {
      try {
        const html = await fetchProductList(attr.attrId, page);
        const pageProds = parseProductFragment(html, attr);
        if (pageProds.length === 0) {
          if (page === 1) console.log(" No products on page 1 — skipping");
          else console.log(` Page ${page}: empty, stopping`);
          break;
        }

        let newCount = 0;
        for (const p of pageProds) {
          // First data-rate attribute to list a URL wins
          if (!allProducts.has(p.url)) {
            allProducts.set(p.url, p);
            newCount++;
          }
        }
        console.log(` Page ${page}: ${pageProds.length} results, ${newCount} new (${allProducts.size} total)`);

        // A short page is the last page
        if (pageProds.length < PAGE_SIZE) break;
        if (page < MAX_PAGES) await sleep(2000);
      } catch (err) {
        console.warn(` Page ${page} failed: ${describeError(err, 80)}`);
        break;
      }
    }
    await sleep(2000);
  }
  console.log(`\n[Phase 1] Complete — ${allProducts.size} unique products collected`);

  // ── Phase 2: Write to DB + fetch detail pages for stock counts ─────────────
  console.log("\n[Phase 2] Writing prices + fetching real-time stock counts...");
  let totalProducts = 0;
  let priceUpdates = 0;
  let stockWritten = 0;
  let stockSkipped = 0;
  let detailFetched = 0;
  let detailAttempts = 0;
  let errors = 0;

  // Every collected product gets its price written; MAX_DETAIL_PAGES limits
  // only the number of detail-page requests, not the catalogue itself.
  const products = [...allProducts.values()];
  if (products.length > MAX_DETAIL_PAGES) {
    console.log(` Detail pages limited to ${MAX_DETAIL_PAGES} of ${products.length} products this run`);
  }

  for (const product of products) {
    try {
      const txId = await findOrCreateScrapedTransceiver({
        partNumber: product.partNumber,
        vendorId,
        formFactor: product.formFactor,
        speedGbps: product.speedGbps,
        speed: product.speed,
        reachMeters: product.reachMeters,
        reachLabel: product.reachLabel,
        fiberType: product.fiberType,
        wavelengths: product.wavelength,
        category: "DataCenter",
      });

      // Price observation from listing page
      if (product.price && product.price > 0) {
        const hash = contentHash({ price: product.price, part: product.partNumber });
        const updated = await upsertPriceObservation({
          transceiverId: txId,
          sourceVendorId: vendorId,
          price: product.price,
          currency: "USD",
          stockLevel: "in_stock",
          url: product.url,
          contentHash: hash,
        });
        if (updated) priceUpdates++;
      }

      // Fetch detail page for real-time stock count (budgeted per run)
      if (detailAttempts < MAX_DETAIL_PAGES) {
        detailAttempts++;
        await sleep(2000);
        try {
          const detailHtml = await fetchDetailPage(product.url);
          detailFetched++;
          const stockInfo = parseStockDetail(detailHtml);
          if (stockInfo !== null) {
            const isNew = await upsertStockObservation({
              transceiverId: txId,
              sourceVendorId: vendorId,
              stockLevel: stockInfo.qty > 0 ? "in_stock" : "out_of_stock",
              quantityAvailable: stockInfo.qty > 0 ? stockInfo.qty : undefined,
              productUrl: product.url,
              // Quality metadata: QSFPTEK provides aggregated global count + timestamp
              stockConfidence: 2,
              priceCurrency: "USD",
              priceIncludesTax: false,
              stockVendorTs: stockInfo.vendorTs,
            });
            if (isNew) stockWritten++;
            else stockSkipped++;
          }
        } catch (detailErr) {
          // Detail page failures are non-fatal — we still have price data
          console.warn(` Stock fetch failed for ${product.partNumber}: ${describeError(detailErr, 60)}`);
        }
      }

      totalProducts++;
      if (totalProducts % 50 === 0) {
        console.log(` Progress: ${totalProducts}/${products.length} products | ${priceUpdates} prices | ${stockWritten} stock obs`);
      }
    } catch (err) {
      console.warn(` DB error for ${product.partNumber}: ${describeError(err, 80)}`);
      errors++;
    }
  }

  console.log("\n=== QSFPTEK Scraper v2 Complete ===");
  console.log(` Products processed: ${totalProducts}`);
  console.log(` Price observations: ${priceUpdates} new`);
  console.log(` Detail pages fetched: ${detailFetched}`);
  console.log(` Stock observations: ${stockWritten} new, ${stockSkipped} unchanged`);
  if (errors > 0) console.warn(` Errors: ${errors}`);
}
// CLI entry point: run the scraper when this file is executed directly,
// close the DB pool afterwards, and exit with status 1 on any failure.
if (require.main === module) {
  const onFatal = (err: unknown): void => {
    console.error("Fatal:", err);
    pool.end();
    process.exit(1);
  };

  scrapeQsfptek()
    .then(() => pool.end())
    .catch(onFatal);
}