Rene Fichtmueller b43bdd3060 feat: TIP Phase 0+1 — monorepo, DB schema, API, scraper engine
Phase 0 - Foundation:
- Restructure into npm workspace monorepo (packages/core, api, scraper)
- PostgreSQL 17 + TimescaleDB schema (15 tables incl. hypertables)
- Docker Compose for local dev (PostgreSQL on 5433 + Qdrant)
- Express 5 API on port 3200 with 6 routes
- Seed script to migrate 159 transceivers + 42 standards from npm package
- Erik server setup script + PM2 ecosystem config

Phase 1 - Scraper Engine:
- Crawlee + Playwright framework with pg-boss scheduler
- FS.com scraper (PlaywrightCrawler, anti-bot workaround)
- Optcore.net scraper (WP REST API enumeration + PlaywrightCrawler)
  - Uses /wp-json/wp/v2/product to get 2000+ product URLs
  - Playwright renders individual product pages for price extraction
- Cisco TMG Matrix scraper (compatibility data)
- News RSS aggregator (optics.org, SPIE, Network World, Nature Photonics)
  - Keyword relevance scoring for transceiver/fiber topics
  - xml2js with malformed XML sanitization
- SHA-256 content hashing for change detection (skip unchanged records)
- pg-boss v10 with explicit queue creation before scheduling
2026-03-27 16:27:31 +13:00

298 lines
9.1 KiB
TypeScript

/**
* Optcore.net Scraper — Most transparent pricing in the industry.
* Prices start at $5.50, fully public, no bot protection.
*
* Strategy: WP REST API to enumerate transceiver product URLs,
* then PlaywrightCrawler to render each page and extract price.
*
* Optcore uses Flatsome WooCommerce with Cloudflare Rocket Loader
* (JS lazy-loading) — static HTML has no product data.
*/
import { PlaywrightCrawler } from "crawlee";
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
import { contentHash, parsePrice, parseStockLevel } from "../utils/hash";
/** Site origin; fetchTransceiverUrls prefixes it to the WP REST API path. */
const BASE_URL = "https://www.optcore.net";
/**
 * Transceiver category IDs from /wp-json/wp/v2/product_cat,
 * filtered to optical transceiver categories with products.
 * Each ID is passed as the `product_cat` query parameter when enumerating
 * products; the trailing comment on each entry names the category.
 */
const TRANSCEIVER_CATEGORY_IDS = [
309, // 10G SFP+
173, // 1G SFP
76, // 100G QSFP28
79, // 25G SFP28
73, // 40G QSFP+
311, // 10G BiDi SFP+
313, // 10G CWDM SFP+
312, // 10G DWDM SFP+
333, // 10G XFP
1088, // 10GBase-T SFP+
59, // 8G/10G/16G SFP+
1102, // BiDi SFP
4097, // 400G QSFP-DD
77, // 100G CFP/CFP2/CFP4
4101, // 200G QSFP56
4092, // 50G SFP56
6441, // 800G OSFP
];
/**
 * One priced product extracted from a rendered Optcore product page.
 * Instances are built in scrapeOptcore's request handler and later written
 * to the database, de-duplicated by partNumber.
 */
interface OptcoreProduct {
// Slug from the WP REST API metadata, or the last URL path segment when no metadata matches.
partNumber: string;
// Page heading text, falling back to the API title and then the last URL path segment.
name: string;
// Amount returned by parsePrice for the page's price text; only values > 0 are recorded.
price: number;
// Currency returned by parsePrice alongside the amount.
currency: string;
// parseStockLevel(stock text), or "in_stock" when the page shows no stock text.
stockLevel: string;
// Request URL of the scraped product page.
url: string;
// Result of detectFormFactor(name); undefined when no form factor is recognised.
formFactor?: string;
// Numeric speed from detectSpeed(name); undefined when no speed is recognised.
speedGbps?: number;
// Speed label from detectSpeed(name), e.g. "10G".
speed?: string;
// Result of detectReach(name), e.g. "10km".
reachLabel?: string;
}
/**
 * Infer the transceiver form factor from free-form product text.
 *
 * Matching is case-insensitive and substring based. More specific names are
 * tested before the shorter names they contain (e.g. "qsfp28" before
 * "sfp28"), and the bare "OSFP" / "SFP" labels are only returned when the
 * text does not mention "qsfp" at all.
 *
 * @param text Product name or description to inspect.
 * @returns The form factor label, or `undefined` when nothing matches.
 */
function detectFormFactor(text: string): string | undefined {
  const haystack = text.toLowerCase();
  const mentionsQsfp = haystack.includes("qsfp");

  // OSFP is only trusted when no QSFP variant is mentioned.
  if (!mentionsQsfp && haystack.includes("osfp")) return "OSFP";

  // Ordered from most to least specific; the first hit wins.
  const ordered: Array<[string[], string]> = [
    [["qsfp-dd"], "QSFP-DD"],
    [["qsfp56"], "QSFP56"],
    [["qsfp28"], "QSFP28"],
    [["qsfp+", "qsfp plus"], "QSFP+"],
    [["sfp28"], "SFP28"],
    [["sfp56"], "SFP56"],
    [["sfp+", "sfp plus"], "SFP+"],
    [["cfp4"], "CFP4"],
    [["cfp2"], "CFP2"],
    [["cfp"], "CFP"],
    [["xfp"], "XFP"],
  ];
  for (const [needles, label] of ordered) {
    if (needles.some((needle) => haystack.includes(needle))) return label;
  }

  // Plain SFP is the last resort, and never when a QSFP variant is mentioned.
  return !mentionsQsfp && haystack.includes("sfp") ? "SFP" : undefined;
}
/**
 * Infer the nominal data rate from free-form product text.
 *
 * Patterns are tried from the highest rate down and the first hit wins.
 * Every numeric pattern is guarded by a lookbehind so it cannot start in the
 * middle of a longer number: without it, "1.25G" (the usual line-rate label
 * on 1G SFP modules) matched the 25G pattern, and "4.25G" did the same.
 * "1.25G" is mapped to 1G explicitly.
 *
 * @param text Product name or description to inspect.
 * @returns The speed label and its value in Gbps, or `undefined` when no
 *          known rate is found.
 */
function detectSpeed(text: string): { speed: string; speedGbps: number } | undefined {
  const patterns: [RegExp, string, number][] = [
    [/(?<![\d.])800\s*g/i, "800G", 800],
    [/(?<![\d.])400\s*g/i, "400G", 400],
    [/(?<![\d.])200\s*g/i, "200G", 200],
    [/(?<![\d.])100\s*g/i, "100G", 100],
    [/(?<![\d.])50\s*g/i, "50G", 50],
    [/(?<![\d.])40\s*g/i, "40G", 40],
    [/(?<![\d.])25\s*g/i, "25G", 25],
    [/(?<![\d.])16\s*g/i, "16G", 16],
    [/(?<![\d.])10\s*g/i, "10G", 10],
    // 1.25 Gbaud is the line rate of Gigabit Ethernet optics.
    [/(?<![\d.])1\.25\s*g/i, "1G", 1],
    [/(?<![\d.])1000\s*base/i, "1G", 1],
    [/(?<![\d.])1\s*g\b/i, "1G", 1],
  ];
  for (const [re, speed, gbps] of patterns) {
    if (re.test(text)) return { speed, speedGbps: gbps };
  }
  return undefined;
}
/**
 * Extract the first distance-looking token ("300m", "10 km", "0.5km") from
 * product text and normalise it to `<number><unit>` with a lowercase unit.
 *
 * The numeric part may carry a decimal fraction; previously only the digits
 * after the decimal point were captured, so "0.5km" was reported as "5km".
 *
 * @param text Product name or description to inspect.
 * @returns The reach label, or `undefined` when no distance is present.
 */
function detectReach(text: string): string | undefined {
  const match = /(\d+(?:\.\d+)?)\s*(km|m)\b/i.exec(text);
  if (!match) return undefined;
  return `${match[1]}${match[2].toLowerCase()}`;
}
/**
 * Enumerate transceiver product pages via the WordPress REST API.
 *
 * Walks every ID in TRANSCEIVER_CATEGORY_IDS and pages through
 * `/wp-json/wp/v2/product` 100 items at a time, until the page count reported
 * by the `X-WP-TotalPages` response header is reached. A product listed in
 * several categories is returned once (de-duplicated by slug). No upper bound
 * is applied to the number of results.
 *
 * Enumeration is best-effort: a non-OK response, a non-array payload, or a
 * network/timeout error stops paging for the current category, is logged as
 * a warning, and enumeration continues with the next category.
 *
 * @returns One entry per unique product: its page URL, its rendered title,
 *          and its slug (used as the part number).
 */
async function fetchTransceiverUrls(): Promise<Array<{ url: string; title: string; partNumber: string }>> {
  const results: Array<{ url: string; title: string; partNumber: string }> = [];
  const seen = new Set<string>();
  for (const catId of TRANSCEIVER_CATEGORY_IDS) {
    let page = 1;
    let hasMore = true;
    while (hasMore) {
      const apiUrl = `${BASE_URL}/wp-json/wp/v2/product?product_cat=${catId}&per_page=100&page=${page}&_fields=slug,link,title`;
      try {
        const resp = await fetch(apiUrl, {
          headers: { "User-Agent": "Mozilla/5.0 (compatible; TIP-Scraper/1.0)" },
          signal: AbortSignal.timeout(15000),
        });
        if (!resp.ok) {
          console.warn(`Optcore API: category ${catId} page ${page} returned HTTP ${resp.status}; skipping rest of category`);
          break;
        }
        // A missing or unparsable header yields 1 / NaN, both of which end paging below.
        const totalPages = Number.parseInt(resp.headers.get("X-WP-TotalPages") || "1", 10);
        const payload: unknown = await resp.json();
        if (!Array.isArray(payload)) {
          console.warn(`Optcore API: category ${catId} page ${page} returned a non-array payload; skipping rest of category`);
          break;
        }
        const products = payload as Array<{ slug: string; link: string; title: { rendered: string } }>;
        for (const p of products) {
          if (!seen.has(p.slug)) {
            seen.add(p.slug);
            results.push({
              url: p.link,
              title: p.title.rendered,
              partNumber: p.slug,
            });
          }
        }
        hasMore = page < totalPages;
        page++;
        // Rate limit: 10 req/sec max
        await new Promise((r) => setTimeout(r, 100));
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.warn(`Optcore API: category ${catId} page ${page} failed: ${reason}`);
        hasMore = false;
      }
    }
  }
  return results;
}
/**
 * Run the full Optcore.net scrape and persist the observed prices.
 *
 * Steps:
 *  1. Resolve the "Optcore" vendor ID via ensureVendor.
 *  2. Collect product URLs (plus title/slug metadata) via fetchTransceiverUrls.
 *  3. Render every URL with PlaywrightCrawler and read the title, price text
 *     and stock text from the DOM. Pages where no price element appears
 *     within 8 seconds, or whose parsed price is not positive, are counted as
 *     scraped but not recorded.
 *  4. De-duplicate by part number (first occurrence wins) and write each
 *     product through findOrCreateScrapedTransceiver and
 *     upsertPriceObservation, tallying new versus unchanged observations.
 *
 * A database error for one product is logged and does not abort the run.
 */
export async function scrapeOptcore(): Promise<void> {
  console.log("=== Optcore.net Scraper Starting ===\n");
  const vendorId = await ensureVendor(
    "Optcore",
    "compatible",
    "https://www.optcore.net",
    "https://www.optcore.net/product-category/optical-transceiver/"
  );
  console.log(`Vendor ID: ${vendorId}`);

  // Step 1: Enumerate transceiver product URLs via WP REST API
  console.log("Fetching product URLs via WP REST API...");
  const productMeta = await fetchTransceiverUrls();
  console.log(`Found ${productMeta.length} transceiver product URLs`);

  // Build a map for quick metadata lookup
  const metaByUrl = new Map(productMeta.map((p) => [p.url, p]));
  const products: OptcoreProduct[] = [];
  let pagesScraped = 0;

  // Step 2: Render each product page with Playwright to extract price
  const crawler = new PlaywrightCrawler({
    maxConcurrency: 3,
    maxRequestsPerMinute: 30,
    requestHandlerTimeoutSecs: 30,
    headless: true,
    launchContext: {
      launchOptions: {
        args: ["--disable-blink-features=AutomationControlled", "--no-sandbox"],
      },
    },
    async requestHandler({ page, request, log }) {
      const url = request.url;
      log.info(`Scraping: ${url}`);

      // Wait for WooCommerce price element to appear
      try {
        await page.waitForSelector(".woocommerce-Price-amount, .price .amount, [class*=\"price\"]", {
          timeout: 8000,
        });
      } catch {
        // Price element not found — might be out of stock or JS failed
        log.warning(`No price element found: ${url}`);
        pagesScraped++;
        return;
      }

      // Runs inside the browser page, so it must not reference outer variables.
      const data = await page.evaluate(() => {
        // Product title
        const title =
          document.querySelector("h1.product_title, h1.entry-title, h1")?.textContent?.trim() || "";
        // Price — WooCommerce renders: <span class="price"><bdi><span class="woocommerce-Price-currencySymbol">$</span>5.50</bdi></span>
        // The `ins` selector comes first so a sale price wins over the struck-through one.
        const priceEl = document.querySelector(
          ".price ins .woocommerce-Price-amount, .price .woocommerce-Price-amount, .woocommerce-Price-amount"
        );
        const priceText = priceEl?.textContent?.trim() || "";
        // Stock
        const stockEl = document.querySelector(".stock, .availability, [class*=\"stock\"]");
        const stockText = stockEl?.textContent?.trim() || "";
        return { title, priceText, stockText };
      });

      const meta = metaByUrl.get(url);
      // `||` (not `??`) is deliberate: empty strings must fall through to the next source.
      const name = data.title || meta?.title || url.split("/").filter(Boolean).pop() || "";
      const partNumber = meta?.partNumber || url.split("/").filter(Boolean).pop() || "";
      const { price, currency } = parsePrice(data.priceText);

      if (price > 0) {
        const speedInfo = detectSpeed(name);
        products.push({
          partNumber,
          name,
          price,
          currency,
          // No stock text on the page is treated as available.
          stockLevel: data.stockText ? parseStockLevel(data.stockText) : "in_stock",
          url,
          formFactor: detectFormFactor(name),
          speedGbps: speedInfo?.speedGbps,
          speed: speedInfo?.speed,
          reachLabel: detectReach(name),
        });
      }
      pagesScraped++;
    },
  });

  const urls = productMeta.map((p) => p.url);
  await crawler.run(urls);
  console.log(`\nPages scraped: ${pagesScraped}`);
  console.log(`Products with price: ${products.length}`);

  // Deduplicate
  const unique = new Map<string, OptcoreProduct>();
  for (const p of products) {
    if (!unique.has(p.partNumber)) unique.set(p.partNumber, p);
  }

  // Write to DB
  let written = 0;
  let skipped = 0;
  for (const p of unique.values()) {
    try {
      const transceiverId = await findOrCreateScrapedTransceiver({
        partNumber: p.partNumber,
        vendorId,
        formFactor: p.formFactor,
        speedGbps: p.speedGbps,
        speed: p.speed,
        reachLabel: p.reachLabel,
        category: "DataCenter",
      });
      // Only price and stock feed the hash, so only a change in either yields a new hash.
      const hash = contentHash({ price: p.price, stock: p.stockLevel });
      const isNew = await upsertPriceObservation({
        transceiverId,
        sourceVendorId: vendorId,
        price: p.price,
        currency: p.currency,
        stockLevel: p.stockLevel,
        url: p.url,
        contentHash: hash,
      });
      if (isNew) written++;
      else skipped++;
    } catch (err) {
      // A thrown value is not guaranteed to be an Error; narrow before reading `.message`.
      const reason = err instanceof Error ? err.message : String(err);
      console.error(` Error: ${p.partNumber}:`, reason);
    }
  }
  console.log(`\nDatabase: ${written} new, ${skipped} unchanged (${unique.size} unique)`);
  console.log("=== Optcore.net Scraper Complete ===\n");
}
// CLI entry point: run the scraper only when this file is executed directly.
if (require.main === module) {
  // Log the failure, start closing the DB pool, and exit non-zero.
  const onFatal = (err: unknown): never => {
    console.error("Fatal:", err);
    pool.end();
    process.exit(1);
  };

  // On success the pool is closed so the process can exit on its own.
  scrapeOptcore()
    .then(() => pool.end())
    .catch(onFatal);
}