Phase 0 - Foundation: - Restructure into npm workspace monorepo (packages/core, api, scraper) - PostgreSQL 17 + TimescaleDB schema (15 tables incl. hypertables) - Docker Compose for local dev (PostgreSQL on 5433 + Qdrant) - Express 5 API on port 3200 with 6 routes - Seed script to migrate 159 transceivers + 42 standards from npm package - Erik server setup script + PM2 ecosystem config Phase 1 - Scraper Engine: - Crawlee + Playwright framework with pg-boss scheduler - FS.com scraper (PlaywrightCrawler, anti-bot workaround) - Optcore.net scraper (WP REST API enumeration + PlaywrightCrawler) - Uses /wp-json/wp/v2/product to get 2000+ product URLs - Playwright renders individual product pages for price extraction - Cisco TMG Matrix scraper (compatibility data) - News RSS aggregator (optics.org, SPIE, Network World, Nature Photonics) - Keyword relevance scoring for transceiver/fiber topics - xml2js with malformed XML sanitization - SHA-256 content hashing for change detection (skip unchanged records) - pg-boss v10 with explicit queue creation before scheduling
298 lines
9.1 KiB
TypeScript
298 lines
9.1 KiB
TypeScript
/**
 * Optcore.net Scraper — Most transparent pricing in the industry.
 * Prices start at $5.50, fully public, no bot protection.
 *
 * Strategy: WP REST API to enumerate transceiver product URLs,
 * then PlaywrightCrawler to render each page and extract price.
 *
 * Optcore uses Flatsome WooCommerce with Cloudflare Rocket Loader
 * (JS lazy-loading) — static HTML has no product data.
 */
|
|
import { PlaywrightCrawler } from "crawlee";
|
|
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
|
|
import { contentHash, parsePrice, parseStockLevel } from "../utils/hash";
|
|
|
|
// Site origin for Optcore; prefixed onto the WP REST API path when product URLs are enumerated.
const BASE_URL = "https://www.optcore.net";
|
|
|
|
/**
 * WooCommerce category IDs, as listed by /wp-json/wp/v2/product_cat,
 * restricted to the optical transceiver categories that contain products.
 * The label in front of each ID is the category it refers to.
 * Order matters: categories are enumerated in this sequence.
 */
const TRANSCEIVER_CATEGORY_IDS = [
  /* 10G SFP+           */ 309,
  /* 1G SFP             */ 173,
  /* 100G QSFP28        */ 76,
  /* 25G SFP28          */ 79,
  /* 40G QSFP+          */ 73,
  /* 10G BiDi SFP+      */ 311,
  /* 10G CWDM SFP+      */ 313,
  /* 10G DWDM SFP+      */ 312,
  /* 10G XFP            */ 333,
  /* 10GBase-T SFP+     */ 1088,
  /* 8G/10G/16G SFP+    */ 59,
  /* BiDi SFP           */ 1102,
  /* 400G QSFP-DD       */ 4097,
  /* 100G CFP/CFP2/CFP4 */ 77,
  /* 200G QSFP56        */ 4101,
  /* 50G SFP56          */ 4092,
  /* 800G OSFP          */ 6441,
];
|
|
|
|
/**
 * One scraped Optcore product page, normalised for persistence.
 * Only pages whose parsed price is greater than zero are turned into this shape.
 */
interface OptcoreProduct {
  /** WP product slug from the REST API, or the last URL path segment when no API metadata matches. */
  partNumber: string;

  /** Page heading text, falling back to the REST API title and then to the URL slug. */
  name: string;

  /** Numeric price as returned by `parsePrice`. */
  price: number;

  /** Currency as returned by `parsePrice`. */
  currency: string;

  /** Result of `parseStockLevel`, or "in_stock" when the page shows no stock text. */
  stockLevel: string;

  /** Product page URL that was scraped. */
  url: string;

  /** Form factor detected from `name`, when recognisable. */
  formFactor?: string;

  /** Numeric line rate in Gbps detected from `name`, when recognisable. */
  speedGbps?: number;

  /** Line-rate label (e.g. "10G") detected from `name`, when recognisable. */
  speed?: string;

  /** Reach label (e.g. "10km") detected from `name`, when recognisable. */
  reachLabel?: string;
}
|
|
|
|
/**
 * Ordered form-factor rules, tested against lower-cased text.
 *
 * Order is significant: longer/more specific names come first because the
 * shorter ones are substrings of them ("qsfp28" contains "sfp28", "cfp4"
 * contains "cfp", ...). Separators are only relaxed for the "dd" and "plus"
 * suffixes; digit suffixes stay strict so that e.g. "CFP 2km" is not read as CFP2.
 */
const FORM_FACTOR_RULES: ReadonlyArray<readonly [RegExp, string]> = [
  [/qsfp[\s-]?dd/, "QSFP-DD"], // "qsfp-dd", "qsfp dd", "qsfpdd"
  [/qsfp56/, "QSFP56"],
  [/qsfp28/, "QSFP28"],
  [/qsfp\+|qsfp[\s-]?plus/, "QSFP+"], // "qsfp+", "qsfp plus", "qsfp-plus"
  [/sfp28/, "SFP28"],
  [/sfp56/, "SFP56"],
  [/sfp\+|sfp[\s-]?plus/, "SFP+"], // "sfp+", "sfp plus", "sfp-plus"
  [/cfp4/, "CFP4"],
  [/cfp2/, "CFP2"],
  [/cfp/, "CFP"],
  [/xfp/, "XFP"],
];

/**
 * Guess a transceiver form factor from free text (product title or URL slug).
 *
 * Matching is case-insensitive. Besides the exact vendor spellings it also
 * accepts "QSFPDD" / "QSFP DD" and the hyphenated "sfp-plus" / "qsfp-plus"
 * forms that appear in URL slugs.
 *
 * @param text Product name or slug to inspect.
 * @returns The canonical form-factor label, or `undefined` when none is recognised.
 */
function detectFormFactor(text: string): string | undefined {
  const lower = text.toLowerCase();
  const mentionsQsfp = lower.includes("qsfp");

  // "osfp" contains "sfp", so it must be ruled in before any SFP rule runs.
  if (lower.includes("osfp") && !mentionsQsfp) return "OSFP";

  for (const [pattern, label] of FORM_FACTOR_RULES) {
    if (pattern.test(lower)) return label;
  }

  // Plain SFP is the last resort; a bare "qsfp" mention must not be read as SFP.
  if (lower.includes("sfp") && !mentionsQsfp) return "SFP";

  return undefined;
}
|
|
|
|
/**
 * Ordered line-rate patterns: [matcher, label, Gbps]. Highest rate first, so a
 * multi-rate title such as "8G/10G/16G" resolves to the highest listed rate.
 *
 * Every matcher is anchored with `(?<![\d.])` so it only fires at the start of
 * a number ("1.25G" is not 25G, "110G" is not 10G), and rejects a following
 * "Hz" so DWDM channel spacing ("100GHz", "50 GHz") is not read as a data rate.
 */
const SPEED_PATTERNS: ReadonlyArray<readonly [RegExp, string, number]> = [
  [/(?<![\d.])800\s*g(?!hz)/i, "800G", 800],
  [/(?<![\d.])400\s*g(?!hz)/i, "400G", 400],
  [/(?<![\d.])200\s*g(?!hz)/i, "200G", 200],
  [/(?<![\d.])100\s*g(?!hz)/i, "100G", 100],
  [/(?<![\d.])50\s*g(?!hz)/i, "50G", 50],
  [/(?<![\d.])40\s*g(?!hz)/i, "40G", 40],
  [/(?<![\d.])25\s*g(?!hz)/i, "25G", 25],
  [/(?<![\d.])16\s*g(?!hz)/i, "16G", 16],
  [/(?<![\d.])10\s*g(?!hz)/i, "10G", 10],
  [/(?<![\d.])1000\s*base/i, "1G", 1],
  // 1.25 Gbps is the line rate of Gigabit Ethernet optics, so it is reported as 1G.
  [/(?<![\d.])1\.25\s*g(?!hz)/i, "1G", 1],
  [/(?<![\d.])1\s*g\b/i, "1G", 1],
];

/**
 * Detect the nominal data rate mentioned in a product title.
 *
 * @param text Product name to inspect.
 * @returns The rate label and its numeric Gbps value for the first (highest)
 *          matching pattern, or `undefined` when no known rate is present.
 */
function detectSpeed(text: string): { speed: string; speedGbps: number } | undefined {
  for (const [pattern, speed, speedGbps] of SPEED_PATTERNS) {
    if (pattern.test(text)) return { speed, speedGbps };
  }
  return undefined;
}
|
|
|
|
/**
 * Extract the first distance ("300m", "10km", "1.5km") mentioned in a product title.
 *
 * Decimal distances are kept whole; matching only the integer part would turn
 * "1.5km" into "5km". The unit must end at a word boundary, so wavelengths
 * ("850nm") and rates written as "Mbps" are not picked up.
 *
 * NOTE(review): a bare "100M" / "155M" (meaning Mbps) is still indistinguishable
 * from metres here, as it was before — confirm whether such titles occur.
 *
 * @param text Product name to inspect.
 * @returns The distance with a lower-cased unit, or `undefined` when none is found.
 */
function detectReach(text: string): string | undefined {
  const match = /(\d+(?:\.\d+)?)\s*(km|m)\b/i.exec(text);
  if (!match) return undefined;

  const [, distance, unit] = match;
  if (distance === undefined || unit === undefined) return undefined;

  return `${distance}${unit.toLowerCase()}`;
}
|
|
|
|
/**
 * Enumerate transceiver product pages through the WordPress REST API.
 *
 * For every ID in TRANSCEIVER_CATEGORY_IDS the `/wp-json/wp/v2/product`
 * endpoint is paged (100 items per page) until the page count announced in the
 * `X-WP-TotalPages` response header is reached. Products appearing in several
 * categories are returned once, keyed by slug. No upper bound is applied to the
 * number of URLs returned.
 *
 * Failures are best-effort: a non-OK response, an unexpected payload or a
 * network/timeout error is logged and ends enumeration of that category only.
 *
 * @returns One entry per unique product: page URL, rendered title, and the slug
 *          (used as part number).
 */
async function fetchTransceiverUrls(): Promise<Array<{ url: string; title: string; partNumber: string }>> {
  type WpProduct = { slug?: string; link?: string; title?: { rendered?: string } };

  const results: Array<{ url: string; title: string; partNumber: string }> = [];
  const seen = new Set<string>();

  for (const catId of TRANSCEIVER_CATEGORY_IDS) {
    let page = 1;
    let hasMore = true;

    while (hasMore) {
      const apiUrl = `${BASE_URL}/wp-json/wp/v2/product?product_cat=${catId}&per_page=100&page=${page}&_fields=slug,link,title`;
      try {
        const resp = await fetch(apiUrl, {
          headers: { "User-Agent": "Mozilla/5.0 (compatible; TIP-Scraper/1.0)" },
          signal: AbortSignal.timeout(15000),
        });

        if (!resp.ok) {
          console.warn(`  WP API: HTTP ${resp.status} for category ${catId} page ${page}; skipping rest of category`);
          break;
        }

        // A missing or unparsable header yields 1/NaN, both of which stop paging below.
        const totalPages = Number.parseInt(resp.headers.get("X-WP-TotalPages") ?? "1", 10);

        const payload: unknown = await resp.json();
        if (!Array.isArray(payload)) {
          console.warn(`  WP API: unexpected payload for category ${catId} page ${page}; skipping rest of category`);
          break;
        }

        for (const p of payload as WpProduct[]) {
          // Skip malformed entries instead of aborting the whole category.
          if (typeof p?.slug !== "string" || typeof p.link !== "string") continue;
          if (seen.has(p.slug)) continue;

          seen.add(p.slug);
          results.push({
            url: p.link,
            title: p.title?.rendered ?? "",
            partNumber: p.slug,
          });
        }

        hasMore = page < totalPages;
        page++;

        // Rate limit: 10 req/sec max
        await new Promise((r) => setTimeout(r, 100));
      } catch (err: unknown) {
        const reason = err instanceof Error ? err.message : String(err);
        console.warn(`  WP API: request failed for category ${catId} page ${page}: ${reason}`);
        hasMore = false;
      }
    }
  }

  return results;
}
|
|
|
|
/**
 * Scrape Optcore.net transceiver prices and store them as price observations.
 *
 * Steps:
 *  1. Ensure the "Optcore" vendor row exists.
 *  2. Enumerate product URLs via the WP REST API.
 *  3. Render every product page with PlaywrightCrawler and read the title,
 *     price and stock text from the DOM.
 *  4. De-duplicate by part number and upsert one price observation per product,
 *     tagged with a content hash of price + stock level.
 *
 * Pages without a price element or with a non-positive parsed price are
 * counted as scraped but produce no record. A database error for one product
 * is logged and counted; it does not abort the run.
 */
export async function scrapeOptcore(): Promise<void> {
  console.log("=== Optcore.net Scraper Starting ===\n");

  const vendorId = await ensureVendor(
    "Optcore",
    "compatible",
    "https://www.optcore.net",
    "https://www.optcore.net/product-category/optical-transceiver/"
  );
  console.log(`Vendor ID: ${vendorId}`);

  // Step 1: Enumerate transceiver product URLs via WP REST API
  console.log("Fetching product URLs via WP REST API...");
  const productMeta = await fetchTransceiverUrls();
  console.log(`Found ${productMeta.length} transceiver product URLs`);

  // Nothing to render: avoid launching a browser for an empty run.
  if (productMeta.length === 0) {
    console.warn("No product URLs discovered; nothing to scrape.");
    console.log("=== Optcore.net Scraper Complete ===\n");
    return;
  }

  // Build a map for quick metadata lookup
  const metaByUrl = new Map(productMeta.map((p) => [p.url, p]));

  const products: OptcoreProduct[] = [];
  let pagesScraped = 0;

  // Step 2: Render each product page with Playwright to extract price
  const crawler = new PlaywrightCrawler({
    maxConcurrency: 3,
    maxRequestsPerMinute: 30,
    requestHandlerTimeoutSecs: 30,
    headless: true,
    launchContext: {
      launchOptions: {
        args: ["--disable-blink-features=AutomationControlled", "--no-sandbox"],
      },
    },

    async requestHandler({ page, request, log }) {
      const url = request.url;
      log.info(`Scraping: ${url}`);

      // Wait for WooCommerce price element to appear
      try {
        await page.waitForSelector(".woocommerce-Price-amount, .price .amount, [class*=\"price\"]", {
          timeout: 8000,
        });
      } catch {
        // Price element not found — might be out of stock or JS failed
        log.warning(`No price element found: ${url}`);
        pagesScraped++;
        return;
      }

      // Runs in the browser context: must not reference anything from this module.
      const data = await page.evaluate(() => {
        // Product title
        const title =
          document.querySelector("h1.product_title, h1.entry-title, h1")?.textContent?.trim() || "";

        // Price — WooCommerce renders:
        //   <span class="price"><bdi><span class="woocommerce-Price-currencySymbol">$</span>5.50</bdi></span>
        // and, for items on sale, <del>regular</del> <ins>sale</ins> inside the same .price.
        //
        // querySelector() with a comma-separated list returns the first match in
        // DOCUMENT order, not selector order, so listing the "ins" selector first
        // does not prefer it. Take the first amount on the page, then switch to the
        // <ins> amount of the same .price container when one exists.
        const firstAmount = document.querySelector(".woocommerce-Price-amount");
        const saleAmount = firstAmount?.closest(".price")?.querySelector("ins .woocommerce-Price-amount");
        const priceEl = saleAmount ?? firstAmount;
        const priceText = priceEl?.textContent?.trim() || "";

        // Stock
        const stockEl = document.querySelector(".stock, .availability, [class*=\"stock\"]");
        const stockText = stockEl?.textContent?.trim() || "";

        return { title, priceText, stockText };
      });

      // Fall back from page title to API title to URL slug.
      const meta = metaByUrl.get(url);
      const slugFromUrl = url.split("/").filter(Boolean).pop() || "";
      const name = data.title || meta?.title || slugFromUrl;
      const partNumber = meta?.partNumber || slugFromUrl;

      const { price, currency } = parsePrice(data.priceText);
      if (price > 0) {
        const speedInfo = detectSpeed(name);
        products.push({
          partNumber,
          name,
          price,
          currency,
          // No stock text on the page is treated as available.
          stockLevel: data.stockText ? parseStockLevel(data.stockText) : "in_stock",
          url,
          formFactor: detectFormFactor(name),
          speedGbps: speedInfo?.speedGbps,
          speed: speedInfo?.speed,
          reachLabel: detectReach(name),
        });
      }

      pagesScraped++;
    },
  });

  const urls = productMeta.map((p) => p.url);
  await crawler.run(urls);

  console.log(`\nPages scraped: ${pagesScraped}`);
  console.log(`Products with price: ${products.length}`);

  // Deduplicate: the first record seen for a part number wins.
  const unique = new Map<string, OptcoreProduct>();
  for (const p of products) {
    if (!unique.has(p.partNumber)) unique.set(p.partNumber, p);
  }

  // Write to DB
  let written = 0;
  let skipped = 0;
  let failed = 0;

  for (const p of unique.values()) {
    try {
      const transceiverId = await findOrCreateScrapedTransceiver({
        partNumber: p.partNumber,
        vendorId,
        formFactor: p.formFactor,
        speedGbps: p.speedGbps,
        speed: p.speed,
        reachLabel: p.reachLabel,
        category: "DataCenter",
      });

      // Hash covers only price and stock, the two fields whose change matters.
      const hash = contentHash({ price: p.price, stock: p.stockLevel });
      const isNew = await upsertPriceObservation({
        transceiverId,
        sourceVendorId: vendorId,
        price: p.price,
        currency: p.currency,
        stockLevel: p.stockLevel,
        url: p.url,
        contentHash: hash,
      });

      if (isNew) written++;
      else skipped++;
    } catch (err: unknown) {
      failed++;
      const reason = err instanceof Error ? err.message : String(err);
      console.error(`  Error: ${p.partNumber}:`, reason);
    }
  }

  console.log(`\nDatabase: ${written} new, ${skipped} unchanged, ${failed} failed (${unique.size} unique)`);
  console.log("=== Optcore.net Scraper Complete ===\n");
}
|
|
|
|
// CLI entry point: run the scraper when this file is executed directly.
// The pool is always closed and awaited exactly once before the process is
// allowed to exit, so open connections are shut down cleanly on failure too.
if (require.main === module) {
  void (async () => {
    let exitCode = 0;

    try {
      await scrapeOptcore();
    } catch (err: unknown) {
      console.error("Fatal:", err);
      exitCode = 1;
    }

    try {
      await pool.end();
    } catch (err: unknown) {
      console.error("Fatal:", err);
      exitCode = 1;
    }

    // On success the process is left to exit on its own, as before.
    if (exitCode !== 0) process.exit(exitCode);
  })();
}
|