From ae411cb575d5446f40710003db5644404b897ade Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Fri, 27 Mar 2026 23:17:42 +1300 Subject: [PATCH] feat: add Flexoptix vendor scraper, 10Gtek pricing scraper, expand news feeds - Flexoptix vendor scraper: 285 supported switch vendors ingested from flexoptix.net/en/supported-vendors/ (our own data, no restrictions) - 10Gtek Playwright scraper: Chinese OEM competitor pricing (SFP+, SFP28, QSFP+, QSFP28, QSFP-DD categories) - News feeds expanded: added Lightwave, Fierce Telecom, Data Center Knowledge, SDxCentral, Cisco Blogs, Arista Blog (11 total sources) - Scheduler updated: 8 job queues with appropriate intervals - DB now: 297 vendors, 89 transceivers, 33 news articles (13 relevant) --- packages/scraper/src/scheduler.ts | 26 +++ .../scraper/src/scrapers/flexoptix-vendors.ts | 131 ++++++++++++ packages/scraper/src/scrapers/news.ts | 38 +++- packages/scraper/src/scrapers/tenGtek.ts | 193 ++++++++++++++++++ 4 files changed, 386 insertions(+), 2 deletions(-) create mode 100644 packages/scraper/src/scrapers/flexoptix-vendors.ts create mode 100644 packages/scraper/src/scrapers/tenGtek.ts diff --git a/packages/scraper/src/scheduler.ts b/packages/scraper/src/scheduler.ts index b117616..f18a373 100644 --- a/packages/scraper/src/scheduler.ts +++ b/packages/scraper/src/scheduler.ts @@ -40,7 +40,9 @@ export async function registerSchedules(boss: PgBoss): Promise { const queues = [ "scrape:pricing:fs", "scrape:pricing:optcore", + "scrape:pricing:10gtek", "scrape:compat:cisco", + "scrape:vendors:flexoptix", "scrape:news", "scrape:faq", "scrape:docs", @@ -79,6 +81,18 @@ export async function registerSchedules(boss: PgBoss): Promise { expireInSeconds: 3600, }); + // 10Gtek pricing (every 8 hours — Playwright, reasonable rate) + await boss.schedule("scrape:pricing:10gtek", "0 */8 * * *", {}, { + retryLimit: 2, + expireInSeconds: 3600, + }); + + // Flexoptix vendor list (weekly, Sunday at 6am — own data) + await 
boss.schedule("scrape:vendors:flexoptix", "0 6 * * 0", {}, { + retryLimit: 3, + expireInSeconds: 600, + }); + // Document/datasheet check (every Saturday at 4am) await boss.schedule("scrape:docs", "0 4 * * 6", {}, { retryLimit: 3, @@ -93,6 +107,8 @@ export async function registerWorkers(boss: PgBoss): Promise { const { scrapeFs } = await import("./scrapers/fs-com"); const { scrapeCiscoTmg } = await import("./scrapers/cisco-tmg"); const { scrapeOptcore } = await import("./scrapers/optcore"); + const { scrape10Gtek } = await import("./scrapers/tenGtek"); + const { scrapeFlexoptixVendors } = await import("./scrapers/flexoptix-vendors"); const { scrapeNews } = await import("./scrapers/news"); await boss.work("scrape:pricing:fs", async (_job) => { @@ -110,6 +126,16 @@ export async function registerWorkers(boss: PgBoss): Promise { await scrapeCiscoTmg(); }); + await boss.work("scrape:pricing:10gtek", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: 10Gtek pricing`); + await scrape10Gtek(); + }); + + await boss.work("scrape:vendors:flexoptix", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: Flexoptix vendor list`); + await scrapeFlexoptixVendors(); + }); + await boss.work("scrape:news", async (_job) => { console.log(`[${new Date().toISOString()}] Running: News aggregation`); await scrapeNews(); diff --git a/packages/scraper/src/scrapers/flexoptix-vendors.ts b/packages/scraper/src/scrapers/flexoptix-vendors.ts new file mode 100644 index 0000000..11f3a70 --- /dev/null +++ b/packages/scraper/src/scrapers/flexoptix-vendors.ts @@ -0,0 +1,131 @@ +/** + * Flexoptix Supported Vendors Scraper + * + * Scrapes flexoptix.net/en/supported-vendors/ for the full list of + * switch vendors Flexoptix supports. This is our own data — no restrictions. + * + * Data goes into: the vendors table (one row per vendor name). + * Per-vendor pages with individual switch models are not scraped yet.
/** One supported switch vendor as listed on the Flexoptix vendor index page. */
interface VendorEntry {
  /** Human-readable vendor name, decoded from the link. */
  name: string;
  /** The href the vendor was found under (entity-decoded, otherwise as served). */
  url: string;
}

/** The five predefined XML/HTML named entities; everything else is left untouched. */
const NAMED_HTML_ENTITIES = new Map<string, string>([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

/**
 * Decodes numeric (`&#58;`, `&#x3A;`) and the basic named HTML entities in a
 * single pass, so already-decoded text is never decoded twice.
 * Unknown or out-of-range entities are returned unchanged.
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z]+);/gi, (whole: string, body: string) => {
    if (body.startsWith("#")) {
      const isHex = body.charAt(1).toLowerCase() === "x";
      const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // String.fromCodePoint throws a RangeError above U+10FFFF.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    }
    return NAMED_HTML_ENTITIES.get(body.toLowerCase()) ?? whole;
  });
}

/**
 * Turns the URL path segment of a vendor link into a display name:
 * `+` becomes a space, then percent-escapes are decoded.
 */
function decodeVendorName(raw: string): string {
  const spaced = raw.replace(/\+/g, " ");
  try {
    return decodeURIComponent(spaced).trim();
  } catch {
    // Malformed escape (e.g. a lone "%"): fall back to the escapes the
    // scraper has always handled rather than dropping the vendor.
    return spaced
      .replace(/%20/g, " ")
      .replace(/%28/g, "(")
      .replace(/%29/g, ")")
      .trim();
  }
}

/**
 * Downloads the Flexoptix supported-vendors page and extracts the vendor list.
 *
 * Two link shapes are recognised:
 *  1. absolute links of the form `.../supported-vendors/index/name/<NAME>-compatible`
 *  2. an anchor that directly follows an element whose class contains "vendor"
 *
 * Entries are de-duplicated by case-insensitive name; the first occurrence wins,
 * and links of shape 1 are processed before links of shape 2.
 *
 * @returns The unique vendors in page order.
 * @throws Error when the HTTP response status is not OK, and whatever `fetch`
 *         rejects with on network failure or when the 30 s timeout fires.
 */
async function fetchVendorList(): Promise<VendorEntry[]> {
  const resp = await fetch("https://www.flexoptix.net/en/supported-vendors/", {
    headers: {
      "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; internal)",
      Accept: "text/html",
    },
    signal: AbortSignal.timeout(30000),
  });

  if (!resp.ok) throw new Error(`Flexoptix returned ${resp.status}`);

  const html = await resp.text();

  // Keyed by lower-cased name; Map iteration order is insertion order.
  const byName = new Map<string, VendorEntry>();
  const remember = (name: string, url: string): void => {
    const cleaned = name.trim();
    if (!cleaned) return;
    const key = cleaned.toLowerCase();
    if (!byName.has(key)) byName.set(key, { name: cleaned, url });
  };

  // Pattern: href="...supported-vendors/index/name/VENDOR-compatible"
  const vendorLink =
    /href="(https?:\/\/www\.flexoptix\.net\/en\/supported-vendors\/index\/name\/([^"]+)-compatible)"/g;
  for (const m of html.matchAll(vendorLink)) {
    // Attribute values are HTML text, so entities are decoded before use.
    remember(decodeVendorName(decodeHtmlEntities(m[2])), decodeHtmlEntities(m[1]));
  }

  // Fallback: plain link text inside an element carrying a "vendor" class.
  // NOTE(review): the "<a" token was missing from the pattern as found and is
  // restored here on the assumption that it was lost in transit — confirm.
  const classedLink =
    /class="[^"]*vendor[^"]*"[^>]*>\s*<a[^>]*href="([^"]+)"[^>]*>([^<]+)<\/a>/gi;
  for (const m of html.matchAll(classedLink)) {
    remember(decodeHtmlEntities(m[2]), decodeHtmlEntities(m[1]));
  }

  return [...byName.values()];
}

/**
 * Builds a URL-safe slug: lower-case, every run of characters outside
 * `a-z0-9` collapsed to one hyphen, no leading or trailing hyphen.
 * A name without any ASCII letter or digit yields an empty string.
 */
function slugify(name: string): string {
  return name
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/^-|-$/g, "");
}
/**
 * Inserts a vendor row, or keeps the existing row when the name already exists.
 *
 * On conflict only `website` is touched, and only when it is currently NULL
 * (`COALESCE(vendors.website, EXCLUDED.website)`), so a real website stored by
 * another source is never overwritten.
 *
 * NOTE(review): this statement writes the column `type`, whereas the 10Gtek
 * scraper added in the same commit writes `vendor_type` to the same table.
 * One of the two is presumably wrong — verify against the schema.
 *
 * @param name Vendor display name; also the conflict key.
 * @returns The `id` of the inserted or existing row (its type depends on the
 *          schema, which is not visible from here).
 */
async function upsertVendor(name: string): Promise<unknown> {
  const slug = slugify(name);
  const result = await pool.query(
    `INSERT INTO vendors (name, slug, type, website)
     VALUES ($1, $2, 'manufacturer', $3)
     ON CONFLICT (name) DO UPDATE SET website = COALESCE(vendors.website, EXCLUDED.website)
     RETURNING id`,
    [name, slug, `https://www.flexoptix.net/en/supported-vendors/`]
  );
  return result.rows[0].id;
}

/**
 * Scrapes the Flexoptix supported-vendors page and upserts every vendor into
 * the `vendors` table, logging how many were new and how many already existed.
 *
 * A failure on a single vendor is logged and skipped so one bad row cannot
 * abort the run; a failure to download the page propagates to the caller.
 */
export async function scrapeFlexoptixVendors(): Promise<void> {
  console.log("=== Flexoptix Vendor Scraper Starting ===\n");

  const vendors = await fetchVendorList();
  console.log(`Found ${vendors.length} supported vendors\n`);
  if (vendors.length === 0) {
    // A 200 response with nothing parseable most likely means the markup changed.
    console.warn(" No vendors parsed from the page; the link patterns may be out of date");
  }

  let newVendors = 0;
  let updatedVendors = 0;
  let failedVendors = 0;

  for (const vendor of vendors) {
    try {
      // Case-insensitive equality. ILIKE would treat "%", "_" and "\" inside a
      // vendor name as pattern syntax and could report unrelated rows as a match.
      const existing = await pool.query(
        `SELECT id FROM vendors WHERE LOWER(name) = LOWER($1)`,
        [vendor.name]
      );

      await upsertVendor(vendor.name);

      if (existing.rows.length === 0) {
        newVendors++;
        console.log(` + NEW: ${vendor.name}`);
      } else {
        updatedVendors++;
      }
    } catch (err) {
      failedVendors++;
      console.warn(
        ` Error saving vendor ${vendor.name}:`,
        err instanceof Error ? err.message : err
      );
    }
  }

  console.log(`\nVendors: ${vendors.length} total, ${newVendors} new, ${updatedVendors} existing`);
  if (failedVendors > 0) {
    console.warn(`Vendors that could not be saved: ${failedVendors}`);
  }
  console.log("=== Flexoptix Vendor Scraper Complete ===\n");
}

// Allow running this scraper directly (outside the scheduler).
if (require.main === module) {
  scrapeFlexoptixVendors()
    .then(() => pool.end())
    .catch(async (err) => {
      console.error("Fatal:", err);
      // Wait for the pool to close before exiting; ignore a second-close error
      // in case the failure came from pool.end() itself.
      await pool.end().catch(() => undefined);
      process.exit(1);
    });
}
{ name: "Optics.org", url: "https://optics.org/rss/news", category: "market_report", }, + // === SECONDARY: Datacenter / Networking === { - name: "SPIE Newsroom", - url: "https://www.spie.org/newsroom/rss.xml", + name: "Data Center Knowledge", + url: "https://www.datacenterknowledge.com/rss.xml", category: "market_report", }, { @@ -51,6 +68,12 @@ const FEEDS: RssFeed[] = [ url: "https://www.networkworld.com/category/data-center/index.rss", category: "market_report", }, + { + name: "SDxCentral", + url: "https://www.sdxcentral.com/feed/", + category: "market_report", + }, + // === TERTIARY: General tech / photonics === { name: "CableFree", url: "https://www.cablefree.net/rss", @@ -61,6 +84,17 @@ const FEEDS: RssFeed[] = [ url: "https://www.nature.com/nphoton.rss", category: "standard", }, + // === VENDOR NEWS === + { + name: "Cisco Blogs - Data Center", + url: "https://blogs.cisco.com/datacenter/feed", + category: "product_launch", + }, + { + name: "Arista Blog", + url: "https://blogs.arista.com/blog/rss.xml", + category: "product_launch", + }, ]; // Keywords for relevance scoring diff --git a/packages/scraper/src/scrapers/tenGtek.ts b/packages/scraper/src/scrapers/tenGtek.ts new file mode 100644 index 0000000..e6df4e4 --- /dev/null +++ b/packages/scraper/src/scrapers/tenGtek.ts @@ -0,0 +1,193 @@ +/** + * 10Gtek.com Scraper — Chinese OEM Transceiver Vendor + * + * Uses PlaywrightCrawler (JS-rendered site). + * Categories: SFP+, SFP28, QSFP+, QSFP28, QSFP-DD + * + * 10gtek.com is a direct competitor to FS.com at lower price points. + * No aggressive anti-bot (no Cloudflare), but content is JS-rendered.
/** Category landing pages to crawl, with the form factor and line rate they list. */
const CATEGORY_URLS = [
  { url: "https://www.10gtek.com/sfp-plus", formFactor: "SFP+", speedGbps: 10 },
  { url: "https://www.10gtek.com/sfp28", formFactor: "SFP28", speedGbps: 25 },
  { url: "https://www.10gtek.com/qsfp-plus", formFactor: "QSFP+", speedGbps: 40 },
  { url: "https://www.10gtek.com/100g-qsfp28", formFactor: "QSFP28", speedGbps: 100 },
  { url: "https://www.10gtek.com/400g-qsfp-dd", formFactor: "QSFP-DD", speedGbps: 400 },
];

/** Raw product data as read from a category page, before any parsing. */
interface ScrapedProduct {
  name: string;
  price: string;
  partNumber: string;
  url: string;
  inStock: boolean;
}

/**
 * Escapes `\`, `%` and `_` so a value can be embedded in a LIKE/ILIKE pattern
 * and match literally (backslash is PostgreSQL's default LIKE escape character).
 */
function escapeLikePattern(value: string): string {
  return value.replace(/[\\%_]/g, "\\$&");
}

/**
 * Gets or creates the 10Gtek vendor row and returns its id.
 *
 * NOTE(review): this statement writes the column `vendor_type` and no `slug`,
 * whereas the Flexoptix vendor scraper added in the same commit writes
 * `type` and `slug` to the same table — verify against the schema.
 *
 * @returns The vendor `id` (its type depends on the schema, not visible here).
 */
async function getVendorId(): Promise<unknown> {
  const result = await pool.query(
    `INSERT INTO vendors (name, vendor_type, website, country)
     VALUES ('10Gtek', 'competitor', 'https://www.10gtek.com', 'CN')
     ON CONFLICT (name) DO UPDATE SET vendor_type = 'competitor'
     RETURNING id`
  );
  return result.rows[0].id;
}

/**
 * Runs INSIDE the browser page (it is handed to `page.evaluate`), so it must
 * not reference anything from the surrounding module.
 *
 * First tries row/card containers; if that yields nothing, falls back to any
 * link whose text looks like a transceiver name and that has a price nearby.
 */
function extractProductsInPage(): ScrapedProduct[] {
  const items: ScrapedProduct[] = [];

  // Table rows and the two card layouts are all treated as "one product".
  const rows = document.querySelectorAll("table tr, .product-item, .product-card");
  rows.forEach((row) => {
    const nameEl = row.querySelector("a[href*='/'], .product-name, .item-name, td:first-child a");
    const priceEl = row.querySelector(".price, .product-price, [class*='price']");
    if (!nameEl || !priceEl) return;

    const name = nameEl.textContent?.trim() || "";
    const price = priceEl.textContent?.trim() || "";
    if (!name || !price) return;

    // nameEl is not always an anchor; a missing href becomes "".
    const url = (nameEl as HTMLAnchorElement).href || "";
    const partEl = row.querySelector(".sku, .part-number, [class*='sku']");
    // Without an explicit SKU element the first word of the name is used.
    const partNumber = partEl?.textContent?.trim() || name.split(" ")[0] || "";

    items.push({
      name,
      price,
      partNumber,
      url,
      inStock: !row.textContent?.toLowerCase().includes("out of stock"),
    });
  });

  // If the structured extraction yielded nothing, try a generic approach.
  if (items.length === 0) {
    document.querySelectorAll("a[href]").forEach((link) => {
      const text = link.textContent?.trim() || "";
      const parent = link.closest("div, tr, li");
      const priceText = parent?.querySelector("[class*='price']")?.textContent?.trim();
      if (text.length > 10 && priceText && text.match(/sfp|qsfp|xfp|cfp/i)) {
        items.push({
          name: text,
          price: priceText,
          partNumber: text.split(" ")[0],
          url: (link as HTMLAnchorElement).href,
          // The fallback has no container to inspect for stock text.
          inStock: true,
        });
      }
    });
  }

  return items;
}

/**
 * Stores one price observation for a scraped product.
 *
 * Nothing is written when the price cannot be parsed, when no known
 * transceiver matches the part number, or when the latest stored observation
 * for this transceiver and vendor has the same content hash.
 *
 * @returns `true` when a row was inserted, `false` when the product was skipped.
 */
async function recordPriceObservation(product: ScrapedProduct, vendorId: unknown): Promise<boolean> {
  const parsed = parsePrice(product.price);
  if (!parsed) return false;

  const partNumber = product.partNumber.trim();
  // An empty part number would produce the pattern "%%", which matches every row.
  if (!partNumber) return false;

  const hash = contentHash({
    name: product.name,
    price: parsed.price,
    stock: product.inStock,
  });

  // The part number comes from page content, so LIKE metacharacters are
  // escaped to keep the substring match literal.
  const txResult = await pool.query(
    `SELECT id FROM transceivers
     WHERE slug ILIKE $1 OR standard_name ILIKE $1
     LIMIT 1`,
    [`%${escapeLikePattern(partNumber)}%`]
  );
  if (txResult.rows.length === 0) return false;
  const transceiverId = txResult.rows[0].id;

  const existing = await pool.query(
    `SELECT content_hash FROM price_observations
     WHERE transceiver_id = $1 AND source_vendor_id = $2
     ORDER BY time DESC LIMIT 1`,
    [transceiverId, vendorId]
  );
  if (existing.rows[0]?.content_hash === hash) return false;

  await pool.query(
    `INSERT INTO price_observations
     (transceiver_id, source_vendor_id, price, currency, stock_level, url, content_hash)
     VALUES ($1, $2, $3, $4, $5, $6, $7)`,
    [
      transceiverId,
      vendorId,
      parsed.price,
      parsed.currency,
      product.inStock ? "in_stock" : "out_of_stock",
      product.url,
      hash,
    ]
  );
  return true;
}

/**
 * Crawls the 10Gtek category pages with Playwright and records a price
 * observation for every product that maps to a known transceiver.
 *
 * Per-product errors are logged and skipped. Individual category failures are
 * logged as well.
 *
 * @throws Error when every category request failed, so the scheduler can
 *         retry instead of recording an empty run as a success.
 */
export async function scrape10Gtek(): Promise<void> {
  console.log("=== 10Gtek Scraper Starting ===\n");

  const vendorId = await getVendorId();
  let totalProducts = 0;
  let totalPrices = 0;
  let failedRequests = 0;

  const crawler = new PlaywrightCrawler({
    maxRequestsPerCrawl: 50,
    maxConcurrency: 2,
    requestHandlerTimeoutSecs: 60,
    launchContext: {
      launchOptions: {
        headless: true,
        args: ["--no-sandbox"],
      },
    },
    async requestHandler({ page, request, log }) {
      const categoryInfo = CATEGORY_URLS.find((c) => request.url.startsWith(c.url));
      if (!categoryInfo) return;

      log.info(`Scraping: ${request.url} (${categoryInfo.formFactor})`);

      // Wait for the product grid; a timeout is not fatal because the
      // extractor has its own fallback.
      await page
        .waitForSelector(".product-item, .product-card, .item-info, table.products", {
          timeout: 15000,
        })
        .catch(() => {
          log.warning("No product grid found, trying alternative selectors");
        });

      const scraped = await page.evaluate(extractProductsInPage);

      // The container selectors overlap (a card inside a table row is seen
      // twice), so identical name+url pairs are collapsed.
      const seen = new Set<string>();
      const products = scraped.filter((p) => {
        const key = `${p.url}\u0000${p.name}`;
        if (seen.has(key)) return false;
        seen.add(key);
        return true;
      });

      log.info(`Found ${products.length} products on ${request.url}`);
      totalProducts += products.length;

      for (const product of products) {
        try {
          if (await recordPriceObservation(product, vendorId)) totalPrices++;
        } catch (err) {
          log.warning(
            `Error processing product: ${err instanceof Error ? err.message : String(err)}`
          );
        }
      }
    },
    failedRequestHandler({ request, log }) {
      failedRequests++;
      log.error(`Request failed: ${request.url}`);
    },
  });

  await crawler.run(CATEGORY_URLS.map((c) => c.url));

  console.log(`\nProducts found: ${totalProducts}`);
  console.log(`Prices written: ${totalPrices}`);

  if (failedRequests >= CATEGORY_URLS.length) {
    throw new Error(`10Gtek scrape failed: all ${failedRequests} category requests failed`);
  }

  console.log("=== 10Gtek Scraper Complete ===\n");
}

// Allow running this scraper directly (outside the scheduler).
if (require.main === module) {
  scrape10Gtek()
    .then(() => pool.end())
    .catch(async (err) => {
      console.error("Fatal:", err);
      // Wait for the pool to close before exiting; ignore a second-close error
      // in case the failure came from pool.end() itself.
      await pool.end().catch(() => undefined);
      process.exit(1);
    });
}