feat: add Flexoptix vendor scraper, 10Gtek pricing scraper, expand news feeds

- Flexoptix vendor scraper: 285 supported switch vendors ingested from
  flexoptix.net/en/supported-vendors/ (our own data, no restrictions)
- 10Gtek Playwright scraper: Chinese OEM competitor pricing (SFP+, SFP28,
  QSFP+, QSFP28, QSFP-DD categories)
- News feeds expanded: added Lightwave, Fierce Telecom, Data Center Knowledge,
  SDxCentral, Cisco Blogs, Arista Blog (11 total sources)
- Scheduler updated: 8 job queues with appropriate intervals
- DB now: 297 vendors, 89 transceivers, 33 news articles (13 relevant)
This commit is contained in:
Rene Fichtmueller 2026-03-27 23:17:42 +13:00
parent 649e6a9796
commit ae411cb575
4 changed files with 386 additions and 2 deletions

View File

@ -40,7 +40,9 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
const queues = [ const queues = [
"scrape:pricing:fs", "scrape:pricing:fs",
"scrape:pricing:optcore", "scrape:pricing:optcore",
"scrape:pricing:10gtek",
"scrape:compat:cisco", "scrape:compat:cisco",
"scrape:vendors:flexoptix",
"scrape:news", "scrape:news",
"scrape:faq", "scrape:faq",
"scrape:docs", "scrape:docs",
@ -79,6 +81,18 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
expireInSeconds: 3600, expireInSeconds: 3600,
}); });
// 10Gtek pricing (every 8 hours — Playwright, reasonable rate)
await boss.schedule("scrape:pricing:10gtek", "0 */8 * * *", {}, {
retryLimit: 2,
expireInSeconds: 3600,
});
// Flexoptix vendor list (weekly, Sunday at 6am — own data)
await boss.schedule("scrape:vendors:flexoptix", "0 6 * * 0", {}, {
retryLimit: 3,
expireInSeconds: 600,
});
// Document/datasheet check (every Saturday at 4am) // Document/datasheet check (every Saturday at 4am)
await boss.schedule("scrape:docs", "0 4 * * 6", {}, { await boss.schedule("scrape:docs", "0 4 * * 6", {}, {
retryLimit: 3, retryLimit: 3,
@ -93,6 +107,8 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
const { scrapeFs } = await import("./scrapers/fs-com"); const { scrapeFs } = await import("./scrapers/fs-com");
const { scrapeCiscoTmg } = await import("./scrapers/cisco-tmg"); const { scrapeCiscoTmg } = await import("./scrapers/cisco-tmg");
const { scrapeOptcore } = await import("./scrapers/optcore"); const { scrapeOptcore } = await import("./scrapers/optcore");
const { scrape10Gtek } = await import("./scrapers/tenGtek");
const { scrapeFlexoptixVendors } = await import("./scrapers/flexoptix-vendors");
const { scrapeNews } = await import("./scrapers/news"); const { scrapeNews } = await import("./scrapers/news");
await boss.work("scrape:pricing:fs", async (_job) => { await boss.work("scrape:pricing:fs", async (_job) => {
@ -110,6 +126,16 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
await scrapeCiscoTmg(); await scrapeCiscoTmg();
}); });
await boss.work("scrape:pricing:10gtek", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: 10Gtek pricing`);
await scrape10Gtek();
});
await boss.work("scrape:vendors:flexoptix", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: Flexoptix vendor list`);
await scrapeFlexoptixVendors();
});
await boss.work("scrape:news", async (_job) => { await boss.work("scrape:news", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: News aggregation`); console.log(`[${new Date().toISOString()}] Running: News aggregation`);
await scrapeNews(); await scrapeNews();

View File

@ -0,0 +1,131 @@
/**
 * Flexoptix Supported Vendors Scraper
 *
 * Scrapes flexoptix.net/en/supported-vendors/ for the full list of
 * switch vendors Flexoptix supports. This is our own data, so there are
 * no usage restrictions.
 *
 * Data goes into: the vendors table (one row per supported vendor name).
 * NOTE: the per-vendor page URLs are collected, but scraping those pages for
 * individual switch models (and writing to the switches table) is not
 * implemented in this module yet.
 */
import { pool } from "../utils/db";
/** A vendor link discovered on the Flexoptix supported-vendors page. */
interface VendorEntry {
  /** Decoded, human-readable vendor name. */
  name: string;
  /** Link target taken from the page, with HTML character references decoded. */
  url: string;
}

/**
 * Decodes HTML character references in attribute values or text nodes.
 *
 * Handles numeric references in hex (`&#x2B;`) and decimal (`&#43;`) form plus
 * the five predefined named entities. Unknown named entities are left as-is.
 */
function decodeHtmlEntities(text: string): string {
  const named: Record<string, string> = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
  return text.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z]+);/gi, (whole: string, body: string) => {
    if (body.startsWith("#")) {
      const isHex = body.charAt(1).toLowerCase() === "x";
      const code = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
      // String.fromCodePoint throws above U+10FFFF; keep such references verbatim.
      return code <= 0x10ffff ? String.fromCodePoint(code) : whole;
    }
    return named[body.toLowerCase()] ?? whole;
  });
}

/**
 * Turns the URL-encoded vendor segment of a vendor page URL into a display name.
 * `+` stands for a space; every percent-escape is decoded.
 */
function decodeVendorSlug(slug: string): string {
  const spaced = slug.replace(/\+/g, " ");
  try {
    return decodeURIComponent(spaced);
  } catch {
    // Malformed escape sequence (e.g. a stray "%"): fall back to the handful of
    // escapes that were decoded historically instead of dropping the vendor.
    return spaced.replace(/%20/g, " ").replace(/%28/g, "(").replace(/%29/g, ")");
  }
}

/**
 * Downloads the Flexoptix supported-vendors page and extracts the vendor list.
 *
 * Two patterns are recognised:
 *  1. links of the form `.../supported-vendors/index/name/VENDOR-compatible`,
 *     where the vendor name is taken from the URL, and
 *  2. an `<a>` that directly follows an element whose class contains "vendor",
 *     where the vendor name is the link text.
 *
 * HTML character references are decoded before matching, so hrefs that are
 * entity-encoded are recognised and names never contain raw `&#x..;` sequences.
 *
 * @returns Vendors in page order, de-duplicated by name (case-insensitive,
 *          first occurrence wins).
 * @throws Error when the HTTP response status is not OK; fetch/timeout errors
 *         (30 s limit) propagate unchanged.
 */
async function fetchVendorList(): Promise<VendorEntry[]> {
  const resp = await fetch("https://www.flexoptix.net/en/supported-vendors/", {
    headers: {
      "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; internal)",
      Accept: "text/html",
    },
    signal: AbortSignal.timeout(30000),
  });
  if (!resp.ok) throw new Error(`Flexoptix returned ${resp.status}`);
  const html = await resp.text();

  // Insertion-ordered map keyed by lower-cased name: gives page order and
  // case-insensitive de-duplication in a single pass.
  const byName = new Map<string, VendorEntry>();
  const remember = (name: string, url: string): void => {
    const key = name.toLowerCase();
    if (name && !byName.has(key)) byName.set(key, { name, url });
  };

  // Pattern 1: href="...supported-vendors/index/name/VENDOR-compatible"
  const vendorUrl = /^https?:\/\/www\.flexoptix\.net\/en\/supported-vendors\/index\/name\/(.+)-compatible$/;
  for (const hrefMatch of html.matchAll(/href="([^"]*)"/g)) {
    const url = decodeHtmlEntities(hrefMatch[1] ?? "");
    const slug = vendorUrl.exec(url)?.[1];
    if (slug !== undefined) remember(decodeVendorSlug(slug), url);
  }

  // Pattern 2: plain link text inside an element carrying a "vendor" class
  const altRegex = /class="[^"]*vendor[^"]*"[^>]*>\s*<a[^>]*href="([^"]+)"[^>]*>([^<]+)<\/a>/gi;
  for (const altMatch of html.matchAll(altRegex)) {
    remember(decodeHtmlEntities(altMatch[2] ?? "").trim(), decodeHtmlEntities(altMatch[1] ?? ""));
  }

  return [...byName.values()];
}
/**
 * Builds a URL-safe slug from a display name.
 *
 * The name is lower-cased, every run of characters other than `a-z` / `0-9`
 * becomes a single hyphen, and no hyphen is left at the start or end.
 *
 * @param name Display name, e.g. "Dell (EMC)".
 * @returns The slug, e.g. "dell-emc"; empty when the name has no ASCII
 *          letters or digits.
 */
function slugify(name: string): string {
  const words = name.toLowerCase().split(/[^a-z0-9]+/);
  return words.filter((word) => word.length > 0).join("-");
}
/**
 * Inserts a vendor row for `name`, or touches the existing row with the same
 * name, and returns the row id.
 *
 * New rows get type 'manufacturer', a slug derived from the name and the
 * supported-vendors page as website. On a name conflict only `website` is
 * written, and an already stored website is kept (COALESCE prefers the
 * existing value).
 *
 * NOTE(review): this statement writes a `type` column, whereas the 10Gtek
 * scraper writes `vendor_type` on the same table — confirm which column name
 * matches the schema.
 *
 * @param name Vendor display name; also the conflict key.
 * @returns The id of the inserted or existing vendor row.
 */
async function upsertVendor(name: string): Promise<string> {
  const params = [name, slugify(name), "https://www.flexoptix.net/en/supported-vendors/"];
  const { rows } = await pool.query(
    `INSERT INTO vendors (name, slug, type, website)
VALUES ($1, $2, 'manufacturer', $3)
ON CONFLICT (name) DO UPDATE SET website = COALESCE(vendors.website, EXCLUDED.website)
RETURNING id`,
    params
  );
  const [row] = rows;
  return row.id;
}
/**
 * Fetches the Flexoptix supported-vendor list and upserts every vendor into
 * the vendors table, logging how many were new and how many already existed.
 *
 * A failure while saving one vendor is logged and does not stop the run;
 * a failure while fetching the list rejects the returned promise.
 */
export async function scrapeFlexoptixVendors(): Promise<void> {
  console.log("=== Flexoptix Vendor Scraper Starting ===\n");
  const vendors = await fetchVendorList();
  console.log(`Found ${vendors.length} supported vendors\n`);

  let newVendors = 0;
  let updatedVendors = 0;
  for (const vendor of vendors) {
    try {
      // Exact match on purpose: the upsert resolves conflicts on the exact
      // name, so the "new vs. existing" statistic must use the same rule.
      // ILIKE would be case-insensitive and would treat "%" / "_" inside a
      // vendor name as wildcards, reporting vendors as existing that the
      // upsert then inserts as new rows.
      const existing = await pool.query(
        `SELECT id FROM vendors WHERE name = $1`,
        [vendor.name]
      );
      const isNew = existing.rows.length === 0;
      await upsertVendor(vendor.name);
      if (isNew) {
        newVendors++;
        console.log(` + NEW: ${vendor.name}`);
      } else {
        updatedVendors++;
      }
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(` Error saving vendor ${vendor.name}:`, reason);
    }
  }

  console.log(`\nVendors: ${vendors.length} total, ${newVendors} new, ${updatedVendors} existing`);
  console.log("=== Flexoptix Vendor Scraper Complete ===\n");
}
// CLI entry point: when this file is executed directly (not imported), run the
// scraper once, then close the database pool. Any failure is reported and the
// process exits with status 1.
if (require.main === module) {
  void (async () => {
    try {
      await scrapeFlexoptixVendors();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  })();
}

View File

@ -36,14 +36,31 @@ interface NewsArticle {
} }
const FEEDS: RssFeed[] = [ const FEEDS: RssFeed[] = [
// === PRIMARY: Transceiver-specific ===
{
name: "Lightwave Online",
url: "https://www.lightwaveonline.com/rss",
category: "market_report",
},
{
name: "Lightwave - Fiber Optics",
url: "https://www.lightwaveonline.com/fttx/rss",
category: "market_report",
},
{
name: "Fierce Telecom",
url: "https://www.fiercetelecom.com/rss/xml",
category: "market_report",
},
{ {
name: "Optics.org", name: "Optics.org",
url: "https://optics.org/rss/news", url: "https://optics.org/rss/news",
category: "market_report", category: "market_report",
}, },
// === SECONDARY: Datacenter / Networking ===
{ {
name: "SPIE Newsroom", name: "Data Center Knowledge",
url: "https://www.spie.org/newsroom/rss.xml", url: "https://www.datacenterknowledge.com/rss.xml",
category: "market_report", category: "market_report",
}, },
{ {
@ -51,6 +68,12 @@ const FEEDS: RssFeed[] = [
url: "https://www.networkworld.com/category/data-center/index.rss", url: "https://www.networkworld.com/category/data-center/index.rss",
category: "market_report", category: "market_report",
}, },
{
name: "SDxCentral",
url: "https://www.sdxcentral.com/feed/",
category: "market_report",
},
// === TERTIARY: General tech / photonics ===
{ {
name: "CableFree", name: "CableFree",
url: "https://www.cablefree.net/rss", url: "https://www.cablefree.net/rss",
@ -61,6 +84,17 @@ const FEEDS: RssFeed[] = [
url: "https://www.nature.com/nphoton.rss", url: "https://www.nature.com/nphoton.rss",
category: "standard", category: "standard",
}, },
// === VENDOR NEWS ===
{
name: "Cisco Blogs - Data Center",
url: "https://blogs.cisco.com/datacenter/feed",
category: "product_launch",
},
{
name: "Arista Blog",
url: "https://blogs.arista.com/blog/rss.xml",
category: "product_launch",
},
]; ];
// Keywords for relevance scoring // Keywords for relevance scoring

View File

@ -0,0 +1,193 @@
/**
 * 10Gtek.com Scraper — Chinese OEM Transceiver Vendor
 *
 * Uses PlaywrightCrawler (JS-rendered site).
 * Categories: SFP+, SFP28, QSFP+, QSFP28, QSFP-DD
 * (OSFP is not crawled yet: CATEGORY_URLS has no OSFP entry.)
 *
 * 10gtek.com is a direct competitor to FS.com at lower price points.
 * No aggressive anti-bot (no Cloudflare), but content is JS-rendered.
 */
import { PlaywrightCrawler, Dataset } from "crawlee";
import { pool } from "../utils/db";
import { contentHash, parsePrice, parseStockLevel } from "../utils/hash";
// Category listing pages that seed the crawl, one entry per form factor.
// The crawler's request handler maps a request back to its entry by URL prefix
// and logs `formFactor`; `speedGbps` is not read anywhere in this file.
const CATEGORY_URLS = [
{ url: "https://www.10gtek.com/sfp-plus", formFactor: "SFP+", speedGbps: 10 },
{ url: "https://www.10gtek.com/sfp28", formFactor: "SFP28", speedGbps: 25 },
{ url: "https://www.10gtek.com/qsfp-plus", formFactor: "QSFP+", speedGbps: 40 },
{ url: "https://www.10gtek.com/100g-qsfp28", formFactor: "QSFP28", speedGbps: 100 },
{ url: "https://www.10gtek.com/400g-qsfp-dd", formFactor: "QSFP-DD", speedGbps: 400 },
];
/**
 * Returns the id of the '10Gtek' vendor row, creating the row when missing.
 *
 * A new row is inserted with vendor_type 'competitor', the 10gtek.com website
 * and country 'CN'. When a vendor named '10Gtek' already exists, its
 * vendor_type is set to 'competitor' and the other columns are left alone.
 *
 * NOTE(review): the Flexoptix vendor scraper writes a `type` column on the
 * same table, while this statement writes `vendor_type` — confirm which
 * column name matches the schema.
 */
async function getVendorId(): Promise<string> {
  const { rows } = await pool.query(
    `INSERT INTO vendors (name, vendor_type, website, country)
VALUES ('10Gtek', 'competitor', 'https://www.10gtek.com', 'CN')
ON CONFLICT (name) DO UPDATE SET vendor_type = 'competitor'
RETURNING id`
  );
  const [vendorRow] = rows;
  return vendorRow.id;
}
/** Product fields extracted from a 10Gtek category page inside the browser. */
interface TenGtekProduct {
  name: string;
  /** Raw price text as shown on the page; parsed later with parsePrice. */
  price: string;
  /** SKU text when the page exposes one, otherwise the first word of the name. */
  partNumber: string;
  url: string;
  inStock: boolean;
}

/** Escapes LIKE/ILIKE metacharacters (`\`, `%`, `_`) so the value matches literally. */
function escapeLikePattern(value: string): string {
  return value.replace(/[\\%_]/g, "\\$&");
}

/**
 * Stores one price observation for a scraped product.
 *
 * The product is skipped when its price cannot be parsed, when no transceiver
 * slug/standard name contains the part number, or when the latest stored
 * observation for that transceiver and vendor has the same content hash.
 *
 * @returns true when a new price_observations row was written.
 */
async function recordPriceObservation(product: TenGtekProduct, vendorId: string): Promise<boolean> {
  const parsed = parsePrice(product.price);
  if (!parsed) return false;

  const hash = contentHash({
    name: product.name,
    price: parsed.price,
    stock: product.inStock,
  });

  // Find the transceiver this product refers to. The part number is escaped so
  // that "%" or "_" inside it cannot act as wildcards.
  // NOTE(review): this is a substring match with LIMIT 1 and no ORDER BY, so a
  // short part number (e.g. the first word of a product name such as "10G")
  // can pick an arbitrary transceiver — consider a stricter matching rule.
  const txResult = await pool.query(
    `SELECT id FROM transceivers
WHERE slug ILIKE $1 OR standard_name ILIKE $1
LIMIT 1`,
    [`%${escapeLikePattern(product.partNumber)}%`]
  );
  if (txResult.rows.length === 0) return false;
  const transceiverId = txResult.rows[0].id;

  // Skip the insert when nothing changed since the last observation.
  const existing = await pool.query(
    `SELECT content_hash FROM price_observations
WHERE transceiver_id = $1 AND source_vendor_id = $2
ORDER BY time DESC LIMIT 1`,
    [transceiverId, vendorId]
  );
  if (existing.rows[0]?.content_hash === hash) return false;

  await pool.query(
    `INSERT INTO price_observations
(transceiver_id, source_vendor_id, price, currency, stock_level, url, content_hash)
VALUES ($1, $2, $3, $4, $5, $6, $7)`,
    [
      transceiverId,
      vendorId,
      parsed.price,
      parsed.currency,
      product.inStock ? "in_stock" : "out_of_stock",
      product.url,
      hash,
    ]
  );
  return true;
}

/**
 * Crawls the 10Gtek category pages with Playwright and records changed prices.
 *
 * For every category page the product list is extracted in the browser; each
 * product is then handed to recordPriceObservation. Per-product errors are
 * logged and do not abort the crawl; failed page requests are logged by the
 * failedRequestHandler. Totals are printed when the crawl finishes.
 */
export async function scrape10Gtek(): Promise<void> {
  console.log("=== 10Gtek Scraper Starting ===\n");
  const vendorId = await getVendorId();
  let totalProducts = 0;
  let totalPrices = 0;

  const crawler = new PlaywrightCrawler({
    maxRequestsPerCrawl: 50,
    maxConcurrency: 2,
    requestHandlerTimeoutSecs: 60,
    launchContext: {
      launchOptions: {
        headless: true,
        args: ["--no-sandbox"],
      },
    },
    async requestHandler({ page, request, log }) {
      const categoryInfo = CATEGORY_URLS.find((c) => request.url.startsWith(c.url));
      if (!categoryInfo) return;
      log.info(`Scraping: ${request.url} (${categoryInfo.formFactor})`);

      // Wait for product grid to load; a timeout is not fatal because the
      // extraction below has a generic fallback.
      await page.waitForSelector(".product-item, .product-card, .item-info, table.products", {
        timeout: 15000,
      }).catch(() => {
        log.warning("No product grid found, trying alternative selectors");
      });

      // Extract products — 10gtek uses various layouts. This callback runs in
      // the browser, so it must not reference anything from the Node scope.
      const products = await page.evaluate(() => {
        const items: TenGtekProduct[] = [];

        // Try table / card layout
        const rows = document.querySelectorAll("table tr, .product-item, .product-card");
        rows.forEach((row) => {
          const nameEl = row.querySelector("a[href*='/'], .product-name, .item-name, td:first-child a");
          const priceEl = row.querySelector(".price, .product-price, [class*='price']");
          if (nameEl && priceEl) {
            const name = nameEl.textContent?.trim() || "";
            const price = priceEl.textContent?.trim() || "";
            const url = (nameEl as HTMLAnchorElement).href || "";
            const partEl = row.querySelector(".sku, .part-number, [class*='sku']");
            const partNumber = partEl?.textContent?.trim() || name.split(" ")[0] || "";
            if (name && price) {
              items.push({
                name,
                price,
                partNumber,
                url,
                inStock: !row.textContent?.toLowerCase().includes("out of stock"),
              });
            }
          }
        });

        // If table extraction yielded nothing, try generic approach
        if (items.length === 0) {
          const allLinks = document.querySelectorAll("a[href]");
          allLinks.forEach((link) => {
            const text = link.textContent?.trim() || "";
            const parent = link.closest("div, tr, li");
            const priceText = parent?.querySelector("[class*='price']")?.textContent?.trim();
            if (text.length > 10 && priceText && text.match(/sfp|qsfp|xfp|cfp/i)) {
              items.push({
                name: text,
                price: priceText,
                partNumber: text.split(" ")[0],
                url: (link as HTMLAnchorElement).href,
                inStock: true,
              });
            }
          });
        }
        return items;
      });

      log.info(`Found ${products.length} products on ${request.url}`);
      totalProducts += products.length;

      for (const product of products) {
        try {
          if (await recordPriceObservation(product, vendorId)) totalPrices++;
        } catch (err) {
          const reason = err instanceof Error ? err.message : String(err);
          log.warning(`Error processing product: ${reason}`);
        }
      }
    },
    failedRequestHandler({ request, log }) {
      log.error(`Request failed: ${request.url}`);
    },
  });

  // Crawlee de-duplicates requests by uniqueKey within a request queue. This
  // function is invoked repeatedly by the scheduler from one long-lived worker
  // process, so each run enqueues the category pages under a per-run key to
  // make sure they are not skipped as "already handled" by an earlier run.
  const runId = Date.now();
  await crawler.run(
    CATEGORY_URLS.map((c) => ({ url: c.url, uniqueKey: `${c.url}#run-${runId}` }))
  );

  console.log(`\nProducts found: ${totalProducts}`);
  console.log(`Prices written: ${totalPrices}`);
  console.log("=== 10Gtek Scraper Complete ===\n");
}
// CLI entry point: when this file is executed directly (not imported), run the
// scraper once, then close the database pool. Any failure is reported and the
// process exits with status 1.
if (require.main === module) {
  void (async () => {
    try {
      await scrape10Gtek();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  })();
}