feat: add Flexoptix vendor scraper, 10Gtek pricing scraper, expand news feeds
- Flexoptix vendor scraper: 285 supported switch vendors ingested from flexoptix.net/en/supported-vendors/ (our own data, no restrictions)
- 10Gtek Playwright scraper: Chinese OEM competitor pricing (SFP+, SFP28, QSFP+, QSFP28, QSFP-DD categories)
- News feeds expanded: added Lightwave, Fierce Telecom, Data Center Knowledge, SDxCentral, Cisco Blogs, Arista Blog (11 total sources)
- Scheduler updated: 8 job queues with appropriate intervals
- DB now: 297 vendors, 89 transceivers, 33 news articles (13 relevant)
This commit is contained in:
parent
92f42832bf
commit
bd3a02ae4b
@ -40,7 +40,9 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
||||
const queues = [
|
||||
"scrape:pricing:fs",
|
||||
"scrape:pricing:optcore",
|
||||
"scrape:pricing:10gtek",
|
||||
"scrape:compat:cisco",
|
||||
"scrape:vendors:flexoptix",
|
||||
"scrape:news",
|
||||
"scrape:faq",
|
||||
"scrape:docs",
|
||||
@ -79,6 +81,18 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
||||
expireInSeconds: 3600,
|
||||
});
|
||||
|
||||
// 10Gtek pricing (every 8 hours — Playwright, reasonable rate)
|
||||
await boss.schedule("scrape:pricing:10gtek", "0 */8 * * *", {}, {
|
||||
retryLimit: 2,
|
||||
expireInSeconds: 3600,
|
||||
});
|
||||
|
||||
// Flexoptix vendor list (weekly, Sunday at 6am — own data)
|
||||
await boss.schedule("scrape:vendors:flexoptix", "0 6 * * 0", {}, {
|
||||
retryLimit: 3,
|
||||
expireInSeconds: 600,
|
||||
});
|
||||
|
||||
// Document/datasheet check (every Saturday at 4am)
|
||||
await boss.schedule("scrape:docs", "0 4 * * 6", {}, {
|
||||
retryLimit: 3,
|
||||
@ -93,6 +107,8 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
||||
const { scrapeFs } = await import("./scrapers/fs-com");
|
||||
const { scrapeCiscoTmg } = await import("./scrapers/cisco-tmg");
|
||||
const { scrapeOptcore } = await import("./scrapers/optcore");
|
||||
const { scrape10Gtek } = await import("./scrapers/tenGtek");
|
||||
const { scrapeFlexoptixVendors } = await import("./scrapers/flexoptix-vendors");
|
||||
const { scrapeNews } = await import("./scrapers/news");
|
||||
|
||||
await boss.work("scrape:pricing:fs", async (_job) => {
|
||||
@ -110,6 +126,16 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
||||
await scrapeCiscoTmg();
|
||||
});
|
||||
|
||||
await boss.work("scrape:pricing:10gtek", async (_job) => {
|
||||
console.log(`[${new Date().toISOString()}] Running: 10Gtek pricing`);
|
||||
await scrape10Gtek();
|
||||
});
|
||||
|
||||
await boss.work("scrape:vendors:flexoptix", async (_job) => {
|
||||
console.log(`[${new Date().toISOString()}] Running: Flexoptix vendor list`);
|
||||
await scrapeFlexoptixVendors();
|
||||
});
|
||||
|
||||
await boss.work("scrape:news", async (_job) => {
|
||||
console.log(`[${new Date().toISOString()}] Running: News aggregation`);
|
||||
await scrapeNews();
|
||||
|
||||
131
packages/scraper/src/scrapers/flexoptix-vendors.ts
Normal file
131
packages/scraper/src/scrapers/flexoptix-vendors.ts
Normal file
@ -0,0 +1,131 @@
|
||||
/**
|
||||
* Flexoptix Supported Vendors Scraper
|
||||
*
|
||||
* Scrapes flexoptix.net/en/supported-vendors/ for the full list of
|
||||
* switch vendors Flexoptix supports. This is our own data — no restrictions.
|
||||
*
|
||||
* Data goes into: the vendors table (one row per supported vendor name).
|
||||
* Per-vendor pages are not fetched; only the link found on the index page is collected.
|
||||
*/
|
||||
import { pool } from "../utils/db";
|
||||
|
||||
/** One vendor discovered on the Flexoptix supported-vendors index page. */
interface VendorEntry {
  /** Human-readable vendor name, decoded from the link slug or the link text. */
  name: string;
  /** Link target as found in the page (HTML entities decoded). */
  url: string;
}

/**
 * Decodes HTML character references in text pulled out of raw markup.
 *
 * Handles numeric references (`&#x2F;`, `&#47;`) and the five predefined named
 * entities. Anything unrecognised is returned untouched.
 */
function decodeHtmlEntities(text: string): string {
  const named = new Map<string, string>([
    ["amp", "&"],
    ["quot", '"'],
    ["apos", "'"],
    ["lt", "<"],
    ["gt", ">"],
  ]);
  return text.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z]+);/gi, (entity: string, body: string) => {
    if (body.startsWith("#")) {
      const isHex = body[1] === "x" || body[1] === "X";
      const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // Out-of-range references would make fromCodePoint throw; keep them verbatim.
      return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    }
    return named.get(body.toLowerCase()) ?? entity;
  });
}

/**
 * Turns the `<name>` segment of a `.../index/name/<name>-compatible` link into
 * a display name: HTML entities are decoded, `+` becomes a space, and
 * percent-escapes are resolved.
 */
function decodeVendorSlug(segment: string): string {
  const spaced = decodeHtmlEntities(segment).replace(/\+/g, " ");
  try {
    return decodeURIComponent(spaced).trim();
  } catch {
    // A stray "%" makes decodeURIComponent throw; fall back to the three
    // escapes this scraper has always handled so such names are still usable.
    return spaced
      .replace(/%20/g, " ")
      .replace(/%28/g, "(")
      .replace(/%29/g, ")")
      .trim();
  }
}

/**
 * Downloads the Flexoptix supported-vendors page and extracts the vendor list.
 *
 * Two patterns are recognised: absolute `.../supported-vendors/index/name/X-compatible`
 * links (name taken from the URL segment), and anchors that directly follow an
 * element whose class contains "vendor" (name taken from the link text).
 *
 * @returns Vendors in page order, de-duplicated by case-insensitive name
 *          (the first occurrence wins).
 * @throws Error when the page responds with a non-2xx status; fetch/timeout
 *         errors propagate unchanged.
 */
async function fetchVendorList(): Promise<VendorEntry[]> {
  const resp = await fetch("https://www.flexoptix.net/en/supported-vendors/", {
    headers: {
      "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; internal)",
      Accept: "text/html",
    },
    signal: AbortSignal.timeout(30000),
  });

  if (!resp.ok) throw new Error(`Flexoptix returned ${resp.status}`);

  const html = await resp.text();

  // Keyed by lower-cased name so de-duplication is a single O(1) lookup.
  const byName = new Map<string, VendorEntry>();
  const remember = (name: string, url: string): void => {
    const key = name.toLowerCase();
    if (name !== "" && !byName.has(key)) byName.set(key, { name, url });
  };

  let match: RegExpExecArray | null;

  // Pattern: href="...supported-vendors/index/name/VENDOR-compatible"
  const linkRegex =
    /href="(https?:\/\/www\.flexoptix\.net\/en\/supported-vendors\/index\/name\/([^"]+)-compatible)"/g;
  while ((match = linkRegex.exec(html)) !== null) {
    remember(decodeVendorSlug(match[2] ?? ""), decodeHtmlEntities(match[1] ?? ""));
  }

  // Also catch the plain link-text pattern.
  const altRegex = /class="[^"]*vendor[^"]*"[^>]*>\s*<a[^>]*href="([^"]+)"[^>]*>([^<]+)<\/a>/gi;
  while ((match = altRegex.exec(html)) !== null) {
    remember(decodeHtmlEntities(match[2] ?? "").trim(), decodeHtmlEntities(match[1] ?? ""));
  }

  return Array.from(byName.values());
}
|
||||
|
||||
/**
 * Builds a URL-safe slug from a vendor name.
 *
 * Accented letters are folded to their base letter first (NFKD decomposition
 * followed by removal of combining marks), so "Télécom" becomes "telecom"
 * rather than "t-l-com". Every remaining run of characters outside `a-z0-9`
 * collapses to a single dash, and leading/trailing dashes are removed.
 *
 * @param name Vendor display name.
 * @returns Lower-case slug; empty when the name has no ASCII letters or digits.
 */
function slugify(name: string): string {
  return name
    .normalize("NFKD")
    .replace(/[\u0300-\u036f]/g, "")
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/^-+|-+$/g, "");
}
|
||||
|
||||
/**
 * Inserts a vendor row, or touches the existing row with the same name, and
 * returns its id.
 *
 * On a name conflict only `website` is considered, and COALESCE keeps a value
 * that is already stored; the supported-vendors URL is used purely as a fill-in.
 *
 * NOTE(review): this statement writes a `type` column, while the 10Gtek
 * scraper in this commit writes `vendor_type` — confirm which one the schema has.
 *
 * @param name Vendor display name; also the conflict key.
 * @returns The `id` of the inserted or pre-existing row.
 */
async function upsertVendor(name: string): Promise<string> {
  const statement = `INSERT INTO vendors (name, slug, type, website)
     VALUES ($1, $2, 'manufacturer', $3)
     ON CONFLICT (name) DO UPDATE SET website = COALESCE(vendors.website, EXCLUDED.website)
     RETURNING id`;
  const values = [name, slugify(name), `https://www.flexoptix.net/en/supported-vendors/`];

  const { rows } = await pool.query(statement, values);
  return rows[0].id;
}
|
||||
|
||||
/**
 * Scrapes the Flexoptix supported-vendors list and upserts every vendor.
 *
 * Each vendor is handled independently: a failure for one vendor is logged and
 * counted, and the run continues with the next.
 *
 * @throws Error when the page parses to zero vendors, so that a markup change
 *         shows up as a failed job instead of a silent no-op. Errors from
 *         fetching the page also propagate.
 */
export async function scrapeFlexoptixVendors(): Promise<void> {
  console.log("=== Flexoptix Vendor Scraper Starting ===\n");

  const vendors = await fetchVendorList();
  console.log(`Found ${vendors.length} supported vendors\n`);

  if (vendors.length === 0) {
    throw new Error(
      "Flexoptix vendor page parsed to zero vendors; the page markup has probably changed"
    );
  }

  let newVendors = 0;
  let updatedVendors = 0;
  let failedVendors = 0;

  for (const vendor of vendors) {
    try {
      // Exact comparison on purpose: the upsert resolves conflicts on the name
      // column itself, and ILIKE would additionally treat "%" and "_" inside a
      // vendor name as wildcards, skewing the new/existing counters.
      const existing = await pool.query(`SELECT id FROM vendors WHERE name = $1`, [vendor.name]);

      await upsertVendor(vendor.name);

      if (existing.rows.length === 0) {
        newVendors++;
        console.log(` + NEW: ${vendor.name}`);
      } else {
        updatedVendors++;
      }
    } catch (err: unknown) {
      failedVendors++;
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(` Error saving vendor ${vendor.name}:`, reason);
    }
  }

  // The failure count is appended only when there is something to report, so
  // the usual summary line keeps its established shape.
  const failedSuffix = failedVendors > 0 ? `, ${failedVendors} failed` : "";
  console.log(
    `\nVendors: ${vendors.length} total, ${newVendors} new, ${updatedVendors} existing${failedSuffix}`
  );
  console.log("=== Flexoptix Vendor Scraper Complete ===\n");
}
|
||||
|
||||
// Direct invocation (`node flexoptix-vendors.js`): run once, then release the
// DB pool. Any failure is reported and the process exits with status 1.
if (require.main === module) {
  void (async () => {
    try {
      await scrapeFlexoptixVendors();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      void pool.end();
      process.exit(1);
    }
  })();
}
|
||||
@ -36,14 +36,31 @@ interface NewsArticle {
|
||||
}
|
||||
|
||||
const FEEDS: RssFeed[] = [
|
||||
// === PRIMARY: Transceiver-specific ===
|
||||
{
|
||||
name: "Lightwave Online",
|
||||
url: "https://www.lightwaveonline.com/rss",
|
||||
category: "market_report",
|
||||
},
|
||||
{
|
||||
name: "Lightwave - Fiber Optics",
|
||||
url: "https://www.lightwaveonline.com/fttx/rss",
|
||||
category: "market_report",
|
||||
},
|
||||
{
|
||||
name: "Fierce Telecom",
|
||||
url: "https://www.fiercetelecom.com/rss/xml",
|
||||
category: "market_report",
|
||||
},
|
||||
{
|
||||
name: "Optics.org",
|
||||
url: "https://optics.org/rss/news",
|
||||
category: "market_report",
|
||||
},
|
||||
// === SECONDARY: Datacenter / Networking ===
|
||||
{
|
||||
name: "SPIE Newsroom",
|
||||
url: "https://www.spie.org/newsroom/rss.xml",
|
||||
name: "Data Center Knowledge",
|
||||
url: "https://www.datacenterknowledge.com/rss.xml",
|
||||
category: "market_report",
|
||||
},
|
||||
{
|
||||
@ -51,6 +68,12 @@ const FEEDS: RssFeed[] = [
|
||||
url: "https://www.networkworld.com/category/data-center/index.rss",
|
||||
category: "market_report",
|
||||
},
|
||||
{
|
||||
name: "SDxCentral",
|
||||
url: "https://www.sdxcentral.com/feed/",
|
||||
category: "market_report",
|
||||
},
|
||||
// === TERTIARY: General tech / photonics ===
|
||||
{
|
||||
name: "CableFree",
|
||||
url: "https://www.cablefree.net/rss",
|
||||
@ -61,6 +84,17 @@ const FEEDS: RssFeed[] = [
|
||||
url: "https://www.nature.com/nphoton.rss",
|
||||
category: "standard",
|
||||
},
|
||||
// === VENDOR NEWS ===
|
||||
{
|
||||
name: "Cisco Blogs - Data Center",
|
||||
url: "https://blogs.cisco.com/datacenter/feed",
|
||||
category: "product_launch",
|
||||
},
|
||||
{
|
||||
name: "Arista Blog",
|
||||
url: "https://blogs.arista.com/blog/rss.xml",
|
||||
category: "product_launch",
|
||||
},
|
||||
];
|
||||
|
||||
// Keywords for relevance scoring
|
||||
|
||||
193
packages/scraper/src/scrapers/tenGtek.ts
Normal file
193
packages/scraper/src/scrapers/tenGtek.ts
Normal file
@ -0,0 +1,193 @@
|
||||
/**
|
||||
* 10Gtek.com Scraper — Chinese OEM Transceiver Vendor
|
||||
*
|
||||
* Uses PlaywrightCrawler (JS-rendered site).
|
||||
* Categories: SFP+, SFP28, QSFP+, QSFP28, QSFP-DD
|
||||
*
|
||||
* 10gtek.com is a direct competitor to FS.com at lower price points.
|
||||
* No aggressive anti-bot (no Cloudflare), but content is JS-rendered.
|
||||
*/
|
||||
import { PlaywrightCrawler, Dataset } from "crawlee";
|
||||
import { pool } from "../utils/db";
|
||||
import { contentHash, parsePrice, parseStockLevel } from "../utils/hash";
|
||||
|
||||
/**
 * Category listing pages to crawl, each tagged with the form factor and
 * nominal line rate (Gbit/s) of the products listed on it.
 */
const CATEGORY_URLS: ReadonlyArray<{ url: string; formFactor: string; speedGbps: number }> = (
  [
    ["sfp-plus", "SFP+", 10],
    ["sfp28", "SFP28", 25],
    ["qsfp-plus", "QSFP+", 40],
    ["100g-qsfp28", "QSFP28", 100],
    ["400g-qsfp-dd", "QSFP-DD", 400],
  ] as const
).map(([path, formFactor, speedGbps]) => ({
  url: `https://www.10gtek.com/${path}`,
  formFactor,
  speedGbps,
}));
|
||||
|
||||
// Get or create 10Gtek vendor
|
||||
/**
 * Returns the id of the "10Gtek" vendor row, inserting it on first use.
 * When the row already exists its `vendor_type` is set to 'competitor'.
 *
 * NOTE(review): the Flexoptix vendor scraper in this commit inserts into
 * `vendors (name, slug, type, website)`, whereas this statement uses
 * `vendor_type` and supplies no `slug` — confirm against the actual schema.
 */
async function getVendorId(): Promise<string> {
  const statement = `INSERT INTO vendors (name, vendor_type, website, country)
     VALUES ('10Gtek', 'competitor', 'https://www.10gtek.com', 'CN')
     ON CONFLICT (name) DO UPDATE SET vendor_type = 'competitor'
     RETURNING id`;

  const { rows } = await pool.query(statement);
  return rows[0].id;
}
|
||||
|
||||
/**
 * Escapes the LIKE/ILIKE metacharacters (`\`, `%`, `_`) so a value taken from
 * a web page is matched literally. Backslash is PostgreSQL's default LIKE
 * escape character.
 */
function escapeLikePattern(value: string): string {
  return value.replace(/[\\%_]/g, "\\$&");
}

/**
 * Crawls the 10Gtek category pages and records a price observation for every
 * listed product that can be matched to a known transceiver.
 *
 * A product is matched by looking for its part number (or, when no SKU element
 * is present, the first word of its name) inside `transceivers.slug` /
 * `standard_name`. Products without a match, without a parseable price, or
 * whose latest stored observation has the same content hash are skipped.
 *
 * @throws Error when every category request failed, so a scheduled job is
 *         reported as failed instead of finishing with nothing written.
 */
export async function scrape10Gtek(): Promise<void> {
  console.log("=== 10Gtek Scraper Starting ===\n");

  const vendorId = await getVendorId();
  let totalProducts = 0;
  let totalPrices = 0;
  let failedRequests = 0;

  const crawler = new PlaywrightCrawler({
    maxRequestsPerCrawl: 50,
    maxConcurrency: 2,
    requestHandlerTimeoutSecs: 60,
    launchContext: {
      launchOptions: {
        headless: true,
        args: ["--no-sandbox"],
      },
    },
    async requestHandler({ page, request, log }) {
      const categoryInfo = CATEGORY_URLS.find((c) => request.url.startsWith(c.url));
      if (!categoryInfo) return;

      log.info(`Scraping: ${request.url} (${categoryInfo.formFactor})`);

      // Wait for the product grid; a timeout is not fatal because the
      // extraction below has a generic fallback.
      await page
        .waitForSelector(".product-item, .product-card, .item-info, table.products", {
          timeout: 15000,
        })
        .catch(() => {
          log.warning("No product grid found, trying alternative selectors");
        });

      // Runs inside the browser page, so it must stay self-contained and may
      // not reference anything from the surrounding Node scope.
      const products = await page.evaluate(() => {
        const items: Array<{
          name: string;
          price: string;
          partNumber: string;
          url: string;
          inStock: boolean;
        }> = [];

        // Structured layouts: table rows or product cards.
        const rows = document.querySelectorAll("table tr, .product-item, .product-card");
        rows.forEach((row) => {
          const nameEl = row.querySelector(
            "a[href*='/'], .product-name, .item-name, td:first-child a"
          );
          const priceEl = row.querySelector(".price, .product-price, [class*='price']");
          if (!nameEl || !priceEl) return;

          const name = nameEl.textContent?.trim() || "";
          const price = priceEl.textContent?.trim() || "";
          if (!name || !price) return;

          const partEl = row.querySelector(".sku, .part-number, [class*='sku']");
          items.push({
            name,
            price,
            // No SKU element: fall back to the first word of the product name.
            partNumber: partEl?.textContent?.trim() || name.split(" ")[0] || "",
            url: (nameEl as HTMLAnchorElement).href || "",
            inStock: !row.textContent?.toLowerCase().includes("out of stock"),
          });
        });

        // Nothing structured found: take any transceiver-looking link that has
        // a price element under its nearest container.
        if (items.length === 0) {
          document.querySelectorAll("a[href]").forEach((link) => {
            const text = link.textContent?.trim() || "";
            const parent = link.closest("div, tr, li");
            const priceText = parent?.querySelector("[class*='price']")?.textContent?.trim();
            if (text.length > 10 && priceText && text.match(/sfp|qsfp|xfp|cfp/i)) {
              items.push({
                name: text,
                price: priceText,
                partNumber: text.split(" ")[0],
                url: (link as HTMLAnchorElement).href,
                inStock: true,
              });
            }
          });
        }

        return items;
      });

      log.info(`Found ${products.length} products on ${request.url}`);
      totalProducts += products.length;

      for (const product of products) {
        try {
          const parsed = parsePrice(product.price);
          if (!parsed) continue;

          // A blank part number would turn the pattern below into "%%", which
          // matches every transceiver and would attach the price to a random one.
          const partNumber = product.partNumber.trim();
          if (partNumber === "") continue;

          const hash = contentHash({
            name: product.name,
            price: parsed.price,
            stock: product.inStock,
          });

          // Find the transceiver this product refers to; "_" and "%" in the
          // scraped part number are escaped so they match literally.
          const txResult = await pool.query(
            `SELECT id FROM transceivers
             WHERE slug ILIKE $1 OR standard_name ILIKE $1
             LIMIT 1`,
            [`%${escapeLikePattern(partNumber)}%`]
          );
          if (txResult.rows.length === 0) continue;

          const transceiverId = txResult.rows[0].id;

          const existing = await pool.query(
            `SELECT content_hash FROM price_observations
             WHERE transceiver_id = $1 AND source_vendor_id = $2
             ORDER BY time DESC LIMIT 1`,
            [transceiverId, vendorId]
          );

          // Unchanged since the last observation: nothing new to record.
          if (existing.rows[0]?.content_hash === hash) continue;

          await pool.query(
            `INSERT INTO price_observations
             (transceiver_id, source_vendor_id, price, currency, stock_level, url, content_hash)
             VALUES ($1, $2, $3, $4, $5, $6, $7)`,
            [
              transceiverId,
              vendorId,
              parsed.price,
              parsed.currency,
              product.inStock ? "in_stock" : "out_of_stock",
              product.url,
              hash,
            ]
          );
          totalPrices++;
        } catch (err: unknown) {
          const reason = err instanceof Error ? err.message : String(err);
          log.warning(`Error processing product: ${reason}`);
        }
      }
    },
    failedRequestHandler({ request, log }) {
      failedRequests++;
      log.error(`Request failed: ${request.url}`);
    },
  });

  // Crawlee de-duplicates requests by uniqueKey within a request queue. A
  // per-run key keeps a repeat run inside the same long-lived worker process
  // from being treated as "already handled" and skipped.
  const runId = Date.now().toString(36);
  await crawler.run(
    CATEGORY_URLS.map((c) => ({ url: c.url, uniqueKey: `${c.url}#run-${runId}` }))
  );

  console.log(`\nProducts found: ${totalProducts}`);
  console.log(`Prices written: ${totalPrices}`);
  if (failedRequests > 0) {
    console.warn(`Failed requests: ${failedRequests}`);
  }

  if (CATEGORY_URLS.length > 0 && failedRequests >= CATEGORY_URLS.length) {
    throw new Error(`10Gtek scrape failed: all ${failedRequests} category requests failed`);
  }

  console.log("=== 10Gtek Scraper Complete ===\n");
}
|
||||
|
||||
// Direct invocation (`node tenGtek.js`): run once, then release the DB pool.
// Any failure is reported and the process exits with status 1.
if (require.main === module) {
  void (async () => {
    try {
      await scrape10Gtek();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      void pool.end();
      process.exit(1);
    }
  })();
}
|
||||
Loading…
x
Reference in New Issue
Block a user