Rene Fichtmueller bd3a02ae4b feat: add Flexoptix vendor scraper, 10Gtek pricing scraper, expand news feeds
- Flexoptix vendor scraper: 285 supported switch vendors ingested from
  flexoptix.net/en/supported-vendors/ (our own data, no restrictions)
- 10Gtek Playwright scraper: Chinese OEM competitor pricing (SFP+, SFP28,
  QSFP+, QSFP28, QSFP-DD categories)
- News feeds expanded: added Lightwave, Fierce Telecom, Data Center Knowledge,
  SDxCentral, Cisco Blogs, Arista Blog (11 total sources)
- Scheduler updated: 8 job queues with appropriate intervals
- DB now: 297 vendors, 89 transceivers, 33 news articles (13 relevant)
2026-03-27 23:17:42 +13:00

304 lines
8.7 KiB
TypeScript

/**
* News Aggregator — Optics & Fiber Trade Press RSS Scraper
*
* Sources (defined in FEEDS below):
* - Lightwave Online, Fierce Telecom, Optics.org (optics / telecom trade press)
* - Data Center Knowledge, Network World, SDxCentral (data center / networking)
* - CableFree, Nature Photonics (general tech / photonics)
* - Cisco Blogs, Arista Blog (vendor news)
*
* Stores articles in news_articles table.
* Relevance filtering: keyword scoring for transceiver/optics topics.
*/
import { pool } from "../utils/db";
import { contentHash } from "../utils/hash";
import { parseStringPromise } from "xml2js";
// Categories allowed by news_articles CHECK constraint
type NewsCategory = "product_launch" | "market_report" | "standard" | "m_and_a" | "factory" | "event";
/** Static description of one feed polled by the scraper. */
interface RssFeed {
// Display name; copied into `source` of every article parsed from this feed.
name: string;
// Feed endpoint requested by fetchFeed.
url: string;
// Category assigned to every article parsed from this feed.
category: NewsCategory;
}
/** One parsed feed entry, shaped for insertion into the news_articles table. */
interface NewsArticle {
// Entry title, truncated to 500 characters by fetchFeed.
title: string;
// Entry link, truncated to 1000 characters; the upsert's ON CONFLICT key.
sourceUrl: string;
// Entry description with markup stripped, truncated to 2000 characters.
summary: string;
// Date taken from the entry, or the fetch time when the entry carries no date.
publishedAt: Date;
// Name of the feed the entry came from.
source: string;
// fetchFeed always copies the feed's category here; null is permitted by the type only.
category: NewsCategory | null;
// Keyword score computed by scoreRelevance; 0 means no keyword matched.
relevanceScore: number;
// Result of contentHash({ title, url }) computed on the untruncated values.
contentHash: string;
}
/** One feed per row: display name, endpoint URL, category stamped on its articles. */
type FeedRow = readonly [name: string, url: string, category: RssFeed["category"]];

// Kept as a compact table so adding a feed is a one-line change.
const FEED_TABLE: readonly FeedRow[] = [
  // === PRIMARY: Transceiver-specific ===
  ["Lightwave Online", "https://www.lightwaveonline.com/rss", "market_report"],
  ["Lightwave - Fiber Optics", "https://www.lightwaveonline.com/fttx/rss", "market_report"],
  ["Fierce Telecom", "https://www.fiercetelecom.com/rss/xml", "market_report"],
  ["Optics.org", "https://optics.org/rss/news", "market_report"],
  // === SECONDARY: Datacenter / Networking ===
  ["Data Center Knowledge", "https://www.datacenterknowledge.com/rss.xml", "market_report"],
  ["Network World - Data Center", "https://www.networkworld.com/category/data-center/index.rss", "market_report"],
  ["SDxCentral", "https://www.sdxcentral.com/feed/", "market_report"],
  // === TERTIARY: General tech / photonics ===
  ["CableFree", "https://www.cablefree.net/rss", "market_report"],
  ["Nature Photonics", "https://www.nature.com/nphoton.rss", "standard"],
  // === VENDOR NEWS ===
  ["Cisco Blogs - Data Center", "https://blogs.cisco.com/datacenter/feed", "product_launch"],
  ["Arista Blog", "https://blogs.arista.com/blog/rss.xml", "product_launch"],
];

/** Feeds polled on every scraper run, in polling order. */
const FEEDS: RssFeed[] = FEED_TABLE.map(([name, url, category]) => ({ name, url, category }));
// Keyword tiers for relevance scoring. Matching is a plain lower-case substring
// test, so a short keyword also fires inside a longer one ("sfp" inside "qsfp").

// Each distinct hit from this tier adds 3 points.
const HIGH_RELEVANCE = [
  "transceiver",
  "sfp",
  "qsfp",
  "xfp",
  "cfp",
  "osfp",
  "optical module",
  "fiber optic",
  "wavelength",
  "dwdm",
  "cwdm",
  "400g",
  "800g",
  "1.6t",
  "coherent",
  "pluggable",
  "ofc",
  "ecoc",
  "cioe",
];

// Each distinct hit from this tier adds 1 point.
// NOTE(review): "coherent" is listed in both tiers, so it contributes 3 + 1 = 4.
const MEDIUM_RELEVANCE = [
  "data center",
  "datacenter",
  "interconnect",
  "bandwidth",
  "switch",
  "router",
  "cisco",
  "arista",
  "juniper",
  "100g",
  "40g",
  "25g",
  "10g",
  "silicon photonics",
  "photonic",
  "ii-vi",
  "coherent",
  "lumentum",
  "inphi",
  "flexoptix",
  "prolabs",
];

/**
 * Scores how relevant an article is to optics/transceiver topics.
 *
 * @param title - Article title.
 * @param summary - Article summary or description.
 * @returns 3 points per HIGH_RELEVANCE keyword found plus 1 point per
 *   MEDIUM_RELEVANCE keyword found; each keyword counts at most once no matter
 *   how often it occurs. 0 means nothing matched.
 */
function scoreRelevance(title: string, summary: string): number {
  const haystack = [title, summary].join(" ").toLowerCase();
  const hits = (keywords: readonly string[]): number =>
    keywords.filter((keyword) => haystack.includes(keyword)).length;
  return 3 * hits(HIGH_RELEVANCE) + hits(MEDIUM_RELEVANCE);
}
/** Entries older than this many milliseconds (7 days) are dropped by fetchFeed. */
const MAX_ARTICLE_AGE_MS = 7 * 24 * 60 * 60 * 1000;

/**
 * Returns the first truthy value stored on `node` under one of `names`.
 *
 * Each name is tried as written and then upper-cased, because the feed is
 * parsed with `strict: false`, which upper-cases element names.
 *
 * @param node - A parsed feed item/entry.
 * @param names - Element names in priority order, in their lower/camel-case form.
 * @returns The first truthy value found, or `undefined` if none.
 */
function pickField(node: Record<string, unknown>, names: readonly string[]): unknown {
  for (const name of names) {
    const value = node[name] || node[name.toUpperCase()];
    if (value) return value;
  }
  return undefined;
}

/**
 * Downloads one feed and converts its recent entries into NewsArticle records.
 *
 * Never throws: HTTP errors, timeouts (15 s) and parse failures are logged with
 * `console.warn`. A non-OK response or a document without a channel/feed element
 * yields an empty array; an exception yields whatever was collected before it.
 *
 * Entries are skipped when they have no title or link, when their date cannot
 * be parsed, or when they are older than 7 days. Entries without any date are
 * stamped with the current time.
 *
 * @param feed - Feed to download; its name and category are copied to each article.
 * @returns The articles parsed from the feed, possibly empty.
 */
async function fetchFeed(feed: RssFeed): Promise<NewsArticle[]> {
  const articles: NewsArticle[] = [];
  try {
    const resp = await fetch(feed.url, {
      headers: {
        "User-Agent": "Mozilla/5.0 (compatible; TIP-NewsBot/1.0; +https://flexoptix.net)",
        Accept: "application/rss+xml, application/xml, text/xml",
      },
      signal: AbortSignal.timeout(15000),
    });
    if (!resp.ok) {
      console.warn(` Feed ${feed.name} returned ${resp.status}`);
      return [];
    }
    const rawXml = await resp.text();
    // Sanitize common RSS issues: unescaped & in URLs, attribute-without-value
    const xml = rawXml
      .replace(/&(?!amp;|lt;|gt;|quot;|apos;|#\d+;|#x[\dA-Fa-f]+;)/g, "&amp;")
      .replace(/(<\w[^>]*)\s+(\w+)=([^"'\s>]+)(?=[\s/>])/g, '$1 $2="$3"');
    const parsed = await parseStringPromise(xml, { explicitArray: false, strict: false });
    // strict: false makes keys uppercase; support both
    const rss = parsed?.rss || parsed?.RSS;
    const channel = rss?.channel || rss?.CHANNEL || parsed?.feed || parsed?.FEED;
    if (!channel) return [];
    const items = channel.item || channel.ITEM || channel.entry || channel.ENTRY || [];
    // explicitArray: false collapses a single item into a bare object.
    const itemArray = Array.isArray(items) ? items : [items];
    const now = Date.now();
    for (const item of itemArray) {
      // An empty element parses to a string; it has no title or link to read.
      if (typeof item !== "object" || item === null) continue;
      const title = extractText(pickField(item, ["title"])) || "";
      const url = extractLink(item) || "";
      if (!title || !url) continue;
      const summary =
        extractText(pickField(item, ["description", "summary", "content:encoded"])) || "";
      // Atom's published/updated arrive upper-cased from the non-strict parse;
      // without the upper-case lookup every Atom entry would be stamped "now".
      const pubDate = extractText(pickField(item, ["pubDate", "published", "updated"]));
      const publishedAt = pubDate ? new Date(pubDate) : new Date();
      if (isNaN(publishedAt.getTime())) continue;
      // Skip articles older than 7 days
      if (now - publishedAt.getTime() > MAX_ARTICLE_AGE_MS) continue;
      // NOTE(review): relevance is scored on the raw summary, markup included.
      const relevanceScore = scoreRelevance(title, summary);
      // Hash the untruncated title and URL.
      const hash = contentHash({ title, url });
      articles.push({
        title: title.slice(0, 500),
        sourceUrl: url.slice(0, 1000),
        summary: stripHtml(summary).slice(0, 2000),
        publishedAt,
        source: feed.name,
        category: feed.category,
        relevanceScore,
        contentHash: hash,
      });
    }
  } catch (err) {
    // Best effort: one broken feed must not abort the whole run.
    console.warn(` Feed ${feed.name} error:`, err instanceof Error ? err.message : String(err));
  }
  return articles;
}
/**
 * Extracts the text content of a parsed XML node.
 *
 * Handles the shapes the parser can produce for an element:
 * - a plain string (text-only element),
 * - an object whose text sits under `_` or `#text` (element with attributes),
 * - an array of either (element repeated within one item) — the first
 *   non-empty text wins.
 *
 * @param value - Parsed node value; may be undefined.
 * @returns The node's text, or "" when there is none.
 */
function extractText(value: unknown): string {
  if (!value) return "";
  if (typeof value === "string") return value;
  if (Array.isArray(value)) {
    for (const entry of value) {
      const text = extractText(entry);
      if (text) return text;
    }
    return "";
  }
  if (typeof value === "object") {
    const node = value as Record<string, unknown>;
    const inner = node._ || node["#text"];
    // Recurse instead of stringifying a nested node to "[object Object]".
    return typeof inner === "object" ? extractText(inner) : String(inner || "");
  }
  return String(value);
}
/**
 * Extracts the article URL from a parsed RSS item or Atom entry.
 *
 * Supported shapes of `link` / `LINK`:
 * - a string (RSS `<link>url</link>`),
 * - an object with the URL in its `href` attribute or its text (`_`),
 * - an array of the above. The first usable link whose `rel` is absent or
 *   "alternate" is returned; links with another `rel` are ignored.
 *
 * Attribute names are read in both lower and upper case because non-strict
 * parsing upper-cases them (`HREF`, `REL`).
 *
 * @param item - Parsed feed item/entry.
 * @returns The URL, or "" when no usable link exists.
 */
function extractLink(item: Record<string, unknown>): string {
  const link = item.link || item.LINK;
  if (typeof link === "string") return link;
  if (typeof link !== "object" || link === null) return "";

  // Reads one attribute from a node's `$` bag, tolerating upper-cased names.
  const readAttr = (node: unknown, name: string): string => {
    if (typeof node !== "object" || node === null) return "";
    const attrs = (node as Record<string, unknown>)["$"];
    if (typeof attrs !== "object" || attrs === null) return "";
    const bag = attrs as Record<string, unknown>;
    const value = bag[name] ?? bag[name.toUpperCase()];
    return typeof value === "string" ? value : "";
  };

  // URL carried by a single link node: href attribute first, then text content.
  const urlOf = (node: unknown): string => {
    if (typeof node === "string") return node;
    if (typeof node !== "object" || node === null) return "";
    return readAttr(node, "href") || String((node as Record<string, unknown>)._ || "");
  };

  if (!Array.isArray(link)) return urlOf(link);

  for (const candidate of link as unknown[]) {
    const rel = readAttr(candidate, "rel").toLowerCase();
    // A missing rel means "alternate" in Atom; any other rel is not the article page.
    if (rel && rel !== "alternate") continue;
    const url = urlOf(candidate);
    if (url) return url;
  }
  return "";
}
/** Named character references decoded by stripHtml. `&nbsp;` becomes a regular space. */
const HTML_NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["nbsp", " "],
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

/** Matches one decimal (`&#39;`), hexadecimal (`&#x27;`) or named (`&amp;`) reference. */
const HTML_ENTITY_PATTERN = /&(?:#(\d+)|#[xX]([\da-fA-F]+)|([a-zA-Z]+));/g;

/**
 * Converts an HTML fragment to plain text.
 *
 * Tags are replaced by spaces, character references are decoded, and runs of
 * whitespace are collapsed to single spaces with the ends trimmed.
 *
 * References are decoded in a single pass, so already-decoded output is never
 * decoded again: `&amp;lt;` yields the literal text `&lt;`, not `<`. Unknown
 * named references and out-of-range numeric references are left untouched.
 *
 * @param html - HTML fragment, e.g. a feed item's description.
 * @returns Plain text with normalized whitespace.
 */
function stripHtml(html: string): string {
  const decodeEntity = (match: string, dec?: string, hex?: string, name?: string): string => {
    if (name !== undefined) return HTML_NAMED_ENTITIES.get(name) ?? match;
    const codePoint = dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex ?? "", 16);
    // String.fromCodePoint throws outside the Unicode range; keep such input verbatim.
    if (!Number.isInteger(codePoint) || codePoint <= 0 || codePoint > 0x10ffff) return match;
    return String.fromCodePoint(codePoint);
  };

  return html
    .replace(/<[^>]+>/g, " ")
    .replace(HTML_ENTITY_PATTERN, decodeEntity)
    .replace(/\s+/g, " ")
    .trim();
}
/**
 * Inserts an article, or refreshes its score and hash when the URL is already stored.
 *
 * @param article - Article to persist.
 * @returns `true` when a new row was inserted, `false` when an existing row was
 *   updated. Also `true` if the statement returns no row.
 * @throws Whatever `pool.query` rejects with; this function does not catch it.
 */
async function upsertArticle(article: NewsArticle): Promise<boolean> {
  // `xmax = 0` is the value the statement reports as `inserted`.
  // NOTE(review): on conflict only relevance_score and content_hash are
  // refreshed; title and summary keep their stored values — confirm intended.
  const sql = `INSERT INTO news_articles (title, source_url, summary, published_at, source, category, relevance_score, content_hash)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
ON CONFLICT (source_url) DO UPDATE
SET relevance_score = EXCLUDED.relevance_score,
content_hash = EXCLUDED.content_hash
RETURNING (xmax = 0) AS inserted`;

  const {
    title,
    sourceUrl,
    summary,
    publishedAt,
    source,
    category,
    relevanceScore,
    contentHash: hash,
  } = article;
  // Order must match the $1..$8 placeholders above.
  const params = [title, sourceUrl, summary, publishedAt, source, category, relevanceScore, hash];

  const { rows } = await pool.query(sql, params);
  const [first] = rows;
  return first?.inserted ?? true;
}
/**
 * Runs one full scrape: polls every feed in FEEDS sequentially, stores the
 * articles it finds, and logs a summary.
 *
 * A failure to save one article is logged and does not stop the run. The run
 * waits one second after each feed.
 */
export async function scrapeNews(): Promise<void> {
  const pause = (ms: number): Promise<void> => new Promise((resolve) => setTimeout(resolve, ms));
  const totals = { fetched: 0, relevant: 0, written: 0 };

  console.log("=== News Scraper Starting ===\n");

  for (const feed of FEEDS) {
    console.log(`Fetching: ${feed.name} (${feed.url})`);
    const articles = await fetchFeed(feed);
    console.log(`${articles.length} articles (last 7 days)`);

    totals.fetched += articles.length;
    totals.relevant += articles.filter((entry) => entry.relevanceScore > 0).length;

    // Saved one at a time, each awaited before the next starts.
    for (const article of articles) {
      try {
        if (await upsertArticle(article)) totals.written += 1;
      } catch (err) {
        console.error(` Error saving article:`, (err as Error).message);
      }
    }

    // Rate limit between feeds
    await pause(1000);
  }

  console.log(`\nFetched: ${totals.fetched} articles`);
  console.log(`Relevant (score > 0): ${totals.relevant}`);
  console.log(`Written: ${totals.written} new`);
  console.log("=== News Scraper Complete ===\n");
}
// CLI entry point: run one scrape when this file is executed directly, then
// close the database pool. Any failure is logged and exits with status 1.
if (require.main === module) {
  const runCli = async (): Promise<void> => {
    try {
      await scrapeNews();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      // Not awaited: the process exits immediately afterwards.
      pool.end();
      process.exit(1);
    }
  };
  void runCli();
}