- Flexoptix vendor scraper: 285 supported switch vendors ingested from flexoptix.net/en/supported-vendors/ (our own data, no restrictions) - 10Gtek Playwright scraper: Chinese OEM competitor pricing (SFP+, SFP28, QSFP+, QSFP28, QSFP-DD categories) - News feeds expanded: added Lightwave, Fierce Telecom, Data Center Knowledge, SDxCentral, Cisco Blogs, Arista Blog (11 total sources) - Scheduler updated: 8 job queues with appropriate intervals - DB now: 297 vendors, 89 transceivers, 33 news articles (13 relevant)
304 lines
8.7 KiB
TypeScript
304 lines
8.7 KiB
TypeScript
/**
 * News Aggregator — Optics & Fiber Trade Press RSS Scraper
 *
 * Sources (the FEEDS array below is the authoritative list, with URLs):
 * - Lightwave Online, including its fiber-optics feed (optical networking)
 * - Fierce Telecom (telecom)
 * - optics.org (photonics industry news)
 * - Data Center Knowledge, Network World, SDxCentral (data center / networking)
 * - CableFree, Nature Photonics (general tech / photonics)
 * - Cisco Blogs, Arista Blog (vendor news)
 *
 * Stores articles in the news_articles table.
 * Relevance filtering: keyword scoring for transceiver/optics topics.
 */
|
|
import { pool } from "../utils/db";
|
|
import { contentHash } from "../utils/hash";
|
|
import { parseStringPromise } from "xml2js";
|
|
|
|
/**
 * Category label attached to a feed and to every article taken from it.
 * These are the values allowed by the news_articles CHECK constraint.
 */
type NewsCategory =
  | "product_launch"
  | "market_report"
  | "standard"
  | "m_and_a"
  | "factory"
  | "event";
|
|
|
|
/** Static description of one feed polled by the scraper. */
interface RssFeed {
  /** Display name of the feed; copied into each article's `source` field. */
  name: string;
  /** URL the feed XML is downloaded from. */
  url: string;
  /** Category applied to every article produced from this feed. */
  category: NewsCategory;
}
|
|
|
|
/** One feed item, normalized into the shape written to news_articles. */
interface NewsArticle {
  /** Item headline (the scraper truncates it to 500 characters). */
  title: string;
  /** Link to the article (truncated to 1000 characters); the upsert conflict key. */
  sourceUrl: string;
  /** Item description with markup stripped (truncated to 2000 characters). */
  summary: string;
  /** Publication timestamp taken from the feed, or "now" when the feed has none. */
  publishedAt: Date;
  /** Name of the feed the item came from. */
  source: string;
  /** Category inherited from the feed, or null when unclassified. */
  category: NewsCategory | null;
  /** Keyword-based relevance score; 0 means no keyword matched. */
  relevanceScore: number;
  /** Hash computed from the item's title and URL. */
  contentHash: string;
}
|
|
|
|
/**
 * Feeds polled on every scraper run, in polling order.
 * Each entry's `category` is applied to all articles read from that feed.
 */
const FEEDS: RssFeed[] = [
  // === PRIMARY: Transceiver-specific ===
  { name: "Lightwave Online", url: "https://www.lightwaveonline.com/rss", category: "market_report" },
  { name: "Lightwave - Fiber Optics", url: "https://www.lightwaveonline.com/fttx/rss", category: "market_report" },
  { name: "Fierce Telecom", url: "https://www.fiercetelecom.com/rss/xml", category: "market_report" },
  { name: "Optics.org", url: "https://optics.org/rss/news", category: "market_report" },

  // === SECONDARY: Datacenter / Networking ===
  { name: "Data Center Knowledge", url: "https://www.datacenterknowledge.com/rss.xml", category: "market_report" },
  {
    name: "Network World - Data Center",
    url: "https://www.networkworld.com/category/data-center/index.rss",
    category: "market_report",
  },
  { name: "SDxCentral", url: "https://www.sdxcentral.com/feed/", category: "market_report" },

  // === TERTIARY: General tech / photonics ===
  { name: "CableFree", url: "https://www.cablefree.net/rss", category: "market_report" },
  { name: "Nature Photonics", url: "https://www.nature.com/nphoton.rss", category: "standard" },

  // === VENDOR NEWS ===
  { name: "Cisco Blogs - Data Center", url: "https://blogs.cisco.com/datacenter/feed", category: "product_launch" },
  { name: "Arista Blog", url: "https://blogs.arista.com/blog/rss.xml", category: "product_launch" },
];
|
|
|
|
/**
 * Keywords that point directly at transceiver/optics content.
 * Each one found in an article is worth 3 points.
 */
const HIGH_RELEVANCE = [
  "transceiver", "sfp", "qsfp", "xfp", "cfp", "osfp",
  "optical module", "fiber optic", "wavelength", "dwdm", "cwdm",
  "400g", "800g", "1.6t", "coherent", "pluggable",
  "ofc", "ecoc", "cioe",
];

/**
 * Keywords for adjacent topics (data center, networking gear, vendors).
 * Each one found in an article is worth 1 point.
 *
 * "coherent" is intentionally absent here: it is already a HIGH_RELEVANCE
 * keyword, and listing it in both tiers made a single mention score 3 + 1.
 */
const MEDIUM_RELEVANCE = [
  "data center", "datacenter", "interconnect", "bandwidth",
  "switch", "router", "cisco", "arista", "juniper",
  "100g", "40g", "25g", "10g",
  "silicon photonics", "photonic",
  "ii-vi", "lumentum", "inphi",
  "flexoptix", "prolabs",
];

/**
 * Scores how relevant an article is to transceiver/optics topics.
 *
 * The title and summary are joined, lower-cased, and searched for every
 * keyword as a plain substring (so "qsfp" also satisfies "sfp"). Each keyword
 * contributes at most once, at the weight of the highest tier it appears in.
 *
 * @param title   Article headline.
 * @param summary Article summary or description; may be empty.
 * @returns Sum of the weights of all matched keywords; 0 when nothing matches.
 */
function scoreRelevance(title: string, summary: string): number {
  const text = `${title} ${summary}`.toLowerCase();

  // Tiers are processed from highest to lowest weight so that a keyword
  // accidentally present in several tiers is only counted at its best weight.
  const tiers: ReadonlyArray<readonly [readonly string[], number]> = [
    [HIGH_RELEVANCE, 3],
    [MEDIUM_RELEVANCE, 1],
  ];

  const counted = new Set<string>();
  let score = 0;

  for (const [keywords, weight] of tiers) {
    for (const kw of keywords) {
      if (counted.has(kw)) continue;
      counted.add(kw);
      if (text.includes(kw)) score += weight;
    }
  }

  return score;
}
|
|
|
|
/**
 * Downloads one feed and converts its recent items into NewsArticle records.
 *
 * Handles both RSS (`rss > channel > item`) and Atom (`feed > entry`) layouts.
 * The XML is parsed in non-strict mode, which upper-cases element names, so
 * every lookup tries the lower-case and the upper-case spelling.
 *
 * Items are skipped when they have no title or link, when their date cannot
 * be parsed, or when they are older than 7 days.
 *
 * @param feed Feed definition (name, URL, category).
 * @returns The articles extracted from the feed. Never rejects: HTTP errors,
 *          timeouts and parse failures are logged as warnings, and whatever
 *          was collected before the failure (possibly nothing) is returned.
 */
async function fetchFeed(feed: RssFeed): Promise<NewsArticle[]> {
  const MAX_AGE_MS = 7 * 24 * 60 * 60 * 1000;
  const articles: NewsArticle[] = [];

  try {
    const resp = await fetch(feed.url, {
      headers: {
        "User-Agent": "Mozilla/5.0 (compatible; TIP-NewsBot/1.0; +https://flexoptix.net)",
        Accept: "application/rss+xml, application/xml, text/xml",
      },
      // Give up on feeds that do not answer within 15 seconds.
      signal: AbortSignal.timeout(15000),
    });

    if (!resp.ok) {
      console.warn(` Feed ${feed.name} returned ${resp.status}`);
      return [];
    }

    const rawXml = await resp.text();
    // Sanitize common RSS issues before parsing:
    //  1. a bare "&" that does not start a known entity is escaped to "&amp;"
    //     (typically unescaped query strings in URLs);
    //  2. unquoted attribute values get wrapped in double quotes.
    const xml = rawXml
      .replace(/&(?!amp;|lt;|gt;|quot;|apos;|#\d+;|#x[\dA-Fa-f]+;)/g, "&amp;")
      .replace(/(<\w[^>]*)\s+(\w+)=([^"'\s>]+)(?=[\s/>])/g, '$1 $2="$3"');
    const parsed = await parseStringPromise(xml, { explicitArray: false, strict: false });

    // strict: false makes keys uppercase; support both
    const rss = parsed?.rss || parsed?.RSS;
    const channel = rss?.channel || rss?.CHANNEL || parsed?.feed || parsed?.FEED;
    if (!channel) return [];

    // explicitArray: false yields a bare object when there is exactly one item.
    const items = channel.item || channel.ITEM || channel.entry || channel.ENTRY || [];
    const itemArray = Array.isArray(items) ? items : [items];

    for (const item of itemArray) {
      const title = extractText(item.title || item.TITLE).trim();
      const url = extractLink(item).trim();
      const summary = extractText(
        item.description ||
          item.DESCRIPTION ||
          item.summary ||
          item.SUMMARY ||
          item["content:encoded"] ||
          item["CONTENT:ENCODED"]
      );
      // The date element may carry attributes and then arrives as an object,
      // so it goes through extractText like the other text fields.
      const pubDate = extractText(
        item.pubDate ||
          item.PUBDATE ||
          item.published ||
          item.PUBLISHED ||
          item.updated ||
          item.UPDATED
      ).trim();

      if (!title || !url) continue;

      // Items without any date are treated as published now.
      const publishedAt = pubDate ? new Date(pubDate) : new Date();
      if (isNaN(publishedAt.getTime())) continue;

      // Skip articles older than 7 days
      const ageMs = Date.now() - publishedAt.getTime();
      if (ageMs > MAX_AGE_MS) continue;

      const relevanceScore = scoreRelevance(title, summary);
      const hash = contentHash({ title, url });

      articles.push({
        title: title.slice(0, 500),
        sourceUrl: url.slice(0, 1000),
        summary: stripHtml(summary).slice(0, 2000),
        publishedAt,
        source: feed.name,
        category: feed.category,
        relevanceScore,
        contentHash: hash,
      });
    }
  } catch (err) {
    console.warn(` Feed ${feed.name} error:`, (err as Error).message);
  }

  return articles;
}
|
|
|
|
/**
 * Pulls the text content out of a parsed XML node.
 *
 * Accepted shapes:
 *  - a plain string, returned as-is;
 *  - an object whose text lives under `_` or `#text` (an element that also
 *    carries attributes);
 *  - an array of such nodes (a repeated element when the parser is not
 *    forcing arrays): the first entry that yields non-empty text wins;
 *  - any other truthy primitive, converted with `String()`.
 *
 * @param value Parsed node, or undefined/null when the element is absent.
 * @returns The extracted text, or "" when there is none.
 */
function extractText(value: unknown): string {
  if (!value) return "";
  if (typeof value === "string") return value;

  if (Array.isArray(value)) {
    // Previously arrays fell into the object branch and always produced "".
    for (const entry of value) {
      const text = extractText(entry);
      if (text) return text;
    }
    return "";
  }

  if (typeof value === "object") {
    const node = value as Record<string, unknown>;
    return String(node._ || node["#text"] || "");
  }

  return String(value);
}
|
|
|
|
/** Returns the attribute bag (`$`) of a parsed XML node, or {} when it has none. */
function linkAttributes(node: unknown): Record<string, unknown> {
  if (typeof node !== "object" || node === null) return {};
  const attrs = (node as Record<string, unknown>)["$"];
  return typeof attrs === "object" && attrs !== null ? (attrs as Record<string, unknown>) : {};
}

/** Returns the URL held by one link node: the node itself, its href attribute, or its text. */
function linkTarget(node: unknown): string {
  if (typeof node === "string") return node;
  if (typeof node !== "object" || node === null) return "";
  const attrs = linkAttributes(node);
  // Attribute names may be upper-cased when the XML was parsed in non-strict mode.
  return String(attrs.href || attrs.HREF || (node as Record<string, unknown>)._ || "");
}

/** True when a link node is a candidate for the article URL (rel is absent or "alternate"). */
function isAlternateLink(node: unknown): boolean {
  if (typeof node === "string") return true;
  const attrs = linkAttributes(node);
  const rel = attrs.rel ?? attrs.REL;
  // A link without a rel attribute is interpreted as rel="alternate".
  return rel === undefined || rel === null || String(rel).toLowerCase() === "alternate";
}

/**
 * Finds the article URL of a parsed feed item.
 *
 * Supports the RSS form (`<link>url</link>`, parsed to a string), the Atom
 * form (`<link href="..." rel="..."/>`, parsed to an object with a `$`
 * attribute bag) and items carrying several links (parsed to an array), in
 * which case the first link that is plain text, has no `rel`, or has
 * `rel="alternate"` is used. Element and attribute names are accepted in
 * lower and upper case.
 *
 * @param item Parsed feed item.
 * @returns The link URL, or "" when the item has no usable link.
 */
function extractLink(item: Record<string, unknown>): string {
  const link = item.link || item.LINK;

  if (typeof link === "string") return link;

  if (Array.isArray(link)) {
    const candidates: unknown[] = link;
    return linkTarget(candidates.find(isAlternateLink));
  }

  if (typeof link === "object" && link !== null) return linkTarget(link);

  return "";
}
|
|
|
|
/**
 * Reduces an HTML fragment to plain text.
 *
 * Steps, in order: every tag is replaced by a space, the common named
 * entities are decoded, runs of whitespace collapse to a single space, and
 * the result is trimmed.
 *
 * `&amp;` is decoded last so that already-escaped text such as `&amp;lt;`
 * becomes `&lt;` and is not decoded a second time into `<`.
 *
 * @param html Markup or text taken from a feed item.
 * @returns The plain-text content on a single line.
 */
function stripHtml(html: string): string {
  return html
    .replace(/<[^>]+>/g, " ")
    .replace(/&nbsp;/g, " ")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"')
    .replace(/&#0*39;|&apos;/g, "'")
    .replace(/&amp;/g, "&")
    .replace(/\s+/g, " ")
    .trim();
}
|
|
|
|
/**
 * Writes one article to news_articles, keyed on its source URL.
 *
 * When a row with the same source_url already exists, only its
 * relevance_score and content_hash are overwritten; the other columns keep
 * their stored values.
 *
 * @param article Article to store.
 * @returns The `inserted` flag reported by the statement (`xmax = 0`): true
 *          for a newly created row, false for an updated one. Defaults to
 *          true when the statement returns no row.
 */
async function upsertArticle(article: NewsArticle): Promise<boolean> {
  const statement = `INSERT INTO news_articles (title, source_url, summary, published_at, source, category, relevance_score, content_hash)
     VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
     ON CONFLICT (source_url) DO UPDATE
       SET relevance_score = EXCLUDED.relevance_score,
           content_hash = EXCLUDED.content_hash
     RETURNING (xmax = 0) AS inserted`;

  // Order must match the $1..$8 placeholders above.
  const values = [
    article.title,
    article.sourceUrl,
    article.summary,
    article.publishedAt,
    article.source,
    article.category,
    article.relevanceScore,
    article.contentHash,
  ];

  const { rows } = await pool.query(statement, values);
  return rows[0]?.inserted ?? true;
}
|
|
|
|
/**
 * Runs one full scraping pass over every configured feed.
 *
 * Feeds are processed one after another with a one-second pause after each.
 * Every article returned by a feed is upserted; a failure to save a single
 * article is logged and does not stop the run. Progress and final totals
 * (fetched, relevant, newly written) are printed to the console.
 */
export async function scrapeNews(): Promise<void> {
  console.log("=== News Scraper Starting ===\n");

  const totals = { fetched: 0, relevant: 0, written: 0 };

  for (const feed of FEEDS) {
    console.log(`Fetching: ${feed.name} (${feed.url})`);
    const articles = await fetchFeed(feed);
    console.log(` → ${articles.length} articles (last 7 days)`);

    for (const article of articles) {
      totals.fetched += 1;
      if (article.relevanceScore > 0) {
        totals.relevant += 1;
      }

      try {
        // Only rows that did not exist before count as "written".
        if (await upsertArticle(article)) {
          totals.written += 1;
        }
      } catch (err) {
        console.error(` Error saving article:`, (err as Error).message);
      }
    }

    // Rate limit between feeds
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }

  console.log(`\nFetched: ${totals.fetched} articles`);
  console.log(`Relevant (score > 0): ${totals.relevant}`);
  console.log(`Written: ${totals.written} new`);
  console.log("=== News Scraper Complete ===\n");
}
|
|
|
|
// Script entry point: only runs when this file is executed directly,
// not when it is imported by another module.
if (require.main === module) {
  const runAsScript = async (): Promise<void> => {
    try {
      await scrapeNews();
      // Release the database pool so the process can exit.
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  };

  void runAsScript();
}
|