/**
 * News Aggregator — Optics & Fiber Trade Press RSS Scraper
 *
 * Sources (see FEEDS for the exact URLs):
 * - Transceiver / telecom trade press: Lightwave Online, Fierce Telecom, Optics.org
 * - Datacenter / networking: Data Center Knowledge, Network World, SDxCentral
 * - General tech / photonics: CableFree, Nature Photonics
 * - Vendor blogs: Cisco (Data Center), Arista
 *
 * Stores articles in news_articles table.
 * Relevance filtering: keyword scoring for transceiver/optics topics.
 */

import { pool } from "../utils/db";
import { contentHash } from "../utils/hash";
import { parseStringPromise } from "xml2js";

// Categories allowed by news_articles CHECK constraint
type NewsCategory =
  | "product_launch"
  | "market_report"
  | "standard"
  | "m_and_a"
  | "factory"
  | "event";

/** One RSS/Atom feed to poll, plus the category assigned to every article it yields. */
interface RssFeed {
  name: string;
  url: string;
  category: NewsCategory;
}

/** A feed entry normalised into the shape that is written to news_articles. */
interface NewsArticle {
  title: string;
  sourceUrl: string;
  summary: string;
  publishedAt: Date;
  source: string;
  category: NewsCategory | null;
  relevanceScore: number;
  contentHash: string;
}

/** A parsed XML element as produced by xml2js (attributes under "$", text under "_"). */
type XmlNode = Record<string, unknown>;

/** Entries whose publication date is older than this are skipped. */
const MAX_ARTICLE_AGE_MS = 7 * 24 * 60 * 60 * 1000;

/** Abort a feed download that takes longer than this. */
const FETCH_TIMEOUT_MS = 15000;

const FEEDS: RssFeed[] = [
  // === PRIMARY: Transceiver-specific ===
  {
    name: "Lightwave Online",
    url: "https://www.lightwaveonline.com/rss",
    category: "market_report",
  },
  {
    name: "Lightwave - Fiber Optics",
    url: "https://www.lightwaveonline.com/fttx/rss",
    category: "market_report",
  },
  {
    name: "Fierce Telecom",
    url: "https://www.fiercetelecom.com/rss/xml",
    category: "market_report",
  },
  {
    name: "Optics.org",
    url: "https://optics.org/rss/news",
    category: "market_report",
  },
  // === SECONDARY: Datacenter / Networking ===
  {
    name: "Data Center Knowledge",
    url: "https://www.datacenterknowledge.com/rss.xml",
    category: "market_report",
  },
  {
    name: "Network World - Data Center",
    url: "https://www.networkworld.com/category/data-center/index.rss",
    category: "market_report",
  },
  {
    name: "SDxCentral",
    url: "https://www.sdxcentral.com/feed/",
    category: "market_report",
  },
  // === TERTIARY: General tech / photonics ===
  {
    name: "CableFree",
    url: "https://www.cablefree.net/rss",
    category: "market_report",
  },
  {
    name: "Nature Photonics",
    url: "https://www.nature.com/nphoton.rss",
    category: "standard",
  },
  // === VENDOR NEWS ===
  {
    name: "Cisco Blogs - Data Center",
    url: "https://blogs.cisco.com/datacenter/feed",
    category: "product_launch",
  },
  {
    name: "Arista Blog",
    url: "https://blogs.arista.com/blog/rss.xml",
    category: "product_launch",
  },
];

// Keywords for relevance scoring (each distinct keyword found adds 3 points)
const HIGH_RELEVANCE = [
  "transceiver",
  "sfp",
  "qsfp",
  "xfp",
  "cfp",
  "osfp",
  "optical module",
  "fiber optic",
  "wavelength",
  "dwdm",
  "cwdm",
  "400g",
  "800g",
  "1.6t",
  "coherent",
  "pluggable",
  "ofc",
  "ecoc",
  "cioe",
];

// Each distinct keyword found adds 1 point.
// NOTE(review): "coherent" is also listed in HIGH_RELEVANCE, so a single match
// scores 3 + 1 = 4 — confirm this double count is intended.
const MEDIUM_RELEVANCE = [
  "data center",
  "datacenter",
  "interconnect",
  "bandwidth",
  "switch",
  "router",
  "cisco",
  "arista",
  "juniper",
  "100g",
  "40g",
  "25g",
  "10g",
  "silicon photonics",
  "photonic",
  "ii-vi",
  "coherent",
  "lumentum",
  "inphi",
  "flexoptix",
  "prolabs",
];

/**
 * Scores how relevant an article is to transceiver/optics topics.
 *
 * Matching is a case-insensitive substring test over title + summary, so a
 * keyword also matches inside a longer word. Each keyword counts at most once.
 *
 * @param title   Article title.
 * @param summary Article summary (may still contain markup).
 * @returns 3 points per HIGH_RELEVANCE keyword present plus 1 point per
 *          MEDIUM_RELEVANCE keyword present; 0 when nothing matches.
 */
function scoreRelevance(title: string, summary: string): number {
  const haystack = `${title} ${summary}`.toLowerCase();
  const countHits = (keywords: readonly string[]): number =>
    keywords.filter((keyword) => haystack.includes(keyword)).length;

  return 3 * countHits(HIGH_RELEVANCE) + countHits(MEDIUM_RELEVANCE);
}

/** Returns `value` as an XML node when it is a non-null, non-array object. */
function asNode(value: unknown): XmlNode | undefined {
  return typeof value === "object" && value !== null && !Array.isArray(value)
    ? (value as XmlNode)
    : undefined;
}

/** Returns the first truthy property of `node` among `keys`, in the order given. */
function firstOf(node: XmlNode | undefined, ...keys: string[]): unknown {
  if (!node) return undefined;
  for (const key of keys) {
    const value = node[key];
    if (value) return value;
  }
  return undefined;
}

// Matches either a whole CDATA section (left untouched by sanitizeXml) or an
// "&" that does not start a named/numeric character reference.
const CDATA_OR_BARE_AMPERSAND =
  /<!\[CDATA\[[\s\S]*?\]\]>|&(?![A-Za-z][A-Za-z0-9]*;|#\d+;|#x[\dA-Fa-f]+;)/g;

// Matches an attribute whose value is not quoted, e.g. <a href=foo>.
const UNQUOTED_ATTRIBUTE = /(<\w[^>]*)\s+(\w+)=([^"'\s>]+)(?=[\s/>])/g;

/**
 * Repairs common feed markup problems before parsing: unescaped "&" (typically
 * in URLs) and attribute values without quotes.
 *
 * CDATA sections are skipped when escaping ampersands, because their content is
 * not entity-decoded by the parser and would otherwise end up with a literal
 * "&amp;" in the text.
 */
function sanitizeXml(rawXml: string): string {
  return rawXml
    .replace(CDATA_OR_BARE_AMPERSAND, (match) => (match === "&" ? "&amp;" : match))
    .replace(UNQUOTED_ATTRIBUTE, '$1 $2="$3"');
}

/**
 * Downloads one feed and converts its recent entries into NewsArticle records.
 *
 * Handles both RSS (`rss > channel > item`) and Atom (`feed > entry`) layouts.
 * The XML is parsed in non-strict mode, which upper-cases element names, so
 * every lookup checks both spellings.
 *
 * Entries are skipped when they lack a title or link, when their date cannot be
 * parsed, or when they are older than seven days. An entry without any date is
 * treated as published now.
 *
 * @param feed Feed to fetch.
 * @returns The articles collected from the feed. Never rejects: HTTP errors,
 *          network/timeout failures and parse errors are logged as warnings and
 *          whatever was collected up to that point (possibly nothing) is returned.
 */
async function fetchFeed(feed: RssFeed): Promise<NewsArticle[]> {
  const articles: NewsArticle[] = [];

  try {
    const resp = await fetch(feed.url, {
      headers: {
        "User-Agent": "Mozilla/5.0 (compatible; TIP-NewsBot/1.0; +https://flexoptix.net)",
        Accept: "application/rss+xml, application/xml, text/xml",
      },
      signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
    });

    if (!resp.ok) {
      console.warn(` Feed ${feed.name} returned ${resp.status}`);
      return [];
    }

    const xml = sanitizeXml(await resp.text());
    const parsed: unknown = await parseStringPromise(xml, {
      explicitArray: false,
      strict: false,
    });

    // strict: false makes keys uppercase; support both
    const root = asNode(parsed);
    const rss = asNode(firstOf(root, "rss", "RSS"));
    const channel = asNode(
      firstOf(rss, "channel", "CHANNEL") || firstOf(root, "feed", "FEED")
    );
    if (!channel) return [];

    // explicitArray: false yields a bare object when there is exactly one entry.
    const items = firstOf(channel, "item", "ITEM", "entry", "ENTRY") || [];
    const itemArray: unknown[] = Array.isArray(items) ? items : [items];

    for (const rawItem of itemArray) {
      const item = asNode(rawItem);
      if (!item) continue;

      const title = extractText(firstOf(item, "title", "TITLE"));
      const url = extractLink(item);
      if (!title || !url) continue;

      const summary = extractText(
        firstOf(
          item,
          "description",
          "DESCRIPTION",
          "summary",
          "SUMMARY",
          "content:encoded",
          "CONTENT:ENCODED",
          "content",
          "CONTENT"
        )
      );

      // The date element may be parsed as an object when it carries attributes,
      // so reduce it to its text before handing it to Date.
      const pubDate = extractText(
        firstOf(
          item,
          "pubDate",
          "PUBDATE",
          "published",
          "PUBLISHED",
          "updated",
          "UPDATED",
          "dc:date",
          "DC:DATE"
        )
      );
      const publishedAt = pubDate ? new Date(pubDate) : new Date();
      if (isNaN(publishedAt.getTime())) continue;

      // Skip articles older than 7 days
      if (Date.now() - publishedAt.getTime() > MAX_ARTICLE_AGE_MS) continue;

      articles.push({
        title: title.slice(0, 500),
        sourceUrl: url.slice(0, 1000),
        summary: stripHtml(summary).slice(0, 2000),
        publishedAt,
        source: feed.name,
        category: feed.category,
        relevanceScore: scoreRelevance(title, summary),
        // Hash is taken over the untruncated title and URL.
        contentHash: contentHash({ title, url }),
      });
    }
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    console.warn(` Feed ${feed.name} error:`, message);
  }

  return articles;
}

/**
 * Reduces a parsed XML value to its text.
 *
 * @param value A string, a parsed element object (text under "_" or "#text"),
 *              or any other primitive.
 * @returns The text, or "" for falsy input and for objects without text.
 */
function extractText(value: unknown): string {
  if (!value) return "";
  if (typeof value === "string") return value;
  if (typeof value === "object") {
    const node = value as Record<string, unknown>;
    return String(node._ || node["#text"] || "");
  }
  return String(value);
}

/** Returns the `rel` attribute of a parsed link element (either letter case), if any. */
function linkRel(node: unknown): unknown {
  const attrs = asNode(asNode(node)?.["$"]);
  return attrs ? attrs.rel ?? attrs.REL : undefined;
}

/** Returns the target of a parsed link: the string itself, its href attribute, or its text. */
function linkHref(node: unknown): string {
  if (typeof node === "string") return node;
  const element = asNode(node);
  if (!element) return "";
  const attrs = asNode(element["$"]);
  return String(attrs?.href || attrs?.HREF || element._ || "");
}

/**
 * Finds the article URL of a feed entry.
 *
 * Supports RSS (`<link>url</link>`) and Atom (`<link href="url" rel="..."/>`).
 * When an entry has several links, the first one that is a plain string, has no
 * `rel`, or has `rel="alternate"` is used; a missing `rel` is treated as
 * "alternate". Attribute names are checked in both cases because the
 * non-strict parser upper-cases them.
 *
 * @param item Parsed feed entry.
 * @returns The URL, or "" when no usable link is present.
 */
function extractLink(item: Record<string, unknown>): string {
  const link = item.link || item.LINK;
  if (!Array.isArray(link)) return linkHref(link);

  const preferred = link.find((candidate: unknown) => {
    if (typeof candidate === "string") return true;
    const rel = linkRel(candidate);
    return rel === undefined || rel === "alternate";
  });
  return linkHref(preferred);
}
/** Character references decoded by stripHtml, keyed by the text between "&" and ";". */
const HTML_ENTITIES: Record<string, string> = {
  nbsp: " ",
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
  "#39": "'",
};

/**
 * Converts an HTML fragment to plain text.
 *
 * Tags are replaced by a space, a small set of character references is decoded,
 * and runs of whitespace are collapsed to a single space.
 *
 * Entities are decoded in one pass so that already-escaped text such as
 * "&amp;lt;" becomes "&lt;" rather than being decoded twice into "<".
 *
 * @param html Markup to clean.
 * @returns Trimmed plain text; references not listed in HTML_ENTITIES are left as-is.
 */
function stripHtml(html: string): string {
  return html
    .replace(/<[^>]+>/g, " ")
    .replace(
      /&(nbsp|amp|lt|gt|quot|apos|#39);/g,
      (match: string, name: string) => HTML_ENTITIES[name] ?? match
    )
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Inserts an article, or refreshes its score and hash when a row with the same
 * source_url already exists.
 *
 * @param article Article to store.
 * @returns true when a new row was inserted, false when an existing row was
 *          updated. The flag comes from `xmax = 0` in the RETURNING clause
 *          (PostgreSQL reports xmax 0 for a freshly inserted row); if the query
 *          returns no row the result defaults to true.
 * @throws Whatever pool.query rejects with.
 */
async function upsertArticle(article: NewsArticle): Promise<boolean> {
  const result = await pool.query(
    `INSERT INTO news_articles
       (title, source_url, summary, published_at, source, category, relevance_score, content_hash)
     VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
     ON CONFLICT (source_url) DO UPDATE SET
       relevance_score = EXCLUDED.relevance_score,
       content_hash = EXCLUDED.content_hash
     RETURNING (xmax = 0) AS inserted`,
    [
      article.title,
      article.sourceUrl,
      article.summary,
      article.publishedAt,
      article.source,
      article.category,
      article.relevanceScore,
      article.contentHash,
    ]
  );
  return result.rows[0]?.inserted ?? true;
}

/**
 * Polls every feed in FEEDS one after another, stores the articles found and
 * prints a summary.
 *
 * All fetched articles are stored regardless of score; the relevance count is
 * reported only. A failure to save one article is logged and does not stop the
 * run. Feeds are fetched sequentially with a one-second pause after each one.
 */
export async function scrapeNews(): Promise<void> {
  console.log("=== News Scraper Starting ===\n");

  let totalFetched = 0;
  let totalWritten = 0;
  let totalRelevant = 0;

  for (const feed of FEEDS) {
    console.log(`Fetching: ${feed.name} (${feed.url})`);
    const articles = await fetchFeed(feed);
    console.log(` → ${articles.length} articles (last 7 days)`);

    for (const article of articles) {
      totalFetched++;
      if (article.relevanceScore > 0) totalRelevant++;

      try {
        const isNew = await upsertArticle(article);
        if (isNew) totalWritten++;
      } catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        console.error(` Error saving article:`, message);
      }
    }

    // Rate limit between feeds
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }

  console.log(`\nFetched: ${totalFetched} articles`);
  console.log(`Relevant (score > 0): ${totalRelevant}`);
  console.log(`Written: ${totalWritten} new`);
  console.log("=== News Scraper Complete ===\n");
}

// Run the scraper when this file is executed directly rather than imported.
if (require.main === module) {
  scrapeNews()
    .then(() => pool.end())
    .catch(async (err: unknown) => {
      console.error("Fatal:", err);
      try {
        // Wait for the pool to close before exiting. This can fail if the
        // error came from the first pool.end() call; that is safe to ignore
        // because the process is about to exit anyway.
        await pool.end();
      } catch {
        // Pool already closed or closing.
      }
      process.exit(1);
    });
}