transceiver-db/packages/scraper/src/scrapers/news.ts

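/**
 * News Aggregator — Optics & Fiber Trade Press RSS Scraper
 *
 * Sources (see FEEDS below for the exact URLs):
 *   - Lightwave Online (general and FTTx feeds), Fierce Telecom, optics.org
 *   - Data Center Knowledge, Network World (data center), SDxCentral
 *   - CableFree, Nature Photonics
 *   - Cisco Blogs (data center), Arista Blog
 *
 * Stores articles published within the last 7 days in the news_articles table.
 * Relevance scoring: every article gets a keyword score for transceiver/optics
 * topics. The score is stored with the article; it is not used to drop
 * articles in this file.
 */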
import { pool } from "../utils/db";
import { contentHash } from "../utils/hash";
import { parseStringPromise } from "xml2js";

// Categories allowed by news_articles CHECK constraint
type NewsCategory = "product_launch" | "market_report" | "standard" | "m_and_a" | "factory" | "event";

/** One syndication feed polled by the scraper. */
interface RssFeed {
  /** Display name; used in log output and stored as each article's `source`. */
  name: string;
  /** Feed URL that is fetched. */
  url: string;
  /** Category assigned to every article taken from this feed. */
  category: NewsCategory;
}

/** A feed entry normalized for storage in the news_articles table. */
interface NewsArticle {
  /** Entry title, truncated to 500 characters by fetchFeed. */
  title: string;
  /** Entry link, truncated to 1000 characters; the upsert's conflict key (source_url). */
  sourceUrl: string;
  /** Entry description with HTML stripped, truncated to 2000 characters. */
  summary: string;
  /** Publication time from the feed; fetchFeed falls back to the fetch time when the entry carries no date. */
  publishedAt: Date;
  /** Name of the feed the entry came from. */
  source: string;
  /** Category inherited from the feed (nullable in the type, although fetchFeed always sets one). */
  category: NewsCategory | null;
  /** Keyword score from scoreRelevance; 0 means no keyword matched. */
  relevanceScore: number;
  /** Hash produced by the contentHash helper from the entry's title and URL. */
  contentHash: string;
}

/**
 * Feeds polled by scrapeNews, in the order they are fetched (one at a time).
 *
 * Each feed's `category` is copied onto every article taken from it; no
 * per-article categorization happens in this file. The section labels below
 * are the original author's grouping and have no effect at runtime.
 */
const FEEDS: RssFeed[] = [
  // === PRIMARY: Transceiver-specific ===
  {
    name: "Lightwave Online",
    url: "https://www.lightwaveonline.com/rss",
    category: "market_report",
  },
  {
    name: "Lightwave - Fiber Optics",
    url: "https://www.lightwaveonline.com/fttx/rss",
    category: "market_report",
  },
  {
    name: "Fierce Telecom",
    url: "https://www.fiercetelecom.com/rss/xml",
    category: "market_report",
  },
  {
    name: "Optics.org",
    url: "https://optics.org/rss/news",
    category: "market_report",
  },
  // === SECONDARY: Datacenter / Networking ===
  {
    name: "Data Center Knowledge",
    url: "https://www.datacenterknowledge.com/rss.xml",
    category: "market_report",
  },
  {
    name: "Network World - Data Center",
    url: "https://www.networkworld.com/category/data-center/index.rss",
    category: "market_report",
  },
  {
    name: "SDxCentral",
    url: "https://www.sdxcentral.com/feed/",
    category: "market_report",
  },
  // === TERTIARY: General tech / photonics ===
  {
    name: "CableFree",
    url: "https://www.cablefree.net/rss",
    category: "market_report",
  },
  {
    name: "Nature Photonics",
    url: "https://www.nature.com/nphoton.rss",
    category: "standard",
  },
  // === VENDOR NEWS ===
  {
    name: "Cisco Blogs - Data Center",
    url: "https://blogs.cisco.com/datacenter/feed",
    category: "product_launch",
  },
  {
    name: "Arista Blog",
    url: "https://blogs.arista.com/blog/rss.xml",
    category: "product_launch",
  },
];

// Keywords for relevance scoring.
// Matching is a plain case-insensitive substring test, so a short keyword also
// matches inside longer ones (e.g. "sfp" is found in "qsfp" and "osfp").

/** Keywords worth 3 points each when present. */
const HIGH_RELEVANCE = [
  "transceiver",
  "sfp",
  "qsfp",
  "xfp",
  "cfp",
  "osfp",
  "optical module",
  "fiber optic",
  "wavelength",
  "dwdm",
  "cwdm",
  "400g",
  "800g",
  "1.6t",
  "coherent",
  "pluggable",
  "ofc",
  "ecoc",
  "cioe",
];

/**
 * Keywords worth 1 point each when present.
 * NOTE(review): "coherent" is also listed in HIGH_RELEVANCE, so a single
 * mention contributes 3 + 1 = 4 points in total.
 */
const MEDIUM_RELEVANCE = [
  "data center",
  "datacenter",
  "interconnect",
  "bandwidth",
  "switch",
  "router",
  "cisco",
  "arista",
  "juniper",
  "100g",
  "40g",
  "25g",
  "10g",
  "silicon photonics",
  "photonic",
  "ii-vi",
  "coherent",
  "lumentum",
  "inphi",
  "flexoptix",
  "prolabs",
];

/**
 * Score how relevant an article is from its title and summary.
 *
 * Each keyword counts at most once regardless of how often it occurs:
 * 3 points per HIGH_RELEVANCE keyword found plus 1 point per MEDIUM_RELEVANCE
 * keyword found.
 *
 * @param title   Article title.
 * @param summary Article summary (may be empty).
 * @returns The total score; 0 when no keyword is present.
 */
function scoreRelevance(title: string, summary: string): number {
  const haystack = [title, summary].join(" ").toLowerCase();
  const countHits = (keywords: string[]): number =>
    keywords.filter((keyword) => haystack.includes(keyword)).length;

  return 3 * countHits(HIGH_RELEVANCE) + countHits(MEDIUM_RELEVANCE);
}

/**
 * Fetch one feed and convert its recent entries into NewsArticle records.
 *
 * The response body is lightly repaired (bare `&`, unquoted attribute values)
 * and then parsed non-strictly. Non-strict parsing upper-cases element names,
 * so every entry field is looked up under both its original and its
 * upper-cased spelling.
 *
 * Entries without a title or link, with an unparseable date, or older than
 * 7 days are skipped. An entry with no date at all is treated as published
 * at fetch time.
 *
 * @param feed Feed to download; its name and category are copied onto every article.
 * @returns The recent articles. Empty on a non-OK HTTP status or when no
 *   channel/feed element is found. If an error is thrown part-way through, it
 *   is logged as a warning and the articles collected so far are returned.
 */
async function fetchFeed(feed: RssFeed): Promise<NewsArticle[]> {
  const maxAgeMs = 7 * 24 * 60 * 60 * 1000;
  const articles: NewsArticle[] = [];

  try {
    const resp = await fetch(feed.url, {
      headers: {
        "User-Agent": "Mozilla/5.0 (compatible; TIP-NewsBot/1.0; +https://flexoptix.net)",
        Accept: "application/rss+xml, application/xml, text/xml",
      },
      signal: AbortSignal.timeout(15000),
    });

    if (!resp.ok) {
      console.warn(`  Feed ${feed.name} returned ${resp.status}`);
      return [];
    }

    const rawXml = await resp.text();
    // Sanitize common RSS issues: unescaped & in URLs, attribute-without-value
    const xml = rawXml
      .replace(/&(?!amp;|lt;|gt;|quot;|apos;|#\d+;|#x[\dA-Fa-f]+;)/g, "&amp;")
      .replace(/(<\w[^>]*)\s+(\w+)=([^"'\s>]+)(?=[\s/>])/g, '$1 $2="$3"');
    const parsed = await parseStringPromise(xml, { explicitArray: false, strict: false });

    // strict: false makes keys uppercase; support both
    const rss = parsed?.rss || parsed?.RSS;
    const channel = rss?.channel || rss?.CHANNEL || parsed?.feed || parsed?.FEED;
    if (!channel) return [];

    // With explicitArray: false a single entry arrives as a bare object, not an array.
    const items: unknown = channel.item || channel.ITEM || channel.entry || channel.ENTRY || [];
    const itemArray: unknown[] = Array.isArray(items) ? items : [items];

    for (const rawItem of itemArray) {
      // Entries that are not element objects carry no usable fields.
      if (typeof rawItem !== "object" || rawItem === null) continue;
      const item = rawItem as Record<string, unknown>;

      const title = extractText(feedField(item, "title"));
      const url = extractLink(item);
      const summary = extractText(feedField(item, "description", "summary", "content:encoded"));
      // The date node may be an object when it carries attributes, so reduce it to text first.
      const rawDate = extractText(feedField(item, "pubDate", "published", "updated")).trim();

      if (!title || !url) continue;

      const publishedAt = rawDate ? new Date(rawDate) : new Date();
      if (isNaN(publishedAt.getTime())) continue;

      // Skip articles older than 7 days
      if (Date.now() - publishedAt.getTime() > maxAgeMs) continue;

      articles.push({
        title: title.slice(0, 500),
        sourceUrl: url.slice(0, 1000),
        summary: stripHtml(summary).slice(0, 2000),
        publishedAt,
        source: feed.name,
        category: feed.category,
        // Score and hash use the untruncated title/summary/url.
        relevanceScore: scoreRelevance(title, summary),
        contentHash: contentHash({ title, url }),
      });
    }
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    console.warn(`  Feed ${feed.name} error:`, message);
  }

  return articles;
}

/**
 * Return the first truthy value stored on `node` under any of `names`,
 * trying each name as written and then upper-cased, in the order given.
 */
function feedField(node: Record<string, unknown>, ...names: string[]): unknown {
  for (const name of names) {
    const value = node[name] || node[name.toUpperCase()];
    if (value) return value;
  }
  return undefined;
}

/**
 * Reduce a parsed XML node to its text.
 *
 * @param value A string, an element object whose text is under `_` or `#text`,
 *   or any other value.
 * @returns The string itself; the element's text (or "" when it has none);
 *   "" for any falsy value; otherwise `String(value)`.
 */
function extractText(value: unknown): string {
  if (typeof value === "string") return value;
  if (!value) return "";
  if (typeof value !== "object") return String(value);

  const node = value as Record<string, unknown>;
  const text = node._ || node["#text"] || "";
  return String(text);
}

/**
 * Pull the article URL out of a parsed feed entry.
 *
 * The entry's `link` (or `LINK`) may be a plain string, a single element
 * object, or an array of either. For an array, a link whose `rel` is
 * "alternate" or absent is preferred; if that yields no URL, the first link
 * that does yield one is used, matching the single-link case where `rel` is
 * not checked at all.
 *
 * Attribute names are read in both lower and upper case, because the caller
 * parses non-strictly and that mode can upper-case names.
 *
 * @param item Parsed feed entry.
 * @returns The URL, or "" when the entry has no usable link.
 */
function extractLink(item: Record<string, unknown>): string {
  const link = item.link || item.LINK;
  if (!Array.isArray(link)) return linkTarget(link);

  const candidates: unknown[] = link;
  const preferred = candidates.find((candidate) => {
    const rel = linkAttribute(candidate, "rel").toLowerCase();
    return rel === "" || rel === "alternate";
  });

  const ordered = preferred === undefined ? candidates : [preferred, ...candidates];
  for (const candidate of ordered) {
    const target = linkTarget(candidate);
    if (target) return target;
  }
  return "";
}

/** URL carried by one link node: the string itself, else its href attribute, else its text. */
function linkTarget(link: unknown): string {
  if (typeof link === "string") return link;
  if (typeof link !== "object" || link === null) return "";

  const href = linkAttribute(link, "href");
  if (href) return href;
  return String((link as Record<string, unknown>)._ || "");
}

/** String value of attribute `name` (lower- or upper-cased) from a node's `$` bag, or "" if absent. */
function linkAttribute(node: unknown, name: string): string {
  if (typeof node !== "object" || node === null) return "";
  const attrs = (node as Record<string, unknown>)["$"];
  if (typeof attrs !== "object" || attrs === null) return "";

  const bag = attrs as Record<string, unknown>;
  const raw = bag[name] ?? bag[name.toUpperCase()];
  return typeof raw === "string" ? raw : "";
}

/** Named character references decoded by stripHtml; any other name is left as written. */
const HTML_NAMED_ENTITIES = new Map<string, string>([
  ["nbsp", " "],
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

/**
 * Convert an HTML fragment to plain text.
 *
 * Tags are replaced by spaces, character references are decoded, and runs of
 * whitespace are collapsed to single spaces with the ends trimmed.
 *
 * References are decoded in a single pass so already-decoded output is never
 * decoded again: `&amp;lt;` becomes `&lt;`, not `<`. Decimal and hexadecimal
 * numeric references are decoded too; unknown names and invalid code points
 * are left untouched.
 *
 * @param html HTML or plain text.
 * @returns The plain-text rendering.
 */
function stripHtml(html: string): string {
  return html
    .replace(/<[^>]+>/g, " ")
    .replace(/&(#[xX][0-9a-fA-F]+|#[0-9]+|[a-z]+);/g, (entity: string, body: string): string => {
      if (!body.startsWith("#")) return HTML_NAMED_ENTITIES.get(body) ?? entity;

      const isHex = body[1] === "x" || body[1] === "X";
      const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
      // Reject NUL, surrogates and out-of-range values; fromCodePoint would throw on the latter.
      const valid =
        codePoint > 0 && codePoint <= 0x10ffff && (codePoint < 0xd800 || codePoint > 0xdfff);
      return valid ? String.fromCodePoint(codePoint) : entity;
    })
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Insert an article, or refresh its score and hash when its source_url is
 * already stored.
 *
 * On a source_url conflict only relevance_score and content_hash are
 * overwritten; the stored title, summary and other columns are left as-is.
 *
 * @param article Article to persist.
 * @returns The `inserted` flag the query returns (`xmax = 0`), or `true`
 *   when the query returns no row.
 * @throws Whatever `pool.query` rejects with.
 */
async function upsertArticle(article: NewsArticle): Promise<boolean> {
  const sql = `INSERT INTO news_articles (title, source_url, summary, published_at, source, category, relevance_score, content_hash)
     VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
     ON CONFLICT (source_url) DO UPDATE
       SET relevance_score = EXCLUDED.relevance_score,
           content_hash = EXCLUDED.content_hash
     RETURNING (xmax = 0) AS inserted`;

  // Order must match the $1..$8 placeholders above.
  const params = [
    article.title,
    article.sourceUrl,
    article.summary,
    article.publishedAt,
    article.source,
    article.category,
    article.relevanceScore,
    article.contentHash,
  ];

  const outcome = await pool.query(sql, params);
  const firstRow = outcome.rows[0];
  return firstRow?.inserted ?? true;
}

/**
 * Run one scrape: fetch every feed in FEEDS in turn, upsert each article, and
 * log running progress plus final totals.
 *
 * Feeds are processed one at a time with a one-second pause after each. A
 * failure to save an article is logged and does not stop the run.
 */
export async function scrapeNews(): Promise<void> {
  console.log("=== News Scraper Starting ===\n");

  const totals = { fetched: 0, relevant: 0, written: 0 };
  const pause = (ms: number): Promise<void> =>
    new Promise<void>((resolve) => setTimeout(resolve, ms));

  for (const feed of FEEDS) {
    console.log(`Fetching: ${feed.name} (${feed.url})`);
    const articles = await fetchFeed(feed);
    console.log(`  → ${articles.length} articles (last 7 days)`);

    totals.fetched += articles.length;
    totals.relevant += articles.filter((entry) => entry.relevanceScore > 0).length;

    for (const entry of articles) {
      try {
        const inserted = await upsertArticle(entry);
        if (inserted) totals.written += 1;
      } catch (err) {
        console.error("  Error saving article:", (err as Error).message);
      }
    }

    // Rate limit between feeds
    await pause(1000);
  }

  console.log(`\nFetched: ${totals.fetched} articles`);
  console.log(`Relevant (score > 0): ${totals.relevant}`);
  console.log(`Written: ${totals.written} new`);
  console.log("=== News Scraper Complete ===\n");
}

// CLI entry point: run the scraper when this file is executed directly.
// The pool is always closed exactly once and its shutdown is awaited before
// any forced exit. A failure in either the scrape or the pool shutdown is
// logged as "Fatal:" and makes the process exit with status 1.
if (require.main === module) {
  void (async () => {
    let failed = false;

    try {
      await scrapeNews();
    } catch (err) {
      console.error("Fatal:", err);
      failed = true;
    }

    try {
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      failed = true;
    }

    if (failed) process.exit(1);
  })();
}