/** * AI Cluster & Hyperscale DC Announcement Scraper * * Monitors RSS feeds from: * - DataCenterKnowledge (datacenterknowledge.com) * - The Register (theregister.com) * - ServeTheHome (servethehome.com) * - DataCenter Dynamics (datacenterdynamics.com) * - Blocks & Files (blocksandfiles.com) * * Extracts announcements about: * - AI clusters (xAI, Meta, AWS, Microsoft, Google) * - New data center builds with scale indicators * - Network tech mentions (400G, 800G, InfiniBand, RoCE) * * Each announced AI cluster = predictable transceiver demand * with 3-9 month deployment lag. */ import * as cheerio from "cheerio"; import { pool } from "../utils/db"; import { logger } from "../utils/logger"; import { contentHash } from "../utils/hash"; interface Announcement { company: string; title: string; summary: string; announced_date: string | null; scale_mw: number | null; scale_servers: number | null; network_speed: string | null; estimated_transceivers: number | null; deployment_date: string | null; location: string | null; source_url: string; source_name: string; } const RSS_FEEDS = [ { url: "https://www.datacenterknowledge.com/rss.xml", name: "DataCenterKnowledge" }, { url: "https://www.datacenterdynamics.com/en/rss/", name: "DataCenter Dynamics" }, { url: "https://www.theregister.com/data_centre/rss/", name: "The Register" }, { url: "https://blocksandfiles.com/feed/", name: "Blocks & Files" }, { url: "https://www.nextplatform.com/feed/", name: "Next Platform" }, { url: "https://www.servethehome.com/feed/", name: "ServeTheHome" }, ]; const COMPANY_PATTERNS: Array<{ pattern: RegExp; name: string }> = [ { pattern: /\bxAI\b/i, name: "xAI" }, { pattern: /\bMeta\b.*?(AI|data center)/i, name: "Meta" }, { pattern: /\bOpenAI\b/i, name: "OpenAI" }, { pattern: /\bAWS\b|\bAmazon\b.*?cloud/i, name: "Amazon (AWS)" }, { pattern: /\bMicrosoft\b.*?(Azure|AI)/i, name: "Microsoft" }, { pattern: /\bGoogle\b.*?(Cloud|DeepMind)/i, name: "Google" }, { pattern: /\bOracle\b.*?cloud/i, name: "Oracle Cloud" }, { pattern: /\bCoreWeave\b/i, name: "CoreWeave" }, { pattern: /\bLambda\b.*?(Labs|cloud)/i, name: "Lambda Labs" }, { pattern: /\bNVIDIA\b.*?supercomputer/i, name: "NVIDIA" }, { pattern: /\bApple\b.*?data center/i, name: "Apple" }, ]; const AI_KEYWORDS = [ "GPU cluster", "AI supercomputer", "AI infrastructure", "data center", "hyperscale", "AI campus", "GPU server", "AI factory", "compute cluster", "400G", "800G", "InfiniBand", "RoCE", "co-packaged optics", ]; const SCALE_PATTERNS = [ { pattern: /(\d+(?:\.\d+)?)\s*(?:MW|megawatt)/i, key: "mw" }, { pattern: /(\d[\d,]*)\s*(?:GPU|H100|H200|A100|B200)/i, key: "gpus" }, { pattern: /\$(\d+(?:\.\d+)?)\s*(?:billion|B)\b/i, key: "usd_billion" }, { pattern: /(\d[\d,]*)\s*(?:server|node)/i, key: "servers" }, ]; const NETWORK_PATTERNS = [ { pattern: /\b800G\b/i, value: "800G" }, { pattern: /\b400G\b/i, value: "400G" }, { pattern: /\bInfiniBand NDR\b/i, value: "IB-NDR-400G" }, { pattern: /\bInfiniBand HDR\b/i, value: "IB-HDR-200G" }, { pattern: /\bInfiniBand\b/i, value: "InfiniBand" }, { pattern: /\bRoCEv2?\b/i, value: "RoCE-Ethernet" }, { pattern: /\b100G\b/i, value: "100G" }, ]; // Rough estimate: 1 GPU server ≈ 8 transceivers for 400G networking function estimateTransceivers(servers: number | null, mw: number | null, networkSpeed: string | null): number | null { if (!servers && !mw) return null; const serverEstimate = servers ?? (mw ? Math.round(mw * 300) : 0); // ~300 servers/MW const perServer = networkSpeed?.includes("800G") ? 16 : 8; return Math.round(serverEstimate * perServer); } async function parseRssFeed(feedUrl: string, sourceName: string): Promise { const res = await fetch(feedUrl, { headers: { "User-Agent": "TIP-DataCollector/1.0 contact@context-x.org" }, signal: AbortSignal.timeout(15000), }); if (!res.ok) throw new Error(`RSS fetch failed: ${res.status}`); const xml = await res.text(); const $ = cheerio.load(xml, { xmlMode: true }); const announcements: Announcement[] = []; $("item").each((_, el) => { const title = $(el).find("title").first().text().trim(); const link = $(el).find("link").first().text().trim() || $(el).find("guid").first().text().trim(); const desc = $(el).find("description").first().text().replace(/<[^>]+>/g, " ").trim(); const pubDate = $(el).find("pubDate").first().text().trim(); const fullText = `${title} ${desc}`; // Check if this article is relevant const isRelevant = AI_KEYWORDS.some(kw => fullText.toLowerCase().includes(kw.toLowerCase())); if (!isRelevant) return; // Extract company let company = "Unknown"; for (const cp of COMPANY_PATTERNS) { if (cp.pattern.test(fullText)) { company = cp.name; break; } } // Extract scale let scaleMw: number | null = null; let scaleServers: number | null = null; for (const sp of SCALE_PATTERNS) { const m = fullText.match(sp.pattern); if (m) { const v = parseFloat(m[1].replace(/,/g, "")); if (sp.key === "mw") scaleMw = v; else if (sp.key === "gpus" || sp.key === "servers") scaleServers = Math.round(v / 8) * 8; // normalize else if (sp.key === "usd_billion") scaleServers = Math.round(v * 2000); // rough estimate } } // Extract network speed let networkSpeed: string | null = null; for (const np of NETWORK_PATTERNS) { if (np.pattern.test(fullText)) { networkSpeed = np.value; break; } } // Parse date let announcedDate: string | null = null; if (pubDate) { try { announcedDate = new Date(pubDate).toISOString().split("T")[0]; } catch { /* ignore */ } } const summary = desc.substring(0, 500); announcements.push({ company, title: title.substring(0, 300), summary, announced_date: announcedDate, scale_mw: scaleMw, scale_servers: scaleServers, network_speed: networkSpeed, estimated_transceivers: estimateTransceivers(scaleServers, scaleMw, networkSpeed), deployment_date: null, // extracted from text in future location: null, source_url: link, source_name: sourceName, }); }); return announcements; } export async function scrapeAiClusters(): Promise { logger.info("AI cluster announcement scraper starting"); let newItems = 0; for (const feed of RSS_FEEDS) { try { logger.info(`Fetching: ${feed.name}`); await new Promise(r => setTimeout(r, 1000)); const announcements = await parseRssFeed(feed.url, feed.name); logger.info(`${feed.name}: ${announcements.length} relevant articles found`); for (const a of announcements) { const hash = contentHash(`${a.title}${a.source_url}`); try { await pool.query(` INSERT INTO ai_cluster_announcements (company, title, summary, announced_date, scale_mw, scale_servers, network_speed, estimated_transceivers, source_url, source_name, content_hash) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11) ON CONFLICT (content_hash) DO NOTHING `, [ a.company, a.title, a.summary, a.announced_date, a.scale_mw, a.scale_servers, a.network_speed, a.estimated_transceivers, a.source_url, a.source_name, hash, ]); newItems++; } catch { /* duplicate */ } } } catch (err) { logger.warn(`AI cluster feed failed: ${feed.name}`, { err }); } } logger.info(`AI cluster scraper done — ${newItems} items recorded`); }