New scrapers (all registered in pg-boss, 50 total jobs):
- sec-edgar.ts : SEC EDGAR XBRL API — hyperscaler CapEx from 10-Q/10-K
- github-signals.ts : GitHub Search/Stats API — tech adoption metrics weekly
- ebay-velocity.ts : eBay completed listings — sold count + price distribution
- ai-clusters.ts : RSS feeds (6 sources) — AI cluster & DC announcements
- distributor-leads.ts : Mouser, Digi-Key, RS Components — lead time + stock
- standards-tracker.ts : IEEE 802.3, OIF, IETF — draft/ballot/published status
New utilities:
- forecast-engine.ts : Weighted signal aggregator → demand_index + price_direction
6 signal types, 4 horizons (3/9/12/18 months), 5 technologies tracked
New DB tables (migration 022):
hyperscaler_capex, distributor_lead_times, github_tech_signals,
marketplace_velocity, ai_cluster_announcements, standards_activity,
forecast_signals
Schedules:
- EDGAR: weekly Mon 06:00
- GitHub: weekly Sun 05:00
- eBay velocity: every 12h
- AI clusters: every 4h (news-speed)
- Distributor leads: daily 03:30
- Standards: weekly Wed 04:00
- Forecast engine: daily 08:00 (after all nightly scrapers)
205 lines
7.7 KiB
TypeScript
205 lines
7.7 KiB
TypeScript
/**
 * AI Cluster & Hyperscale DC Announcement Scraper
 *
 * Monitors RSS feeds from:
 * - DataCenterKnowledge (datacenterknowledge.com)
 * - The Register (theregister.com)
 * - ServeTheHome (servethehome.com)
 * - DataCenter Dynamics (datacenterdynamics.com)
 * - Blocks & Files (blocksandfiles.com)
 * - Next Platform (nextplatform.com)
 *
 * Extracts announcements about:
 * - AI clusters (xAI, Meta, AWS, Microsoft, Google)
 * - New data center builds with scale indicators
 * - Network tech mentions (400G, 800G, InfiniBand, RoCE)
 *
 * Each announced AI cluster = predictable transceiver demand
 * with 3-9 month deployment lag.
 */
|
|
|
|
import * as cheerio from "cheerio";
|
|
import { pool } from "../utils/db";
|
|
import { logger } from "../utils/logger";
|
|
import { contentHash } from "../utils/hash";
|
|
|
|
/**
 * A single relevant feed item after keyword filtering and field extraction.
 * Field names are snake_case; most of them are passed straight through as
 * INSERT parameters by the scraper entry point.
 */
interface Announcement {
  // ── Where the article came from ──────────────────────────────────────
  /** Human-readable name of the feed the item was read from. */
  source_name: string;
  /** Article URL (the item's link, or its guid when no link is present). */
  source_url: string;

  // ── What the article says ────────────────────────────────────────────
  /** Article headline. */
  title: string;
  /** Plain-text excerpt of the item description. */
  summary: string;
  /** Company the article was attributed to, or "Unknown". */
  company: string;
  /** Publication date as `YYYY-MM-DD`, or null when unavailable. */
  announced_date: string | null;

  // ── Extracted scale indicators ───────────────────────────────────────
  /** Power figure in megawatts, when one was found in the text. */
  scale_mw: number | null;
  /** Server-count figure, when one was found or derived from the text. */
  scale_servers: number | null;
  /** Label of the network technology mentioned, if any. */
  network_speed: string | null;
  /** Rough transceiver count derived from the scale figures above. */
  estimated_transceivers: number | null;

  // ── Not populated yet (the visible parser always sets these to null) ─
  deployment_date: string | null;
  location: string | null;
}
|
|
|
|
/**
 * RSS endpoints polled on every run, in polling order.
 * Kept as a compact [name, url] table and expanded into `{ url, name }` records.
 */
const RSS_FEEDS: Array<{ url: string; name: string }> = (() => {
  const sources: Array<[string, string]> = [
    ["DataCenterKnowledge", "https://www.datacenterknowledge.com/rss.xml"],
    ["DataCenter Dynamics", "https://www.datacenterdynamics.com/en/rss/"],
    ["The Register", "https://www.theregister.com/data_centre/rss/"],
    ["Blocks & Files", "https://blocksandfiles.com/feed/"],
    ["Next Platform", "https://www.nextplatform.com/feed/"],
    ["ServeTheHome", "https://www.servethehome.com/feed/"],
  ];
  return sources.map(([name, url]) => ({ url, name }));
})();
|
|
|
|
/**
 * Company attribution rules, evaluated in order; the first pattern that
 * matches the article text wins.
 *
 * Most rules require the company name to be followed (on the same line) by a
 * qualifying term so that unrelated mentions are not attributed. The "AI"
 * qualifier is wrapped in word boundaries: all patterns are case-insensitive,
 * and a bare `AI` would otherwise match the letters "ai" inside ordinary words
 * such as "said" or "available".
 */
const COMPANY_PATTERNS: Array<{ pattern: RegExp; name: string }> = [
  { pattern: /\bxAI\b/i, name: "xAI" },
  { pattern: /\bMeta\b.*?(\bAI\b|data center)/i, name: "Meta" },
  { pattern: /\bOpenAI\b/i, name: "OpenAI" },
  { pattern: /\bAWS\b|\bAmazon\b.*?cloud/i, name: "Amazon (AWS)" },
  { pattern: /\bMicrosoft\b.*?(Azure|\bAI\b)/i, name: "Microsoft" },
  { pattern: /\bGoogle\b.*?(Cloud|DeepMind)/i, name: "Google" },
  { pattern: /\bOracle\b.*?cloud/i, name: "Oracle Cloud" },
  { pattern: /\bCoreWeave\b/i, name: "CoreWeave" },
  { pattern: /\bLambda\b.*?(Labs|cloud)/i, name: "Lambda Labs" },
  { pattern: /\bNVIDIA\b.*?supercomputer/i, name: "NVIDIA" },
  { pattern: /\bApple\b.*?data center/i, name: "Apple" },
];
|
|
|
|
/**
 * Relevance filter vocabulary: an article is kept when its text contains at
 * least one of these phrases (compared case-insensitively by the parser).
 */
const AI_KEYWORDS: string[] = [
  // Compute and facility build-out terms
  "GPU cluster",
  "AI supercomputer",
  "AI infrastructure",
  "data center",
  "hyperscale",
  "AI campus",
  "GPU server",
  "AI factory",
  "compute cluster",
  // Interconnect and optics terms
  "400G",
  "800G",
  "InfiniBand",
  "RoCE",
  "co-packaged optics",
];
|
|
|
|
/**
 * Patterns that pull scale indicators out of article text. Capture group 1 is
 * the numeric figure; it may contain thousands separators ("100,000"), which
 * the consumer strips before parsing.
 *
 * The count patterns start with `\b` so that the digits of a product name are
 * never read as a quantity: without it "H100 GPUs" or "H100 servers" would be
 * parsed as a count of 100.
 */
const SCALE_PATTERNS: Array<{
  pattern: RegExp;
  key: "mw" | "gpus" | "usd_billion" | "servers";
}> = [
  // Power: accepts "150 MW", "1.5 megawatts" and "1,200 MW".
  { pattern: /\b(\d[\d,]*(?:\.\d+)?)\s*(?:MW|megawatt)/i, key: "mw" },
  // Accelerator count: "100,000 GPUs", "16,384 H100".
  { pattern: /\b(\d[\d,]*)\s*(?:GPU|H100|H200|A100|B200)/i, key: "gpus" },
  // Investment size: "$4.5 billion", "$10B", "$10bn".
  { pattern: /\$(\d+(?:\.\d+)?)\s*(?:billion|bn|B)\b/i, key: "usd_billion" },
  // Server / node count: "2,000 servers", "512 nodes".
  { pattern: /\b(\d[\d,]*)\s*(?:server|node)/i, key: "servers" },
];
|
|
|
|
/**
 * Network technology detectors, listed in priority order: consumers take the
 * label of the first pattern that matches. Written as a [pattern, label]
 * table and expanded into `{ pattern, value }` records.
 */
const NETWORK_PATTERNS: Array<{ pattern: RegExp; value: string }> = (() => {
  const table: Array<[RegExp, string]> = [
    [/\b800G\b/i, "800G"],
    [/\b400G\b/i, "400G"],
    [/\bInfiniBand NDR\b/i, "IB-NDR-400G"],
    [/\bInfiniBand HDR\b/i, "IB-HDR-200G"],
    [/\bInfiniBand\b/i, "InfiniBand"],
    [/\bRoCEv2?\b/i, "RoCE-Ethernet"],
    [/\b100G\b/i, "100G"],
  ];
  return table.map(([pattern, value]) => ({ pattern, value }));
})();
|
|
|
|
/**
 * Rough transceiver demand estimate for an announced deployment.
 *
 * Heuristics: 1 GPU server ≈ 8 transceivers for 400G networking, doubled to
 * 16 when the detected network speed label contains "800G"; when no server
 * count is available, ~300 servers per MW is assumed.
 *
 * @param servers      Server count, or null when unknown.
 * @param mw           Facility power in megawatts, or null when unknown.
 * @param networkSpeed Detected network technology label, or null.
 * @returns Estimated transceiver count, or null when neither a positive
 *          server count nor a positive MW figure is available.
 */
function estimateTransceivers(servers: number | null, mw: number | null, networkSpeed: string | null): number | null {
  let serverEstimate: number;
  if (servers !== null && servers > 0) {
    serverEstimate = servers;
  } else if (mw !== null && mw > 0) {
    // A server count of 0 carries no information (small counts round down to
    // 0 upstream), so fall back to the power-based estimate instead of
    // reporting zero demand.
    serverEstimate = Math.round(mw * 300); // ~300 servers/MW
  } else {
    return null;
  }

  const perServer = networkSpeed?.includes("800G") ? 16 : 8;
  return Math.round(serverEstimate * perServer);
}
|
|
|
|
/**
 * Downloads one RSS feed and converts its relevant `<item>` entries into
 * announcements.
 *
 * An item is relevant when its title or description contains at least one
 * AI_KEYWORDS phrase (case-insensitive). For each relevant item the company,
 * scale figures, network technology and publication date are extracted from
 * the combined title + description text.
 *
 * @param feedUrl    URL of the RSS document to fetch.
 * @param sourceName Feed name recorded on every returned announcement.
 * @returns The relevant items of the feed, in document order.
 * @throws Error when the HTTP response status is not OK; network and timeout
 *         failures from `fetch` propagate unchanged.
 */
async function parseRssFeed(feedUrl: string, sourceName: string): Promise<Announcement[]> {
  const res = await fetch(feedUrl, {
    headers: { "User-Agent": "TIP-DataCollector/1.0 contact@context-x.org" },
    signal: AbortSignal.timeout(15000),
  });
  if (!res.ok) throw new Error(`RSS fetch failed: ${res.status}`);

  const xml = await res.text();
  const $ = cheerio.load(xml, { xmlMode: true });

  // Lower-case the keyword list once per feed instead of once per item.
  const keywords = AI_KEYWORDS.map(kw => kw.toLowerCase());
  const announcements: Announcement[] = [];

  $("item").each((_, el) => {
    const item = $(el);
    const title = item.find("title").first().text().trim();
    // Some feeds omit <link>; fall back to <guid>.
    const link = item.find("link").first().text().trim() || item.find("guid").first().text().trim();
    // Descriptions frequently embed HTML; strip tags to get plain text.
    const desc = item.find("description").first().text().replace(/<[^>]+>/g, " ").trim();
    const pubDate = item.find("pubDate").first().text().trim();

    const fullText = `${title} ${desc}`;

    // Check if this article is relevant
    const haystack = fullText.toLowerCase();
    if (!keywords.some(kw => haystack.includes(kw))) return;

    // Extract company: first matching rule wins.
    const company = COMPANY_PATTERNS.find(cp => cp.pattern.test(fullText))?.name ?? "Unknown";

    // Extract scale. Each source of a server figure is kept separately so the
    // most direct evidence wins regardless of the order of SCALE_PATTERNS:
    // an explicit server/node count, then a GPU count, and only then the
    // rough guess derived from the investment size.
    let scaleMw: number | null = null;
    let serverCount: number | null = null;
    let gpuCount: number | null = null;
    let spendEstimate: number | null = null;
    for (const sp of SCALE_PATTERNS) {
      const raw = fullText.match(sp.pattern)?.[1];
      if (raw === undefined) continue;
      const v = parseFloat(raw.replace(/,/g, ""));
      if (sp.key === "mw") scaleMw = v;
      else if (sp.key === "servers") serverCount = Math.round(v / 8) * 8; // normalize
      else if (sp.key === "gpus") gpuCount = Math.round(v / 8) * 8; // normalize
      else if (sp.key === "usd_billion") spendEstimate = Math.round(v * 2000); // rough estimate
    }
    const scaleServers = serverCount ?? gpuCount ?? spendEstimate;

    // Extract network speed: first matching pattern wins.
    const networkSpeed = NETWORK_PATTERNS.find(np => np.pattern.test(fullText))?.value ?? null;

    // Parse date: unparseable pubDate values are recorded as null.
    let announcedDate: string | null = null;
    if (pubDate) {
      const parsed = new Date(pubDate);
      if (!Number.isNaN(parsed.getTime())) {
        announcedDate = parsed.toISOString().split("T")[0] ?? null;
      }
    }

    announcements.push({
      company,
      title: title.substring(0, 300),
      summary: desc.substring(0, 500),
      announced_date: announcedDate,
      scale_mw: scaleMw,
      scale_servers: scaleServers,
      network_speed: networkSpeed,
      estimated_transceivers: estimateTransceivers(scaleServers, scaleMw, networkSpeed),
      deployment_date: null, // extracted from text in future
      location: null,
      source_url: link,
      source_name: sourceName,
    });
  });

  return announcements;
}
|
|
|
|
/**
 * Scraper entry point: polls every feed in RSS_FEEDS sequentially and stores
 * the relevant announcements in `ai_cluster_announcements`.
 *
 * Rows are de-duplicated by a content hash of title + source URL via
 * `ON CONFLICT (content_hash) DO NOTHING`. A failing feed or a failing insert
 * is logged and skipped so that one bad source or row never aborts the run;
 * the function itself does not throw for those cases.
 */
export async function scrapeAiClusters(): Promise<void> {
  logger.info("AI cluster announcement scraper starting");
  let newItems = 0;

  for (const feed of RSS_FEEDS) {
    try {
      logger.info(`Fetching: ${feed.name}`);
      // Pause one second before each request to stay polite to the publishers.
      await new Promise<void>(resolve => setTimeout(resolve, 1000));

      const announcements = await parseRssFeed(feed.url, feed.name);
      logger.info(`${feed.name}: ${announcements.length} relevant articles found`);

      for (const a of announcements) {
        const hash = contentHash(`${a.title}${a.source_url}`);
        try {
          const result = await pool.query(`
            INSERT INTO ai_cluster_announcements
              (company, title, summary, announced_date, scale_mw, scale_servers,
               network_speed, estimated_transceivers, source_url, source_name, content_hash)
            VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11)
            ON CONFLICT (content_hash) DO NOTHING
          `, [
            a.company, a.title, a.summary, a.announced_date,
            a.scale_mw, a.scale_servers, a.network_speed,
            a.estimated_transceivers, a.source_url, a.source_name, hash,
          ]);
          // ON CONFLICT DO NOTHING does not raise on duplicates, so count the
          // rows actually written rather than the statements executed.
          // NOTE(review): assumes `pool` is a node-postgres Pool whose result
          // exposes `rowCount` — confirm against ../utils/db.
          newItems += result.rowCount ?? 0;
        } catch (err) {
          // Duplicates are absorbed by ON CONFLICT, so anything landing here
          // is a genuine database error and must not be hidden.
          logger.warn(`AI cluster insert failed: ${a.source_url}`, { err });
        }
      }
    } catch (err) {
      logger.warn(`AI cluster feed failed: ${feed.name}`, { err });
    }
  }

  logger.info(`AI cluster scraper done — ${newItems} items recorded`);
}
|