Rene Fichtmueller 7f1c701ba1 feat: 6 prediction signal scrapers + forecast engine
New scrapers (all registered in pg-boss, 50 total jobs):
  - sec-edgar.ts       : SEC EDGAR XBRL API — hyperscaler CapEx from 10-Q/10-K
  - github-signals.ts  : GitHub Search/Stats API — tech adoption metrics weekly
  - ebay-velocity.ts   : eBay completed listings — sold count + price distribution
  - ai-clusters.ts     : RSS feeds (6 sources) — AI cluster & DC announcements
  - distributor-leads.ts : Mouser, Digi-Key, RS Components — lead time + stock
  - standards-tracker.ts : IEEE 802.3, OIF, IETF — draft/ballot/published status

New utilities:
  - forecast-engine.ts : Weighted signal aggregator → demand_index + price_direction
    6 signal types, 4 horizons (3/9/12/18 months), 5 technologies tracked

New DB tables (migration 022):
  hyperscaler_capex, distributor_lead_times, github_tech_signals,
  marketplace_velocity, ai_cluster_announcements, standards_activity,
  forecast_signals

Schedules:
  - EDGAR: weekly Mon 06:00
  - GitHub: weekly Sun 05:00
  - eBay velocity: every 12h
  - AI clusters: every 4h (news-speed)
  - Distributor leads: daily 03:30
  - Standards: weekly Wed 04:00
  - Forecast engine: daily 08:00 (after all nightly scrapers)
2026-04-02 02:02:44 +02:00

205 lines
7.7 KiB
TypeScript

/**
* AI Cluster & Hyperscale DC Announcement Scraper
*
* Monitors RSS feeds from:
* - DataCenterKnowledge (datacenterknowledge.com)
* - The Register (theregister.com)
* - ServeTheHome (servethehome.com)
* - DataCenter Dynamics (datacenterdynamics.com)
* - Blocks & Files (blocksandfiles.com)
* - Next Platform (nextplatform.com)
*
* Extracts announcements about:
* - AI clusters (xAI, Meta, AWS, Microsoft, Google)
* - New data center builds with scale indicators
* - Network tech mentions (400G, 800G, InfiniBand, RoCE)
*
* Each announced AI cluster = predictable transceiver demand
* with 3-9 month deployment lag.
*/
import * as cheerio from "cheerio";
import { pool } from "../utils/db";
import { logger } from "../utils/logger";
import { contentHash } from "../utils/hash";
/**
* One relevant RSS item, flattened into the shape that is written to the
* ai_cluster_announcements table.
*/
interface Announcement {
/** Company label taken from COMPANY_PATTERNS, or "Unknown" when no pattern matches. */
company: string;
/** Item title, truncated to 300 characters. */
title: string;
/** Item description with markup tags replaced by spaces, truncated to 500 characters. */
summary: string;
/** Publication date as YYYY-MM-DD (derived from the item's pubDate via toISOString), or null when absent/unparseable. */
announced_date: string | null;
/** Power figure captured from the text by the "mw" scale pattern, or null. */
scale_mw: number | null;
/** Server-count figure derived from counts or dollar amounts found in the text, or null. */
scale_servers: number | null;
/** Network technology label from the first matching NETWORK_PATTERNS entry, or null. */
network_speed: string | null;
/** Result of estimateTransceivers() for this item, or null when no scale data was found. */
estimated_transceivers: number | null;
/** Currently always null; the parser does not extract it yet. */
deployment_date: string | null;
/** Currently always null; the parser does not extract it yet. */
location: string | null;
/** The item's <link> text, falling back to its <guid> text. */
source_url: string;
/** Human-readable name of the feed the item came from. */
source_name: string;
}
/**
 * Feeds polled on every scraper run.
 * Written as a `display name -> RSS endpoint` table and expanded into
 * `{ url, name }` records; insertion order is the polling order.
 */
const RSS_FEEDS = Object.entries({
  "DataCenterKnowledge": "https://www.datacenterknowledge.com/rss.xml",
  "DataCenter Dynamics": "https://www.datacenterdynamics.com/en/rss/",
  "The Register": "https://www.theregister.com/data_centre/rss/",
  "Blocks & Files": "https://blocksandfiles.com/feed/",
  "Next Platform": "https://www.nextplatform.com/feed/",
  "ServeTheHome": "https://www.servethehome.com/feed/",
}).map(([name, url]) => ({ url, name }));
/**
 * Company detectors, evaluated in order; the first pattern that matches the
 * article text decides the `company` label.
 *
 * All patterns are case-insensitive, so the short token "AI" must be fenced
 * with word boundaries — otherwise it matches the "ai" inside ordinary words
 * ("said", "maintain", "available") and mis-attributes unrelated articles.
 * "data center" also accepts the British spelling "data centre".
 */
const COMPANY_PATTERNS: Array<{ pattern: RegExp; name: string }> = [
  { pattern: /\bxAI\b/i, name: "xAI" },
  { pattern: /\bMeta\b.*?(\bAI\b|data cent(?:er|re))/i, name: "Meta" },
  { pattern: /\bOpenAI\b/i, name: "OpenAI" },
  { pattern: /\bAWS\b|\bAmazon\b.*?cloud/i, name: "Amazon (AWS)" },
  { pattern: /\bMicrosoft\b.*?\b(Azure|AI)\b/i, name: "Microsoft" },
  { pattern: /\bGoogle\b.*?(Cloud|DeepMind)/i, name: "Google" },
  { pattern: /\bOracle\b.*?cloud/i, name: "Oracle Cloud" },
  { pattern: /\bCoreWeave\b/i, name: "CoreWeave" },
  { pattern: /\bLambda\b.*?(Labs|cloud)/i, name: "Lambda Labs" },
  { pattern: /\bNVIDIA\b.*?supercomputer/i, name: "NVIDIA" },
  { pattern: /\bApple\b.*?data cent(?:er|re)/i, name: "Apple" },
];
/**
 * Relevance filter: an article is kept when its title or description contains
 * at least one of these phrases (compared case-insensitively as substrings).
 *
 * "data center" is listed in its common spelling variants so that feeds using
 * British or single-word spellings are not filtered out.
 */
const AI_KEYWORDS = [
  "GPU cluster", "AI supercomputer", "AI infrastructure",
  "data center", "data centre", "datacenter", "datacentre",
  "hyperscale", "AI campus", "GPU server", "AI factory", "compute cluster",
  "400G", "800G", "InfiniBand", "RoCE", "co-packaged optics",
];
/**
 * Scale extractors. Capture group 1 is the numeric text; it may contain
 * thousands separators, which the consumer strips before parsing.
 *
 * - The leading `\b` on the count patterns prevents the digits of a product
 *   name ("H100 GPUs", "H100 servers") from being read as a quantity.
 * - The MW pattern accepts "1,200 MW" as a whole number and rejects "MWh"
 *   (energy, not power).
 * - The dollar pattern also accepts the "bn" abbreviation.
 */
const SCALE_PATTERNS = [
  { pattern: /\b(\d[\d,]*(?:\.\d+)?)\s*(?:MW|megawatts?)\b/i, key: "mw" },
  { pattern: /\b(\d[\d,]*)\s*(?:GPU|H100|H200|A100|B200)/i, key: "gpus" },
  { pattern: /\$(\d+(?:\.\d+)?)\s*(?:billion|bn|B)\b/i, key: "usd_billion" },
  { pattern: /\b(\d[\d,]*)\s*(?:server|node)/i, key: "servers" },
];
/**
 * Network technology detectors, evaluated in order; the first match wins, so
 * more specific entries (e.g. "InfiniBand NDR") are listed before generic
 * ones ("InfiniBand").
 *
 * The RoCE pattern treats the version suffix as optional so that plain
 * "RoCE" is recognised as well as "RoCEv2".
 */
const NETWORK_PATTERNS = [
  { pattern: /\b800G\b/i, value: "800G" },
  { pattern: /\b400G\b/i, value: "400G" },
  { pattern: /\bInfiniBand NDR\b/i, value: "IB-NDR-400G" },
  { pattern: /\bInfiniBand HDR\b/i, value: "IB-HDR-200G" },
  { pattern: /\bInfiniBand\b/i, value: "InfiniBand" },
  { pattern: /\bRoCE(?:v\d?)?\b/i, value: "RoCE-Ethernet" },
  { pattern: /\b100G\b/i, value: "100G" },
];
/**
 * Rough transceiver-demand estimate for an announced deployment.
 *
 * Heuristics: 1 GPU server ≈ 8 transceivers for 400G-class networking,
 * doubled to 16 when the detected network speed mentions "800G"; when no
 * server count is known, ~300 servers per MW is assumed.
 *
 * A server count or MW figure that is null, zero, negative or not finite is
 * treated as "unknown", so a zero server count no longer suppresses the
 * MW-based fallback.
 *
 * @param servers - Server count extracted from the article, if any.
 * @param mw - Power figure in MW extracted from the article, if any.
 * @param networkSpeed - Detected network technology label, if any.
 * @returns Estimated transceiver count, or null when neither input is usable.
 */
function estimateTransceivers(servers: number | null, mw: number | null, networkSpeed: string | null): number | null {
  const SERVERS_PER_MW = 300;
  const isUsable = (v: number | null): v is number => v !== null && Number.isFinite(v) && v > 0;

  let serverEstimate: number;
  if (isUsable(servers)) {
    serverEstimate = servers;
  } else if (isUsable(mw)) {
    serverEstimate = Math.round(mw * SERVERS_PER_MW);
  } else {
    return null;
  }

  const perServer = networkSpeed?.includes("800G") ? 16 : 8;
  return Math.round(serverEstimate * perServer);
}
/**
 * Downloads one RSS feed and converts every relevant `<item>` into an
 * Announcement.
 *
 * An item is relevant when its title or description contains any AI_KEYWORDS
 * entry (case-insensitive). Company, scale and network speed are extracted
 * with the module-level pattern tables.
 *
 * @param feedUrl - URL of the RSS document to fetch.
 * @param sourceName - Label stored as `source_name` on every announcement.
 * @returns The relevant items of the feed, in document order.
 * @throws Error when the HTTP response status is not OK; fetch/timeout errors
 *   propagate unchanged.
 */
async function parseRssFeed(feedUrl: string, sourceName: string): Promise<Announcement[]> {
  const res = await fetch(feedUrl, {
    headers: { "User-Agent": "TIP-DataCollector/1.0 contact@context-x.org" },
    signal: AbortSignal.timeout(15000),
  });
  if (!res.ok) throw new Error(`RSS fetch failed: ${res.status}`);
  const xml = await res.text();
  const $ = cheerio.load(xml, { xmlMode: true });
  const announcements: Announcement[] = [];

  $("item").each((_, el) => {
    const item = $(el);
    const title = item.find("title").first().text().trim();
    const link = item.find("link").first().text().trim() || item.find("guid").first().text().trim();
    // Descriptions usually carry HTML; tags are replaced by spaces so words do not fuse.
    const desc = item.find("description").first().text().replace(/<[^>]+>/g, " ").trim();
    const pubDate = item.find("pubDate").first().text().trim();
    const fullText = `${title} ${desc}`;

    // Relevance filter — lower-case the article once instead of once per keyword.
    const haystack = fullText.toLowerCase();
    const isRelevant = AI_KEYWORDS.some(kw => haystack.includes(kw.toLowerCase()));
    if (!isRelevant) return;

    // Company: first matching pattern wins.
    let company = "Unknown";
    for (const cp of COMPANY_PATTERNS) {
      if (cp.pattern.test(fullText)) { company = cp.name; break; }
    }

    // Scale. Explicit GPU/server counts and the dollar-derived guess are kept
    // apart so the rough guess can never overwrite a number the article states.
    let scaleMw: number | null = null;
    let explicitServers: number | null = null;
    let budgetServers: number | null = null;
    for (const sp of SCALE_PATTERNS) {
      const raw = fullText.match(sp.pattern)?.[1];
      if (raw === undefined) continue;
      const v = parseFloat(raw.replace(/,/g, ""));
      if (sp.key === "mw") {
        scaleMw = v;
      } else if (sp.key === "gpus" || sp.key === "servers") {
        // NOTE(review): GPU counts are stored as server counts without dividing
        // by GPUs-per-server; only rounded to a multiple of 8 — confirm intent.
        explicitServers = Math.round(v / 8) * 8; // normalize
      } else if (sp.key === "usd_billion") {
        budgetServers = Math.round(v * 2000); // rough estimate
      }
    }
    // Prefer a positive explicit count; otherwise the budget guess; otherwise
    // whatever the explicit match produced (0 or null).
    const scaleServers =
      explicitServers !== null && explicitServers > 0
        ? explicitServers
        : budgetServers ?? explicitServers;

    // Network speed: first matching pattern wins.
    let networkSpeed: string | null = null;
    for (const np of NETWORK_PATTERNS) {
      if (np.pattern.test(fullText)) { networkSpeed = np.value; break; }
    }

    // Publication date as YYYY-MM-DD; unparseable dates stay null.
    let announcedDate: string | null = null;
    if (pubDate) {
      const parsed = new Date(pubDate);
      if (!Number.isNaN(parsed.getTime())) {
        announcedDate = parsed.toISOString().split("T")[0] ?? null;
      }
    }

    announcements.push({
      company,
      title: title.substring(0, 300),
      summary: desc.substring(0, 500),
      announced_date: announcedDate,
      scale_mw: scaleMw,
      scale_servers: scaleServers,
      network_speed: networkSpeed,
      estimated_transceivers: estimateTransceivers(scaleServers, scaleMw, networkSpeed),
      deployment_date: null, // extracted from text in future
      location: null,
      source_url: link,
      source_name: sourceName,
    });
  });

  return announcements;
}
/**
 * Entry point of the AI cluster announcement scraper.
 *
 * Polls every feed in RSS_FEEDS sequentially (with a one-second pause before
 * each request), stores the relevant announcements and logs a summary. A
 * failing feed or a failing insert is logged and skipped; the run continues.
 *
 * Rows are de-duplicated by a content hash of title + source URL. Because the
 * INSERT uses ON CONFLICT DO NOTHING, a duplicate does not raise an error —
 * it simply affects zero rows — so only inserts that did not report zero
 * affected rows are counted, and any exception from the query is a genuine
 * database error that is logged instead of being swallowed.
 */
export async function scrapeAiClusters(): Promise<void> {
  logger.info("AI cluster announcement scraper starting");
  let newItems = 0;

  for (const feed of RSS_FEEDS) {
    try {
      logger.info(`Fetching: ${feed.name}`);
      // Be polite to the publishers: space the requests out.
      await new Promise(r => setTimeout(r, 1000));
      const announcements = await parseRssFeed(feed.url, feed.name);
      logger.info(`${feed.name}: ${announcements.length} relevant articles found`);

      for (const a of announcements) {
        const hash = contentHash(`${a.title}${a.source_url}`);
        try {
          const result = await pool.query(`
INSERT INTO ai_cluster_announcements
(company, title, summary, announced_date, scale_mw, scale_servers,
network_speed, estimated_transceivers, source_url, source_name, content_hash)
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11)
ON CONFLICT (content_hash) DO NOTHING
`, [
            a.company, a.title, a.summary, a.announced_date,
            a.scale_mw, a.scale_servers, a.network_speed,
            a.estimated_transceivers, a.source_url, a.source_name, hash,
          ]);
          // rowCount is 0 when the conflict clause skipped the row.
          // NOTE(review): assumes pool.query resolves to a node-postgres style
          // result; if rowCount is unavailable the row is counted as before.
          if (result.rowCount !== 0) newItems++;
        } catch (err) {
          logger.warn(`AI cluster insert failed: ${a.source_url}`, { err });
        }
      }
    } catch (err) {
      logger.warn(`AI cluster feed failed: ${feed.name}`, { err });
    }
  }

  logger.info(`AI cluster scraper done — ${newItems} items recorded`);
}