Rene Fichtmueller 370c1d8801 feat: 6 prediction signal scrapers + forecast engine
New scrapers (all registered in pg-boss, 50 total jobs):
  - sec-edgar.ts       : SEC EDGAR XBRL API — hyperscaler CapEx from 10-Q/10-K
  - github-signals.ts  : GitHub Search/Stats API — tech adoption metrics weekly
  - ebay-velocity.ts   : eBay completed listings — sold count + price distribution
  - ai-clusters.ts     : RSS feeds (6 sources) — AI cluster & DC announcements
  - distributor-leads.ts : Mouser, Digi-Key, RS Components — lead time + stock
  - standards-tracker.ts : IEEE 802.3, OIF, IETF — draft/ballot/published status

New utilities:
  - forecast-engine.ts : Weighted signal aggregator → demand_index + price_direction
    6 signal types, 4 horizons (3/9/12/18 months), 5 technologies tracked

New DB tables (migration 022):
  hyperscaler_capex, distributor_lead_times, github_tech_signals,
  marketplace_velocity, ai_cluster_announcements, standards_activity,
  forecast_signals

Schedules:
  - EDGAR: weekly Mon 06:00
  - GitHub: weekly Sun 05:00
  - eBay velocity: every 12h
  - AI clusters: every 4h (news-speed)
  - Distributor leads: daily 03:30
  - Standards: weekly Wed 04:00
  - Forecast engine: daily 08:00 (after all nightly scrapers)
2026-04-02 02:02:44 +02:00

135 lines
5.0 KiB
TypeScript

/**
* SEC EDGAR Hyperscaler CapEx Scraper
*
* Uses the SEC EDGAR XBRL API (free, no auth) to extract quarterly CapEx
* from Amazon, Microsoft, Google/Alphabet, and Meta 10-Q/10-K filings.
*
* XBRL concept: us-gaap/PaymentsToAcquirePropertyPlantAndEquipment
* API: https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json
*
* Hyperscaler DC CapEx is the strongest 6-12 month leading indicator
* for 400G/800G transceiver demand.
*/
import { pool } from "../utils/db";
import { logger } from "../utils/logger";
/**
 * Hyperscalers covered by this scraper, keyed by the short identifier that is
 * stored in the `company` column.
 *
 * - `name`  — human-readable label used in log messages
 * - `cik`   — SEC Central Index Key, zero-padded as it appears in the company-facts URL
 * - `dcPct` — estimated fraction of total CapEx that goes to data-center build-out
 */
const COMPANIES: Record<string, { name: string; cik: string; dcPct: number }> = {
  // ~65% of CapEx is AWS infra
  amazon: {
    name: "Amazon (AWS)",
    cik: "0001018724",
    dcPct: 0.65,
  },
  microsoft: {
    name: "Microsoft Azure",
    cik: "0000789019",
    dcPct: 0.55,
  },
  alphabet: {
    name: "Google Cloud",
    cik: "0001652044",
    dcPct: 0.60,
  },
  // Almost all Meta CapEx is data-center spend
  meta: {
    name: "Meta AI/DC",
    cik: "0001326801",
    dcPct: 0.85,
  },
};
/** Base URL of the EDGAR XBRL company-facts endpoint; `/CIK{cik}.json` is appended per filer. */
const EDGAR_BASE = "https://data.sec.gov/api/xbrl/companyfacts";

/**
 * Headers attached to every EDGAR request: a User-Agent that identifies this
 * collector together with a contact address, and a JSON Accept header.
 */
const HEADERS = {
  "User-Agent": "TIP-DataCollector/1.0 contact@context-x.org",
  "Accept": "application/json",
};
/**
 * One reported value of an XBRL concept, as delivered in the `units.USD`
 * array of the EDGAR company-facts response.
 *
 * Only the fields this module needs are declared; the API response may carry
 * more.
 */
interface XbrlUnit {
end: string; // period end as an ISO date; parsed with `new Date()` and stored as `period_end`
val: number; // reported value in USD (whole dollars — converted to millions before storage)
form: string; // filing form type, e.g. '10-Q' or '10-K'
filed: string; // date the filing was submitted (presumably an ISO date like `end` — confirm)
frame?: string; // calendar frame such as 'CY2024Q1' (quarter) or 'CY2024' (year), when present
accn: string; // accession number of the filing
fp?: string; // fiscal period: Q1, Q2, Q3, FY
fy?: number; // fiscal year
}
/**
 * Downloads the XBRL company-facts document for a single filer and extracts
 * the USD facts reported under the us-gaap concept
 * `PaymentsToAcquirePropertyPlantAndEquipment`.
 *
 * @param cik - Central Index Key exactly as it should appear in the URL (`CIK${cik}.json`).
 * @returns The USD fact entries, or an empty array when the taxonomy, concept
 *          or USD unit is missing from the response.
 * @throws Error when the HTTP response status is not OK.
 */
async function fetchCapexData(cik: string): Promise<XbrlUnit[]> {
  // Minimal shape of the response: only the path down to the CapEx concept.
  type CompanyFacts = {
    facts: {
      "us-gaap"?: {
        PaymentsToAcquirePropertyPlantAndEquipment?: {
          units: { USD: XbrlUnit[] };
        };
      };
    };
  };

  const response = await fetch(`${EDGAR_BASE}/CIK${cik}.json`, { headers: HEADERS });
  if (!response.ok) {
    throw new Error(`EDGAR fetch failed for CIK ${cik}: ${response.status}`);
  }

  const payload = (await response.json()) as CompanyFacts;
  const capexConcept = payload.facts?.["us-gaap"]?.PaymentsToAcquirePropertyPlantAndEquipment;
  return capexConcept?.units?.USD ?? [];
}
/**
 * Builds a short human-readable period label for a fact.
 *
 * Preference order:
 *  1. the calendar frame (`CY2024Q1` → `Q1 2024`, `CY2024` → `FY 2024`);
 *  2. the fiscal period and fiscal year of the filing (`Q2 2024`, `FY 2023`);
 *  3. the first seven characters of the period end date (`YYYY-MM`).
 *
 * @param unit - Fact whose period should be labelled.
 * @returns The label chosen by the first rule that applies.
 */
function labelFromFrame(unit: XbrlUnit): string {
  const frameMatch = unit.frame ? unit.frame.match(/CY(\d{4})(Q\d)?/) : null;
  if (frameMatch) {
    const [, year, quarter] = frameMatch;
    return quarter ? `${quarter} ${year}` : `FY ${year}`;
  }

  // No usable frame: fall back to the fiscal period/year reported with the filing.
  if (unit.fp && unit.fy) {
    return `${unit.fp} ${unit.fy}`;
  }

  return unit.end.substring(0, 7); // YYYY-MM
}
/**
 * Selects the facts to store for one company: 10-Q/10-K facts whose period
 * ends on or after `cutoff`, reduced to one fact per period end date.
 *
 * When several facts share a period end, the most recently filed one wins, so
 * a restated figure from a later filing replaces the originally reported one.
 *
 * NOTE(review): facts are keyed by `end` only. If EDGAR returns facts of
 * different durations (e.g. quarterly vs. year-to-date) with the same end
 * date, they collapse into one entry here — confirm against the API payload.
 *
 * @param units  - All USD facts returned for the CapEx concept.
 * @param cutoff - Earliest period end date to keep.
 * @returns Selected facts, newest period first.
 */
function pickLatestFiledPerPeriod(units: XbrlUnit[], cutoff: Date): XbrlUnit[] {
  const ordered = units
    .filter(u => (u.form === "10-Q" || u.form === "10-K") && new Date(u.end) >= cutoff)
    .sort((a, b) => {
      const byEnd = new Date(b.end).getTime() - new Date(a.end).getTime();
      if (byEnd !== 0) return byEnd;
      // Same period: newest filing first, so the dedupe below keeps it.
      const byFiled = Date.parse(b.filed) - Date.parse(a.filed);
      return Number.isNaN(byFiled) ? 0 : byFiled;
    });

  const byPeriod = new Map<string, XbrlUnit>();
  for (const u of ordered) {
    if (!byPeriod.has(u.end)) byPeriod.set(u.end, u);
  }
  return [...byPeriod.values()];
}

/**
 * Scrapes CapEx figures for every company in `COMPANIES` from SEC EDGAR and
 * upserts them into `hyperscaler_capex`.
 *
 * For each company it fetches the XBRL facts, keeps 10-Q/10-K periods from the
 * last three years (one fact per period end, latest filing preferred), stores
 * total CapEx and the estimated data-center share in USD millions, and then
 * back-fills year-over-year growth for rows that do not have it yet.
 *
 * A failure for one company is logged and does not stop the remaining
 * companies; the function itself does not reject because of such failures.
 */
export async function scrapeSecEdgar(): Promise<void> {
  logger.info("SEC EDGAR CapEx scraper starting");
  let inserted = 0;

  // Only periods ending within the last 3 years are kept (same for all companies).
  const cutoff = new Date();
  cutoff.setFullYear(cutoff.getFullYear() - 3);

  for (const [key, company] of Object.entries(COMPANIES)) {
    try {
      logger.info(`Fetching EDGAR data for ${company.name}`);
      await new Promise(r => setTimeout(r, 800)); // respect SEC rate limit: 10 req/sec

      const units = await fetchCapexData(company.cik);
      if (!units.length) { logger.warn(`No XBRL data for ${company.name}`); continue; }

      const periods = pickLatestFiledPerPeriod(units, cutoff);

      for (const unit of periods) {
        const capexM = unit.val / 1_000_000; // convert to millions
        const dcCapexM = Math.round(capexM * company.dcPct * 10) / 10; // one decimal place
        const periodLabel = labelFromFrame(unit);
        const sourceUrl = `https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=${company.cik}&type=${unit.form}&dateb=&owner=include&count=40`;

        await pool.query(`
          INSERT INTO hyperscaler_capex
            (company, period_label, period_end, capex_usd_millions, dc_capex_est_millions, source_url, filing_type)
          VALUES ($1, $2, $3, $4, $5, $6, $7)
          ON CONFLICT (company, period_end) DO UPDATE SET
            capex_usd_millions = EXCLUDED.capex_usd_millions,
            dc_capex_est_millions = EXCLUDED.dc_capex_est_millions,
            period_label = EXCLUDED.period_label,
            filing_type = EXCLUDED.filing_type
        `, [key, periodLabel, unit.end, Math.round(capexM * 10) / 10, dcCapexM, sourceUrl, unit.form]);
        inserted++;
      }

      // Back-fill YoY growth for every row of this company that has none yet and
      // has a row exactly one year earlier to compare against.
      // NOTE(review): rows that already have a value are not recomputed when a
      // later filing changes the CapEx figure — confirm this is intended.
      await pool.query(`
        UPDATE hyperscaler_capex h1
        SET yoy_growth_pct = ROUND(
          (h1.capex_usd_millions - h2.capex_usd_millions) / NULLIF(h2.capex_usd_millions, 0) * 100, 1
        )
        FROM hyperscaler_capex h2
        WHERE h1.company = $1
          AND h2.company = $1
          AND h2.period_end = h1.period_end - INTERVAL '1 year'
          AND h1.yoy_growth_pct IS NULL
      `, [key]);

      logger.info(`${company.name}: ${periods.length} periods upserted`);
    } catch (err) {
      // Best effort per company: log and continue with the next one.
      logger.error(`EDGAR scraper failed for ${company.name}`, { err });
    }
  }

  logger.info(`SEC EDGAR scraper done — ${inserted} records`);
}