feat: 6 prediction signal scrapers + forecast engine

New scrapers (all registered in pg-boss, 50 total jobs):
  - sec-edgar.ts       : SEC EDGAR XBRL API — hyperscaler CapEx from 10-Q/10-K
  - github-signals.ts  : GitHub Search/Stats API — tech adoption metrics weekly
  - ebay-velocity.ts   : eBay completed listings — sold count + price distribution
  - ai-clusters.ts     : RSS feeds (6 sources) — AI cluster & DC announcements
  - distributor-leads.ts : Mouser, Digi-Key, RS Components — lead time + stock
  - standards-tracker.ts : IEEE 802.3, OIF, IETF — draft/ballot/published status

New utilities:
  - forecast-engine.ts : Weighted signal aggregator → demand_index + price_direction
    6 signal types, 4 horizons (3/9/12/18 months), 5 technologies tracked

New DB tables (migration 022):
  hyperscaler_capex, distributor_lead_times, github_tech_signals,
  marketplace_velocity, ai_cluster_announcements, standards_activity,
  forecast_signals

Schedules:
  - EDGAR: weekly Mon 06:00
  - GitHub: weekly Sun 05:00
  - eBay velocity: every 12h
  - AI clusters: every 4h (news-speed)
  - Distributor leads: daily 03:30
  - Standards: weekly Wed 04:00
  - Forecast engine: daily 08:00 (after all nightly scrapers)
This commit is contained in:
Rene Fichtmueller 2026-04-02 02:02:44 +02:00
parent fc7d74e680
commit 7f1c701ba1
9 changed files with 1569 additions and 2 deletions

View File

@ -110,6 +110,15 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
// ── Compute (every 4h, after pricing waves) ─────────────────────── // ── Compute (every 4h, after pricing waves) ───────────────────────
"compute:abc", "compute:abc",
"compute:reorder-signals", "compute:reorder-signals",
// ── Prediction Signal Scrapers (new) ──────────────────────────────
"scrape:signals:sec-edgar",
"scrape:signals:github",
"scrape:signals:ebay-velocity",
"scrape:signals:ai-clusters",
"scrape:signals:distributor-leads",
"scrape:signals:standards",
// ── Forecast Engine ───────────────────────────────────────────────
"compute:forecast",
// ── Sync ────────────────────────────────────────────────────────── // ── Sync ──────────────────────────────────────────────────────────
"sync:nas", "sync:nas",
]; ];
@ -232,13 +241,36 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
await boss.schedule("compute:abc", "50 3,7,11,15,19,23 * * *", {}, { retryLimit: 2, expireInSeconds: 600 }); await boss.schedule("compute:abc", "50 3,7,11,15,19,23 * * *", {}, { retryLimit: 2, expireInSeconds: 600 });
await boss.schedule("compute:reorder-signals", "55 3,7,11,15,19,23 * * *", {}, { retryLimit: 2, expireInSeconds: 600 }); await boss.schedule("compute:reorder-signals", "55 3,7,11,15,19,23 * * *", {}, { retryLimit: 2, expireInSeconds: 600 });
// ══════════════════════════════════════════════════════════════════════
// PREDICTION SIGNAL SCRAPERS
// ══════════════════════════════════════════════════════════════════════
// SEC EDGAR CapEx — weekly Monday 06:00 (filings don't change that fast)
await boss.schedule("scrape:signals:sec-edgar", "0 6 * * 1", {}, { retryLimit: 2, expireInSeconds: 3600 });
// GitHub signals — weekly Sunday 05:00
await boss.schedule("scrape:signals:github", "0 5 * * 0", {}, { retryLimit: 2, expireInSeconds: 7200 });
// eBay sold velocity — every 12h (fast-moving market signal)
await boss.schedule("scrape:signals:ebay-velocity", "0 4,16 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 });
// AI cluster RSS feeds — every 4h (news moves fast)
await boss.schedule("scrape:signals:ai-clusters", "10 0,4,8,12,16,20 * * *", {}, { retryLimit: 2, expireInSeconds: 1800 });
// Distributor lead times — daily 03:30 (stock changes overnight)
await boss.schedule("scrape:signals:distributor-leads","30 3 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 });
// Standards tracker — weekly Wednesday 04:00 (standards move slowly)
await boss.schedule("scrape:signals:standards", "0 4 * * 3", {}, { retryLimit: 1, expireInSeconds: 3600 });
// ══════════════════════════════════════════════════════════════════════
// FORECAST ENGINE — daily at 08:00 (after all nightly scrapers done)
// ══════════════════════════════════════════════════════════════════════
await boss.schedule("compute:forecast", "0 8 * * *", {}, { retryLimit: 2, expireInSeconds: 600 });
// ══════════════════════════════════════════════════════════════════════ // ══════════════════════════════════════════════════════════════════════
// NAS SYNC — nightly // NAS SYNC — nightly
// ══════════════════════════════════════════════════════════════════════ // ══════════════════════════════════════════════════════════════════════
await boss.schedule("sync:nas", "55 7 * * *", {}, { retryLimit: 1, expireInSeconds: 1800 }); await boss.schedule("sync:nas", "55 7 * * *", {}, { retryLimit: 1, expireInSeconds: 1800 });
console.log("All schedules registered — 24/7 continuous scraping (42 jobs)"); console.log("All schedules registered — 24/7 continuous scraping (50 jobs)");
} }
export async function registerWorkers(boss: PgBoss): Promise<void> { export async function registerWorkers(boss: PgBoss): Promise<void> {
@ -270,6 +302,14 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
const { scrapeSkylane } = await import("./scrapers/skylane"); const { scrapeSkylane } = await import("./scrapers/skylane");
const { scrapeAscentOptics } = await import("./scrapers/ascentoptics"); const { scrapeAscentOptics } = await import("./scrapers/ascentoptics");
const { scrapeGaoTek } = await import("./scrapers/gaotek"); const { scrapeGaoTek } = await import("./scrapers/gaotek");
// ── Prediction signal scrapers ────────────────────────────────────────
const { scrapeSecEdgar } = await import("./scrapers/sec-edgar");
const { scrapeGithubSignals } = await import("./scrapers/github-signals");
const { scrapeEbayVelocity } = await import("./scrapers/ebay-velocity");
const { scrapeAiClusters } = await import("./scrapers/ai-clusters");
const { scrapeDistributorLeads }= await import("./scrapers/distributor-leads");
const { scrapeStandardsTracker }= await import("./scrapers/standards-tracker");
const { runForecastEngine } = await import("./utils/forecast-engine");
// ── Playwright scrapers ─────────────────────────────────────────────── // ── Playwright scrapers ───────────────────────────────────────────────
@ -482,5 +522,44 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
await runNightlyNasSync(); await runNightlyNasSync();
}); });
console.log("All workers registered (42 jobs, 24/7 continuous)"); // ── Prediction signal scrapers ────────────────────────────────────────
await boss.work("scrape:signals:sec-edgar", async () => {
console.log(`[${new Date().toISOString()}] Running: SEC EDGAR CapEx`);
await scrapeSecEdgar();
});
await boss.work("scrape:signals:github", async () => {
console.log(`[${new Date().toISOString()}] Running: GitHub tech signals`);
await scrapeGithubSignals();
});
await boss.work("scrape:signals:ebay-velocity", async () => {
console.log(`[${new Date().toISOString()}] Running: eBay sold velocity`);
await scrapeEbayVelocity();
});
await boss.work("scrape:signals:ai-clusters", async () => {
console.log(`[${new Date().toISOString()}] Running: AI cluster announcements`);
await scrapeAiClusters();
});
await boss.work("scrape:signals:distributor-leads", async () => {
console.log(`[${new Date().toISOString()}] Running: Distributor lead times`);
await scrapeDistributorLeads();
});
await boss.work("scrape:signals:standards", async () => {
console.log(`[${new Date().toISOString()}] Running: Standards tracker`);
await scrapeStandardsTracker();
});
// ── Forecast engine ───────────────────────────────────────────────────
await boss.work("compute:forecast", async () => {
console.log(`[${new Date().toISOString()}] Running: Forecast engine`);
await runForecastEngine();
});
console.log("All workers registered (50 jobs, 24/7 continuous)");
} }

View File

@ -0,0 +1,204 @@
/**
* AI Cluster & Hyperscale DC Announcement Scraper
*
* Monitors RSS feeds from:
* - DataCenterKnowledge (datacenterknowledge.com)
* - The Register (theregister.com)
* - ServeTheHome (servethehome.com)
* - DataCenter Dynamics (datacenterdynamics.com)
* - Blocks & Files (blocksandfiles.com)
*
* Extracts announcements about:
* - AI clusters (xAI, Meta, AWS, Microsoft, Google)
* - New data center builds with scale indicators
* - Network tech mentions (400G, 800G, InfiniBand, RoCE)
*
* Each announced AI cluster = predictable transceiver demand
* with 3-9 month deployment lag.
*/
import * as cheerio from "cheerio";
import { pool } from "../utils/db";
import { logger } from "../utils/logger";
import { contentHash } from "../utils/hash";
interface Announcement {
company: string;
title: string;
summary: string;
announced_date: string | null;
scale_mw: number | null;
scale_servers: number | null;
network_speed: string | null;
estimated_transceivers: number | null;
deployment_date: string | null;
location: string | null;
source_url: string;
source_name: string;
}
const RSS_FEEDS = [
{ url: "https://www.datacenterknowledge.com/rss.xml", name: "DataCenterKnowledge" },
{ url: "https://www.datacenterdynamics.com/en/rss/", name: "DataCenter Dynamics" },
{ url: "https://www.theregister.com/data_centre/rss/", name: "The Register" },
{ url: "https://blocksandfiles.com/feed/", name: "Blocks & Files" },
{ url: "https://www.nextplatform.com/feed/", name: "Next Platform" },
{ url: "https://www.servethehome.com/feed/", name: "ServeTheHome" },
];
const COMPANY_PATTERNS: Array<{ pattern: RegExp; name: string }> = [
{ pattern: /\bxAI\b/i, name: "xAI" },
{ pattern: /\bMeta\b.*?(AI|data center)/i, name: "Meta" },
{ pattern: /\bOpenAI\b/i, name: "OpenAI" },
{ pattern: /\bAWS\b|\bAmazon\b.*?cloud/i, name: "Amazon (AWS)" },
{ pattern: /\bMicrosoft\b.*?(Azure|AI)/i, name: "Microsoft" },
{ pattern: /\bGoogle\b.*?(Cloud|DeepMind)/i, name: "Google" },
{ pattern: /\bOracle\b.*?cloud/i, name: "Oracle Cloud" },
{ pattern: /\bCoreWeave\b/i, name: "CoreWeave" },
{ pattern: /\bLambda\b.*?(Labs|cloud)/i, name: "Lambda Labs" },
{ pattern: /\bNVIDIA\b.*?supercomputer/i, name: "NVIDIA" },
{ pattern: /\bApple\b.*?data center/i, name: "Apple" },
];
const AI_KEYWORDS = [
"GPU cluster", "AI supercomputer", "AI infrastructure", "data center",
"hyperscale", "AI campus", "GPU server", "AI factory", "compute cluster",
"400G", "800G", "InfiniBand", "RoCE", "co-packaged optics",
];
const SCALE_PATTERNS = [
{ pattern: /(\d+(?:\.\d+)?)\s*(?:MW|megawatt)/i, key: "mw" },
{ pattern: /(\d[\d,]*)\s*(?:GPU|H100|H200|A100|B200)/i, key: "gpus" },
{ pattern: /\$(\d+(?:\.\d+)?)\s*(?:billion|B)\b/i, key: "usd_billion" },
{ pattern: /(\d[\d,]*)\s*(?:server|node)/i, key: "servers" },
];
const NETWORK_PATTERNS = [
{ pattern: /\b800G\b/i, value: "800G" },
{ pattern: /\b400G\b/i, value: "400G" },
{ pattern: /\bInfiniBand NDR\b/i, value: "IB-NDR-400G" },
{ pattern: /\bInfiniBand HDR\b/i, value: "IB-HDR-200G" },
{ pattern: /\bInfiniBand\b/i, value: "InfiniBand" },
{ pattern: /\bRoCEv2?\b/i, value: "RoCE-Ethernet" },
{ pattern: /\b100G\b/i, value: "100G" },
];
// Rough estimate: 1 GPU server ≈ 8 transceivers for 400G networking
function estimateTransceivers(servers: number | null, mw: number | null, networkSpeed: string | null): number | null {
if (!servers && !mw) return null;
const serverEstimate = servers ?? (mw ? Math.round(mw * 300) : 0); // ~300 servers/MW
const perServer = networkSpeed?.includes("800G") ? 16 : 8;
return Math.round(serverEstimate * perServer);
}
async function parseRssFeed(feedUrl: string, sourceName: string): Promise<Announcement[]> {
const res = await fetch(feedUrl, {
headers: { "User-Agent": "TIP-DataCollector/1.0 contact@context-x.org" },
signal: AbortSignal.timeout(15000),
});
if (!res.ok) throw new Error(`RSS fetch failed: ${res.status}`);
const xml = await res.text();
const $ = cheerio.load(xml, { xmlMode: true });
const announcements: Announcement[] = [];
$("item").each((_, el) => {
const title = $(el).find("title").first().text().trim();
const link = $(el).find("link").first().text().trim() || $(el).find("guid").first().text().trim();
const desc = $(el).find("description").first().text().replace(/<[^>]+>/g, " ").trim();
const pubDate = $(el).find("pubDate").first().text().trim();
const fullText = `${title} ${desc}`;
// Check if this article is relevant
const isRelevant = AI_KEYWORDS.some(kw => fullText.toLowerCase().includes(kw.toLowerCase()));
if (!isRelevant) return;
// Extract company
let company = "Unknown";
for (const cp of COMPANY_PATTERNS) {
if (cp.pattern.test(fullText)) { company = cp.name; break; }
}
// Extract scale
let scaleMw: number | null = null;
let scaleServers: number | null = null;
for (const sp of SCALE_PATTERNS) {
const m = fullText.match(sp.pattern);
if (m) {
const v = parseFloat(m[1].replace(/,/g, ""));
if (sp.key === "mw") scaleMw = v;
else if (sp.key === "gpus" || sp.key === "servers") scaleServers = Math.round(v / 8) * 8; // normalize
else if (sp.key === "usd_billion") scaleServers = Math.round(v * 2000); // rough estimate
}
}
// Extract network speed
let networkSpeed: string | null = null;
for (const np of NETWORK_PATTERNS) {
if (np.pattern.test(fullText)) { networkSpeed = np.value; break; }
}
// Parse date
let announcedDate: string | null = null;
if (pubDate) {
try { announcedDate = new Date(pubDate).toISOString().split("T")[0]; } catch { /* ignore */ }
}
const summary = desc.substring(0, 500);
announcements.push({
company,
title: title.substring(0, 300),
summary,
announced_date: announcedDate,
scale_mw: scaleMw,
scale_servers: scaleServers,
network_speed: networkSpeed,
estimated_transceivers: estimateTransceivers(scaleServers, scaleMw, networkSpeed),
deployment_date: null, // extracted from text in future
location: null,
source_url: link,
source_name: sourceName,
});
});
return announcements;
}
export async function scrapeAiClusters(): Promise<void> {
logger.info("AI cluster announcement scraper starting");
let newItems = 0;
for (const feed of RSS_FEEDS) {
try {
logger.info(`Fetching: ${feed.name}`);
await new Promise(r => setTimeout(r, 1000));
const announcements = await parseRssFeed(feed.url, feed.name);
logger.info(`${feed.name}: ${announcements.length} relevant articles found`);
for (const a of announcements) {
const hash = contentHash(`${a.title}${a.source_url}`);
try {
await pool.query(`
INSERT INTO ai_cluster_announcements
(company, title, summary, announced_date, scale_mw, scale_servers,
network_speed, estimated_transceivers, source_url, source_name, content_hash)
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11)
ON CONFLICT (content_hash) DO NOTHING
`, [
a.company, a.title, a.summary, a.announced_date,
a.scale_mw, a.scale_servers, a.network_speed,
a.estimated_transceivers, a.source_url, a.source_name, hash,
]);
newItems++;
} catch { /* duplicate */ }
}
} catch (err) {
logger.warn(`AI cluster feed failed: ${feed.name}`, { err });
}
}
logger.info(`AI cluster scraper done — ${newItems} items recorded`);
}

View File

@ -0,0 +1,241 @@
/**
* Distributor Lead Time & Stock Monitor
*
* Scrapes Mouser Electronics and Digi-Key for transceiver
* availability, stock levels, and lead times.
*
* Lead time is a 2-4 month supply constraint indicator:
* Lead time > 12 weeks price increase likely in 6-8 weeks
* Lead time < 4 weeks excess supply, price pressure downward
* In Stock / large qty commodity phase
*
* No API keys required uses public catalog search pages.
*/
import * as cheerio from "cheerio";
import { pool } from "../utils/db";
import { logger } from "../utils/logger";
const HEADERS = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xhtml",
"Accept-Language": "en-US,en;q=0.9",
};
interface LeadEntry {
distributor: string;
formFactor: string;
speedLabel: string;
partNumber: string;
productName: string;
inStock: boolean;
stockQty: number | null;
leadTimeWeeks: number | null;
priceUsd: number | null;
productUrl: string;
}
// ─── Mouser ────────────────────────────────────────────────────────────────
const MOUSER_SEARCHES = [
{ url: "https://www.mouser.com/c/optoelectronics/fiber-optic-components/fiber-optic-transceivers-pluggable/?q=100G&instock=y", form: "QSFP28", speed: "100G" },
{ url: "https://www.mouser.com/c/optoelectronics/fiber-optic-components/fiber-optic-transceivers-pluggable/?q=400G", form: "QSFP-DD", speed: "400G" },
{ url: "https://www.mouser.com/c/optoelectronics/fiber-optic-components/fiber-optic-transceivers-pluggable/?q=800G", form: "OSFP", speed: "800G" },
{ url: "https://www.mouser.com/c/optoelectronics/fiber-optic-components/fiber-optic-transceivers-pluggable/?q=SFP+10G", form: "SFP+", speed: "10G" },
];
async function scrapeMouser(entry: { url: string; form: string; speed: string }): Promise<LeadEntry[]> {
const res = await fetch(entry.url, { headers: HEADERS, signal: AbortSignal.timeout(20000) });
if (!res.ok) throw new Error(`Mouser ${res.status}`);
const $ = cheerio.load(await res.text());
const results: LeadEntry[] = [];
$(".product-table tr[data-part-number]").each((_, el) => {
const partNumber = $(el).attr("data-part-number") ?? "";
const productName = $(el).find(".part-description a").first().text().trim();
const productUrl = `https://www.mouser.com${$(el).find(".part-description a").first().attr("href") ?? ""}`;
const priceText = $(el).find(".pricing-buy-price").first().text().replace(/,/g, "").trim();
const price = parseFloat(priceText.replace(/[^0-9.]/g, "")) || null;
const availText = $(el).find(".avail-text").first().text().trim();
const qtyMatch = availText.match(/([\d,]+)\s*In Stock/i);
const inStock = qtyMatch != null || availText.toLowerCase().includes("in stock");
const stockQty = qtyMatch ? parseInt(qtyMatch[1].replace(/,/g, "")) : (inStock ? 1 : null);
let leadTimeWeeks: number | null = null;
const leadMatch = availText.match(/(\d+)\s*(?:week|wk)/i);
if (leadMatch) leadTimeWeeks = parseInt(leadMatch[1]);
else if (!inStock) leadTimeWeeks = null; // non-stocked
if (!partNumber && !productName) return;
results.push({
distributor: "mouser",
formFactor: entry.form,
speedLabel: entry.speed,
partNumber,
productName: productName.substring(0, 200),
inStock,
stockQty,
leadTimeWeeks,
priceUsd: price,
productUrl: productUrl.substring(0, 500),
});
});
return results;
}
// ─── Digi-Key ──────────────────────────────────────────────────────────────
const DIGIKEY_SEARCHES = [
{ url: "https://www.digikey.com/en/products/filter/fiber-optic-transceivers-pluggable/814?q=100G&s=N4IgjCBcoA2oBhUBjEAzAhgGwM4FMBXAFwHsoBOABgDoA2AJgFYBOGoA", form: "QSFP28", speed: "100G" },
{ url: "https://www.digikey.com/en/products/filter/fiber-optic-transceivers-pluggable/814?q=400G", form: "QSFP-DD", speed: "400G" },
];
async function scrapeDigikey(entry: { url: string; form: string; speed: string }): Promise<LeadEntry[]> {
const res = await fetch(entry.url, { headers: HEADERS, signal: AbortSignal.timeout(20000) });
if (!res.ok) throw new Error(`Digi-Key ${res.status}`);
const $ = cheerio.load(await res.text());
const results: LeadEntry[] = [];
// Digi-Key uses React-rendered tables — grab what's in the SSR HTML
$("tr[data-partid], .product-table tbody tr").each((_, el) => {
const cells = $(el).find("td");
if (cells.length < 4) return;
const partLink = $(cells[0]).find("a").first();
const partNumber = partLink.text().trim() || $(cells[0]).text().trim();
const productName = $(cells[1]).text().trim();
const priceText = $(cells[2]).text().replace(/,/g, "").trim();
const price = parseFloat(priceText.replace(/[^0-9.]/g, "")) || null;
const qtyText = $(cells[3]).text().replace(/,/g, "").trim();
const qty = parseInt(qtyText.replace(/[^0-9]/g, "")) || 0;
if (!partNumber || partNumber.length < 3) return;
results.push({
distributor: "digikey",
formFactor: entry.form,
speedLabel: entry.speed,
partNumber: partNumber.substring(0, 100),
productName: productName.substring(0, 200),
inStock: qty > 0,
stockQty: qty || null,
leadTimeWeeks: qty === 0 ? null : 0,
priceUsd: price,
productUrl: `https://www.digikey.com${partLink.attr("href") ?? ""}`,
});
});
return results;
}
// ─── RS Components ─────────────────────────────────────────────────────────
async function scrapeRsComponents(): Promise<LeadEntry[]> {
const url = "https://uk.rs-online.com/web/c/optoelectronics/fibre-optic-components/fibre-optic-transceivers/?applied-dimensions=4294958026&sortby=P_PRICE&st=400G";
const res = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(20000) });
if (!res.ok) throw new Error(`RS ${res.status}`);
const $ = cheerio.load(await res.text());
const results: LeadEntry[] = [];
$(".product-info-wrap").each((_, el) => {
const partNumber = $(el).find(".product-number").first().text().trim().replace("RS Stock No.", "");
const productName = $(el).find(".product-title a").first().text().trim();
const productUrl = `https://uk.rs-online.com${$(el).find(".product-title a").attr("href") ?? ""}`;
const priceText = $(el).find(".price-info .price").first().text().trim();
const price = parseFloat(priceText.replace(/[^0-9.]/g, "")) || null;
const stockText = $(el).find(".stock-status").first().text().trim();
const inStock = /in stock|available/i.test(stockText);
if (!productName) return;
results.push({
distributor: "rs-components",
formFactor: "QSFP-DD",
speedLabel: "400G",
partNumber: partNumber.trim(),
productName: productName.substring(0, 200),
inStock,
stockQty: null,
leadTimeWeeks: inStock ? 0 : null,
priceUsd: price, // GBP, stored as-is — currency conversion handled in forecast
productUrl: productUrl.substring(0, 500),
});
});
return results;
}
export async function scrapeDistributorLeads(): Promise<void> {
logger.info("Distributor lead time scraper starting");
let recorded = 0;
// Mouser
for (const s of MOUSER_SEARCHES) {
try {
await new Promise(r => setTimeout(r, 3000));
logger.info(`Mouser: ${s.speed} ${s.form}`);
const entries = await scrapeMouser(s);
for (const e of entries.slice(0, 50)) { // limit to 50 per search
await pool.query(`
INSERT INTO distributor_lead_times
(distributor, form_factor, speed_label, part_number, product_name,
in_stock, stock_qty, lead_time_weeks, price_usd, product_url)
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10)
`, [e.distributor, e.formFactor, e.speedLabel, e.partNumber, e.productName,
e.inStock, e.stockQty, e.leadTimeWeeks, e.priceUsd, e.productUrl]);
recorded++;
}
logger.info(`Mouser ${s.speed}: ${entries.length} products`);
} catch (err) {
logger.warn(`Mouser scrape failed: ${s.speed}`, { err });
}
}
// Digi-Key
for (const s of DIGIKEY_SEARCHES) {
try {
await new Promise(r => setTimeout(r, 4000));
logger.info(`Digi-Key: ${s.speed} ${s.form}`);
const entries = await scrapeDigikey(s);
for (const e of entries.slice(0, 30)) {
await pool.query(`
INSERT INTO distributor_lead_times
(distributor, form_factor, speed_label, part_number, product_name,
in_stock, stock_qty, lead_time_weeks, price_usd, product_url)
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10)
`, [e.distributor, e.formFactor, e.speedLabel, e.partNumber, e.productName,
e.inStock, e.stockQty, e.leadTimeWeeks, e.priceUsd, e.productUrl]);
recorded++;
}
logger.info(`Digi-Key ${s.speed}: ${entries.length} products`);
} catch (err) {
logger.warn(`Digi-Key scrape failed: ${s.speed}`, { err });
}
}
// RS Components
try {
await new Promise(r => setTimeout(r, 3000));
const rsEntries = await scrapeRsComponents();
for (const e of rsEntries.slice(0, 30)) {
await pool.query(`
INSERT INTO distributor_lead_times
(distributor, form_factor, speed_label, part_number, product_name,
in_stock, stock_qty, lead_time_weeks, price_usd, product_url)
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10)
`, [e.distributor, e.formFactor, e.speedLabel, e.partNumber, e.productName,
e.inStock, e.stockQty, e.leadTimeWeeks, e.priceUsd, e.productUrl]);
recorded++;
}
logger.info(`RS Components: ${rsEntries.length} products`);
} catch (err) {
logger.warn("RS Components scrape failed", { err });
}
logger.info(`Distributor lead time scraper done — ${recorded} records`);
}

View File

@ -0,0 +1,134 @@
/**
* eBay Marketplace Velocity Scraper
*
* Tracks SOLD listing counts and price distributions to measure
* actual market demand velocity a 1-3 month leading indicator.
*
* Scrapes eBay's public completed/sold listings page (no API needed).
* Looks at: sold count last 30 days, active listing count, price spread.
*
* High sold velocity + rising prices buy signal
* Falling velocity + dropping prices commodity transition in progress
*/
import * as cheerio from "cheerio";
import { pool } from "../utils/db";
import { logger } from "../utils/logger";
const HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept-Language": "en-US,en;q=0.9",
"Accept": "text/html,application/xhtml+xml,application/xhtml,application/xml;q=0.9,*/*;q=0.8",
};
const SEARCH_TERMS: Array<{ keyword: string; formFactor: string; speedLabel: string }> = [
{ keyword: "QSFP28 100G transceiver", formFactor: "QSFP28", speedLabel: "100G" },
{ keyword: "QSFP+ 40G transceiver", formFactor: "QSFP+", speedLabel: "40G" },
{ keyword: "QSFP28 400G transceiver", formFactor: "QSFP28", speedLabel: "400G" },
{ keyword: "QSFP-DD 400G transceiver", formFactor: "QSFP-DD", speedLabel: "400G" },
{ keyword: "QSFP-DD 800G transceiver", formFactor: "QSFP-DD", speedLabel: "800G" },
{ keyword: "SFP+ 10G transceiver", formFactor: "SFP+", speedLabel: "10G" },
{ keyword: "OSFP 400G transceiver", formFactor: "OSFP", speedLabel: "400G" },
{ keyword: "CFP2 100G coherent DCO", formFactor: "CFP2", speedLabel: "100G" },
{ keyword: "400ZR coherent transceiver", formFactor: "QSFP-DD", speedLabel: "400G-ZR"},
];
function parsePrice(text: string): number | null {
const m = text.replace(/,/g, "").match(/[\$£€]?\s*([\d.]+)/);
return m ? parseFloat(m[1]) : null;
}
async function scrapeEbaySoldListings(
keyword: string
): Promise<{ soldCount: number; activeCount: number; avgSoldPrice: number | null; minPrice: number | null; maxPrice: number | null }> {
const enc = encodeURIComponent(keyword);
// Completed (sold) listings
const soldUrl = `https://www.ebay.com/sch/i.html?_nkw=${enc}&LH_Complete=1&LH_Sold=1&_sop=12&_ipg=200`;
const res = await fetch(soldUrl, { headers: HEADERS, signal: AbortSignal.timeout(20000) });
if (!res.ok) throw new Error(`eBay returned ${res.status}`);
const html = await res.text();
const $ = cheerio.load(html);
const prices: number[] = [];
let soldCount = 0;
// Count sold items and extract prices
$(".s-item").each((_, el) => {
const priceText = $(el).find(".s-item__price").first().text().trim();
const price = parsePrice(priceText);
const isSold = $(el).find(".POSITIVE").length > 0 || $(el).find(".s-item__caption--signal").text().includes("Sold");
if (price && price > 0 && price < 50000) {
prices.push(price);
if (isSold) soldCount++;
}
});
// Total count from results summary
const resultText = $(".srp-controls__count-heading").text();
const totalMatch = resultText.replace(/,/g, "").match(/([\d,]+)/);
if (totalMatch) soldCount = Math.max(soldCount, parseInt(totalMatch[1]));
// Active listings (separate search)
await new Promise(r => setTimeout(r, 1500));
const activeUrl = `https://www.ebay.com/sch/i.html?_nkw=${enc}&_sop=12&_ipg=1&LH_BIN=1`;
let activeCount = 0;
try {
const activeRes = await fetch(activeUrl, { headers: HEADERS, signal: AbortSignal.timeout(15000) });
if (activeRes.ok) {
const activeHtml = await activeRes.text();
const $a = cheerio.load(activeHtml);
const activeText = $a(".srp-controls__count-heading").text().replace(/,/g, "");
const am = activeText.match(/([\d]+)/);
if (am) activeCount = parseInt(am[1]);
}
} catch { /* ignore active count failure */ }
const avgSoldPrice = prices.length > 0
? Math.round(prices.reduce((a, b) => a + b, 0) / prices.length * 100) / 100
: null;
return {
soldCount,
activeCount,
avgSoldPrice,
minPrice: prices.length > 0 ? Math.min(...prices) : null,
maxPrice: prices.length > 0 ? Math.max(...prices) : null,
};
}
export async function scrapeEbayVelocity(): Promise<void> {
logger.info("eBay velocity scraper starting");
let recorded = 0;
for (const term of SEARCH_TERMS) {
try {
logger.info(`Checking eBay velocity: ${term.keyword}`);
await new Promise(r => setTimeout(r, 3000)); // be gentle
const result = await scrapeEbaySoldListings(term.keyword);
await pool.query(`
INSERT INTO marketplace_velocity
(marketplace, keyword, form_factor, speed_label, sold_count_30d,
active_listings, avg_sold_price, min_price, max_price, currency)
VALUES ('ebay', $1, $2, $3, $4, $5, $6, $7, $8, 'USD')
`, [
term.keyword, term.formFactor, term.speedLabel,
result.soldCount, result.activeCount, result.avgSoldPrice,
result.minPrice, result.maxPrice,
]);
recorded++;
logger.info(
`${term.speedLabel} ${term.formFactor}: ${result.soldCount} sold, ` +
`${result.activeCount} active, avg $${result.avgSoldPrice ?? "?"}`
);
} catch (err) {
logger.warn(`eBay velocity failed: ${term.keyword}`, { err });
}
}
logger.info(`eBay velocity scraper done — ${recorded} records`);
}

View File

@ -0,0 +1,163 @@
/**
* GitHub Technology Adoption Signal Collector
*
* Tracks how fast key networking technologies are adopted in open-source
* projects by measuring commit/repo/star activity weekly.
*
* Technologies tracked: 400G, 800G, ZR, CPO, silicon-photonics, CMIS, 1.6T
*
* Key repos monitored:
* - sonic-net/sonic-buildimage (SONiC NOS)
* - openconfig/public (vendor-neutral config)
* - DPDK/dpdk (data plane)
* - Arista/sonic-mgmt
* - netdisco/netdisco
*
* GitHub activity is a 6-18 month leading indicator for enterprise adoption.
*/
import { pool } from "../utils/db";
import { logger } from "../utils/logger";
const GITHUB_API = "https://api.github.com";
const TOKEN = process.env.GITHUB_TOKEN || "";
const HEADERS: Record<string, string> = {
"User-Agent": "TIP-DataCollector/1.0",
"Accept": "application/vnd.github.v3+json",
...(TOKEN ? { "Authorization": `Bearer ${TOKEN}` } : {}),
};
const TECHNOLOGIES = [
{ key: "400G", queries: ["400G transceiver", "QSFP28 400G", "400GbE"] },
{ key: "800G", queries: ["800G transceiver", "QSFP-DD 800G", "800GbE"] },
{ key: "ZR", queries: ["400ZR transceiver", "OpenZR+ optical", "coherent ZR"] },
{ key: "CPO", queries: ["co-packaged optics", "CPO networking"] },
{ key: "silicon-photonics",queries: ["silicon photonics transceiver", "SiPh networking"] },
{ key: "CMIS", queries: ["CMIS 5.0 transceiver", "CMIS optical module"] },
{ key: "1.6T", queries: ["1.6T ethernet", "1.6Tbps networking"] },
{ key: "OSFP", queries: ["OSFP transceiver", "OSFP-XD module"] },
];
const KEY_REPOS = [
"sonic-net/sonic-buildimage",
"openconfig/public",
"DPDK/dpdk",
"netdisco/netdisco",
"FRRouting/frr",
];
async function githubGet<T>(path: string): Promise<T> {
const res = await fetch(`${GITHUB_API}${path}`, { headers: HEADERS });
if (res.status === 403) throw new Error("GitHub rate limited");
if (!res.ok) throw new Error(`GitHub API error: ${res.status} ${path}`);
return res.json() as Promise<T>;
}
async function searchRepoCount(query: string): Promise<number> {
const enc = encodeURIComponent(query);
const data = await githubGet<{ total_count: number }>(
`/search/repositories?q=${enc}&per_page=1`
);
return data.total_count;
}
async function searchCodeCount(query: string): Promise<number> {
const enc = encodeURIComponent(query);
const data = await githubGet<{ total_count: number }>(
`/search/code?q=${enc}&per_page=1`
);
return data.total_count;
}
async function getRepoWeeklyCommits(repo: string): Promise<number> {
// Returns commits in last 52 weeks, we take last week
const data = await githubGet<Array<{ total: number }>>(
`/repos/${repo}/stats/commit_activity`
);
if (!Array.isArray(data) || data.length === 0) return 0;
return data[data.length - 1]?.total ?? 0;
}
async function getRepoStars(repo: string): Promise<number> {
const data = await githubGet<{ stargazers_count: number }>(
`/repos/${repo}`
);
return data.stargazers_count;
}
function getWeekStart(): Date {
const d = new Date();
d.setUTCHours(0, 0, 0, 0);
d.setUTCDate(d.getUTCDate() - d.getUTCDay()); // Sunday
return d;
}
async function upsertSignal(technology: string, metric: string, context: string, value: number, weekStart: Date) {
await pool.query(`
INSERT INTO github_tech_signals (technology, metric, repo_context, value, week_start)
VALUES ($1, $2, $3, $4, $5)
ON CONFLICT (technology, metric, repo_context, week_start)
DO UPDATE SET value = EXCLUDED.value
`, [technology, metric, context, value, weekStart]);
}
export async function scrapeGithubSignals(): Promise<void> {
logger.info("GitHub signals scraper starting");
const weekStart = getWeekStart();
let signalCount = 0;
// 1. Technology repo count (how many repos mention each tech)
for (const tech of TECHNOLOGIES) {
let totalRepos = 0;
for (const q of tech.queries) {
try {
await new Promise(r => setTimeout(r, 1200)); // stay under rate limit
const count = await searchRepoCount(`${q} language:Python language:Go language:TypeScript language:C`);
totalRepos += count;
} catch (err) {
logger.warn(`GitHub repo search failed: ${q}`, { err });
}
}
await upsertSignal(tech.key, "repo_count", "all", totalRepos, weekStart);
signalCount++;
logger.info(`${tech.key}: ${totalRepos} repos`);
await new Promise(r => setTimeout(r, 2000));
}
// 2. Key repo commit activity (weekly)
for (const repo of KEY_REPOS) {
try {
await new Promise(r => setTimeout(r, 1000));
const commits = await getRepoWeeklyCommits(repo);
const stars = await getRepoStars(repo);
await upsertSignal("all-networking", "weekly_commits", repo, commits, weekStart);
await upsertSignal("all-networking", "stars", repo, stars, weekStart);
signalCount += 2;
logger.info(`${repo}: ${commits} commits last week, ${stars} stars`);
} catch (err) {
logger.warn(`GitHub repo stats failed: ${repo}`, { err });
}
}
// 3. SONiC-specific tech signals (most valuable for transceiver market)
const sonicTechSearches = [
{ tech: "400G", q: "400G repo:sonic-net/sonic-buildimage" },
{ tech: "800G", q: "800G repo:sonic-net/sonic-buildimage" },
{ tech: "CMIS", q: "CMIS repo:sonic-net/sonic-buildimage" },
{ tech: "OSFP", q: "OSFP repo:sonic-net/sonic-buildimage" },
{ tech: "ZR", q: "400ZR repo:sonic-net/sonic-buildimage" },
];
for (const s of sonicTechSearches) {
try {
await new Promise(r => setTimeout(r, 1500));
const count = await searchCodeCount(s.q);
await upsertSignal(s.tech, "sonic_code_refs", "sonic-net/sonic-buildimage", count, weekStart);
signalCount++;
logger.info(`SONiC ${s.tech}: ${count} code references`);
} catch (err) {
logger.warn(`SONiC code search failed: ${s.q}`, { err });
}
}
logger.info(`GitHub signals scraper done — ${signalCount} signals recorded`);
}

View File

@ -0,0 +1,134 @@
/**
* SEC EDGAR Hyperscaler CapEx Scraper
*
* Uses the SEC EDGAR XBRL API (free, no auth) to extract quarterly CapEx
* from Amazon, Microsoft, Google/Alphabet, and Meta 10-Q/10-K filings.
*
* XBRL concept: us-gaap/PaymentsToAcquirePropertyPlantAndEquipment
* API: https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json
*
* Hyperscaler DC CapEx is the strongest 6-12 month leading indicator
* for 400G/800G transceiver demand.
*/
import { pool } from "../utils/db";
import { logger } from "../utils/logger";
const COMPANIES: Record<string, { name: string; cik: string; dcPct: number }> = {
amazon: { name: "Amazon (AWS)", cik: "0001018724", dcPct: 0.65 }, // ~65% of CapEx is AWS infra
microsoft: { name: "Microsoft Azure", cik: "0000789019", dcPct: 0.55 },
alphabet: { name: "Google Cloud", cik: "0001652044", dcPct: 0.60 },
meta: { name: "Meta AI/DC", cik: "0001326801", dcPct: 0.85 }, // almost all meta capex is DC
};
const EDGAR_BASE = "https://data.sec.gov/api/xbrl/companyfacts";
const HEADERS = {
"User-Agent": "TIP-DataCollector/1.0 contact@context-x.org",
Accept: "application/json",
};
interface XbrlUnit {
end: string; // ISO date
val: number; // value in USD
form: string; // '10-Q' or '10-K'
filed: string;
frame?: string; // 'CY2024Q1' etc
accn: string; // accession number
fp?: string; // Q1, Q2, Q3, FY
fy?: number; // fiscal year
}
async function fetchCapexData(cik: string): Promise<XbrlUnit[]> {
const url = `${EDGAR_BASE}/CIK${cik}.json`;
const res = await fetch(url, { headers: HEADERS });
if (!res.ok) throw new Error(`EDGAR fetch failed for CIK ${cik}: ${res.status}`);
const data = await res.json() as {
facts: {
"us-gaap"?: {
PaymentsToAcquirePropertyPlantAndEquipment?: {
units: { USD: XbrlUnit[] }
}
}
}
};
return data.facts?.["us-gaap"]
?.PaymentsToAcquirePropertyPlantAndEquipment
?.units?.USD ?? [];
}
function labelFromFrame(unit: XbrlUnit): string {
if (unit.frame) {
const m = unit.frame.match(/CY(\d{4})(Q\d)?/);
if (m) return m[2] ? `${m[2]} ${m[1]}` : `FY ${m[1]}`;
}
if (unit.fp && unit.fy) return `${unit.fp === "FY" ? "FY" : unit.fp} ${unit.fy}`;
return unit.end.substring(0, 7); // YYYY-MM
}
export async function scrapeSecEdgar(): Promise<void> {
logger.info("SEC EDGAR CapEx scraper starting");
let inserted = 0;
for (const [key, company] of Object.entries(COMPANIES)) {
try {
logger.info(`Fetching EDGAR data for ${company.name}`);
await new Promise(r => setTimeout(r, 800)); // respect SEC rate limit: 10 req/sec
const units = await fetchCapexData(company.cik);
if (!units.length) { logger.warn(`No XBRL data for ${company.name}`); continue; }
// Filter to quarterly 10-Q/10-K filings from last 3 years
const cutoff = new Date();
cutoff.setFullYear(cutoff.getFullYear() - 3);
const recent = units
.filter(u => (u.form === "10-Q" || u.form === "10-K") && new Date(u.end) >= cutoff)
.sort((a, b) => new Date(b.end).getTime() - new Date(a.end).getTime());
// Deduplicate by period end date — keep most recently filed
const seen = new Map<string, XbrlUnit>();
for (const u of recent) {
if (!seen.has(u.end)) seen.set(u.end, u);
}
for (const unit of seen.values()) {
const capexM = unit.val / 1_000_000; // convert to millions
const dcCapexM = Math.round(capexM * company.dcPct * 10) / 10;
const periodLabel = labelFromFrame(unit);
const sourceUrl = `https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=${company.cik}&type=${unit.form}&dateb=&owner=include&count=40`;
await pool.query(`
INSERT INTO hyperscaler_capex
(company, period_label, period_end, capex_usd_millions, dc_capex_est_millions, source_url, filing_type)
VALUES ($1, $2, $3, $4, $5, $6, $7)
ON CONFLICT (company, period_end) DO UPDATE SET
capex_usd_millions = EXCLUDED.capex_usd_millions,
dc_capex_est_millions = EXCLUDED.dc_capex_est_millions,
period_label = EXCLUDED.period_label,
filing_type = EXCLUDED.filing_type
`, [key, periodLabel, unit.end, Math.round(capexM * 10) / 10, dcCapexM, sourceUrl, unit.form]);
inserted++;
}
// Compute YoY growth for most recent period
await pool.query(`
UPDATE hyperscaler_capex h1
SET yoy_growth_pct = ROUND(
(h1.capex_usd_millions - h2.capex_usd_millions) / NULLIF(h2.capex_usd_millions, 0) * 100, 1
)
FROM hyperscaler_capex h2
WHERE h1.company = $1
AND h2.company = $1
AND h2.period_end = h1.period_end - INTERVAL '1 year'
AND h1.yoy_growth_pct IS NULL
`, [key]);
logger.info(`${company.name}: ${seen.size} periods upserted`);
} catch (err) {
logger.error(`EDGAR scraper failed for ${company.name}`, { err });
}
}
logger.info(`SEC EDGAR scraper done — ${inserted} records`);
}

View File

@ -0,0 +1,199 @@
/**
* Network Standards & Draft Activity Tracker
*
* Monitors the status of key networking standards that directly
* affect transceiver form factor adoption timelines:
*
* - IEEE 802.3 (Ethernet PHY standards)
* - IETF (network protocols)
* - OIF (Optical Internetworking Forum MSA agreements)
* - CMIS (Common Management Interface Specification)
*
* Standard status is a 12-24 month leading indicator:
* "Draft 3.0 approved" ~18 months to mass-market products
* "Published" OEMs start shipping within 12 months
* "Ballot closed" final spec in ~3 months
*/
import * as cheerio from "cheerio";
import { pool } from "../utils/db";
import { logger } from "../utils/logger";
const HEADERS = {
"User-Agent": "TIP-DataCollector/1.0 contact@context-x.org",
"Accept": "text/html,application/xhtml+xml",
};
interface StandardStatus {
standard_body: string;
standard_name: string;
technology: string;
current_status: string;
draft_version: string | null;
approval_date: string | null;
source_url: string;
notes: string | null;
}
// ─── IEEE 802.3 ────────────────────────────────────────────────────────────
async function scrapeIeee802(): Promise<StandardStatus[]> {
const url = "https://www.ieee802.org/3/";
const res = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(20000) });
if (!res.ok) throw new Error(`IEEE ${res.status}`);
const $ = cheerio.load(await res.text());
const results: StandardStatus[] = [];
// IEEE 802.3 project table
$("table tr").each((_, row) => {
const cells = $(row).find("td, th");
if (cells.length < 2) return;
const taskName = $(cells[0]).text().trim();
const statusText = $(cells[1]).text().trim();
const link = $(cells[0]).find("a").first().attr("href");
// Filter for high-speed Ethernet projects
const speedMatch = taskName.match(/\b(100G|200G|400G|800G|1\.6T|1600G)\b/i)
|| taskName.match(/802\.3(bs|cd|cu|ck|df|dk|dj|dl)\b/i);
if (!speedMatch) return;
const tech = taskName.match(/\b(400G|800G|1\.6T|100G|200G)\b/i)?.[1] ?? "high-speed";
let status = "in-progress";
if (/published|approved/i.test(statusText)) status = "published";
else if (/ballot/i.test(statusText)) status = "ballot";
else if (/withdrawn|cancelled/i.test(statusText)) status = "cancelled";
results.push({
standard_body: "ieee",
standard_name: taskName.substring(0, 100),
technology: tech,
current_status: status,
draft_version: null,
approval_date: null,
source_url: link ? `https://www.ieee802.org/3/${link}` : url,
notes: statusText.substring(0, 200),
});
});
return results;
}
// ─── OIF ──────────────────────────────────────────────────────────────────
async function scrapeOif(): Promise<StandardStatus[]> {
const url = "https://www.oiforum.com/technical-work/hot-topics/";
const res = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(20000) });
if (!res.ok) throw new Error(`OIF ${res.status}`);
const $ = cheerio.load(await res.text());
const results: StandardStatus[] = [];
$(".entry-content h2, .entry-content h3").each((_, el) => {
const heading = $(el).text().trim();
if (!heading) return;
const link = $(el).find("a").first().attr("href") || url;
const techMatch = heading.match(/\b(400ZR|800ZR|CMIS|400G|800G|CPO|OIF-)\S*/i);
if (!techMatch) return;
const tech = heading.includes("ZR") ? heading.includes("800") ? "800G-ZR" : "400G-ZR"
: heading.includes("CMIS") ? "CMIS"
: heading.includes("800") ? "800G"
: heading.includes("400") ? "400G"
: "optical";
results.push({
standard_body: "oif",
standard_name: heading.substring(0, 100),
technology: tech,
current_status: "in-progress", // OIF IA are usually in-progress or published
draft_version: null,
approval_date: null,
source_url: link,
notes: null,
});
});
return results;
}
// ─── IETF Datatracker ─────────────────────────────────────────────────────
async function scrapeIetf(): Promise<StandardStatus[]> {
// IETF has a proper REST API
const url = "https://datatracker.ietf.org/api/v1/doc/document/?type=draft&name__contains=optical&limit=20&offset=0";
const res = await fetch(url, {
headers: { ...HEADERS, Accept: "application/json" },
signal: AbortSignal.timeout(15000),
});
if (!res.ok) throw new Error(`IETF API ${res.status}`);
const data = await res.json() as {
objects: Array<{
name: string; title: string; abstract: string;
std_level: string | null; stream: string;
}>
};
return (data.objects ?? []).map(doc => {
const tech = doc.name.includes("400g") ? "400G"
: doc.name.includes("800g") ? "800G"
: doc.title.match(/\b(400G|800G|ZR|coherent)\b/i)?.[1] ?? "optical";
return {
standard_body: "ietf",
standard_name: doc.name.substring(0, 100),
technology: tech,
current_status: doc.std_level ? "published" : "in-progress",
draft_version: null,
approval_date: null,
source_url: `https://datatracker.ietf.org/doc/${doc.name}/`,
notes: doc.title.substring(0, 200),
};
});
}
export async function scrapeStandardsTracker(): Promise<void> {
logger.info("Standards tracker starting");
let updated = 0;
const scrapers: Array<{ name: string; fn: () => Promise<StandardStatus[]> }> = [
{ name: "IEEE 802.3", fn: scrapeIeee802 },
{ name: "OIF", fn: scrapeOif },
{ name: "IETF", fn: scrapeIetf },
];
for (const s of scrapers) {
try {
await new Promise(r => setTimeout(r, 2000));
logger.info(`Checking ${s.name}`);
const standards = await s.fn();
for (const std of standards) {
await pool.query(`
INSERT INTO standards_activity
(standard_body, standard_name, technology, current_status,
draft_version, approval_date, source_url, notes, last_checked)
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,NOW())
ON CONFLICT (standard_body, standard_name) DO UPDATE SET
current_status = EXCLUDED.current_status,
technology = EXCLUDED.technology,
notes = COALESCE(EXCLUDED.notes, standards_activity.notes),
last_checked = NOW()
`, [std.standard_body, std.standard_name, std.technology, std.current_status,
std.draft_version, std.approval_date, std.source_url, std.notes]);
updated++;
}
logger.info(`${s.name}: ${standards.length} standards checked`);
} catch (err) {
logger.warn(`Standards scraper failed: ${s.name}`, { err });
}
}
logger.info(`Standards tracker done — ${updated} records updated`);
}

View File

@ -0,0 +1,282 @@
/**
* TIP Forecast Engine
*
* Aggregates all prediction signals into demand index + price forecast
* per technology, for 3/9/12/18 month horizons.
*
* Signal weights (empirically estimated, tunable):
*
* Signal Type | Weight | Lead Time | Notes
*
* Hyperscaler CapEx YoY | 0.30 | 6-12 mo | Strongest single signal
* AI Cluster Announcements | 0.25 | 3-9 mo | Direct DC build demand
* eBay Sold Velocity | 0.20 | 1-3 mo | Actual market demand now
* Distributor Lead Times | 0.15 | 2-4 mo | Supply constraint proxy
* GitHub Tech Signals | 0.06 | 6-18 mo | Enterprise adoption rate
* Standards Status | 0.04 | 12-24 mo | Long-horizon technology wave
*
* Forecast output:
* demand_index 0-100 composite score
* price_direction 'rising'|'falling'|'stable'
* price_delta_pct estimated % change over horizon
* confidence 0.0-1.0 based on how many signals available
*/
import { pool } from "./db";
import { logger } from "./logger";
const TECHNOLOGIES = [
"100G-QSFP28",
"400G-QSFP-DD",
"400G-ZR",
"800G-OSFP",
"10G-SFP+",
];
const TECH_KEYWORDS: Record<string, { speeds: string[]; formFactors: string[]; githubKeys: string[] }> = {
"100G-QSFP28": { speeds: ["100G"], formFactors: ["QSFP28"], githubKeys: ["400G"] }, // 100G still riding 400G wave
"400G-QSFP-DD": { speeds: ["400G"], formFactors: ["QSFP-DD", "QSFP28"], githubKeys: ["400G", "OSFP"] },
"400G-ZR": { speeds: ["400G-ZR","400G"], formFactors: ["QSFP-DD"], githubKeys: ["ZR", "400G"] },
"800G-OSFP": { speeds: ["800G"], formFactors: ["OSFP", "QSFP-DD"], githubKeys: ["800G", "OSFP"] },
"10G-SFP+": { speeds: ["10G"], formFactors: ["SFP+"], githubKeys: [] },
};
interface SignalBundle {
capexYoyPct: number | null; // latest hyperscaler YoY CapEx growth
aiAnnouncementsQty: number; // AI cluster announcements last 90 days
aiEstimatedTcvrs: number; // total estimated transceivers from announcements
ebayVelocity: number | null; // sold count last measurement
ebayActiveListings: number | null;
avgLeadTimeWeeks: number | null; // average distributor lead time
inStockPct: number | null; // % of distributors showing in stock
githubRepoCount: number | null; // total repos for tech keywords
standardsApproved: number; // how many relevant standards published
standardsInProgress: number; // how many relevant standards in progress
}
async function gatherSignals(tech: string): Promise<SignalBundle> {
const keys = TECH_KEYWORDS[tech];
const speedList = keys.speeds.map(s => `'${s}'`).join(",");
const formList = keys.formFactors.map(f => `'${f}'`).join(",");
// Hyperscaler CapEx — most recent quarter's YoY growth across all companies
const capexRow = await pool.query<{ avg_yoy: number }>(`
SELECT AVG(yoy_growth_pct) AS avg_yoy
FROM hyperscaler_capex
WHERE period_end > NOW() - INTERVAL '9 months'
AND yoy_growth_pct IS NOT NULL
`);
const capexYoyPct = capexRow.rows[0]?.avg_yoy ?? null;
// AI cluster announcements — last 90 days
const aiRow = await pool.query<{ qty: number; est_tcvrs: number }>(`
SELECT COUNT(*) AS qty, COALESCE(SUM(estimated_transceivers), 0) AS est_tcvrs
FROM ai_cluster_announcements
WHERE announced_date > NOW() - INTERVAL '90 days'
AND (network_speed LIKE '%${tech.includes("800G") ? "800" : "400"}%'
OR network_speed IS NULL)
`);
const aiAnnouncementsQty = parseInt(String(aiRow.rows[0]?.qty ?? 0));
const aiEstimatedTcvrs = parseInt(String(aiRow.rows[0]?.est_tcvrs ?? 0));
// eBay velocity — latest measurement matching form factor
const ebayRow = await pool.query<{ sold: number; active: number }>(`
SELECT AVG(sold_count_30d) AS sold, AVG(active_listings) AS active
FROM marketplace_velocity
WHERE scraped_at > NOW() - INTERVAL '7 days'
AND (speed_label IN (${speedList}) OR form_factor IN (${formList}))
`);
const ebayVelocity = ebayRow.rows[0]?.sold ?? null;
const ebayActiveListings = ebayRow.rows[0]?.active ?? null;
// Distributor lead times
const distRow = await pool.query<{ avg_lead: number; in_stock_pct: number }>(`
SELECT
AVG(lead_time_weeks) FILTER (WHERE lead_time_weeks IS NOT NULL) AS avg_lead,
100.0 * COUNT(*) FILTER (WHERE in_stock = true) / NULLIF(COUNT(*), 0) AS in_stock_pct
FROM distributor_lead_times
WHERE scraped_at > NOW() - INTERVAL '14 days'
AND (form_factor IN (${formList}) OR speed_label IN (${speedList}))
`);
const avgLeadTimeWeeks = distRow.rows[0]?.avg_lead ?? null;
const inStockPct = distRow.rows[0]?.in_stock_pct ?? null;
// GitHub signals
let githubRepoCount: number | null = null;
if (keys.githubKeys.length > 0) {
const ghKeys = keys.githubKeys.map(k => `'${k}'`).join(",");
const ghRow = await pool.query<{ total: number }>(`
SELECT SUM(value) AS total
FROM github_tech_signals
WHERE technology IN (${ghKeys})
AND metric = 'repo_count'
AND week_start >= NOW() - INTERVAL '14 days'
`);
githubRepoCount = ghRow.rows[0]?.total ?? null;
}
// Standards status
const stdSpeeds = keys.speeds.map(s => `'${s}'`).join(",");
const stdRow = await pool.query<{ approved: number; in_progress: number }>(`
SELECT
COUNT(*) FILTER (WHERE current_status = 'published') AS approved,
COUNT(*) FILTER (WHERE current_status IN ('in-progress', 'ballot')) AS in_progress
FROM standards_activity
WHERE technology IN (${stdSpeeds})
OR standard_name LIKE '%${keys.speeds[0] ?? ""}%'
`);
const standardsApproved = parseInt(String(stdRow.rows[0]?.approved ?? 0));
const standardsInProgress = parseInt(String(stdRow.rows[0]?.in_progress ?? 0));
return {
capexYoyPct, aiAnnouncementsQty, aiEstimatedTcvrs,
ebayVelocity, ebayActiveListings,
avgLeadTimeWeeks, inStockPct,
githubRepoCount,
standardsApproved, standardsInProgress,
};
}
function computeDemandIndex(s: SignalBundle, horizonMonths: number): {
index: number;
priceDirection: string;
priceDeltaPct: number;
confidence: number;
breakdown: Record<string, number>;
} {
let score = 50; // neutral baseline
let signalCount = 0;
const breakdown: Record<string, number> = {};
// 1. Hyperscaler CapEx (weight 0.30) — primary signal
if (s.capexYoyPct !== null) {
const capexScore = Math.min(100, Math.max(0, 50 + s.capexYoyPct * 1.2));
score += (capexScore - 50) * 0.30;
breakdown.capex = capexScore;
signalCount++;
}
// 2. AI Cluster Announcements (weight 0.25)
if (s.aiAnnouncementsQty > 0) {
// > 20 announcements in 90 days = max signal
const aiScore = Math.min(100, 50 + s.aiAnnouncementsQty * 2.5);
score += (aiScore - 50) * 0.25;
breakdown.ai_clusters = aiScore;
signalCount++;
}
// 3. eBay Velocity (weight 0.20) — higher velocity = more demand
if (s.ebayVelocity !== null && s.ebayActiveListings !== null) {
const ratio = s.ebayActiveListings > 0
? (s.ebayVelocity / s.ebayActiveListings) * 100
: 0;
// > 30% sell-through = strong demand
const ebayScore = Math.min(100, Math.max(0, 50 + (ratio - 15) * 1.5));
score += (ebayScore - 50) * 0.20;
breakdown.ebay_velocity = ebayScore;
signalCount++;
}
// 4. Distributor Lead Times (weight 0.15) — longer lead = tighter supply = higher prices
if (s.avgLeadTimeWeeks !== null) {
// 4 weeks = normal, 12+ weeks = constrained
const leadScore = Math.min(100, Math.max(0, 50 + (s.avgLeadTimeWeeks - 4) * 4));
score += (leadScore - 50) * 0.15;
breakdown.lead_times = leadScore;
signalCount++;
// Invert for long horizon: tight supply now = more capacity investment = prices drop later
if (horizonMonths >= 12 && s.avgLeadTimeWeeks > 12) {
score -= 5; // price correction expected
}
}
// 5. GitHub signals (weight 0.06)
if (s.githubRepoCount !== null) {
const ghScore = Math.min(100, Math.max(0, 40 + Math.log10(Math.max(1, s.githubRepoCount)) * 12));
score += (ghScore - 50) * 0.06;
breakdown.github = ghScore;
signalCount++;
}
// 6. Standards status (weight 0.04)
if (s.standardsApproved > 0 || s.standardsInProgress > 0) {
// Published standards = late-game adoption = price pressure down
// In-progress = early game = price still rising
const stdScore = s.standardsApproved > 2 ? 35 : (s.standardsInProgress > 2 ? 65 : 50);
score += (stdScore - 50) * 0.04;
breakdown.standards = stdScore;
signalCount++;
}
const demandIndex = Math.round(Math.min(100, Math.max(0, score)) * 10) / 10;
// Price direction logic
let priceDirection = "stable";
let priceDeltaPct = 0;
if (demandIndex > 65) {
priceDirection = "rising";
priceDeltaPct = Math.round((demandIndex - 50) * 0.4 * 10) / 10;
} else if (demandIndex < 40) {
priceDirection = "falling";
priceDeltaPct = -Math.round((50 - demandIndex) * 0.35 * 10) / 10;
}
// Adjust price delta by horizon (longer = larger moves)
const horizonMultiplier = Math.sqrt(horizonMonths / 3);
priceDeltaPct = Math.round(priceDeltaPct * horizonMultiplier * 10) / 10;
const confidence = Math.min(1.0, signalCount / 6);
return { index: demandIndex, priceDirection, priceDeltaPct, confidence, breakdown };
}
export async function runForecastEngine(): Promise<void> {
logger.info("Forecast engine starting");
const horizons = [3, 9, 12, 18];
let computed = 0;
for (const tech of TECHNOLOGIES) {
try {
const signals = await gatherSignals(tech);
for (const horizon of horizons) {
const forecast = computeDemandIndex(signals, horizon);
await pool.query(`
INSERT INTO forecast_signals
(technology, horizon_months, demand_index, price_direction,
price_delta_pct, confidence, signal_count, signal_breakdown, forecast_date)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, CURRENT_DATE)
ON CONFLICT (technology, horizon_months, forecast_date)
DO UPDATE SET
demand_index = EXCLUDED.demand_index,
price_direction = EXCLUDED.price_direction,
price_delta_pct = EXCLUDED.price_delta_pct,
confidence = EXCLUDED.confidence,
signal_count = EXCLUDED.signal_count,
signal_breakdown = EXCLUDED.signal_breakdown,
computed_at = NOW()
`, [
tech, horizon, forecast.index, forecast.priceDirection,
forecast.priceDeltaPct, forecast.confidence,
forecast.breakdown ? Object.keys(forecast.breakdown).length : 0,
JSON.stringify(forecast.breakdown),
]);
computed++;
logger.info(
`${tech} +${horizon}mo: demand=${forecast.index}/100, ` +
`price=${forecast.priceDirection} ${forecast.priceDeltaPct > 0 ? "+" : ""}${forecast.priceDeltaPct}%, ` +
`confidence=${Math.round(forecast.confidence * 100)}%`
);
}
} catch (err) {
logger.error(`Forecast engine failed for ${tech}`, { err });
}
}
logger.info(`Forecast engine done — ${computed} forecasts computed`);
}

View File

@ -0,0 +1,131 @@
-- ─────────────────────────────────────────────────────────────────────────────
-- 022 — Prediction Signals Data Model
-- New tables for demand/supply/adoption signals that feed the forecast engine
-- ─────────────────────────────────────────────────────────────────────────────
-- Hyperscaler quarterly CapEx (from SEC EDGAR XBRL)
CREATE TABLE IF NOT EXISTS hyperscaler_capex (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
company TEXT NOT NULL, -- 'amazon', 'microsoft', 'google', 'meta'
period_label TEXT NOT NULL, -- 'Q1 2025'
period_end DATE NOT NULL,
capex_usd_millions NUMERIC(12,1), -- total CapEx in USD millions
dc_capex_est_millions NUMERIC(12,1), -- estimated DC/cloud share
yoy_growth_pct NUMERIC(7,2), -- YoY growth %
source_url TEXT,
filing_type TEXT, -- '10-Q', '10-K'
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE(company, period_end)
);
-- Distributor lead times and stock levels
CREATE TABLE IF NOT EXISTS distributor_lead_times (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
distributor TEXT NOT NULL, -- 'mouser', 'digikey', 'rs'
form_factor TEXT, -- 'QSFP28', 'QSFP-DD'
speed_label TEXT, -- '100G', '400G'
part_number TEXT,
product_name TEXT,
in_stock BOOLEAN,
stock_qty INTEGER,
lead_time_weeks INTEGER,
price_usd NUMERIC(10,2),
product_url TEXT,
scraped_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_dist_leads_form_factor ON distributor_lead_times(form_factor, scraped_at DESC);
-- GitHub technology adoption signals (weekly snapshots)
CREATE TABLE IF NOT EXISTS github_tech_signals (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
technology TEXT NOT NULL, -- '400G', '800G', 'ZR', 'CPO', 'silicon-photonics', 'CMIS'
metric TEXT NOT NULL, -- 'repo_count', 'commit_count', 'issue_count', 'stars_total'
repo_context TEXT, -- 'sonic-net/SONiC', 'openconfig', 'all'
value INTEGER NOT NULL,
week_start DATE NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE(technology, metric, repo_context, week_start)
);
-- eBay marketplace velocity (sold listings count + average price)
CREATE TABLE IF NOT EXISTS marketplace_velocity (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
marketplace TEXT NOT NULL, -- 'ebay', 'aliexpress'
keyword TEXT NOT NULL,
form_factor TEXT,
speed_label TEXT,
sold_count_30d INTEGER, -- listings sold in last 30 days
active_listings INTEGER, -- current active listings
avg_sold_price NUMERIC(10,2),
min_price NUMERIC(10,2),
max_price NUMERIC(10,2),
currency TEXT DEFAULT 'USD',
scraped_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_velocity_keyword ON marketplace_velocity(keyword, scraped_at DESC);
-- AI cluster / hyperscale DC announcements
CREATE TABLE IF NOT EXISTS ai_cluster_announcements (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
company TEXT NOT NULL,
title TEXT NOT NULL,
summary TEXT,
announced_date DATE,
scale_mw NUMERIC(10,1), -- announced power in MW
scale_servers INTEGER,
network_speed TEXT, -- '400G', '800G', 'IB-NDR'
estimated_transceivers INTEGER, -- rough estimate
deployment_date DATE, -- expected go-live
location TEXT,
source_url TEXT NOT NULL,
source_name TEXT,
content_hash TEXT UNIQUE,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_ai_clusters_date ON ai_cluster_announcements(announced_date DESC);
-- Network standards status tracker
CREATE TABLE IF NOT EXISTS standards_activity (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
standard_body TEXT NOT NULL, -- 'ieee', 'ietf', 'oif', 'msa', 'snia'
standard_name TEXT NOT NULL, -- '802.3df', 'CMIS 5.2', 'OIF-400ZR'
technology TEXT, -- '400G', '800G', 'ZR', 'CPO'
current_status TEXT, -- 'in-progress', 'ballot', 'approved', 'published'
draft_version TEXT,
ballot_date DATE,
approval_date DATE,
source_url TEXT,
notes TEXT,
last_checked TIMESTAMPTZ,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE(standard_body, standard_name)
);
-- Aggregated forecast signals (computed by forecast engine)
CREATE TABLE IF NOT EXISTS forecast_signals (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
technology TEXT NOT NULL, -- '100G-QSFP28', '400G-QSFP-DD', '800G-OSFP'
horizon_months INTEGER NOT NULL, -- 3, 9, 12, 18
demand_index NUMERIC(5,2), -- 0-100 composite demand score
price_direction TEXT, -- 'rising', 'falling', 'stable'
price_delta_pct NUMERIC(7,2), -- estimated % price change
confidence NUMERIC(3,2), -- 0.0-1.0
signal_count INTEGER, -- how many signals fed this forecast
signal_breakdown JSONB, -- breakdown by signal type
computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
);
CREATE INDEX IF NOT EXISTS idx_forecast_tech ON forecast_signals(technology, computed_at DESC);
CREATE UNIQUE INDEX IF NOT EXISTS idx_forecast_unique ON forecast_signals(technology, horizon_months, date_trunc('day', computed_at));
-- Seed known standards status
INSERT INTO standards_activity (standard_body, standard_name, technology, current_status, source_url, notes) VALUES
('ieee', '802.3bs-400GbE', '400G', 'published', 'https://standards.ieee.org/ieee/802.3bs/5950/', 'Published 2017, baseline 400G standard'),
('ieee', '802.3cd-100G/200G/400G', '400G', 'published', 'https://standards.ieee.org/ieee/802.3cd/6635/', 'Published 2018'),
('ieee', '802.3df-800GbE', '800G', 'in-progress', 'https://www.ieee802.org/3/df/', 'Draft in progress, expected 2025'),
('ieee', '802.3dk-1.6T', '1.6T', 'in-progress', 'https://www.ieee802.org/3/dk/', 'Very early stage'),
('oif', 'OIF-400ZR', '400G-ZR', 'published', 'https://www.oiforum.com/technical-work/hot-topics/400zr-2/', 'Published 2020'),
('oif', 'OIF-CMIS-5.2', 'CMIS', 'published', 'https://www.oiforum.com/technical-work/hot-topics/cmis/', 'CMIS 5.2 published 2023'),
('oif', 'OIF-800ZR', '800G-ZR', 'in-progress', 'https://www.oiforum.com/technical-work/hot-topics/', 'In development'),
('msa', '400G-FR4', '400G', 'published', 'https://www.400g.com/', '400G FR4 MSA complete'),
('msa', '800G-XDR8', '800G', 'in-progress', 'https://www.800g.info/', 'Specification in progress')
ON CONFLICT (standard_body, standard_name) DO NOTHING;