transceiver-db/packages/scraper/src/scrapers/standards-tracker.ts
Rene Fichtmueller 370c1d8801 feat: 6 prediction signal scrapers + forecast engine
New scrapers (all registered in pg-boss, 50 total jobs):
  - sec-edgar.ts       : SEC EDGAR XBRL API — hyperscaler CapEx from 10-Q/10-K
  - github-signals.ts  : GitHub Search/Stats API — tech adoption metrics weekly
  - ebay-velocity.ts   : eBay completed listings — sold count + price distribution
  - ai-clusters.ts     : RSS feeds (6 sources) — AI cluster & DC announcements
  - distributor-leads.ts : Mouser, Digi-Key, RS Components — lead time + stock
  - standards-tracker.ts : IEEE 802.3, OIF, IETF — draft/ballot/published status

New utilities:
  - forecast-engine.ts : Weighted signal aggregator → demand_index + price_direction
    6 signal types, 4 horizons (3/9/12/18 months), 5 technologies tracked

New DB tables (migration 022):
  hyperscaler_capex, distributor_lead_times, github_tech_signals,
  marketplace_velocity, ai_cluster_announcements, standards_activity,
  forecast_signals

Schedules:
  - EDGAR: weekly Mon 06:00
  - GitHub: weekly Sun 05:00
  - eBay velocity: every 12h
  - AI clusters: every 4h (news-speed)
  - Distributor leads: daily 03:30
  - Standards: weekly Wed 04:00
  - Forecast engine: daily 08:00 (after all nightly scrapers)
2026-04-02 02:02:44 +02:00

200 lines
7.1 KiB
TypeScript

/**
* Network Standards & Draft Activity Tracker
*
* Monitors the status of key networking standards that directly
* affect transceiver form factor adoption timelines:
*
* - IEEE 802.3 (Ethernet PHY standards)
* - IETF (network protocols)
* - OIF (Optical Internetworking Forum — Implementation Agreements)
* - CMIS (Common Management Interface Specification, tracked via OIF headings)
*
* Standard status is a 12-24 month leading indicator:
* "Draft 3.0 approved" → ~18 months to mass-market products
* "Published" → OEMs start shipping within 12 months
* "Ballot closed" → final spec in ~3 months
*/
import * as cheerio from "cheerio";
import { pool } from "../utils/db";
import { logger } from "../utils/logger";
/**
 * Default request headers for the scrapers in this module: a descriptive
 * User-Agent with a contact address, and an Accept value asking for HTML.
 * Callers that need another content type spread this object and override
 * `Accept`.
 */
const HEADERS = {
  "User-Agent": "TIP-DataCollector/1.0 contact@context-x.org",
  Accept: "text/html,application/xhtml+xml",
};
/**
 * One scraped standard / draft / project, in the shape that is upserted into
 * the `standards_activity` table.
 */
interface StandardStatus {
/** Short identifier of the issuing body; the scrapers in this file emit "ieee", "oif" and "ietf". */
standard_body: string;
/** Project, heading or draft name as scraped; the scrapers truncate it to 100 characters. */
standard_name: string;
/** Technology label derived from the name/title, e.g. "400G", "800G-ZR", "CMIS", "optical", "high-speed". */
technology: string;
/** Lifecycle state; values produced in this file are "in-progress", "ballot", "published" and "cancelled". */
current_status: string;
/** Draft revision, when known; every scraper in this file currently sets it to null. */
draft_version: string | null;
/** Approval date, when known; every scraper in this file currently sets it to null. */
approval_date: string | null;
/** Page or document URL the record was derived from. */
source_url: string;
/** Free-text context (status cell text or document title, truncated to 200 characters), or null. */
notes: string | null;
}
// ─── IEEE 802.3 ────────────────────────────────────────────────────────────
/**
 * Scrapes the IEEE 802.3 working-group index page for high-speed Ethernet
 * projects.
 *
 * Every table row with at least two cells is inspected: the first cell is
 * taken as the project name and the second as its status text. A row is kept
 * only when the name mentions a tracked speed (100G … 1.6T/1600G) or one of
 * the tracked 802.3 project suffixes.
 *
 * @returns One record per matching table row; empty when nothing matches.
 * @throws Error `IEEE <status>` when the HTTP response is not OK. Network and
 *   timeout errors from `fetch` propagate unchanged.
 */
async function scrapeIeee802(): Promise<StandardStatus[]> {
  const url = "https://www.ieee802.org/3/";
  const res = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(20000) });
  if (!res.ok) throw new Error(`IEEE ${res.status}`);
  const $ = cheerio.load(await res.text());
  const results: StandardStatus[] = [];

  // Hoisted so the patterns are not rebuilt for every table row.
  const speedFilter = /\b(100G|200G|400G|800G|1\.6T|1600G)\b/i;
  const projectFilter = /802\.3(bs|cd|cu|ck|df|dk|dj|dl)\b/i;
  // Same speed set as the filter, so every speed that passes the filter also gets a label.
  const speedLabel = /\b(400G|800G|1\.6T|1600G|100G|200G)\b/i;

  // IEEE 802.3 project table
  $("table tr").each((_, row) => {
    const cells = $(row).find("td, th");
    if (cells.length < 2) return;

    const taskName = $(cells[0]).text().trim();
    const statusText = $(cells[1]).text().trim();
    const link = $(cells[0]).find("a").first().attr("href");

    // Filter for high-speed Ethernet projects
    if (!speedFilter.test(taskName) && !projectFilter.test(taskName)) return;

    // Normalise casing ("400g" -> "400G") and fold the "1600G" spelling into
    // "1.6T" so a single technology is not stored under several labels.
    const rawSpeed = taskName.match(speedLabel)?.[1]?.toUpperCase();
    const tech = rawSpeed === undefined ? "high-speed" : rawSpeed === "1600G" ? "1.6T" : rawSpeed;

    let status = "in-progress";
    if (/published|approved/i.test(statusText)) status = "published";
    else if (/ballot/i.test(statusText)) status = "ballot";
    else if (/withdrawn|cancelled/i.test(statusText)) status = "cancelled";

    // Resolve the href against the index page instead of concatenating it:
    // concatenation corrupts absolute ("https://…") and root-relative ("/3/dj/")
    // links. Non-HTTP or malformed hrefs fall back to the index page itself.
    let sourceUrl = url;
    if (link) {
      try {
        const resolved = new URL(link, url);
        if (resolved.protocol === "http:" || resolved.protocol === "https:") {
          sourceUrl = resolved.href;
        }
      } catch {
        // Malformed href: keep the index page as the source.
      }
    }

    results.push({
      standard_body: "ieee",
      standard_name: taskName.substring(0, 100),
      technology: tech,
      current_status: status,
      draft_version: null,
      approval_date: null,
      source_url: sourceUrl,
      notes: statusText.substring(0, 200),
    });
  });

  return results;
}
// ─── OIF ──────────────────────────────────────────────────────────────────
/**
 * Scrapes the OIF "hot topics" page and turns each relevant `h2`/`h3` heading
 * inside `.entry-content` into a standards record.
 *
 * A heading is relevant when it mentions one of the tracked topics (400ZR,
 * 800ZR, CMIS, 400G, 800G, CPO or an "OIF-" document id), matched
 * case-insensitively. The page exposes no status in the parsed headings, so
 * every record is reported as "in-progress".
 *
 * @returns One record per relevant heading; empty when nothing matches.
 * @throws Error `OIF <status>` when the HTTP response is not OK. Network and
 *   timeout errors from `fetch` propagate unchanged.
 */
async function scrapeOif(): Promise<StandardStatus[]> {
  const url = "https://www.oiforum.com/technical-work/hot-topics/";
  const res = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(20000) });
  if (!res.ok) throw new Error(`OIF ${res.status}`);
  const $ = cheerio.load(await res.text());
  const results: StandardStatus[] = [];

  const trackedTopic = /\b(400ZR|800ZR|CMIS|400G|800G|CPO|OIF-)\S*/i;

  $(".entry-content h2, .entry-content h3").each((_, el) => {
    const heading = $(el).text().trim();
    if (!heading) return;
    if (!trackedTopic.test(heading)) return;

    // Classify on an upper-cased copy: the filter above is case-insensitive,
    // so the classification has to be too, otherwise "400zr" is labelled "400G".
    const upper = heading.toUpperCase();
    let tech: string;
    if (upper.includes("ZR")) {
      tech = upper.includes("800") ? "800G-ZR" : "400G-ZR";
    } else if (upper.includes("CMIS")) {
      tech = "CMIS";
    } else if (upper.includes("800")) {
      tech = "800G";
    } else if (upper.includes("400")) {
      tech = "400G";
    } else {
      tech = "optical";
    }

    // Resolve the heading's link against the page URL so relative hrefs are
    // stored as usable absolute URLs; missing, non-HTTP or malformed hrefs
    // fall back to the page itself.
    const href = $(el).find("a").first().attr("href");
    let sourceUrl = url;
    if (href) {
      try {
        const resolved = new URL(href, url);
        if (resolved.protocol === "http:" || resolved.protocol === "https:") {
          sourceUrl = resolved.href;
        }
      } catch {
        // Malformed href: keep the page URL as the source.
      }
    }

    results.push({
      standard_body: "oif",
      standard_name: heading.substring(0, 100),
      technology: tech,
      current_status: "in-progress", // OIF IA are usually in-progress or published
      draft_version: null,
      approval_date: null,
      source_url: sourceUrl,
      notes: null,
    });
  });

  return results;
}
// ─── IETF Datatracker ─────────────────────────────────────────────────────
/**
 * Queries the IETF Datatracker REST API for up to 20 drafts whose name
 * contains "optical" and maps each document to a standards record.
 *
 * The technology label comes from the draft name ("400g" / "800g") or,
 * failing that, from the first tracked keyword in the title; otherwise it is
 * "optical". A document with a non-empty `std_level` is reported as
 * "published", anything else as "in-progress".
 *
 * @returns One record per returned document; empty when the response carries
 *   no `objects` array.
 * @throws Error `IETF API <status>` when the HTTP response is not OK. Network,
 *   timeout and JSON parsing errors propagate unchanged.
 */
async function scrapeIetf(): Promise<StandardStatus[]> {
  // IETF has a proper REST API
  const url = "https://datatracker.ietf.org/api/v1/doc/document/?type=draft&name__contains=optical&limit=20&offset=0";
  const res = await fetch(url, {
    headers: { ...HEADERS, Accept: "application/json" },
    signal: AbortSignal.timeout(15000),
  });
  if (!res.ok) throw new Error(`IETF API ${res.status}`);

  const data = await res.json() as {
    objects?: Array<{
      name: string; title: string | null; abstract: string;
      std_level: string | null; stream: string;
    }>
  };
  // The payload is external input: tolerate a missing or malformed `objects`.
  const docs = Array.isArray(data?.objects) ? data.objects : [];

  // Canonical spelling per title keyword. The title match is case-insensitive,
  // so without this the stored label would follow each title's casing
  // ("Coherent" vs "coherent", "zr" vs "ZR").
  const titleTech = new Map<string, string>([
    ["400g", "400G"],
    ["800g", "800G"],
    ["zr", "ZR"],
    ["coherent", "coherent"],
  ]);

  return docs.map((doc): StandardStatus => {
    // A missing title must not abort the whole batch.
    const title = doc.title ?? "";
    const keyword = title.match(/\b(400G|800G|ZR|coherent)\b/i)?.[1]?.toLowerCase();

    let tech: string;
    if (doc.name.includes("400g")) tech = "400G";
    else if (doc.name.includes("800g")) tech = "800G";
    else tech = (keyword !== undefined ? titleTech.get(keyword) : undefined) ?? "optical";

    return {
      standard_body: "ietf",
      standard_name: doc.name.substring(0, 100),
      technology: tech,
      current_status: doc.std_level ? "published" : "in-progress",
      draft_version: null,
      approval_date: null,
      source_url: `https://datatracker.ietf.org/doc/${doc.name}/`,
      notes: title.substring(0, 200),
    };
  });
}
/**
 * Entry point of the standards tracker job.
 *
 * Runs the IEEE 802.3, OIF and IETF scrapers one after another, waiting two
 * seconds before each, and upserts every returned record into
 * `standards_activity`, keyed on (standard_body, standard_name). On conflict
 * the status, technology, notes (when the new value is not null) and
 * `last_checked` are refreshed.
 *
 * A failure while scraping or storing one source is logged as a warning and
 * does not stop the remaining sources; the function itself does not rethrow.
 */
export async function scrapeStandardsTracker(): Promise<void> {
  const upsertSql = `
INSERT INTO standards_activity
(standard_body, standard_name, technology, current_status,
draft_version, approval_date, source_url, notes, last_checked)
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,NOW())
ON CONFLICT (standard_body, standard_name) DO UPDATE SET
current_status = EXCLUDED.current_status,
technology = EXCLUDED.technology,
notes = COALESCE(EXCLUDED.notes, standards_activity.notes),
last_checked = NOW()
`;
  const pause = (ms: number): Promise<void> =>
    new Promise<void>(resolve => {
      setTimeout(resolve, ms);
    });

  logger.info("Standards tracker starting");
  let updated = 0;

  const sources: Array<{ name: string; fn: () => Promise<StandardStatus[]> }> = [
    { name: "IEEE 802.3", fn: scrapeIeee802 },
    { name: "OIF", fn: scrapeOif },
    { name: "IETF", fn: scrapeIetf },
  ];

  for (const { name, fn } of sources) {
    try {
      // Politeness delay before each source is contacted.
      await pause(2000);
      logger.info(`Checking ${name}`);

      const records = await fn();
      for (const record of records) {
        const params = [
          record.standard_body,
          record.standard_name,
          record.technology,
          record.current_status,
          record.draft_version,
          record.approval_date,
          record.source_url,
          record.notes,
        ];
        await pool.query(upsertSql, params);
        updated += 1;
      }

      logger.info(`${name}: ${records.length} standards checked`);
    } catch (err) {
      // Best effort: one failing source must not block the others.
      logger.warn(`Standards scraper failed: ${name}`, { err });
    }
  }

  logger.info(`Standards tracker done — ${updated} records updated`);
}