/**
 * NOG Conference Talks Scraper for TIP
 *
 * Crawls agenda pages of Network Operators Group meetings. Scrapers
 * implemented in this file: DENOG, NANOG, RIPE, ENOG, NLNOG and Euro-IX.
 * (GRNOG, NZNOG and APRICOT are not scraped by this file.)
 *
 * Extracts talk titles (plus speakers/abstracts where a source provides
 * them), then filters for optical-networking / transceiver-relevant content.
 *
 * Two outputs:
 * 1. news_articles — relevant talks as news items (source="NOG Talks: <event>")
 * 2. market_intelligence — high-signal talks (relevance score >= 6)
 *
 * ctxmeet integration: ConferenceTalk rows are pulled via a cross-DB dblink
 * query when available and merged with the live scrape results. The live
 * scrape always runs.
 *
 * Runs weekly via pg-boss: scrape:nog-talks
 */
import { pool } from "../utils/db";
import { contentHash } from "../utils/hash";

/** A single conference talk as collected from an agenda page or from ctxmeet. */
interface NogTalk {
  title: string;
  speaker: string;
  speakerOrg?: string;
  abstract?: string;
  event: string;
  eventUrl: string;
  talkUrl?: string;
  date?: string;
  track?: string;
}

// ── Optical / transceiver relevance keywords ──────────────────────────────────
const OPTICS_KEYWORDS = [
  "transceiver", "optical", "optics", "fiber", "fibre", "wavelength",
  "sfp", "qsfp", "osfp", "400g", "800g", "100g", "25g",
  "dwdm", "cwdm", "coherent", "pluggable", "dac", "aoc",
  "silicon photonics", "cpo", "data center", "datacenter", "dc fabric",
  "spine", "leaf", "cabling", "400zr", "800zr", "zr+",
  "bidi", "mpo", "lc", "dom", "ddm",
  "innolight", "coherent corp", "lumentum", "ii-vi", "finisar",
  "ciena", "infinera", "acacia", "broadcom", "marvell",
  "interconnect", "co-packaged", "lpo", "dsp",
];

// Networking keywords that often co-occur with optics topics
const NETWORK_KEYWORDS = [
  "peering", "ix", "ixp", "bgp", "routing", "infrastructure",
  "network upgrade", "capacity", "bandwidth", "latency",
  "data center interconnect", "dci", "wan", "mpls", "sr-mpls",
  "hyperscaler", "cloud", "colocation", "colo",
];

/** Predicate over an already lower-cased text. */
type KeywordMatcher = (lowerText: string) => boolean;

/** Keywords made of at most four letters (acronyms such as "lc", "ix", "mpo"). */
const SHORT_ALPHA_KEYWORD = /^[a-z]{1,4}$/;

/**
 * Builds a matcher for one lower-case keyword.
 *
 * Short all-letter keywords must not be embedded in a longer word, otherwise
 * "lc" matches "welcome", "mpo" matches "important" and "ix" matches "fix".
 * They may be followed by a plural "s" or by digits/punctuation ("SFP28",
 * "DACs", "SFP+"). All other keywords keep plain substring matching so that
 * e.g. "400g" still matches "400GbE" and "zr+" still matches "400ZR+".
 */
function buildKeywordMatcher(keyword: string): KeywordMatcher {
  if (!SHORT_ALPHA_KEYWORD.test(keyword)) {
    return (lowerText) => lowerText.includes(keyword);
  }
  // Non-global regex: `test` carries no lastIndex state between calls.
  const bounded = new RegExp(`(?<![a-z])${keyword}s?(?![a-z])`);
  return (lowerText) => bounded.test(lowerText);
}

// Matchers are compiled once at module load instead of per scored talk.
const OPTICS_MATCHERS = OPTICS_KEYWORDS.map(buildKeywordMatcher);
const NETWORK_MATCHERS = NETWORK_KEYWORDS.map(buildKeywordMatcher);

/**
 * Scores how relevant a text is: 3 points per distinct optics keyword found
 * plus 1 point per distinct networking keyword found (case-insensitive).
 */
function scoreRelevance(text: string): number {
  const lower = text.toLowerCase();
  const opticsHits = OPTICS_MATCHERS.filter((matches) => matches(lower)).length;
  const networkHits = NETWORK_MATCHERS.filter((matches) => matches(lower)).length;
  return opticsHits * 3 + networkHits;
}

/** Renders a caught value as a short message for log output. */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/** HTTP request headers sent with every agenda fetch. */
function headers(): Record<string, string> {
  return {
    "User-Agent": "TIP-NOG-Crawler/1.0 (Transceiver Intelligence; research)",
    "Accept": "text/html,application/xhtml+xml",
  };
}

/**
 * Fetches a URL and returns the response body as text.
 *
 * Best-effort: returns "" on a non-2xx status, network error or timeout
 * (20 s) so that one unreachable site never aborts the whole run.
 */
async function fetchText(url: string): Promise<string> {
  try {
    const res = await fetch(url, { headers: headers(), signal: AbortSignal.timeout(20000) });
    if (!res.ok) {
      console.warn(`[NOG Talks] ${url} responded with HTTP ${res.status}`);
      return "";
    }
    // `await` is required here: without it a failure while reading the body
    // would reject outside this try/catch and propagate to the caller.
    return await res.text();
  } catch (err: unknown) {
    console.warn(`[NOG Talks] Failed to fetch ${url}: ${describeError(err)}`);
    return "";
  }
}

/** Strips HTML tags and collapses runs of whitespace into single spaces. */
function cleanText(s: string): string {
  return s.replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
}

/**
 * Returns the given capture group of every match of `pattern` in `html`.
 * `pattern` is expected to carry the global flag; its lastIndex is reset first.
 */
function captureAll(html: string, pattern: RegExp, group = 1): string[] {
  const captured: string[] = [];
  pattern.lastIndex = 0;
  let m: RegExpExecArray | null;
  while ((m = pattern.exec(html)) !== null) {
    const value = m[group];
    if (value !== undefined) captured.push(value);
  }
  return captured;
}

// ── DENOG ────────────────────────────────────────────────────────────────────
/** Scrapes talk titles from the DENOG 17/16/15 agenda pages (max 30 per meeting). */
async function scrapeDENOG(): Promise<NogTalk[]> {
  const talks: NogTalk[] = [];
  // DENOG agendas at denog.de/DENOG{N}/agenda/
  for (const n of [17, 16, 15]) {
    const url = `https://www.denog.de/DENOG${n}/agenda/`;
    const html = await fetchText(url);
    if (!html) continue;

    // Extract talk titles from agenda (h2/h3/td text content)
    const talkPattern = /<(?:h[23]|td)[^>]*>\s*([^<]{10,200})<\/(?:h[23]|td)>/gi;
    const titles = captureAll(html, talkPattern)
      .map((raw) => cleanText(raw))
      .filter((t) => t.length > 10 && t.length < 200 && !t.startsWith("http") && !/^\d+$/.test(t));

    for (const title of titles.slice(0, 30)) {
      talks.push({
        title,
        speaker: "DENOG Speaker",
        event: `DENOG${n}`,
        eventUrl: url,
        // Year derived from the meeting number; the exact day is a placeholder.
        date: `${2023 + (n - 15)}-01-01`,
      });
    }
  }
  return talks;
}

// ── NANOG ────────────────────────────────────────────────────────────────────
/** Scrapes talk titles from the NANOG 93/92/91 agenda pages (max 25 unique per meeting). */
async function scrapeNANOG(): Promise<NogTalk[]> {
  const talks: NogTalk[] = [];
  // NANOG agendas — the HTML agenda page is parsed (no JSON API is used here)
  for (const meeting of [93, 92, 91]) {
    const url = `https://www.nanog.org/meetings/nanog${meeting}/agenda/`;
    const html = await fetchText(url);
    if (!html) continue;

    // Talk titles are looked up in title <div>s, <h3>s and session <td>s
    const patterns = [
      /<div[^>]*class="[^"]*title[^"]*"[^>]*>\s*([^<]{10,200})<\/div>/gi,
      /<h3[^>]*>\s*([^<]{10,200})<\/h3>/gi,
      /<td[^>]*class="[^"]*session[^"]*"[^>]*>\s*([^<]{10,200})<\/td>/gi,
    ];
    const titles: string[] = [];
    for (const pat of patterns) {
      for (const raw of captureAll(html, pat)) {
        const t = cleanText(raw);
        if (t.length > 10 && t.length < 200) titles.push(t);
      }
    }

    for (const title of [...new Set(titles)].slice(0, 25)) {
      talks.push({
        title,
        speaker: "NANOG Speaker",
        event: `NANOG${meeting}`,
        eventUrl: url,
        // Rough year estimate from the meeting number; the day is a placeholder.
        date: `${2023 + Math.floor((meeting - 90) / 3)}-01-01`,
      });
    }
  }
  return talks;
}

// ── RIPE ─────────────────────────────────────────────────────────────────────
/** Scrapes session titles from the RIPE 89/88/87 programme pages (max 30 unique per meeting). */
async function scrapeRIPE(): Promise<NogTalk[]> {
  const talks: NogTalk[] = [];
  // RIPE publishes its programme per meeting; the HTML agenda page is parsed
  for (const meeting of [89, 88, 87]) {
    const apiUrl = `https://ripe${meeting}.ripe.net/programme/agenda/`;
    const html = await fetchText(apiUrl);
    if (!html) continue;

    // Look for session titles
    const patterns = [
      /<h[23][^>]*>\s*([^<]{10,200})<\/h[23]>/gi,
      /<div[^>]*class="[^"]*slot-title[^"]*"[^>]*>\s*([^<]{10,200})<\/div>/gi,
      /<a[^>]*class="[^"]*session[^"]*"[^>]*>\s*([^<]{10,200})<\/a>/gi,
    ];
    const titles: string[] = [];
    for (const pat of patterns) {
      for (const raw of captureAll(html, pat)) {
        const t = cleanText(raw);
        // "©" filters out footer/copyright lines picked up by the heading pattern
        if (t.length > 10 && t.length < 200 && !t.includes("©")) titles.push(t);
      }
    }

    for (const title of [...new Set(titles)].slice(0, 30)) {
      talks.push({
        title,
        speaker: "RIPE Speaker",
        event: `RIPE ${meeting}`,
        eventUrl: apiUrl,
        // Rough year estimate from the meeting number; the day is a placeholder.
        date: `${2023 + Math.floor((meeting - 87) / 2)}-01-01`,
      });
    }
  }
  return talks;
}

// ── ENOG ─────────────────────────────────────────────────────────────────────
/** Scrapes presentation links from the ENOG presentations page (max 30). */
async function scrapeENOG(): Promise<NogTalk[]> {
  const pageUrl = "https://www.enog.org/presentations/";
  const talks: NogTalk[] = [];
  const html = await fetchText(pageUrl);
  if (!html) return talks;

  const linkPattern = /<a[^>]*href="([^"]*presentation[^"]*)"[^>]*>\s*([^<]{10,200})<\/a>/gi;
  let m: RegExpExecArray | null;
  while ((m = linkPattern.exec(html)) !== null) {
    const href = m[1] ?? "";
    const title = cleanText(m[2] ?? "");
    if (title.length <= 10) continue;

    const talk: NogTalk = {
      title,
      speaker: "ENOG Speaker",
      event: "ENOG",
      eventUrl: pageUrl,
    };
    const talkUrl = resolveTalkUrl(href, pageUrl);
    if (talkUrl) talk.talkUrl = talkUrl;
    talks.push(talk);
  }
  return talks.slice(0, 30);
}

/**
 * Turns an href found on `pageUrl` into an absolute URL.
 *
 * Absolute http(s) links are returned verbatim. Anything else is resolved
 * against the page URL, so hrefs without a leading slash ("files/x.pdf") or
 * protocol-relative ones ("//host/x") no longer produce malformed URLs.
 * Returns undefined when the href cannot be resolved.
 */
function resolveTalkUrl(href: string, pageUrl: string): string | undefined {
  if (/^https?:\/\//i.test(href)) return href;
  try {
    return new URL(href, pageUrl).toString();
  } catch {
    return undefined;
  }
}

// ── NLNOG ────────────────────────────────────────────────────────────────────
/** Scrapes headings from the NLNOG Day page as talk titles (max 20). */
async function scrapeNLNOG(): Promise<NogTalk[]> {
  const pageUrl = "https://nlnog.net/nlnog-day/";
  const talks: NogTalk[] = [];
  const html = await fetchText(pageUrl);
  if (!html) return talks;

  const titlePattern = /<h[234][^>]*>\s*([^<]{10,200})<\/h[234]>/gi;
  for (const raw of captureAll(html, titlePattern)) {
    const title = cleanText(raw);
    // Headings that mention the event itself are page chrome, not talks
    if (title.length > 10 && !title.includes("NLNOG Day")) {
      talks.push({ title, speaker: "NLNOG Speaker", event: "NLNOG Day", eventUrl: pageUrl });
    }
  }
  return talks.slice(0, 20);
}

// ── NOG-PG / EURO-IX ─────────────────────────────────────────────────────────
/** Scrapes notable headings from the Euro-IX tools-and-services page (max 15). */
async function scrapeEuroIX(): Promise<NogTalk[]> {
  const talks: NogTalk[] = [];
  const html = await fetchText("https://www.euro-ix.net/en/forixps/ixp-tools-and-services/");
  if (!html) return talks;

  // Less structured — just grab notable headings
  const titlePattern = /<h[23][^>]*>\s*([^<]{15,200})<\/h[23]>/gi;
  for (const raw of captureAll(html, titlePattern)) {
    const title = cleanText(raw);
    if (title.length > 15) {
      talks.push({ title, speaker: "Euro-IX Speaker", event: "Euro-IX Forum", eventUrl: "https://www.euro-ix.net" });
    }
  }
  return talks.slice(0, 15);
}

// ── CtxEvent cross-DB query (when ctxmeet has data) ──────────────────────────
/**
 * Converts a timestamp value from a query row to "YYYY-MM-DD".
 * Returns undefined for empty or unparseable values instead of throwing.
 */
function toIsoDate(value: unknown): string | undefined {
  if (!value) return undefined;
  const parsed = value instanceof Date ? value : new Date(String(value));
  return Number.isNaN(parsed.getTime()) ? undefined : parsed.toISOString().slice(0, 10);
}

/**
 * Pulls up to 100 talks of the last 18 months from the ctxmeet database via
 * dblink. Best-effort: any query failure (e.g. dblink or the ctxmeet schema
 * not being available) yields an empty list.
 */
async function pullFromCtxEvent(): Promise<NogTalk[]> {
  try {
    // Direct cross-DB query via dblink (same Postgres instance) or separate pool
    // ctxmeet DB is on same server, port 5432
    // NOTE(review): the connection string (incl. the empty password) is
    // hard-coded here — consider moving it to configuration.
    const result = await pool.query(`
      SELECT * FROM dblink(
        'host=localhost port=5432 dbname=ctxmeet user=tip password=',
        $$
          SELECT ct.title, ct.speaker, ct."speakerOrg", ct.abstract,
                 e.name as event, e."eventUrl", ct."startTime", ct."talkType"
          FROM "ConferenceTalk" ct
          JOIN "Event" e ON e.id = ct."eventId"
          WHERE e."startDate" > NOW() - INTERVAL '18 months'
          ORDER BY e."startDate" DESC
          LIMIT 100
        $$
      ) AS t(title text, speaker text, "speakerOrg" text, abstract text,
             event text, "eventUrl" text, "startTime" timestamptz, "talkType" text)
    `);

    return result.rows.map((r: Record<string, unknown>) => ({
      title: String(r.title || ""),
      speaker: String(r.speaker || ""),
      speakerOrg: String(r.speakerOrg || ""),
      abstract: r.abstract ? String(r.abstract) : undefined,
      event: String(r.event || ""),
      eventUrl: String(r["eventUrl"] || ""),
      date: toIsoDate(r.startTime),
    }));
  } catch {
    // Deliberately silent: the cross-DB source is optional.
    return [];
  }
}

// ── Store relevant talks in TIP DB ───────────────────────────────────────────
/**
 * Scores each talk and persists the relevant ones.
 *
 * - score < 2: skipped as not relevant.
 * - otherwise: upserted into news_articles (keyed by source_url).
 * - score >= 6: additionally inserted into market_intelligence.
 *
 * Database errors are logged and do not abort the run.
 *
 * @returns `stored` — talks whose news_articles upsert succeeded;
 *          `skipped` — talks below the relevance threshold;
 *          `failed` — relevant talks whose news_articles upsert failed.
 */
async function storeTalks(talks: NogTalk[]): Promise<{ stored: number; skipped: number; failed: number }> {
  let stored = 0;
  let skipped = 0;
  let failed = 0;

  for (const talk of talks) {
    const fullText = `${talk.title} ${talk.abstract || ""} ${talk.speaker} ${talk.speakerOrg || ""}`;
    const score = scoreRelevance(fullText);
    if (score < 2) {
      skipped++;
      continue;
    }

    // Normalise to 0..1; 15 raw points or more count as fully relevant
    const relevanceScore = Math.min(1, score / 15);
    const hash = contentHash({ source: "nog-talk", event: talk.event, title: talk.title });
    // Make source_url unique per talk using hash suffix
    const uniqueUrl = talk.talkUrl || `${talk.eventUrl}#talk-${hash.substring(0, 8)}`;
    const byline = `${talk.speaker}${talk.speakerOrg ? ` (${talk.speakerOrg})` : ""}`;
    const sourceName = `NOG Talks: ${talk.event}`;
    const publishedAt = talk.date ? new Date(talk.date) : new Date();

    // Store in news_articles
    try {
      await pool.query(`
        INSERT INTO news_articles (
          title, source, source_url, summary, published_at,
          category, relevance_score, content_hash, tags
        ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
        ON CONFLICT (source_url) DO UPDATE SET
          relevance_score = GREATEST(news_articles.relevance_score, EXCLUDED.relevance_score),
          content_hash = EXCLUDED.content_hash
      `, [
        talk.title,
        sourceName,
        uniqueUrl,
        talk.abstract
          ? `${byline}: ${talk.abstract.substring(0, 400)}`
          : `${byline} — ${talk.event}`,
        publishedAt,
        "event",
        relevanceScore,
        hash,
        JSON.stringify(["nog", talk.event.toLowerCase().replace(/\s+/g, "-"), "conference"]),
      ]);
      stored++;
    } catch (err: unknown) {
      failed++;
      console.warn(`[NOG Talks] Failed to store news article "${talk.title}": ${describeError(err)}`);
    }

    // High-relevance talks also go into market_intelligence
    if (score >= 6) {
      const signalText = talk.title + " " + (talk.abstract || "");
      const intelType = detectIntelType(signalText);
      const buySignal = detectBuySignal(signalText);
      try {
        await pool.query(`
          INSERT INTO market_intelligence (
            intel_type, title, summary, relevance_score,
            technologies, buy_signal_implication,
            source_url, source_name, published_at, is_demo
          ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, false)
          ON CONFLICT DO NOTHING
        `, [
          intelType,
          `[${talk.event}] ${talk.title}`,
          `${byline}: ${(talk.abstract || talk.title).substring(0, 500)}`,
          relevanceScore,
          extractTechnologies(fullText),
          buySignal,
          talk.talkUrl || talk.eventUrl,
          sourceName,
          publishedAt,
        ]);
      } catch (err: unknown) {
        console.warn(`[NOG Talks] Failed to store market intelligence "${talk.title}": ${describeError(err)}`);
      }
    }
  }

  return { stored, skipped, failed };
}

/** Ordered rules for detectIntelType: the first rule with a matching needle wins. */
const INTEL_TYPE_RULES: ReadonlyArray<readonly [string, readonly string[]]> = [
  ["supply_chain", ["deploy", "migration", "upgrade"]],
  ["technology_launch", ["400g", "800g", "100g"]],
  ["price_movement", ["price", "cost", "budget"]],
  ["standard_draft", ["standard", "spec", "rfc"]],
  ["capex_cycle", ["capex", "investment", "market"]],
  ["distributor_lead_time", ["lead time", "availability", "shortage"]],
];

/**
 * Classifies a talk into a market_intelligence intel_type by case-insensitive
 * substring matching; falls back to "supply_chain" when nothing matches.
 */
function detectIntelType(text: string): string {
  const lower = text.toLowerCase();
  const rule = INTEL_TYPE_RULES.find(([, needles]) => needles.some((needle) => lower.includes(needle)));
  return rule ? rule[0] : "supply_chain";
}

/** Ordered rules for detectBuySignal: the first rule with a matching needle wins. */
const BUY_SIGNAL_RULES: ReadonlyArray<readonly [string, readonly string[]]> = [
  ["bullish", ["deploy", "migration", "upgrade", "scale"]],
  ["opportunity", ["shortage", "lead time", "limited"]],
  ["bearish", ["wait", "next gen", "coming soon"]],
];

/**
 * Derives a buy-signal label from a talk text by case-insensitive substring
 * matching; returns "neutral" when nothing matches.
 */
function detectBuySignal(text: string): string {
  const lower = text.toLowerCase();
  const rule = BUY_SIGNAL_RULES.find(([, needles]) => needles.some((needle) => lower.includes(needle)));
  return rule ? rule[0] : "neutral";
}

/** Technology keyword (lower-case) → display label, in output order. */
const TECHNOLOGY_LABELS: Record<string, string> = {
  "400g": "400G", "800g": "800G", "100g": "100G", "25g": "25G", "10g": "10G",
  "sfp28": "SFP28", "qsfp28": "QSFP28", "qsfp-dd": "QSFP-DD", "osfp": "OSFP",
  "zr+": "400ZR+", "silicon photonics": "Silicon Photonics", "cpo": "CPO",
  "dwdm": "DWDM", "cwdm": "CWDM", "lpo": "LPO", "dac": "DAC", "aoc": "AOC",
  "mpo": "MPO", "bidi": "BiDi",
};

// Same matching rules as relevance scoring, so that e.g. "important" is not
// tagged as MPO.
const TECHNOLOGY_MATCHERS: ReadonlyArray<readonly [KeywordMatcher, string]> =
  Object.entries(TECHNOLOGY_LABELS).map(([key, label]) => [buildKeywordMatcher(key), label] as const);

/** Returns the de-duplicated display labels of all technologies mentioned in `text`. */
function extractTechnologies(text: string): string[] {
  const lower = text.toLowerCase();
  const techs = TECHNOLOGY_MATCHERS
    .filter(([matches]) => matches(lower))
    .map(([, label]) => label);
  return [...new Set(techs)];
}

// ── Main export ──────────────────────────────────────────────────────────────
/**
 * Job entry point: collects talks from ctxmeet (when available) and from all
 * live NOG sources, then stores the optics-relevant ones.
 */
export async function scrapeNogTalks(): Promise<void> {
  console.log("[NOG Talks] Starting NOG conference talks scraper...");

  // First try CtxEvent DB (when populated)
  const ctxTalks = await pullFromCtxEvent();
  console.log(`[NOG Talks] CtxEvent DB: ${ctxTalks.length} talks`);

  // Always scrape live NOG sources
  const [denog, nanog, ripe, enog, nlnog, euroix] = await Promise.all([
    scrapeDENOG(),
    scrapeNANOG(),
    scrapeRIPE(),
    scrapeENOG(),
    scrapeNLNOG(),
    scrapeEuroIX(),
  ]);

  const allTalks = [...ctxTalks, ...denog, ...nanog, ...ripe, ...enog, ...nlnog, ...euroix];
  console.log(`[NOG Talks] Collected ${allTalks.length} talks total (DENOG:${denog.length} NANOG:${nanog.length} RIPE:${ripe.length} ENOG:${enog.length} NLNOG:${nlnog.length} EuroIX:${euroix.length} CtxEvent:${ctxTalks.length})`);

  const { stored, skipped, failed } = await storeTalks(allTalks);
  console.log(`[NOG Talks] Done — stored:${stored} (optics-relevant) skipped:${skipped} (not relevant)`);
  if (failed > 0) {
    console.warn(`[NOG Talks] ${failed} relevant talk(s) could not be stored (see warnings above)`);
  }
}