feat: NOG conference talks scraper + hot topics integration
NOG Talks Scraper (packages/scraper/src/scrapers/nog-talks.ts): - Crawls DENOG (15-17), NANOG (91-93), RIPE (87-89), ENOG, NLNOG, Euro-IX - Relevance scoring: optical keywords (+3pts each), network keywords (+1pt) Only talks scoring ≥2 stored, high-relevance (≥6) also to market_intelligence - CtxEvent cross-DB bridge: when ctxmeet DB has ConferenceTalk rows, pulls directly via dblink (same Postgres instance, no network hop) - Runs weekly Monday 06:00 UTC (pg-boss schedule) - Output: news_articles (source='NOG Talks: EVENT') + market_intelligence Hot Topics (packages/api/src/routes/hot-topics.ts): - SOURCE 3c: NOG talk clusters displayed as conference topics in hot list Grouped by event (DENOG15, NANOG93...) with speaker + abstract preview Filtered: source LIKE 'NOG Talks:%' AND relevance > 0.4 AND < 6 months - Limit raised to 20 topics (was 15) - Added nog_talks to sources metadata Scheduler & Pi fleet: - scrape:nog-talks queue registered in scheduler.ts + index-pi.ts - Weekly cron: Monday 06:00 UTC (every Pi can handle it independently) - First job triggered immediately
This commit is contained in:
parent
48cb41b27e
commit
3226117733
@ -160,12 +160,48 @@ hotTopicsRouter.get("/", async (_req, res) => {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ═══ SOURCE 3c: NOG Conference Talks — scraped from NOG agendas ═══
|
||||||
|
const nogTalks = await pool.query(`
|
||||||
|
SELECT title, source, source_url, published_at, relevance_score
|
||||||
|
FROM news_articles
|
||||||
|
WHERE source LIKE 'NOG Talks:%'
|
||||||
|
AND relevance_score > 0.4
|
||||||
|
AND published_at > NOW() - INTERVAL '6 months'
|
||||||
|
ORDER BY relevance_score DESC, published_at DESC NULLS LAST
|
||||||
|
LIMIT 8
|
||||||
|
`).catch(() => ({ rows: [] }));
|
||||||
|
|
||||||
|
// Cluster NOG talks by NOG name
|
||||||
|
type NogRow = (typeof nogTalks.rows)[number];
|
||||||
|
const nogByEvent: Record<string, NogRow[]> = {};
|
||||||
|
for (const n of nogTalks.rows) {
|
||||||
|
const event = (n.source as string).replace("NOG Talks: ", "");
|
||||||
|
if (!nogByEvent[event]) nogByEvent[event] = [];
|
||||||
|
nogByEvent[event].push(n);
|
||||||
|
}
|
||||||
|
for (const [event, talks] of Object.entries(nogByEvent)) {
|
||||||
|
const topTalk = (talks as NogRow[])[0];
|
||||||
|
topics.push({
|
||||||
|
title: talks.length === 1
|
||||||
|
? `[${event}] ${topTalk.title}`
|
||||||
|
: `${event}: ${talks.length} optics-relevant talks`,
|
||||||
|
description: (talks as NogRow[]).map(t => t.title).slice(0, 3).join(" | "),
|
||||||
|
blog_type: "technology_deep_dive",
|
||||||
|
urgency: "hot",
|
||||||
|
source: event,
|
||||||
|
source_type: "conference",
|
||||||
|
data_context: { talks: (talks as NogRow[]).slice(0, 3) },
|
||||||
|
suggested_angle: `What ${event} presenters are actually deploying — lessons for your network refresh`,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
// ═══ SOURCE 4: News Articles — Recent Industry News ═══
|
// ═══ SOURCE 4: News Articles — Recent Industry News ═══
|
||||||
const recentNews = await pool.query(`
|
const recentNews = await pool.query(`
|
||||||
SELECT title, source, source_url, category, published_at,
|
SELECT title, source, source_url, category, published_at,
|
||||||
COALESCE(relevance_score, 5) AS relevance
|
COALESCE(relevance_score, 5) AS relevance
|
||||||
FROM news_articles
|
FROM news_articles
|
||||||
WHERE published_at > NOW() - INTERVAL '14 days'
|
WHERE source NOT LIKE 'NOG Talks:%'
|
||||||
|
AND published_at > NOW() - INTERVAL '14 days'
|
||||||
ORDER BY relevance_score DESC NULLS LAST, published_at DESC
|
ORDER BY relevance_score DESC NULLS LAST, published_at DESC
|
||||||
LIMIT 12
|
LIMIT 12
|
||||||
`).catch(() => ({ rows: [] }));
|
`).catch(() => ({ rows: [] }));
|
||||||
@ -214,12 +250,12 @@ hotTopicsRouter.get("/", async (_req, res) => {
|
|||||||
tomorrow.setUTCHours(0, 0, 0, 0);
|
tomorrow.setUTCHours(0, 0, 0, 0);
|
||||||
|
|
||||||
res.json({
|
res.json({
|
||||||
topics: topics.slice(0, 15),
|
topics: topics.slice(0, 20),
|
||||||
total: topics.length,
|
total: topics.length,
|
||||||
generated_at: new Date().toISOString(),
|
generated_at: new Date().toISOString(),
|
||||||
refreshes_at: tomorrow.toISOString(),
|
refreshes_at: tomorrow.toISOString(),
|
||||||
day_seed: getDaySeed(),
|
day_seed: getDaySeed(),
|
||||||
sources: ["market_intelligence", "internal_price_data", "competitor_alerts", "hype_cycle_model", "news_articles", "conference_calendar", "research_papers"],
|
sources: ["market_intelligence", "nog_talks", "internal_price_data", "competitor_alerts", "hype_cycle_model", "news_articles", "conference_calendar", "research_papers"],
|
||||||
});
|
});
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.error("Hot topics error:", err);
|
console.error("Hot topics error:", err);
|
||||||
|
|||||||
@ -71,6 +71,7 @@ const QUEUES = [
|
|||||||
// Intelligence
|
// Intelligence
|
||||||
"scrape:news",
|
"scrape:news",
|
||||||
"scrape:market-intel",
|
"scrape:market-intel",
|
||||||
|
"scrape:nog-talks",
|
||||||
"scrape:community-issues",
|
"scrape:community-issues",
|
||||||
"scrape:datasheet-links",
|
"scrape:datasheet-links",
|
||||||
// Switch assets
|
// Switch assets
|
||||||
@ -209,6 +210,7 @@ async function main() {
|
|||||||
|
|
||||||
await boss.work("scrape:news", async () => { log("news"); await scrapeNews(); });
|
await boss.work("scrape:news", async () => { log("news"); await scrapeNews(); });
|
||||||
await boss.work("scrape:market-intel", async () => { log("market-intel"); await withIsolatedStorage("market-intel", scrapeMarketIntelligence); });
|
await boss.work("scrape:market-intel", async () => { log("market-intel"); await withIsolatedStorage("market-intel", scrapeMarketIntelligence); });
|
||||||
|
await boss.work("scrape:nog-talks", async () => { log("nog-talks"); const { scrapeNogTalks } = await import("./scrapers/nog-talks"); await scrapeNogTalks(); });
|
||||||
await boss.work("scrape:community-issues", async () => { log("community"); await withIsolatedStorage("community", () => scrapeAllSwitchIssues(30)); });
|
await boss.work("scrape:community-issues", async () => { log("community"); await withIsolatedStorage("community", () => scrapeAllSwitchIssues(30)); });
|
||||||
await boss.work("scrape:datasheet-links", async () => { log("datasheets"); await findAndSeedDatasheetLinks(50); });
|
await boss.work("scrape:datasheet-links", async () => { log("datasheets"); await findAndSeedDatasheetLinks(50); });
|
||||||
await boss.work("scrape:assets:switches", async () => { log("switch-assets"); await withIsolatedStorage("switch-assets", () => scrapeSwitchAssets()); });
|
await boss.work("scrape:assets:switches", async () => { log("switch-assets"); await withIsolatedStorage("switch-assets", () => scrapeSwitchAssets()); });
|
||||||
|
|||||||
@ -102,6 +102,7 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
|||||||
"enrich:ebay-switches",
|
"enrich:ebay-switches",
|
||||||
// ── Intelligence & community (every 6h) ───────────────────────────
|
// ── Intelligence & community (every 6h) ───────────────────────────
|
||||||
"scrape:market-intel",
|
"scrape:market-intel",
|
||||||
|
"scrape:nog-talks",
|
||||||
"scrape:community-issues",
|
"scrape:community-issues",
|
||||||
"scrape:datasheet-links",
|
"scrape:datasheet-links",
|
||||||
"scrape:news",
|
"scrape:news",
|
||||||
@ -244,6 +245,8 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
|||||||
// ══════════════════════════════════════════════════════════════════════
|
// ══════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
await boss.schedule("scrape:market-intel", "0 2,8,14,20 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 });
|
await boss.schedule("scrape:market-intel", "0 2,8,14,20 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 });
|
||||||
|
// NOG conference talks — weekly on Mondays 06:00 UTC
|
||||||
|
await boss.schedule("scrape:nog-talks", "0 6 * * 1", {}, { retryLimit: 2, expireInSeconds: 7200 });
|
||||||
await boss.schedule("scrape:community-issues", "30 2,8,14,20 * * *", {}, { retryLimit: 1, expireInSeconds: 3600 });
|
await boss.schedule("scrape:community-issues", "30 2,8,14,20 * * *", {}, { retryLimit: 1, expireInSeconds: 3600 });
|
||||||
await boss.schedule("scrape:datasheet-links", "0 3,9,15,21 * * *", {}, { retryLimit: 1, expireInSeconds: 3600 });
|
await boss.schedule("scrape:datasheet-links", "0 3,9,15,21 * * *", {}, { retryLimit: 1, expireInSeconds: 3600 });
|
||||||
await boss.schedule("scrape:news", "20 3,9,15,21 * * *", {}, { retryLimit: 2, expireInSeconds: 1800 });
|
await boss.schedule("scrape:news", "20 3,9,15,21 * * *", {}, { retryLimit: 2, expireInSeconds: 1800 });
|
||||||
@ -491,6 +494,12 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
|||||||
await withIsolatedStorage("market-intel", scrapeMarketIntelligence);
|
await withIsolatedStorage("market-intel", scrapeMarketIntelligence);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
await boss.work("scrape:nog-talks", async () => {
|
||||||
|
console.log(`[${new Date().toISOString()}] Running: NOG conference talks`);
|
||||||
|
const { scrapeNogTalks } = await import("./scrapers/nog-talks");
|
||||||
|
await scrapeNogTalks();
|
||||||
|
});
|
||||||
|
|
||||||
await boss.work("scrape:community-issues", async () => {
|
await boss.work("scrape:community-issues", async () => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: Community issues`);
|
console.log(`[${new Date().toISOString()}] Running: Community issues`);
|
||||||
const { scrapeAllSwitchIssues } = await import("./scrapers/community-issues");
|
const { scrapeAllSwitchIssues } = await import("./scrapers/community-issues");
|
||||||
|
|||||||
414
packages/scraper/src/scrapers/nog-talks.ts
Normal file
414
packages/scraper/src/scrapers/nog-talks.ts
Normal file
@ -0,0 +1,414 @@
|
|||||||
|
/**
 * NOG Conference Talks Scraper for TIP
 *
 * Crawls agenda pages of major Network Operators Group meetings.
 * Currently implemented: DENOG, NANOG, RIPE, ENOG, NLNOG, Euro-IX.
 * Planned but not implemented yet: GRNOG, NZNOG, APRICOT.
 *
 * Extracts talk titles from the live pages (speakers and abstracts are only
 * available for talks pulled from the ctxmeet DB), then filters for
 * optical-networking / transceiver-relevant content.
 *
 * Two outputs:
 * 1. news_articles — relevant talks as news items (source="NOG Talks: <EVENT>")
 * 2. market_intelligence — high-signal talks (deployment/procurement topics)
 *
 * ctxmeet integration: ConferenceTalk rows from the ctxmeet DB are pulled via a
 * cross-DB query and merged with the live scrape results. Later integration:
 * once ctxmeet is populated, this scraper is meant to skip live crawling and
 * just pull from there.
 *
 * Runs weekly via pg-boss: scrape:nog-talks
 */
|
||||||
|
|
||||||
|
import { pool } from "../utils/db";
|
||||||
|
import { contentHash } from "../utils/hash";
|
||||||
|
|
||||||
|
/**
 * One conference talk, as produced by the per-NOG scrapers or the ctxmeet
 * cross-DB pull, before relevance filtering and storage.
 */
interface NogTalk {
  /** Talk title with HTML already stripped. */
  title: string;
  /** Speaker name; the live scrapers fill in a placeholder such as "DENOG Speaker". */
  speaker: string;
  /** Speaker's organisation, when the source provides it. */
  speakerOrg?: string;
  /** Talk abstract, when the source provides it. */
  abstract?: string;

  /** Event label, e.g. "DENOG17" or "RIPE 89"; stored as `NOG Talks: <event>`. */
  event: string;
  /** Agenda / event page the talk was found on. */
  eventUrl: string;
  /** Direct link to the talk, when known; otherwise a URL is derived from eventUrl. */
  talkUrl?: string;

  /** Talk or event date as a "YYYY-MM-DD" string. */
  date?: string;
  /** Conference track, when known. */
  track?: string;
}
|
||||||
|
|
||||||
|
// ── Optical / transceiver relevance keywords ──────────────────────────────────
/** Optics / transceiver terms (lowercase). Each keyword that occurs adds 3 points. */
const OPTICS_KEYWORDS = [
  "transceiver", "optical", "optics", "fiber", "fibre", "wavelength",
  "sfp", "qsfp", "osfp", "400g", "800g", "100g", "25g", "dwdm", "cwdm",
  "coherent", "pluggable", "dac", "aoc", "silicon photonics", "cpo",
  "data center", "datacenter", "dc fabric", "spine", "leaf", "cabling",
  "400zr", "800zr", "zr+", "bidi", "mpo", "lc", "dom", "ddm",
  "innolight", "coherent corp", "lumentum", "ii-vi", "finisar",
  "ciena", "infinera", "acacia", "broadcom", "marvell",
  "interconnect", "co-packaged", "lpo", "dsp",
];

// Networking keywords that often co-occur with optics topics
/** General networking terms (lowercase). Each keyword that occurs adds 1 point. */
const NETWORK_KEYWORDS = [
  "peering", "ix", "ixp", "bgp", "routing", "infrastructure",
  "network upgrade", "capacity", "bandwidth", "latency",
  "data center interconnect", "dci", "wan", "mpls", "sr-mpls",
  "hyperscaler", "cloud", "colocation", "colo",
];

/**
 * Builds a predicate that tests lowercase text for one keyword.
 *
 * Short all-letter keywords (up to four letters, e.g. "lc", "mpo", "dom", "ix")
 * must appear as a standalone token: not preceded by a letter or digit and not
 * followed by a letter, with an optional plural "s". A trailing digit is
 * allowed so "sfp28" still counts for "sfp". Without this, "lc" fires on
 * "welcome", "mpo" on "important" and "dom" on "domain".
 *
 * Every other keyword keeps plain substring matching so that plurals and
 * compounds ("transceivers", "400GbE", "400ZR+") continue to match.
 */
function buildKeywordMatcher(keyword: string): (haystack: string) => boolean {
  if (!/^[a-z]{1,4}$/.test(keyword)) {
    return (haystack) => haystack.includes(keyword);
  }
  // No "g" flag, so test() carries no state between calls.
  const token = new RegExp(`(?<![a-z0-9])${keyword}s?(?![a-z])`);
  return (haystack) => token.test(haystack);
}

// Compiled once at module load rather than on every scoreRelevance() call.
const OPTICS_MATCHERS = OPTICS_KEYWORDS.map(buildKeywordMatcher);
const NETWORK_MATCHERS = NETWORK_KEYWORDS.map(buildKeywordMatcher);

/**
 * Scores how relevant a piece of text is to optical networking.
 *
 * @param text Free text (title, abstract, speaker info); matching is case-insensitive.
 * @returns 3 points per optics keyword present plus 1 point per network keyword
 *          present. Each keyword counts at most once, however often it occurs.
 */
function scoreRelevance(text: string): number {
  const tl = text.toLowerCase();
  let score = 0;
  for (const matches of OPTICS_MATCHERS) {
    if (matches(tl)) score += 3;
  }
  for (const matches of NETWORK_MATCHERS) {
    if (matches(tl)) score += 1;
  }
  return score;
}
|
||||||
|
|
||||||
|
/** Request headers sent with every agenda-page fetch. */
function headers(): Record<string, string> {
  return {
    "User-Agent": "TIP-NOG-Crawler/1.0 (Transceiver Intelligence; research)",
    "Accept": "text/html,application/xhtml+xml",
  };
}

/**
 * Fetches a URL and returns the response body as text.
 *
 * Best-effort: any failure (network error, 20 s timeout, non-2xx status, or an
 * error while reading the body) yields an empty string instead of throwing.
 *
 * @param url Absolute URL to fetch.
 * @returns The body text, or "" on any failure.
 */
async function fetchText(url: string): Promise<string> {
  try {
    const res = await fetch(url, { headers: headers(), signal: AbortSignal.timeout(20000) });
    if (!res.ok) return "";
    // Must be awaited inside the try: a bare `return res.text()` would let a
    // body-read failure (e.g. the timeout firing mid-stream) bypass the catch
    // and reject the caller.
    return await res.text();
  } catch {
    return "";
  }
}
|
||||||
|
|
||||||
|
/**
 * Turns an HTML fragment into plain single-line text.
 *
 * Steps: replace tags with spaces, decode HTML entities, collapse whitespace
 * runs to one space, trim.
 *
 * Entities are decoded in a single pass so that already-escaped text such as
 * "&amp;lt;" becomes "&lt;" and is not decoded twice. Unknown named entities
 * and out-of-range numeric entities are left untouched.
 *
 * @param s Raw HTML fragment.
 * @returns Plain text with normalised whitespace.
 */
function cleanText(s: string): string {
  const namedEntities = new Map<string, string>([
    ["nbsp", " "],
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
  ]);

  return s
    .replace(/<[^>]+>/g, " ")
    .replace(
      /&(?:#(\d+)|#x([0-9a-f]+)|([a-z]+));/gi,
      (entity: string, dec?: string, hex?: string, name?: string): string => {
        if (name !== undefined) {
          return namedEntities.get(name.toLowerCase()) ?? entity;
        }
        const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? "", 16);
        // String.fromCodePoint throws outside 0..0x10FFFF; keep such input verbatim.
        if (!Number.isInteger(codePoint) || codePoint <= 0 || codePoint > 0x10ffff) {
          return entity;
        }
        return String.fromCodePoint(codePoint);
      },
    )
    .replace(/\s+/g, " ")
    .trim();
}
|
||||||
|
|
||||||
|
// ── DENOG ────────────────────────────────────────────────────────────────────
|
||||||
|
/**
 * Scrapes the DENOG 17, 16 and 15 agenda pages for talk titles.
 *
 * Any h2/h3/td element whose text is 10–200 characters long is treated as a
 * candidate title. Titles are de-duplicated per meeting before the 30-title
 * cap is applied, matching the NANOG and RIPE scrapers, so a title repeated
 * in several table cells does not use up the cap.
 *
 * @returns One NogTalk per distinct title; the speaker is a placeholder because
 *          the agenda markup is not parsed for speaker names.
 */
async function scrapeDENOG(): Promise<NogTalk[]> {
  const talks: NogTalk[] = [];
  // DENOG agendas at denog.de/DENOG{N}/agenda/
  for (const n of [17, 16, 15]) {
    const url = `https://www.denog.de/DENOG${n}/agenda/`;
    const html = await fetchText(url);
    if (!html) continue;

    // Extract talk titles from agenda (h2/h3 + speaker patterns)
    const talkPattern = /<(?:h[23]|td)[^>]*>\s*([^<]{10,200})<\/(?:h[23]|td)>/gi;
    const titles = new Set<string>();
    let m: RegExpExecArray | null;
    while ((m = talkPattern.exec(html)) !== null) {
      const t = cleanText(m[1]);
      // Drop bare URLs and purely numeric cells (times, room numbers).
      if (t.length > 10 && t.length < 200 && !t.startsWith("http") && !/^\d+$/.test(t)) {
        titles.add(t);
      }
    }

    for (const title of [...titles].slice(0, 30)) {
      talks.push({
        title,
        speaker: "DENOG Speaker",
        event: `DENOG${n}`,
        eventUrl: url,
        // Approximate date: DENOG15 maps to 2023, one meeting per year after that.
        date: `${2023 + (n - 15)}-01-01`,
      });
    }
  }
  return talks;
}
|
||||||
|
|
||||||
|
// ── NANOG ────────────────────────────────────────────────────────────────────
|
||||||
|
/**
 * Scrapes the NANOG 93, 92 and 91 agenda pages (HTML) for talk titles.
 *
 * Titles are taken from `<div class="…title…">`, `<h3>` and
 * `<td class="…session…">` elements, de-duplicated, and capped at 25 per meeting.
 *
 * @returns One NogTalk per distinct title, with a placeholder speaker and an
 *          approximate meeting date.
 */
async function scrapeNANOG(): Promise<NogTalk[]> {
  const talks: NogTalk[] = [];
  // NANOG agendas — scraped from the HTML agenda page of each recent meeting
  for (const meeting of [93, 92, 91]) {
    const url = `https://www.nanog.org/meetings/nanog${meeting}/agenda/`;
    const html = await fetchText(url);
    if (!html) continue;

    // NANOG uses structured agenda with talk titles in <h3> or <div class="title">
    const patterns = [
      /<div[^>]*class="[^"]*title[^"]*"[^>]*>\s*([^<]{10,200})<\/div>/gi,
      /<h3[^>]*>\s*([^<]{10,200})<\/h3>/gi,
      /<td[^>]*class="[^"]*session[^"]*"[^>]*>\s*([^<]{10,200})<\/td>/gi,
    ];
    const titles: string[] = [];
    for (const pat of patterns) {
      let m: RegExpExecArray | null;
      pat.lastIndex = 0;
      while ((m = pat.exec(html)) !== null) {
        const t = cleanText(m[1]);
        if (t.length > 10 && t.length < 200) titles.push(t);
      }
    }

    // Approximate meeting date. The previous formula dated NANOG 91/92 to 2023
    // and NANOG 93 to 2024, a year early, which pushed the talks outside the
    // recency window used by consumers of published_at.
    // NOTE(review): assumes NANOG 90 = Feb 2024 and three meetings per year
    // (Feb / Jun / Oct) — confirm against the NANOG meeting archive.
    const sinceNanog90 = meeting - 90;
    const year = 2024 + Math.floor(sinceNanog90 / 3);
    const month = ["02", "06", "10"][((sinceNanog90 % 3) + 3) % 3];

    for (const title of [...new Set(titles)].slice(0, 25)) {
      talks.push({
        title,
        speaker: "NANOG Speaker",
        event: `NANOG${meeting}`,
        eventUrl: url,
        date: `${year}-${month}-01`,
      });
    }
  }
  return talks;
}
|
||||||
|
|
||||||
|
// ── RIPE ─────────────────────────────────────────────────────────────────────
|
||||||
|
/**
 * Scrapes the RIPE 89, 88 and 87 programme pages (HTML) for session titles.
 *
 * Titles are taken from h2/h3 headings, `<div class="…slot-title…">` and
 * `<a class="…session…">` elements, de-duplicated, and capped at 30 per meeting.
 *
 * @returns One NogTalk per distinct title, with a placeholder speaker and an
 *          approximate meeting date.
 */
async function scrapeRIPE(): Promise<NogTalk[]> {
  const talks: NogTalk[] = [];
  // RIPE meetings publish their programme on a per-meeting site (scraped as HTML)
  for (const meeting of [89, 88, 87]) {
    const apiUrl = `https://ripe${meeting}.ripe.net/programme/agenda/`;
    const html = await fetchText(apiUrl);
    if (!html) continue;

    // Look for session titles
    const patterns = [
      /<h[23][^>]*>\s*([^<]{10,200})<\/h[23]>/gi,
      /<div[^>]*class="[^"]*slot-title[^"]*"[^>]*>\s*([^<]{10,200})<\/div>/gi,
      /<a[^>]*class="[^"]*session[^"]*"[^>]*>\s*([^<]{10,200})<\/a>/gi,
    ];
    const titles: string[] = [];
    for (const pat of patterns) {
      let m: RegExpExecArray | null;
      pat.lastIndex = 0;
      while ((m = pat.exec(html)) !== null) {
        const t = cleanText(m[1]);
        // "©" filters out copyright footers that match the heading patterns.
        if (t.length > 10 && t.length < 200 && !t.includes("©")) titles.push(t);
      }
    }

    // Approximate meeting date. The previous formula placed RIPE 88 in 2023.
    // NOTE(review): assumes two meetings per year, even-numbered in spring and
    // odd-numbered in autumn, with RIPE 86 = May 2023 — confirm against the
    // RIPE meeting archive.
    const year = 2023 + Math.floor((meeting - 86) / 2);
    const month = meeting % 2 === 0 ? "05" : "11";

    for (const title of [...new Set(titles)].slice(0, 30)) {
      talks.push({
        title,
        speaker: "RIPE Speaker",
        event: `RIPE ${meeting}`,
        eventUrl: apiUrl,
        date: `${year}-${month}-01`,
      });
    }
  }
  return talks;
}
|
||||||
|
|
||||||
|
// ── ENOG ─────────────────────────────────────────────────────────────────────
|
||||||
|
/**
 * Scrapes the ENOG presentations page for links whose href contains
 * "presentation" and whose link text is 10–200 characters long.
 *
 * @returns Up to 30 talks with a placeholder speaker. `talkUrl` is the absolute
 *          link target, or undefined when the href cannot be resolved.
 */
async function scrapeENOG(): Promise<NogTalk[]> {
  const pageUrl = "https://www.enog.org/presentations/";
  const talks: NogTalk[] = [];
  const html = await fetchText(pageUrl);
  if (!html) return talks;

  const linkPattern = /<a[^>]*href="([^"]*presentation[^"]*)"[^>]*>\s*([^<]{10,200})<\/a>/gi;
  let m: RegExpExecArray | null;
  while ((m = linkPattern.exec(html)) !== null) {
    const title = cleanText(m[2]);
    if (title.length > 10) {
      const href = m[1];
      let talkUrl: string | undefined;
      if (href.startsWith("http")) {
        talkUrl = href;
      } else {
        // Resolve against the page URL. Plain concatenation with the origin
        // breaks hrefs that lack a leading slash ("https://www.enog.orgfoo").
        try {
          talkUrl = new URL(href, pageUrl).href;
        } catch {
          talkUrl = undefined;
        }
      }
      talks.push({
        title,
        speaker: "ENOG Speaker",
        event: "ENOG",
        eventUrl: pageUrl,
        talkUrl,
      });
    }
  }
  return talks.slice(0, 30);
}
|
||||||
|
|
||||||
|
// ── NLNOG ────────────────────────────────────────────────────────────────────
|
||||||
|
/**
 * Collects h2/h3/h4 headings from the NLNOG Day page as talk titles.
 *
 * Headings of 10 characters or fewer, or containing "NLNOG Day", are ignored.
 *
 * @returns At most the first 20 matching headings, each with a placeholder speaker.
 */
async function scrapeNLNOG(): Promise<NogTalk[]> {
  const html = await fetchText("https://nlnog.net/nlnog-day/");
  const collected: NogTalk[] = [];
  if (!html) return collected;

  const headingRe = /<h[234][^>]*>\s*([^<]{10,200})<\/h[234]>/gi;
  for (let hit = headingRe.exec(html); hit !== null; hit = headingRe.exec(html)) {
    const heading = cleanText(hit[1]);
    const isPageHeading = heading.includes("NLNOG Day");
    if (heading.length <= 10 || isPageHeading) continue;
    collected.push({
      title: heading,
      speaker: "NLNOG Speaker",
      event: "NLNOG Day",
      eventUrl: "https://nlnog.net/nlnog-day/",
    });
  }
  return collected.slice(0, 20);
}
|
||||||
|
|
||||||
|
// ── NOG-PG / EURO-IX ─────────────────────────────────────────────────────────
|
||||||
|
/**
 * Collects h2/h3 headings of 15–200 characters from the Euro-IX
 * "IXP tools and services" page and treats them as talk titles.
 *
 * @returns At most the first 15 headings, each with a placeholder speaker.
 */
async function scrapeEuroIX(): Promise<NogTalk[]> {
  const html = await fetchText("https://www.euro-ix.net/en/forixps/ixp-tools-and-services/");
  const found: NogTalk[] = [];
  if (!html) return found;

  // The page is loosely structured, so any sufficiently long heading is taken.
  const headingRe = /<h[23][^>]*>\s*([^<]{15,200})<\/h[23]>/gi;
  for (let hit = headingRe.exec(html); hit !== null; hit = headingRe.exec(html)) {
    const heading = cleanText(hit[1]);
    if (heading.length <= 15) continue;
    found.push({
      title: heading,
      speaker: "Euro-IX Speaker",
      event: "Euro-IX Forum",
      eventUrl: "https://www.euro-ix.net",
    });
  }
  return found.slice(0, 15);
}
|
||||||
|
|
||||||
|
// ── CtxEvent cross-DB query (when ctxmeet has data) ──────────────────────────
|
||||||
|
/**
 * Pulls up to 100 recent conference talks from the ctxmeet database via dblink.
 *
 * Best-effort: if the query fails for any reason, the failure is logged and an
 * empty list is returned so the live scrapers can still run.
 *
 * NOTE(review): the dblink connection string is hard-coded with an empty
 * password — confirm this matches the deployment's auth setup.
 *
 * @returns Talks from events that started within the last 18 months, newest first.
 */
async function pullFromCtxEvent(): Promise<NogTalk[]> {
  try {
    // Direct cross-DB query via dblink (same Postgres instance) or separate pool
    // ctxmeet DB is on same server, port 5432
    const result = await pool.query(`
      SELECT * FROM dblink(
        'host=localhost port=5432 dbname=ctxmeet user=tip password=',
        $$ SELECT ct.title, ct.speaker, ct."speakerOrg", ct.abstract,
                  e.name as event, e."eventUrl", ct."startTime",
                  ct."talkType"
           FROM "ConferenceTalk" ct
           JOIN "Event" e ON e.id = ct."eventId"
           WHERE e."startDate" > NOW() - INTERVAL '18 months'
           ORDER BY e."startDate" DESC LIMIT 100 $$
      ) AS t(title text, speaker text, "speakerOrg" text, abstract text,
             event text, "eventUrl" text, "startTime" timestamptz, "talkType" text)
    `);

    return result.rows.map((r: Record<string, unknown>) => {
      // An unparseable startTime must not abort the whole batch:
      // toISOString() throws a RangeError on an invalid Date.
      const start = r.startTime ? new Date(r.startTime as string | number | Date) : undefined;
      const date =
        start && !Number.isNaN(start.getTime()) ? start.toISOString().split("T")[0] : undefined;

      return {
        title: String(r.title || ""),
        speaker: String(r.speaker || ""),
        speakerOrg: String(r.speakerOrg || ""),
        abstract: r.abstract ? String(r.abstract) : undefined,
        event: String(r.event || ""),
        eventUrl: String(r["eventUrl"] || ""),
        date,
      };
    });
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`[NOG Talks] CtxEvent pull failed, continuing without it: ${reason}`);
    return [];
  }
}
|
||||||
|
|
||||||
|
// ── Store relevant talks in TIP DB ───────────────────────────────────────────
|
||||||
|
/**
 * Scores each talk and writes the relevant ones to the TIP database.
 *
 * - Talks scoring below 2 are skipped.
 * - Every other talk is upserted into news_articles, keyed on source_url.
 * - Talks scoring 6 or more are additionally inserted into market_intelligence.
 *
 * Database errors are logged and do not abort the run. A talk only counts as
 * stored when its news_articles upsert succeeded.
 *
 * @param talks Talks collected from all sources.
 * @returns `stored` = talks written to news_articles; `skipped` = talks below
 *          the relevance threshold. Talks whose insert failed are in neither count.
 */
async function storeTalks(talks: NogTalk[]): Promise<{ stored: number; skipped: number }> {
  let stored = 0;
  let skipped = 0;
  let failed = 0;

  const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  for (const talk of talks) {
    const fullText = `${talk.title} ${talk.abstract || ""} ${talk.speaker} ${talk.speakerOrg || ""}`;
    const score = scoreRelevance(fullText);
    if (score < 2) { skipped++; continue; }

    // Raw keyword score mapped onto 0..1; a score of 15 or more saturates at 1.
    const relevanceScore = Math.min(1, score / 15);
    const hash = contentHash({ source: "nog-talk", event: talk.event, title: talk.title });
    // Make source_url unique per talk using hash suffix
    const uniqueUrl = talk.talkUrl || `${talk.eventUrl}#talk-${hash.substring(0, 8)}`;
    const byline = `${talk.speaker}${talk.speakerOrg ? ` (${talk.speakerOrg})` : ""}`;
    const publishedAt = talk.date ? new Date(talk.date) : new Date();

    // Store in news_articles
    let newsStored = true;
    try {
      await pool.query(`
        INSERT INTO news_articles (
          title, source, source_url, summary, published_at,
          category, relevance_score, content_hash, tags
        ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
        ON CONFLICT (source_url) DO UPDATE SET
          relevance_score = GREATEST(news_articles.relevance_score, EXCLUDED.relevance_score),
          content_hash = EXCLUDED.content_hash
      `, [
        talk.title,
        `NOG Talks: ${talk.event}`,
        uniqueUrl,
        talk.abstract
          ? `${byline}: ${talk.abstract.substring(0, 400)}`
          : `${byline} — ${talk.event}`,
        publishedAt,
        "event",
        relevanceScore,
        hash,
        JSON.stringify(["nog", talk.event.toLowerCase().replace(/\s+/g, "-"), "conference"]),
      ]);
    } catch (err: unknown) {
      newsStored = false;
      failed++;
      console.warn(`[NOG Talks] news_articles insert failed for "${talk.title}": ${describe(err)}`);
    }

    // High-relevance talks also go into market_intelligence
    if (score >= 6) {
      const signalText = talk.title + " " + (talk.abstract || "");
      const intelType = detectIntelType(signalText);
      const buySignal = detectBuySignal(signalText);

      try {
        await pool.query(`
          INSERT INTO market_intelligence (
            intel_type, title, summary, relevance_score,
            technologies, buy_signal_implication, source_url,
            source_name, published_at, is_demo
          ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, false)
          ON CONFLICT DO NOTHING
        `, [
          intelType,
          `[${talk.event}] ${talk.title}`,
          `${byline}: ${(talk.abstract || talk.title).substring(0, 500)}`,
          relevanceScore,
          extractTechnologies(fullText),
          buySignal,
          talk.talkUrl || talk.eventUrl,
          `NOG Talks: ${talk.event}`,
          publishedAt,
        ]);
      } catch (err: unknown) {
        console.warn(`[NOG Talks] market_intelligence insert failed for "${talk.title}": ${describe(err)}`);
      }
    }

    if (newsStored) stored++;
  }

  if (failed > 0) {
    console.warn(`[NOG Talks] ${failed} relevant talk(s) could not be written to news_articles`);
  }

  return { stored, skipped };
}
|
||||||
|
|
||||||
|
/**
 * Classifies a talk into a market_intelligence intel type by keyword.
 *
 * Rules are checked in order and the first rule with any needle present in
 * the lowercased text wins. Text matching no rule is "supply_chain".
 *
 * @param text Talk title plus abstract.
 * @returns The intel_type label.
 */
function detectIntelType(text: string): string {
  const haystack = text.toLowerCase();
  const rules: ReadonlyArray<readonly [label: string, needles: readonly string[]]> = [
    ["supply_chain", ["deploy", "migration", "upgrade"]],
    ["technology_launch", ["400g", "800g", "100g"]],
    ["price_movement", ["price", "cost", "budget"]],
    ["standard_draft", ["standard", "spec", "rfc"]],
    ["capex_cycle", ["capex", "investment", "market"]],
    ["distributor_lead_time", ["lead time", "availability", "shortage"]],
  ];

  const winner = rules.find(([, needles]) => needles.some((needle) => haystack.includes(needle)));
  return winner ? winner[0] : "supply_chain";
}
|
||||||
|
|
||||||
|
/**
 * Derives a buy-signal label from keywords in a talk's text.
 *
 * Checked in order: bullish (deployment / scaling terms), opportunity (supply
 * constraints), bearish (wait-for-next-generation terms). Falls back to "neutral".
 *
 * @param text Talk title plus abstract.
 * @returns "bullish", "opportunity", "bearish" or "neutral".
 */
function detectBuySignal(text: string): string {
  const haystack = text.toLowerCase();
  const mentionsAny = (...needles: string[]): boolean =>
    needles.some((needle) => haystack.includes(needle));

  if (mentionsAny("deploy", "migration", "upgrade", "scale")) {
    return "bullish";
  }
  if (mentionsAny("shortage", "lead time", "limited")) {
    return "opportunity";
  }
  if (mentionsAny("wait", "next gen", "coming soon")) {
    return "bearish";
  }
  return "neutral";
}
|
||||||
|
|
||||||
|
/**
 * Builds a predicate that tests lowercase text for one technology key.
 *
 * - All-letter keys ("mpo", "dac", "cpo", …) must stand alone as a token,
 *   with an optional plural "s", so "mpo" no longer fires on "important" or
 *   "components".
 * - Letter-then-alphanumeric keys ("sfp28", "qsfp28") must not be preceded by
 *   a letter or digit, so "qsfp28" is not also reported as SFP28.
 * - Everything else ("400g", "zr+", "qsfp-dd", "silicon photonics") keeps
 *   plain substring matching.
 */
function buildTechMatcher(key: string): (haystack: string) => boolean {
  if (/^[a-z]+$/.test(key)) {
    const token = new RegExp(`(?<![a-z0-9])${key}s?(?![a-z])`);
    return (haystack) => token.test(haystack);
  }
  if (/^[a-z][a-z0-9]*$/.test(key)) {
    const prefixBound = new RegExp(`(?<![a-z0-9])${key}`);
    return (haystack) => prefixBound.test(haystack);
  }
  return (haystack) => haystack.includes(key);
}

/** Display label plus matcher for each technology, in reporting order. Built once. */
const TECH_MATCHERS: ReadonlyArray<readonly [label: string, matches: (haystack: string) => boolean]> =
  Object.entries({
    "400g": "400G", "800g": "800G", "100g": "100G", "25g": "25G", "10g": "10G",
    "sfp28": "SFP28", "qsfp28": "QSFP28", "qsfp-dd": "QSFP-DD", "osfp": "OSFP",
    "zr+": "400ZR+", "silicon photonics": "Silicon Photonics", "cpo": "CPO",
    "dwdm": "DWDM", "cwdm": "CWDM", "lpo": "LPO", "dac": "DAC", "aoc": "AOC",
    "mpo": "MPO", "bidi": "BiDi",
  }).map(([key, label]) => [label, buildTechMatcher(key)] as const);

/**
 * Lists the transceiver / optics technologies mentioned in a text.
 *
 * @param text Free text; matching is case-insensitive.
 * @returns Distinct technology labels in a fixed order (speeds first, then
 *          form factors, then the remaining technologies).
 */
function extractTechnologies(text: string): string[] {
  const tl = text.toLowerCase();
  const techs: string[] = [];
  for (const [label, matches] of TECH_MATCHERS) {
    if (matches(tl)) techs.push(label);
  }
  return [...new Set(techs)];
}
|
||||||
|
|
||||||
|
// ── Main export ──────────────────────────────────────────────────────────────
|
||||||
|
/**
 * Entry point for the scrape:nog-talks job.
 *
 * Pulls talks from the ctxmeet DB, scrapes every live NOG source in parallel,
 * then scores and stores the combined list. Each live source is isolated: if
 * one scraper throws, it is logged and contributes no talks, instead of
 * rejecting the whole run and discarding what the other sources collected.
 */
export async function scrapeNogTalks(): Promise<void> {
  console.log("[NOG Talks] Starting NOG conference talks scraper...");

  // First try CtxEvent DB (when populated)
  const ctxTalks = await pullFromCtxEvent();
  console.log(`[NOG Talks] CtxEvent DB: ${ctxTalks.length} talks`);

  // Runs one scraper and turns a rejection into an empty result plus a warning.
  const isolate = async (label: string, scrape: () => Promise<NogTalk[]>): Promise<NogTalk[]> => {
    try {
      return await scrape();
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(`[NOG Talks] ${label} scraper failed: ${reason}`);
      return [];
    }
  };

  // Always scrape live NOG sources
  const [denog, nanog, ripe, enog, nlnog, euroix] = await Promise.all([
    isolate("DENOG", scrapeDENOG),
    isolate("NANOG", scrapeNANOG),
    isolate("RIPE", scrapeRIPE),
    isolate("ENOG", scrapeENOG),
    isolate("NLNOG", scrapeNLNOG),
    isolate("EuroIX", scrapeEuroIX),
  ]);

  const allTalks = [...ctxTalks, ...denog, ...nanog, ...ripe, ...enog, ...nlnog, ...euroix];
  console.log(`[NOG Talks] Collected ${allTalks.length} talks total (DENOG:${denog.length} NANOG:${nanog.length} RIPE:${ripe.length} ENOG:${enog.length} NLNOG:${nlnog.length} EuroIX:${euroix.length} CtxEvent:${ctxTalks.length})`);

  const { stored, skipped } = await storeTalks(allTalks);
  console.log(`[NOG Talks] Done — stored:${stored} (optics-relevant) skipped:${skipped} (not relevant)`);
}
|
||||||
Loading…
x
Reference in New Issue
Block a user