/**
 * Community Issues Scraper
 *
 * Scrapes known issues, bugs and incompatibilities from community search pages.
 *
 * Sources currently configured in COMMUNITY_SOURCES:
 * - Reddit r/networking, r/homelab
 * - ServeTheHome forums
 * - Arista Community / EOS Central
 * - Cisco Community
 * - NetworkEngineering StackExchange
 *
 * Sources named in the original plan but NOT configured in this file:
 * - Reddit r/sysadmin
 * - Juniper Community
 * - GitHub Issues (for SONiC, OpenConfig, etc.)
 *
 * Uses Crawler LLM to extract structured issue data.
 */

import { CheerioCrawler, RequestQueue } from "crawlee";
import { extractMarketIntel } from "../crawler-llm/core";
import { db as pool } from "../utils/db";
import { logger } from "../utils/logger";

/** Severity levels stored in `product_issues.severity`. */
type IssueSeverity = "info" | "warning" | "critical";

/** One issue record as it is handed to `saveIssue`. */
interface ExtractedIssue {
  productModel: string;
  title: string;
  summary: string;
  severity: IssueSeverity;
  issueTags: string[];
  affectedFirmware: string | null;
  fixFirmware: string | null;
  dateReported: string | null;
  isResolved: boolean;
  confidence: number;
}

// ─────────────────────────────────────────────────────────────────────────────
// Search URL builders per source
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Community sources, in priority order.
 *
 * Order matters: `scrapeProductIssues` takes the first `sourceLimit` entries,
 * so a limit of 2 queries only the two Reddit sources.
 */
const COMMUNITY_SOURCES: Array<{
  name: string;
  type: string;
  buildSearchUrl: (model: string) => string;
}> = [
  {
    name: "Reddit r/networking",
    type: "reddit",
    buildSearchUrl: (model) =>
      `https://www.reddit.com/r/networking/search/?q=${encodeURIComponent(model + " issue")}&sort=relevance&t=all`,
  },
  {
    name: "Reddit r/homelab",
    type: "reddit",
    buildSearchUrl: (model) =>
      `https://www.reddit.com/r/homelab/search/?q=${encodeURIComponent(model)}&sort=relevance&t=all`,
  },
  {
    name: "ServeTheHome",
    type: "forum",
    buildSearchUrl: (model) =>
      `https://forums.servethehome.com/index.php?search/1/?q=${encodeURIComponent(model)}&t=post&c[users]=&o=date`,
  },
  {
    name: "Arista Community",
    type: "vendor_kb",
    buildSearchUrl: (model) => `https://eos.arista.com/?s=${encodeURIComponent(model)}`,
  },
  {
    name: "Cisco Community",
    type: "vendor_kb",
    buildSearchUrl: (model) =>
      `https://community.cisco.com/t5/forums/searchpage/tab/message?q=${encodeURIComponent(model + " transceiver issue")}&collapse_discussion=true`,
  },
  {
    name: "NetworkEngineering SE",
    type: "forum",
    buildSearchUrl: (model) =>
      `https://networkengineering.stackexchange.com/search?q=${encodeURIComponent(model)}`,
  },
];

// ─────────────────────────────────────────────────────────────────────────────
// Determine severity from extracted intel
// ─────────────────────────────────────────────────────────────────────────────

// `\bcve` avoids matching inside words such as "MCVE".
const CRITICAL_PATTERN = /security|vulnerability|\bcve|crash|data loss|critical/;

// `\bbug` matches "bug", "bugs", "buggy", "bugfix" but not "debug".
const WARNING_PATTERN =
  /not working|incompatib|failure|not recognized|port down|\bbug|workaround/;

/**
 * Classifies free text into a severity bucket by keyword.
 *
 * Critical keywords are checked first, so text containing both a critical and
 * a warning keyword is classified as "critical".
 *
 * @param text - Free text (title and/or summary); matching is case-insensitive.
 * @returns "critical", "warning", or "info" when no keyword matches.
 */
function determineSeverity(text: string): IssueSeverity {
  const lower = text.toLowerCase();
  if (CRITICAL_PATTERN.test(lower)) return "critical";
  if (WARNING_PATTERN.test(lower)) return "warning";
  return "info";
}

// ─────────────────────────────────────────────────────────────────────────────
// Extract issue tags from text
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Keyword pattern → tag. Each tag appears exactly once in this table.
 *
 * Short tokens are anchored with `\b` so they do not fire inside unrelated
 * words ("dom" in "random", "eos" in "videos", "lag" in "flag", "poe" in
 * "poem"). Patterns are tested against lower-cased text.
 */
const ISSUE_TAG_PATTERNS: ReadonlyArray<readonly [RegExp, string]> = [
  [/firmware|\b[cv]?eos|junos|nxos|iosxe/, "firmware"], // also vEOS / cEOS
  [/interop|compatibility|third.party/, "interop"],
  [/macsec|encryption|security/, "macsec"],
  [/temperature|thermal|overheating/, "thermal"],
  [/\bdom\b|digital optical|\bddm\b/, "dom"],
  [/breakout|split|qsa|adapter/, "breakout"],
  [/sfp\+?|qsfp|osfp|cfp/, "transceiver"],
  [/vxlan|evpn|bgp|ospf/, "routing"],
  [/\bpoe\b|power/, "poe"],
  [/stacking|\bm?lags?\b|lacp/, "stacking"], // also MLAG
  [/memory|buffer|overflow/, "memory"],
  [/driver|module|kernel/, "driver"],
  [/snmp|telemetry|monitoring/, "monitoring"],
  [/latency|performance|throughput/, "performance"],
];

/**
 * Derives issue tags from free text by keyword matching.
 *
 * @param text - Free text (title and/or summary); matching is case-insensitive.
 * @returns Matching tags in table order, without duplicates.
 */
function extractIssueTags(text: string): string[] {
  const lower = text.toLowerCase();
  // No de-duplication needed: each tag occurs once in ISSUE_TAG_PATTERNS.
  return ISSUE_TAG_PATTERNS.filter(([pattern]) => pattern.test(lower)).map(([, tag]) => tag);
}

// ─────────────────────────────────────────────────────────────────────────────
// Save extracted issues to DB
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Inserts one issue into `product_issues`, linking it to a known switch and/or
 * transceiver when one matches `issue.productModel`.
 *
 * Issues for unknown products are still stored (with null foreign keys) so the
 * model name is available for later lookup. Conflicting rows are skipped via
 * `ON CONFLICT DO NOTHING`.
 *
 * @param issue - The issue to store.
 * @param sourceUrl - URL of the page the issue was extracted from.
 * @param sourceName - Display name of the source.
 * @param sourceType - Source category (e.g. "reddit", "forum", "vendor_kb").
 */
async function saveIssue(
  issue: ExtractedIssue,
  sourceUrl: string,
  sourceName: string,
  sourceType: string
): Promise<void> {
  // Part of the model before the first "-", used for the fuzzy switch match.
  // Fall back to the full model so a leading "-" cannot produce '%%', which
  // would match every row.
  const modelPrefix = issue.productModel.split("-")[0] || issue.productModel;
  const transceiverSlug = issue.productModel.toLowerCase().replace(/\s+/g, "-");

  // The two lookups are independent, so run them concurrently.
  const [switchResult, transceiverResult] = await Promise.all([
    pool.query(
      // Without the ORDER BY, LIMIT 1 could return an arbitrary fuzzy (prefix)
      // match even when an exact model match exists.
      `SELECT id FROM switches
       WHERE model ILIKE $1 OR model ILIKE '%' || $2 || '%'
       ORDER BY (model ILIKE $1) DESC
       LIMIT 1`,
      [issue.productModel, modelPrefix]
    ),
    pool.query(
      `SELECT id FROM transceivers
       WHERE part_number ILIKE $1 OR slug ILIKE $2
       LIMIT 1`,
      [issue.productModel, transceiverSlug]
    ),
  ]);

  const switchId = switchResult.rows[0]?.id ?? null;
  const transceiverId = transceiverResult.rows[0]?.id ?? null;

  if (switchId == null && transceiverId == null) {
    // Unknown product — still store with model name for future lookup
    logger.debug(`Issue for unknown product: ${issue.productModel}`);
  }

  await pool.query(
    `INSERT INTO product_issues (
       switch_id, transceiver_id, product_model, source_type, source_name,
       source_url, title, summary, severity, issue_tags, affected_firmware,
       fix_firmware, date_reported, is_resolved, confidence
     ) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15)
     ON CONFLICT DO NOTHING`,
    [
      switchId,
      transceiverId,
      issue.productModel,
      sourceType,
      sourceName,
      sourceUrl,
      issue.title,
      issue.summary,
      issue.severity,
      issue.issueTags,
      issue.affectedFirmware,
      issue.fixFirmware,
      issue.dateReported,
      issue.isResolved,
      issue.confidence,
    ]
  );
}

// ─────────────────────────────────────────────────────────────────────────────
// Main: scrape community issues for given switch/transceiver models
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Crawls community search pages for each model and stores any issue the
 * extractor reports.
 *
 * One request is queued per model × source, using the first `sourceLimit`
 * entries of COMMUNITY_SOURCES. Extraction or save failures for a page are
 * logged and do not abort the crawl.
 *
 * @param models - Product model names to search for.
 * @param sourceLimit - How many sources (from the start of COMMUNITY_SOURCES) to query per model.
 */
export async function scrapeProductIssues(
  models: string[],
  sourceLimit = 3
): Promise<void> {
  // NOTE(review): this is a *named* queue. If it persists between runs, requests
  // whose uniqueKey was already handled may be skipped on later runs — confirm
  // against the Crawlee RequestQueue documentation.
  const queue = await RequestQueue.open("community-issues");

  // Add search requests for each model × source combination
  const sources = COMMUNITY_SOURCES.slice(0, sourceLimit);
  for (const model of models) {
    for (const source of sources) {
      await queue.addRequest({
        url: source.buildSearchUrl(model),
        userData: { model, sourceName: source.name, sourceType: source.type },
        uniqueKey: `${source.name}-${model}`,
      });
    }
  }

  const crawler = new CheerioCrawler({
    requestQueue: queue,
    maxConcurrency: 2,
    requestHandlerTimeoutSecs: 30,
    navigationTimeoutSecs: 20,

    async requestHandler({ request, $ }) {
      const { model, sourceName, sourceType } = request.userData as {
        model: string;
        sourceName: string;
        sourceType: string;
      };

      // Extract text content for LLM analysis
      // Remove nav, scripts, ads for cleaner input
      $("nav, script, style, .ad, #sidebar, footer, header").remove();
      const pageText = $("body").text().replace(/\s+/g, " ").substring(0, 8000);

      // Too little text to be worth an extraction call.
      if (pageText.length < 100) return;

      // NOTE(review): an issue-specific prompt (asking for a JSON array with
      // title/summary/severity/affectedFirmware/fixFirmware/isResolved/tags) used
      // to be built here but was never passed to extractMarketIntel. It was
      // removed as dead code. As a result affectedFirmware, fixFirmware and
      // isResolved are always stored as defaults, and at most one issue is
      // saved per page.
      try {
        const intelResult = await extractMarketIntel(pageText, request.url, sourceName);
        if (intelResult && intelResult.title) {
          const summary = intelResult.summary ?? "";
          // Classify on title AND summary; using the summary alone ignored
          // severity/tag keywords that appear only in the title.
          const classificationText = summary
            ? `${intelResult.title} ${summary}`
            : intelResult.title;

          const issue: ExtractedIssue = {
            productModel: model,
            title: intelResult.title.substring(0, 200),
            summary: summary.substring(0, 500),
            severity: determineSeverity(classificationText),
            issueTags: extractIssueTags(classificationText),
            affectedFirmware: null,
            fixFirmware: null,
            dateReported: intelResult.published_at || null,
            isResolved: false,
            // `??` keeps an explicit confidence of 0 instead of replacing it.
            confidence: intelResult.confidence ?? 0.6,
          };

          await saveIssue(issue, request.url, sourceName, sourceType);
          logger.info(`Issue saved: ${model} — ${issue.title.substring(0, 60)}`);
        }
      } catch (err: unknown) {
        logger.warn(`Issue extraction failed for ${model} from ${sourceName}`, { err });
      }
    },

    failedRequestHandler: ({ request, error }) => {
      logger.warn(`Community scraper failed: ${request.url}`, { error });
    },
  });

  await crawler.run();
  logger.info(`Community issues scraping complete for ${models.length} models`);
}

// ─────────────────────────────────────────────────────────────────────────────
// Scrape issues for all switches in DB
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Scrapes community issues for switches that have no `product_issues` row yet,
 * highest `max_speed_gbps` first.
 *
 * @param limit - Maximum number of switches to process in this run.
 */
export async function scrapeAllSwitchIssues(limit = 30): Promise<void> {
  const result = await pool.query<{ model: string }>(
    `SELECT sw.model
     FROM switches sw
     WHERE NOT EXISTS (
       SELECT 1 FROM product_issues pi WHERE pi.product_model = sw.model
     )
     ORDER BY sw.max_speed_gbps DESC
     LIMIT $1`,
    [limit]
  );

  const models = result.rows.map((row) => row.model);
  if (models.length === 0) {
    logger.info("All switches already have issue data");
    return;
  }

  logger.info(`Scraping community issues for ${models.length} switches`);
  await scrapeProductIssues(models, 2); // 2 sources per switch to avoid rate limits
}

// ─────────────────────────────────────────────────────────────────────────────
// Scrape datasheet links for switches/transceivers
// ─────────────────────────────────────────────────────────────────────────────

/** A vendor plus a function that guesses a document URL from a model name. */
interface DatasheetSource {
  vendor: string;
  /** Returns a candidate URL, or null when the model does not fit the vendor's scheme. */
  pattern: (model: string) => string | null;
}

/**
 * Candidate-URL builders per vendor. The URLs are guesses;
 * `findAndSeedDatasheetLinks` only stores those that answer a HEAD request
 * with a 2xx status.
 */
const DATASHEET_SOURCES: DatasheetSource[] = [
  {
    vendor: "Arista",
    pattern: (model) => {
      const series = model.match(/DCS-(\d+)/)?.[1];
      if (!series) return null;
      return `https://www.arista.com/en/products/fixedconfiguration/${series.toLowerCase()}`;
    },
  },
  {
    vendor: "Cisco",
    pattern: (model) =>
      `https://www.cisco.com/c/en/us/products/collateral/switches/search.html?q=${encodeURIComponent(model)}`,
  },
  {
    vendor: "Juniper",
    pattern: (model) => {
      const series = model.split("-")[0]?.toLowerCase();
      if (!series) return null;
      return `https://www.juniper.net/documentation/product/${series}.html`;
    },
  },
  {
    vendor: "HPE Aruba",
    pattern: (model) =>
      `https://h20195.www2.hpe.com/v2/getpdf.aspx/a00${model.replace(/[^a-z0-9]/gi, "").toLowerCase()}.pdf`,
  },
];

/**
 * Checks whether `url` answers a HEAD request with a 2xx status.
 *
 * Network errors and timeouts count as "not reachable" rather than failures.
 * The abort timer is always cleared, including when the request throws.
 *
 * @param url - URL to probe.
 * @param timeoutMs - Abort the request after this many milliseconds.
 */
async function isUrlReachable(url: string, timeoutMs = 5000): Promise<boolean> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const resp = await fetch(url, { method: "HEAD", signal: controller.signal });
    return resp.ok;
  } catch {
    // URL not accessible — treated as unreachable, caller skips it
    return false;
  } finally {
    clearTimeout(timer);
  }
}

/**
 * For switches without any `product_documents` row, guesses vendor document
 * URLs and stores those that are reachable.
 *
 * A source applies when the switch's vendor name contains the source's vendor
 * string (case-insensitive). URLs containing ".pdf" are stored as "datasheet",
 * everything else as "product_page". Insert failures are logged and do not
 * stop the run.
 *
 * @param limit - Maximum number of switches to process in this run.
 */
export async function findAndSeedDatasheetLinks(limit = 50): Promise<void> {
  const result = await pool.query<{ id: string; model: string; vendor_name: string }>(
    `SELECT sw.id, sw.model, v.name AS vendor_name
     FROM switches sw
     JOIN vendors v ON sw.vendor_id = v.id
     WHERE NOT EXISTS (
       SELECT 1 FROM product_documents pd WHERE pd.switch_id = sw.id
     )
     LIMIT $1`,
    [limit]
  );

  for (const sw of result.rows) {
    const vendorName = sw.vendor_name.toLowerCase();

    for (const source of DATASHEET_SOURCES) {
      if (!vendorName.includes(source.vendor.toLowerCase())) continue;

      const url = source.pattern(sw.model);
      if (!url) continue;

      // Check if URL is accessible (simple HEAD request)
      if (!(await isUrlReachable(url))) continue;

      const docType = url.includes(".pdf") ? "datasheet" : "product_page";
      try {
        await pool.query(
          `INSERT INTO product_documents (switch_id, doc_type, title, source_url, is_official, language)
           VALUES ($1, $2, $3, $4, TRUE, 'en')
           ON CONFLICT DO NOTHING`,
          [sw.id, docType, `${sw.vendor_name} ${sw.model} ${docType.replace("_", " ")}`, url]
        );
        logger.info(`✓ Doc linked: ${sw.model} → ${url}`);
      } catch (err: unknown) {
        // Database errors used to be swallowed together with network errors;
        // surface them, but keep processing the remaining switches.
        logger.warn(`Failed to store document link for ${sw.model}`, { err });
      }
    }
  }
}

// ─────────────────────────────────────────────────────────────────────────────
// CLI entrypoint
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Parses the optional numeric limit argument.
 *
 * @param raw - Raw CLI argument; missing or empty means "use the fallback".
 * @param fallback - Value used when no argument was given.
 * @throws Error when the argument is not a positive integer.
 */
function parseLimitArg(raw: string | undefined, fallback: number): number {
  if (!raw) return fallback;
  const parsed = Number.parseInt(raw, 10);
  if (!Number.isInteger(parsed) || parsed <= 0) {
    throw new Error(`Invalid limit "${raw}": expected a positive integer`);
  }
  return parsed;
}

/**
 * Runs the command given on the command line:
 * `issues [limit]` (default) or `datasheets [limit]`.
 *
 * @throws Error on an unknown command or an invalid limit.
 */
async function runCli(): Promise<void> {
  const cmd = process.argv[2] || "issues";
  const rawLimit = process.argv[3];

  if (cmd === "issues") {
    await scrapeAllSwitchIssues(parseLimitArg(rawLimit, 30));
  } else if (cmd === "datasheets") {
    await findAndSeedDatasheetLinks(parseLimitArg(rawLimit, 50));
  } else {
    throw new Error(`Unknown command "${cmd}": expected "issues" or "datasheets"`);
  }
}

if (require.main === module) {
  runCli().then(
    () => process.exit(0),
    (err: unknown) => {
      // Exit non-zero so schedulers/CI can see the failure.
      console.error("Community issues scraper failed:", err);
      process.exit(1);
    }
  );
}