Rene Fichtmueller 4020ec77d9 feat: product intelligence layer — eBay enricher, community issues, datasheets+manuals API
- Migration 020: product_issues table, condition/marketplace on price_observations, features JSONB
- eBay enricher: switch features/description/refurb prices + transceiver condition pricing
- Community issues scraper: Reddit/ServeTheHome/Arista/Cisco community bug reports
- 7 pre-seeded issues (DCS-7800R3, SG350, QFX5120, CRS326, USW-Pro etc.)
- API: /switches/:id/issues + /switches/:id/documents endpoints
- Dashboard switch modal: features from DB, description, eBay refurb price, issues+docs async
- Datasheet finder for Arista/Cisco/Juniper/HPE vendor pages
- Scheduler: 4 new jobs (ebay enrichment nightly, community issues weekly)
2026-04-01 22:46:27 +02:00

392 lines
16 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Community Issues Scraper
*
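* Scrapes known issues, bugs and incompatibilities from community sources.
* Search URL builders currently exist (see COMMUNITY_SOURCES) for:
* - Reddit r/networking, r/homelab
* - ServeTheHome forums
* - Arista Community / EOS Central
* - Cisco Community
* - NetworkEngineering StackExchange
* Listed as targets, but without a search URL builder in this file yet:
* - Reddit r/sysadmin
* - Juniper Community
* - GitHub Issues (for SONiC, OpenConfig, etc.)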
*
* Uses Crawler LLM to extract structured issue data.
*/
import { CheerioCrawler, RequestQueue } from "crawlee";
import { extractMarketIntel } from "../crawler-llm/core";
import { db as pool } from "../utils/db";
import { logger } from "../utils/logger";
/**
 * One community-reported problem, normalised into the shape that
 * `saveIssue` writes to the `product_issues` table.
 */
interface ExtractedIssue {
  /** Model string the search was run for; also used to look up the product row. */
  productModel: string;

  /** Short headline for the issue. */
  title: string;

  /** Short explanation of the issue; may be an empty string. */
  summary: string;

  /** Coarse impact grading. */
  severity: "info" | "warning" | "critical";

  /** Topic tags (for example "firmware" or "dom"). */
  issueTags: string[];

  /** Firmware version the issue occurs on, or null when not known. */
  affectedFirmware: string | null;

  /** Firmware version that fixes the issue, or null when not known. */
  fixFirmware: string | null;

  /** Date the issue was reported, as a string, or null when not known. */
  dateReported: string | null;

  /** Whether the issue is considered resolved. */
  isResolved: boolean;

  /** Confidence score attached to the extraction. */
  confidence: number;
}
// ─────────────────────────────────────────────────────────────────────────────
// Search URL builders per source
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Community sites that can be searched for a product model.
 *
 * Each entry carries a display name, a coarse source category and a function
 * that turns a model string into that site's search URL. The model is always
 * URL-encoded before it is placed in the query string. Order matters to
 * callers that only take the first N entries.
 */
const COMMUNITY_SOURCES: {
  name: string;
  type: string;
  buildSearchUrl: (model: string) => string;
}[] = [
  {
    name: "Reddit r/networking",
    type: "reddit",
    buildSearchUrl(model) {
      // This source narrows the search by appending the word "issue".
      const query = encodeURIComponent(model + " issue");
      return `https://www.reddit.com/r/networking/search/?q=${query}&sort=relevance&t=all`;
    },
  },
  {
    name: "Reddit r/homelab",
    type: "reddit",
    buildSearchUrl(model) {
      const query = encodeURIComponent(model);
      return `https://www.reddit.com/r/homelab/search/?q=${query}&sort=relevance&t=all`;
    },
  },
  {
    name: "ServeTheHome",
    type: "forum",
    buildSearchUrl(model) {
      const query = encodeURIComponent(model);
      return `https://forums.servethehome.com/index.php?search/1/?q=${query}&t=post&c[users]=&o=date`;
    },
  },
  {
    name: "Arista Community",
    type: "vendor_kb",
    buildSearchUrl(model) {
      const query = encodeURIComponent(model);
      return `https://eos.arista.com/?s=${query}`;
    },
  },
  {
    name: "Cisco Community",
    type: "vendor_kb",
    buildSearchUrl(model) {
      // This source appends "transceiver issue" to every query.
      const query = encodeURIComponent(model + " transceiver issue");
      return `https://community.cisco.com/t5/forums/searchpage/tab/message?q=${query}&collapse_discussion=true`;
    },
  },
  {
    name: "NetworkEngineering SE",
    type: "forum",
    buildSearchUrl(model) {
      const query = encodeURIComponent(model);
      return `https://networkengineering.stackexchange.com/search?q=${query}`;
    },
  },
];
// ─────────────────────────────────────────────────────────────────────────────
// Determine severity from extracted intel
// ─────────────────────────────────────────────────────────────────────────────
/** Keywords that mark an issue as critical (matched against lower-cased text). */
const CRITICAL_SEVERITY_PATTERN =
  /security|vulnerability|cve|crash|data loss|critical/;

/**
 * Keywords that mark an issue as a warning (matched against lower-cased text).
 * "bug" is anchored to the start of a word so that "debug" / "debugging" do
 * not count, while "bugs", "buggy" and "bugfix" still do.
 */
const WARNING_SEVERITY_PATTERN =
  /not working|incompatib|failure|not recognized|port down|\bbug|workaround/;

/**
 * Grades free text by keyword.
 *
 * Critical keywords are checked first, so text containing both a critical and
 * a warning keyword is graded "critical". Matching is case-insensitive.
 *
 * @param text Issue title and/or description.
 * @returns "critical", "warning", or "info" when no keyword matches.
 */
function determineSeverity(text: string): "info" | "warning" | "critical" {
  const lower = text.toLowerCase();
  if (CRITICAL_SEVERITY_PATTERN.test(lower)) return "critical";
  if (WARNING_SEVERITY_PATTERN.test(lower)) return "warning";
  return "info";
}
// ─────────────────────────────────────────────────────────────────────────────
// Extract issue tags from text
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Keyword pattern -> tag table, evaluated in order against lower-cased text.
 *
 * Short acronyms (eos, dom, ddm, lag) are wrapped in word boundaries so that
 * ordinary words such as "videos", "domain", "random" or "flag" do not
 * produce a tag. Hoisted to module level so the regexes are built once.
 */
const ISSUE_TAG_PATTERNS: ReadonlyArray<readonly [RegExp, string]> = [
  // Also accepts vEOS/cEOS and the hyphenated/spaced spellings NX-OS, IOS XE, IOS-XE.
  [/firmware|\b[cv]?eos\b|junos|nx-?os|ios[- ]?xe/, "firmware"],
  // "third.party" deliberately matches "third-party" and "third party".
  [/interop|compatibility|third.party/, "interop"],
  [/macsec|encryption|security/, "macsec"],
  [/temperature|thermal|overheating/, "thermal"],
  [/\bdom\b|digital optical|\bddm\b/, "dom"],
  [/breakout|split|qsa|adapter/, "breakout"],
  [/sfp\+?|qsfp|osfp|cfp/, "transceiver"],
  [/vxlan|evpn|bgp|ospf/, "routing"],
  [/poe|power/, "poe"],
  // Accepts lag, lags, mlag, mclag and mc-lag, but not "flag".
  [/stacking|\b(?:mc?)?lags?\b|lacp/, "stacking"],
  [/memory|buffer|overflow/, "memory"],
  [/driver|module|kernel/, "driver"],
  [/snmp|telemetry|monitoring/, "monitoring"],
  [/latency|performance|throughput/, "performance"],
];

/**
 * Derives topic tags from free text by keyword matching.
 *
 * @param text Issue title and/or description; matching is case-insensitive.
 * @returns Distinct tags in table order; an empty array when nothing matches.
 */
function extractIssueTags(text: string): string[] {
  const lower = text.toLowerCase();
  const tags: string[] = [];
  for (const [pattern, tag] of ISSUE_TAG_PATTERNS) {
    if (pattern.test(lower)) tags.push(tag);
  }
  // De-duplicate in case two table rows ever share a tag.
  return [...new Set(tags)];
}
// ─────────────────────────────────────────────────────────────────────────────
// Save extracted issues to DB
// ─────────────────────────────────────────────────────────────────────────────
/** Escapes the LIKE/ILIKE metacharacters (\, %, _) so a value is matched literally. */
function escapeLikePattern(value: string): string {
  return value.replace(/[\\%_]/g, "\\$&");
}

/**
 * Persists one extracted issue into `product_issues`.
 *
 * The issue's model string is resolved to a switch id and/or a transceiver id
 * where possible. When neither lookup finds a row the issue is still stored,
 * with both ids null and the raw model string, so it can be linked later.
 *
 * @param issue      Issue to store.
 * @param sourceUrl  URL of the page the issue was extracted from.
 * @param sourceName Display name of the source.
 * @param sourceType Source category.
 * @throws Whatever the database client throws when a query fails.
 */
async function saveIssue(
  issue: ExtractedIssue,
  sourceUrl: string,
  sourceName: string,
  sourceType: string
): Promise<void> {
  const model = issue.productModel;
  const exactPattern = escapeLikePattern(model);

  // Family prefix = text before the first "-" (e.g. "DCS" from "DCS-7800R3").
  // An empty prefix is sent as NULL: '%' || NULL || '%' is NULL, so the fuzzy
  // branch matches nothing instead of matching every row via '%%'.
  const familyPrefix = model.split("-")[0]?.trim();
  const familyPattern = familyPrefix ? escapeLikePattern(familyPrefix) : null;
  const slugPattern = escapeLikePattern(model.toLowerCase().replace(/\s+/g, "-"));

  // The two lookups are independent, so run them together.
  const [switchResult, transceiverResult] = await Promise.all([
    pool.query(
      // ORDER BY puts a case-insensitive exact match ahead of family-prefix
      // matches; without it LIMIT 1 could return an arbitrary sibling model.
      `SELECT id FROM switches
        WHERE model ILIKE $1 OR model ILIKE '%' || $2 || '%'
        ORDER BY (model ILIKE $1) DESC
        LIMIT 1`,
      [exactPattern, familyPattern]
    ),
    pool.query(
      `SELECT id FROM transceivers WHERE part_number ILIKE $1 OR slug ILIKE $2 LIMIT 1`,
      [exactPattern, slugPattern]
    ),
  ]);

  const switchId = switchResult.rows[0]?.id ?? null;
  const transceiverId = transceiverResult.rows[0]?.id ?? null;

  if (!switchId && !transceiverId) {
    // Unknown product — still store with model name for future lookup
    logger.debug(`Issue for unknown product: ${issue.productModel}`);
  }

  // NOTE(review): ON CONFLICT DO NOTHING only suppresses duplicates if
  // product_issues has a unique constraint or index; none is visible from
  // this file — confirm against the migration.
  await pool.query(
    `INSERT INTO product_issues (
switch_id, transceiver_id, product_model,
source_type, source_name, source_url,
title, summary, severity, issue_tags,
affected_firmware, fix_firmware,
date_reported, is_resolved, confidence
) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15)
ON CONFLICT DO NOTHING`,
    [
      switchId, transceiverId, issue.productModel,
      sourceType, sourceName, sourceUrl,
      issue.title, issue.summary, issue.severity, issue.issueTags,
      issue.affectedFirmware, issue.fixFirmware,
      issue.dateReported, issue.isResolved, issue.confidence,
    ]
  );
}
// ─────────────────────────────────────────────────────────────────────────────
// Main: scrape community issues for given switch/transceiver models
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Searches community sources for each model, runs the page text through the
 * market-intel extractor and stores at most one issue per fetched page.
 *
 * @param models      Product model strings to search for.
 * @param sourceLimit How many entries of COMMUNITY_SOURCES (taken from the
 *                    start of the list) to query per model. Defaults to 3.
 * @returns Resolves once the crawl has finished. Extraction and save errors
 *          for a single page are logged and do not abort the crawl.
 */
export async function scrapeProductIssues(
  models: string[],
  sourceLimit = 3
): Promise<void> {
  // NOTE(review): this is a named queue and requests are keyed by
  // source + model. Named Crawlee queues are presumably kept between runs, in
  // which case a pair handled in an earlier run would be skipped — confirm
  // that this is the intended behaviour for the recurring job.
  const queue = await RequestQueue.open("community-issues");

  // The source selection does not depend on the model, so compute it once.
  const sources = COMMUNITY_SOURCES.slice(0, sourceLimit);

  // One search request per model × source combination.
  for (const model of models) {
    for (const source of sources) {
      await queue.addRequest({
        url: source.buildSearchUrl(model),
        userData: { model, sourceName: source.name, sourceType: source.type },
        uniqueKey: `${source.name}-${model}`,
      });
    }
  }

  const crawler = new CheerioCrawler({
    requestQueue: queue,
    maxConcurrency: 2,
    requestHandlerTimeoutSecs: 30,
    navigationTimeoutSecs: 20,
    async requestHandler({ request, $ }) {
      const { model, sourceName, sourceType } = request.userData as {
        model: string; sourceName: string; sourceType: string;
      };

      // Strip navigation, scripts and similar chrome, collapse whitespace and
      // cap the text at 8000 characters before handing it to the extractor.
      $("nav, script, style, .ad, #sidebar, footer, header").remove();
      const pageText = $("body").text().replace(/\s+/g, " ").substring(0, 8000);
      if (pageText.length < 100) return; // too little text to be worth analysing

      // NOTE(review): an issue-specific LLM prompt used to be built here but
      // was never passed to anything; extractMarketIntel only receives the
      // page text, URL and source name. If a dedicated prompt is wanted it has
      // to be plumbed into the extractor.
      try {
        const intelResult = await extractMarketIntel(pageText, request.url, sourceName);
        if (!intelResult || !intelResult.title) return;

        // Normalise a missing description so the literal "undefined" never
        // ends up in the text that tags are derived from.
        const description: string = intelResult.description ?? "";

        const issue: ExtractedIssue = {
          productModel: model,
          title: intelResult.title.substring(0, 200),
          summary: description.substring(0, 500),
          severity: determineSeverity(description || intelResult.title),
          issueTags: extractIssueTags(`${intelResult.title} ${description}`),
          // The extractor result is not read for firmware or resolution data.
          affectedFirmware: null,
          fixFirmware: null,
          dateReported: intelResult.publishedDate || null,
          isResolved: false,
          // `??` keeps a reported confidence of 0; only a missing value falls back.
          confidence: intelResult.confidence ?? 0.6,
        };

        await saveIssue(issue, request.url, sourceName, sourceType);
        logger.info(`Issue saved: ${model} -> ${issue.title.substring(0, 60)}`);
      } catch (err) {
        logger.warn(`Issue extraction failed for ${model} from ${sourceName}`, { err });
      }
    },
    // Crawlee passes the failure as the second argument of this handler.
    failedRequestHandler: ({ request }, error) => {
      logger.warn(`Community scraper failed: ${request.url}`, { error });
    },
  });

  await crawler.run();
  logger.info(`Community issues scraping complete for ${models.length} models`);
}
// ─────────────────────────────────────────────────────────────────────────────
// Scrape issues for all switches in DB
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Scrapes community issues for switches that have no row in
 * `product_issues` yet, taking the highest `max_speed_gbps` first.
 *
 * @param limit Maximum number of switches to process in this run (default 30).
 * @returns Resolves when scraping is done, or immediately when every switch
 *          already has issue data.
 */
export async function scrapeAllSwitchIssues(limit = 30): Promise<void> {
  const pendingSql = `SELECT sw.model FROM switches sw
WHERE NOT EXISTS (
SELECT 1 FROM product_issues pi WHERE pi.product_model = sw.model
)
ORDER BY sw.max_speed_gbps DESC
LIMIT $1`;

  const { rows } = await pool.query<{ model: string }>(pendingSql, [limit]);

  const pendingModels: string[] = [];
  for (const row of rows) {
    pendingModels.push(row.model);
  }

  if (pendingModels.length > 0) {
    logger.info(`Scraping community issues for ${pendingModels.length} switches`);
    // Only two sources per switch, to stay clear of rate limits.
    await scrapeProductIssues(pendingModels, 2);
  } else {
    logger.info("All switches already have issue data");
  }
}
// ─────────────────────────────────────────────────────────────────────────────
// Scrape datasheet links for switches/transceivers
// ─────────────────────────────────────────────────────────────────────────────
/** A vendor plus a rule for guessing a documentation URL from a model string. */
interface DatasheetSource {
  /** Vendor label; matched case-insensitively as a substring of the vendor name. */
  vendor: string;
  /** Builds a candidate URL for the model, or returns null when no guess can be made. */
  pattern: (model: string) => string | null;
}

/**
 * Per-vendor URL guesses for product pages and datasheets.
 *
 * These are candidates only: callers are expected to check that a URL
 * actually resolves before storing it.
 */
const DATASHEET_SOURCES: DatasheetSource[] = [
  {
    vendor: "Arista",
    pattern: (model) => {
      // Uses the digits that follow "DCS-" (e.g. "7050" from "DCS-7050X3").
      const series = model.match(/DCS-(\d+)/)?.[1];
      if (!series) return null;
      return `https://www.arista.com/en/products/fixedconfiguration/${series}`;
    },
  },
  {
    vendor: "Cisco",
    // Points at a search URL with the URL-encoded model as the query.
    pattern: (model) =>
      `https://www.cisco.com/c/en/us/products/collateral/switches/search.html?q=${encodeURIComponent(model)}`,
  },
  {
    vendor: "Juniper",
    pattern: (model) => {
      // Uses the lower-cased text before the first "-" (e.g. "qfx5120").
      const series = model.split("-")[0]?.toLowerCase();
      if (!series) return null;
      return `https://www.juniper.net/documentation/product/${series}.html`;
    },
  },
  {
    vendor: "HPE Aruba",
    // Strips everything except letters and digits, lower-cases the rest and
    // inserts it into a PDF URL template.
    pattern: (model) =>
      `https://h20195.www2.hpe.com/v2/getpdf.aspx/a00${model.replace(/[^a-z0-9]/gi, "").toLowerCase()}.pdf`,
  },
];
/**
 * Sends a HEAD request and reports whether it answered with an OK status
 * before the timeout. The timer is always cleared, including when the
 * request throws.
 */
async function isUrlReachable(url: string, timeoutMs = 5000): Promise<boolean> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const resp = await fetch(url, { method: "HEAD", signal: controller.signal });
    return resp.ok;
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Guesses vendor documentation URLs for switches that have no row in
 * `product_documents` yet and stores every URL that answers a HEAD request
 * with an OK status.
 *
 * Unreachable URLs and failed inserts are logged and skipped; they do not
 * abort the run.
 *
 * @param limit Maximum number of switches to process (default 50).
 * @throws Whatever the database client throws if the initial SELECT fails.
 */
export async function findAndSeedDatasheetLinks(limit = 50): Promise<void> {
  const result = await pool.query<{ id: string; model: string; vendor_name: string }>(
    `SELECT sw.id, sw.model, v.name AS vendor_name
FROM switches sw
JOIN vendors v ON sw.vendor_id = v.id
WHERE NOT EXISTS (
SELECT 1 FROM product_documents pd WHERE pd.switch_id = sw.id
)
LIMIT $1`,
    [limit]
  );

  for (const sw of result.rows) {
    const vendorName = sw.vendor_name.toLowerCase();

    for (const source of DATASHEET_SOURCES) {
      // A source applies when its vendor label is a substring of the DB vendor
      // name. NOTE(review): this means a vendor stored as just "HPE" or
      // "Aruba" does not match the "HPE Aruba" source — confirm intended.
      if (!vendorName.includes(source.vendor.toLowerCase())) continue;

      const url = source.pattern(sw.model);
      if (!url) continue;

      // Step 1: probe the guessed URL. Failure here is expected and routine.
      let reachable = false;
      try {
        reachable = await isUrlReachable(url);
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        logger.debug(`Datasheet URL not reachable for ${sw.model}: ${url} (${reason})`);
      }
      if (!reachable) continue;

      // Step 2: store the link. Kept apart from the probe so that a database
      // failure is reported instead of being mistaken for a dead URL.
      try {
        const docType = url.includes(".pdf") ? "datasheet" : "product_page";
        await pool.query(
          `INSERT INTO product_documents (switch_id, doc_type, title, source_url, is_official, language)
VALUES ($1, $2, $3, $4, TRUE, 'en')
ON CONFLICT DO NOTHING`,
          [sw.id, docType, `${sw.vendor_name} ${sw.model} ${docType.replace("_", " ")}`, url]
        );
        logger.info(`✓ Doc linked: ${sw.model} -> ${url}`);
      } catch (err) {
        logger.warn(`Failed to store document link for ${sw.model}: ${url}`, { err });
      }
    }
  }
}
// CLI entrypoint: `<script> [issues|datasheets] [limit]`
// Runs only when this file is executed directly, not when it is imported.
if (require.main === module) {
  /** Parses a non-negative integer argument; falls back when absent or invalid. */
  const parseLimit = (raw: string | undefined, fallback: number): number => {
    const parsed = Number.parseInt(raw ?? "", 10);
    // Guard against NaN / negatives reaching the SQL LIMIT clause.
    return Number.isInteger(parsed) && parsed >= 0 ? parsed : fallback;
  };

  (async () => {
    const cmd = process.argv[2] || "issues";
    if (cmd === "issues") {
      await scrapeAllSwitchIssues(parseLimit(process.argv[3], 30));
    } else if (cmd === "datasheets") {
      await findAndSeedDatasheetLinks(parseLimit(process.argv[3], 50));
    } else {
      // An unrecognised command is reported as a failure instead of a silent success.
      console.error(`Unknown command "${cmd}". Usage: <script> [issues|datasheets] [limit]`);
      process.exit(1);
    }
    process.exit(0);
  })().catch((err: unknown) => {
    // Report the failure and exit non-zero rather than leaving an unhandled rejection.
    console.error("Community issues CLI failed:", err);
    process.exit(1);
  });
}