- Register scrape:pricing:naddod (48 */2), qsfptek (52 */2), addon (55 */2) in pg-boss
- Add boss.work() handlers for all three (fetch-based, run on Erik)
- Fix findOrCreateScrapedTransceiver callers: remove invalid `name`/`url` params, fix `t.id` → `t` (function already returns string ID)
- Fix ebay-enricher: remove invalid `extractType` option, use extraction.standard_name instead of non-existent `.description`, fix cheerio type incompatibility
- Fix community-issues: description → summary, publishedDate → published_at
- Startup zombie cleanup already deployed (index.ts) — no changes needed
- ProLabs rewritten to fetch-based catalog scraper (no Playwright, bypasses WAF)
392 lines
16 KiB
TypeScript
392 lines
16 KiB
TypeScript
/**
 * Community Issues Scraper
 *
 * Scrapes known issues, bugs, and incompatibilities from:
 * - Reddit r/networking, r/homelab, r/sysadmin
 * - ServeTheHome forums
 * - Arista Community / EOS Central
 * - Cisco Community
 * - Juniper Community
 * - NetworkEngineering StackExchange
 * - GitHub Issues (for SONiC, OpenConfig, etc.)
 *
 * Note: r/sysadmin, Juniper Community, and GitHub Issues are listed above but
 * have no entry in COMMUNITY_SOURCES in this file.
 *
 * Uses Crawler LLM to extract structured issue data.
 */

import { CheerioCrawler, RequestQueue } from "crawlee";
|
||
import { extractMarketIntel } from "../crawler-llm/core";
|
||
import { db as pool } from "../utils/db";
|
||
import { logger } from "../utils/logger";
|
||
|
||
/**
 * A single community-reported issue for one product model, in the shape that
 * `saveIssue` writes to the `product_issues` table.
 */
interface ExtractedIssue {
  /** Model string the issue relates to; `saveIssue` also uses it to look up the product row. */
  productModel: string;
  /** Short headline for the issue. */
  title: string;
  /** Longer description of the issue. */
  summary: string;
  /** Impact level of the issue. */
  severity: "info" | "warning" | "critical";
  /** Topic tags attached to the issue (e.g. as produced by `extractIssueTags`). */
  issueTags: string[];
  /** Firmware version in which the issue occurs, or null when not known. */
  affectedFirmware: string | null;
  /** Firmware version that fixes the issue, or null when not known. */
  fixFirmware: string | null;
  /** When the issue was reported, as a string; null when not known. */
  dateReported: string | null;
  /** Whether the issue is considered resolved. */
  isResolved: boolean;
  /** Extraction confidence score. */
  confidence: number;
}

// ─────────────────────────────────────────────────────────────────────────────
// Search URL builders per source
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Community sites that are searched for issue reports.
 *
 * Each entry carries a display `name`, a coarse `type` label that is stored
 * alongside saved issues, and a `buildSearchUrl` function that turns a product
 * model into that site's search-results URL. The model is always passed through
 * `encodeURIComponent`; some sources append extra search words to narrow the
 * results. Order matters: callers take the first N entries.
 */
const COMMUNITY_SOURCES: Array<{
  name: string;
  type: string;
  buildSearchUrl: (model: string) => string;
}> = [
  {
    name: "Reddit r/networking",
    type: "reddit",
    buildSearchUrl: (model) =>
      `https://www.reddit.com/r/networking/search/?q=${encodeURIComponent(model + " issue")}&sort=relevance&t=all`,
  },
  {
    name: "Reddit r/homelab",
    type: "reddit",
    buildSearchUrl: (model) =>
      `https://www.reddit.com/r/homelab/search/?q=${encodeURIComponent(model)}&sort=relevance&t=all`,
  },
  {
    name: "ServeTheHome",
    type: "forum",
    buildSearchUrl: (model) =>
      `https://forums.servethehome.com/index.php?search/1/?q=${encodeURIComponent(model)}&t=post&c[users]=&o=date`,
  },
  {
    name: "Arista Community",
    type: "vendor_kb",
    buildSearchUrl: (model) =>
      `https://eos.arista.com/?s=${encodeURIComponent(model)}`,
  },
  {
    name: "Cisco Community",
    type: "vendor_kb",
    buildSearchUrl: (model) =>
      `https://community.cisco.com/t5/forums/searchpage/tab/message?q=${encodeURIComponent(model + " transceiver issue")}&collapse_discussion=true`,
  },
  {
    name: "NetworkEngineering SE",
    type: "forum",
    buildSearchUrl: (model) =>
      `https://networkengineering.stackexchange.com/search?q=${encodeURIComponent(model)}`,
  },
];

// ─────────────────────────────────────────────────────────────────────────────
// Determine severity from extracted intel
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Classifies free text into a severity level using keyword heuristics.
 *
 * Matching is case-insensitive. Critical keywords are checked first, so text
 * containing both a critical and a warning keyword is reported as "critical".
 *
 * @param text Title or summary text describing an issue.
 * @returns "critical", "warning", or "info" when no keyword matches.
 */
function determineSeverity(text: string): "info" | "warning" | "critical" {
  const lower = text.toLowerCase();

  const criticalMarkers = [
    "security",
    "vulnerability",
    "cve",
    "crash",
    "data loss",
    "critical",
  ];
  if (criticalMarkers.some((marker) => lower.includes(marker))) return "critical";

  const warningMarkers = [
    "not working",
    "incompatib", // stem: matches "incompatible" and "incompatibility"
    "failure",
    "not recognized",
    "port down",
    "workaround",
  ];
  // "bug" is anchored at a word start so that "debug" does not count as a bug
  // report, while "bug", "bugs", "buggy" and "bugfix" still do.
  if (warningMarkers.some((marker) => lower.includes(marker)) || /\bbug/.test(lower)) {
    return "warning";
  }

  return "info";
}

// ─────────────────────────────────────────────────────────────────────────────
// Extract issue tags from text
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Keyword pattern → tag table used by `extractIssueTags`.
 *
 * Patterns are matched against lower-cased text. Short tokens that commonly
 * occur inside unrelated words ("dom" in "random"/"domain", "lag" in "flag",
 * "eos" in "videos", "poe" in "poem") are wrapped in word boundaries. None of
 * the patterns use the `g` flag, so `RegExp.test` stays stateless.
 */
const ISSUE_TAG_PATTERNS: ReadonlyArray<readonly [RegExp, string]> = [
  [/firmware|\b[cv]?eos\b|junos|nxos|iosxe/, "firmware"],
  [/interop|compatibility|third.party/, "interop"],
  [/macsec|encryption|security/, "macsec"],
  [/temperature|thermal|overheating/, "thermal"],
  [/\bdom\b|digital optical|ddm/, "dom"],
  [/breakout|split|qsa|adapter/, "breakout"],
  [/sfp\+?|qsfp|osfp|cfp/, "transceiver"],
  [/vxlan|evpn|bgp|ospf/, "routing"],
  [/\bpoe\b|power/, "poe"],
  [/stacking|\blags?\b|lacp/, "stacking"],
  [/memory|buffer|overflow/, "memory"],
  [/driver|module|kernel/, "driver"],
  [/snmp|telemetry|monitoring/, "monitoring"],
  [/latency|performance|throughput/, "performance"],
];

/**
 * Derives topic tags from free text by keyword matching.
 *
 * @param text Text to scan (matching is case-insensitive).
 * @returns Matching tags in table order, each at most once; empty when nothing matches.
 */
function extractIssueTags(text: string): string[] {
  const lower = text.toLowerCase();
  // Each tag appears exactly once in the table, so the result needs no dedupe.
  return ISSUE_TAG_PATTERNS.filter(([pattern]) => pattern.test(lower)).map(([, tag]) => tag);
}

// ─────────────────────────────────────────────────────────────────────────────
// Save extracted issues to DB
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Persists one extracted issue into `product_issues`, linking it to a known
 * switch and/or transceiver row when one can be found.
 *
 * Product lookup:
 * - switches: exact (case-insensitive) model match, or a fuzzy match on the
 *   part of the model before the first "-". An exact match is preferred when
 *   both exist.
 * - transceivers: part number match, or slug match on the lower-cased model
 *   with whitespace runs replaced by "-".
 *
 * When neither lookup finds a row the issue is still stored, with both foreign
 * keys null and only `product_model` set.
 *
 * @param issue      The issue to store.
 * @param sourceUrl  URL of the page the issue was extracted from.
 * @param sourceName Display name of the community source.
 * @param sourceType Coarse source category label.
 */
async function saveIssue(
  issue: ExtractedIssue,
  sourceUrl: string,
  sourceName: string,
  sourceType: string
): Promise<void> {
  const model = issue.productModel;
  const seriesPrefix = model.split("-")[0];

  // An empty prefix (model "" or starting with "-") would turn the fuzzy
  // pattern into '%%', which matches every switch; fall back to exact-only.
  // ORDER BY puts an exact model match ahead of a fuzzy one before LIMIT 1.
  // NOTE(review): '%' and '_' inside the model are still treated as ILIKE
  // wildcards, and the prefix match is very loose (e.g. "DCS" for any
  // "DCS-…" model) — confirm this is the intended association.
  const switchLookup = seriesPrefix
    ? pool.query(
        `SELECT id FROM switches WHERE model ILIKE $1 OR model ILIKE '%' || $2 || '%' ORDER BY (model ILIKE $1) DESC LIMIT 1`,
        [model, seriesPrefix]
      )
    : pool.query(`SELECT id FROM switches WHERE model ILIKE $1 LIMIT 1`, [model]);

  const transceiverLookup = pool.query(
    `SELECT id FROM transceivers WHERE part_number ILIKE $1 OR slug ILIKE $2 LIMIT 1`,
    [model, model.toLowerCase().replace(/\s+/g, "-")]
  );

  // The two lookups are independent reads, so run them concurrently.
  const [switchResult, transceiverResult] = await Promise.all([switchLookup, transceiverLookup]);

  const switchId = switchResult.rows[0]?.id ?? null;
  const transceiverId = transceiverResult.rows[0]?.id ?? null;

  if (!switchId && !transceiverId) {
    // Unknown product — still store with model name for future lookup
    logger.debug(`Issue for unknown product: ${model}`);
  }

  await pool.query(
    `INSERT INTO product_issues (
      switch_id, transceiver_id, product_model,
      source_type, source_name, source_url,
      title, summary, severity, issue_tags,
      affected_firmware, fix_firmware,
      date_reported, is_resolved, confidence
    ) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15)
    ON CONFLICT DO NOTHING`,
    [
      switchId, transceiverId, model,
      sourceType, sourceName, sourceUrl,
      issue.title, issue.summary, issue.severity, issue.issueTags,
      issue.affectedFirmware, issue.fixFirmware,
      issue.dateReported, issue.isResolved, issue.confidence,
    ]
  );
}

// ─────────────────────────────────────────────────────────────────────────────
// Main: scrape community issues for given switch/transceiver models
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Searches community sources for each model, runs the page text through
 * `extractMarketIntel`, and stores any result that has a title via `saveIssue`.
 *
 * One search request is queued per model × source pair, using the first
 * `sourceLimit` entries of `COMMUNITY_SOURCES`. Pages with fewer than 100
 * characters of text after cleanup are skipped. At most one issue is saved per
 * page; severity and tags are derived from the extracted title/summary with
 * keyword heuristics, while firmware fields and resolution status are not
 * populated by this path.
 *
 * Extraction errors and failed requests are logged and do not abort the crawl.
 *
 * @param models      Product models to search for.
 * @param sourceLimit How many sources (from the start of the list) to query per model.
 */
export async function scrapeProductIssues(
  models: string[],
  sourceLimit = 3
): Promise<void> {
  // NOTE(review): this opens a *named* queue; presumably it persists between
  // runs, in which case a uniqueKey that was already handled would be skipped
  // on later runs — confirm against the Crawlee RequestQueue documentation.
  const queue = await RequestQueue.open("community-issues");

  // The source subset does not depend on the model, so compute it once.
  const sources = COMMUNITY_SOURCES.slice(0, sourceLimit);

  // Add search requests for each model × source combination
  for (const model of models) {
    for (const source of sources) {
      await queue.addRequest({
        url: source.buildSearchUrl(model),
        userData: { model, sourceName: source.name, sourceType: source.type },
        uniqueKey: `${source.name}-${model}`,
      });
    }
  }

  const crawler = new CheerioCrawler({
    requestQueue: queue,
    maxConcurrency: 2,
    requestHandlerTimeoutSecs: 30,
    navigationTimeoutSecs: 20,

    async requestHandler({ request, $ }) {
      const { model, sourceName, sourceType } = request.userData as {
        model: string; sourceName: string; sourceType: string;
      };

      // Strip navigation, scripts and other page chrome, collapse whitespace,
      // and cap the text length before handing it to the extractor.
      $("nav, script, style, .ad, #sidebar, footer, header").remove();
      const pageText = $("body").text().replace(/\s+/g, " ").substring(0, 8000);

      if (pageText.length < 100) return;

      try {
        const intelResult = await extractMarketIntel(pageText, request.url, sourceName);

        if (intelResult && intelResult.title) {
          // Normalise a missing summary once so it never leaks into derived
          // text as the literal string "undefined".
          const summary = intelResult.summary ?? "";

          const issue: ExtractedIssue = {
            productModel: model,
            title: intelResult.title.substring(0, 200),
            summary: summary.substring(0, 500),
            severity: determineSeverity(summary || intelResult.title),
            issueTags: extractIssueTags(`${intelResult.title} ${summary}`),
            affectedFirmware: null,
            fixFirmware: null,
            dateReported: intelResult.published_at || null,
            isResolved: false,
            // `??` keeps a genuine confidence of 0 instead of replacing it.
            confidence: intelResult.confidence ?? 0.6,
          };

          await saveIssue(issue, request.url, sourceName, sourceType);
          logger.info(`Issue saved: ${model} — ${issue.title.substring(0, 60)}`);
        }
      } catch (err) {
        logger.warn(`Issue extraction failed for ${model} from ${sourceName}`, { err });
      }
    },

    failedRequestHandler: ({ request, error }) => {
      logger.warn(`Community scraper failed: ${request.url}`, { error });
    },
  });

  await crawler.run();
  logger.info(`Community issues scraping complete for ${models.length} models`);
}

// ─────────────────────────────────────────────────────────────────────────────
// Scrape issues for all switches in DB
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Scrapes community issues for switches that have no `product_issues` row yet
 * (matched on `product_model = switches.model`), fastest switches first.
 *
 * Does nothing except log when every switch already has issue data.
 *
 * @param limit Maximum number of switch models to process in this run.
 */
export async function scrapeAllSwitchIssues(limit = 30): Promise<void> {
  // NULLS LAST: in PostgreSQL a DESC sort puts NULLs first by default, which
  // would rank switches with an unknown speed ahead of the fastest ones.
  const result = await pool.query<{ model: string }>(
    `SELECT sw.model FROM switches sw
     WHERE NOT EXISTS (
       SELECT 1 FROM product_issues pi WHERE pi.product_model = sw.model
     )
     ORDER BY sw.max_speed_gbps DESC NULLS LAST
     LIMIT $1`,
    [limit]
  );

  const models = result.rows.map((row) => row.model);
  if (models.length === 0) {
    logger.info("All switches already have issue data");
    return;
  }

  logger.info(`Scraping community issues for ${models.length} switches`);
  await scrapeProductIssues(models, 2); // 2 sources per switch to avoid rate limits
}

// ─────────────────────────────────────────────────────────────────────────────
// Seed datasheet / product-page links for switches
// ─────────────────────────────────────────────────────────────────────────────

/** A vendor-specific rule for guessing a documentation URL from a model string. */
interface DatasheetSource {
  /** Vendor label; matched case-insensitively as a substring of the switch's vendor name. */
  vendor: string;
  /** Builds a candidate URL for the model, or returns null when the model does not fit the rule. */
  pattern: (model: string) => string | null;
}

/**
 * Per-vendor URL guessers. The URLs are candidates only — nothing here checks
 * that they exist; callers are expected to probe them before use.
 */
const DATASHEET_SOURCES: DatasheetSource[] = [
  {
    vendor: "Arista",
    pattern: (model) => {
      // Uses the digits following "DCS-" (e.g. "7050" from "DCS-7050X3-…").
      const series = model.match(/DCS-(\d+)/)?.[1];
      if (!series) return null;
      return `https://www.arista.com/en/products/fixedconfiguration/${series}`;
    },
  },
  {
    vendor: "Cisco",
    // Points at a search page for the model rather than a specific document.
    pattern: (model) =>
      `https://www.cisco.com/c/en/us/products/collateral/switches/search.html?q=${encodeURIComponent(model)}`,
  },
  {
    vendor: "Juniper",
    pattern: (model) => {
      // Uses the lower-cased part of the model before the first "-".
      const series = model.split("-")[0]?.toLowerCase();
      if (!series) return null;
      return `https://www.juniper.net/documentation/product/${series}.html`;
    },
  },
  {
    vendor: "HPE Aruba",
    // Strips everything except letters and digits, lower-cases, and appends to "a00".
    pattern: (model) =>
      `https://h20195.www2.hpe.com/v2/getpdf.aspx/a00${model.replace(/[^a-z0-9]/gi, "").toLowerCase()}.pdf`,
  },
];
|
||
|
||
/**
 * For switches that have no `product_documents` row, guesses a vendor
 * documentation URL via `DATASHEET_SOURCES`, probes it with a HEAD request
 * (5 second timeout), and stores the link when the response is OK.
 *
 * URLs containing ".pdf" are stored as doc type "datasheet", everything else as
 * "product_page". Unreachable URLs and failed inserts are logged and skipped;
 * neither aborts the run.
 *
 * @param limit Maximum number of switches to examine in this run.
 */
export async function findAndSeedDatasheetLinks(limit = 50): Promise<void> {
  const result = await pool.query<{ id: string; model: string; vendor_name: string }>(
    `SELECT sw.id, sw.model, v.name AS vendor_name
     FROM switches sw
     JOIN vendors v ON sw.vendor_id = v.id
     WHERE NOT EXISTS (
       SELECT 1 FROM product_documents pd WHERE pd.switch_id = sw.id
     )
     LIMIT $1`,
    [limit]
  );

  for (const sw of result.rows) {
    const vendorLower = sw.vendor_name.toLowerCase();

    for (const source of DATASHEET_SOURCES) {
      if (!vendorLower.includes(source.vendor.toLowerCase())) continue;

      const url = source.pattern(sw.model);
      if (!url) continue;

      // Check if URL is accessible (simple HEAD request)
      let reachable = false;
      const controller = new AbortController();
      const timeout = setTimeout(() => controller.abort(), 5000);
      try {
        const resp = await fetch(url, { method: "HEAD", signal: controller.signal });
        reachable = resp.ok;
      } catch (err) {
        // Network error or timeout — treat as not accessible and move on.
        logger.debug(`Datasheet URL not reachable: ${url}`, { err });
      } finally {
        // Always clear the timer, including when fetch rejects.
        clearTimeout(timeout);
      }
      if (!reachable) continue;

      const docType = url.includes(".pdf") ? "datasheet" : "product_page";
      try {
        await pool.query(
          `INSERT INTO product_documents (switch_id, doc_type, title, source_url, is_official, language)
           VALUES ($1, $2, $3, $4, TRUE, 'en')
           ON CONFLICT DO NOTHING`,
          [sw.id, docType, `${sw.vendor_name} ${sw.model} ${docType.replace("_", " ")}`, url]
        );
        logger.info(`✓ Doc linked: ${sw.model} → ${url}`);
      } catch (err) {
        // Keep going with the remaining switches, but do not hide DB failures.
        logger.warn(`Failed to store document link for ${sw.model}`, { err });
      }
    }
  }
}
|
||
|
||
// CLI entrypoint.
// Usage: <script> [issues|datasheets] [limit]
//   issues      (default) scrape community issues for switches; default limit 30
//   datasheets  seed datasheet/product-page links; default limit 50
// Exits 0 on success and 1 on an unknown command or an unhandled failure.
if (require.main === module) {
  (async () => {
    const cmd = process.argv[2] || "issues";

    // Parse the optional numeric limit; fall back when it is missing,
    // non-numeric, or negative so the value is always safe to pass to LIMIT.
    const limitOr = (fallback: number): number => {
      const parsed = Number.parseInt(process.argv[3] ?? "", 10);
      return Number.isNaN(parsed) || parsed < 0 ? fallback : parsed;
    };

    if (cmd === "issues") {
      await scrapeAllSwitchIssues(limitOr(30));
    } else if (cmd === "datasheets") {
      await findAndSeedDatasheetLinks(limitOr(50));
    } else {
      logger.warn(`Unknown command "${cmd}" — expected "issues" or "datasheets"`);
      process.exit(1);
    }
    process.exit(0);
  })().catch((err: unknown) => {
    // Without this handler a failure would surface as an unhandled rejection.
    logger.warn("Community issues CLI failed", { err });
    process.exit(1);
  });
}
|