Rene Fichtmueller a3af873206 feat(scraper): add NADDOD/QSFPTEK/AddOn to scheduler, fix pre-existing TS build errors
- Register scrape:pricing:naddod (48 */2), qsfptek (52 */2), addon (55 */2) in pg-boss
- Add boss.work() handlers for all three (fetch-based, run on Erik)
- Fix findOrCreateScrapedTransceiver callers: remove invalid `name`/`url` params,
  fix `t.id` → `t` (function already returns string ID)
- Fix ebay-enricher: remove invalid `extractType` option, use extraction.standard_name
  instead of non-existent `.description`, fix cheerio type incompatibility
- Fix community-issues: description → summary, publishedDate → published_at
- Startup zombie cleanup already deployed (index.ts) — no changes needed
- ProLabs rewritten to fetch-based catalog scraper (no Playwright, bypasses WAF)
2026-04-11 03:17:33 +02:00

392 lines
16 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

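/**
 * Community Issues Scraper
 *
 * Scrapes known issues, bugs, and incompatibilities from:
 * - Reddit r/networking, r/homelab, r/sysadmin (r/sysadmin has no entry in COMMUNITY_SOURCES)
 * - ServeTheHome forums
 * - Arista Community / EOS Central
 * - Cisco Community
 * - Juniper Community (no entry in COMMUNITY_SOURCES)
 * - NetworkEngineering StackExchange
 * - GitHub Issues (for SONiC, OpenConfig, etc.) (no entry in COMMUNITY_SOURCES)
 *
 * Uses Crawler LLM to extract structured issue data.
 *
 * This file also seeds official datasheet / product-page links for switches
 * (see findAndSeedDatasheetLinks).
 */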
import { CheerioCrawler, RequestQueue } from "crawlee";
import { extractMarketIntel } from "../crawler-llm/core";
import { db as pool } from "../utils/db";
import { logger } from "../utils/logger";
/**
 * One community-reported issue in the normalised form that `saveIssue`
 * writes to the database.
 */
interface ExtractedIssue {
  /** Product model string the issue is attributed to. */
  productModel: string;
  /** Short headline describing the issue. */
  title: string;
  /** Brief explanation of the issue. */
  summary: string;
  /** Coarse impact classification. */
  severity: "info" | "warning" | "critical";
  /** Topic tags attached to the issue. */
  issueTags: string[];
  /** Firmware version in which the issue occurs, or `null` when unknown. */
  affectedFirmware: string | null;
  /** Firmware version that fixes the issue, or `null` when unknown. */
  fixFirmware: string | null;
  /** Date the issue was reported, or `null` when unknown. */
  dateReported: string | null;
  /** Whether the issue is considered resolved. */
  isResolved: boolean;
  /** Confidence score attached to the extraction. */
  confidence: number;
}
// ─────────────────────────────────────────────────────────────────────────────
// Search URL builders per source
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Community sites that are searched per product model.
 *
 * Each entry carries a display name, a coarse source type, and a function
 * that turns a model string into that site's search URL. The model (plus any
 * extra keywords) is always passed through `encodeURIComponent`.
 */
const COMMUNITY_SOURCES: Array<{
  name: string;
  type: string;
  buildSearchUrl: (model: string) => string;
}> = [
  {
    name: "Reddit r/networking",
    type: "reddit",
    buildSearchUrl(model) {
      // Appends "issue" to the query for this subreddit.
      const query = encodeURIComponent(model + " issue");
      return `https://www.reddit.com/r/networking/search/?q=${query}&sort=relevance&t=all`;
    },
  },
  {
    name: "Reddit r/homelab",
    type: "reddit",
    buildSearchUrl(model) {
      const query = encodeURIComponent(model);
      return `https://www.reddit.com/r/homelab/search/?q=${query}&sort=relevance&t=all`;
    },
  },
  {
    name: "ServeTheHome",
    type: "forum",
    buildSearchUrl(model) {
      const query = encodeURIComponent(model);
      return `https://forums.servethehome.com/index.php?search/1/?q=${query}&t=post&c[users]=&o=date`;
    },
  },
  {
    name: "Arista Community",
    type: "vendor_kb",
    buildSearchUrl(model) {
      const query = encodeURIComponent(model);
      return `https://eos.arista.com/?s=${query}`;
    },
  },
  {
    name: "Cisco Community",
    type: "vendor_kb",
    buildSearchUrl(model) {
      // Appends "transceiver issue" to the query for this forum.
      const query = encodeURIComponent(model + " transceiver issue");
      return `https://community.cisco.com/t5/forums/searchpage/tab/message?q=${query}&collapse_discussion=true`;
    },
  },
  {
    name: "NetworkEngineering SE",
    type: "forum",
    buildSearchUrl(model) {
      const query = encodeURIComponent(model);
      return `https://networkengineering.stackexchange.com/search?q=${query}`;
    },
  },
];
// ─────────────────────────────────────────────────────────────────────────────
// Determine severity from extracted intel
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Classify free text into a coarse severity bucket using keyword heuristics.
 *
 * Matching is case-insensitive. Critical keywords win over warning keywords;
 * text that matches neither list is rated "info".
 *
 * @param text Text to classify (e.g. an issue title and/or summary).
 * @returns "critical", "warning" or "info".
 */
function determineSeverity(text: string): "info" | "warning" | "critical" {
  const lower = text.toLowerCase();

  const criticalMarkers = [
    "security",
    "vulnerability",
    "cve",
    "crash",
    "data loss",
    "critical",
  ];
  if (criticalMarkers.some((marker) => lower.includes(marker))) return "critical";

  const warningMarkers = [
    "not working",
    "incompatib", // stem: covers "incompatible" and "incompatibility"
    "failure",
    "not recognized",
    "port down",
    "workaround",
  ];
  // "bug" must start a word so that "debug"/"debugging" are not treated as a
  // bug report, while "bug", "bugs", "buggy" and "bugfix" still count.
  const mentionsBug = /\bbug/.test(lower);
  if (mentionsBug || warningMarkers.some((marker) => lower.includes(marker))) {
    return "warning";
  }

  return "info";
}
// ─────────────────────────────────────────────────────────────────────────────
// Extract issue tags from text
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Derive topic tags from free text using keyword patterns.
 *
 * Matching is case-insensitive (the text is lower-cased first). Tags are
 * returned in the order of the pattern table, each at most once.
 *
 * Short acronyms are anchored on word boundaries so they do not fire inside
 * unrelated words ("dom" in "random"/"domain", "lag" in "flag", "eos" in
 * "videos", "poe" in "poem").
 *
 * @param text Text to scan (e.g. issue title plus summary).
 * @returns Matching tags; empty when nothing matches.
 */
function extractIssueTags(text: string): string[] {
  const lower = text.toLowerCase();
  const tagMap: ReadonlyArray<readonly [RegExp, string]> = [
    [/firmware|\beos|junos|nxos|iosxe/, "firmware"],
    // "." deliberately matches any separator: "third-party", "third party"
    [/interop|compatibility|third.party/, "interop"],
    [/macsec|encryption|security/, "macsec"],
    [/temperature|thermal|overheating/, "thermal"],
    [/\bdom\b|digital optical|ddm/, "dom"],
    [/breakout|split|qsa|adapter/, "breakout"],
    [/sfp\+?|qsfp|osfp|cfp/, "transceiver"],
    [/vxlan|evpn|bgp|ospf/, "routing"],
    [/\bpoe\b|power/, "poe"],
    // "m?" keeps "mlag" matching now that "lag" is word-anchored
    [/stacking|\bm?lags?\b|lacp/, "stacking"],
    [/memory|buffer|overflow/, "memory"],
    [/driver|module|kernel/, "driver"],
    [/snmp|telemetry|monitoring/, "monitoring"],
    [/latency|performance|throughput/, "performance"],
  ];

  const matched = tagMap
    .filter(([pattern]) => pattern.test(lower))
    .map(([, tag]) => tag);

  // Guard against duplicates should two table rows ever share a tag.
  return [...new Set(matched)];
}
// ─────────────────────────────────────────────────────────────────────────────
// Save extracted issues to DB
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Persist one extracted issue into `product_issues`, linking it to a switch
 * and/or transceiver row when a matching product is found.
 *
 * Product lookup:
 * - switches: case-insensitive match on the full model, or a "contains"
 *   match on the part of the model before the first hyphen. Exact model
 *   matches are ordered first so `LIMIT 1` never prefers a fuzzy hit over an
 *   exact one.
 * - transceivers: case-insensitive match on part number, or on a slug built
 *   by lower-casing the model and replacing whitespace runs with "-".
 *
 * Issues for unknown products are still stored, with both ids set to null.
 *
 * @param issue      Issue to store.
 * @param sourceUrl  URL the issue was extracted from.
 * @param sourceName Human-readable name of the source.
 * @param sourceType Coarse source category.
 * @throws Whatever `pool.query` rejects with (errors are not caught here).
 */
async function saveIssue(
  issue: ExtractedIssue,
  sourceUrl: string,
  sourceName: string,
  sourceType: string
): Promise<void> {
  // An empty prefix (model is empty or starts with "-") would turn the fuzzy
  // pattern into '%%', which matches every switch. Passing NULL makes that
  // arm of the OR evaluate to NULL instead, so only the exact match applies.
  const modelPrefix = issue.productModel.split("-")[0]?.trim() || null;
  const slugCandidate = issue.productModel.toLowerCase().replace(/\s+/g, "-");

  // The two lookups are independent, so run them concurrently.
  const [switchResult, transceiverResult] = await Promise.all([
    pool.query(
      `SELECT id FROM switches
       WHERE model ILIKE $1 OR model ILIKE '%' || $2 || '%'
       ORDER BY (model ILIKE $1) DESC
       LIMIT 1`,
      [issue.productModel, modelPrefix]
    ),
    pool.query(
      `SELECT id FROM transceivers WHERE part_number ILIKE $1 OR slug ILIKE $2 LIMIT 1`,
      [issue.productModel, slugCandidate]
    ),
  ]);

  const switchId = switchResult.rows[0]?.id ?? null;
  const transceiverId = transceiverResult.rows[0]?.id ?? null;

  if (!switchId && !transceiverId) {
    // Unknown product — still store with model name for future lookup
    logger.debug(`Issue for unknown product: ${issue.productModel}`);
  }

  // NOTE(review): ON CONFLICT DO NOTHING only deduplicates if product_issues
  // has a unique constraint covering these rows — confirm against the schema.
  await pool.query(
    `INSERT INTO product_issues (
switch_id, transceiver_id, product_model,
source_type, source_name, source_url,
title, summary, severity, issue_tags,
affected_firmware, fix_firmware,
date_reported, is_resolved, confidence
) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15)
ON CONFLICT DO NOTHING`,
    [
      switchId, transceiverId, issue.productModel,
      sourceType, sourceName, sourceUrl,
      issue.title, issue.summary, issue.severity, issue.issueTags,
      issue.affectedFirmware, issue.fixFirmware,
      issue.dateReported, issue.isResolved, issue.confidence,
    ]
  );
}
// ─────────────────────────────────────────────────────────────────────────────
// Main: scrape community issues for given switch/transceiver models
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Search community sources for issues affecting the given product models and
 * store whatever the extractor finds.
 *
 * One search request is enqueued per model × source pair (using the first
 * `sourceLimit` entries of `COMMUNITY_SOURCES`). Each fetched page is stripped
 * of navigation/script/ad markup, reduced to at most 8000 characters of text,
 * and passed to `extractMarketIntel`; a result with a title is converted into
 * an `ExtractedIssue` and saved via `saveIssue`.
 *
 * Extraction or save failures for a single page are logged and do not abort
 * the crawl.
 *
 * @param models      Product model strings to search for.
 * @param sourceLimit How many sources (from the start of `COMMUNITY_SOURCES`)
 *                    to query per model. Defaults to 3.
 */
export async function scrapeProductIssues(
  models: string[],
  sourceLimit = 3
): Promise<void> {
  // NOTE(review): this opens a *named* request queue. Crawlee keeps named
  // storages between runs, so a uniqueKey that was already handled in an
  // earlier run may be skipped here — confirm that this dedup is intended.
  const queue = await RequestQueue.open("community-issues");

  // The source selection does not depend on the model, so compute it once.
  const sources = COMMUNITY_SOURCES.slice(0, sourceLimit);

  // Add search requests for each model × source combination
  for (const model of models) {
    for (const source of sources) {
      await queue.addRequest({
        url: source.buildSearchUrl(model),
        userData: { model, sourceName: source.name, sourceType: source.type },
        uniqueKey: `${source.name}-${model}`,
      });
    }
  }

  const crawler = new CheerioCrawler({
    requestQueue: queue,
    maxConcurrency: 2,
    requestHandlerTimeoutSecs: 30,
    navigationTimeoutSecs: 20,
    async requestHandler({ request, $ }) {
      const { model, sourceName, sourceType } = request.userData as {
        model: string; sourceName: string; sourceType: string;
      };

      // Remove nav, scripts, ads for cleaner extractor input
      $("nav, script, style, .ad, #sidebar, footer, header").remove();
      const pageText = $("body").text().replace(/\s+/g, " ").substring(0, 8000);
      // Too little text to contain a meaningful report — skip the page.
      if (pageText.length < 100) return;

      // NOTE(review): a hand-written issue-extraction prompt was previously
      // built here but never passed to extractMarketIntel; it was dead code
      // and has been dropped. extractMarketIntel is called with the page
      // text, URL and source name only.
      try {
        const intelResult = await extractMarketIntel(pageText, request.url, sourceName);
        if (!intelResult || !intelResult.title) return;

        // Normalise a missing summary so it cannot leak the text "undefined"
        // into severity/tag detection.
        const summary = intelResult.summary ?? "";
        const titleAndSummary = `${intelResult.title} ${summary}`;

        const issue: ExtractedIssue = {
          productModel: model,
          title: intelResult.title.substring(0, 200),
          summary: summary.substring(0, 500),
          // Use title and summary together, as tag extraction does, so a
          // keyword that appears only in the title is not missed.
          severity: determineSeverity(titleAndSummary),
          issueTags: extractIssueTags(titleAndSummary),
          affectedFirmware: null,
          fixFirmware: null,
          dateReported: intelResult.published_at || null,
          isResolved: false,
          // `??` keeps an explicit confidence of 0 instead of replacing it.
          confidence: intelResult.confidence ?? 0.6,
        };

        await saveIssue(issue, request.url, sourceName, sourceType);
        logger.info(`Issue saved: ${model} — ${issue.title.substring(0, 60)}`);
      } catch (err) {
        logger.warn(`Issue extraction failed for ${model} from ${sourceName}`, { err });
      }
    },
    // Crawlee passes the error as the second argument, not on the context.
    failedRequestHandler: ({ request }, error) => {
      logger.warn(`Community scraper failed: ${request.url}`, { error });
    },
  });

  await crawler.run();
  logger.info(`Community issues scraping complete for ${models.length} models`);
}
// ─────────────────────────────────────────────────────────────────────────────
// Scrape issues for all switches in DB
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Scrape community issues for switches that have no row in `product_issues`
 * yet.
 *
 * Candidates are read from the `switches` table, ordered by
 * `max_speed_gbps` descending and capped at `limit`. When no candidate is
 * left the function logs and returns without scraping.
 *
 * @param limit Maximum number of switch models to process. Defaults to 30.
 */
export async function scrapeAllSwitchIssues(limit = 30): Promise<void> {
  const { rows } = await pool.query<{ model: string }>(
    `SELECT sw.model FROM switches sw
WHERE NOT EXISTS (
SELECT 1 FROM product_issues pi WHERE pi.product_model = sw.model
)
ORDER BY sw.max_speed_gbps DESC
LIMIT $1`,
    [limit]
  );

  if (rows.length === 0) {
    logger.info("All switches already have issue data");
    return;
  }

  const pendingModels = rows.map(({ model }) => model);
  logger.info(`Scraping community issues for ${pendingModels.length} switches`);

  // 2 sources per switch to avoid rate limits
  await scrapeProductIssues(pendingModels, 2);
}
// ─────────────────────────────────────────────────────────────────────────────
// Seed datasheet / product-page links for switches (transceivers are not handled here)
// ─────────────────────────────────────────────────────────────────────────────
/** A vendor together with a rule that derives a candidate document URL. */
interface DatasheetSource {
  /** Vendor name this rule applies to. */
  vendor: string;
  /** Builds a candidate URL for a model, or returns null when it cannot. */
  pattern: (model: string) => string | null;
}

/**
 * Per-vendor URL builders for candidate datasheet / product pages.
 * The URLs are guesses derived from the model string; nothing here checks
 * that they exist.
 */
const DATASHEET_SOURCES: DatasheetSource[] = [
  {
    vendor: "Arista",
    pattern(model) {
      // Needs a "DCS-<digits>" fragment; the digits select the series page.
      const seriesMatch = /DCS-(\d+)/.exec(model);
      const series = seriesMatch?.[1];
      if (!series) {
        return null;
      }
      return `https://www.arista.com/en/products/fixedconfiguration/${series.toLowerCase()}`;
    },
  },
  {
    vendor: "Cisco",
    pattern(model) {
      // Always yields a search URL with the model as the encoded query.
      const query = encodeURIComponent(model);
      return `https://www.cisco.com/c/en/us/products/collateral/switches/search.html?q=${query}`;
    },
  },
  {
    vendor: "Juniper",
    pattern(model) {
      // Series is the lower-cased text before the first hyphen.
      const [head] = model.split("-");
      const series = head?.toLowerCase();
      return series
        ? `https://www.juniper.net/documentation/product/${series}.html`
        : null;
    },
  },
  {
    vendor: "HPE Aruba",
    pattern(model) {
      // Keep only letters and digits, lower-cased, as the document id suffix.
      const compact = model.replace(/[^a-z0-9]/gi, "").toLowerCase();
      return `https://h20195.www2.hpe.com/v2/getpdf.aspx/a00${compact}.pdf`;
    },
  },
];
/**
 * For switches that have no row in `product_documents`, derive candidate
 * document URLs from `DATASHEET_SOURCES`, probe each with a HEAD request, and
 * store the ones that answer with a successful status.
 *
 * A source applies to a switch when the switch's vendor name contains the
 * source's vendor (case-insensitive). URLs containing ".pdf" are stored as
 * "datasheet", everything else as "product_page".
 *
 * The whole process is best-effort per URL: unreachable URLs are skipped, and
 * a failed insert is logged and does not stop the remaining work.
 *
 * @param limit Maximum number of switches to process. Defaults to 50.
 */
export async function findAndSeedDatasheetLinks(limit = 50): Promise<void> {
  const result = await pool.query<{ id: string; model: string; vendor_name: string }>(
    `SELECT sw.id, sw.model, v.name AS vendor_name
FROM switches sw
JOIN vendors v ON sw.vendor_id = v.id
WHERE NOT EXISTS (
SELECT 1 FROM product_documents pd WHERE pd.switch_id = sw.id
)
LIMIT $1`,
    [limit]
  );

  for (const sw of result.rows) {
    const vendorLower = sw.vendor_name.toLowerCase();

    for (const source of DATASHEET_SOURCES) {
      if (!vendorLower.includes(source.vendor.toLowerCase())) continue;

      const url = source.pattern(sw.model);
      if (!url) continue;

      // Probe the URL with a HEAD request, giving up after 5 seconds.
      let reachable = false;
      const controller = new AbortController();
      const timeout = setTimeout(() => controller.abort(), 5000);
      try {
        const resp = await fetch(url, { method: "HEAD", signal: controller.signal });
        reachable = resp.ok;
      } catch {
        // URL not accessible (network error or timeout) — skip silently
      } finally {
        // Always release the timer, including when fetch throws.
        clearTimeout(timeout);
      }
      if (!reachable) continue;

      const docType = url.includes(".pdf") ? "datasheet" : "product_page";
      try {
        await pool.query(
          `INSERT INTO product_documents (switch_id, doc_type, title, source_url, is_official, language)
VALUES ($1, $2, $3, $4, TRUE, 'en')
ON CONFLICT DO NOTHING`,
          [sw.id, docType, `${sw.vendor_name} ${sw.model} ${docType.replace("_", " ")}`, url]
        );
        logger.info(`✓ Doc linked: ${sw.model} — ${url}`);
      } catch (err) {
        // A database failure is not an "unreachable URL": surface it, but
        // keep processing the remaining switches and sources.
        logger.warn(`Failed to store document link for ${sw.model}`, { err });
      }
    }
  }
}
// CLI entrypoint.
// Usage: <script> [issues|datasheets] [limit]
//   issues     (default) scrape community issues; limit defaults to 30
//   datasheets seed datasheet links; limit defaults to 50
// Exits 0 on success, 1 on an unknown command or an unhandled failure.
if (require.main === module) {
  (async () => {
    const cmd = process.argv[2] || "issues";

    // Parse the optional limit argument; anything that is not a
    // non-negative integer falls back to the command's default, so NaN is
    // never passed on to a SQL LIMIT.
    const parseLimit = (raw: string | undefined, fallback: number): number => {
      const parsed = Number.parseInt(raw ?? "", 10);
      return Number.isNaN(parsed) || parsed < 0 ? fallback : parsed;
    };

    if (cmd === "issues") {
      await scrapeAllSwitchIssues(parseLimit(process.argv[3], 30));
    } else if (cmd === "datasheets") {
      await findAndSeedDatasheetLinks(parseLimit(process.argv[3], 50));
    } else {
      console.error(`Unknown command "${cmd}" (expected "issues" or "datasheets")`);
      process.exit(1);
    }
    process.exit(0);
  })().catch((err: unknown) => {
    // Without this handler a rejection would surface only as an unhandled
    // promise rejection with no context.
    console.error("Community issues CLI failed:", err);
    process.exit(1);
  });
}