feat: add index-pi.ts with all 44 workers for Pi fleet scraper nodes
Complete Pi scraper entry point covering all pricing, catalog, compat, intelligence and prediction signal scrapers. Includes 5 new form-factor coverage scrapers (comms-express, router-switch, multimode-inc, optictransceiver, wiitek). Erik runs only API+DB, all scraping on Pis.
This commit is contained in:
parent
f146ac873e
commit
6ccaa03932
235
packages/scraper/src/index-pi.ts
Normal file
235
packages/scraper/src/index-pi.ts
Normal file
@ -0,0 +1,235 @@
|
|||||||
|
/**
|
||||||
|
* TIP Pi Scraper Node — index-pi.ts
|
||||||
|
*
|
||||||
|
* Runs as a pg-boss worker on Raspberry Pi fleet (Pi1/Pi2/Pi3).
|
||||||
|
* Handles pricing, catalog, compatibility and intelligence scraping.
|
||||||
|
* Connects to production DB via WireGuard VPN.
|
||||||
|
*
|
||||||
|
* Erik (IONOS VPS) runs ONLY: tip-api, tip-mcp, tip-postgres (Docker), tip-qdrant (Docker)
|
||||||
|
* All scraping work is distributed to the Pi fleet.
|
||||||
|
*/
|
||||||
|
import { config } from "dotenv";
|
||||||
|
import { join } from "path";
|
||||||
|
config({ path: join(__dirname, "..", "..", "..", ".env") });
|
||||||
|
|
||||||
|
import PgBoss from "pg-boss";
|
||||||
|
import { mkdirSync, rmSync } from "fs";
|
||||||
|
|
||||||
|
const connectionString = `postgres://${process.env.POSTGRES_USER}:${process.env.POSTGRES_PASSWORD}@${process.env.POSTGRES_HOST}:${process.env.POSTGRES_PORT || "5432"}/${process.env.POSTGRES_DB}`;
|
||||||
|
const PI_NAME = process.env.PI_NAME || "pi";
|
||||||
|
|
||||||
|
async function withIsolatedStorage(name: string, fn: () => Promise<void>): Promise<void> {
|
||||||
|
const dir = `/tmp/tip-crawlee-${name}-${Date.now()}`;
|
||||||
|
mkdirSync(join(dir, "request_queues", "default"), { recursive: true });
|
||||||
|
mkdirSync(join(dir, "datasets", "default"), { recursive: true });
|
||||||
|
mkdirSync(join(dir, "key_value_stores", "default"), { recursive: true });
|
||||||
|
const prev = process.env.CRAWLEE_STORAGE_DIR;
|
||||||
|
process.env.CRAWLEE_STORAGE_DIR = dir;
|
||||||
|
try { await fn(); }
|
||||||
|
finally {
|
||||||
|
process.env.CRAWLEE_STORAGE_DIR = prev ?? "";
|
||||||
|
try { rmSync(dir, { recursive: true, force: true }); } catch {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const QUEUES = [
|
||||||
|
// Pricing scrapers
|
||||||
|
"scrape:pricing:fs",
|
||||||
|
"scrape:pricing:10gtek",
|
||||||
|
"scrape:pricing:prolabs",
|
||||||
|
"scrape:pricing:atgbics",
|
||||||
|
"scrape:pricing:optcore",
|
||||||
|
"scrape:pricing:fluxlight",
|
||||||
|
"scrape:pricing:gbics",
|
||||||
|
"scrape:pricing:champion-one",
|
||||||
|
"scrape:pricing:sfpcables",
|
||||||
|
"scrape:pricing:blueoptics",
|
||||||
|
"scrape:pricing:fiber24",
|
||||||
|
"scrape:pricing:tscom",
|
||||||
|
"scrape:pricing:skylane",
|
||||||
|
"scrape:pricing:ascentoptics",
|
||||||
|
"scrape:pricing:gaotek",
|
||||||
|
// Form-factor coverage scrapers
|
||||||
|
"scrape:pricing:comms-express",
|
||||||
|
"scrape:pricing:router-switch",
|
||||||
|
"scrape:pricing:multimode-inc",
|
||||||
|
"scrape:pricing:optictransceiver",
|
||||||
|
"scrape:pricing:wiitek",
|
||||||
|
// Catalog scrapers
|
||||||
|
"scrape:pricing:flexoptix",
|
||||||
|
"scrape:catalog:smartoptics",
|
||||||
|
"scrape:catalog:hubersuhner",
|
||||||
|
// Vendor scrapers
|
||||||
|
"scrape:vendors:flexoptix",
|
||||||
|
"scrape:vendors:flexoptix-supported",
|
||||||
|
// Compatibility scrapers
|
||||||
|
"scrape:compat:cisco",
|
||||||
|
"scrape:compat:juniper",
|
||||||
|
"scrape:compat:sonic",
|
||||||
|
"scrape:compat:ufispace",
|
||||||
|
"scrape:compat:edgecore",
|
||||||
|
// Intelligence
|
||||||
|
"scrape:news",
|
||||||
|
"scrape:market-intel",
|
||||||
|
"scrape:community-issues",
|
||||||
|
"scrape:datasheet-links",
|
||||||
|
// Switch assets
|
||||||
|
"scrape:assets:switches",
|
||||||
|
// eBay enrichment
|
||||||
|
"enrich:ebay-transceivers",
|
||||||
|
"enrich:ebay-switches",
|
||||||
|
// Prediction signals
|
||||||
|
"scrape:signals:sec-edgar",
|
||||||
|
"scrape:signals:github",
|
||||||
|
"scrape:signals:ebay-velocity",
|
||||||
|
"scrape:signals:ai-clusters",
|
||||||
|
"scrape:signals:distributor-leads",
|
||||||
|
"scrape:signals:standards",
|
||||||
|
// Compute
|
||||||
|
"compute:abc",
|
||||||
|
"compute:reorder-signals",
|
||||||
|
"compute:forecast",
|
||||||
|
];
|
||||||
|
|
||||||
|
async function main() {
|
||||||
|
console.log(`\n=== TIP Pi Scraper Node [${PI_NAME}] ===`);
|
||||||
|
console.log(`DB: ${process.env.POSTGRES_HOST}:${process.env.POSTGRES_PORT || 5432}`);
|
||||||
|
console.log(`Queues: ${QUEUES.length} workers\n`);
|
||||||
|
|
||||||
|
const boss = new PgBoss({
|
||||||
|
connectionString,
|
||||||
|
retryLimit: 2,
|
||||||
|
retryDelay: 60,
|
||||||
|
expireInSeconds: 3600,
|
||||||
|
monitorStateIntervalSeconds: 60,
|
||||||
|
});
|
||||||
|
|
||||||
|
boss.on("error", (e: Error) => console.error("pg-boss error:", e.message));
|
||||||
|
await boss.start();
|
||||||
|
|
||||||
|
for (const q of QUEUES) {
|
||||||
|
await boss.createQueue(q).catch(() => {});
|
||||||
|
}
|
||||||
|
|
||||||
|
const log = (q: string) => console.log(`[${new Date().toISOString()}] [${PI_NAME}] ${q}`);
|
||||||
|
|
||||||
|
// ── Pricing scrapers ──────────────────────────────────────────────────
|
||||||
|
const { scrapeFs } = await import("./scrapers/fs");
|
||||||
|
const { scrape10GTek } = await import("./scrapers/10gtek");
|
||||||
|
const { scrapeProLabs } = await import("./scrapers/prolabs");
|
||||||
|
const { scrapeAtgbics } = await import("./scrapers/atgbics");
|
||||||
|
const { scrapeOptcore } = await import("./scrapers/optcore");
|
||||||
|
const { scrapeFluxlight } = await import("./scrapers/fluxlight");
|
||||||
|
const { scrapeGbics } = await import("./scrapers/gbics");
|
||||||
|
const { scrapeChampionOne } = await import("./scrapers/champion-one");
|
||||||
|
const { scrapeSfpCables } = await import("./scrapers/sfpcables");
|
||||||
|
const { scrapeBlueOptics } = await import("./scrapers/blueoptics");
|
||||||
|
const { scrapeFiber24 } = await import("./scrapers/fiber24");
|
||||||
|
const { scrapeTsCom } = await import("./scrapers/tscom");
|
||||||
|
const { scrapeSkylane } = await import("./scrapers/skylane");
|
||||||
|
const { scrapeAscentOptics } = await import("./scrapers/ascentoptics");
|
||||||
|
const { scrapeGaoTek } = await import("./scrapers/gaotek");
|
||||||
|
|
||||||
|
// ── Form-factor coverage scrapers ─────────────────────────────────────
|
||||||
|
const { scrapeCommsExpress } = await import("./scrapers/comms-express");
|
||||||
|
const { scrapeRouterSwitch } = await import("./scrapers/router-switch");
|
||||||
|
const { scrapeMultimodeInc } = await import("./scrapers/multimode-inc");
|
||||||
|
const { scrapeOpticTransceiver } = await import("./scrapers/optictransceiver");
|
||||||
|
const { scrapeWiitek } = await import("./scrapers/wiitek");
|
||||||
|
|
||||||
|
// ── Catalog scrapers ──────────────────────────────────────────────────
|
||||||
|
const { scrapeFlexoptixCatalog } = await import("./scrapers/flexoptix-catalog");
|
||||||
|
const { scrapeSmartOptics } = await import("./scrapers/smartoptics");
|
||||||
|
const { scrapeHuberSuhner } = await import("./scrapers/hubersuhner");
|
||||||
|
|
||||||
|
// ── Vendor scrapers ───────────────────────────────────────────────────
|
||||||
|
const { scrapeFlexoptixVendors } = await import("./scrapers/flexoptix-vendors");
|
||||||
|
const { seedFlexoptixVendors } = await import("./scrapers/flexoptix-supported-vendors");
|
||||||
|
|
||||||
|
// ── Compatibility scrapers ────────────────────────────────────────────
|
||||||
|
const { scrapeCiscoTmg } = await import("./scrapers/cisco-tmg");
|
||||||
|
const { scrapeJuniperHct } = await import("./scrapers/juniper-hct");
|
||||||
|
const { scrapeSonicHcl } = await import("./scrapers/sonic-hcl");
|
||||||
|
const { scrapeUfiSpace } = await import("./scrapers/ufispace");
|
||||||
|
const { scrapeEdgecore } = await import("./scrapers/edgecore");
|
||||||
|
|
||||||
|
// ── Intelligence scrapers ─────────────────────────────────────────────
|
||||||
|
const { scrapeNews } = await import("./scrapers/news");
|
||||||
|
const { scrapeMarketIntelligence, computeAbcClassification, computeReorderSignals } = await import("./scrapers/market-intelligence");
|
||||||
|
const { scrapeAllSwitchIssues, findAndSeedDatasheetLinks } = await import("./scrapers/community-issues");
|
||||||
|
const { scrapeSwitchAssets } = await import("./scrapers/switch-assets");
|
||||||
|
|
||||||
|
// ── eBay enrichment ───────────────────────────────────────────────────
|
||||||
|
const { enrichTransceiversFromEbay, enrichSwitchesFromEbay } = await import("./scrapers/ebay-enricher");
|
||||||
|
|
||||||
|
// ── Prediction signals ────────────────────────────────────────────────
|
||||||
|
const { scrapeSecEdgar } = await import("./scrapers/sec-edgar");
|
||||||
|
const { scrapeGithubSignals } = await import("./scrapers/github-signals");
|
||||||
|
const { scrapeEbayVelocity } = await import("./scrapers/ebay-velocity");
|
||||||
|
const { scrapeAiClusters } = await import("./scrapers/ai-clusters");
|
||||||
|
const { scrapeDistributorLeads } = await import("./scrapers/distributor-leads");
|
||||||
|
const { scrapeStandardsTracker } = await import("./scrapers/standards-tracker");
|
||||||
|
const { runForecastEngine } = await import("./scrapers/forecast-engine");
|
||||||
|
|
||||||
|
// ── Register workers ──────────────────────────────────────────────────
|
||||||
|
await boss.work("scrape:pricing:fs", async () => { log("fs"); await withIsolatedStorage("fs", scrapeFs); });
|
||||||
|
await boss.work("scrape:pricing:10gtek", async () => { log("10gtek"); await withIsolatedStorage("10gtek", scrape10GTek); });
|
||||||
|
await boss.work("scrape:pricing:prolabs", async () => { log("prolabs"); await withIsolatedStorage("prolabs", scrapeProLabs); });
|
||||||
|
await boss.work("scrape:pricing:atgbics", async () => { log("atgbics"); await withIsolatedStorage("atgbics", scrapeAtgbics); });
|
||||||
|
await boss.work("scrape:pricing:optcore", async () => { log("optcore"); await withIsolatedStorage("optcore", scrapeOptcore); });
|
||||||
|
await boss.work("scrape:pricing:fluxlight", async () => { log("fluxlight"); await withIsolatedStorage("fluxlight", scrapeFluxlight); });
|
||||||
|
await boss.work("scrape:pricing:gbics", async () => { log("gbics"); await withIsolatedStorage("gbics", scrapeGbics); });
|
||||||
|
await boss.work("scrape:pricing:champion-one", async () => { log("champion-one"); await withIsolatedStorage("champion-one", scrapeChampionOne); });
|
||||||
|
await boss.work("scrape:pricing:sfpcables", async () => { log("sfpcables"); await withIsolatedStorage("sfpcables", scrapeSfpCables); });
|
||||||
|
await boss.work("scrape:pricing:blueoptics", async () => { log("blueoptics"); await withIsolatedStorage("blueoptics", scrapeBlueOptics); });
|
||||||
|
await boss.work("scrape:pricing:fiber24", async () => { log("fiber24"); await withIsolatedStorage("fiber24", scrapeFiber24); });
|
||||||
|
await boss.work("scrape:pricing:tscom", async () => { log("tscom"); await withIsolatedStorage("tscom", scrapeTsCom); });
|
||||||
|
await boss.work("scrape:pricing:skylane", async () => { log("skylane"); await withIsolatedStorage("skylane", scrapeSkylane); });
|
||||||
|
await boss.work("scrape:pricing:ascentoptics", async () => { log("ascentoptics"); await withIsolatedStorage("ascentoptics", scrapeAscentOptics); });
|
||||||
|
await boss.work("scrape:pricing:gaotek", async () => { log("gaotek"); await withIsolatedStorage("gaotek", scrapeGaoTek); });
|
||||||
|
|
||||||
|
await boss.work("scrape:pricing:comms-express", async () => { log("comms-express"); await scrapeCommsExpress(); });
|
||||||
|
await boss.work("scrape:pricing:router-switch", async () => { log("router-switch"); await scrapeRouterSwitch(); });
|
||||||
|
await boss.work("scrape:pricing:multimode-inc", async () => { log("multimode-inc"); await scrapeMultimodeInc(); });
|
||||||
|
await boss.work("scrape:pricing:optictransceiver", async () => { log("optictransceiver"); await scrapeOpticTransceiver(); });
|
||||||
|
await boss.work("scrape:pricing:wiitek", async () => { log("wiitek"); await scrapeWiitek(); });
|
||||||
|
|
||||||
|
await boss.work("scrape:pricing:flexoptix", async () => { log("flexoptix-catalog"); await scrapeFlexoptixCatalog(); });
|
||||||
|
await boss.work("scrape:catalog:smartoptics", async () => { log("smartoptics"); await withIsolatedStorage("smartoptics", scrapeSmartOptics); });
|
||||||
|
await boss.work("scrape:catalog:hubersuhner", async () => { log("hubersuhner"); await withIsolatedStorage("hubersuhner", scrapeHuberSuhner); });
|
||||||
|
|
||||||
|
await boss.work("scrape:vendors:flexoptix", async () => { log("flexoptix-vendors"); await scrapeFlexoptixVendors(); });
|
||||||
|
await boss.work("scrape:vendors:flexoptix-supported", async () => { log("flexoptix-supported"); await seedFlexoptixVendors(); });
|
||||||
|
|
||||||
|
await boss.work("scrape:compat:cisco", async () => { log("cisco-compat"); await withIsolatedStorage("cisco", scrapeCiscoTmg); });
|
||||||
|
await boss.work("scrape:compat:juniper", async () => { log("juniper-compat"); await withIsolatedStorage("juniper", scrapeJuniperHct); });
|
||||||
|
await boss.work("scrape:compat:sonic", async () => { log("sonic-compat"); await withIsolatedStorage("sonic", scrapeSonicHcl); });
|
||||||
|
await boss.work("scrape:compat:ufispace", async () => { log("ufispace"); await withIsolatedStorage("ufispace", scrapeUfiSpace); });
|
||||||
|
await boss.work("scrape:compat:edgecore", async () => { log("edgecore"); await withIsolatedStorage("edgecore", scrapeEdgecore); });
|
||||||
|
|
||||||
|
await boss.work("scrape:news", async () => { log("news"); await scrapeNews(); });
|
||||||
|
await boss.work("scrape:market-intel", async () => { log("market-intel"); await withIsolatedStorage("market-intel", scrapeMarketIntelligence); });
|
||||||
|
await boss.work("scrape:community-issues", async () => { log("community"); await withIsolatedStorage("community", () => scrapeAllSwitchIssues(30)); });
|
||||||
|
await boss.work("scrape:datasheet-links", async () => { log("datasheets"); await findAndSeedDatasheetLinks(50); });
|
||||||
|
await boss.work("scrape:assets:switches", async () => { log("switch-assets"); await withIsolatedStorage("switch-assets", () => scrapeSwitchAssets()); });
|
||||||
|
|
||||||
|
await boss.work("enrich:ebay-transceivers", async () => { log("ebay-transceivers"); await withIsolatedStorage("ebay-transceivers", () => enrichTransceiversFromEbay(100)); });
|
||||||
|
await boss.work("enrich:ebay-switches", async () => { log("ebay-switches"); await withIsolatedStorage("ebay-switches", () => enrichSwitchesFromEbay(30)); });
|
||||||
|
|
||||||
|
await boss.work("scrape:signals:sec-edgar", async () => { log("sec-edgar"); await scrapeSecEdgar(); });
|
||||||
|
await boss.work("scrape:signals:github", async () => { log("github-signals"); await scrapeGithubSignals(); });
|
||||||
|
await boss.work("scrape:signals:ebay-velocity", async () => { log("ebay-velocity"); await scrapeEbayVelocity(); });
|
||||||
|
await boss.work("scrape:signals:ai-clusters", async () => { log("ai-clusters"); await scrapeAiClusters(); });
|
||||||
|
await boss.work("scrape:signals:distributor-leads", async () => { log("distributor-leads"); await scrapeDistributorLeads(); });
|
||||||
|
await boss.work("scrape:signals:standards", async () => { log("standards"); await scrapeStandardsTracker(); });
|
||||||
|
|
||||||
|
await boss.work("compute:abc", async () => { log("abc"); await computeAbcClassification(); });
|
||||||
|
await boss.work("compute:reorder-signals", async () => { log("reorder"); await computeReorderSignals(); });
|
||||||
|
await boss.work("compute:forecast", async () => { log("forecast"); await runForecastEngine(); });
|
||||||
|
|
||||||
|
console.log(`${QUEUES.length} workers active — running 24/7\n`);
|
||||||
|
process.on("SIGTERM", async () => { await boss.stop(); process.exit(0); });
|
||||||
|
process.on("SIGINT", async () => { await boss.stop(); process.exit(0); });
|
||||||
|
}
|
||||||
|
|
||||||
|
main().catch((e) => { console.error("Fatal:", e); process.exit(1); });
|
||||||
Loading…
x
Reference in New Issue
Block a user