transceiver-db/scripts/pi-scraper-setup.sh
Rene Fichtmueller 072978f1a4 feat: 24/7 scraping fleet — 8 new vendors + continuous schedule + Pi setup
New scrapers (8):
- BlueOptics (EUR, every 4h)
- ShopFiber24 (EUR, every 4h)
- T&S Communication (USD, every 4h)
- SmartOptics (catalog, every 8h)
- HUBER+SUHNER (catalog, every 8h)
- Skylane Optics (USD, every 4h)
- AscentOptics (USD, every 4h)
- GAO Tek (USD, every 4h)

Scheduler: nightly window → 24/7 continuous (42 jobs total)
- Playwright scrapers: every 8h (FS.com, 10Gtek, ATGBICS, ProLabs)
- Fetch/Cheerio: every 4h (11 lightweight vendors)
- Flexoptix catalog: every 2h (primary price source)
- eBay enrichment: every 6h
- Compatibility matrices: every 12h
- Compute jobs: every 4h

Pi fleet: scripts/pi-scraper-setup.sh for one-command Pi node setup
2026-04-02 01:09:05 +02:00

205 lines
8.5 KiB
Bash

#!/bin/bash
# ─────────────────────────────────────────────────────────────────────────────
# TIP Pi Scraper Setup — run this ONCE on each Raspberry Pi
#
# Usage (from the Pi itself or via SSH once you have access):
# curl -sL https://gitea.context-x.org/rene/transceiver-db/raw/branch/main/scripts/pi-scraper-setup.sh | bash
#
# Or copy & run manually:
# bash pi-scraper-setup.sh
#
# What this does:
# 1. Installs Node.js 22 (only when Node is missing or older than v20), plus tsx and pm2
# 2. Clones the TIP scraper package
# 3. Installs dependencies (no Playwright — Pi runs fetch-only scrapers)
# 4. Creates .env pointing to Erik's PostgreSQL via WireGuard
# 5. Starts pm2 with the Pi-specific scheduler (lightweight scrapers only)
# ─────────────────────────────────────────────────────────────────────────────
# Abort on the first failing command. pipefail additionally makes a pipeline
# fail when any stage fails, so a broken download on the left-hand side of
# `curl … | sudo bash -` is no longer silently ignored.
set -eo pipefail

# ── Settings ─────────────────────────────────────────────────────────────────
# Every value can be overridden from the environment, e.g.
#   PI_NAME=pi2 DB_HOST=10.10.0.1 bash pi-scraper-setup.sh
PI_NAME="${PI_NAME:-pi-scraper}"   # node name shown in the setup banner and scraper logs
DB_HOST="${DB_HOST:-10.10.0.1}"    # Erik WireGuard IP
DB_PORT="${DB_PORT:-5433}"
DB_USER="${DB_USER:-tip}"
DB_PASS="${DB_PASS:-***REDACTED***}"
DB_NAME="${DB_NAME:-transceiver_db}"
# Repository to clone and the directory it is installed into.
GITEA="${GITEA:-http://192.168.178.196:3000/rene/transceiver-db.git}"
INSTALL_DIR="${INSTALL_DIR:-/opt/tip-scraper}"

echo "=== TIP Pi Scraper Setup: $PI_NAME ==="
# ── 1. Node.js 22 ────────────────────────────────────────────────────────────
# Decide on the numeric major version. A string comparison such as
# [[ "v8.17.0" < "v20" ]] is lexicographic ("8" sorts after "2"), so it would
# wrongly accept Node v4–v9 as new enough.
node_major=0
if command -v node &>/dev/null; then
  node_version="$(node --version 2>/dev/null || true)"   # e.g. "v22.3.0"
  node_version="${node_version#v}"                       # strip the leading "v"
  node_major="${node_version%%.*}"                       # keep the text before the first dot
  # Anything that is not a plain number is treated as "no usable Node".
  [[ "$node_major" =~ ^[0-9]+$ ]] || node_major=0
fi

if [ "$node_major" -lt 20 ]; then
  echo "Installing Node.js 22..."
  curl -fsSL https://deb.nodesource.com/setup_22.x | sudo bash -
  sudo apt-get install -y nodejs
fi
echo "Node: $(node --version)"
# ── 2. Global tools ───────────────────────────────────────────────────────────
# Install tsx and pm2 globally: first through sudo (stderr hidden); if that
# attempt fails, retry as the current user.
if ! sudo npm install -g tsx pm2 2>/dev/null; then
  npm install -g tsx pm2
fi

# The last line printed by `pm2 startup` is piped into a root shell.
# Best-effort: a failure here does not abort the setup.
pm2 startup systemd -u "$USER" --hp "$HOME" \
  | tail -1 \
  | sudo bash \
  || true
# ── 3. Clone / update repo ───────────────────────────────────────────────────
# Only treat the directory as an existing checkout when it really is a git
# repository; a leftover empty directory falls through to the clone path.
if [ -d "$INSTALL_DIR/.git" ]; then
  echo "Updating existing repo..."
  git -C "$INSTALL_DIR" pull
else
  echo "Cloning from Gitea..."
  # The install directory may live somewhere only root can write to (such as
  # /opt). Create it with sudo when needed and hand it to the invoking user so
  # the clone, npm install and the generated files all work without root.
  if [ ! -d "$INSTALL_DIR" ]; then
    if ! mkdir -p "$INSTALL_DIR" 2>/dev/null; then
      sudo mkdir -p "$INSTALL_DIR"
    fi
  fi
  if [ ! -w "$INSTALL_DIR" ]; then
    sudo chown "$(id -un):$(id -gn)" "$INSTALL_DIR"
  fi
  git clone "$GITEA" "$INSTALL_DIR"
fi
cd "$INSTALL_DIR"
# ── 4. Install deps (scraper package only, skip Playwright) ──────────────────
# The path is relative to the current directory, which is the repository root
# at this point in the script.
cd ./packages/scraper

# --ignore-scripts skips playwright browser download
npm install --ignore-scripts

printf '%s\n' "Dependencies installed"
# ── 5. .env file ─────────────────────────────────────────────────────────────
# The file contains the database password, so it must not be readable by other
# users: it is created under a restrictive umask (inside a subshell, so the
# umask does not leak into the rest of the script) and then forced to 600 in
# case an older, more permissive copy already existed.
(
  umask 077
  cat > "$INSTALL_DIR/.env" <<EOF
POSTGRES_HOST=$DB_HOST
POSTGRES_PORT=$DB_PORT
POSTGRES_USER=$DB_USER
POSTGRES_PASSWORD=$DB_PASS
POSTGRES_DB=$DB_NAME
CRAWLEE_STORAGE_DIR=/tmp/tip-crawlee
NODE_ENV=production
PI_NODE=true
EOF
)
chmod 600 "$INSTALL_DIR/.env"
echo ".env written"
# ── 6. Pi-specific scheduler index ───────────────────────────────────────────
# The Pi runs only fetch/cheerio scrapers — no Playwright.
# The quoted 'PIEOF' delimiter writes the TypeScript verbatim: bash performs no
# expansion on the ${...} template placeholders inside it.
cat > "$INSTALL_DIR/packages/scraper/src/index-pi.ts" <<'PIEOF'
/**
 * Pi Scraper Index — lightweight fetch/cheerio only
 * No Playwright, no eBay enricher, no heavy compute
 * Runs 24/7 on Raspberry Pi nodes
 */
import { config } from "dotenv";
import { join } from "path";
config({ path: join(__dirname, "..", "..", "..", ".env") });
import PgBoss from "pg-boss";

/** One Pi-safe scraper: the queue it serves, its module, and the exported entry point. */
interface PiScraper {
  queue: string;
  modulePath: string;
  exportName: string;
}

// Single source of truth for queue → module → function, so the three can no
// longer drift apart the way separate lookup tables could.
const PI_SCRAPERS: readonly PiScraper[] = [
  { queue: "scrape:pricing:fluxlight", modulePath: "./scrapers/fluxlight", exportName: "scrapeFluxlight" },
  { queue: "scrape:pricing:gbics", modulePath: "./scrapers/gbics", exportName: "scrapeGbics" },
  { queue: "scrape:pricing:optcore", modulePath: "./scrapers/optcore", exportName: "scrapeOptcore" },
  { queue: "scrape:pricing:champion-one", modulePath: "./scrapers/champion-one", exportName: "scrapeChampionOne" },
  { queue: "scrape:pricing:sfpcables", modulePath: "./scrapers/sfpcables", exportName: "scrapeSfpCables" },
  { queue: "scrape:pricing:blueoptics", modulePath: "./scrapers/blueoptics", exportName: "scrapeBlueOptics" },
  { queue: "scrape:pricing:fiber24", modulePath: "./scrapers/fiber24", exportName: "scrapeFiber24" },
  { queue: "scrape:pricing:tscom", modulePath: "./scrapers/tscom", exportName: "scrapeTsCom" },
  { queue: "scrape:pricing:skylane", modulePath: "./scrapers/skylane", exportName: "scrapeSkylane" },
  { queue: "scrape:pricing:ascentoptics", modulePath: "./scrapers/ascentoptics", exportName: "scrapeAscentOptics" },
  { queue: "scrape:pricing:gaotek", modulePath: "./scrapers/gaotek", exportName: "scrapeGaoTek" },
  { queue: "scrape:catalog:smartoptics", modulePath: "./scrapers/smartoptics", exportName: "scrapeSmartOptics" },
  { queue: "scrape:catalog:hubersuhner", modulePath: "./scrapers/hubersuhner", exportName: "scrapeHuberSuhner" },
  { queue: "scrape:news", modulePath: "./scrapers/news", exportName: "scrapeNews" },
];

// Market intelligence has its own worker below because its module is loaded
// lazily, inside the job handler.
const MARKET_INTEL_QUEUE = "scrape:market-intel";

/** Every queue this node serves, in registration order. */
const PI_QUEUES: readonly string[] = [...PI_SCRAPERS.map((s) => s.queue), MARKET_INTEL_QUEUE];

/**
 * Reads a required environment variable.
 * @throws Error when the variable is unset or empty, instead of letting the
 *         literal text "undefined" end up inside the connection string.
 */
function requireEnv(name: string): string {
  const value = process.env[name];
  if (value === undefined || value === "") {
    throw new Error(`Missing required environment variable ${name}`);
  }
  return value;
}

/**
 * Builds the PostgreSQL connection URI from the POSTGRES_* variables.
 * User, password and database name are percent-encoded so that characters
 * such as "@", ":", "/" or "#" cannot corrupt the URI.
 */
function buildConnectionString(): string {
  const user = encodeURIComponent(requireEnv("POSTGRES_USER"));
  const password = encodeURIComponent(process.env.POSTGRES_PASSWORD ?? "");
  const host = requireEnv("POSTGRES_HOST");
  // `||` on purpose: an empty POSTGRES_PORT should also fall back to the default.
  const port = process.env.POSTGRES_PORT || "5433";
  const database = encodeURIComponent(requireEnv("POSTGRES_DB"));
  return `postgres://${user}:${password}@${host}:${port}/${database}`;
}

/** Shortens an arbitrary thrown value for single-line logging. */
function brief(e: unknown): string {
  return String(e).slice(0, 200);
}

/** Starts pg-boss, registers one worker per Pi queue and then waits for jobs. */
async function main(): Promise<void> {
  const nodeName = process.env.PI_NAME || "pi";
  console.log(`\n=== TIP Pi Scraper (${nodeName}) ===\n`);

  const boss = new PgBoss({
    connectionString: buildConnectionString(),
    retryLimit: 2,
    retryDelay: 60,
    expireInSeconds: 3600,
    monitorStateIntervalSeconds: 60,
  });
  boss.on("error", (e) => console.error("pg-boss error:", e));
  await boss.start();

  // Best-effort queue creation: a rejection from createQueue is ignored.
  // NOTE(review): presumably this covers "queue already exists" — confirm.
  for (const q of PI_QUEUES) {
    await boss.createQueue(q).catch(() => {});
  }

  // Register workers for all Pi-safe scrapers
  let registered = 0;
  for (const { queue, modulePath, exportName } of PI_SCRAPERS) {
    // A scraper module that fails to load is skipped with a warning, the same
    // way a missing export is, so one broken scraper cannot stop the others.
    let mod: Record<string, unknown>;
    try {
      mod = await import(modulePath);
    } catch (e) {
      console.warn(`Cannot load ${modulePath} for ${queue}:`, brief(e));
      continue;
    }

    const fn = mod[exportName];
    if (typeof fn !== "function") {
      console.warn(`No function ${exportName} in ${modulePath}`);
      continue;
    }

    await boss.work(queue, async () => {
      console.log(`[${new Date().toISOString()}] [${nodeName}] Running: ${queue}`);
      // Failures are logged and swallowed, so the handler resolves normally
      // even when the scraper throws.
      // NOTE(review): pg-boss presumably records such a job as completed and
      // never retries it — confirm this is intended.
      try {
        await fn();
      } catch (e) {
        console.error(`[${queue}] failed:`, brief(e));
      }
    });
    registered++;
  }

  // Market intel worker
  await boss.work(MARKET_INTEL_QUEUE, async () => {
    console.log(`[${new Date().toISOString()}] Running: Market intelligence`);
    const { scrapeMarketIntelligence } = await import("./scrapers/market-intelligence");
    try {
      await scrapeMarketIntelligence();
    } catch (e) {
      console.error("market-intel failed:", brief(e));
    }
  });
  registered++;

  console.log(`Pi worker registered for ${registered} of ${PI_QUEUES.length} queues\nWaiting for jobs...\n`);

  // Stop pg-boss before exiting on either termination signal.
  const shutdown = async (): Promise<void> => {
    await boss.stop();
    process.exit(0);
  };
  process.on("SIGTERM", shutdown);
  process.on("SIGINT", shutdown);
}

main().catch((e) => {
  console.error("Fatal:", e);
  process.exit(1);
});
PIEOF
# ── 7. PM2 process ───────────────────────────────────────────────────────────
cd "$INSTALL_DIR"

# PI_NAME is passed through the environment because index-pi.ts reads
# process.env.PI_NAME for its log prefix. If `pm2 start` fails, fall back to
# restarting the existing process; --update-env makes that restart pick up the
# current environment (including a changed PI_NAME).
PI_NAME="$PI_NAME" pm2 start packages/scraper/src/index-pi.ts \
  --name "tip-pi-scraper" \
  --interpreter "$(command -v tsx)" \
  --cwd "$INSTALL_DIR" \
  || PI_NAME="$PI_NAME" pm2 restart tip-pi-scraper --update-env
pm2 save

# PI_QUEUES is a TypeScript constant inside index-pi.ts, not a Bash array, so
# ${#PI_QUEUES[@]} always expanded to 0. Count the distinct "scrape:…" queue
# names in the generated file instead. `|| true` keeps an empty match from
# aborting the script when pipefail is active.
PI_INDEX="$INSTALL_DIR/packages/scraper/src/index-pi.ts"
QUEUE_COUNT="$( { grep -o '"scrape:[^"]*"' "$PI_INDEX" 2>/dev/null || true; } | sort -u | wc -l | tr -d '[:space:]' )"

echo ""
echo "✅ TIP Pi Scraper ($PI_NAME) is running"
echo " pm2 logs tip-pi-scraper — view logs"
echo " pm2 status — check status"
echo ""
echo "DB target: $DB_HOST:$DB_PORT/$DB_NAME"
echo "Jobs: $QUEUE_COUNT lightweight scrapers, all day every day"