transceiver-db/scripts/pi-scraper-setup.sh
Rene Fichtmueller 0edc6e3f3a feat: Pi scraper fleet — fetch-only index-pi.ts + FS.COM/NADDOD via SOCKS5
- index-pi.ts: removed Playwright scrapers (FS.COM, eBay enricher, switch assets)
  added NADDOD (fetch-based, benefits from residential IP)
  now 32 fetch-only queues safe for ARM/Pi without Chromium
- index-fs-only.ts: new dedicated FS.COM + NADDOD worker for Erik
  routes through Pi SOCKS5 via PROXY_URLS=socks5://10.10.0.6:1080
  Crawlee ProxyConfiguration automatically applies to Playwright crawler
- pi-scraper-setup.sh: removed inline index-pi.ts override (repo version now authoritative)
- CODEX-TASK-pi-scraper-deploy.md: full 9-step Codex spec for Pi fleet setup
  covers WireGuard keypair, Erik peer config, setup script, ecosystem.config.js
- CODEX-TASK-zero-manual-review.md: deterministic equivalence matcher spec
2026-05-10 09:53:55 +02:00

204 lines
8.2 KiB
Bash

#!/bin/bash
# ─────────────────────────────────────────────────────────────────────────────
# TIP Pi Scraper Setup — run this ONCE on each Raspberry Pi
#
# Usage (from the Pi itself or via SSH once you have access):
# curl -sL https://gitea.context-x.org/rene/transceiver-db/raw/branch/main/scripts/pi-scraper-setup.sh | bash
#
# Or copy & run manually:
# bash pi-scraper-setup.sh
#
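# What this does:
# 1. Installs Node.js 22 + tsx + pm2
# 2. Clones the TIP scraper package
# 3. Installs dependencies (no Playwright — Pi runs fetch-only scrapers)
# 4. Creates .env pointing to Erik's PostgreSQL via WireGuard
# 5. Optionally configures the WireGuard tunnel to Erik (only when WG_PRIVKEY is set)
# 6. Starts pm2 with the Pi-specific scheduler (lightweight scrapers only)
# 7. Optionally installs a SOCKS5 proxy agent (PROXY_AGENT=1, also requires WG_PRIVKEY)
#
# Environment overrides: PI_NAME, DB_HOST, DB_PORT, DB_USER, DB_PASS, DB_NAME,
# WG_PRIVKEY, WG_ADDR, PROXY_AGENT, PROXY_PORT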
# ─────────────────────────────────────────────────────────────────────────────
set -e

# ── Configuration ────────────────────────────────────────────────────────────
# Every value below can be overridden from the environment, e.g.
#   PI_NAME=pi2 WG_ADDR=10.10.0.6 bash pi-scraper-setup.sh
PI_NAME="${PI_NAME:-pi-scraper}"   # label printed by this script and exported to the pm2 process
DB_HOST="${DB_HOST:-10.10.0.1}"    # Erik WireGuard IP
DB_PORT="${DB_PORT:-5433}"
DB_USER="${DB_USER:-tip}"
# NOTE(review): a default credential is baked into this script; prefer passing
# DB_PASS explicitly from the environment.
DB_PASS="${DB_PASS:-***REDACTED***}"
DB_NAME="${DB_NAME:-transceiver_db}"
# Git remote to clone. The default is a private LAN address, so set GITEA to a
# reachable URL when the Pi is not on that network.
GITEA="${GITEA:-http://192.168.178.196:3000/rene/transceiver-db.git}"
# Checkout location; later steps write .env and run pm2 relative to it.
INSTALL_DIR="${INSTALL_DIR:-/opt/tip-scraper}"

echo "=== TIP Pi Scraper Setup: $PI_NAME ==="
# ── 1. Node.js 22 ────────────────────────────────────────────────────────────
# Install Node.js 22 from NodeSource when node is missing or older than v20.
# The major version is compared numerically: a string comparison against "v20"
# would accept v8/v9 as "new enough" because "9" sorts after "2".
node_major=0
if command -v node >/dev/null 2>&1; then
  node_version="$(node --version 2>/dev/null || true)"   # e.g. "v22.3.0"
  node_version="${node_version#v}"
  node_major="${node_version%%.*}"
  # Anything unparsable counts as "too old" so that Node gets (re)installed.
  [[ "$node_major" =~ ^[0-9]+$ ]] || node_major=0
fi
if (( node_major < 20 )); then
  echo "Installing Node.js 22..."
  curl -fsSL https://deb.nodesource.com/setup_22.x | sudo bash -
  sudo apt-get install -y nodejs
fi
echo "Node: $(node --version)"
# ── 2. Global tools ───────────────────────────────────────────────────────────
# tsx is the interpreter pm2 uses for the TypeScript entry point; pm2 supervises it.
# Try a global install through sudo first (its stderr is hidden); if that fails,
# retry as the current user.
if ! sudo npm install -g tsx pm2 2>/dev/null; then
  npm install -g tsx pm2
fi
# The last output line of `pm2 startup` is fed to a root shell (presumably the
# generated systemd install command — confirm against pm2 docs).
# Best-effort: a failure here does not abort the setup.
{ pm2 startup systemd -u "$USER" --hp "$HOME" | tail -n 1 | sudo bash; } || true
# ── 3. Clone / update repo ───────────────────────────────────────────────────
# Update when $INSTALL_DIR already holds a git checkout, otherwise clone into it.
# Testing for .git (not just the directory) lets a pre-created empty directory
# still be cloned into instead of failing in `git pull`.
if [ -d "$INSTALL_DIR/.git" ]; then
  echo "Updating existing repo..."
  cd "$INSTALL_DIR" && git pull
else
  echo "Cloning from Gitea..."
  if [ ! -d "$INSTALL_DIR" ]; then
    # The parent (e.g. /opt) is typically root-owned: create the directory with
    # sudo and hand it to the invoking user so git and npm can write to it.
    sudo mkdir -p "$INSTALL_DIR"
    sudo chown "$(id -un):$(id -gn)" "$INSTALL_DIR"
  fi
  git clone "$GITEA" "$INSTALL_DIR"
fi
cd "$INSTALL_DIR"
# ── 4. Install deps (scraper package only, skip Playwright) ──────────────────
# Switch into the scraper package; the working directory stays there afterwards.
cd packages/scraper
# --ignore-scripts: skips playwright browser download
npm install --ignore-scripts
printf '%s\n' "Dependencies installed"
# ── 5. .env file ─────────────────────────────────────────────────────────────
# Connection settings for the scraper. The file contains the DB password, so it
# is created with a restrictive umask and forced to mode 600 (the chmod also
# tightens a file left over from an earlier run with looser permissions).
env_file="$INSTALL_DIR/.env"
(
  umask 077
  cat > "$env_file" <<EOF
POSTGRES_HOST=$DB_HOST
POSTGRES_PORT=$DB_PORT
POSTGRES_USER=$DB_USER
POSTGRES_PASSWORD=$DB_PASS
POSTGRES_DB=$DB_NAME
CRAWLEE_STORAGE_DIR=/tmp/tip-crawlee
NODE_ENV=production
PI_NODE=true
EOF
)
chmod 600 "$env_file"
echo ".env written"
# ── 6. Pi-specific scheduler index ───────────────────────────────────────────
# The scheduler entry point ships with the repo (fetch-only, NO Playwright), so
# nothing is generated here — this step only confirms the checkout contains it.
pi_index="$INSTALL_DIR/packages/scraper/src/index-pi.ts"
[ -f "$pi_index" ] || {
  echo "ERROR: index-pi.ts missing in repo — check git clone"
  exit 1
}
echo "index-pi.ts: OK ($(wc -l < "$pi_index") lines)"
# ── 7. WireGuard (connects to Erik 10.10.0.1 for DB access) ─────────────────
# Runs only when WG_PRIVKEY is provided; otherwise this step is skipped.
WG_PRIVKEY="${WG_PRIVKEY:-}"
ERIK_PUBKEY="nrh8xiPzUWwLDK4y6+Cu0V3ne56zobIHKtxMGb7BKQo="
ERIK_ENDPOINT="82.165.222.127:51820"
# Host part of the endpoint: `ip route add` takes an address, not host:port.
ERIK_HOST="${ERIK_ENDPOINT%:*}"
WG_ADDR="${WG_ADDR:-10.10.0.9}" # override per Pi: WG_ADDR=10.10.0.6
if [ -n "$WG_PRIVKEY" ]; then
  # Best-effort install; only the last line of apt output is shown.
  sudo apt-get install -y wireguard-tools 2>/dev/null | tail -1 || true
  # Detect primary outgoing interface
  OUTIF=$(ip route get 8.8.8.8 2>/dev/null | awk '{for(i=1;i<=NF;i++) if($i=="dev") print $(i+1)}' | head -1)
  POSTUPCMD=""
  if [ -n "$OUTIF" ] && ! ping -c1 -W2 8.8.8.8 &>/dev/null; then
    # Fallback route for WG traffic if default interface has no internet
    GW=$(ip route | awk '/default/{print $3; exit}')
    # Without a default gateway there is nothing to route via; skip PostUp
    # instead of writing a malformed command.
    if [ -n "$GW" ]; then
      POSTUPCMD="PostUp = ip route add $ERIK_HOST via $GW dev $OUTIF 2>/dev/null || true"
    fi
  fi
  # mktemp gives an unpredictable, owner-only file, so the private key is never
  # readable by other users while it sits in the temp directory.
  wg_tmp="$(mktemp)"
  cat > "$wg_tmp" <<WGEOF
[Interface]
PrivateKey = $WG_PRIVKEY
Address = $WG_ADDR/24
$POSTUPCMD
[Peer]
PublicKey = $ERIK_PUBKEY
Endpoint = $ERIK_ENDPOINT
AllowedIPs = 10.10.0.1/32
PersistentKeepalive = 25
WGEOF
  # Install with final owner/mode in one step; always remove the temp copy.
  if ! sudo install -m 600 -o root -g root "$wg_tmp" /etc/wireguard/wg0.conf; then
    rm -f "$wg_tmp"
    echo "ERROR: could not install /etc/wireguard/wg0.conf" >&2
    exit 1
  fi
  rm -f "$wg_tmp"
  # Bring the tunnel down first so a re-run picks up the new config.
  sudo wg-quick down wg0 2>/dev/null || true
  sudo wg-quick up wg0
  sudo systemctl enable wg-quick@wg0
  echo "WireGuard: $(sudo wg show wg0 | grep 'latest handshake' || echo 'starting...')"
else
  echo "WireGuard: skipped (set WG_PRIVKEY and WG_ADDR to enable)"
fi
# ── 8. PM2 process ───────────────────────────────────────────────────────────
# Start the Pi scheduler under pm2; if `pm2 start` fails (e.g. the process is
# already registered from an earlier run), restart it instead.
cd "$INSTALL_DIR"
# Resolve tsx up front so a missing interpreter is reported clearly instead of
# being passed to pm2 as an empty string.
tsx_bin="$(command -v tsx || true)"
if [ -z "$tsx_bin" ]; then
  echo "ERROR: tsx not found in PATH — did the global npm install succeed?" >&2
  exit 1
fi
# --update-env on restart makes pm2 pick up the current PI_NAME; without it the
# environment captured at first start is reused.
PI_NAME="$PI_NAME" pm2 start \
  --name "tip-pi-scraper" \
  --interpreter "$tsx_bin" \
  --cwd "$INSTALL_DIR" \
  packages/scraper/src/index-pi.ts \
  || PI_NAME="$PI_NAME" pm2 restart tip-pi-scraper --update-env
# Persist the process list.
pm2 save
# ── 9. Optional: SOCKS5 Proxy Agent (Starlink bandwidth contribution) ────────
# Allows Erik scraper to route requests THROUGH this Pi's internet connection.
# Especially useful when Pi is on Starlink: different IP range, bypasses IONOS
# IP-based rate limiting on target vendor sites.
#
# Starlink notes:
# - CG-NAT: cannot accept direct incoming TCP from internet
# - WireGuard tunnel already bypasses this (Pi connects OUT to Erik)
# - SOCKS5 listens on WireGuard IP (10.10.0.x), not public interface
# - Erik routes selected scraper jobs through: ALL_PROXY=socks5://10.10.0.x:1080
#
# To enable: run with PROXY_AGENT=1 WG_ADDR=10.10.0.6 bash pi-scraper-setup.sh
# (WG_PRIVKEY must be set as well — the proxy is only installed together with
# the WireGuard step.)
PROXY_AGENT="${PROXY_AGENT:-0}"
PROXY_PORT="${PROXY_PORT:-1080}"
if [ "$PROXY_AGENT" = "1" ] && [ -n "$WG_PRIVKEY" ]; then
  echo ""
  echo "── Installing SOCKS5 Proxy Agent ────────────────────────────────────"
  # Install dante-server (lightweight SOCKS5 for Linux)
  sudo apt-get install -y dante-server 2>/dev/null | tail -1 || true
  # First IPv4 address on wg0; fall back to the configured WG_ADDR.
  WG_IP=$(ip addr show wg0 2>/dev/null | awk '/inet /{print $2; exit}' | cut -d/ -f1)
  if [ -z "$WG_IP" ]; then
    WG_IP="$WG_ADDR"
  fi
  # Detect Starlink interface (usually eth0 or wlan0 — the WAN interface)
  OUTIF=$(ip route get 8.8.8.8 2>/dev/null | awk '{for(i=1;i<=NF;i++) if($i=="dev") print $(i+1)}' | head -1)
  # An empty interface would produce an invalid "external:" line; stop with a
  # clear message instead of failing later in the danted restart.
  if [ -z "$OUTIF" ]; then
    echo "ERROR: cannot determine the outgoing interface for the SOCKS5 proxy (no route to 8.8.8.8)" >&2
    exit 1
  fi
  # Build the config in an unpredictable temp file rather than a fixed /tmp path.
  dante_tmp="$(mktemp)"
  cat > "$dante_tmp" << DANTEEOF
logoutput: syslog
internal: $WG_IP port = $PROXY_PORT
external: $OUTIF
socksmethod: none
clientmethod: none
client pass {
from: 10.10.0.0/24 to: 0.0.0.0/0
log: error
}
socks pass {
from: 10.10.0.0/24 to: 0.0.0.0/0
protocol: tcp
log: error
}
DANTEEOF
  if ! sudo install -m 644 -o root -g root "$dante_tmp" /etc/danted.conf; then
    rm -f "$dante_tmp"
    echo "ERROR: could not install /etc/danted.conf" >&2
    exit 1
  fi
  rm -f "$dante_tmp"
  sudo systemctl enable danted
  sudo systemctl restart danted
  echo "SOCKS5 proxy listening on $WG_IP:$PROXY_PORT (WireGuard-only, no public exposure)"
  echo "Use from Erik: ALL_PROXY=socks5://$WG_IP:$PROXY_PORT curl https://example.com"
  echo "Starlink interface: $OUTIF"
  echo ""
  echo "To use for scraper jobs, set in Erik ecosystem.config.js:"
  echo " ALL_PROXY: 'socks5://$WG_IP:$PROXY_PORT' # for jobs that need Starlink IP"
elif [ "$PROXY_AGENT" = "1" ]; then
  # Make the skip visible instead of silently ignoring PROXY_AGENT=1.
  echo "SOCKS5 proxy: skipped (PROXY_AGENT=1 also requires WG_PRIVKEY)" >&2
fi
#######################################
# Print the final status summary.
# Globals (read): PI_NAME, DB_HOST, DB_PORT, DB_NAME, PROXY_AGENT, WG_PRIVKEY,
#                 WG_ADDR, PROXY_PORT
# Outputs: human-readable summary on stdout
#######################################
print_summary() {
  echo ""
  echo "✅ TIP Pi Scraper ($PI_NAME) is running"
  echo " pm2 logs tip-pi-scraper — view logs"
  echo " pm2 status — check status"
  echo ""
  echo "DB target: $DB_HOST:$DB_PORT/$DB_NAME"
  # No queue list is defined in this script (the scheduler lives in
  # index-pi.ts), so no count is printed — the old array length was always 0.
  echo "Jobs: fetch-only scrapers from index-pi.ts, all day every day"
  # Same condition as the proxy install step: only announce the proxy when it
  # was actually set up (PROXY_AGENT=1 together with a WireGuard key).
  if [ "$PROXY_AGENT" = "1" ] && [ -n "$WG_PRIVKEY" ]; then
    echo "SOCKS5 proxy: socks5://$WG_ADDR:$PROXY_PORT (Starlink bandwidth via WireGuard)"
  fi
}
print_summary