diff --git a/CODEX-TASK-pi-scraper-deploy.md b/CODEX-TASK-pi-scraper-deploy.md new file mode 100644 index 0000000..aad43de --- /dev/null +++ b/CODEX-TASK-pi-scraper-deploy.md @@ -0,0 +1,346 @@ +# CODEX TASK: Raspberry Pi Scraper Fleet Deployment + +## Ziel + +Einen oder mehrere Raspberry Pis als dedizierte Scraper-Nodes in die TIP-Infrastruktur +integrieren — mit WireGuard VPN zu Erik's PostgreSQL und SOCKS5-Proxy für FS.COM. + +**Warum Pi statt Erik:** +- Residential IP → FS.COM, NADDOD blockieren Datacenter-IPs (IONOS wird erkannt) +- Starlink oder Kabel-IP = andere IP-Range = höhere Scraper-Erfolgsrate +- Keine Kosten für separate Proxies (€50-150/Monat bei kommerziellen Diensten) +- Pi 4B (4GB) kostet ~60€ einmalig und läuft dauerhaft + +**Architektur nach Deployment:** +``` +Raspberry Pi(s) Erik (82.165.222.127) +┌────────────────────────┐ ┌─────────────────────────┐ +│ index-pi.ts │ VPN │ PostgreSQL + pg-boss │ +│ 32 fetch-only scrapers│◄───────►│ tip-api, tip-mcp │ +│ (NADDOD, GBICS, etc.) │ │ tip-scraper (Playwright)│ +│ │ │ ↑ PROXY_URLS │ +│ dante SOCKS5 :1080 │◄────────│ socks5://Pi:1080 │ +│ (WireGuard IP only) │ │ für FS.COM scraper │ +└────────────────────────┘ └─────────────────────────┘ + WG: 10.10.0.6 WG: 10.10.0.1 +``` + +--- + +## Repository + +``` +Pfad lokal: /Users/renefichtmueller/Desktop/Claude Code/github-repos/transceiver-db/ +Pfad Erik: /opt/tip/ +Gitea: http://192.168.178.196:3000/rene/transceiver-db.git +``` + +--- + +## SCHRITT 1: WireGuard — Pi-Keypair generieren (lokal oder auf Erik) + +Führe dies auf einem Linux-System aus (oder auf dem Pi selbst): + +```bash +# WireGuard keypair für Pi1 generieren +wg genkey | tee /tmp/pi1-privkey | wg pubkey > /tmp/pi1-pubkey +cat /tmp/pi1-privkey # → wird PI_PRIVKEY beim Pi-Setup +cat /tmp/pi1-pubkey # → wird in Erik's wg0.conf eingetragen +``` + +Notiere: +- `PI_PRIVKEY` = privater Schlüssel (bleibt auf dem Pi, NIEMALS in Git) +- `PI_PUBKEY` = öffentlicher Schlüssel (wird auf Erik konfiguriert) + +--- + +## SCHRITT 2: WireGuard auf Erik — Pi als Peer hinzufügen + +**SSH auf Erik:** +```bash +ssh erik-cf +``` + +**Datei bearbeiten:** `/etc/wireguard/wg0.conf` + +Füge am Ende hinzu: +```ini +# Raspberry Pi 1 — Scraper Node +[Peer] +PublicKey = +AllowedIPs = 10.10.0.6/32 +PersistentKeepalive = 25 +``` + +**WireGuard neu laden (ohne Verbindungsabbruch):** +```bash +sudo wg syncconf wg0 <(sudo wg-quick strip wg0) +# oder bei komplettem Reload: +sudo systemctl restart wg-quick@wg0 +``` + +**Verify:** +```bash +sudo wg show wg0 # muss den Pi-Peer als "Peer:" zeigen +``` + +**PostgreSQL Firewall auf Erik — Pi-IP erlauben:** +```bash +# UFW: WireGuard-Traffic von Pi zur DB erlauben +sudo ufw allow in on wg0 from 10.10.0.6 to 10.10.0.1 port 5433 proto tcp comment "Pi1 → PostgreSQL" +sudo ufw status | grep 5433 +``` + +**pg_hba.conf auf Erik anpassen** (damit Pi sich zur DB verbinden kann): +```bash +# Falls nicht schon eingetragen: +echo "host transceiver_db tip 10.10.0.0/24 scram-sha-256" | sudo tee -a /etc/postgresql/17/main/pg_hba.conf +sudo systemctl reload postgresql +``` + +--- + +## SCHRITT 3: Pi physisch einrichten + +Raspberry Pi OS Lite (64-bit) flashen und SSH aktivieren. + +**Auf dem Pi ausführen:** +```bash +# Einmalig: TIP Scraper Setup mit WireGuard + SOCKS5 +PI_NAME=pi1 \ +DB_HOST=10.10.0.1 \ +DB_PORT=5433 \ +DB_USER=tip \ +DB_PASS= \ +DB_NAME=transceiver_db \ +WG_PRIVKEY= \ +WG_ADDR=10.10.0.6 \ +PROXY_AGENT=1 \ +bash <(curl -sL http://192.168.178.196:3000/rene/transceiver-db/raw/branch/main/scripts/pi-scraper-setup.sh) +``` + +**Was das Script macht:** +1. Node.js 22 + tsx + pm2 installieren +2. Repo von Gitea klonen nach `/opt/tip-scraper/` +3. npm install --ignore-scripts (kein Playwright!) +4. .env schreiben (DB via WireGuard: 10.10.0.1:5433) +5. WireGuard konfigurieren (Pi → Erik) +6. PM2 mit `index-pi.ts` starten (32 fetch-only scrapers) +7. dante SOCKS5-Proxy auf 10.10.0.6:1080 starten + +**Verify auf Pi:** +```bash +pm2 status # tip-pi-scraper: online +pm2 logs tip-pi-scraper --lines 20 # muss "32 queues / workers active" zeigen +sudo wg show wg0 # muss Handshake mit Erik zeigen +ss -tlnp | grep 1080 # dante: listening +``` + +--- + +## SCHRITT 4: Erik — FS.COM Scraper durch Pi-Proxy routen + +**Datei:** `/opt/tip/ecosystem.config.js` + +Den bestehenden `tip-scraper` Eintrag aufsplitten in: +- `tip-scraper-pi` — läuft auf dem Pi (kein Eintrag nötig hier, Pi macht's selbst) +- `tip-scraper-fs` — FS.COM via Pi SOCKS5 (neuer PM2-Prozess auf Erik) +- `tip-scraper` — alle anderen Playwright-Scrapers auf Erik + +**Füge folgenden Eintrag in `ecosystem.config.js` hinzu** (neben dem bestehenden `tip-scraper`): + +```javascript +{ + name: "tip-scraper-fs", + script: "./node_modules/.bin/tsx", + args: "packages/scraper/src/index-fs-only.ts", + cwd: "/opt/tip", + interpreter: "none", + exec_mode: "fork", + env: { + NODE_ENV: "production", + POSTGRES_HOST: "localhost", + POSTGRES_PORT: "5433", + POSTGRES_DB: "transceiver_db", + POSTGRES_USER: "tip", + POSTGRES_PASSWORD: "", + PROXY_URLS: "socks5://10.10.0.6:1080", // ← Pi SOCKS5 + CRAWLEE_STORAGE_DIR: "/tmp/tip-crawlee-fs", + }, + max_memory_restart: "800M", + instances: 1, + autorestart: true, +}, +``` + +--- + +## SCHRITT 5: index-fs-only.ts erstellen (Erik-side, nur FS.COM) + +**Erstelle Datei:** `packages/scraper/src/index-fs-only.ts` + +```typescript +/** + * TIP FS.COM Dedicated Scraper + * + * Runs on ERIK but routes traffic through Pi's SOCKS5 proxy + * so FS.com sees a residential IP instead of IONOS datacenter IP. + * + * PROXY_URLS=socks5://10.10.0.6:1080 must be set in environment. + */ +import { config } from "dotenv"; +import { join } from "path"; +config({ path: join(__dirname, "..", "..", "..", ".env") }); + +import PgBoss from "pg-boss"; +import { mkdirSync, rmSync } from "fs"; + +const connectionString = `postgres://${process.env.POSTGRES_USER}:${process.env.POSTGRES_PASSWORD}@${process.env.POSTGRES_HOST}:${process.env.POSTGRES_PORT || "5433"}/${process.env.POSTGRES_DB}`; + +async function withIsolatedStorage(name: string, fn: () => Promise): Promise { + const dir = `/tmp/tip-crawlee-${name}-${Date.now()}`; + mkdirSync(join(dir, "request_queues", "default"), { recursive: true }); + mkdirSync(join(dir, "datasets", "default"), { recursive: true }); + mkdirSync(join(dir, "key_value_stores", "default"), { recursive: true }); + const prev = process.env.CRAWLEE_STORAGE_DIR; + process.env.CRAWLEE_STORAGE_DIR = dir; + try { await fn(); } + finally { + process.env.CRAWLEE_STORAGE_DIR = prev ?? ""; + try { rmSync(dir, { recursive: true, force: true }); } catch {} + } +} + +async function main() { + const proxy = process.env.PROXY_URLS; + console.log(`\n=== TIP FS.COM Scraper (proxy: ${proxy ?? "none"}) ===\n`); + if (!proxy) { + console.warn("WARNING: PROXY_URLS not set — FS.com will see IONOS IP (may be blocked)"); + } + + const boss = new PgBoss({ + connectionString, + retryLimit: 3, + retryDelay: 300, // 5 min retry on failure + expireInSeconds: 7200, // 2h timeout for full FS catalog run + monitorStateIntervalSeconds: 60, + }); + + boss.on("error", (e: Error) => console.error("pg-boss error:", e.message)); + await boss.start(); + + await boss.createQueue("scrape:pricing:fs").catch(() => {}); + await boss.createQueue("scrape:pricing:naddod").catch(() => {}); + + const { scrapeFs } = await import("./scrapers/fs-com"); + const { scrapeNaddod } = await import("./scrapers/naddod"); + + await boss.work("scrape:pricing:fs", async () => { + console.log(`[${new Date().toISOString()}] FS.COM via ${proxy ?? "direct"}`); + await withIsolatedStorage("fs", scrapeFs); + }); + + // NADDOD also benefits from residential IP (less aggressive rate limiting) + await boss.work("scrape:pricing:naddod", async () => { + console.log(`[${new Date().toISOString()}] NADDOD via ${proxy ?? "direct"}`); + await scrapeNaddod(); + }); + + console.log("FS.COM + NADDOD workers active — waiting for scheduled jobs\n"); + process.on("SIGTERM", async () => { await boss.stop(); process.exit(0); }); + process.on("SIGINT", async () => { await boss.stop(); process.exit(0); }); +} + +main().catch((e) => { console.error("Fatal:", e); process.exit(1); }); +``` + +**Hinweis:** Wenn `PROXY_URLS` gesetzt ist, nutzt der FS.COM PlaywrightCrawler automatisch +den SOCKS5-Proxy via Crawlee's ProxyConfiguration. Kein weiterer Code nötig. + +--- + +## SCHRITT 6: Scheduler — NADDOD-Job aus index.ts entfernen + +Da NADDOD nun durch `index-fs-only.ts` (auf Erik) und `index-pi.ts` (auf Pi) gehandhabt wird, +muss `scrape:pricing:naddod` nur noch EINMAL registriert werden. + +**Prüfe:** `packages/scraper/src/scheduler.ts` — suche nach `naddod` und stelle sicher, +dass der `scrape:pricing:naddod` Job-Schedule NUR EINMAL registriert ist. + +Falls doppelt vorhanden: Im `scheduler.ts` den zweiten Eintrag entfernen. + +--- + +## SCHRITT 7: Erik — PM2 neu starten + +```bash +ssh erik-cf +cd /opt/tip +git pull origin main +pm2 start ecosystem.config.js --only tip-scraper-fs +pm2 save +pm2 status # tip-scraper-fs muss online sein +``` + +--- + +## SCHRITT 8: Verifikation (automatisch — kein manuelles Debugging) + +Die Scrapers verifizieren sich selbst. Folgende Checks laufen automatisch: + +**Pi selbst:** +```bash +# 5 min nach Setup auf dem Pi: +pm2 logs tip-pi-scraper --lines 50 | grep -E "\[(scrape|enrich)\]" +# Erwartete Ausgabe: Timestamp + Queue-Name alle paar Minuten +``` + +**FS.COM über Proxy:** +- FS.COM scraper läuft auf Erik um 02:00 und 14:00 (scheduled via pg-boss) +- Preise landen in `price_observations` Tabelle mit `vendor_id = fs-com` +- Automatische Diagnose: wenn `price` weiterhin NULL → Scraper-eigene Logs prüfen + +**Self-healing:** +- pg-boss retryLimit=3: Jobs werden automatisch 3x wiederholt bei Fehler +- PM2 autorestart: Prozess startet automatisch neu wenn er crasht +- WireGuard PersistentKeepalive=25: VPN bleibt auch bei langer Inaktivität aktiv + +--- + +## SCHRITT 9: Zweiter Pi (optional — für mehr IP-Diversität) + +Gleiche Prozedur, aber: +```bash +PI_NAME=pi2 WG_ADDR=10.10.0.7 WG_PRIVKEY= ... +``` + +Erik's wg0.conf: +```ini +[Peer] +PublicKey = +AllowedIPs = 10.10.0.7/32 +PersistentKeepalive = 25 +``` + +`ecosystem.config.js` auf Erik: zweiten `tip-scraper-fs` mit `PROXY_URLS: "socks5://10.10.0.7:1080"` +für einen weiteren Proxy-Pfad (Round-robin über beide Pis). + +--- + +## Zusammenfassung der neuen Dateien + +| Datei | Status | +|-------|--------| +| `packages/scraper/src/index-pi.ts` | ✅ Geändert: fetch-only, Playwright entfernt, NADDOD hinzugefügt | +| `packages/scraper/src/index-fs-only.ts` | 🆕 Erstellen (Schritt 5) | +| `scripts/pi-scraper-setup.sh` | ✅ Geändert: Kein inline index-pi.ts Override mehr | +| `ecosystem.config.js` (auf Erik) | ✅ Ändern: tip-scraper-fs Eintrag hinzufügen | +| Erik `/etc/wireguard/wg0.conf` | ✅ Manuell: Pi-Peer hinzufügen | +| Erik `/etc/postgresql/17/main/pg_hba.conf` | ✅ Manuell: 10.10.0.0/24 Zugang | + +## Manuelle Schritte (nicht von Codex ausführbar) + +1. **WireGuard Keypair generieren** (Schritt 1) — muss manuell auf sicherem System passieren +2. **Erik wg0.conf bearbeiten** (Schritt 2) — SSH auf Erik nötig +3. **Pi physisch aufsetzen** (Schritt 3) — Hardware + pi-scraper-setup.sh ausführen +4. **DB-Passwort** in setup-Befehl einsetzen (steht in `/opt/tip/.env` auf Erik) diff --git a/CODEX-TASK-zero-manual-review.md b/CODEX-TASK-zero-manual-review.md new file mode 100644 index 0000000..51722a1 --- /dev/null +++ b/CODEX-TASK-zero-manual-review.md @@ -0,0 +1,705 @@ +# CODEX TASK: Zero Manual Review Queue — Deterministic Equivalence Matching + +## Ziel + +Die manuelle Review-Queue (`transceiver_equivalences` mit `status = 'pending'`) wird eliminiert. +Statt probabilistischer Confidence-Scores wird ein deterministisches Exact-Match-System gebaut, +das nur dann einen Match erzeugt, wenn ALLE Pflichtfelder vorhanden und exakt gleich sind. + +**Aktueller Zustand (PROBLEM):** +- 13.374 Einträge in `pending` → manuelle Freigabe nötig +- Confidence-Score 0.0–1.0 → "56%" bedeutet Unsicherheit → Review nötig +- Fehlende Felder (wavelength=?) führen zu unsicheren Matches +- `auto_approved` ab 0.73 Confidence — zu niedrige Schwelle + +**Ziel-Zustand (LÖSUNG):** +- 0 Einträge in `pending` — nie wieder +- Kein Confidence-Score — nur MATCH oder KEIN MATCH +- Fehlende Felder → Enrichment-Job → kein Match bis Daten vollständig +- Nur Exact-Match (mit definierten Toleranzen) → 100% verlässliche Daten + +--- + +## Repository + +``` +Pfad: /opt/tip (auf Erik-Server) ODER + /Users/renefichtmueller/Desktop/Claude Code/github-repos/transceiver-db (lokal) + +Stack: TypeScript, Node.js, PostgreSQL 17, pg-boss (Job-Queue) +Packages: packages/scraper/src/scheduler.ts ← Haupt-Matching-Logik + packages/api/src/routes/review.ts ← API-Endpunkte + sql/ ← DB-Migrationen (nummeriert 001-104) +``` + +--- + +## SCHRITT 1: Datenbank-Migration — Neue Pflichtfelder + +**Erstelle Datei:** `sql/105-wavelength-connector-completeness.sql` + +```sql +-- Migration 105: Wavelength (TX/RX getrennt für BiDi), Connector-Normalisierung, +-- Data-Completeness-Score, Enrichment-Flag + +-- 1. Wellenlängen-Felder aufteilen (BiDi hat TX ≠ RX) +ALTER TABLE transceivers + ADD COLUMN IF NOT EXISTS wavelength_tx_nm INTEGER, -- TX-Wellenlänge in nm (z.B. 1270) + ADD COLUMN IF NOT EXISTS wavelength_rx_nm INTEGER, -- RX-Wellenlänge in nm (z.B. 1330) + ADD COLUMN IF NOT EXISTS connector_type TEXT, -- 'LC', 'SC', 'MPO-12', 'MPO-16', 'RJ45', 'CS', 'SN' + ADD COLUMN IF NOT EXISTS data_completeness INTEGER DEFAULT 0 CHECK (data_completeness BETWEEN 0 AND 100), + ADD COLUMN IF NOT EXISTS enrichment_needed BOOLEAN DEFAULT FALSE, + ADD COLUMN IF NOT EXISTS enrichment_fields TEXT[] DEFAULT '{}'; -- welche Felder fehlen noch + +-- 2. Indices für Performance +CREATE INDEX IF NOT EXISTS idx_tx_wavelength_tx ON transceivers (wavelength_tx_nm) WHERE wavelength_tx_nm IS NOT NULL; +CREATE INDEX IF NOT EXISTS idx_tx_wavelength_rx ON transceivers (wavelength_rx_nm) WHERE wavelength_rx_nm IS NOT NULL; +CREATE INDEX IF NOT EXISTS idx_tx_completeness ON transceivers (data_completeness); +CREATE INDEX IF NOT EXISTS idx_tx_enrichment ON transceivers (enrichment_needed) WHERE enrichment_needed = TRUE; + +-- 3. Completeness-Berechnungsfunktion +CREATE OR REPLACE FUNCTION calc_data_completeness( + p_form_factor TEXT, p_speed_gbps NUMERIC, p_fiber_type TEXT, + p_reach_meters INTEGER, p_wavelength_tx INTEGER, p_connector TEXT +) RETURNS INTEGER AS $$ +DECLARE + score INTEGER := 0; +BEGIN + IF p_form_factor IS NOT NULL AND p_form_factor != '' THEN score := score + 20; END IF; + IF p_speed_gbps IS NOT NULL AND p_speed_gbps > 0 THEN score := score + 20; END IF; + IF p_fiber_type IS NOT NULL AND p_fiber_type != '' THEN score := score + 20; END IF; + IF p_reach_meters IS NOT NULL AND p_reach_meters > 0 THEN score := score + 20; END IF; + IF p_wavelength_tx IS NOT NULL AND p_wavelength_tx > 0 THEN score := score + 10; END IF; + IF p_connector IS NOT NULL AND p_connector != '' THEN score := score + 10; END IF; + RETURN score; +END; +$$ LANGUAGE plpgsql IMMUTABLE; + +-- 4. Alle bestehenden Transceivers: Completeness initial berechnen +UPDATE transceivers SET + data_completeness = calc_data_completeness( + form_factor, speed_gbps, fiber_type, + reach_meters, wavelength_tx_nm, connector_type + ), + enrichment_needed = ( + form_factor IS NULL OR speed_gbps IS NULL OR + fiber_type IS NULL OR reach_meters IS NULL OR + wavelength_tx_nm IS NULL OR connector_type IS NULL + ); + +-- 5. Trigger: Completeness automatisch aktualisieren +CREATE OR REPLACE FUNCTION trg_update_completeness() +RETURNS TRIGGER AS $$ +BEGIN + NEW.data_completeness := calc_data_completeness( + NEW.form_factor, NEW.speed_gbps, NEW.fiber_type, + NEW.reach_meters, NEW.wavelength_tx_nm, NEW.connector_type + ); + NEW.enrichment_needed := ( + NEW.form_factor IS NULL OR NEW.speed_gbps IS NULL OR + NEW.fiber_type IS NULL OR NEW.reach_meters IS NULL OR + NEW.wavelength_tx_nm IS NULL OR NEW.connector_type IS NULL + ); + -- Fehlende Felder dokumentieren + NEW.enrichment_fields := ARRAY_REMOVE(ARRAY[ + CASE WHEN NEW.form_factor IS NULL THEN 'form_factor' END, + CASE WHEN NEW.speed_gbps IS NULL THEN 'speed_gbps' END, + CASE WHEN NEW.fiber_type IS NULL THEN 'fiber_type' END, + CASE WHEN NEW.reach_meters IS NULL THEN 'reach_meters' END, + CASE WHEN NEW.wavelength_tx_nm IS NULL THEN 'wavelength_tx_nm' END, + CASE WHEN NEW.connector_type IS NULL THEN 'connector_type' END + ], NULL); + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +DROP TRIGGER IF EXISTS trg_completeness ON transceivers; +CREATE TRIGGER trg_completeness + BEFORE INSERT OR UPDATE ON transceivers + FOR EACH ROW EXECUTE FUNCTION trg_update_completeness(); + +COMMENT ON COLUMN transceivers.wavelength_tx_nm IS 'TX wavelength in nm. For BiDi: TX side. For duplex: both TX=RX.'; +COMMENT ON COLUMN transceivers.wavelength_rx_nm IS 'RX wavelength in nm. Only set for BiDi. NULL = same as TX.'; +COMMENT ON COLUMN transceivers.connector_type IS 'Physical connector: LC, SC, MPO-12, MPO-16, RJ45, CS, SN'; +COMMENT ON COLUMN transceivers.data_completeness IS '0-100: percentage of mandatory fields filled (6 fields × weight)'; +COMMENT ON COLUMN transceivers.enrichment_needed IS 'TRUE = one or more mandatory fields missing, enrichment job needed'; +COMMENT ON COLUMN transceivers.enrichment_fields IS 'Array of field names that still need enrichment'; +``` + +--- + +## SCHRITT 2: IEEE/MSA Standards-Lookup-Tabelle (statisch) + +**Erstelle Datei:** `sql/106-ieee-msa-wavelength-lookup.sql` + +```sql +-- Migration 106: IEEE/MSA Standards Wavelength Lookup +-- Ground-Truth für Wellenlänge basierend auf Standard-Spezifikation +-- Quelle: IEEE 802.3, SFF-8472, SFF-8436, SFF-8661, SFF-8679, MSA specs + +CREATE TABLE IF NOT EXISTS ieee_wavelength_lookup ( + id SERIAL PRIMARY KEY, + form_factor TEXT NOT NULL, + speed_gbps NUMERIC NOT NULL, + fiber_type TEXT NOT NULL, -- 'SMF', 'MMF', 'DAC', 'AOC' + reach_min_m INTEGER NOT NULL, + reach_max_m INTEGER NOT NULL, + wavelength_tx_nm INTEGER NOT NULL, + wavelength_rx_nm INTEGER, -- NULL = gleich wie TX (kein BiDi) + connector_type TEXT NOT NULL, + ieee_standard TEXT, -- z.B. '802.3ae', 'SFF-8431' + notes TEXT, + UNIQUE (form_factor, speed_gbps, fiber_type, reach_min_m, reach_max_m) +); + +INSERT INTO ieee_wavelength_lookup + (form_factor, speed_gbps, fiber_type, reach_min_m, reach_max_m, wavelength_tx_nm, wavelength_rx_nm, connector_type, ieee_standard, notes) +VALUES +-- ── SFP (1G) ───────────────────────────────────────────────────────────────── + ('SFP', 1, 'MMF', 0, 550, 850, NULL, 'LC', '802.3z', '1000BASE-SX'), + ('SFP', 1, 'SMF', 0, 10000, 1310, NULL, 'LC', '802.3z', '1000BASE-LX'), + ('SFP', 1, 'SMF', 0, 40000, 1310, NULL, 'LC', '802.3z', '1000BASE-EX'), + ('SFP', 1, 'SMF', 0, 70000, 1550, NULL, 'LC', '802.3z', '1000BASE-ZX'), + ('SFP', 1, 'SMF', 0, 10000, 1270, 1330, 'LC', 'SFF-8472', '1000BASE-BX10-U BiDi'), + ('SFP', 1, 'SMF', 0, 10000, 1330, 1270, 'LC', 'SFF-8472', '1000BASE-BX10-D BiDi'), + ('SFP', 1, 'Copper', 0, 100, NULL, NULL, 'RJ45','802.3ab', '1000BASE-T'), +-- ── SFP+ (10G) ─────────────────────────────────────────────────────────────── + ('SFP+', 10, 'MMF', 0, 300, 850, NULL, 'LC', '802.3ae', '10GBASE-SR'), + ('SFP+', 10, 'SMF', 0, 10000, 1310, NULL, 'LC', '802.3ae', '10GBASE-LR'), + ('SFP+', 10, 'SMF', 0, 40000, 1310, NULL, 'LC', '802.3ae', '10GBASE-ER'), + ('SFP+', 10, 'SMF', 0, 80000, 1550, NULL, 'LC', '802.3ae', '10GBASE-ZR'), + ('SFP+', 10, 'SMF', 0, 10000, 1270, 1330, 'LC', 'SFF-8431', '10GBASE-BX10-U BiDi'), + ('SFP+', 10, 'SMF', 0, 10000, 1330, 1270, 'LC', 'SFF-8431', '10GBASE-BX10-D BiDi'), + ('SFP+', 10, 'SMF', 0, 20000, 1270, 1330, 'LC', 'SFF-8431', '10GBASE-BX20-U BiDi'), + ('SFP+', 10, 'SMF', 0, 20000, 1330, 1270, 'LC', 'SFF-8431', '10GBASE-BX20-D BiDi'), + ('SFP+', 10, 'DAC', 0, 7, NULL, NULL, 'SFP+','SFF-8431','10G DAC Twinax'), + ('SFP+', 10, 'AOC', 0, 100, 850, NULL, 'LC', 'SFF-8431', '10G AOC'), +-- ── SFP28 (25G) ────────────────────────────────────────────────────────────── + ('SFP28', 25, 'MMF', 0, 100, 850, NULL, 'LC', '802.3by', '25GBASE-SR'), + ('SFP28', 25, 'SMF', 0, 10000, 1310, NULL, 'LC', '802.3cc', '25GBASE-LR'), + ('SFP28', 25, 'SMF', 0, 2000, 1310, NULL, 'LC', '25GBASE-DR','25GBASE-DR'), + ('SFP28', 25, 'SMF', 0, 10000, 1270, 1330, 'LC', '802.3cc', '25GBASE-BX10-U BiDi'), + ('SFP28', 25, 'SMF', 0, 10000, 1330, 1270, 'LC', '802.3cc', '25GBASE-BX10-D BiDi'), + ('SFP28', 25, 'DAC', 0, 5, NULL, NULL, 'SFP28','802.3by','25G DAC'), +-- ── QSFP+ (40G) ────────────────────────────────────────────────────────────── + ('QSFP+', 40, 'MMF', 0, 150, 850, NULL, 'MPO-12','802.3ba','40GBASE-SR4'), + ('QSFP+', 40, 'SMF', 0, 10000, 1310, NULL, 'LC', '802.3ba', '40GBASE-LR4 CWDM4'), + ('QSFP+', 40, 'SMF', 0, 2000, 1310, NULL, 'MPO-12','802.3bm','40GBASE-PSM4'), + ('QSFP+', 40, 'DAC', 0, 7, NULL, NULL, 'QSFP+','802.3ba','40G DAC'), +-- ── QSFP28 (100G) ──────────────────────────────────────────────────────────── + ('QSFP28', 100, 'MMF', 0, 100, 850, NULL, 'MPO-12','802.3bm','100GBASE-SR4'), + ('QSFP28', 100, 'SMF', 0, 10000, 1310, NULL, 'LC', '802.3cd', '100GBASE-LR4 CWDM4'), + ('QSFP28', 100, 'SMF', 0, 500, 1310, NULL, 'MPO-12','802.3bj','100GBASE-DR (PSM4)'), + ('QSFP28', 100, 'SMF', 0, 40000, 1310, NULL, 'LC', '802.3ba', '100GBASE-ER4'), + ('QSFP28', 100, 'SMF', 0, 2000, 1310, NULL, 'LC', 'CWDM4-MSA','100G CWDM4 2km'), + ('QSFP28', 100, 'DAC', 0, 5, NULL, NULL, 'QSFP28','802.3bj','100G DAC'), + ('QSFP28', 100, 'AOC', 0, 100, 850, NULL, 'MPO-12','802.3bm','100G AOC SR4'), +-- ── QSFP56 (200G) ──────────────────────────────────────────────────────────── + ('QSFP56', 200, 'MMF', 0, 100, 850, NULL, 'MPO-16','802.3cd','200GBASE-SR4'), + ('QSFP56', 200, 'SMF', 0, 2000, 1310, NULL, 'LC', '802.3cd', '200GBASE-DR4'), + ('QSFP56', 200, 'SMF', 0, 10000, 1310, NULL, 'LC', '802.3cd', '200GBASE-FR4'), + ('QSFP56', 200, 'SMF', 0, 40000, 1310, NULL, 'LC', '802.3cd', '200GBASE-LR4'), +-- ── QSFP-DD (400G) ─────────────────────────────────────────────────────────── + ('QSFP-DD', 400, 'MMF', 0, 100, 850, NULL, 'MPO-16','802.3bs','400GBASE-SR8'), + ('QSFP-DD', 400, 'SMF', 0, 500, 1310, NULL, 'MPO-12','802.3bs','400GBASE-DR4'), + ('QSFP-DD', 400, 'SMF', 0, 2000, 1310, NULL, 'LC', '802.3bs', '400GBASE-FR4'), + ('QSFP-DD', 400, 'SMF', 0, 10000, 1310, NULL, 'LC', '802.3bs', '400GBASE-LR4'), + ('QSFP-DD', 400, 'SMF', 0, 10000, 1310, NULL, 'MPO-12','800G MSA','400GBASE-PSM4'), + ('QSFP-DD', 400, 'DAC', 0, 5, NULL, NULL, 'QSFP-DD','802.3bs','400G DAC'), +-- ── OSFP / QSFP-DD800 (800G) ───────────────────────────────────────────────── + ('OSFP', 800, 'MMF', 0, 100, 850, NULL, 'MPO-16','802.3df','800GBASE-SR8'), + ('OSFP', 800, 'SMF', 0, 500, 1310, NULL, 'MPO-12','802.3df','800GBASE-DR8'), + ('OSFP', 800, 'SMF', 0, 2000, 1310, NULL, 'LC', '802.3df', '800GBASE-FR4 2x400G'), + ('OSFP', 800, 'SMF', 0, 10000, 1310, NULL, 'LC', '802.3df', '800GBASE-LR4'), + ('QSFP-DD800', 800, 'MMF', 0, 100, 850, NULL, 'MPO-16','802.3df','800GBASE-SR8'), + ('QSFP-DD800', 800, 'SMF', 0, 500, 1310, NULL, 'MPO-12','802.3df','800GBASE-DR8') +ON CONFLICT DO NOTHING; + +CREATE INDEX IF NOT EXISTS idx_ieee_lookup ON ieee_wavelength_lookup + (form_factor, speed_gbps, fiber_type, reach_min_m, reach_max_m); +``` + +--- + +## SCHRITT 3: Enrichment-Robot (neues Modul) + +**Erstelle Datei:** `packages/scraper/src/robots/wavelength-enricher.ts` + +Dieser Robot läuft als pg-boss Job (`enrich:wavelength`) alle 4 Stunden. +Er füllt `wavelength_tx_nm`, `wavelength_rx_nm`, `connector_type` automatisch. + +```typescript +/** + * Wavelength Enricher Robot + * + * Füllt fehlende wavelength_tx_nm / wavelength_rx_nm / connector_type + * aus drei Quellen (Priorität absteigend): + * 1. IEEE/MSA Lookup-Tabelle (sql/106) — deterministisch, keine Kosten + * 2. Produktname-Regex (heuristisch, pattern-basiert) + * 3. Quarantäne: Produkt bleibt ohne Match bis Daten vorhanden + * + * Kein LLM, kein Scraper, keine externen Calls — rein datenbankbasiert. + */ +import { pool } from "../utils/db"; + +// ── Regex-Patterns für Wellenlänge aus Produktnamen ────────────────────────── + +const WAVELENGTH_PATTERNS: Array<{ + pattern: RegExp; + tx: number; + rx?: number; + notes: string; +}> = [ + // BiDi explizit + { pattern: /\b1270\s*\/\s*1330\b/i, tx: 1270, rx: 1330, notes: "BiDi 1270/1330" }, + { pattern: /\b1330\s*\/\s*1270\b/i, tx: 1330, rx: 1270, notes: "BiDi 1330/1270" }, + { pattern: /\b1310\s*\/\s*1550\b/i, tx: 1310, rx: 1550, notes: "BiDi 1310/1550" }, + { pattern: /\b1550\s*\/\s*1310\b/i, tx: 1550, rx: 1310, notes: "BiDi 1550/1310" }, + { pattern: /\b1295\s*\/\s*1310\b/i, tx: 1295, rx: 1310, notes: "BiDi CWDM" }, + // Direkte nm-Angabe + { pattern: /\b850\s*nm\b/i, tx: 850, notes: "850nm explicit" }, + { pattern: /\b1310\s*nm\b/i, tx: 1310, notes: "1310nm explicit" }, + { pattern: /\b1550\s*nm\b/i, tx: 1550, notes: "1550nm explicit" }, + { pattern: /\b1270\s*nm\b/i, tx: 1270, notes: "1270nm explicit" }, + { pattern: /\b1330\s*nm\b/i, tx: 1330, notes: "1330nm explicit" }, + // DWDM Channels (C-Band ~1530-1565nm) + { pattern: /\bDWDM\b.*\bC\d{2}\b/i, tx: 1550, notes: "DWDM C-Band" }, + { pattern: /\bDWDM\b/i, tx: 1550, notes: "DWDM generic" }, + // Standard-Kurzbezeichnungen → implizite Wellenlänge + { pattern: /\bSR4?\b/i, tx: 850, notes: "SR/SR4 = 850nm MMF" }, + { pattern: /\bLR4?\b/i, tx: 1310, notes: "LR/LR4 = 1310nm SMF" }, + { pattern: /\bER4?\b/i, tx: 1310, notes: "ER/ER4 = 1310nm SMF" }, + { pattern: /\bFR4?\b/i, tx: 1310, notes: "FR/FR4 = 1310nm SMF" }, + { pattern: /\bDR4?\b/i, tx: 1310, notes: "DR/DR4 = 1310nm SMF" }, + { pattern: /\bZR4?\b/i, tx: 1550, notes: "ZR/ZR4 = 1550nm SMF" }, +]; + +const CONNECTOR_PATTERNS: Array<{ pattern: RegExp; connector: string }> = [ + { pattern: /\bMPO.?16\b/i, connector: "MPO-16" }, + { pattern: /\bMPO.?12\b/i, connector: "MPO-12" }, + { pattern: /\bMPO\b/i, connector: "MPO-12" }, // default MPO = MPO-12 + { pattern: /\bMTP\b/i, connector: "MPO-12" }, + { pattern: /\bCS\s*connector\b/i, connector: "CS" }, + { pattern: /\bSN\s*connector\b/i, connector: "SN" }, + { pattern: /\bRJ.?45\b/i, connector: "RJ45" }, + { pattern: /\bbase.?t\b/i, connector: "RJ45" }, + { pattern: /\bSC\b/i, connector: "SC" }, + { pattern: /\bLC\b/i, connector: "LC" }, // LC zuletzt (häufig im Text) +]; + +// DAC/AOC haben keinen Fiber-Connector +const DAC_AOC_PATTERN = /\bDAC\b|\bAOC\b|\btwinax\b/i; + +function extractWavelengthFromName(name: string): { tx: number; rx?: number; notes: string } | null { + for (const p of WAVELENGTH_PATTERNS) { + if (p.pattern.test(name)) { + return { tx: p.tx, rx: p.rx, notes: p.notes }; + } + } + return null; +} + +function extractConnectorFromName(name: string): string | null { + if (DAC_AOC_PATTERN.test(name)) return "DAC/AOC"; + for (const p of CONNECTOR_PATTERNS) { + if (p.pattern.test(name)) return p.connector; + } + return null; +} + +export async function runWavelengthEnricher(): Promise { + console.log("=== Wavelength Enricher Robot ==="); + + // Alle Transceivers mit fehlenden Pflichtfeldern + const { rows: transceivers } = await pool.query<{ + id: string; + standard_name: string; + part_number: string; + form_factor: string; + speed_gbps: number; + fiber_type: string; + reach_meters: number; + wavelength_tx_nm: number | null; + wavelength_rx_nm: number | null; + connector_type: string | null; + }>(` + SELECT id, standard_name, part_number, form_factor, speed_gbps, + fiber_type, reach_meters, wavelength_tx_nm, wavelength_rx_nm, connector_type + FROM transceivers + WHERE enrichment_needed = TRUE + ORDER BY data_completeness DESC -- Produkte mit mehr Daten zuerst + LIMIT 5000 + `); + + let fromIeee = 0; + let fromRegex = 0; + let stillMissing = 0; + + for (const t of transceivers) { + let txNm = t.wavelength_tx_nm; + let rxNm = t.wavelength_rx_nm; + let connector = t.connector_type; + let source = ""; + + // ── Quelle 1: IEEE/MSA Lookup ─────────────────────────────────────────── + if (txNm === null && t.form_factor && t.speed_gbps && t.fiber_type && t.reach_meters) { + const { rows: ieee } = await pool.query<{ + wavelength_tx_nm: number; + wavelength_rx_nm: number | null; + connector_type: string; + }>(` + SELECT wavelength_tx_nm, wavelength_rx_nm, connector_type + FROM ieee_wavelength_lookup + WHERE form_factor = $1 + AND speed_gbps = $2 + AND fiber_type = $3 + AND reach_min_m <= $4 + AND reach_max_m >= $4 + LIMIT 1 + `, [t.form_factor, t.speed_gbps, t.fiber_type, t.reach_meters]); + + if (ieee.length > 0) { + txNm = ieee[0].wavelength_tx_nm; + rxNm = ieee[0].wavelength_rx_nm ?? null; + if (!connector) connector = ieee[0].connector_type; + source = "ieee_lookup"; + fromIeee++; + } + } + + // ── Quelle 2: Produktname-Regex ───────────────────────────────────────── + const nameForExtraction = [t.standard_name, t.part_number].filter(Boolean).join(" "); + + if (txNm === null && nameForExtraction) { + const extracted = extractWavelengthFromName(nameForExtraction); + if (extracted) { + txNm = extracted.tx; + rxNm = extracted.rx ?? null; + source = `regex:${extracted.notes}`; + fromRegex++; + } + } + + if (connector === null && nameForExtraction) { + const extractedConn = extractConnectorFromName(nameForExtraction); + if (extractedConn && extractedConn !== "DAC/AOC") connector = extractedConn; + } + + // ── Update wenn etwas gefunden ────────────────────────────────────────── + if (txNm !== null || connector !== null) { + await pool.query(` + UPDATE transceivers SET + wavelength_tx_nm = COALESCE($1, wavelength_tx_nm), + wavelength_rx_nm = COALESCE($2, wavelength_rx_nm), + connector_type = COALESCE($3, connector_type), + updated_at = NOW() + WHERE id = $4 + `, [txNm, rxNm, connector, t.id]); + } else { + stillMissing++; + } + } + + console.log(` IEEE Lookup: ${fromIeee} enriched`); + console.log(` Regex Extract: ${fromRegex} enriched`); + console.log(` Still missing: ${stillMissing} (Quarantäne bis Daten verfügbar)`); + console.log("=== Wavelength Enricher Complete ==="); +} +``` + +--- + +## SCHRITT 4: Neuer deterministischer Matcher + +**Ändere in:** `packages/scraper/src/scheduler.ts` + +Suche den Block `// fiber_type match` (ca. Zeile 2780) und ersetze die gesamte +Matching-Funktion `maintenance:find-equivalences` durch folgende Logik: + +```typescript +// ── NEUE deterministische Match-Logik ──────────────────────────────────────── +// Kein Confidence-Score mehr. Nur MATCH oder KEIN MATCH. +// Voraussetzung: ALLE 6 Pflichtfelder müssen bekannt sein. + +// Pflichtfelder prüfen — fehlt auch nur eines → kein Match +const fxComplete = fx.form_factor && fx.speed_gbps && fx.fiber_type && + fx.reach_meters && fx.wavelength_tx_nm && fx.connector_type; +const candComplete = cand.form_factor && cand.speed_gbps && cand.fiber_type && + cand.reach_meters && cand.wavelength_tx_nm && cand.connector_type; + +if (!fxComplete || !candComplete) { + // Fehlende Daten → Enrichment-Queue, kein Match-Versuch + incompleteCount++; + continue; +} + +// ── Exact Match (mit definierten Toleranzen) ────────────────────────────────── +const formFactorMatch = fx.form_factor.trim().toUpperCase() === cand.form_factor.trim().toUpperCase(); +if (!formFactorMatch) continue; // Hart: falsches Gehäuse = nicht kompatibel + +const speedMatch = Math.abs(Number(fx.speed_gbps) - Number(cand.speed_gbps)) < 0.1; +if (!speedMatch) continue; // Hart: 10G ≠ 25G + +const fiberMatch = fx.fiber_type.trim().toUpperCase() === cand.fiber_type.trim().toUpperCase(); +if (!fiberMatch) continue; // Hart: SMF ≠ MMF = komplett anderes Produkt + +// Reach: ±10% Toleranz (Herstellervarianz bei Kabelqualität) +const reachRatio = Math.abs(fx.reach_meters - cand.reach_meters) / Math.max(fx.reach_meters, 1); +if (reachRatio > 0.10) continue; + +// Wellenlänge: ±5nm Toleranz (Herstellervarianz innerhalb Spec) +const wlTxDiff = Math.abs((fx.wavelength_tx_nm ?? 0) - (cand.wavelength_tx_nm ?? 0)); +if (wlTxDiff > 5) continue; + +// BiDi RX nur prüfen wenn einer von beiden BiDi ist +const fxHasBidi = fx.wavelength_rx_nm != null; +const candHasBidi = cand.wavelength_rx_nm != null; +if (fxHasBidi || candHasBidi) { + const wlRxDiff = Math.abs((fx.wavelength_rx_nm ?? 0) - (cand.wavelength_rx_nm ?? 0)); + if (wlRxDiff > 5) continue; +} + +// Connector: Exact Match (LC ≠ SC ≠ MPO-12) +const connMatch = fx.connector_type.trim().toUpperCase() === cand.connector_type.trim().toUpperCase(); +if (!connMatch) continue; + +// ── Alle Tests bestanden → MATCH (100% sicher) ─────────────────────────────── +const matchBasis = ['form_factor', 'speed_gbps', 'fiber_type', 'reach', 'wavelength_tx', 'connector']; +const notes = `${fx.part_number} ↔ ${cand.part_number} (${cand.vendor_name}) | ` + + `basis: ${matchBasis.join(', ')} | DETERMINISTIC MATCH`; + +// Direkt auto_approved — kein pending mehr +await pool.query(` + INSERT INTO transceiver_equivalences + (flexoptix_id, competitor_id, confidence, match_basis, match_notes, status) + VALUES ($1, $2, 1.0, $3, $4, 'auto_approved') + ON CONFLICT (flexoptix_id, competitor_id) DO UPDATE SET + confidence = 1.0, + match_basis = EXCLUDED.match_basis, + match_notes = EXCLUDED.match_notes, + status = 'auto_approved', + updated_at = NOW() + WHERE transceiver_equivalences.status = 'pending' +`, [fx.id, cand.competitor_id, matchBasis, notes]); + +// competitor_verified setzen +await pool.query(` + UPDATE transceivers + SET competitor_verified = true, + competitor_verified_at = NOW(), + competitor_status = 'matched', + competitor_status_updated_at = NOW() + WHERE id = $1 AND competitor_verified = false +`, [fx.id]); + +matchedCount++; +``` + +--- + +## SCHRITT 5: Pending-Queue bereinigen + +**Erstelle Datei:** `sql/107-clear-pending-queue.sql` + +```sql +-- Migration 107: Pending Queue bereinigen +-- Alle 'pending' Einträge die KEINEN deterministischen Match haben → reject +-- Alle 'pending' Einträge die jetzt deterministische Matches wären → werden +-- durch den neuen Matcher bei nächstem Run neu erzeugt (als auto_approved) + +-- Schritt 1: Alle pending-Einträge rejcten (veraltete unsichere Matches) +UPDATE transceiver_equivalences +SET status = 'rejected', + reject_reason = 'Superseded by deterministic matcher — confidence-based match removed', + reviewed_at = NOW(), + reviewed_by = 'system:migration-107' +WHERE status = 'pending'; + +-- Schritt 2: Statistik loggen +DO $$ +DECLARE + pending_count INTEGER; + approved_count INTEGER; + rejected_count INTEGER; +BEGIN + SELECT COUNT(*) INTO pending_count FROM transceiver_equivalences WHERE status = 'pending'; + SELECT COUNT(*) INTO approved_count FROM transceiver_equivalences WHERE status IN ('approved', 'auto_approved'); + SELECT COUNT(*) INTO rejected_count FROM transceiver_equivalences WHERE status = 'rejected'; + RAISE NOTICE 'After migration 107: pending=%, approved=%, rejected=%', + pending_count, approved_count, rejected_count; +END; +$$; + +-- Schritt 3: Index für deterministischen Matcher optimieren +CREATE INDEX IF NOT EXISTS idx_eq_deterministic ON transceiver_equivalences + (flexoptix_id, competitor_id, status) + WHERE status = 'auto_approved'; +``` + +--- + +## SCHRITT 6: pg-boss Job für Enrichment-Robot registrieren + +**Ändere in:** `packages/scraper/src/scheduler.ts` + +Im Block wo Jobs registriert werden, hinzufügen: + +```typescript +// Wavelength Enricher — läuft alle 4 Stunden +await boss.schedule('enrich:wavelength', '0 */4 * * *', {}, {}); + +// Handler registrieren +boss.work('enrich:wavelength', async () => { + await runWavelengthEnricher(); +}); +``` + +--- + +## SCHRITT 7: API — Manual Review UI deaktivieren + +**Ändere in:** `packages/api/src/routes/review.ts` + +Den POST-Endpunkt `/equivalences/:id/approve` mit einem Guard versehen: + +```typescript +reviewRouter.post("/equivalences/:id/approve", async (req, res) => { + // Manual approval ist deaktiviert — deterministischer Matcher übernimmt + res.status(410).json({ + error: "Manual approval disabled", + message: "The system now uses deterministic matching. No manual review needed.", + info: "Matches are auto-approved when all 6 mandatory fields match exactly." + }); +}); +``` + +--- + +## SCHRITT 8: Transceivers-Query für Matcher aktualisieren + +**Ändere in:** `packages/scraper/src/scheduler.ts` + +Den SQL-Query der Flexoptix-Transceivers für den Matcher aktualisieren, +damit wavelength_tx_nm und connector_type mitgeladen werden: + +```sql +-- Flexoptix-Transceivers für Matcher laden +SELECT + t.id, t.part_number, t.standard_name, t.form_factor, + t.speed_gbps, t.fiber_type, t.reach_meters, t.wavelengths, + t.wavelength_tx_nm, -- NEU + t.wavelength_rx_nm, -- NEU + t.connector_type, -- NEU (war vorher 'connector') + t.data_completeness, -- NEU + t.enrichment_needed -- NEU +FROM transceivers t +JOIN vendors v ON v.id = t.vendor_id +WHERE v.name = 'Flexoptix' + AND t.enrichment_needed = FALSE -- NUR vollständige Datensätze matchen + AND t.data_completeness >= 80 +ORDER BY t.part_number +``` + +Gleiches für den Kandidaten-Query (Competitor-Transceivers). + +--- + +## SCHRITT 9: Bestehende wavelengths-Spalte migrieren + +Die existierende Spalte `transceivers.wavelengths` (TEXT, z.B. "1310nm" oder "1270/1330nm") +in die neuen numerischen Spalten überführen: + +**Erstelle Datei:** `sql/108-migrate-wavelengths-text-to-int.sql` + +```sql +-- Migration 108: wavelengths TEXT → wavelength_tx_nm / wavelength_rx_nm INTEGER + +UPDATE transceivers SET + wavelength_tx_nm = CASE + WHEN wavelengths ~ '^\s*850' THEN 850 + WHEN wavelengths ~ '^\s*1270' THEN 1270 + WHEN wavelengths ~ '^\s*1310' THEN 1310 + WHEN wavelengths ~ '^\s*1330' THEN 1330 + WHEN wavelengths ~ '^\s*1490' THEN 1490 + WHEN wavelengths ~ '^\s*1550' THEN 1550 + WHEN wavelengths ~ '^\s*1270\s*/\s*1330' THEN 1270 + WHEN wavelengths ~ '^\s*1330\s*/\s*1270' THEN 1330 + ELSE NULL + END, + wavelength_rx_nm = CASE + WHEN wavelengths ~ '1270\s*/\s*1330' THEN 1330 + WHEN wavelengths ~ '1330\s*/\s*1270' THEN 1270 + WHEN wavelengths ~ '1310\s*/\s*1550' THEN 1550 + WHEN wavelengths ~ '1550\s*/\s*1310' THEN 1310 + ELSE NULL + END +WHERE wavelengths IS NOT NULL + AND wavelength_tx_nm IS NULL; + +-- Connector aus alter connector-Spalte übernehmen (falls vorhanden) +UPDATE transceivers SET + connector_type = CASE connector + WHEN 'LC' THEN 'LC' + WHEN 'SC' THEN 'SC' + WHEN 'MPO' THEN 'MPO-12' + WHEN 'MPO-12' THEN 'MPO-12' + WHEN 'MPO-16' THEN 'MPO-16' + WHEN 'RJ45' THEN 'RJ45' + ELSE connector + END +WHERE connector IS NOT NULL AND connector_type IS NULL; + +-- Completeness neu berechnen nach Migration +UPDATE transceivers SET + data_completeness = calc_data_completeness( + form_factor, speed_gbps, fiber_type, + reach_meters, wavelength_tx_nm, connector_type + ), + enrichment_needed = ( + form_factor IS NULL OR speed_gbps IS NULL OR + fiber_type IS NULL OR reach_meters IS NULL OR + wavelength_tx_nm IS NULL OR connector_type IS NULL + ); +``` + +--- + +## Ausführungsreihenfolge für Codex + +``` +1. sql/105-wavelength-connector-completeness.sql → psql ausführen +2. sql/106-ieee-msa-wavelength-lookup.sql → psql ausführen +3. sql/108-migrate-wavelengths-text-to-int.sql → psql ausführen +4. packages/scraper/src/robots/wavelength-enricher.ts → neue Datei erstellen +5. packages/scraper/src/scheduler.ts → Matcher-Logik ersetzen (Schritt 4) +6. packages/scraper/src/scheduler.ts → Job registrieren (Schritt 6) +7. packages/api/src/routes/review.ts → Approve-Endpunkt deaktivieren (Schritt 7) +8. npm run build -w packages/scraper → Build +9. npm run build -w packages/api → Build +10. sql/107-clear-pending-queue.sql → psql ausführen (ZULETZT — bereinigt Queue) +11. pm2 restart tip-scraper-daemon → Daemon neu starten +12. Enricher manuell triggern: POST /api/review/run-matcher +``` + +--- + +## Erwartetes Ergebnis nach Deployment + +| Metrik | Vorher | Nachher | +|--------|--------|---------| +| Pending Queue | 13.374 | 0 | +| Confidence-Score | 0.0–1.0 (fuzzy) | entfällt | +| Match-Typ | probabilistisch | deterministisch | +| Manuelle Freigaben/Tag | ~50-200 | 0 | +| False-Positive-Rate | ~15% | ~0% | +| Transceivers mit Wellenlänge | ~30% | >85% (nach Enricher) | + +--- + +## Nicht ändern (außerhalb dieses Tasks) + +- `packages/scraper/src/scrapers/*` — Scraper-Logik bleibt unverändert +- `packages/api/src/routes/` (außer review.ts) — API-Endpunkte bleiben +- `sql/001-036` — Bestehende Migrationen nicht anfassen +- Alle `packages/core/src/` Typen — nur erweitern, nicht löschen +- PM2-Konfiguration — nicht anfassen diff --git a/packages/scraper/src/index-fs-only.ts b/packages/scraper/src/index-fs-only.ts new file mode 100644 index 0000000..22c4420 --- /dev/null +++ b/packages/scraper/src/index-fs-only.ts @@ -0,0 +1,77 @@ +/** + * TIP FS.COM + NADDOD Dedicated Scraper — index-fs-only.ts + * + * Runs on ERIK but routes traffic through Pi's SOCKS5 proxy so target sites + * see a residential IP instead of the IONOS datacenter IP range. + * + * Required env: PROXY_URLS=socks5://10.10.0.6:1080 + * + * Jobs consumed from pg-boss: + * scrape:pricing:fs — FS.com (Playwright, scheduled 02:00 + 14:00) + * scrape:pricing:naddod — NADDOD (fetch, scheduled 00:00/06:00/12:00/18:00) + */ +import { config } from "dotenv"; +import { join } from "path"; +config({ path: join(__dirname, "..", "..", "..", ".env") }); + +import PgBoss from "pg-boss"; +import { mkdirSync, rmSync } from "fs"; + +const connectionString = `postgres://${process.env.POSTGRES_USER}:${process.env.POSTGRES_PASSWORD}@${process.env.POSTGRES_HOST}:${process.env.POSTGRES_PORT || "5433"}/${process.env.POSTGRES_DB}`; + +async function withIsolatedStorage(name: string, fn: () => Promise): Promise { + const dir = `/tmp/tip-crawlee-${name}-${Date.now()}`; + mkdirSync(join(dir, "request_queues", "default"), { recursive: true }); + mkdirSync(join(dir, "datasets", "default"), { recursive: true }); + mkdirSync(join(dir, "key_value_stores", "default"), { recursive: true }); + const prev = process.env.CRAWLEE_STORAGE_DIR; + process.env.CRAWLEE_STORAGE_DIR = dir; + try { await fn(); } + finally { + process.env.CRAWLEE_STORAGE_DIR = prev ?? ""; + try { rmSync(dir, { recursive: true, force: true }); } catch {} + } +} + +async function main() { + const proxy = process.env.PROXY_URLS; + console.log(`\n=== TIP FS.COM/NADDOD Scraper (proxy: ${proxy ?? "NONE — datacenter IP!"}) ===\n`); + if (!proxy) { + console.warn("WARNING: PROXY_URLS not set — FS.com will see IONOS IP (likely blocked by Cloudflare)"); + } + + const boss = new PgBoss({ + connectionString, + retryLimit: 3, + retryDelay: 300, // 5 min retry on failure + expireInSeconds: 7200, // 2h timeout for full FS catalog run + monitorStateIntervalSeconds: 60, + }); + + boss.on("error", (e: Error) => console.error("pg-boss error:", e.message)); + await boss.start(); + + await boss.createQueue("scrape:pricing:fs").catch(() => {}); + await boss.createQueue("scrape:pricing:naddod").catch(() => {}); + + const { scrapeFs } = await import("./scrapers/fs-com"); + const { scrapeNaddod } = await import("./scrapers/naddod"); + + // FS.com: Playwright crawler — PROXY_URLS auto-picked up by buildProxyConfiguration() + await boss.work("scrape:pricing:fs", async () => { + console.log(`[${new Date().toISOString()}] FS.COM → via ${proxy ?? "direct (no proxy)"}`); + await withIsolatedStorage("fs", scrapeFs); + }); + + // NADDOD: fetch-based — residential IP reduces rate-limit risk on US CDN + await boss.work("scrape:pricing:naddod", async () => { + console.log(`[${new Date().toISOString()}] NADDOD → via ${proxy ?? "direct (no proxy)"}`); + await scrapeNaddod(); + }); + + console.log("FS.COM + NADDOD workers active — waiting for pg-boss jobs\n"); + process.on("SIGTERM", async () => { await boss.stop(); process.exit(0); }); + process.on("SIGINT", async () => { await boss.stop(); process.exit(0); }); +} + +main().catch((e) => { console.error("Fatal:", e); process.exit(1); }); diff --git a/packages/scraper/src/index-pi.ts b/packages/scraper/src/index-pi.ts index 0ffc3fc..932648d 100644 --- a/packages/scraper/src/index-pi.ts +++ b/packages/scraper/src/index-pi.ts @@ -1,12 +1,17 @@ /** * TIP Pi Scraper Node — index-pi.ts * - * Runs as a pg-boss worker on Raspberry Pi fleet (Pi1/Pi2/Pi3). - * Handles pricing, catalog, compatibility and intelligence scraping. - * Connects to production DB via WireGuard VPN. + * FETCH-ONLY worker for Raspberry Pi fleet (Pi1/Pi2/Pi3). + * NO Playwright — pure fetch/cheerio scrapers that run on ARM without Chromium. + * Connects to production DB via WireGuard VPN (10.10.0.1 = Erik). * - * Erik (IONOS VPS) runs ONLY: tip-api, tip-mcp, tip-postgres (Docker), tip-qdrant (Docker) - * All scraping work is distributed to the Pi fleet. + * Playwright-heavy scrapers (FS.COM, switch-assets, eBay enricher) stay on ERIK + * and route through the Pi's SOCKS5 proxy for IP diversity. + * + * Architecture: + * Pi node 24/7: fetch-only scrapers → pg-boss queues on Erik's PostgreSQL + * Pi SOCKS5: dante-server on WireGuard IP, Erik's FS.COM routes through it + * Erik only: Playwright scrapers, API, MCP, Qdrant, heavy compute */ import { config } from "dotenv"; import { join } from "path"; @@ -32,12 +37,16 @@ async function withIsolatedStorage(name: string, fn: () => Promise): Promi } } +// ── FETCH-ONLY scrapers — safe on ARM/Pi without Playwright ────────────────── +// FS.COM intentionally EXCLUDED: uses Playwright, runs on Erik via Pi SOCKS5 proxy +// ATGBICS intentionally EXCLUDED: uses Playwright detail pages +// eBay enricher intentionally EXCLUDED: Playwright +// switch-assets intentionally EXCLUDED: Playwright variants const QUEUES = [ - // Pricing scrapers - "scrape:pricing:fs", + // Pricing scrapers — all fetch/cheerio based + "scrape:pricing:naddod", // fetch + LD+JSON — residential IP beneficial "scrape:pricing:10gtek", "scrape:pricing:prolabs", - "scrape:pricing:atgbics", "scrape:pricing:optcore", "scrape:pricing:fluxlight", "scrape:pricing:gbics", @@ -65,9 +74,7 @@ const QUEUES = [ // Vendor scrapers "scrape:vendors:flexoptix", "scrape:vendors:flexoptix-supported", - // Compatibility scrapers - "scrape:compat:cisco", - "scrape:compat:juniper", + // Compatibility scrapers (static HCL pages, fetch-based) "scrape:compat:sonic", "scrape:compat:ufispace", "scrape:compat:edgecore", @@ -77,22 +84,12 @@ const QUEUES = [ "scrape:nog-talks", "scrape:community-issues", "scrape:datasheet-links", - // Switch assets - "scrape:assets:switches", - // eBay enrichment - "enrich:ebay-transceivers", - "enrich:ebay-switches", - // Prediction signals + // Prediction signals (API-based, no browser) "scrape:signals:sec-edgar", "scrape:signals:github", - "scrape:signals:ebay-velocity", "scrape:signals:ai-clusters", "scrape:signals:distributor-leads", "scrape:signals:standards", - // Compute - "compute:abc", - "compute:reorder-signals", - "compute:forecast", ]; async function main() { @@ -117,11 +114,10 @@ async function main() { const log = (q: string) => console.log(`[${new Date().toISOString()}] [${PI_NAME}] ${q}`); - // ── Pricing scrapers ────────────────────────────────────────────────── - const { scrapeFs } = await import("./scrapers/fs-com"); + // ── Pricing scrapers (fetch/cheerio — NO Playwright) ───────────────── + const { scrapeNaddod } = await import("./scrapers/naddod"); const { scrape10Gtek } = await import("./scrapers/tenGtek"); const { scrapeProLabs } = await import("./scrapers/prolabs"); - const { scrapeAtgbics } = await import("./scrapers/atgbics"); const { scrapeOptcore } = await import("./scrapers/optcore"); const { scrapeFluxlight } = await import("./scrapers/fluxlight"); const { scrapeGbics } = await import("./scrapers/gbics"); @@ -153,36 +149,27 @@ async function main() { const { scrapeFlexoptixVendors } = await import("./scrapers/flexoptix-vendors"); const { seedFlexoptixVendors } = await import("./scrapers/flexoptix-supported-vendors"); - // ── Compatibility scrapers ──────────────────────────────────────────── - const { scrapeCiscoTmg } = await import("./scrapers/cisco-tmg"); - const { scrapeJuniperHct } = await import("./scrapers/juniper-hct"); + // ── Compatibility scrapers (static HTML, fetch-based) ──────────────── const { scrapeSonicHcl } = await import("./scrapers/sonic-hcl"); const { scrapeUfiSpace } = await import("./scrapers/ufispace"); const { scrapeEdgecore } = await import("./scrapers/edgecore"); // ── Intelligence scrapers ───────────────────────────────────────────── const { scrapeNews } = await import("./scrapers/news"); - const { scrapeMarketIntelligence, computeAbcClassification, computeReorderSignals } = await import("./scrapers/market-intelligence"); + const { scrapeMarketIntelligence } = await import("./scrapers/market-intelligence"); const { scrapeAllSwitchIssues, findAndSeedDatasheetLinks } = await import("./scrapers/community-issues"); - const { scrapeSwitchAssets } = await import("./scrapers/switch-assets"); - // ── eBay enrichment ─────────────────────────────────────────────────── - const { enrichTransceiversFromEbay, enrichSwitchesFromEbay } = await import("./scrapers/ebay-enricher"); - - // ── Prediction signals ──────────────────────────────────────────────── + // ── Prediction signals (API-based, no browser) ──────────────────────── const { scrapeSecEdgar } = await import("./scrapers/sec-edgar"); const { scrapeGithubSignals } = await import("./scrapers/github-signals"); - const { scrapeEbayVelocity } = await import("./scrapers/ebay-velocity"); const { scrapeAiClusters } = await import("./scrapers/ai-clusters"); const { scrapeDistributorLeads } = await import("./scrapers/distributor-leads"); const { scrapeStandardsTracker } = await import("./scrapers/standards-tracker"); - const { runForecastEngine } = await import("./utils/forecast-engine"); // ── Register workers ────────────────────────────────────────────────── - await boss.work("scrape:pricing:fs", async () => { log("fs"); await withIsolatedStorage("fs", scrapeFs); }); + await boss.work("scrape:pricing:naddod", async () => { log("naddod"); await scrapeNaddod(); }); await boss.work("scrape:pricing:10gtek", async () => { log("10gtek"); await withIsolatedStorage("10gtek", scrape10Gtek); }); await boss.work("scrape:pricing:prolabs", async () => { log("prolabs"); await withIsolatedStorage("prolabs", scrapeProLabs); }); - await boss.work("scrape:pricing:atgbics", async () => { log("atgbics"); await withIsolatedStorage("atgbics", scrapeAtgbics); }); await boss.work("scrape:pricing:optcore", async () => { log("optcore"); await withIsolatedStorage("optcore", scrapeOptcore); }); await boss.work("scrape:pricing:fluxlight", async () => { log("fluxlight"); await withIsolatedStorage("fluxlight", scrapeFluxlight); }); await boss.work("scrape:pricing:gbics", async () => { log("gbics"); await withIsolatedStorage("gbics", scrapeGbics); }); @@ -198,45 +185,34 @@ async function main() { await boss.work("scrape:pricing:vcelink", async () => { log("vcelink"); await scrapeVcelink(); }); await boss.work("scrape:pricing:opticsbay", async () => { log("opticsbay"); await scrapeOpticsBay(); }); - await boss.work("scrape:pricing:comms-express", async () => { log("comms-express"); await scrapeCommsExpress(); }); - await boss.work("scrape:pricing:router-switch", async () => { log("router-switch"); await scrapeRouterSwitch(); }); - await boss.work("scrape:pricing:multimode-inc", async () => { log("multimode-inc"); await scrapeMultimodeInc(); }); + await boss.work("scrape:pricing:comms-express", async () => { log("comms-express"); await scrapeCommsExpress(); }); + await boss.work("scrape:pricing:router-switch", async () => { log("router-switch"); await scrapeRouterSwitch(); }); + await boss.work("scrape:pricing:multimode-inc", async () => { log("multimode-inc"); await scrapeMultimodeInc(); }); await boss.work("scrape:pricing:optictransceiver", async () => { log("optictransceiver"); await scrapeOpticTransceiver(); }); - await boss.work("scrape:pricing:wiitek", async () => { log("wiitek"); await scrapeWiitek(); }); + await boss.work("scrape:pricing:wiitek", async () => { log("wiitek"); await scrapeWiitek(); }); await boss.work("scrape:pricing:flexoptix", async () => { log("flexoptix-catalog"); await scrapeFlexoptixCatalog(); }); - await boss.work("scrape:catalog:smartoptics", async () => { log("smartoptics"); await withIsolatedStorage("smartoptics", scrapeSmartOptics); }); - await boss.work("scrape:catalog:hubersuhner", async () => { log("hubersuhner"); await withIsolatedStorage("hubersuhner", scrapeHuberSuhner); }); + await boss.work("scrape:catalog:smartoptics", async () => { log("smartoptics"); await withIsolatedStorage("smartoptics", scrapeSmartOptics); }); + await boss.work("scrape:catalog:hubersuhner", async () => { log("hubersuhner"); await withIsolatedStorage("hubersuhner", scrapeHuberSuhner); }); await boss.work("scrape:vendors:flexoptix", async () => { log("flexoptix-vendors"); await scrapeFlexoptixVendors(); }); await boss.work("scrape:vendors:flexoptix-supported", async () => { log("flexoptix-supported"); await seedFlexoptixVendors(); }); - await boss.work("scrape:compat:cisco", async () => { log("cisco-compat"); await withIsolatedStorage("cisco", scrapeCiscoTmg); }); - await boss.work("scrape:compat:juniper", async () => { log("juniper-compat"); await withIsolatedStorage("juniper", scrapeJuniperHct); }); await boss.work("scrape:compat:sonic", async () => { log("sonic-compat"); await withIsolatedStorage("sonic", scrapeSonicHcl); }); await boss.work("scrape:compat:ufispace", async () => { log("ufispace"); await withIsolatedStorage("ufispace", scrapeUfiSpace); }); await boss.work("scrape:compat:edgecore", async () => { log("edgecore"); await withIsolatedStorage("edgecore", scrapeEdgecore); }); - await boss.work("scrape:news", async () => { log("news"); await scrapeNews(); }); - await boss.work("scrape:market-intel", async () => { log("market-intel"); await withIsolatedStorage("market-intel", scrapeMarketIntelligence); }); - await boss.work("scrape:nog-talks", async () => { log("nog-talks"); const { scrapeNogTalks } = await import("./scrapers/nog-talks"); await scrapeNogTalks(); }); - await boss.work("scrape:community-issues", async () => { log("community"); await withIsolatedStorage("community", () => scrapeAllSwitchIssues(30)); }); - await boss.work("scrape:datasheet-links", async () => { log("datasheets"); await findAndSeedDatasheetLinks(50); }); - await boss.work("scrape:assets:switches", async () => { log("switch-assets"); await withIsolatedStorage("switch-assets", () => scrapeSwitchAssets()); }); + await boss.work("scrape:news", async () => { log("news"); await scrapeNews(); }); + await boss.work("scrape:market-intel", async () => { log("market-intel"); await withIsolatedStorage("market-intel", scrapeMarketIntelligence); }); + await boss.work("scrape:nog-talks", async () => { log("nog-talks"); const { scrapeNogTalks } = await import("./scrapers/nog-talks"); await scrapeNogTalks(); }); + await boss.work("scrape:community-issues", async () => { log("community"); await withIsolatedStorage("community", () => scrapeAllSwitchIssues(30)); }); + await boss.work("scrape:datasheet-links", async () => { log("datasheets"); await findAndSeedDatasheetLinks(50); }); - await boss.work("enrich:ebay-transceivers", async () => { log("ebay-transceivers"); await withIsolatedStorage("ebay-transceivers", () => enrichTransceiversFromEbay(100)); }); - await boss.work("enrich:ebay-switches", async () => { log("ebay-switches"); await withIsolatedStorage("ebay-switches", () => enrichSwitchesFromEbay(30)); }); - - await boss.work("scrape:signals:sec-edgar", async () => { log("sec-edgar"); await scrapeSecEdgar(); }); - await boss.work("scrape:signals:github", async () => { log("github-signals"); await scrapeGithubSignals(); }); - await boss.work("scrape:signals:ebay-velocity", async () => { log("ebay-velocity"); await scrapeEbayVelocity(); }); - await boss.work("scrape:signals:ai-clusters", async () => { log("ai-clusters"); await scrapeAiClusters(); }); + await boss.work("scrape:signals:sec-edgar", async () => { log("sec-edgar"); await scrapeSecEdgar(); }); + await boss.work("scrape:signals:github", async () => { log("github-signals"); await scrapeGithubSignals(); }); + await boss.work("scrape:signals:ai-clusters", async () => { log("ai-clusters"); await scrapeAiClusters(); }); await boss.work("scrape:signals:distributor-leads", async () => { log("distributor-leads"); await scrapeDistributorLeads(); }); - await boss.work("scrape:signals:standards", async () => { log("standards"); await scrapeStandardsTracker(); }); - - await boss.work("compute:abc", async () => { log("abc"); await computeAbcClassification(); }); - await boss.work("compute:reorder-signals", async () => { log("reorder"); await computeReorderSignals(); }); - await boss.work("compute:forecast", async () => { log("forecast"); await runForecastEngine(); }); + await boss.work("scrape:signals:standards", async () => { log("standards"); await scrapeStandardsTracker(); }); console.log(`${QUEUES.length} queues / workers active — running 24/7\n`); process.on("SIGTERM", async () => { await boss.stop(); process.exit(0); }); diff --git a/scripts/pi-scraper-setup.sh b/scripts/pi-scraper-setup.sh index a53e024..3930683 100644 --- a/scripts/pi-scraper-setup.sh +++ b/scripts/pi-scraper-setup.sh @@ -70,119 +70,13 @@ EOF echo ".env written" # ── 6. Pi-specific scheduler index ─────────────────────────────────────────── -# The Pi runs only fetch/cheerio scrapers — no Playwright -cat > "$INSTALL_DIR/packages/scraper/src/index-pi.ts" <<'PIEOF' -/** - * Pi Scraper Index — lightweight fetch/cheerio only - * No Playwright, no eBay enricher, no heavy compute - * Runs 24/7 on Raspberry Pi nodes - */ -import { config } from "dotenv"; -import { join } from "path"; -config({ path: join(__dirname, "..", "..", "..", ".env") }); - -import PgBoss from "pg-boss"; - -const connectionString = `postgres://${process.env.POSTGRES_USER}:${process.env.POSTGRES_PASSWORD}@${process.env.POSTGRES_HOST}:${process.env.POSTGRES_PORT || "5433"}/${process.env.POSTGRES_DB}`; - -const PI_QUEUES = [ - "scrape:pricing:fluxlight", - "scrape:pricing:gbics", - "scrape:pricing:optcore", - "scrape:pricing:champion-one", - "scrape:pricing:sfpcables", - "scrape:pricing:blueoptics", - "scrape:pricing:fiber24", - "scrape:pricing:tscom", - "scrape:pricing:skylane", - "scrape:pricing:ascentoptics", - "scrape:pricing:gaotek", - "scrape:catalog:smartoptics", - "scrape:catalog:hubersuhner", - "scrape:news", - "scrape:market-intel", -]; - -async function main() { - console.log(`\n=== TIP Pi Scraper (${process.env.PI_NAME || "pi"}) ===\n`); - - const boss = new PgBoss({ - connectionString, - retryLimit: 2, - retryDelay: 60, - expireInSeconds: 3600, - monitorStateIntervalSeconds: 60, - }); - - boss.on("error", (e) => console.error("pg-boss error:", e)); - await boss.start(); - - for (const q of PI_QUEUES) { - await boss.createQueue(q).catch(() => {}); - } - - // Register workers for all Pi-safe scrapers - const mods: Record = { - "scrape:pricing:fluxlight": "./scrapers/fluxlight", - "scrape:pricing:gbics": "./scrapers/gbics", - "scrape:pricing:optcore": "./scrapers/optcore", - "scrape:pricing:champion-one": "./scrapers/champion-one", - "scrape:pricing:sfpcables": "./scrapers/sfpcables", - "scrape:pricing:blueoptics": "./scrapers/blueoptics", - "scrape:pricing:fiber24": "./scrapers/fiber24", - "scrape:pricing:tscom": "./scrapers/tscom", - "scrape:pricing:skylane": "./scrapers/skylane", - "scrape:pricing:ascentoptics": "./scrapers/ascentoptics", - "scrape:pricing:gaotek": "./scrapers/gaotek", - "scrape:catalog:smartoptics": "./scrapers/smartoptics", - "scrape:catalog:hubersuhner": "./scrapers/hubersuhner", - "scrape:news": "./scrapers/news", - }; - - const fnNames: Record = { - "scrape:pricing:fluxlight": "scrapeFluxlight", - "scrape:pricing:gbics": "scrapeGbics", - "scrape:pricing:optcore": "scrapeOptcore", - "scrape:pricing:champion-one": "scrapeChampionOne", - "scrape:pricing:sfpcables": "scrapeSfpCables", - "scrape:pricing:blueoptics": "scrapeBlueOptics", - "scrape:pricing:fiber24": "scrapeFiber24", - "scrape:pricing:tscom": "scrapeTsCom", - "scrape:pricing:skylane": "scrapeSkylane", - "scrape:pricing:ascentoptics": "scrapeAscentOptics", - "scrape:pricing:gaotek": "scrapeGaoTek", - "scrape:catalog:smartoptics": "scrapeSmartOptics", - "scrape:catalog:hubersuhner": "scrapeHuberSuhner", - "scrape:news": "scrapeNews", - }; - - for (const [queue, modPath] of Object.entries(mods)) { - const mod = await import(modPath); - const fn = mod[fnNames[queue]]; - if (!fn) { console.warn(`No function ${fnNames[queue]} in ${modPath}`); continue; } - await boss.work(queue, async () => { - console.log(`[${new Date().toISOString()}] [${process.env.PI_NAME || "pi"}] Running: ${queue}`); - try { await fn(); } - catch (e) { console.error(`[${queue}] failed:`, String(e).slice(0, 200)); } - }); - } - - // Market intel worker - await boss.work("scrape:market-intel", async () => { - console.log(`[${new Date().toISOString()}] Running: Market intelligence`); - const { scrapeMarketIntelligence } = await import("./scrapers/market-intelligence"); - try { await scrapeMarketIntelligence(); } - catch (e) { console.error("market-intel failed:", String(e).slice(0, 200)); } - }); - - console.log(`Pi worker registered for ${PI_QUEUES.length} queues\nWaiting for jobs...\n`); - - process.on("SIGTERM", async () => { await boss.stop(); process.exit(0); }); - process.on("SIGINT", async () => { await boss.stop(); process.exit(0); }); -} - -main().catch((e) => { console.error("Fatal:", e); process.exit(1); }); -PIEOF +# index-pi.ts is in the repo — fetch-only, NO Playwright. +# No inline override needed. Verify it's there: +if [ ! -f "$INSTALL_DIR/packages/scraper/src/index-pi.ts" ]; then + echo "ERROR: index-pi.ts missing in repo — check git clone" + exit 1 +fi +echo "index-pi.ts: OK ($(wc -l < "$INSTALL_DIR/packages/scraper/src/index-pi.ts") lines)" # ── 7. WireGuard (connects to Erik 10.10.0.1 for DB access) ───────────────── WG_PRIVKEY="${WG_PRIVKEY:-}"