diff --git a/knowledge/fixes.json b/knowledge/fixes.json
new file mode 100644
index 0000000..c5a4610
--- /dev/null
+++ b/knowledge/fixes.json
@@ -0,0 +1,1014 @@
+[
+  {
+    "id": "fix-001",
+    "date": "2026-04-13",
+    "system": "proxmox",
+    "host": "192.168.178.10",
+    "component": "cloudflared",
+    "severity": "critical",
+    "title": "Cloudflare Tunnel 530 — QUIC-Timeouts, Tunnel nicht erreichbar",
+    "symptoms": [
+      "Browser zeigt: 530 The origin has been unregistered from Argo Tunnel",
+      "cloudflared logs: 'failed to dial to edge with quic: timeout: no recent network activity'",
+      "Tunnel verbindet sich kurz (Registered), bricht dann sofort wieder ab",
+      "Betrifft alle Subdomains des Tunnels",
+      "Alle 4 connIndex fallen nacheinander aus"
+    ],
+    "root_cause": "UDP Port 7844 (QUIC-Protokoll) wird vom NAT-Router oder einer Firewall gedroppt. cloudflared wählt standardmäßig QUIC, scheitert nach kurzer Idle-Zeit, reconnectet dauerhaft — 530-Fehler für Endnutzer.",
+    "fix": {
+      "steps": [
+        "1. SSH auf cloudflared-Host",
+        "2. /etc/systemd/system/cloudflared.service öffnen",
+        "3. In ExecStart: '--protocol http2' vor 'tunnel run' einfügen",
+        "4. systemctl daemon-reload && systemctl restart cloudflared",
+        "5. Verify: journalctl -u cloudflared -n 5 | grep 'Registered' → alle connIndex zeigen protocol=http2"
+      ]
+    },
+    "prevention": "Bei jeder neuen cloudflared-Installation in NAT/Home-Lab-Umgebung sofort --protocol http2 setzen."
+  },
+  {
+    "id": "fix-002",
+    "date": "2026-04-13",
+    "system": "proxmox",
+    "host": "192.168.178.10",
+    "component": "pvestatd",
+    "severity": "critical",
+    "title": "pvestatd D-State — Proxmox GUI Graphs leer, CT/VM-Status fehlt",
+    "symptoms": [
+      "Proxmox GUI: alle Graphs leer (CPU, Memory, Network, Disk IO)",
+      "CT/LXC/VM-Status-Icons fehlen oder zeigen falsche Farbe",
+      "API /nodes/pve/lxc/{id}/rrddata gibt nur 'time' zurück, keine Metriken",
+      "'systemctl restart pvestatd' schlägt fehl mit: 'can't acquire lock'",
+      "Prozesszustand 'Ds' (D=uninterruptible sleep)",
+      "SIGKILL hat keine Wirkung"
+    ],
+    "root_cause": "pvestatd-Prozess gerät in D-State beim Kernel-Aufruf free_pgtables. Tritt auf bei hoher I/O-Last (z.B. während Backup). SIGKILL kann D-State-Prozesse nicht beenden.",
+    "fix": {
+      "steps": [
+        "1. Stuck PID finden: pid=$(pgrep pvestatd | head -1)",
+        "2. Prozess aus Cgroup verschieben: echo $pid > /sys/fs/cgroup/cgroup.procs",
+        "3. Lock-Files entfernen: rm -f /var/run/pvestatd.pid.lock /var/run/pvestatd.pid",
+        "4. Service reset + start: systemctl reset-failed pvestatd && systemctl start pvestatd"
+      ],
+      "note": "Echter Fix nur via Reboot. Cgroup-Trick ermöglicht neuen pvestatd-Start ohne Reboot."
+    },
+    "prevention": "Reboot des Proxmox-Hosts nach schweren Backup-Windows empfohlen."
+  },
+  {
+    "id": "fix-003",
+    "date": "2026-04-13",
+    "system": "proxmox",
+    "host": "192.168.178.10 (VM 123)",
+    "component": "opnsense-disk",
+    "severity": "warning",
+    "title": "OPNsense Disk >75% — automatisches Log-Cleanup erforderlich",
+    "symptoms": [
+      "Disk-Nutzung nahe dem Limit",
+      "OPNsense Web UI zeigt Warnung über Speicherplatz"
+    ],
+    "root_cause": "OPNsense akkumuliert Suricata JSON-Logs, filterlog, dnsmasq-Logs, pkg-Cache und temporäre Dateien.",
+    "fix": {
+      "steps": [
+        "1. Alte Logs: find /var/log -name '*.log' -mtime +7 -delete",
+        "2. Suricata: find /var/log/suricata -name '*.json' -mtime +3 -delete",
+        "3. Temp: find /tmp /var/tmp -type f -mtime +1 -delete",
+        "4. Pkg-Cache: pkg clean -y"
+      ]
+    },
+    "prevention": "Automatisches Cleanup-Script als cron einrichten wenn Disk >75%."
+  },
+  {
+    "id": "fix-004",
+    "date": "2026-04-13",
+    "system": "proxmox",
+    "host": "192.168.178.10",
+    "component": "swap",
+    "severity": "warning",
+    "title": "Swap 77% voll trotz 72% freiem RAM — Swappiness zu hoch",
+    "symptoms": [
+      "SWAP usage 77% bei nur 28% RAM-Nutzung",
+      "Prozesse wurden bei früherer Last ausgelagert und nicht zurückgeholt"
+    ],
+    "root_cause": "Linux Standard-Swappiness von 60 lagert Pages aus auch wenn RAM verfügbar. Bei 62GB RAM zu aggressiv.",
+    "fix": {
+      "immediate": "swapoff -a && swapon -a (nur wenn RAM-Nutzung < 50%, damit die ausgelagerten Pages in den freien RAM passen)",
+      "permanent": "echo 'vm.swappiness=10' >> /etc/sysctl.conf && sysctl vm.swappiness=10"
+    },
+    "prevention": "vm.swappiness=10 auf allen Servern mit >16GB RAM setzen."
+  },
+  {
+    "id": "fix-005",
+    "date": "2026-04-13",
+    "system": "proxmox",
+    "host": "192.168.178.10",
+    "component": "backup-load",
+    "severity": "info",
+    "title": "Proxmox Backup verursacht extrem hohe Load — Kaskaden-Effekt",
+    "symptoms": [
+      "Load Average 60+ auf 22-CPU System",
+      "SSH extrem langsam",
+      "pvestatd und andere Services gehen in D-State"
+    ],
+    "root_cause": "vzdump ohne I/O-Limit liest/schreibt massiv auf Disk. Kaskadiert mit Tunnel-Reconnect und D-State-Prozessen.",
+    "fix": {
+      "permanent": [
+        "bwlimit: 50000 in /etc/vzdump.conf",
+        "ionice: 7 (niedrigste best-effort I/O-Priorität; 8 = idle)",
+        "Kompression: lzo statt gzip",
+        "Backup-Zeiten auf Nacht verschieben"
+      ]
+    },
+    "prevention": "vzdump.conf: ionice: 7 und bwlimit setzen. Backups in Maintenance-Fenster."
+  },
+  {
+    "id": "fix-006",
+    "date": "2026-04-13",
+    "system": "ctxevent",
+    "host": "82.165.222.127",
+    "component": "database-auth",
+    "severity": "critical",
+    "title": "Production Login komplett kaputt — falscher DB-User in DATABASE_URL",
+    "symptoms": [
+      "Admin und Participant login: Internal server error",
+      "Fehler: SASL: SCRAM-SERVER-FIRST-MESSAGE: client password must be a string"
+    ],
+    "root_cause": "DATABASE_URL enthielt falschen PostgreSQL-User (renefichtmueller statt ctxmeet). PrismaPg/SCRAM-SHA-256 erfordert zwingend Passwort-String.",
+    "fix": {
+      "steps": [
+        "1. PostgreSQL-Rollen prüfen: sudo -u postgres psql -c '\\du'",
+        "2. Korrekten User mit Passwort in DATABASE_URL: postgresql://ctxmeet:<pw>@localhost:5432/ctxmeet",
+        "3. PM2 restart mit --update-env Flag"
+      ]
+    },
+    "prevention": "DATABASE_URL IMMER mit Passwort setzen. Korrekte DB-User dokumentieren."
+  },
+  {
+    "id": "fix-007",
+    "date": "2026-04-13",
+    "system": "ctxevent",
+    "host": "82.165.222.127",
+    "component": "prisma-schema",
+    "severity": "critical",
+    "title": "Prisma Schema Drift — migrate status sagt 'up to date' aber Spalten fehlen physisch",
+    "symptoms": [
+      "'prisma migrate status' zeigt 'up to date'",
+      "Spalten fehlen: twoFactorFailCount, twoFactorLockedUntil, recoveryCodesHash",
+      "Fehler: P2022: The column does not exist"
+    ],
+    "root_cause": "Migrationen als 'applied' markiert, aber SQL nie korrekt ausgeführt. prisma migrate status lügt.",
+    "fix": {
+      "steps": [
+        "1. Echten DB-Stand prüfen: \\d \"TableName\" in psql (doppelte Anführungszeichen, sonst wird der Name kleingeschrieben)",
+        "2. Force-Push Schema: npx prisma db push",
+        "3. Clean rebuild: rm -rf .next && npx prisma generate && npm run build"
+      ]
+    },
+    "prevention": "IMMER echten DB-Stand mit \\d verifizieren, nicht nur prisma migrate status."
+  },
+  {
+    "id": "fix-008",
+    "date": "2026-04-13",
+    "system": "ctxevent",
+    "host": "82.165.222.127",
+    "component": "table-ownership",
+    "severity": "high",
+    "title": "PostgreSQL Table Ownership falsch — prisma db push scheitert",
+    "symptoms": [
+      "prisma db push: permission denied for table",
+      "Tabellen gehören 'postgres' statt App-User"
+    ],
+    "root_cause": "Tabellen von postgres Superuser erstellt. App-User hat keine ALTER-Rechte.",
+    "fix": {
+      "sql": "DO $$ DECLARE r RECORD; BEGIN FOR r IN SELECT tablename FROM pg_tables WHERE schemaname='public' AND tableowner != 'ctxmeet' LOOP EXECUTE format('ALTER TABLE %I OWNER TO ctxmeet', r.tablename); END LOOP; END $$;"
+    },
+    "prevention": "Alle DB-Operationen immer als App-User, nie als postgres."
+  },
+  {
+    "id": "fix-009",
+    "date": "2026-04-13",
+    "system": "ctxevent",
+    "host": "82.165.222.127",
+    "component": "pm2-env",
+    "severity": "high",
+    "title": "PM2 verwendet gecachte alte Umgebungsvariablen nach .env Änderung",
+    "symptoms": [
+      "Neuer .env Wert wird ignoriert",
+      "App-Verhalten passt nicht zu .env"
+    ],
+    "root_cause": "PM2 cached Umgebungsvariablen beim ersten Start. Ohne --update-env werden Änderungen ignoriert.",
+    "fix": {
+      "command": "pm2 restart <app-name> --update-env"
+    },
+    "prevention": "IMMER --update-env bei pm2 restart verwenden."
+  },
+  {
+    "id": "fix-010",
+    "date": "2026-04-13",
+    "system": "erik-server",
+    "host": "82.165.222.127",
+    "component": "sshd",
+    "severity": "critical",
+    "title": "Erik SSH komplett down — fehlerhafte sshd_config + fehlendes /run/sshd",
+    "symptoms": [
+      "SSH Port 22: Connection refused",
+      "sshd -t: 'no argument after keyword'"
+    ],
+    "root_cause": "Fehlerhafte Zeile in sshd_config (Copy-Paste Unfall) + fehlender /run/sshd Ordner.",
+    "fix": {
+      "steps": [
+        "1. IONOS VNC Console nutzen (direkte noVNC URL, nicht iframe)",
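+        "2. Fehlerhafte Zeile entfernen: sed -i '<N>d' /etc/ssh/sshd_config (<N> = Zeilennummer aus der 'sshd -t' Meldung; ein wörtliches 'Nd' löscht fast die gesamte Datei)",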
+        "3. mkdir -p /run/sshd",
+        "4. sshd -t (Config testen)",
+        "5. systemctl start ssh",
+        "6. Permanent: echo 'd /run/sshd 0755 root root -' > /etc/tmpfiles.d/sshd.conf"
+      ]
+    },
+    "prevention": "IMMER sshd -t vor restart. /run/sshd in tmpfiles.d. Min. 2 SSH-Pfade zu jedem Server."
+  },
+  {
+    "id": "fix-011",
+    "date": "2026-04-13",
+    "system": "network",
+    "host": "192.168.178.2",
+    "component": "cisco-stp",
+    "severity": "high",
+    "title": "Cisco SG350 STP Loop — 171ms Latenz durch FritzBox BPDU-Reflexion",
+    "symptoms": [
+      "GE2 cycling LOOPBACKDET alle 30s",
+      "MAC-Tabelle flusht",
+      "171ms interne Latenz"
+    ],
+    "root_cause": "FritzBox reflektiert BPDUs → Switch erkennt eigene BPDUs als Loop → Port deaktiviert → MAC flush.",
+    "fix": {
+      "action": "spanning-tree disable auf GE2 (FritzBox Uplink)"
+    },
+    "prevention": "Bei Consumer-Router-Uplinks STP IMMER deaktivieren."
+  },
+  {
+    "id": "fix-012",
+    "date": "2026-04-13",
+    "system": "network",
+    "host": "192.168.178.2",
+    "component": "vlan-starlink",
+    "severity": "high",
+    "title": "Starlink CGNAT flooding LAN ohne VLAN-Isolation",
+    "symptoms": [
+      "100.64.x.x Adressen im LAN",
+      "DHCP-Konflikte",
+      "Falsche Default Routes"
+    ],
+    "root_cause": "Starlink ohne VLAN sendet CGNAT DHCP ins Management-LAN.",
+    "fix": {
+      "steps": [
+        "1. VLAN anlegen: vlan 30 name Starlink-WAN (eine Zeile!)",
+        "2. GE12 als Access auf VLAN 30",
+        "3. STP auf GE12 deaktivieren"
+      ],
+      "note": "Cisco SG350 VLAN Syntax: 'vlan X name Y' in EINER Zeile"
+    },
+    "prevention": "Starlink IMMER in dediziertes VLAN."
+  },
+  {
+    "id": "fix-013",
+    "date": "2026-04-13",
+    "system": "network",
+    "host": "192.168.178.3",
+    "component": "aruba-reset",
+    "severity": "warning",
+    "title": "Aruba 1830 Factory Reset Falle — Switch nach Reset unerreichbar",
+    "symptoms": [
+      "Kein Ping, HTTP, SSH nach Factory Reset",
+      "L2-Forwarding funktioniert aber Management nicht"
+    ],
+    "root_cause": "Factory Reset setzt Aruba in Cloud-Provisioning-Modus. Lokales Management erst nach Cloud-Registrierung.",
+    "fix": {
+      "action": "Cloud Portal → portal.instant-on.hpe.com → Add Device → Serial",
+      "warning": "NIE Factory Reset zur Lösung von Login-Problemen"
+    },
+    "prevention": "Aruba nie per Factory Reset troubleshooten."
+  },
+  {
+    "id": "fix-014",
+    "date": "2026-04-13",
+    "system": "opnsense",
+    "host": "192.168.178.11",
+    "component": "config-restore",
+    "severity": "critical",
+    "title": "OPNsense config.xml nach blindem Revert kaputt",
+    "symptoms": [
+      "WAN auf falschem vtnet Interface",
+      "DHCP auf falschem Subnet",
+      "Kein Internet"
+    ],
+    "root_cause": "Blinder Revert auf alte Config hat falsche Interface-Zuordnung und falsches Subnet.",
+    "fix": {
+      "steps": [
+        "1. VM stoppen",
+        "2. LVM-Disk mounten (UFS2)",
+        "3. config.xml.bak (letzte gute Config) als config.xml verwenden",
+        "4. Interface-Zuordnungen verifizieren (vtnet0=LAN, vtnet1=WAN)",
+        "5. Unmount + VM starten"
+      ]
+    },
+    "prevention": "VOR jedem Revert die .bak Datei inspizieren. Interfaces verifizieren."
+  },
+  {
+    "id": "fix-015",
+    "date": "2026-04-13",
+    "system": "opnsense",
+    "host": "192.168.178.11",
+    "component": "boot-issues",
+    "severity": "high",
+    "title": "OPNsense Boot-Probleme — LAN Route, SSH, pf Blocking",
+    "symptoms": [
+      "LAN-Zugang fehlt nach Boot",
+      "SSH nicht erreichbar",
+      "Web UI geblockt"
+    ],
+    "root_cause": "Drei Boot-Probleme: LAN-Route fehlt, SSH noauto=1, pf blockiert.",
+    "fix": {
+      "steps": [
+        "1. configctl interface reconfigure lan",
+        "2. SSH: noauto=0 in config.xml",
+        "3. pf: pfctl -d && configctl filter reload"
+      ]
+    },
+    "prevention": "SSH autostart auf noauto=0. LAN-Route in Monitoring."
+  },
+  {
+    "id": "fix-016",
+    "date": "2026-04-13",
+    "system": "peercortex",
+    "host": "82.165.222.127",
+    "component": "cache-null",
+    "severity": "high",
+    "title": "Null-Cache Bug — fehlgeschlagene API-Responses 15min gecacht",
+    "symptoms": [
+      "ASN-Lookup zeigt 0 Neighbours/Prefixes",
+      "Werte bleiben 15min auf 0"
+    ],
+    "root_cause": "Fehlgeschlagene RIPE Stat Responses (null) wurden gecacht.",
+    "fix": {
+      "rule": "NIEMALS null-Responses cachen: if (result !== null) cache.set(key, result)"
+    },
+    "prevention": "Cache-Implementierung muss null/error explizit ausschließen."
+  },
+  {
+    "id": "fix-017",
+    "date": "2026-04-13",
+    "system": "peercortex",
+    "host": "82.165.222.127",
+    "component": "api-timeout",
+    "severity": "high",
+    "title": "RIPE Stat Timeout zu kurz für Tier-1 Carrier",
+    "symptoms": [
+      "Lookups für AS174 (Cogent) etc. geben leere Ergebnisse",
+      "Timeout nach 30s"
+    ],
+    "root_cause": "30s Timeout zu kurz für Tier-1 mit 5000+ Neighbours.",
+    "fix": {
+      "change": "Timeout von 30s auf 45s erhöhen"
+    },
+    "prevention": "API-Timeouts nach Worst-Case kalibrieren."
+  },
+  {
+    "id": "fix-018",
+    "date": "2026-04-13",
+    "system": "peercortex",
+    "host": "82.165.222.127",
+    "component": "api-ratelimit",
+    "severity": "high",
+    "title": "PeeringDB/RIPE Stat Rate-Limit Flood ohne Concurrency-Kontrolle",
+    "symptoms": [
+      "HTTP 429 Too Many Requests",
+      "Hunderte parallele Requests"
+    ],
+    "root_cause": "Kein Concurrency-Limit auf externe API-Calls.",
+    "fix": {
+      "steps": [
+        "Semaphore PeeringDB: max 5 concurrent",
+        "Semaphore RIPE Stat: max 15 concurrent",
+        "Retry mit Backoff bei 429"
+      ]
+    },
+    "prevention": "IMMER Semaphore auf externe API-Calls. Standard: 5-15."
+  },
+  {
+    "id": "fix-019",
+    "date": "2026-04-13",
+    "system": "peercortex",
+    "host": "82.165.222.127",
+    "component": "css-injection",
+    "severity": "warning",
+    "title": "CSS SyntaxError — Multiline CSS in JavaScript-String",
+    "symptoms": [
+      "SyntaxError im CSS-Parser",
+      "Styles nicht korrekt"
+    ],
+    "root_cause": "CSS als JS-String statt in <style> Block.",
+    "fix": {
+      "rule": "CSS IMMER in <style> Blöcke verschieben. Nie als JS-String inline."
+    },
+    "prevention": "CSS nur via <style> oder .css Dateien."
+  },
+  {
+    "id": "fix-020",
+    "date": "2026-04-13",
+    "system": "tip",
+    "host": "82.165.222.127",
+    "component": "postgresql-port",
+    "severity": "high",
+    "title": "TIP PostgreSQL auf Port 5433 — nicht Default 5432",
+    "symptoms": [
+      "Connection refused auf 5432"
+    ],
+    "root_cause": "Mehrere PostgreSQL-Instanzen auf Erik. TIP nutzt 5433.",
+    "fix": {
+      "rule": "DATABASE_URL MUSS Port 5433 enthalten"
+    },
+    "prevention": "Alle PostgreSQL-Instanzen mit explizitem Port."
+  },
+  {
+    "id": "fix-021",
+    "date": "2026-04-13",
+    "system": "tip",
+    "host": "82.165.222.127",
+    "component": "deploy-cwd",
+    "severity": "high",
+    "title": "TIP Deploy — IMMER cd /opt/tip vor Befehlen",
+    "symptoms": [
+      "git pull zieht falsches Repo",
+      "pm2 startet falschen Prozess"
+    ],
+    "root_cause": "SSH-Befehle landen im Home-Verzeichnis.",
+    "fix": {
+      "rule": "IMMER: ssh root@host 'cd /opt/tip && git pull && pm2 restart tip-api --update-env'"
+    },
+    "prevention": "Deploy-Skripte müssen cd als ersten Befehl haben."
+  },
+  {
+    "id": "fix-022",
+    "date": "2026-04-13",
+    "system": "exo-cluster",
+    "host": "192.168.178.213 + .67",
+    "component": "cluster-setup",
+    "severity": "high",
+    "title": "exo AI Cluster — Split-Brain, Thunderbolt, mDNS Probleme",
+    "symptoms": [
+      "Nodes finden sich nicht",
+      "Nur ein Node sichtbar"
+    ],
+    "root_cause": "Ohne --force-master Split-Brain. Thunderbolt Bridge stört mDNS. Stale event_log.",
+    "fix": {
+      "steps": [
+        "1. rm -rf ~/.exo/event_log/ (beide Nodes)",
+        "2. Thunderbolt Bridge deaktivieren",
+        "3. Mac Studio --force-master",
+        "4. Gleicher Namespace + Port",
+        "5. Master ZUERST starten, 10s warten"
+      ]
+    },
+    "prevention": "event_log vor Start löschen. Master mit --force-master."
+  },
+  {
+    "id": "fix-023",
+    "date": "2026-04-13",
+    "system": "eo-global-pulse",
+    "host": "82.165.222.127",
+    "component": "pwa-cache",
+    "severity": "high",
+    "title": "PWA cached aggressiv — User sehen alte Version",
+    "symptoms": [
+      "Nach Deploy: alte UI sichtbar",
+      "Browser-Refresh hilft nicht"
+    ],
+    "root_cause": "Service Worker cache-first Strategie serviert alte Assets.",
+    "fix": {
+      "steps": [
+        "1. Cache-Version in sw.js hochzählen",
+        "2. User muss /api/clear-cache besuchen"
+      ]
+    },
+    "prevention": "Cache-Version bei JEDEM Deploy bumpen."
+  },
+  {
+    "id": "fix-024",
+    "date": "2026-04-13",
+    "system": "eo-global-pulse",
+    "host": "alle",
+    "component": "fetch-timeout",
+    "severity": "warning",
+    "title": "safeFetch Pattern — AbortController + Timeout + no-store",
+    "symptoms": [
+      "API-Calls hängen endlos",
+      "Stale Cache-Daten",
+      "Memory Leaks"
+    ],
+    "root_cause": "Standard fetch() hat keinen Timeout und kann gecachte Responses liefern.",
+    "fix": {
+      "pattern": "AbortController + setTimeout + cache: 'no-store'. Abort in useEffect cleanup."
+    },
+    "prevention": "Alle fetch() über safeFetch-Wrapper."
+  },
+  {
+    "id": "fix-025",
+    "date": "2026-04-13",
+    "system": "general",
+    "host": "alle",
+    "component": "security-random",
+    "severity": "warning",
+    "title": "Math.random durch crypto.getRandomValues ersetzen",
+    "symptoms": [
+      "Security Audit flaggt Math.random",
+      "Vorhersagbare Zufallswerte"
+    ],
+    "root_cause": "Math.random() ist kryptographisch unsicher.",
+    "fix": {
+      "rule": "crypto.getRandomValues() statt Math.random() für IDs, Tokens, Nonces."
+    },
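+    "prevention": "ESLint: Math.random über die Core-Regel no-restricted-properties verbieten (eine Regel 'no-math-random' gibt es in ESLint nicht)."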
+  },
+  {
+    "id": "fix-026",
+    "date": "2026-04-13",
+    "system": "general",
+    "host": "alle",
+    "component": "prisma-adapter",
+    "severity": "warning",
+    "title": "Prisma PrismaPg Adapter — TypeScript 'as any' Cast nötig",
+    "symptoms": [
+      "TypeScript Fehler mit PrismaPg Adapter"
+    ],
+    "root_cause": "PrismaPg Adapter-Typ passt nicht exakt zu PrismaClient TypeDef.",
+    "fix": {
+      "pattern": "new PrismaClient({ adapter } as any)"
+    },
+    "prevention": "Standard-Pattern für alle Prisma + PrismaPg Projekte."
+  },
+  {
+    "id": "fix-027",
+    "date": "2026-04-13",
+    "system": "general",
+    "host": "alle",
+    "component": "prisma-null",
+    "severity": "warning",
+    "title": "Prisma OR-Query mit NULL — falsches Pattern",
+    "symptoms": [
+      "{ OR: [{ f: false }, { f: null }] } liefert falsche Ergebnisse"
+    ],
+    "root_cause": "Prisma behandelt OR mit null unerwartet.",
+    "fix": {
+      "wrong": "{ OR: [{ f: false }, { f: null }] }",
+      "correct": "{ field: { not: true } }"
+    },
+    "prevention": "{ not: true } statt OR mit null."
+  },
+  {
+    "id": "fix-028",
+    "date": "2026-04-13",
+    "system": "general",
+    "host": "alle",
+    "component": "api-async",
+    "severity": "high",
+    "title": "API Routes — NIEMALS synchrone Shell-Befehle, IMMER async mit Timeout",
+    "symptoms": [
+      "Server hängt bei langem Shell-Befehl",
+      "Alle Requests blockiert"
+    ],
+    "root_cause": "Synchrone Aufrufe blockieren den gesamten Node.js Event Loop.",
+    "fix": {
+      "rule": "IMMER async child_process mit Timeout verwenden. Nie synchrone Varianten in API-Routen."
+    },
+    "prevention": "ESLint-Regel gegen synchrone Shell-Calls in API-Routen."
+  },
+  {
+    "id": "fix-029",
+    "date": "2026-04-13",
+    "system": "general",
+    "host": "alle",
+    "component": "cloudflare-cache",
+    "severity": "warning",
+    "title": "Cloudflare Cache-Control für API Routes — no-store Pflicht",
+    "symptoms": [
+      "API-Responses von Cloudflare gecacht",
+      "Stale Daten trotz Server-Updates"
+    ],
+    "root_cause": "Cloudflare cached ohne explizites no-store auch dynamische Responses.",
+    "fix": {
+      "rule": "Cache-Control: no-store für alle /api/ Routen in Middleware setzen."
+    },
+    "prevention": "Standard-Middleware in allen Projekten."
+  },
+  {
+    "id": "fix-030",
+    "date": "2026-04-13",
+    "system": "ctxevent",
+    "host": "82.165.222.127",
+    "component": "self-healing",
+    "severity": "info",
+    "title": "Self-Healing System — automatische Fehlerbehebung für bekannte Fehlerklassen",
+    "symptoms": [
+      "P2022 Column Missing",
+      "Permission Denied",
+      "PostgreSQL Connection Refused",
+      "Prisma Client Error"
+    ],
+    "root_cause": "Bekannte Fehlerklassen die automatisiert behebbar sind.",
+    "fix": {
+      "handlers": {
+        "P2022_COLUMN_MISSING": "prisma db push",
+        "PERMISSION_DENIED": "chown -R www-data:www-data",
+        "CONNECTION_REFUSED": "systemctl restart postgresql",
+        "PRISMA_CLIENT_ERROR": "delete globalThis.prisma"
+      },
+      "watchdog": "GET /api/sysadmin/security/watchdog alle 5min. 3x fail → auto-heal + Telegram."
+    },
+    "prevention": "Watchdog in allen Produktions-Apps einbauen."
+  },
+  {
+    "id": "fix-031",
+    "date": "2026-04-13",
+    "system": "stalwart",
+    "host": "82.165.222.127",
+    "component": "mailcow-migration",
+    "severity": "info",
+    "title": "Mailcow → Stalwart Migration — 3GB auf 150MB RAM",
+    "symptoms": [
+      "Mailcow verbraucht 3GB+ RAM",
+      "19 Docker Container",
+      "Komplexes Debugging"
+    ],
+    "root_cause": "Mailcow für Einzelbenutzer massiv overengineered.",
+    "fix": {
+      "result": "Single Rust binary. RAM: 3GB → 150MB. 19 Container → 1 Binary. Startup: 30s → 2s."
+    },
+    "prevention": "Für kleine Teams: Stalwart statt Mailcow."
+  },
+  {
+    "id": "fix-032",
+    "date": "2026-04-13",
+    "system": "medium-crosspost",
+    "host": "localhost",
+    "component": "cloudflare-challenge",
+    "severity": "warning",
+    "title": "Medium Import — Cloudflare blockiert Datacenter IPs",
+    "symptoms": [
+      "Playwright auf VPS wird geblockt",
+      "Cloudflare Challenge auf import-story"
+    ],
+    "root_cause": "Medium Cloudflare blockiert Datacenter IPs. Residential IPs funktionieren mit richtigem Fingerprint.",
+    "fix": {
+      "steps": [
+        "Nur auf Mac (residential IP) ausführen",
+        "Echtes Chrome: channel: 'chrome', headless: false",
+        "--disable-blink-features=AutomationControlled",
+        "Medium nutzt div.js-importUrl (contenteditable), NICHT input"
+      ]
+    },
+    "prevention": "Browser-Automation gegen Cloudflare nur mit residential IP."
+  },
+  {
+    "id": "fix-033",
+    "date": "2026-04-13",
+    "system": "llm-gateway",
+    "host": "82.165.222.127",
+    "component": "free-llm-fallback",
+    "severity": "info",
+    "title": "LLM Gateway Free API Fallback Chain",
+    "symptoms": [
+      "Ollama nicht erreichbar",
+      "Kein LLM-Fallback"
+    ],
+    "root_cause": "Single Point of Failure bei nur einem LLM-Provider.",
+    "fix": {
+      "chain": "Cerebras → Groq → Mistral AI → NVIDIA NIM → Cloudflare Workers AI"
+    },
+    "prevention": "Min. 2 LLM-Provider in Fallback-Chain."
+  },
+  {
+    "id": "fix-034",
+    "date": "2026-04-13",
+    "system": "shieldx",
+    "host": "192.168.178.213",
+    "component": "cipher-bypass",
+    "severity": "high",
+    "title": "Encoded Payloads umgehen Text-basierte Detection",
+    "symptoms": [
+      "ROT13/Base64/Hex Injections werden nicht erkannt",
+      "Nur Plain-Text Injections geblockt"
+    ],
+    "root_cause": "Angreifer encodieren Payloads mit einfachen Chiffren.",
+    "fix": {
+      "solution": "CipherDecoder als Layer 0: ROT13, Base64, Hex, Atbash, Caesar, Vigenère, A1Z26, EmojiSmuggling, UpsideDown",
+      "rule": "Decode-Layer IMMER VOR Detection-Layern"
+    },
+    "prevention": "Encode-Detection als erste Verteidigungslinie."
+  },
+  {
+    "id": "fix-035",
+    "date": "2026-04-13",
+    "system": "shieldx",
+    "host": "192.168.178.213",
+    "component": "resource-exhaustion",
+    "severity": "high",
+    "title": "Resource Exhaustion — Token Bombs und Context Stuffing",
+    "symptoms": [
+      "LLM-Kosten explodieren",
+      "Extrem langsame Responses",
+      "Endlos-Schleifen"
+    ],
+    "root_cause": "Token Bombs, Context Stuffing, Loop Patterns treiben Kosten.",
+    "fix": {
+      "solution": "ResourceExhaustionDetector als L0.5: Token-Counting, Context-Ratio, Loop-Detection, Batch-Amplification",
+      "rule": "EARLY im Pipeline — vor teuren Downstream-Checks"
+    },
+    "prevention": "Token-Limits als erste Verteidigung. Cost-Tracking pro Session."
+  },
+  {
+    "id": "fix-036",
+    "date": "2026-04-13",
+    "system": "llm-gateway",
+    "host": "82.165.222.127",
+    "component": "injection-trie",
+    "severity": "info",
+    "title": "KeywordTrie statt Regex für Injection Detection — O(n) statt O(n*m)",
+    "symptoms": [
+      "Regex-Detection langsam bei langen Prompts",
+      "CPU-Spikes"
+    ],
+    "root_cause": "Regex skaliert O(n*m). Bei 500+ Patterns und langen Prompts zu langsam.",
+    "fix": {
+      "pattern": "KeywordTrie: Alle Keywords in Trie. Scan in O(n). 8 Attack-Kategorien."
+    },
+    "prevention": "Trie statt Regex für Keyword-Listen."
+  },
+  {
+    "id": "fix-037",
+    "date": "2026-04-13",
+    "system": "general",
+    "host": "alle",
+    "component": "mcp-stdout",
+    "severity": "high",
+    "title": "MCP Server — NIEMALS auf stdout loggen",
+    "symptoms": [
+      "MCP Server antwortet nicht",
+      "JSON-RPC Parsing fehlschlägt"
+    ],
+    "root_cause": "console.log auf stdout korrumpiert JSON-RPC Stream.",
+    "fix": {
+      "rule": "ALLE Logs über stderr (console.error). stdout NUR für JSON-RPC."
+    },
+    "prevention": "Logging Framework auf stderr konfigurieren."
+  },
+  {
+    "id": "fix-038",
+    "date": "2026-04-13",
+    "system": "general",
+    "host": "alle",
+    "component": "security-scan",
+    "severity": "critical",
+    "title": "Triple Security Scan vor JEDEM GitHub Push",
+    "symptoms": [
+      "Secrets in Public Repo",
+      "Interne IPs im Code",
+      "Credentials committed"
+    ],
+    "root_cause": "Versehentliches Pushen sensibler Daten. Mehrfach passiert.",
+    "fix": {
+      "scans": [
+        "Secrets: api_key, password, secret, eyJ, sk-",
+        "Private Data: 192.168., @gmail, /Users/",
+        "Config: DATABASE_URL, SENDGRID, VAPID"
+      ],
+      "scope": "git diff origin/main..HEAD",
+      "enforcement": "pre-push Git Hook"
+    },
+    "prevention": "pre-push Hook in JEDEM GitHub-Repo BEVOR erster Push."
+  },
+  {
+    "id": "fix-039",
+    "date": "2026-04-13",
+    "system": "blog",
+    "host": "82.165.222.127",
+    "component": "blog-security",
+    "severity": "critical",
+    "title": "Blog Publish Security — ZERO interne Daten publizieren",
+    "symptoms": [
+      "Interne IPs im Blog sichtbar",
+      "Server-Namen im Text"
+    ],
+    "root_cause": "Blog-Posts aus internem Kontext generiert, Infra-Details lecken.",
+    "fix": {
+      "scan": "Vor publish prüfen: keine IPs, Server-Namen, Pfade, Keys, Ports"
+    },
+    "prevention": "Automatisierter Scan im Publish-Workflow."
+  },
+  {
+    "id": "fix-040",
+    "date": "2026-04-13",
+    "system": "switchblade",
+    "host": "192.168.178.2",
+    "component": "cisco-ssh",
+    "severity": "warning",
+    "title": "Cisco SG350 SSH — nur interactive/expect Mode",
+    "symptoms": [
+      "'Packet integrity error' bei normalem SSH"
+    ],
+    "root_cause": "SG350 unterstützt nur interaktives SSH mit Legacy-Algorithmen.",
+    "fix": {
+      "steps": [
+        "Legacy SSH: ssh-rsa, diffie-hellman-group14-sha1, aes256-ctr, hmac-sha1",
+        "expect-Wrapper Skript für Automation"
+      ]
+    },
+    "prevention": "expect-Wrapper für ältere Cisco-Switches."
+  },
+  {
+    "id": "fix-041",
+    "date": "2026-04-13",
+    "system": "claude-bridge",
+    "host": "82.165.222.127",
+    "component": "subscription-proxy",
+    "severity": "info",
+    "title": "Claude-Bridge — Subscription statt per-Token API für Blog-Generierung",
+    "symptoms": [
+      "API-Kosten zu hoch für automatische Generierung"
+    ],
+    "root_cause": "Per-Token API teuer bei regelmäßiger Nutzung.",
+    "fix": {
+      "architecture": "Node.js Server auf Port 3250, ruft claude CLI als subprocess. Flat Fee statt per-Token."
+    },
+    "prevention": "Subscription-Modelle prüfen für wiederkehrende LLM-Tasks."
+  },
+  {
+    "id": "fix-042",
+    "date": "2026-04-13",
+    "system": "general",
+    "host": "alle",
+    "component": "prompt-caching",
+    "severity": "info",
+    "title": "Prompt Caching — 90% Kostenreduktion für System-Prompts",
+    "symptoms": [
+      "Hohe API-Kosten durch wiederholte System-Prompts"
+    ],
+    "root_cause": "Identische System-Prompts werden als neue Tokens berechnet.",
+    "fix": {
+      "pattern": "cache_control: { type: 'ephemeral' }. Cache Hit = 10% des Preises.",
+      "split": "System-Prompt in cacheable Prefix (stabil) + dynamic Suffix (pro Session)"
+    },
+    "prevention": "System-Prompts immer für Caching optimieren."
+  },
+  {
+    "id": "fix-043",
+    "date": "2026-04-13",
+    "system": "general",
+    "host": "alle",
+    "component": "agentic-patterns",
+    "severity": "info",
+    "title": "Anti-Rationalization + Verification Patterns für LLM-Agents",
+    "symptoms": [
+      "Agent rationalisiert Fehler weg",
+      "Agent sagt 'sieht korrekt aus' ohne Test"
+    ],
+    "root_cause": "LLMs neigen zu Confirmation Bias.",
+    "fix": {
+      "patterns": [
+        "Anti-Rationalization: Failure Modes namentlich benennen + Gegenverhalten erzwingen",
+        "Verification: Binary Verdict, adversarial Probes VOR jedem PASS",
+        "Purpose Statement: Worker-Prompts mit Purpose versehen",
+        "Continue vs. Spawn: Research→Continue, Ansatz falsch→Spawn fresh"
+      ]
+    },
+    "prevention": "In allen Agent-Prompts: Anti-Rationalisierung + Verification."
+  },
+  {
+    "id": "fix-044",
+    "date": "2026-04-13",
+    "system": "general",
+    "host": "alle",
+    "component": "ollama-models",
+    "severity": "info",
+    "title": "Ollama Model-Auswahl — richtige Größe für Use Case",
+    "symptoms": [
+      "Falsches Model gewählt",
+      "Tool-Calling geht nicht"
+    ],
+    "root_cause": "Nicht jedes Modell unterstützt Tool-Calling.",
+    "fix": {
+      "models": {
+        "fast": "qwen2.5:3b",
+        "medium": "qwen2.5:14b (beste Tool-Calling)",
+        "large": "qwen2.5:32b",
+        "reasoning": "llama3.3:70b"
+      },
+      "rule": "qwen2.5 + mistral = beste lokale Tool-Calling"
+    },
+    "prevention": "PII Masking auch bei lokalen Modellen."
+  },
+  {
+    "id": "fix-045",
+    "date": "2026-04-13",
+    "system": "general",
+    "host": "alle",
+    "component": "version-deploy",
+    "severity": "warning",
+    "title": "Versionsnummer PFLICHT bei jedem Deploy",
+    "symptoms": [
+      "Unklar welche Version auf Production",
+      "Changelog nicht aktuell"
+    ],
+    "root_cause": "Deploy ohne Version-Update = untrackbare Zustände.",
+    "fix": {
+      "rule": "JEDER Deploy: Version bump + Changelog + git tag vX.Y.Z"
+    },
+    "prevention": "Automatisiert in Deploy-Skripten."
+  },
+  {
+    "id": "fix-046",
+    "date": "2026-04-13",
+    "system": "proxmox",
+    "host": "192.168.178.10 → CT 100 (192.168.178.22)",
+    "component": "pihole-FTL",
+    "severity": "critical",
+    "title": "pihole-FTL v6.5 DNS — Proxmox-Host bekommt connection refused trotz laufendem Dienst",
+    "symptoms": [
+      "dig @192.168.178.22 cloudflare.com → communications error: connection refused",
+      "DNS-Auflösung vom Host (192.168.178.10) schlägt fehl",
+      "Innerhalb des CT (pct exec 100) funktioniert nslookup",
+      "ss -ulnp inside CT zeigt: UNCONN 0.0.0.0:53 (pihole-FTL läuft)",
+      "Nach Reboot: UDP sendto() port 53 gibt EINVAL zurück (transient)",
+      "cloudflared startet nicht weil DNS nicht auflöst"
+    ],
+    "root_cause": "pihole-FTL v6 Standard: listeningMode=LOCAL akzeptiert nur Anfragen aus dem lokalen CT-Subnetz. Proxmox-Host ist zwar im selben /24, aber nicht im CT-internen Subnetz. Zusätzlich: direkt nach Reboot kann der Kernel-Netzwerk-Stack UDP port 53 mit EINVAL ablehnen (transient, verschwindet nach vollständiger Initialisierung).",
+    "fix": {
+      "steps": [
+        "1. Im CT 100: nano /etc/pihole/pihole.toml",
+        "2. Zeile ändern: listeningMode = \"LOCAL\" → listeningMode = \"ALL\"",
+        "3. systemctl restart pihole-FTL (oder: pihole-FTL --restart)",
+        "4. Verify: dig @192.168.178.22 cloudflare.com +short → IPs zurück",
+        "5. Falls EINVAL nach Reboot: 60s warten, dann dig erneut testen"
+      ],
+      "also_verify": "grep interface /etc/pihole/pihole.toml → sollte interface=\"eth0\" oder interface=\"\" sein"
+    },
+    "prevention": "Bei pihole v6 Erstinstallation: listeningMode=ALL sofort setzen wenn pihole als LAN-DNS-Server dient. In pihole.toml dokumentieren."
+  },
+  {
+    "id": "fix-047",
+    "date": "2026-04-13",
+    "system": "proxmox",
+    "host": "192.168.178.10",
+    "component": "cloudflared-boot",
+    "severity": "high",
+    "title": "cloudflared startet nach Reboot nicht weil DNS noch nicht bereit ist",
+    "symptoms": [
+      "Nach Proxmox-Reboot: cloudflared.service startet aber Tunnel bleibt disconnected",
+      "cloudflared logs: failed to lookup edge address... no such host",
+      "dig @1.1.1.1 oder @192.168.178.22 gibt connection refused zurück",
+      "LXC-Container mit pihole braucht ~30-60s bis DNS erreichbar ist"
+    ],
+    "root_cause": "cloudflared.service startet systemd-seitig zu früh, bevor der Netzwerk-Stack vollständig initialisiert ist und pihole (CT 100) DNS-Anfragen annimmt.",
+    "fix": {
+      "steps": [
+        "1. /etc/systemd/system/cloudflared.service bearbeiten",
+        "2. Unter [Service] hinzufügen: ExecStartPre=/bin/sh -c \"until nslookup cloudflare.com 1.1.1.1 >/dev/null 2>&1; do sleep 2; done\"",
+        "3. systemctl daemon-reload && systemctl restart cloudflared",
+        "4. Verify nach Reboot: systemctl cat cloudflared | grep ExecStartPre → DNS-wait ist gesetzt; journalctl -u cloudflared -b | grep 'Registered' → Tunnel verbunden"
+      ]
+    },
+    "prevention": "Bei allen Services die DNS benötigen: ExecStartPre DNS-wait hinzufügen."
+  },
+  {
+    "id": "fix-048",
+    "date": "2026-04-13",
+    "system": "proxmox",
+    "host": "192.168.178.10",
+    "component": "vzdump",
+    "severity": "critical",
+    "title": "vzdump Backup → Load 232 → Host komplett unresponsive → Emergency Reboot",
+    "symptoms": [
+      "Load Average steigt auf 232 bei 22-CPU System (10x Anzahl CPUs)",
+      "SSH nicht mehr erreichbar (timeout)",
+      "Proxmox Web GUI timeout",
+      "pvestatd geht erneut in D-State",
+      "Backup von CT 119 läuft ohne I/O-Limit"
+    ],
+    "root_cause": "vzdump ohne I/O-Limit liest/schreibt mit voller Disk-Bandbreite. Führt zu Kernel-D-State-Kaskade bei pvestatd und anderen Prozessen. Einziger Recovery-Weg: physischer Reboot.",
+    "fix": {
+      "permanent": [
+        "nano /etc/vzdump.conf → einfügen:",
+        "bwlimit: 30000  # 30 MB/s max",
+        "ionice: 8       # idle I/O priority (7 ist nur der Default = niedrigste Best-Effort-Stufe)",
+        "compress: lzo   # schneller als gzip"
+      ],
+      "post_crash": [
+        "1. Physischer Reboot (KVM/IPMI oder vor Ort)",
+        "2. Nach Reboot: systemctl status pvestatd cloudflared pihole-FTL",
+        "3. Falls pvestatd D-State: cgroup-move + reset-failed + start (fix-002)",
+        "4. Falls DNS broken: pihole listeningMode=ALL prüfen (fix-046)",
+        "5. Falls cloudflared nicht startet: DNS-wait ExecStartPre (fix-047)"
+      ]
+    },
+    "prevention": "/etc/vzdump.conf mit bwlimit und ionice ist PFLICHT auf jedem Proxmox-Host. Ohne diese Settings IMMER Gefahr eines Crashes."
+  }
+]
\ No newline at end of file
diff --git a/packages/ctx-health/src/checks/index.ts b/packages/ctx-health/src/checks/index.ts
index 5e48e63..d0b2737 100644
--- a/packages/ctx-health/src/checks/index.ts
+++ b/packages/ctx-health/src/checks/index.ts
@@ -35,6 +35,7 @@ const ALLOWED_COMMANDS = new Set([
   '/usr/sbin/systemctl',
   '/usr/bin/sync',
   '/bin/sync',
+  '/usr/bin/ssh',
 ]);
 
 async function safeExec(
@@ -69,7 +70,13 @@ async function findPm2(): Promise<string | null> {
 
 // ─── 1. PM2 processes ────────────────────────────────────────────────────────
 
-const PM2_REQUIRED_PROCESSES = ['llm-gateway', 'llm-learning'];
+const PM2_REQUIRED_PROCESSES = [ // PM2 process names that healPm2 restarts whenever `pm2 jlist` does not report them as 'online'
+  'llm-gateway', 'llm-learning',
+  'magatama', 'magatama-dashboard',
+  'tip-api', 'tip-scraper-daemon',
+  'peercortex', 'eo-global-pulse',
+  'ghost-blog', 'nognet',
+];
 
 async function checkPm2(): Promise<CheckResult> {
   const start = Date.now();
@@ -114,10 +121,40 @@ async function healPm2(diagnosis: string): Promise<HealResult> {
   const pm2 = await findPm2();
   if (!pm2) return { action_taken: 'pm2 not found — cannot restart', success: false };
 
-  const { stdout, stderr } = await safeExec(pm2, ['restart', 'all']);
+  // Only restart processes that are actually offline — avoid pm2 restart all
+  const { stdout: jlist } = await safeExec(pm2, ['jlist']);
+  let parsed: unknown = null;
+  try { parsed = JSON.parse(jlist); } catch { /* not JSON — rejected by the array check below */ }
+  if (!Array.isArray(parsed)) { // unknown state must not be read as "everything is offline" (that would restart every service)
+    return { action_taken: 'pm2 jlist output not parseable — no restart attempted', success: false };
+  }
+  const processes = parsed as Array<{ name: string; pm2_env?: { status?: string } }>;
+  const offline = PM2_REQUIRED_PROCESSES.filter((name) => {
+    const proc = processes.find((p) => p.name === name);
+    return !proc || proc.pm2_env?.status !== 'online';
+  });
+  if (offline.length === 0) return { action_taken: 'no offline processes found — skipping restart', success: true };
+  const results: string[] = [];
+  let allRestarted = true; // flips to false as soon as one `pm2 restart` fails
+  for (const name of offline) {
+    const { stderr, success } = await safeExec(pm2, ['restart', name]);
+    if (!success) allRestarted = false;
+    results.push(`${name}: ${success ? 'restarted' : stderr.slice(0, 80)}`);
+  }
+  const output = results.join('; ');
+  logger.info({ diagnosis: diagnosis.slice(0, 120), output, offline }, 'PM2 targeted restart executed');
+  return { action_taken: `pm2 restart ${offline.join(', ')}`, success: allRestarted, output };
+}
+
+// ─── pm2-aware targeted restart (shared helper) ───────────────────────────────
+/** Restarts one PM2 process by name and passes safeExec's success flag and combined stdout/stderr back to the caller. */
+async function restartProcess(name: string): Promise<HealResult> {
+  const pm2 = await findPm2();
+  if (!pm2) return { action_taken: 'pm2 not found', success: false };
+  const { stdout, stderr, success } = await safeExec(pm2, ['restart', name]);
   const output = `${stdout}\n${stderr}`.trim();
-  logger.info({ diagnosis: diagnosis.slice(0, 120), output: output.slice(0, 200) }, 'PM2 restart executed');
-  return { action_taken: 'pm2 restart all', success: true, output };
+  logger.info({ name, success, output: output.slice(0, 200) }, 'PM2 targeted process restart');
+  return { action_taken: `pm2 restart ${name}`, success, output };
 }
 
 // ─── 2. PostgreSQL ────────────────────────────────────────────────────────────
@@ -401,6 +438,237 @@ async function healWireGuard(_diagnosis: string): Promise<HealResult> {
   return { action_taken: 'systemctl restart wg-quick@wg0', success, output };
 }
 
+// ─── 9. Service port health checks ───────────────────────────────────────────
+// For each critical service, performs a real HTTP check on the local port.
+// This catches crash-loops where PM2 shows "online" but the port is not responding.
+
+interface ServiceDef { // one HTTP-probed service: where probePort looks and which PM2 process is restarted when the probe fails
+  name: string;       // PM2 process name to restart on failure
+  port: number;       // TCP port probed on localhost
+  path: string;       // health endpoint path (appended to http://localhost:<port>)
+  okStatus?: number[];// accepted HTTP status codes; when omitted or empty, any status below 500 counts as healthy
+}
+
+const SERVICES: ServiceDef[] = [ // probed by checkServicePorts; healServicePorts restarts the PM2 process of every entry that fails
+  { name: 'magatama',           port: 3210, path: '/' },
+  { name: 'magatama-dashboard', port: 3211, path: '/' },
+  { name: 'magatama-admin',     port: 3212, path: '/' },
+  { name: 'tip-api',            port: 3201, path: '/api/health' },
+  { name: 'peercortex',         port: 3101, path: '/' },
+  { name: 'llm-gateway',        port: 3103, path: '/health' },
+  { name: 'eo-global-pulse',    port: 3000, path: '/' },
+  { name: 'nognet',             port: 3001, path: '/' },
+  { name: 'ghost-blog',         port: 2368, path: '/' },
+  { name: 'switchblade',        port: 3334, path: '/' },
+];
+
+/** Probes one service with an HTTP request to localhost (8 s timeout); never throws — failures come back as `ok: false`. */
+async function probePort(service: ServiceDef): Promise<{ ok: boolean; status?: number; error?: string }> {
+  const controller = new AbortController();
+  const timer = setTimeout(() => controller.abort(), 8_000);
+  try {
+    const res = await fetch(`http://localhost:${service.port}${service.path}`, {
+      signal: controller.signal,
+      cache: 'no-store',
+      redirect: 'manual', // a 3xx answer already proves the service is up, so it is not followed
+    } as RequestInit);
+    await res.body?.cancel().catch(() => undefined); // only the status is needed — release the unread body/connection now
+    const acceptedCodes = service.okStatus ?? [];
+    const ok = acceptedCodes.length > 0 ? acceptedCodes.includes(res.status) : res.status < 500; // "<500" already includes 3xx
+    return { ok, status: res.status };
+  } catch (err) {
+    return { ok: false, error: err instanceof Error ? err.message : String(err) };
+  } finally {
+    clearTimeout(timer);
+  }
+}
+
+/** Probes every entry of SERVICES concurrently and reports the ones whose HTTP check fails. */
+async function checkServicePorts(): Promise<CheckResult> {
+  const startedAt = Date.now();
+  const probed = await Promise.all(
+    SERVICES.map(async (svc) => ({ svc, probe: await probePort(svc) })),
+  );
+  const latency_ms = Date.now() - startedAt; // wall-clock time of the whole parallel batch
+
+  const down = probed.filter(({ probe }) => !probe.ok);
+  if (down.length === 0) {
+    return { healthy: true, message: `All ${SERVICES.length} service ports are responding`, latency_ms };
+  }
+
+  // Per failing service: the HTTP status if one was received, otherwise the error text
+  const details: Record<string, string | number> = {};
+  for (const { svc, probe } of down) {
+    details[svc.name] = probe.status ?? probe.error ?? 'no response';
+  }
+
+  const names = down.map(({ svc }) => svc.name).join(', ');
+  return { healthy: false, message: `${down.length} service(s) not responding: ${names}`, details, latency_ms };
+}
+
+/** Re-probes all services and PM2-restarts only those still failing; reports success only if every restart succeeded. */
+async function healServicePorts(diagnosis: string): Promise<HealResult> {
+  // Re-probe to find which services are currently failing
+  const results = await Promise.all(SERVICES.map(async (svc) => ({ svc, probe: await probePort(svc) })));
+  const failing = results.filter((r) => !r.probe.ok);
+  if (failing.length === 0) {
+    return { action_taken: 'services recovered on re-check — no restart needed', success: true };
+  }
+  const healResults: string[] = [];
+  let allRestarted = true;
+  for (const { svc } of failing) {
+    const result = await restartProcess(svc.name);
+    if (!result.success) allRestarted = false;
+    healResults.push(`${svc.name}: ${result.success ? 'restarted' : (result.output?.slice(0, 80) || 'failed')}`);
+    logger.info({ service: svc.name, diagnosis: diagnosis.slice(0, 80) }, 'Service port failure — PM2 restart triggered');
+  }
+  return {
+    action_taken: `restarted: ${failing.map((r) => r.svc.name).join(', ')}`,
+    success: allRestarted, // was hard-coded to true, which hid failed restarts from the caller
+    output: healResults.join('; '),
+  };
+}
+
+// ─── SSH helper ──────────────────────────────────────────────────────────────
+
+const SSH_OPTS = ['-o', 'BatchMode=yes', '-o', 'ConnectTimeout=10', '-o', 'StrictHostKeyChecking=accept-new']; // non-interactive, 10 s connect timeout, unknown host keys accepted on first contact
+/** Runs `command` on `host` via /usr/bin/ssh (through safeExec) using SSH_OPTS and returns safeExec's result as-is. */
+async function sshExec(host: string, command: string): Promise<{ stdout: string; stderr: string; success: boolean }> {
+  return safeExec('/usr/bin/ssh', [...SSH_OPTS, host, command]); // NOTE(review): ssh hands `command` to the remote shell — callers must not splice unvalidated data into it
+}
+
+// ─── 10. Proxmox pvestatd health ─────────────────────────────────────────────
+
+const PROXMOX_HOST = process.env['PROXMOX_HOST'] ?? 'root@192.168.178.10'; // ssh destination for the pvestatd check/heal; the PROXMOX_HOST env var overrides the default
+
+/** Reads the process state of pvestatd on PROXMOX_HOST over ssh; unhealthy when unreachable, not running, or in state D. */
+async function checkProxmoxPvestatd(): Promise<CheckResult> {
+  // Remote one-liner: prints the "State:" line of /proc/<pid>/status for the first pvestatd PID, else NOT_RUNNING
+  const remoteCmd =
+    'pid=$(pgrep pvestatd 2>/dev/null | head -1); [ -n "$pid" ] && cat /proc/$pid/status 2>/dev/null | grep "^State:" || echo "NOT_RUNNING"';
+  const startedAt = Date.now();
+  const probe = await sshExec(PROXMOX_HOST, remoteCmd);
+  const latency_ms = Date.now() - startedAt;
+
+  if (!probe.success) {
+    return { healthy: false, message: 'Cannot SSH to Proxmox to check pvestatd', latency_ms };
+  }
+
+  const out = probe.stdout.trim();
+
+  if (out === 'NOT_RUNNING') {
+    return { healthy: false, message: 'pvestatd is not running on Proxmox', details: { state: 'not_running' }, latency_ms };
+  }
+
+  // First non-space character after "State:" is the one-letter state code; '?' when the line cannot be matched
+  const state = /State:\s+(\S)/.exec(out)?.[1] ?? '?';
+  if (state !== 'D') {
+    return { healthy: true, message: `pvestatd is running (state: ${state})`, details: { state }, latency_ms };
+  }
+
+  return {
+    healthy: false,
+    message: 'pvestatd is in D-state (kernel deadlock) — Proxmox GUI graphs will be empty',
+    details: { state: 'D', raw: out.slice(0, 200) },
+    latency_ms,
+  };
+}
+
+/** Heals pvestatd on PROXMOX_HOST: starts it when absent, otherwise detaches the stuck PID from its cgroup and restarts the unit. */
+async function healProxmoxPvestatd(diagnosis: string): Promise<HealResult> {
+  // Step 1: Get PID
+  const { stdout: pidOut } = await sshExec(PROXMOX_HOST, 'pgrep pvestatd 2>/dev/null | head -1');
+  const pid = pidOut.trim();
+  if (!pid) {
+    // Not running at all — just start it
+    const { success, stdout, stderr } = await sshExec(PROXMOX_HOST, 'systemctl start pvestatd 2>&1');
+    return { action_taken: 'systemctl start pvestatd', success, output: `${stdout}\n${stderr}`.trim() };
+  }
+  // pid is remote output that gets spliced into a remote shell command below — accept nothing but digits
+  if (!/^\d+$/.test(pid)) return { action_taken: 'unexpected pgrep output — pvestatd heal aborted', success: false, output: pid.slice(0, 200) };
+
+  // Step 2: Move stuck process to root cgroup so systemd can reclaim the unit
+  await sshExec(PROXMOX_HOST, `echo ${pid} > /sys/fs/cgroup/cgroup.procs 2>/dev/null || true`);
+  // Step 3: Remove stale lock files
+  await sshExec(PROXMOX_HOST, 'rm -f /var/run/pvestatd.pid.lock /var/run/pvestatd.pid 2>/dev/null || true');
+  // Step 4: Reset failed state and start; the exit status of the final `is-active` decides `success`
+  const { stdout, stderr, success } = await sshExec(
+    PROXMOX_HOST,
+    'systemctl reset-failed pvestatd 2>/dev/null; systemctl start pvestatd 2>&1; sleep 3; systemctl is-active pvestatd',
+  );
+  const output = `${stdout}\n${stderr}`.trim();
+  logger.info({ pid, diagnosis: diagnosis.slice(0, 120), output }, 'pvestatd D-state heal executed');
+  return { action_taken: `moved pid ${pid} to root cgroup, reset-failed, started pvestatd`, success, output };
+}
+
+// ─── 11. OPNsense disk space ─────────────────────────────────────────────────
+
+const OPNSENSE_HOST = process.env['OPNSENSE_HOST'] ?? 'root@192.168.178.11'; // ssh destination for the disk check/heal; the OPNSENSE_HOST env var overrides the default
+const OPNSENSE_DISK_THRESHOLD = 75; // percent used on "/" at or above which checkOpnsenseDisk reports unhealthy
+
+/** Checks root-filesystem usage on OPNSENSE_HOST over ssh; unhealthy at or above OPNSENSE_DISK_THRESHOLD percent. */
+async function checkOpnsenseDisk(): Promise<CheckResult> {
+  // Prints the 5th column of the second `df -h /` output line with the '%' sign stripped
+  const remoteCmd = "df -h / | awk 'NR==2{print $5}' | tr -d '%'";
+  const startedAt = Date.now();
+  const res = await sshExec(OPNSENSE_HOST, remoteCmd);
+  const latency_ms = Date.now() - startedAt;
+
+  if (!res.success) {
+    return { healthy: false, message: 'Cannot SSH to OPNsense to check disk', latency_ms };
+  }
+
+  const raw = res.stdout.trim();
+  const usedPct = parseInt(raw, 10);
+  if (isNaN(usedPct)) {
+    return { healthy: false, message: `Cannot parse OPNsense disk usage: "${raw}"`, latency_ms };
+  }
+
+  if (usedPct < OPNSENSE_DISK_THRESHOLD) {
+    return {
+      healthy: true,
+      message: `OPNsense disk usage: ${usedPct}%`,
+      details: { usedPercent: usedPct },
+      latency_ms,
+    };
+  }
+
+  return {
+    healthy: false,
+    message: `OPNsense disk usage ${usedPct}% ≥ threshold ${OPNSENSE_DISK_THRESHOLD}%`,
+    details: { usedPercent: usedPct, threshold: OPNSENSE_DISK_THRESHOLD },
+    latency_ms,
+  };
+}
+
+/** Frees disk space on OPNSENSE_HOST (old logs, tmp files, pkg cache) in one ssh call and reports the usage afterwards. */
+async function healOpnsenseDisk(diagnosis: string): Promise<HealResult> {
+  const steps = [
+    // Remove logs older than 7 days
+    'find /var/log -name "*.log" -mtime +7 -delete 2>/dev/null || true',
+    // Suricata JSON logs older than 3 days
+    'find /var/log/suricata -name "*.json" -mtime +3 -delete 2>/dev/null || true',
+    'find /var/log/suricata -name "*.json.gz" -mtime +1 -delete 2>/dev/null || true',
+    // Tmp files older than 1 day
+    'find /tmp /var/tmp -mtime +1 -delete 2>/dev/null || true',
+    // pkg cache
+    'pkg clean -y 2>/dev/null || true',
+    // Report new usage
+    "df -h / | awk 'NR==2{print $5}'",
+  ];
+  const { stdout, stderr, success } = await sshExec(OPNSENSE_HOST, steps.join('; '));
+  const output = `${stdout}\n${stderr}`.trim();
+  // Last stdout line is the df result; `||` (not `??`) because split() yields '' rather than undefined on empty output
+  const newUsage = stdout.trim().split('\n').at(-1) || '?';
+
+  logger.info({ diagnosis: diagnosis.slice(0, 120), newUsage, output: output.slice(0, 400) }, 'OPNsense disk cleanup executed');
+  return {
+    action_taken: `cleaned logs, tmp, pkg cache on OPNsense — disk now at ${newUsage}`,
+    success,
+    output: output.slice(0, 500),
+  };
+}
+
 // ─── Exported check list ──────────────────────────────────────────────────────
 
 export const healthChecks: HealthCheck[] = [
@@ -408,8 +676,11 @@ export const healthChecks: HealthCheck[] = [
   { name: 'postgresql', category: 'database', check: checkPostgres, heal: healPostgres },
   { name: 'ollama', category: 'service', check: checkOllama, heal: healOllama },
   { name: 'cloudflare-tunnel', category: 'tunnel', check: checkCloudflareTunnel, heal: healCloudflareTunnel },
+  { name: 'service-ports', category: 'service', check: checkServicePorts, heal: healServicePorts },
   { name: 'disk-space', category: 'service', check: checkDiskSpace, heal: healDiskSpace },
   { name: 'memory', category: 'service', check: checkMemory, heal: healMemory },
   { name: 'network', category: 'network', check: checkNetwork, heal: healNetwork },
   { name: 'wireguard', category: 'network', check: checkWireGuard, heal: healWireGuard },
+  { name: 'proxmox-pvestatd', category: 'service', check: checkProxmoxPvestatd, heal: healProxmoxPvestatd },
+  { name: 'opnsense-disk', category: 'service', check: checkOpnsenseDisk, heal: healOpnsenseDisk },
 ];