From 0191c60b64d2341ef5f01dbc426ea4d7984a5c63 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Fri, 5 Jun 2026 20:23:33 +0000 Subject: [PATCH] chore: commit deployed gateway state (dashboard, streaming, routing, bridges, cost-tracking) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live production state on Erik that had drifted from Gitea — deployed across several sessions but never committed. Excludes deploy/ecosystem.config.cjs (holds live tokens). - dashboard: passive usage-report endpoint, per-device entries, CEST timezone, cost-panel rounding - completion: SSE + HTTP/2 streaming - pipeline: routing-rules, request-scorer, external-providers (subscription bridges) - cost-tracking: tokenvault migration, cost-calculator, request-logger - infra: docker-compose bridge env, server/health/tls, deps --- Dockerfile | 1 - docker-compose.yaml | 16 + package-lock.json | 57 +- package.json | 2 +- packages/gateway/package.json | 5 +- .../prompts/templates/linkedin_post.yaml | 112 +- packages/gateway/public/dashboard.html | 3898 ++++++++++++++--- packages/gateway/src/config/models.yaml | 71 +- .../gateway/src/config/routing-rules.yaml | 4 +- .../002-tokenvault-cost-tracking.sql | 2 +- packages/gateway/src/db/schema-extensions.sql | 8 +- .../gateway/src/modules/request-logger.ts | 122 +- .../src/observability/cost-calculator.ts | 2 +- .../src/pipeline/external-providers.ts | 119 +- .../gateway/src/pipeline/request-scorer.ts | 36 +- packages/gateway/src/pipeline/router.ts | 9 +- packages/gateway/src/routes/completion.ts | 1159 ++++- packages/gateway/src/routes/dashboard.ts | 1362 +++++- packages/gateway/src/routes/health.ts | 41 +- packages/gateway/src/routes/static.ts | 64 +- packages/gateway/src/security/tls-config.ts | 24 +- packages/gateway/src/server.ts | 74 +- .../gateway/src/utils/tokenvault-hooks.ts | 18 +- 23 files changed, 6210 insertions(+), 996 deletions(-) diff --git a/Dockerfile b/Dockerfile index 865d4bc..58a421b 
100644 --- a/Dockerfile +++ b/Dockerfile @@ -36,7 +36,6 @@ COPY --from=builder /app/packages/gateway/dist ./packages/gateway/dist # Copy production node_modules COPY --from=builder /app/node_modules ./node_modules -COPY --from=builder /app/packages/gateway/node_modules ./packages/gateway/node_modules 2>/dev/null || true # Copy runtime assets (prompt templates, config) COPY packages/gateway/prompts ./packages/gateway/prompts diff --git a/docker-compose.yaml b/docker-compose.yaml index 68f5c9b..618febe 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -4,15 +4,31 @@ services: container_name: llm-gateway ports: - "3100:3100" + extra_hosts: + - "host.docker.internal:host-gateway" environment: NODE_ENV: production PORT: "3100" DATABASE_URL: "${DATABASE_URL}" TIP_DATABASE_URL: "${TIP_DATABASE_URL}" OLLAMA_URL: "http://192.168.178.169:11434" + OLLAMA_BASE_URL: "${OLLAMA_BASE_URL:-https://ollama.fichtmueller.org}" + CLAUDE_BRIDGE_ENABLED: "true" + CLAUDE_BRIDGE_URL: "${CLAUDE_BRIDGE_URL:-http://host.docker.internal:3250}" + CLAUDE_CODE_URL: "${CLAUDE_CODE_URL:-http://host.docker.internal:3250}" + OPENAI_BRIDGE_URL: "${OPENAI_BRIDGE_URL:-http://host.docker.internal:3251}" + CHATGPT_BRIDGE_URL: "${CHATGPT_BRIDGE_URL:-http://host.docker.internal:3251}" + COPILOT_BRIDGE_URL: "${COPILOT_BRIDGE_URL:-http://host.docker.internal:3252}" + GEMINI_BRIDGE_URL: "${GEMINI_BRIDGE_URL:-http://host.docker.internal:3254}" + CODEX_BRIDGE_URL: "${CODEX_BRIDGE_URL:-http://host.docker.internal:3253}" + OPENAI_CODEX_URL: "${OPENAI_CODEX_URL:-http://host.docker.internal:3253}" + AIDER_BRIDGE_URL: "${AIDER_BRIDGE_URL:-http://host.docker.internal:3256}" SHIELDX_URL: "${SHIELDX_URL:-}" GITEA_URL: "http://gitea.context-x.org" LOG_LEVEL: "${LOG_LEVEL:-info}" + DASHBOARD_AUTH_TOKEN: "${DASHBOARD_AUTH_TOKEN:-}" + REFERENCE_INPUT_COST_PER_1K: "${REFERENCE_INPUT_COST_PER_1K:-0.005}" + REFERENCE_OUTPUT_COST_PER_1K: "${REFERENCE_OUTPUT_COST_PER_1K:-0.015}" restart: unless-stopped healthcheck: 
test: ["CMD", "wget", "-q", "-O-", "http://localhost:3100/health/live"] diff --git a/package-lock.json b/package-lock.json index f0aba6d..7a066da 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11,10 +11,10 @@ "packages/*" ], "dependencies": { - "jose": "^6.2.2" + "jose": "^6.2.3" } }, - "../../../shieldx": { + "../../shieldx": { "extraneous": true }, "node_modules/@esbuild/darwin-arm64": { @@ -305,6 +305,10 @@ "resolved": "packages/codex-lsp-adapter", "link": true }, + "node_modules/@llm-gateway/companion": { + "resolved": "packages/companion", + "link": true + }, "node_modules/@llm-gateway/ctx-health": { "resolved": "packages/ctx-health", "link": true @@ -321,6 +325,10 @@ "resolved": "packages/learning-integration", "link": true }, + "node_modules/@llm-gateway/mcp-server": { + "resolved": "packages/mcp-server", + "link": true + }, "node_modules/@llm-gateway/prompt-optimizer": { "resolved": "packages/prompt-optimizer", "link": true @@ -1127,6 +1135,8 @@ }, "node_modules/fastify-plugin": { "version": "5.1.0", + "resolved": "https://registry.npmjs.org/fastify-plugin/-/fastify-plugin-5.1.0.tgz", + "integrity": "sha512-FAIDA8eovSt5qcDgcBvDuX/v0Cjz0ohGhENZ/wpc3y+oZCY2afZ9Baqql3g/lC+OHRnciQol4ww7tuthOb9idw==", "funding": [ { "type": "github", @@ -1475,9 +1485,9 @@ } }, "node_modules/jose": { - "version": "6.2.2", - "resolved": "https://registry.npmjs.org/jose/-/jose-6.2.2.tgz", - "integrity": "sha512-d7kPDd34KO/YnzaDOlikGpOurfF0ByC2sEV4cANCtdqLlTfBlw2p14O/5d/zv40gJPbIQxfES3nSx1/oYNyuZQ==", + "version": "6.2.3", + "resolved": "https://registry.npmjs.org/jose/-/jose-6.2.3.tgz", + "integrity": "sha512-YYVDInQKFJfR/xa3ojUTl8c2KoTwiL1R5Wg9YCydwH0x0B9grbzlg5HC7mMjCtUJjbQ/YnGEZIhI5tCgfTb4Hw==", "license": "MIT", "funding": { "url": "https://github.com/sponsors/panva" @@ -3178,6 +3188,21 @@ "node": ">=0.4" } }, + "node_modules/yaml": { + "version": "2.9.0", + "resolved": "https://registry.npmjs.org/yaml/-/yaml-2.9.0.tgz", + "integrity": 
"sha512-2AvhNX3mb8zd6Zy7INTtSpl1F15HW6Wnqj0srWlkKLcpYl/gMIMJiyuGq2KeI2YFxUPjdlB+3Lc10seMLtL4cA==", + "license": "ISC", + "bin": { + "yaml": "bin.mjs" + }, + "engines": { + "node": ">= 14.6" + }, + "funding": { + "url": "https://github.com/sponsors/eemeli" + } + }, "node_modules/yocto-queue": { "version": "1.2.2", "resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-1.2.2.tgz", @@ -4086,6 +4111,16 @@ } } }, + "packages/companion": { + "name": "@llm-gateway/companion", + "version": "1.0.0", + "bin": { + "llm-gateway-companion": "bin/llm-gateway-companion.js" + }, + "engines": { + "node": ">=18" + } + }, "packages/ctx-health": { "name": "@llm-gateway/ctx-health", "version": "1.0.0", @@ -4114,6 +4149,7 @@ "@fastify/static": "^8.3.0", "ajv": "^8.17.1", "fastify": "^5.8.5", + "fastify-plugin": "^5.1.0", "franc": "^6.2.0", "jose": "^5.4.0", "js-yaml": "^4.1.0", @@ -4122,6 +4158,7 @@ "pg-boss": "^10.1.3", "pino": "^9.5.0", "prom-client": "^15.1.3", + "yaml": "^2.9.0", "zod": "^3.23.8" }, "devDependencies": { @@ -4448,6 +4485,16 @@ } } }, + "packages/mcp-server": { + "name": "@llm-gateway/mcp-server", + "version": "1.0.0", + "bin": { + "llm-gateway-mcp": "bin/llm-gateway-mcp.js" + }, + "engines": { + "node": ">=18" + } + }, "packages/prompt-optimizer": { "name": "@llm-gateway/prompt-optimizer", "version": "0.1.0", diff --git a/package.json b/package.json index b24c7ac..41552a6 100644 --- a/package.json +++ b/package.json @@ -18,6 +18,6 @@ "ctx-health:dev": "npm run dev --workspace=packages/ctx-health" }, "dependencies": { - "jose": "^6.2.2" + "jose": "^6.2.3" } } diff --git a/packages/gateway/package.json b/packages/gateway/package.json index 1527ac7..22ba807 100644 --- a/packages/gateway/package.json +++ b/packages/gateway/package.json @@ -7,7 +7,8 @@ "build": "tsc && npm run build:copy-assets", "build:copy-assets": "mkdir -p dist/db/migrations dist/config dist/public && cp -r src/db/migrations/*.sql dist/db/migrations/ 2>/dev/null || true && cp -r 
src/config/*.yaml dist/config/ 2>/dev/null || true && cp -r public/* dist/public/ 2>/dev/null || true", "start": "node dist/server.js", - "test": "vitest" + "test": "vitest", + "prestart": "node scripts/check-build-drift.mjs" }, "dependencies": { "@fastify/cors": "^10.1.0", @@ -16,6 +17,7 @@ "@fastify/static": "^8.3.0", "ajv": "^8.17.1", "fastify": "^5.8.5", + "fastify-plugin": "^5.1.0", "franc": "^6.2.0", "jose": "^5.4.0", "js-yaml": "^4.1.0", @@ -24,6 +26,7 @@ "pg-boss": "^10.1.3", "pino": "^9.5.0", "prom-client": "^15.1.3", + "yaml": "^2.9.0", "zod": "^3.23.8" }, "devDependencies": { diff --git a/packages/gateway/prompts/templates/linkedin_post.yaml b/packages/gateway/prompts/templates/linkedin_post.yaml index 7c3a598..e98adb2 100644 --- a/packages/gateway/prompts/templates/linkedin_post.yaml +++ b/packages/gateway/prompts/templates/linkedin_post.yaml @@ -1,63 +1,105 @@ id: linkedin_post -version: "1.0.0" +version: "2.0.0" task_type: linkedin_post +description: "LinkedIn teaser in Rene Fichtmueller's voice. Anti-AI, anti-marketing, technical, direct." system_prompt: | - You are a professional LinkedIn content writer. Write engaging, authentic posts that sound human. + You write a single short LinkedIn post in Rene Fichtmueller's voice. Rene is a network/optics engineer who blogs at blog.fichtmueller.org. His voice is direct, technical, sometimes contrarian, never marketing. - Rules: - - Maximum 1300 characters (LinkedIn soft limit) - - No hashtag spam (max 3 relevant hashtags) - - No engagement-bait questions at the end - - No "In today's fast-paced world" openings - - Write in first person, direct and confident tone - - Include a clear value point or insight - - Current date: {{current_date}} + HARD RULES — do not violate: + - 2 to 3 short sentences. Maximum 4. Period. + - No hashtags. None. Not at the end, not anywhere. + - No emojis. Not even one. + - No engagement-bait. Do not end with "What do you think?", "Thoughts?", "Have you seen this?". 
+ - No call-to-action language ("Check it out", "Read more", "Don't miss"). + - No meta-references to the blog post itself: do not write "I wrote about this", "I published a piece", "I broke this down", "more in the article". + - End with the URL on its own line. Nothing after the URL. + + BANNED PHRASES — never use any of these: + - delve, leverage, robust, journey, embark, paradigm, unlock, seamlessly, holistic, harness, foster, amplify, underscore, indelible, profound, intricate, meticulous, testament, vibrant, bespoke, encompass, hitherto, realm, utilize, synergy + - "leaving money on the table" + - "until it's too late" + - "the line item most X skip" + - "turns out" + - "the unexpected part is" + - "the gap between X and Y is wider than" + - "in today's fast-paced", "in the world of", "in the realm of" + - "it's important to note", "it's worth noting" + - "let's dive into", "let's explore" + - "the future of X", "the next generation of X" (unless quoting someone) + - "game-changer", "cutting-edge", "groundbreaking", "comprehensive" + + TONE — match these traits: + - Specific numbers over generalities. 20W is better than "high power". 14 weeks is better than "long lead time". + - Named products, standards, RFCs when relevant. 400ZR+, RPKI, IEEE 802.3. + - First person ("I", "my", "we") where genuine. + - Short sentences. Period. Short sentences. Period. + - Concession sometimes: admit what you don't know or what surprised you. + - Closing line stands on its own. No qualifier, no hedge. + + Current date: {{current_date}} {{few_shot_examples}} system_prompt_de: | - Du bist ein professioneller LinkedIn-Content-Writer. Schreibe authentische, menschlich klingende Beiträge. + Du schreibst einen kurzen LinkedIn-Post in der Stimme von Rene Fichtmueller. Direkt, technisch, manchmal contrarian, nie Marketing. - Regeln: - - Maximal 1300 Zeichen (LinkedIn Soft-Limit) - - Keine Hashtag-Spam (max. 
3 relevante Hashtags) - - Keine Engagement-Bait-Fragen am Ende - - Keine Einstiege mit "In der heutigen schnelllebigen Welt" - - Schreibe in der Ich-Perspektive, direkt und selbstsicher - - Enthalte einen klaren Mehrwert oder Einblick - - Aktuelles Datum: {{current_date}} + HARTE REGELN — nie verletzen: + - 2 bis 3 kurze Sätze. Maximal 4. Punkt. + - Keine Hashtags. Keine. Nirgendwo. + - Keine Emojis. Auch nicht einer. + - Kein Engagement-Bait. Niemals enden mit "Was meint ihr?", "Eure Erfahrung?". + - Keine Call-to-Action-Sprache ("Schaut mal rein", "Hier mehr lesen"). + - Keine Meta-Referenzen auf den Blog-Post: kein "Ich habe dazu geschrieben", "Mehr im Artikel". + - URL alleine in der letzten Zeile. Nichts danach. + + VERBOTENE WORTE/PHRASEN: + - "leverage", "delve", "robust", "harness", "navigieren", "Reise", "Paradigma", "freischalten", "ganzheitlich", "Synergie", "umfassend" + - "in der heutigen schnelllebigen Welt" + - "es lohnt sich zu erwähnen" + + TON: + - Konkrete Zahlen statt Verallgemeinerungen. + - Erste Person wo authentisch. + - Kurze Sätze. Punkt. + - Konzession wo möglich: zugeben was überrascht hat. + + Aktuelles Datum: {{current_date}} user_template: | - Topic/Input: + Article info: {{input}} - Context: - {{user_context}} - - Write a LinkedIn post: + Write the LinkedIn post now. 2-3 sentences. No hashtags. URL on last line. Stop after the URL. user_template_de: | - Thema/Eingabe: + Artikel-Infos: {{input}} - Kontext: - {{user_context}} - - Schreibe einen LinkedIn-Beitrag: + Schreibe den LinkedIn-Post jetzt. 2-3 Sätze. Keine Hashtags. URL in der letzten Zeile. Stopp nach der URL. few_shot_examples: - - user: "We deployed a new BGP monitoring system last week" + - user: "Article info:\nTitle: BGP Route Leak Hit My Network Last Tuesday\nExcerpt: A transit provider re-advertised our customer cone to the wrong peers. Detection took 4 hours. 
The mitigation took 14 minutes once we saw it.\nURL: https://blog.fichtmueller.org/example1/" assistant: | - We spent 18 months dealing with BGP route leaks that cost us 4 hours of network visibility per incident. + A transit provider re-advertised our customer cone to the wrong peers last Tuesday. We caught it after 4 hours. The fix took 14 minutes once we saw it — the cost was the 4 hours we didn't. - Last week we deployed our own monitoring system. First result: caught a route leak from a transit provider within 3 minutes, before any customer noticed. + https://blog.fichtmueller.org/example1/ - The fix wasn't sophisticated. It was systematic alerting on RPKI invalid routes combined with PeeringDB cross-referencing. + - user: "Article info:\nTitle: 400ZR Reach Math Nobody Shows You\nExcerpt: Datasheets quote a maximum reach number. That number is for a clean link with margin you don't have.\nURL: https://blog.fichtmueller.org/example2/" + assistant: | + The 400ZR datasheet number is a ceiling, not a target. Real-world links have eight connectors, two amplifier hops, and aged fiber — the spec assumes none of that. - If you're running a mid-size ISP and still relying on customer tickets to discover routing issues, that's the gap worth closing. + Build to the budget you actually have, not the one the marketing slide showed you. - #BGP #NetworkEngineering #ISP + https://blog.fichtmueller.org/example2/ + + - user: "Article info:\nTitle: ADHD Diary #008: The 47-Tabs Problem\nExcerpt: I closed 47 browser tabs Tuesday. By Wednesday I had 51 again. This is not a tool problem.\nURL: https://blog.fichtmueller.org/example3/" + assistant: | + I closed 47 browser tabs on Tuesday. By Wednesday morning I had 51 again. + + This isn't a tool problem. Every tab is a "I might do this later" promise I made to myself. The fix isn't a better tab manager — it's saying no. 
+ + https://blog.fichtmueller.org/example3/ variables: - input diff --git a/packages/gateway/public/dashboard.html b/packages/gateway/public/dashboard.html index a1d3502..7d231f0 100644 --- a/packages/gateway/public/dashboard.html +++ b/packages/gateway/public/dashboard.html @@ -1,541 +1,2101 @@ + - LLM Gateway Dashboard + llm.gateway / workbench + + + -
-
-

LLM Gateway Dashboard

-
-
- - Checking database... -
-
- - Connecting to stream... -
-
- 0 SSE listeners -
-
-
+
-
-
-
Total Requests
-
0
-
+ +
+
+ llm.gateway + gateway workbench · v1.0
- -
-
Success Rate
-
0%
-
-
- -
-
Avg Latency
-
0ms
-
-
- -
-
Total Cost
-
$0.00
-
-
- -
-
Avg Confidence
-
0%
-
-
- -
-
Fallback Usage
-
0%
-
+
+
-

Top Models

-
-
Loading models...
+ +
+
+ + db + connecting +
+
+ + poll + starting +
+
+ interval + 3s +
+
+ mode + auto +
-

Top Callers

-
-
Loading callers...
-
+ + -

Available Providers & Models

-
-
-

Local

-
-
Loading providers...
+ +
+ + +
+ +
+
summoning buddy
+
+ + +
+
total tokens saved · all layers · all-time
+
0tokens
+
+
⚡ Gateway (LLM calls)0
+ +
+
+
+ cost saved + $0.00 +
+
+ cache hits + 0 +
+
+ savings rate + 0% +
+
+
+ + +
+
cost analysis · last 24h · USD
+
+
+
without gateway
+
$0.00
+
+
+
+
with gateway
+
$0.00
+
+
+
you saved $0.00 · 0% reduction
-
-

Subscription

-
-
Loading providers...
+ + +

Savings Sources 5 measurement axes across all calls

+
+
loading
+
+ + +

Live Metrics last 24h

+
+
+
requests
+
0
+
routed
+
+
+
success rate
+
0%
+
approved/total
+
+
+
avg latency
+
0ms
+
end-to-end
+
+
+
spent today
+
$0.00
+
actual usd
+
+
+
confidence
+
0/10
+
post-val
+
+
+
fallback usage
+
0%
+
primary→fallback
-
-

Free Tier

-
-
Loading providers...
+ + +
+
+

Activity · last 365 days streak 0d

+
loading activity
+
+
+

Forecast based on recent trend

+
computing forecast
+
+
+ + +
+
+

Live Activity most recent first

+
listening
+
+
+

Top Models last 24h

+
analyzing routing
+ +

Top Callers

+
analyzing callers
+
+
+ + +

Achievements 0/0

+
checking quests
+
+ + +
+
+ +
+ + +
+
+ + + + +
+
discovering installed subscriptions
+
+
+ + +
+
+
+

Local on-host inference

+
+
enumerating local models
+
+
+
+

Subscription paid plans via bridges

+
+
enumerating subscription providers
+
+
+
+

Free Tier api-key authenticated

+
+
enumerating free-tier endpoints
+
+
+
+
+ + +
+

Desktop AI Coverage only gateway traffic is counted

+
+
checking connected clients
+
+

Recent Requests live polling

+
+ + + +
+
+
+
request id
+
caller
+
model
+
status
+
ctx before
+
ctx sent
+
saved
+
compression
+
cost
+
latency
+
+
+
no requests yet
+
+
+
+ + +
+
+
+
cumulative savings · last 24h
+
$0.00
+
— · — tokens prevented · — cache hits
+
+
+ +
+ $ saved per hour + hit rate — +
+
+
+ +
+
+
cache entries
+
0
+
distinct cached responses
+
+
+
tokens prevented
+
0
+
never sent to LLM
+
+
+
cache hit rate
+
0%
+
hits ÷ total req
+
+
+
compressed since last restart
+
0
+
— · — ops · since —
+
+
+ +

Top Caching Callers most savings

+
+
loading
+
+ +

Cache Controls manual invalidation

+
+ + + +
+
+ + +
+
+
+ Subscription Pool Wallet — tracks API calls + (not tokens) against each Pro plan's quota window. Numbers here are + messages remaining, not tokens. For token savings via cache, + see the Savings tab. +
+
+
+
loading wallet
+
+
+ + +
+
+ + + + + + +
+
+
enter a caller id and click load
+
+ +

Knowledge Graph all callers + facts

+
+ +
+ caller + fact key + value +
+
+
+ + +
+
+
computing standings
+
+

Race Leaderboard last 7 days

+
loading
+
+ + +
+

Public Share Card embeddable SVG · OG-card sized · no auth required

+ + + + +
+ + +
+

Monthly Report save as PDF via browser print

+ + +
+ + +
+

API Reference all endpoints route through compression + caller tracking

+ +
+ The LLM Gateway exposes three POST endpoints and one GET. Every call is logged in + activity, compressed when input ≥ 700 tokens, and routed via routing-rules.yaml + to the right subscription bridge (Claude Code, ChatGPT, Copilot, M365 Copilot, Codex) or local Ollama. +
+ + +
+
+ POST + /v1/chat/completions + OpenAI-compatible · works with `openai` SDK + +
+
curl https://llm-gateway.context-x.org/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "claude-sonnet-4.6",
+    "messages": [{"role": "user", "content": "hi"}]
+  }'
+
+ + +
+
+ POST + /v1/messages + Anthropic-compatible · works with `@anthropic-ai/sdk` + +
+
curl https://llm-gateway.context-x.org/v1/messages \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "claude-sonnet-4.6",
+    "messages": [{"role": "user", "content": "hi"}],
+    "max_tokens": 1024
+  }'
+
+ + +
+
+ POST + /v1/completion + native — full caller-tracking + compression options + +
+
curl https://llm-gateway.context-x.org/v1/completion \
+  -H "Content-Type: application/json" \
+  -d '{
+    "caller": "my-app",
+    "task_type": "generic_qa",
+    "input": "your prompt here",
+    "options": { "compression": { "enabled": true, "mode": "auto" } }
+  }'
+
+ + +
+
+ GET + /v1/models + list every model the gateway can route to + +
+
curl https://llm-gateway.context-x.org/v1/models
+
+ + +

Try it out live POST against the gateway

+
+
+ + +
+ +
+ + +
+ +
+ + +

Model → Bridge Mapping which subscription each model alias routes to

+
+ + + + + + + + + + + + + + + + + + +
Model aliasBridgeSubscription usedPortStatus
claude-sonnet-4.6, claude-haiku, claude-opusclaude-bridgeClaude Code Max (OAuth)3250
gpt-4o, gpt-4.1, gpt-5.xopenai-bridgeChatGPT Plus / Pro3251
copilot-gpt-4o, copilot-claude-3.7copilot-bridgeGitHub Copilot3252
codex-mini, gpt-5.1-codex-minicodex-bridgeOpenAI Codex CLI3253
m365-copilotm365-copilot-bridgeMicrosoft 365 Copilot3257
qwen2.5:3b / 7b / 14b / 32b, magatama:32b, magatama-coderollama (Mac Studio)local — no cost11434
+
+ +
+ + + -

Recent Requests

-
- - - + + -
-
-
Request ID
-
Caller
-
Model
-
Status
-
Tokens In
-
Cost
-
Latency
-
-
-
No requests yet
-
-
-
-
- Connected +
+ + connected
- \ No newline at end of file + diff --git a/packages/gateway/src/config/models.yaml b/packages/gateway/src/config/models.yaml index 4a3e46c..7f028e3 100644 --- a/packages/gateway/src/config/models.yaml +++ b/packages/gateway/src/config/models.yaml @@ -1,7 +1,7 @@ # LLM Gateway Model Configuration # Ollama base URL: http://192.168.178.169:11434 -ollama_base_url: "https://ollama.fichtmueller.org" +ollama_base_url: "http://127.0.0.1:11434" tiers: fast: @@ -26,7 +26,7 @@ models: qwen2.5:3b: tier: fast context_length: 32768 - strengths: [classification, short_text, routing] + strengths: [classification, summarization, routing] max_tokens_default: 512 qwen2.5:7b: @@ -35,83 +35,58 @@ models: strengths: [classification, summarization, short_analysis] max_tokens_default: 1024 - phi3.5:3.8b: + qwen2.5:7b-instruct: tier: fast - context_length: 128000 - strengths: [classification, summarization] + context_length: 32768 + strengths: [classification, summarization, short_analysis] + max_tokens_default: 1024 + + qwen2.5-coder:7b-instruct: + tier: fast + context_length: 32768 + strengths: [code_generation, technical_analysis, routing] max_tokens_default: 512 # ─── MAGATAMA — Fine-tuned Security Intelligence (Context X) ───────────────── magatama:32b: tier: large context_length: 131072 - strengths: [security_analysis, threat_intelligence, compliance, bgp_security, incident_response, nis2, ciso_reporting] + strengths: [security_analysis, threat_intelligence, compliance, bgp_security, incident_response, nis2, ciso_reporting, complex_writing, deep_analysis, technical] max_tokens_default: 4096 description: "MAGATAMA まがたま — TEPPEKI 7-pillar security AI, fine-tuned on Qwen2.5-32B" - # Custom fine-tuned models (Context X) - ctxhealer:latest: - tier: medium - context_length: 32768 - strengths: [infrastructure_diagnosis, root_cause_analysis, remediation_steps] - max_tokens_default: 1024 - - llama-guard3:1b: - tier: fast - context_length: 8192 - strengths: [safety_classification, 
threat_detection] - max_tokens_default: 256 - # Medium tier qwen2.5:14b: tier: medium context_length: 131072 - strengths: [general, writing, analysis, coding] + strengths: [general, writing, analysis, coding, dialogue] max_tokens_default: 2048 - mistral:7b: + magatama-llm-v2-0:latest: tier: medium - context_length: 32768 - strengths: [general, writing] + context_length: 131072 + strengths: [general, writing, analysis, coding, dialogue] max_tokens_default: 2048 - llama3.2:8b: - tier: medium - context_length: 128000 - strengths: [general, chat, analysis] - max_tokens_default: 2048 - - deepseek-r1:8b: + magatama-coder:latest: tier: medium context_length: 65536 - strengths: [reasoning, analysis, coding] + strengths: [code_generation, technical_analysis, debugging] max_tokens_default: 2048 # Large tier qwen2.5:32b: tier: large context_length: 131072 - strengths: [complex_writing, deep_analysis, technical] - max_tokens_default: 4096 - - llama3.3:70b: - tier: large - context_length: 128000 - strengths: [complex_reasoning, long_form, research] - max_tokens_default: 4096 - - deepseek-r1:32b: - tier: large - context_length: 131072 - strengths: [chain_of_thought, complex_reasoning] + strengths: [complex_writing, deep_analysis, technical, security_analysis] max_tokens_default: 4096 # Fallback chains per tier fallback_chains: - fast: [qwen2.5:3b, qwen2.5:7b, phi3.5:3.8b] - medium: [qwen2.5:14b, mistral:7b, llama3.2:8b] - large: [qwen2.5:32b, llama3.3:70b, deepseek-r1:32b] - code_generation: [deepseek-r1:32b, qwen2.5:32b, llama3.3:70b] + fast: [qwen2.5:7b-instruct, qwen2.5-coder:7b-instruct] + medium: [magatama-llm-v2-0:latest, magatama-coder:latest, qwen2.5:7b-instruct] + large: [magatama:32b, magatama-llm-v2-0:latest] + code_generation: [magatama-coder:latest, qwen2.5-coder:7b-instruct] # Cross-tier fallback when primary tier fails tier_fallback: diff --git a/packages/gateway/src/config/routing-rules.yaml b/packages/gateway/src/config/routing-rules.yaml index 7defb1e..eb7d2d4 
100644 --- a/packages/gateway/src/config/routing-rules.yaml +++ b/packages/gateway/src/config/routing-rules.yaml @@ -1110,7 +1110,7 @@ routing_rules: # ─── CONTENT / LINKEDIN ────────────────────────────────────────────────────── linkedin_post: - model: qwen2.5:32b + model: fo-blog-v10 tier: large prompt_template: linkedin_post temperature: 0.7 @@ -1118,7 +1118,7 @@ routing_rules: output_format: text requires_fact_check: false validators: [banlist, language, length, question_closer] - callers: [n8n, internal] + callers: [n8n, internal, linkedin-distributor] linkedin_comment: model: qwen2.5:14b diff --git a/packages/gateway/src/db/migrations/002-tokenvault-cost-tracking.sql b/packages/gateway/src/db/migrations/002-tokenvault-cost-tracking.sql index b3d85b7..c8ba81c 100644 --- a/packages/gateway/src/db/migrations/002-tokenvault-cost-tracking.sql +++ b/packages/gateway/src/db/migrations/002-tokenvault-cost-tracking.sql @@ -3,7 +3,7 @@ -- Purpose: Track token compression and cost analytics -- PostgreSQL compatible version (version 16+) --- Table: Token compression metrics (LeanCTX, RTK) +-- Table: Token compression metrics (LLM Gateway) CREATE TABLE IF NOT EXISTS tokenvault_metrics ( id SERIAL PRIMARY KEY, file_path VARCHAR(255), diff --git a/packages/gateway/src/db/schema-extensions.sql b/packages/gateway/src/db/schema-extensions.sql index 4efdba7..714a203 100644 --- a/packages/gateway/src/db/schema-extensions.sql +++ b/packages/gateway/src/db/schema-extensions.sql @@ -1,12 +1,12 @@ -- Tokenvault & Cost Tracking Schema Extensions -- Created: 2026-04-19 --- Purpose: Track token compression (LeanCTX + RTK) and cost analytics +-- Purpose: Track token compression (LLM Gateway) and cost analytics --- Table: Token compression metrics (LeanCTX, RTK) +-- Table: Token compression metrics (LLM Gateway) CREATE TABLE IF NOT EXISTS tokenvault_metrics ( id SERIAL PRIMARY KEY, file_path VARCHAR(255), - mode VARCHAR(50), -- 'lean-aggressive', 'lean-map', 'rtk-max', etc. 
+ mode VARCHAR(50), -- 'gateway-aggressive', 'gateway-map', 'gateway-trim', etc. tokens_before INT, tokens_after INT, savings_pct DECIMAL(5,2), @@ -26,7 +26,7 @@ CREATE TABLE IF NOT EXISTS cost_analytics ( agent_id VARCHAR(50), -- 'claude-code', 'qwen-reviewer', etc. tokens_in INT, tokens_out INT, - tokens_compressed INT, -- After LeanCTX + RTK + tokens_compressed INT, -- After LLM Gateway compression cost_usd DECIMAL(10,6), cost_saved_usd DECIMAL(10,6), provider VARCHAR(50), -- 'ollama', 'cerebras', 'groq', 'claude', etc. diff --git a/packages/gateway/src/modules/request-logger.ts b/packages/gateway/src/modules/request-logger.ts index c4e56e9..ee281fc 100644 --- a/packages/gateway/src/modules/request-logger.ts +++ b/packages/gateway/src/modules/request-logger.ts @@ -109,6 +109,11 @@ export class RequestLogger { cost_usd: number; latency_ms: number; fallback_used: boolean; + compression_mode?: string; + compression_tokens_before?: number; + compression_tokens_after?: number; + compression_tokens_saved?: number; + compression_savings_pct?: number; error_message?: string; created_at: string; }> @@ -116,22 +121,35 @@ export class RequestLogger { const result = await this.db.query( ` SELECT - request_id, - caller_id as caller, - task_type, - model, - status, - confidence_score, - tokens_in, - tokens_out, - cost_usd, - latency_ms, - fallback_used, - error_message, - created_at - FROM request_tracking - WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1) - ORDER BY created_at DESC + rt.request_id, + rt.caller_id as caller, + rt.task_type, + rt.model, + rt.status, + rt.confidence_score, + rt.tokens_in, + rt.tokens_out, + rt.cost_usd, + rt.latency_ms, + rt.fallback_used, + tv.mode as compression_mode, + tv.tokens_before as compression_tokens_before, + tv.tokens_after as compression_tokens_after, + GREATEST(COALESCE(tv.tokens_before, 0) - COALESCE(tv.tokens_after, 0), 0) as compression_tokens_saved, + tv.savings_pct as compression_savings_pct, + rt.error_message, + 
rt.created_at + FROM request_tracking rt + LEFT JOIN LATERAL ( + SELECT mode, tokens_before, tokens_after, savings_pct + FROM tokenvault_metrics + WHERE tool_used = 'gateway' + AND file_path = rt.request_id + ORDER BY created_at DESC + LIMIT 1 + ) tv ON true + WHERE rt.created_at > NOW() - MAKE_INTERVAL(hours => $1) + ORDER BY rt.created_at DESC LIMIT $2 `, [offsetHours, limit] @@ -149,6 +167,11 @@ export class RequestLogger { cost_usd: row.cost_usd, latency_ms: row.latency_ms, fallback_used: row.fallback_used, + compression_mode: row.compression_mode, + compression_tokens_before: row.compression_tokens_before ? parseInt(row.compression_tokens_before, 10) : undefined, + compression_tokens_after: row.compression_tokens_after ? parseInt(row.compression_tokens_after, 10) : undefined, + compression_tokens_saved: row.compression_tokens_saved ? parseInt(row.compression_tokens_saved, 10) : 0, + compression_savings_pct: row.compression_savings_pct ? parseFloat(row.compression_savings_pct) : 0, error_message: row.error_message, created_at: row.created_at })); @@ -160,6 +183,17 @@ export class RequestLogger { async getMetrics(bucketMinutes: number = 60): Promise<{ total_requests: number; total_cost: number; + estimated_api_cost: number; + estimated_api_cost_avoided: number; + total_tokens_in: number; + total_tokens_out: number; + total_tokens: number; + compression_operations: number; + compression_tokens_before: number; + compression_tokens_after: number; + compression_tokens_saved: number; + compression_rate: number; + cache_hit_rate: number; avg_latency: number; success_rate: number; avg_confidence: number; @@ -177,13 +211,15 @@ export class RequestLogger { ` SELECT COUNT(*) as total_requests, - SUM(cost_usd) as total_cost, - AVG(latency_ms) as avg_latency, - SUM(CASE WHEN status = 'approved' THEN 1 ELSE 0 END)::FLOAT / COUNT(*) as success_rate, - AVG(confidence_score) as avg_confidence, - SUM(CASE WHEN fallback_used = true THEN 1 ELSE 0 END)::FLOAT / COUNT(*) as 
fallback_percentage + COALESCE(SUM(cost_usd), 0) as total_cost, + COALESCE(SUM(tokens_in), 0) as total_tokens_in, + COALESCE(SUM(tokens_out), 0) as total_tokens_out, + COALESCE(AVG(latency_ms), 0) as avg_latency, + CASE WHEN COUNT(*) = 0 THEN 0 ELSE SUM(CASE WHEN status = 'approved' THEN 1 ELSE 0 END)::FLOAT / COUNT(*) END as success_rate, + COALESCE(AVG(confidence_score), 0) as avg_confidence, + CASE WHEN COUNT(*) = 0 THEN 0 ELSE SUM(CASE WHEN fallback_used = true THEN 1 ELSE 0 END)::FLOAT / COUNT(*) END as fallback_percentage FROM request_tracking - WHERE created_at > NOW() - MAKE_INTERVAL(mins => $1) + WHERE created_at > NOW() - ($1 * INTERVAL '1 minute') `, [bucketMinutes] ); @@ -192,7 +228,7 @@ export class RequestLogger { ` SELECT caller_id as caller, COUNT(*) as count FROM request_tracking - WHERE created_at > NOW() - MAKE_INTERVAL(mins => $1) + WHERE created_at > NOW() - ($1 * INTERVAL '1 minute') GROUP BY caller_id ORDER BY count DESC LIMIT 5 @@ -204,7 +240,7 @@ export class RequestLogger { ` SELECT model, COUNT(*) as count FROM request_tracking - WHERE created_at > NOW() - MAKE_INTERVAL(mins => $1) + WHERE created_at > NOW() - ($1 * INTERVAL '1 minute') GROUP BY model ORDER BY count DESC LIMIT 5 @@ -224,11 +260,47 @@ export class RequestLogger { [bucketMinutes] ); + const compressionResult = await this.db.query( + ` + SELECT + COUNT(*) as operations, + COALESCE(SUM(tokens_before), 0) as tokens_before, + COALESCE(SUM(tokens_after), 0) as tokens_after, + COALESCE(SUM(GREATEST(tokens_before - tokens_after, 0)), 0) as tokens_saved + FROM tokenvault_metrics + WHERE tool_used = 'gateway' + AND created_at > NOW() - ($1 * INTERVAL '1 minute') + `, + [bucketMinutes] + ); + const metrics = metricsResult.rows[0]; + const totalTokensIn = parseInt(metrics.total_tokens_in, 10) || 0; + const totalTokensOut = parseInt(metrics.total_tokens_out, 10) || 0; + const totalTokens = totalTokensIn + totalTokensOut; + const compression = compressionResult.rows[0] ?? 
{}; + const compressionTokensBefore = parseInt(compression.tokens_before, 10) || 0; + const compressionTokensAfter = parseInt(compression.tokens_after, 10) || 0; + const compressionTokensSaved = parseInt(compression.tokens_saved, 10) || 0; + const referenceInputCostPer1k = parseFloat(process.env['REFERENCE_INPUT_COST_PER_1K'] ?? '0.005'); + const referenceOutputCostPer1k = parseFloat(process.env['REFERENCE_OUTPUT_COST_PER_1K'] ?? '0.015'); + const estimatedApiCost = (totalTokensIn / 1000) * referenceInputCostPer1k + (totalTokensOut / 1000) * referenceOutputCostPer1k; + const totalCost = parseFloat(metrics.total_cost) || 0; return { total_requests: parseInt(metrics.total_requests) || 0, - total_cost: parseFloat(metrics.total_cost) || 0, + total_cost: totalCost, + estimated_api_cost: estimatedApiCost, + estimated_api_cost_avoided: Math.max(0, estimatedApiCost - totalCost), + total_tokens_in: totalTokensIn, + total_tokens_out: totalTokensOut, + total_tokens: totalTokens, + compression_operations: parseInt(compression.operations, 10) || 0, + compression_tokens_before: compressionTokensBefore, + compression_tokens_after: compressionTokensAfter, + compression_tokens_saved: compressionTokensSaved, + compression_rate: compressionTokensBefore > 0 ? 
compressionTokensSaved / compressionTokensBefore : 0, + cache_hit_rate: 0, avg_latency: Math.round(parseFloat(metrics.avg_latency) || 0), success_rate: parseFloat(metrics.success_rate) || 0, avg_confidence: parseFloat(metrics.avg_confidence) || 0, diff --git a/packages/gateway/src/observability/cost-calculator.ts b/packages/gateway/src/observability/cost-calculator.ts index 4ca992b..59df50a 100644 --- a/packages/gateway/src/observability/cost-calculator.ts +++ b/packages/gateway/src/observability/cost-calculator.ts @@ -101,7 +101,7 @@ export function calculateCost( /** * Calculate cost savings from compression * @param model Model identifier - * @param tokensBeforeCompression Tokens before LeanCTX + RTK + * @param tokensBeforeCompression Tokens before LLM Gateway compression * @param tokensAfterCompression Tokens after compression * @returns Savings in USD */ diff --git a/packages/gateway/src/pipeline/external-providers.ts b/packages/gateway/src/pipeline/external-providers.ts index 244d07e..aaf3643 100644 --- a/packages/gateway/src/pipeline/external-providers.ts +++ b/packages/gateway/src/pipeline/external-providers.ts @@ -47,7 +47,7 @@ const PROVIDERS: readonly ExternalProvider[] = [ enabled: true, models: [ { id: 'claude-opus-4-1', tier: 'reasoning', contextLength: 200000 }, - { id: 'claude-sonnet-4-1', tier: 'large', contextLength: 200000 }, + { id: 'claude-sonnet-4-6', tier: 'large', contextLength: 200000 }, { id: 'claude-haiku-3', tier: 'fast', contextLength: 200000 }, ], }, @@ -86,6 +86,17 @@ const PROVIDERS: readonly ExternalProvider[] = [ { id: 'gpt-3.5-turbo', tier: 'medium', contextLength: 4096 }, ], }, + { + name: 'm365-copilot-bridge', + baseUrl: '', // constructed from M365_COPILOT_BRIDGE_URL env var + envKey: 'M365_COPILOT_BRIDGE_URL', + rateLimitRpm: 60, + enabled: true, + models: [ + { id: 'microsoft-365-copilot', tier: 'reasoning', contextLength: 128000 }, + { id: 'm365-copilot-chat', tier: 'large', contextLength: 128000 }, + ], + }, { name: 
'cerebras', baseUrl: 'https://api.cerebras.ai/v1', @@ -146,12 +157,13 @@ const PROVIDERS: readonly ExternalProvider[] = [ { name: 'openai-codex', baseUrl: 'https://api.openai.com/v1', - envKey: 'OPENAI_API_KEY', + envKey: 'OPENAI_CODEX_URL', rateLimitRpm: 60, enabled: true, models: [ - { id: 'gpt-4-turbo', tier: 'reasoning', contextLength: 128000 }, - { id: 'gpt-3.5-turbo', tier: 'fast', contextLength: 16384 }, + { id: 'gpt-5.1-codex', tier: 'reasoning', contextLength: 256000 }, + { id: 'gpt-5.1-codex-mini', tier: 'large', contextLength: 256000 }, + { id: 'codex-mini-latest', tier: 'medium', contextLength: 200000 }, ], }, { @@ -162,23 +174,35 @@ const PROVIDERS: readonly ExternalProvider[] = [ enabled: true, models: [ { id: 'claude-opus-4-1', tier: 'reasoning', contextLength: 200000 }, - { id: 'claude-sonnet-4-1', tier: 'large', contextLength: 200000 }, + { id: 'claude-sonnet-4-6', tier: 'large', contextLength: 200000 }, { id: 'claude-haiku-3', tier: 'fast', contextLength: 200000 }, ], }, { name: 'codex', baseUrl: 'https://api.github.com/copilot_inner/v2', - envKey: 'GITHUB_CODEX_TOKEN', + envKey: 'CODEX_BRIDGE_URL', rateLimitRpm: 60, enabled: true, models: [ - { id: 'github-copilot-x', tier: 'large', contextLength: 8192 }, - { id: 'code-davinci-002', tier: 'medium', contextLength: 4096 }, + { id: 'gpt-5.1-codex', tier: 'reasoning', contextLength: 256000 }, + { id: 'gpt-5.1-codex-mini', tier: 'large', contextLength: 256000 }, + { id: 'codex-mini-latest', tier: 'medium', contextLength: 200000 }, ], }, ]; +const AUTHLESS_BRIDGE_PROVIDERS = new Set([ + 'claude-bridge', + 'claude-code', + 'openai-bridge', + 'chatgpt-bridge', + 'copilot-bridge', + 'm365-copilot-bridge', +]); + +const GENERATE_BRIDGE_PROVIDERS = new Set(['claude-bridge', 'claude-code']); + // ─── Rate Limiter (simple sliding window) ─────────────────────────── const requestTimestamps: Map = new Map(); @@ -213,25 +237,34 @@ function getApiKey(provider: ExternalProvider): string | undefined { return url ? 
'claude-code-enabled' : undefined; } if (provider.name === 'openai-bridge') { - // openai-bridge uses OPENAI_API_KEY for auth, but also needs bridge URL - const apiKey = process.env['OPENAI_API_KEY']; + // Subscription bridge auth is handled by the bridge process/CLI session. const url = process.env['OPENAI_BRIDGE_URL']; - return apiKey && url ? apiKey : undefined; + return url ? 'openai-bridge-enabled' : undefined; } if (provider.name === 'chatgpt-bridge') { - // chatgpt-bridge can use same URL as openai-bridge (same service), but needs API key - const apiKey = process.env['OPENAI_API_KEY']; + // ChatGPT Plus bridge can reuse the OpenAI bridge when configured that way. const url = process.env['CHATGPT_BRIDGE_URL'] || process.env['OPENAI_BRIDGE_URL']; - return apiKey && url ? apiKey : undefined; + return url ? 'chatgpt-bridge-enabled' : undefined; } if (provider.name === 'copilot-bridge') { - // copilot-bridge uses GitHub Copilot subscription (auth handled internally by copilot-api) - // Just needs URL to be configured + // copilot-bridge uses GitHub Copilot subscription (auth handled internally by copilot-api). const url = process.env['COPILOT_BRIDGE_URL']; return url ? 'copilot-authenticated' : undefined; } + if (provider.name === 'm365-copilot-bridge') { + // Microsoft 365 Copilot uses Microsoft Graph delegated auth inside the bridge. + const url = process.env['M365_COPILOT_BRIDGE_URL']; + return url ? 'm365-copilot-bridge-enabled' : undefined; + } + if (provider.name === 'openai-codex') { + const bridgeUrl = process.env['OPENAI_CODEX_URL'] || process.env['CODEX_BRIDGE_URL']; + if (bridgeUrl) return 'openai-codex-bridge-enabled'; + return process.env['OPENAI_API_KEY'] || undefined; + } if (provider.name === 'codex') { - // codex uses GitHub Codex API token + // Codex can run through an authless local/subscription bridge. A token remains supported as fallback. 
+ const bridgeUrl = process.env['CODEX_BRIDGE_URL'] || process.env['OPENAI_CODEX_URL']; + if (bridgeUrl) return 'codex-bridge-enabled'; const token = process.env['GITHUB_CODEX_TOKEN']; return token ? token : undefined; } @@ -241,11 +274,11 @@ function getApiKey(provider: ExternalProvider): string | undefined { function getBaseUrl(provider: ExternalProvider): string { if (provider.name === 'claude-bridge') { const url = process.env['CLAUDE_BRIDGE_URL']; - return url ? `${url}/v1` : ''; + return url ?? ''; } if (provider.name === 'claude-code') { const url = process.env['CLAUDE_CODE_URL']; - return url ? `${url}/v1` : ''; + return url ?? ''; } if (provider.name === 'openai-bridge') { const url = process.env['OPENAI_BRIDGE_URL']; @@ -257,7 +290,19 @@ function getBaseUrl(provider: ExternalProvider): string { } if (provider.name === 'copilot-bridge') { const url = process.env['COPILOT_BRIDGE_URL']; - return url ? `${url}` : ''; + return url ? `${url}/v1` : ''; + } + if (provider.name === 'm365-copilot-bridge') { + const url = process.env['M365_COPILOT_BRIDGE_URL']; + return url ? `${url}/v1` : ''; + } + if (provider.name === 'openai-codex') { + const url = process.env['OPENAI_CODEX_URL'] || process.env['CODEX_BRIDGE_URL']; + return url ? `${url}/v1` : provider.baseUrl; + } + if (provider.name === 'codex') { + const url = process.env['CODEX_BRIDGE_URL'] || process.env['OPENAI_CODEX_URL']; + return url ? `${url}/v1` : provider.baseUrl; } if (provider.name === 'cloudflare') { const accountId = process.env['CLOUDFLARE_ACCOUNT_ID']; @@ -271,6 +316,11 @@ export function getAvailableProviders(): readonly ExternalProvider[] { return PROVIDERS.filter((p) => p.enabled && getApiKey(p)); } +/** Returns ALL configured providers (enabled or not, with or without API key). For dashboard listing. 
*/ +export function getAllProviders(): readonly ExternalProvider[] { + return PROVIDERS; +} + function findBestModel( provider: ExternalProvider, targetTier: 'fast' | 'medium' | 'large' | 'reasoning', @@ -296,7 +346,11 @@ function findBestModel( function buildRequestHeaders(provider: ExternalProvider, apiKey: string): Record { const headers: Record = { 'Content-Type': 'application/json' }; - if (!['claude-bridge', 'claude-code', 'openai-bridge', 'chatgpt-bridge', 'copilot-bridge'].includes(provider.name)) { + const usesAuthlessBridge = AUTHLESS_BRIDGE_PROVIDERS.has(provider.name) + || (provider.name === 'openai-codex' && !!(process.env['OPENAI_CODEX_URL'] || process.env['CODEX_BRIDGE_URL'])) + || (provider.name === 'codex' && !!(process.env['CODEX_BRIDGE_URL'] || process.env['OPENAI_CODEX_URL'])); + + if (!usesAuthlessBridge) { headers['Authorization'] = `Bearer ${apiKey}`; } return headers; @@ -311,13 +365,29 @@ function buildRequestPayload(model: ExternalModel, request: ExternalCompletionRe }; } +function buildGenerateBridgePayload(model: ExternalModel, request: ExternalCompletionRequest): Record { + const system = request.messages.find((m) => m.role === 'system')?.content; + const prompt = request.messages + .filter((m) => m.role !== 'system') + .map((m) => `${m.role}: ${m.content}`) + .join('\n\n'); + + return { + model: model.id, + prompt, + system, + temperature: request.temperature ?? 0.3, + max_tokens: request.max_tokens ?? 2048, + }; +} + function parseExternalResponse( data: any, model: ExternalModel, provider: ExternalProvider, start: number, ): ExternalCompletionResponse { - const content = data.choices?.[0]?.message?.content ?? ''; + const content = data.choices?.[0]?.message?.content ?? data.content ?? data.response ?? data.message?.content ?? 
''; recordRequest(provider.name); return { response: content, @@ -341,14 +411,15 @@ async function callProvider( const baseUrl = getBaseUrl(provider); if (!baseUrl) throw new Error(`No base URL for ${provider.name}`); - const url = `${baseUrl}/chat/completions`; + const generateBridge = GENERATE_BRIDGE_PROVIDERS.has(provider.name); + const url = generateBridge ? `${baseUrl}/api/generate` : `${baseUrl}/chat/completions`; const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), timeoutMs); const start = Date.now(); try { const headers = buildRequestHeaders(provider, apiKey); - const payload = buildRequestPayload(model, request); + const payload = generateBridge ? buildGenerateBridgePayload(model, request) : buildRequestPayload(model, request); const response = await fetch(url, { method: 'POST', diff --git a/packages/gateway/src/pipeline/request-scorer.ts b/packages/gateway/src/pipeline/request-scorer.ts index 6f81d25..7998c10 100644 --- a/packages/gateway/src/pipeline/request-scorer.ts +++ b/packages/gateway/src/pipeline/request-scorer.ts @@ -728,6 +728,36 @@ function handleFormalLogicOverride( return result; } +// ── Helper: Code Generation Intent Override ─────────────────────────────── + +const CODE_GENERATION_PATTERNS = [ + /\bwrite\s+(?:a\s+)?(?:typescript|javascript|python|go|rust|react|next\.js|node)?\s*(?:function|class|script|module|component|test|handler|middleware)\b/i, + /\b(?:implement|create|build|generate|scaffold)\b[\s\S]{0,160}\b(?:api|endpoint|function|class|component|service|schema|migration|crud|jwt|test|project|module)\b/i, + /\b(?:rest|graphql)\s+api\b[\s\S]{0,160}\b(?:implement|create|build|endpoint|authentication|jwt)\b/i, +]; + +function handleCodeGenerationOverride( + fullText: string, + input: ScorerInput, + userMessages: readonly WeightedMessage[], +): ScoringResult | null { + if (!CODE_GENERATION_PATTERNS.some((pattern) => pattern.test(fullText))) { + return null; + } + + const dimensions = 
computeAllDimensions(input, userMessages, fullText); + const result: ScoringResult = { + tier: 'code_generation', + score: 0.62, + confidence: 0.86, + reason: 'code generation intent detected', + dimensions, + }; + recordSessionTier('code_generation'); + logger.debug({ tier: 'code_generation', reason: 'code_generation_override' }, 'Request scored via code generation override'); + return result; +} + // ── Helper: Apply Score Overrides ────────────────────────────────────────── interface ScoreOverridesInput { @@ -754,6 +784,7 @@ function applyScoreOverrides( const codeGenDim = dimensions.find((d) => d.name === 'codeGeneration'); if (codeGenDim && codeGenDim.rawScore > 0.25) { tier = 'code_generation'; + confidence = Math.max(confidence, 0.78); reason = 'code generation keywords detected'; } @@ -771,7 +802,7 @@ function applyScoreOverrides( } // Ambiguity check - if (confidence < 0.45) { + if (confidence < 0.45 && tier !== 'code_generation' && tier !== 'reasoning') { tier = 'medium'; reason = 'ambiguous (confidence < 0.45, defaulting to medium)'; } @@ -795,6 +826,9 @@ export function scoreRequest( const formalLogicResult = handleFormalLogicOverride(fullText, input, userMessages); if (formalLogicResult) return formalLogicResult; + const codeGenerationResult = handleCodeGenerationOverride(fullText, input, userMessages); + if (codeGenerationResult) return codeGenerationResult; + const dimensions = computeAllDimensions(input, userMessages, fullText); let rawScore = 0; for (const dim of dimensions) { diff --git a/packages/gateway/src/pipeline/router.ts b/packages/gateway/src/pipeline/router.ts index 6b03d6e..0f76eee 100644 --- a/packages/gateway/src/pipeline/router.ts +++ b/packages/gateway/src/pipeline/router.ts @@ -184,14 +184,14 @@ export function getOllamaBaseUrl(): string { /** * Maps a scorer tier to the best primary model and its fallback chain. * The 'reasoning' tier uses llama3.3:70b (complex_reasoning strength) from the large tier. 
- * The 'code_generation' tier uses OpenAI Codex (gpt-4-turbo) as primary via external provider. + * The 'code_generation' tier uses OpenAI Codex as primary via external provider. */ const TIER_MODEL_MAP: Record = { fast: { primary: 'qwen2.5:3b', configTier: 'fast' }, medium: { primary: 'qwen2.5:14b', configTier: 'medium' }, large: { primary: 'qwen2.5:32b', configTier: 'large' }, reasoning: { primary: 'llama3.3:70b', configTier: 'large' }, - code_generation: { primary: 'gpt-4-turbo', configTier: 'large', provider: 'openai-codex' }, + code_generation: { primary: 'gpt-5.1-codex-mini', configTier: 'large', provider: 'openai-codex' }, }; function buildMediumTierFallback( @@ -223,7 +223,8 @@ function buildScoredFallbackChain( models: ModelsYaml, ): string[] { if (tier === 'reasoning' || tier === 'code_generation') { - return [selectedModel, ...buildFallbackChain(selectedModel, configTier, models).filter((m) => m !== selectedModel)]; + const fallbackTier = tier === 'code_generation' ? 'code_generation' : configTier; + return [selectedModel, ...buildFallbackChain(selectedModel, fallbackTier, models).filter((m) => m !== selectedModel)]; } return buildFallbackChain(selectedModel, configTier, models); } @@ -302,7 +303,7 @@ export function routeByScore( const mapping = TIER_MODEL_MAP[scoringResult.tier]; const selectedModel = mapping.primary; const configTier = mapping.configTier; - const tierConfig = models.tiers[configTier]; + const tierConfig = models.tiers[scoringResult.tier] ?? 
models.tiers[configTier]; if (!tierConfig) { logger.error({ tier: configTier }, 'Tier config not found in models.yaml, falling back to medium'); diff --git a/packages/gateway/src/routes/completion.ts b/packages/gateway/src/routes/completion.ts index 95f1946..ad1d2ec 100644 --- a/packages/gateway/src/routes/completion.ts +++ b/packages/gateway/src/routes/completion.ts @@ -1,12 +1,32 @@ import type { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify'; import { z } from 'zod'; +import yaml from 'js-yaml'; +import { existsSync, readFileSync } from 'fs'; +import { dirname, join } from 'path'; +import { fileURLToPath } from 'url'; import { classifyInput } from '../pipeline/pre-classifier.js'; import { route } from '../pipeline/router.js'; +import { detectCaller } from '../modules/caller-detection.js'; +import { + computeCacheKey, + getCachedResponse, + getSemanticCachedResponse, + setCachedResponse, + recordCacheHit, +} from '../modules/response-cache.js'; +import { + applyPoolRouting, + modelToSubscriptionId, + recordSubscriptionUsage, +} from '../modules/subscription-wallet.js'; +import { runRace, auditRaceResults, type RaceStrategy, type RaceCandidateResult } from '../modules/race-mode.js'; import { resolvePrompt } from '../pipeline/prompt-resolver.js'; +import { getAllProviders } from '../pipeline/external-providers.js'; import { callOllamaWithFallbackChainInstrumented, callExternalProviderPrimaryInstrumented, } from '../pipeline/instrumented-llm-client.js'; +import { callOllama } from '../pipeline/llm-client.js'; import { runPostValidation } from '../pipeline/post-validator.js'; import { evaluateConfidence } from '../pipeline/confidence-gate.js'; import { writeAuditLog, writeBanAnalytics, hashText } from '../observability/audit-log.js'; @@ -21,18 +41,35 @@ import { validationFailuresTotal, } from '../observability/metrics.js'; import { logger } from '../observability/logger.js'; -// import { ShieldX } from '@shieldx/core'; // TODO: Link @shieldx/core 
properly import { calculateCost, calculateSavings, calculateCompressionRatio } from '../observability/cost-calculator.js'; -import { logCostImpact } from '../utils/tokenvault-hooks.js'; +import { logCompressionMetric, logCostImpact } from '../utils/tokenvault-hooks.js'; import { costStream } from '../observability/cost-stream.js'; import { recordRoutingDecision, trackFallbackChain } from '../observability/routing-instrumentation.js'; import { createRequestLogger } from '../modules/request-logger.js'; +import { compressContext, type CompressionResult } from '../modules/context-compressor.js'; +import { + scanForInjection, + decideAction, + llmJudge, + getInjectionMode, + isCallerExempt, + type InjectionScanResult, +} from '../modules/injection-defense.js'; +import { + redactPii, + restorePii, + getRedactMode, + shouldRedactFor, +} from '../modules/pii-redaction.js'; +import { splitReasoningTrace, storeReasoningTrace } from '../modules/reasoning-trace.js'; +import { getRoutingOverride } from '../modules/workspace-presets.js'; +import { runPreComplete, runPostComplete } from '../modules/plugin-system.js'; +import { getAdaptiveRecommendation } from '../modules/adaptive-routing.js'; +import { guardOutputStream, getOutputDefenseMode } from '../modules/output-defense.js'; +import { callPromptGuard, isPromptGuardConfigured, getPromptGuardThreshold, getPromptGuardMinLen } from '../modules/prompt-guard-client.js'; -// TODO: ShieldX — Link @shieldx/core properly -// // Singleton ShieldX instance — initialized once, sub-millisecond scans // // Disable Ollama-dependent scanners (sentinel, constitutional, embedding, attention) // // to keep gateway scans fast and dependency-free -// const shieldx = new ShieldX({ // scanners: { // rules: true, // 547+ rules, 50+ languages // sentinel: false, // Requires Ollama @@ -66,22 +103,138 @@ const CompletionRequestSchema = z.object({ temperature: z.number().min(0).max(2).optional(), max_tokens: 
z.number().int().positive().max(16_384).optional(), return_validation_details: z.boolean().optional(), + skip_cache: z.boolean().optional(), + fuzzy_cache: z.boolean().optional(), + fuzzy_threshold: z.number().min(0.5).max(1).optional(), + cache_ttl: z.number().int().positive().optional(), + compression: z + .object({ + enabled: z.boolean().optional(), + mode: z.enum(['auto', 'off', 'aggressive']).optional(), + target_tokens: z.number().int().positive().max(64_000).optional(), + }) + .optional(), }) .optional(), }); type CompletionRequest = z.infer; -// TODO: Enable when ShieldX dependency is linked -// const SKIP_SHIELDX_CALLERS = new Set(['internal', 'shieldx']); +function shouldBypassResponseCache(caller: string): boolean { + const normalized = caller.toLowerCase(); + return normalized.includes('claude-code') + || normalized.includes('codex') + || normalized.includes('copilot'); +} + +function inputForPromptGuard(input: string): string { + const cleaned = input.replace(/^(user|assistant|system|developer):\s*/gim, '').trim(); + return cleaned || input; +} + +function shouldRunPromptGuard(input: string, scan: InjectionScanResult): boolean { + if (scan.matches.length > 0) return true; + + const cleaned = inputForPromptGuard(input).normalize('NFKC'); + return [ + /\b(?:ignore|disregard|forget|override|bypass|jailbreak)\b[\s\S]{0,120}\b(?:instructions?|rules?|prompt|policy|safety)\b/i, + /\b(?:you\s+are\s+now|act\s+as|pretend\s+to\s+be|developer\s+mode|root\s+administrator|runtime\s+controller|security\s+auditor)\b/i, + /\b(?:show|print|dump|reveal|output)\b[\s\S]{0,160}\b(?:system\s+prompt|developer\s+prompt|hidden|runtime|memory|tools?|filters?|policy|classifier|chain-of-thought|reasoning)\b/i, + /\b(?:passwords?|passw(?:o|ö)rter|credentials?|api\s*keys?|tokens?|secrets?)\b[\s\S]{0,160}\b(?:print|show|write|paste|send|share|reveal|chat|anmelden|log\s*in)\b/i, + /\b(?:base64|rot13|hex\s+encoded|decode|execute|run\s+this)\b/i, + 
/[\u200B-\u200F\u202A-\u202E\u2060-\u2064\uFEFF]/, + /\b[A-Za-z0-9+/]{40,}={0,2}\b/, + /\b(?:[0-9a-fA-F]{2}){16,}\b/, + ].some((pattern) => pattern.test(cleaned)); +} + +const ChatMessageSchema = z.object({ + role: z.string().min(1), + content: z.union([z.string(), z.array(z.unknown()), z.null()]).optional(), +}); + +// Tool / function-calling shape (OpenAI Chat Completions tools API). +// We accept and forward tool definitions transparently to the upstream. +const ToolFunctionSchema = z.object({ + name: z.string().min(1), + description: z.string().optional(), + parameters: z.record(z.unknown()).optional(), +}); +const ToolSchema = z.object({ + type: z.literal('function'), + function: ToolFunctionSchema, +}); + +const OpenAIChatCompletionRequestSchema = z.object({ + model: z.string().min(1).default('llm-gateway-auto'), + messages: z.array(ChatMessageSchema).min(1), + temperature: z.number().min(0).max(2).optional(), + max_tokens: z.number().int().positive().max(16_384).optional(), + stream: z.boolean().optional(), + user: z.string().optional(), + // Tool / function-calling pass-through + tools: z.array(ToolSchema).optional(), + tool_choice: z.union([ + z.literal('auto'), + z.literal('none'), + z.literal('required'), + z.object({ type: z.literal('function'), function: z.object({ name: z.string() }) }), + ]).optional(), + // Legacy function-calling (still supported by many clients) + functions: z.array(ToolFunctionSchema).optional(), + function_call: z.union([z.string(), z.object({ name: z.string() })]).optional(), + // Response format (json_object, json_schema) + response_format: z.object({ + type: z.enum(['text', 'json_object', 'json_schema']), + json_schema: z.record(z.unknown()).optional(), + }).optional(), + // Vision: messages already accept array content via ChatMessageSchema's z.array(z.unknown()) +}); + +type OpenAIChatCompletionRequest = z.infer; + +// ─── Anthropic Messages API compat ─────────────────────────────────────────── +const 
AnthropicMessageSchema = z.object({ + role: z.enum(['user', 'assistant']), + content: z.union([z.string(), z.array(z.unknown())]), +}); + +const AnthropicMessagesRequestSchema = z.object({ + model: z.string().min(1).default('llm-gateway-auto'), + messages: z.array(AnthropicMessageSchema).min(1), + system: z.union([z.string(), z.array(z.unknown())]).optional(), + max_tokens: z.number().int().positive().max(16_384).default(1024), + temperature: z.number().min(0).max(1).optional(), + top_p: z.number().min(0).max(1).optional(), + stream: z.boolean().optional(), + metadata: z.record(z.string(), z.unknown()).optional(), +}); + +type AnthropicMessagesRequest = z.infer; + +const OpenAIResponsesRequestSchema = z.object({ + model: z.string().min(1).default('llm-gateway-auto'), + input: z.union([z.string(), z.array(z.unknown())]), + instructions: z.string().optional(), + temperature: z.number().min(0).max(2).optional(), + max_output_tokens: z.number().int().positive().max(16_384).optional(), + stream: z.boolean().optional(), + user: z.string().optional(), + metadata: z.record(z.unknown()).optional(), +}); + +type OpenAIResponsesRequest = z.infer; + +interface GatewayCompletionResult { + statusCode: number; + body: Record; +} + -// TODO: Enable when ShieldX dependency is linked -// async function runShieldXScan( // input: string, // caller: string, // ): Promise<{ passed: boolean; reason?: string; threatLevel?: string; phase?: string; latencyMs?: number }> { // try { -// const result = await shieldx.scanInput(input); // // if (result.detected) { // logger.warn({ @@ -93,7 +246,6 @@ type CompletionRequest = z.infer; // ensemble: result.ensemble, // atlasMapping: result.atlasMapping?.techniqueIds?.slice(0, 5), // scannerCount: result.scanResults.length, -// }, 'ShieldX threat detected — input blocked'); // // return { // passed: false, @@ -106,7 +258,6 @@ type CompletionRequest = z.infer; // // return { passed: true, latencyMs: result.latencyMs }; // } catch (err) { -// 
logger.error({ err, caller }, 'ShieldX scan error — failing open'); // return { passed: true }; // } // } @@ -169,7 +320,7 @@ function recordAllMetrics(caller: string, taskType: string, confidenceResult: an } } -async function auditAndTrackCosts(caller: string, taskType: string, input: string, outputText: string, latencyMs: number, ollamaResponse: any, resolved: any, decision: ReturnType, confidenceResult: any, validationOutput: any, classificationResult: any, callId: string): Promise<{ costUsd: number; costSavedUsd: number }> { +async function auditAndTrackCosts(caller: string, taskType: string, input: string, outputText: string, latencyMs: number, ollamaResponse: any, resolved: any, decision: ReturnType, confidenceResult: any, validationOutput: any, classificationResult: any, callId: string, compression?: CompressionResult): Promise<{ costUsd: number; costSavedUsd: number }> { const inputHash = hashText(input); const outputHash = hashText(outputText); @@ -178,7 +329,12 @@ async function auditAndTrackCosts(caller: string, taskType: string, input: strin input_hash: inputHash, output_text: confidenceResult.status !== 'pending_review' ? outputText : undefined, output_hash: outputHash, token_count_in: ollamaResponse.prompt_eval_count ?? 0, token_count_out: ollamaResponse.eval_count ?? 0, latency_ms: latencyMs, confidence: confidenceResult.score, status: confidenceResult.status, validation_log: validationOutput.results, ban_hits: validationOutput.ban_violations, - metadata: { classification: classificationResult, model_tier: decision.tier, fallback_used: ollamaResponse.model !== decision.model }, + metadata: { + classification: classificationResult, + model_tier: decision.tier, + fallback_used: ollamaResponse.model !== decision.model, + compression: compression ? 
buildCompressionResponse(compression) : undefined, + }, }); if (validationOutput.ban_violations.length > 0) { @@ -192,9 +348,20 @@ async function auditAndTrackCosts(caller: string, taskType: string, input: strin const db = getPool(); const tokensIn = ollamaResponse.prompt_eval_count ?? 0; const tokensOut = ollamaResponse.eval_count ?? 0; - const tokensCompressed = tokensIn + tokensOut; + const tokensCompressed = (compression?.tokensAfter ?? tokensIn) + tokensOut; const costUsd = calculateCost(decision.model, tokensIn, tokensOut); - const costSavedUsd = calculateSavings(decision.model, tokensCompressed, tokensCompressed); + const costSavedUsd = compression?.applied + ? calculateSavings(decision.model, compression.tokensBefore, compression.tokensAfter) + : 0; + + void logCompressionMetric(db, { + filePath: callId, + mode: compression ? `${compression.method}:${compression.strategy}` : 'none:none', + tokensBefore: compression?.tokensBefore ?? tokensIn, + tokensAfter: compression?.tokensAfter ?? tokensIn, + savingsPct: compression ? Math.round(compression.ratio * 10000) / 100 : 0, + toolUsed: 'gateway', + }); void logCostImpact(db, callId, { callId, agent: 'gateway', model: decision.model, project: 'llm-gateway', taskType: taskType ?? 
'generic' }, tokensIn, tokensOut, tokensCompressed, costUsd, costSavedUsd, confidenceResult.score); @@ -228,7 +395,920 @@ function buildResponseBody(callId: string, decision: ReturnType, t return body; } +async function executeCompletion(body: CompletionRequest, startMs: number, callId: string): Promise { + const { caller, language, context, options } = body; + + // ─── Plugin pre-hooks (PLUGINS_DIR) ──────────────────────────────────── + try { + const preResult = await runPreComplete({ caller, callId, request: body as unknown as Record }); + if (preResult === null) { + return { statusCode: 422, body: { error: 'plugin_aborted', message: 'Request aborted by plugin pre-hook' } }; + } + if (preResult && typeof preResult === 'object') { + Object.assign(body as unknown as Record, preResult); + } + } catch (err) { + logger.warn({ err }, 'Plugin preComplete failed; continuing'); + } + + // ─── PII Redaction (REDACT_PII_MODE: off|cloud_only|always) ───────────── + const redactMode = getRedactMode(); + let piiRestoreMap: Map | null = null; + if (redactMode !== 'off' && shouldRedactFor(redactMode, 'unknown', caller)) { + const r = redactPii(body.input); + if (r.restoreMap.size > 0) { + body = { ...body, input: r.redacted }; + piiRestoreMap = r.restoreMap; + logger.info( + { callId, caller, redactedCounts: r.counts, redactedTokens: r.restoreMap.size }, + 'PII redaction applied', + ); + } + } + + // ─── Prompt-injection defense (configurable via INJECTION_DEFENSE_MODE) ── + const injectionMode = getInjectionMode(); + let injectionScan: InjectionScanResult | null = null; + if (injectionMode !== 'off' && !isCallerExempt(caller)) { + injectionScan = scanForInjection(body.input); + const action = decideAction(injectionMode, injectionScan); + if (action === 'block') { + logger.warn( + { caller, callId, score: injectionScan.score, matches: injectionScan.matches.map((m) => m.id) }, + 'Injection defense blocked request', + ); + return { + statusCode: 422, + body: { + error: 
'injection_detected', + message: 'Request blocked by prompt-injection defense layer', + score: injectionScan.score, + matches: injectionScan.matches, + }, + }; + } + + // ─── Layer 2: ML classifier (Prompt-Guard sidecar) ──────────────────── + if (!injectionScan.detected && isPromptGuardConfigured() && body.input.length >= getPromptGuardMinLen() && shouldRunPromptGuard(body.input, injectionScan)) { + const pg = await callPromptGuard(inputForPromptGuard(body.input)); + if (pg.available && pg.label === 'INJECTION' && pg.score >= getPromptGuardThreshold()) { + logger.warn( + { caller, callId, pg_score: pg.score, pg_latency_ms: pg.latencyMs }, + 'Prompt-Guard sidecar blocked request', + ); + return { + statusCode: 422, + body: { + error: 'injection_detected', + message: 'Request blocked by prompt-guard ML classifier', + prompt_guard: { label: pg.label, score: pg.score, latencyMs: pg.latencyMs }, + }, + }; + } + } + + if (action === 'llm_judge') { + try { + const verdict = await llmJudge(body.input, { + model: process.env['LLM_JUDGE_MODEL'] || 'qwen2.5:3b', + callLLM: async (req) => { + const resp = await callOllama( + { model: req.model, prompt: req.prompt, system: req.system, stream: false, options: { temperature: 0, num_predict: 8, ...(req.options ?? 
{}) } }, + 'fast', + ); + return { response: resp.response }; + }, + }); + if (verdict.verdict === 'injection') { + return { + statusCode: 422, + body: { + error: 'injection_detected', + message: 'Request blocked by LLM-judge verdict', + score: injectionScan.score, + llm_judge: verdict, + matches: injectionScan.matches, + }, + }; + } + } catch (err) { + logger.warn({ err }, 'Injection LLM-judge failed; allowing through with warning'); + } + } + // action === 'warn' or 'allow' falls through; metadata is recorded later + } + + // ─── Cache check (Tier 1: exact-match hash lookup) ───────────────────── + const agenticNoCache = shouldBypassResponseCache(caller); + const skipCache = agenticNoCache || (options as any)?.skip_cache === true; + const cacheableReq = { + caller, + task_type: body.task_type, + model: options?.model, + system: typeof context === 'object' && context && 'system' in context ? String((context as any).system ?? '') : '', + input: body.input, + }; + const cacheKey = computeCacheKey(cacheableReq); + const fuzzyEnabled = !agenticNoCache && (options as any)?.fuzzy_cache !== false; // default ON + const fuzzyThreshold = typeof (options as any)?.fuzzy_threshold === 'number' + ? 
Math.max(0.5, Math.min(1.0, (options as any).fuzzy_threshold)) + : 0.85; // empirically good default for nomic-embed-text — paraphrases hit, unrelated misses + if (!skipCache) { + const dbForCache = getPool(); + let hit = await getCachedResponse(dbForCache, cacheKey); + let matchType: 'exact' | 'semantic' = 'exact'; + let similarity: number | undefined; + + // Fall through to semantic match when exact misses + if (!hit && fuzzyEnabled) { + const semHit = await getSemanticCachedResponse( + dbForCache, + caller, + body.task_type, + body.input, + fuzzyThreshold + ); + if (semHit) { + hit = semHit; + matchType = 'semantic'; + similarity = semHit.similarity; + } + } + if (hit) { + const latencyMs = Date.now() - startMs; + void recordCacheHit(dbForCache, hit.id); + // Log cache hit as a successful request (status=approved, fallback=false) + const requestLogger = createRequestLogger(dbForCache); + void requestLogger.logRequest( + callId, + caller, + body.task_type, + (hit.responseJson['model'] as string) ?? 'cache', + 'approved', + hit.tokensIn, + hit.tokensOut, + 0, // zero cost for cache hit + latencyMs, + (hit.responseJson['confidence'] as number) ?? 10, + false, + undefined + ); + logger.info( + { callId, caller, matchType, similarity, ageSeconds: hit.ageSeconds, hitCount: hit.hitCount + 1, costSaved: hit.costWhenCached }, + `Cache HIT (${matchType}) — skipping pipeline` + ); + return { + statusCode: 200, + body: { + ...hit.responseJson, + id: callId, // refresh id so callers can deduplicate logs + cache: { + hit: true, + match_type: matchType, + similarity: similarity ?? 
null, + age_seconds: hit.ageSeconds, + hit_count: hit.hitCount + 1, + cost_saved_usd: hit.costWhenCached, + tokens_saved: hit.tokensIn + hit.tokensOut, + }, + latency_ms: latencyMs, + } as Record, + }; + } + } + + const compression = compressContext(body.input, { + enabled: options?.compression?.enabled, + mode: options?.compression?.mode, + targetTokens: options?.compression?.target_tokens, + }); + const input = compression.input; + + let classifAndRoute; + try { + classifAndRoute = await classifyAndRoute(body.task_type, caller, input, options); + } catch (err) { + return { + statusCode: 400, + body: { + statusCode: 400, error: 'Routing Error', + message: err instanceof Error ? err.message : 'Failed to route request', + }, + }; + } + + const { taskType, decision, classificationResult } = classifAndRoute; + + // ─── Pool Routing: re-route to the subscription with most headroom ───── + let poolRouteApplied: string | null = null; + try { + const adjusted = await applyPoolRouting(getPool(), { + model: decision.model, + fallback_chain: decision.fallback_chain, + tier: decision.tier, + }); + if (adjusted) { + logger.info({ callId, original: decision.model, switched: adjusted.model, reason: adjusted.reason }, 'Pool routing engaged'); + decision.model = adjusted.model; + decision.fallback_chain = adjusted.fallback_chain; + poolRouteApplied = adjusted.reason; + } + } catch (poolErr) { + logger.debug({ poolErr }, 'pool routing skipped'); + } + + const promptVars = buildPromptVariables(input, context); + const resolved = resolvePrompt(taskType ?? decision.prompt_template, promptVars, language ?? 'en'); + + const format: '' | 'json' | undefined = decision.output_format === 'json' ? 
'json' : ''; + const baseReq = { model: decision.model, prompt: resolved.prompt, system: resolved.system, options: { temperature: decision.temperature, num_predict: decision.max_tokens }, format, stream: false, callId, taskType }; + + let ollamaResponse; + try { + ollamaResponse = await callLLMWithFallback(baseReq, decision, callId, taskType); + } catch (err) { + const latency = Date.now() - startMs; + logger.error({ err, caller, taskType }, 'LLM call failed'); + requestsTotal.labels({ caller, task_type: taskType, status: 'rejected' }).inc(); + latencySeconds.labels({ caller, task_type: taskType, model: decision.model }).observe(latency / 1000); + const db = getPool(); + const requestLogger = createRequestLogger(db); + void requestLogger.logRequest(callId, caller, taskType, decision.model, 'error', 0, 0, 0, latency, 0, false, err instanceof Error ? err.message : 'LLM service unavailable'); + return { statusCode: 503, body: { statusCode: 503, error: 'Service Unavailable', message: 'LLM service unavailable, please retry' } }; + } + + const latencyMs = Date.now() - startMs; + const outputText = ollamaResponse.response; + const validationOutput = await runPostValidation(outputText, { validators: decision.validators, language, output_format: decision.output_format, requires_fact_check: decision.requires_fact_check, schema: resolved.schema }); + const confidenceResult = evaluateConfidence(validationOutput); + + recordAllMetrics(caller, taskType, confidenceResult, ollamaResponse, decision, validationOutput); + const { costUsd, costSavedUsd } = await auditAndTrackCosts(caller, taskType, compression.originalInput, outputText, latencyMs, ollamaResponse, resolved, decision, confidenceResult, validationOutput, classificationResult, callId, compression); + + latencySeconds.labels({ caller, task_type: taskType, model: ollamaResponse.model ?? 
decision.model }).observe(latencyMs / 1000); + + // ─── Record subscription usage for the wallet ──────────────────────── + const usedModel = ollamaResponse.model ?? decision.model; + const subscriptionId = modelToSubscriptionId(usedModel); + if (subscriptionId) { + void recordSubscriptionUsage(getPool(), subscriptionId, (ollamaResponse.eval_count ?? 0) + (ollamaResponse.prompt_eval_count ?? 0)); + } + + const responseBody = { + ...buildResponseBody(callId, decision, taskType, confidenceResult, outputText, latencyMs, ollamaResponse, costUsd, costSavedUsd, options?.return_validation_details ?? false, validationOutput), + compression: buildCompressionResponse(compression), + ...(poolRouteApplied ? { pool_route: { applied: true, reason: poolRouteApplied } } : {}), + }; + + // ─── Cache write — only successful, validated responses are cached ────── + // Skip caching when: + // • caller explicitly opted out via options.skip_cache + // • response was rejected/pending review (don't cache bad answers) + // • non-deterministic temperature (>0.5) was set (would poison the cache) + const tempUsed = decision.temperature ?? 0.3; + const shouldCache = !skipCache && confidenceResult.status === 'approved' && tempUsed <= 0.5; + if (shouldCache) { + const tokensIn = ollamaResponse.prompt_eval_count ?? 0; + const tokensOut = ollamaResponse.eval_count ?? 0; + void setCachedResponse(getPool(), cacheableReq, responseBody, { + cost: costUsd, + tokensIn, + tokensOut, + ttlSeconds: typeof (options as any)?.cache_ttl === 'number' ? 
(options as any).cache_ttl : 86_400, + }); + } + + return { statusCode: 200, body: responseBody }; +} + +function buildCompressionResponse(compression: CompressionResult): Record { + return { + applied: compression.applied, + method: compression.method, + tokens_before: compression.tokensBefore, + tokens_after: compression.tokensAfter, + tokens_saved: compression.tokensSaved, + ratio: Math.round(compression.ratio * 1000) / 1000, + strategy: compression.strategy, + tags: compression.tags, + notes: compression.notes, + }; +} + +function contentToText(content: OpenAIChatCompletionRequest['messages'][number]['content']): string { + if (typeof content === 'string') return content; + if (!Array.isArray(content)) return ''; + return content.map((part) => { + if (typeof part === 'string') return part; + if (part && typeof part === 'object' && 'text' in part && typeof (part as any).text === 'string') { + return (part as any).text; + } + return ''; + }).filter(Boolean).join('\n'); +} + +function responsesInputToText(input: OpenAIResponsesRequest['input']): string { + if (typeof input === 'string') return input; + return input.map((item) => { + if (typeof item === 'string') return item; + if (!item || typeof item !== 'object') return ''; + const value = item as any; + if (typeof value.content === 'string') return value.content; + if (Array.isArray(value.content)) { + return value.content.map((part: any) => { + if (typeof part === 'string') return part; + if (part && typeof part === 'object') return part.text || part.input_text || part.output_text || ''; + return ''; + }).filter(Boolean).join('\n'); + } + if (typeof value.text === 'string') return value.text; + return ''; + }).filter(Boolean).join('\n\n'); +} + +function openAIRequestToGatewayRequest(body: OpenAIChatCompletionRequest, request: FastifyRequest): CompletionRequest { + // Use layered caller-detection (header → companion → body → user-agent → fallback) + const { caller } = detectCaller(request, 'openai-compatible', 
body.user); + + const input = body.messages + .filter((message) => message.role !== 'system') + .map((message) => `${message.role}: ${contentToText(message.content)}`) + .join('\n\n') + .trim(); + + const system = body.messages + .filter((message) => message.role === 'system') + .map((message) => contentToText(message.content)) + .filter(Boolean) + .join('\n\n'); + + const model = ['auto', 'llm-gateway-auto', 'gateway-auto'].includes(body.model) ? undefined : body.model; + const agenticNoCache = shouldBypassResponseCache(caller); + + return { + caller, + task_type: 'generic_qa', + input: input || contentToText(body.messages[body.messages.length - 1]?.content), + context: system ? { system } : undefined, + options: { + model, + temperature: body.temperature, + max_tokens: body.max_tokens, + skip_cache: agenticNoCache, + fuzzy_cache: !agenticNoCache, + compression: { enabled: true, mode: 'auto' }, + }, + }; +} + +function responsesRequestToGatewayRequest(body: OpenAIResponsesRequest, request: FastifyRequest): CompletionRequest { + const metadataCaller = typeof body.metadata?.['caller'] === 'string' ? String(body.metadata['caller']) : undefined; + const { caller } = detectCaller(request, 'responses-compatible', body.user || metadataCaller); + const model = ['auto', 'llm-gateway-auto', 'gateway-auto'].includes(body.model) ? undefined : body.model; + const agenticNoCache = shouldBypassResponseCache(caller); + + return { + caller, + task_type: 'generic_qa', + input: responsesInputToText(body.input), + context: body.instructions ? 
{ system: body.instructions } : undefined, + options: { + model, + temperature: body.temperature, + max_tokens: body.max_output_tokens, + skip_cache: agenticNoCache, + fuzzy_cache: !agenticNoCache, + compression: { enabled: true, mode: 'auto' }, + }, + }; +} + +// ─── Anthropic Messages API mappers ───────────────────────────────────────── +function anthropicContentToText(content: unknown): string { + if (typeof content === 'string') return content; + if (Array.isArray(content)) { + return content + .map((block: unknown) => { + if (typeof block === 'string') return block; + if (block && typeof block === 'object') { + const b = block as Record; + if (typeof b['text'] === 'string') return b['text']; + } + return ''; + }) + .filter(Boolean) + .join('\n'); + } + return ''; +} + +function anthropicRequestToGatewayRequest(body: AnthropicMessagesRequest, request: FastifyRequest): CompletionRequest { + const metadataUser = typeof body.metadata?.['user_id'] === 'string' ? String(body.metadata['user_id']) : undefined; + const { caller } = detectCaller(request, 'anthropic-compatible', metadataUser); + + const input = body.messages + .map((m) => `${m.role}: ${anthropicContentToText(m.content)}`) + .join('\n\n') + .trim(); + + const system = body.system ? anthropicContentToText(body.system) : ''; + const model = ['auto', 'llm-gateway-auto', 'gateway-auto'].includes(body.model) ? undefined : body.model; + const agenticNoCache = shouldBypassResponseCache(caller); + + return { + caller, + task_type: 'generic_qa', + input: input || anthropicContentToText(body.messages[body.messages.length - 1]?.content), + context: system ? 
{ system } : undefined, + options: { + model, + temperature: body.temperature, + max_tokens: body.max_tokens, + skip_cache: agenticNoCache, + fuzzy_cache: !agenticNoCache, + compression: { enabled: true, mode: 'auto' }, + }, + }; +} + +function toAnthropicMessagesResponse(result: Record, requestedModel: string): Record { + const output = typeof result['output'] === 'string' ? result['output'] : ''; + const tokens = result['tokens'] as { in?: number; out?: number } | undefined; + const model = typeof result['model'] === 'string' ? result['model'] : requestedModel; + const stopReason = result['status'] === 'pending_review' ? 'content_filtered' : 'end_turn'; + return { + id: result['id'] ?? `msg_${Date.now()}`, + type: 'message', + role: 'assistant', + model, + content: [{ type: 'text', text: output }], + stop_reason: stopReason, + stop_sequence: null, + usage: { + input_tokens: tokens?.in ?? 0, + output_tokens: tokens?.out ?? 0, + }, + gateway: { + status: result['status'], + confidence: result['confidence'], + cost: result['cost'], + latency_ms: result['latency_ms'], + compression: result['compression'], + }, + }; +} + +function toAnthropicError(result: GatewayCompletionResult): Record { + const message = + (typeof result.body['message'] === 'string' && result.body['message']) || + (typeof result.body['error'] === 'string' && result.body['error']) || + 'Internal error'; + return { + type: 'error', + error: { + type: result.statusCode === 400 ? 'invalid_request_error' : 'api_error', + message, + }, + }; +} + +function toOpenAIChatResponse(result: Record, requestedModel: string): Record { + const output = typeof result['output'] === 'string' ? result['output'] : ''; + const tokens = result['tokens'] as { in?: number; out?: number } | undefined; + const model = typeof result['model'] === 'string' ? result['model'] : requestedModel; + return { + id: result['id'] ?? 
`chatcmpl-${Date.now()}`, + object: 'chat.completion', + created: Math.floor(Date.now() / 1000), + model, + choices: [ + { + index: 0, + message: { role: 'assistant', content: output }, + finish_reason: result['status'] === 'pending_review' ? 'content_filter' : 'stop', + }, + ], + usage: { + prompt_tokens: tokens?.in ?? 0, + completion_tokens: tokens?.out ?? 0, + total_tokens: (tokens?.in ?? 0) + (tokens?.out ?? 0), + }, + gateway: { + status: result['status'], + confidence: result['confidence'], + cost: result['cost'], + latency_ms: result['latency_ms'], + compression: result['compression'], + }, + }; +} + +function toOpenAIResponsesResponse(result: Record, requestedModel: string): Record { + const output = typeof result['output'] === 'string' ? result['output'] : ''; + const tokens = result['tokens'] as { in?: number; out?: number } | undefined; + const model = typeof result['model'] === 'string' ? result['model'] : requestedModel; + const id = String(result['id'] ?? `resp-${Date.now()}`); + return { + id, + object: 'response', + created_at: Math.floor(Date.now() / 1000), + status: 'completed', + model, + output: [ + { + id: `${id}-msg`, + type: 'message', + status: 'completed', + role: 'assistant', + content: [{ type: 'output_text', text: output, annotations: [] }], + }, + ], + output_text: output, + usage: { + input_tokens: tokens?.in ?? 0, + output_tokens: tokens?.out ?? 0, + total_tokens: (tokens?.in ?? 0) + (tokens?.out ?? 0), + }, + gateway: { + status: result['status'], + confidence: result['confidence'], + cost: result['cost'], + latency_ms: result['latency_ms'], + compression: result['compression'], + }, + }; +} + +/** + * Stream a non-streaming gateway response back to the client as + * OpenAI-compatible Server-Sent Events. Chunks the assistant content + * by ~32-char windows so SDKs that drive UIs see progressive output. 
+ * + * Real upstream streaming (token-by-token from Ollama) is wired through + * separately for providers that natively support stream=true; this helper + * is the fallback path for the unified completion pipeline. + */ +const STREAM_CONTENT_STEP = 32; + +async function* iterateContentChunks(content: string, step: number): AsyncGenerator { + for (let i = 0; i < content.length; i += step) { + yield content.slice(i, i + step); + } +} + +async function streamOpenAIChatResponse(reply: FastifyReply, response: Record): Promise { + const choices = (response['choices'] as Array>) ?? []; + const message = (choices[0]?.['message'] as Record) ?? {}; + const content = typeof message['content'] === 'string' ? (message['content'] as string) : ''; + const toolCalls = message['tool_calls']; + const id = String(response['id'] ?? `chatcmpl-${Date.now()}`); + const created = Number(response['created'] ?? Math.floor(Date.now() / 1000)); + const model = String(response['model'] ?? 'llm-gateway-auto'); + + reply.raw.writeHead(200, { + 'Content-Type': 'text/event-stream; charset=utf-8', + 'Cache-Control': 'no-cache', + 'Connection': 'keep-alive', + 'X-Accel-Buffering': 'no', + }); + + const writeChunk = (delta: Record, finishReason: string | null = null): void => { + const chunk = { + id, + object: 'chat.completion.chunk', + created, + model, + choices: [{ index: 0, delta, finish_reason: finishReason }], + }; + reply.raw.write(`data: ${JSON.stringify(chunk)}\n\n`); + }; + + // 1) initial role chunk + writeChunk({ role: 'assistant' }); + + // 2) content chunks — piped through output-defense guard so secret leaks + // or sysprompt echoes can be cut/tagged mid-stream (see modules/output-defense.ts). + // When OUTPUT_DEFENSE_MODE=off (default), guardOutputStream is a transparent passthrough. 
+ if (content) { + const defenseMode = getOutputDefenseMode(); + const upstream = iterateContentChunks(content, STREAM_CONTENT_STEP); + const guarded = guardOutputStream(upstream, { + mode: defenseMode, + onDetect: (result) => { + logger.warn( + { matches: result.matches, score: result.score, id, model, mode: defenseMode }, + 'Output-defense triggered on streaming response', + ); + }, + }); + for await (const chunk of guarded) { + writeChunk({ content: chunk }); + } + } + + // 3) tool_calls (if present) — flush as a single delta with the full structure + if (Array.isArray(toolCalls) && toolCalls.length > 0) { + writeChunk({ tool_calls: toolCalls }); + } + + // 4) finish marker + DONE sentinel + writeChunk({}, 'stop'); + reply.raw.write('data: [DONE]\n\n'); + reply.raw.end(); + return reply; +} + +function toOpenAIError(result: GatewayCompletionResult): Record { + return { + error: { + message: String(result.body['message'] ?? result.body['error'] ?? 'Gateway request failed'), + type: String(result.body['error'] ?? 'gateway_error').toLowerCase().replace(/\s+/g, '_'), + code: result.statusCode, + }, + }; +} + +function listGatewayModels(): Record { + const ids = new Set(['llm-gateway-auto']); + + for (const provider of getAllProviders()) { + for (const model of provider.models) ids.add(model.id); + } + + try { + const __filename = fileURLToPath(import.meta.url); + const __dirname = dirname(__filename); + const yamlPath = join(__dirname, '..', 'config', 'models.yaml'); + if (existsSync(yamlPath)) { + const cfg: any = yaml.load(readFileSync(yamlPath, 'utf-8')); + for (const id of Object.keys(cfg.models ?? {})) ids.add(id); + } + } catch (err) { + logger.warn({ err }, 'Failed to load local model list for /v1/models'); + } + + return { + object: 'list', + data: [...ids].sort().map((id) => ({ + id, + object: 'model', + created: 0, + owned_by: id === 'llm-gateway-auto' ? 
'llm-gateway' : 'gateway-provider', + })), + }; +} + export async function completionRoute(fastify: FastifyInstance): Promise { + fastify.get('/models', async (_request: FastifyRequest, reply: FastifyReply) => { + return reply.send(listGatewayModels()); + }); + + fastify.post('/chat/completions', { config: { rateLimit: false } }, async (request: FastifyRequest, reply: FastifyReply) => { + const startMs = Date.now(); + const parsed = OpenAIChatCompletionRequestSchema.safeParse(request.body); + if (!parsed.success) { + return reply.status(400).send({ + error: { + message: parsed.error.errors[0]?.message ?? 'Invalid chat completion request', + type: 'invalid_request_error', + code: 400, + }, + }); + } + + const callId = `chatcmpl-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`; + const gatewayRequest = openAIRequestToGatewayRequest(parsed.data, request); + const result = await executeCompletion(gatewayRequest, startMs, callId); + + if (result.statusCode !== 200) { + return reply.status(result.statusCode).send(toOpenAIError(result)); + } + + const response = toOpenAIChatResponse(result.body, parsed.data.model); + if (parsed.data.stream) { + return await streamOpenAIChatResponse(reply, response); + } + + return reply.status(200).send(response); + }); + + // Anthropic Messages API compatibility — accept @anthropic-ai/sdk traffic. + fastify.post('/messages', { config: { rateLimit: false } }, async (request: FastifyRequest, reply: FastifyReply) => { + const startMs = Date.now(); + const parsed = AnthropicMessagesRequestSchema.safeParse(request.body); + if (!parsed.success) { + return reply.status(400).send({ + type: 'error', + error: { + type: 'invalid_request_error', + message: parsed.error.errors[0]?.message ?? 
'Invalid messages request', + }, + }); + } + + const callId = `msg_${Date.now()}_${Math.random().toString(36).slice(2, 9)}`; + const gatewayRequest = anthropicRequestToGatewayRequest(parsed.data, request); + const result = await executeCompletion(gatewayRequest, startMs, callId); + + if (result.statusCode !== 200) { + return reply.status(result.statusCode).send(toAnthropicError(result)); + } + + const response = toAnthropicMessagesResponse(result.body, parsed.data.model); + if (parsed.data.stream) { + // Minimal SSE — emit the whole response as a single content_block_delta then message_stop. + const text = (response.content as Array<{ text: string }>)[0]?.text ?? ''; + const lines = [ + `event: message_start\ndata: ${JSON.stringify({ type: 'message_start', message: { ...response, content: [], usage: { input_tokens: (response.usage as any).input_tokens, output_tokens: 0 } } })}`, + `event: content_block_start\ndata: ${JSON.stringify({ type: 'content_block_start', index: 0, content_block: { type: 'text', text: '' } })}`, + `event: content_block_delta\ndata: ${JSON.stringify({ type: 'content_block_delta', index: 0, delta: { type: 'text_delta', text } })}`, + `event: content_block_stop\ndata: ${JSON.stringify({ type: 'content_block_stop', index: 0 })}`, + `event: message_delta\ndata: ${JSON.stringify({ type: 'message_delta', delta: { stop_reason: response.stop_reason, stop_sequence: null }, usage: { output_tokens: (response.usage as any).output_tokens } })}`, + `event: message_stop\ndata: ${JSON.stringify({ type: 'message_stop' })}`, + ]; + return reply + .header('Content-Type', 'text/event-stream; charset=utf-8') + .header('Cache-Control', 'no-cache') + .send(lines.join('\n\n') + '\n\n'); + } + return reply.status(200).send(response); + }); + + fastify.post('/responses', { config: { rateLimit: false } }, async (request: FastifyRequest, reply: FastifyReply) => { + const startMs = Date.now(); + const parsed = OpenAIResponsesRequestSchema.safeParse(request.body); + if 
(!parsed.success) { + return reply.status(400).send({ + error: { + message: parsed.error.errors[0]?.message ?? 'Invalid responses request', + type: 'invalid_request_error', + code: 400, + }, + }); + } + + const callId = `resp-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`; + + // ─── codex-bridge passthrough for gpt-* models ────────────────────── + // Codex.app sends model=gpt-5.5 / gpt-5.1-codex-mini etc. These are + // ChatGPT-subscription models the openai API itself rejects without + // the right auth. Route them straight to the local codex-bridge + // (PM2 process at 127.0.0.1:3253) which speaks codex-cli over OAuth. + if (/^gpt-/i.test(parsed.data.model ?? '')) { + try { + const bridgeUrl = process.env['CODEX_BRIDGE_URL'] ?? 'http://127.0.0.1:3253'; + const inputText = typeof parsed.data.input === 'string' + ? parsed.data.input + : (Array.isArray(parsed.data.input) + ? parsed.data.input.map((p: any) => typeof p?.content === 'string' ? p.content : (Array.isArray(p?.content) ? p.content.map((c: any) => c?.text ?? '').join(' ') : '')).join(' ') : ''); + const upstream = await fetch(`${bridgeUrl}/v1/chat/completions`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + model: parsed.data.model, + messages: [{ role: 'user', content: inputText }], + }), + }); + const upstreamJson: any = await upstream.json(); + if (upstream.ok && upstreamJson?.success !== false) { + const text = upstreamJson?.content ?? upstreamJson?.response ?? upstreamJson?.choices?.[0]?.message?.content ?? ''; + const respBody = toOpenAIResponsesResponse({ output: text, model: parsed.data.model, status: 'approved' }, parsed.data.model); + logger.info({ callId, model: parsed.data.model, len: text.length }, 'codex-bridge passthrough OK'); + // Track against the merged OpenAI (ChatGPT+Codex) subscription pool. + try { + const subId = modelToSubscriptionId(parsed.data.model ?? '') ?? 
'codex'; + void recordSubscriptionUsage(getPool(), subId, 0); + } catch (e) { + logger.warn({ e, callId }, 'failed to record subscription usage for passthrough'); + } + // Also write an audit row so the dashboard activity tab sees it. + try { + void writeAuditLog({ + callId, + caller: (request.headers['x-llm-interceptor-caller'] as string) || 'codex-app', + task_type: 'codex_passthrough', + status: 'approved', + tokens_in: 0, + tokens_out: text.length, + latency_ms: Date.now() - startMs, + confidence: 0, + cost_usd: 0, + compression_applied: false, + model: parsed.data.model ?? 'gpt-5.5', + } as any); + } catch (e) { + logger.warn({ e, callId }, 'failed to write audit log for passthrough'); + } + if (parsed.data.stream) { + return reply + .header('Content-Type', 'text/event-stream; charset=utf-8') + .header('Cache-Control', 'no-cache') + .send(`data: ${JSON.stringify({ type: 'response.completed', response: respBody })} + +data: [DONE] + +`); + } + return reply.send(respBody); + } + logger.warn({ callId, model: parsed.data.model, upstreamJson }, 'codex-bridge upstream non-OK; falling back to standard pipeline'); + } catch (err) { + logger.error({ err, callId, model: parsed.data.model }, 'codex-bridge passthrough threw; falling back'); + } + } + + const gatewayRequest = responsesRequestToGatewayRequest(parsed.data, request); + const result = await executeCompletion(gatewayRequest, startMs, callId); + + if (result.statusCode !== 200) { + return reply.status(result.statusCode).send(toOpenAIError(result)); + } + + const response = toOpenAIResponsesResponse(result.body, parsed.data.model); + if (parsed.data.stream) { + return reply + .header('Content-Type', 'text/event-stream; charset=utf-8') + .header('Cache-Control', 'no-cache') + .send(`data: ${JSON.stringify({ type: 'response.completed', response })}\n\ndata: [DONE]\n\n`); + } + return reply.send(response); + }); + + // ─── Multi-Model Race Mode endpoint ──────────────────────────────────── + // Runs the same prompt 
against multiple models in parallel; returns + // according to `strategy` (first | best | consensus). Audits each + // candidate run for later analysis. + fastify.post('/race', { config: { rateLimit: false } }, async (request: FastifyRequest, reply: FastifyReply) => { + const startMs = Date.now(); + const body = request.body as { + caller?: string; + task_type?: string; + input?: string; + models?: string[]; + strategy?: RaceStrategy; + timeout_ms?: number; + options?: any; + }; + if (!body?.input || !Array.isArray(body.models) || body.models.length < 2) { + return reply.status(400).send({ + error: 'race endpoint requires { input: string, models: string[] (>=2) }', + }); + } + const callerId = body.caller ?? 'race-client'; + const strategy: RaceStrategy = (body.strategy as RaceStrategy) ?? 'first'; + const callId = `race-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`; + + const runner = async (model: string, _signal: AbortSignal) => { + const candStart = Date.now(); + const result = await executeCompletion({ + caller: callerId, + task_type: body.task_type ?? 'generic_qa', + input: body.input!, + options: { ...(body.options ?? {}), model, skip_cache: true }, + } as CompletionRequest, candStart, `${callId}-${model}`); + const ok = result.statusCode === 200; + const r = result.body as Record; + return { + model, + status: ok ? 'ok' : 'error', + output: typeof r['output'] === 'string' ? r['output'] : undefined, + confidence: typeof r['confidence'] === 'number' ? r['confidence'] : undefined, + cost: typeof r['cost'] === 'number' ? r['cost'] : undefined, + latencyMs: Date.now() - candStart, + errorMessage: !ok ? String(r['message'] ?? r['error'] ?? 'unknown') : undefined, + } as RaceCandidateResult; + }; + + try { + const { outcome } = await runRace(body.models, runner, strategy, { timeoutMs: body.timeout_ms ?? 60_000 }); + void auditRaceResults(getPool(), callId, callerId, body.task_type ?? 
'generic_qa', outcome); + return reply.send({ + success: true, + call_id: callId, + strategy: outcome.strategy, + selected: { + model: outcome.selected.model, + output: outcome.selected.output, + confidence: outcome.selected.confidence, + cost: outcome.selected.cost, + latency_ms: outcome.selected.latencyMs, + }, + agreement_score: outcome.agreementScore ?? null, + candidates: outcome.candidates.map((c) => ({ + model: c.model, + status: c.status, + confidence: c.confidence, + latency_ms: c.latencyMs, + error: c.errorMessage, + })), + total_latency_ms: Date.now() - startMs, + }); + } catch (err) { + logger.error({ err, callId }, 'race endpoint failed'); + return reply.status(500).send({ error: 'race failed', message: err instanceof Error ? err.message : 'unknown' }); + } + }); + fastify.post('/completion', { config: { rateLimit: false } }, async (request: FastifyRequest, reply: FastifyReply) => { const startMs = Date.now(); @@ -242,52 +1322,9 @@ export async function completionRoute(fastify: FastifyInstance): Promise { }); } - const { caller, input, language, context, options } = body; const callId = `call-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`; - let classifAndRoute; - try { - classifAndRoute = await classifyAndRoute(body.task_type, caller, input, options); - } catch (err) { - return reply.status(400).send({ - statusCode: 400, error: 'Routing Error', - message: err instanceof Error ? err.message : 'Failed to route request', - }); - } - - const { taskType, decision, classificationResult } = classifAndRoute; - const promptVars = buildPromptVariables(input, context); - const resolved = resolvePrompt(taskType ?? decision.prompt_template, promptVars, language ?? 'en'); - - const format: '' | 'json' | undefined = decision.output_format === 'json' ? 
'json' : ''; - const baseReq = { model: decision.model, prompt: resolved.prompt, system: resolved.system, options: { temperature: decision.temperature, num_predict: decision.max_tokens }, format, stream: false, callId, taskType }; - - let ollamaResponse; - try { - ollamaResponse = await callLLMWithFallback(baseReq, decision, callId, taskType); - } catch (err) { - const latency = Date.now() - startMs; - logger.error({ err, caller, taskType }, 'Ollama call failed'); - requestsTotal.labels({ caller, task_type: taskType, status: 'rejected' }).inc(); - latencySeconds.labels({ caller, task_type: taskType, model: decision.model }).observe(latency / 1000); - const db = getPool(); - const requestLogger = createRequestLogger(db); - void requestLogger.logRequest(callId, caller, taskType, decision.model, 'error', 0, 0, 0, latency, 0, false, err instanceof Error ? err.message : 'LLM service unavailable'); - return reply.status(503).send({ statusCode: 503, error: 'Service Unavailable', message: 'LLM service unavailable, please retry' }); - } - - const latencyMs = Date.now() - startMs; - const outputText = ollamaResponse.response; - const validationOutput = await runPostValidation(outputText, { validators: decision.validators, language, output_format: decision.output_format, requires_fact_check: decision.requires_fact_check, schema: resolved.schema }); - const confidenceResult = evaluateConfidence(validationOutput); - - recordAllMetrics(caller, taskType, confidenceResult, ollamaResponse, decision, validationOutput); - const { costUsd, costSavedUsd } = await auditAndTrackCosts(caller, taskType, input, outputText, latencyMs, ollamaResponse, resolved, decision, confidenceResult, validationOutput, classificationResult, callId); - - // Fix latency observation after computation - latencySeconds.labels({ caller, task_type: taskType, model: ollamaResponse.model ?? 
decision.model }).observe(latencyMs / 1000); - - const responseBody = buildResponseBody(callId, decision, taskType, confidenceResult, outputText, latencyMs, ollamaResponse, costUsd, costSavedUsd, options?.return_validation_details ?? false, validationOutput); - return reply.status(200).send(responseBody); + const result = await executeCompletion(body, startMs, callId); + return reply.status(result.statusCode).send(result.body); }); } diff --git a/packages/gateway/src/routes/dashboard.ts b/packages/gateway/src/routes/dashboard.ts index 40725aa..c04ef64 100644 --- a/packages/gateway/src/routes/dashboard.ts +++ b/packages/gateway/src/routes/dashboard.ts @@ -1,9 +1,46 @@ import type { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify'; +import { execFile } from 'child_process'; +import { promisify } from 'util'; +import { existsSync } from 'fs'; +import { homedir } from 'os'; import { getPool } from '../db/client.js'; import { logger } from '../observability/logger.js'; import { createRequestLogger } from '../modules/request-logger.js'; import { globalRequestStream } from '../modules/request-stream.js'; -import { getAvailableProviders } from '../pipeline/external-providers.js'; +import { getAvailableProviders, getAllProviders } from '../pipeline/external-providers.js'; +import { discoverSubscriptions } from '../modules/subscription-discovery.js'; +import { runDiscovery, runDiscoveryAndSpawn } from '../modules/auto-discovery.js'; +import { getRunningBridges, spawnDetectedBridges } from '../modules/bridge-spawner.js'; +import { getPublicSettings, saveSettings, SettingsPatchSchema } from '../modules/settings-store.js'; +import { + getCacheSavings, + getSavingsTimeSeries, + clearCacheForCaller, + pruneStaleCacheEntries, +} from '../modules/response-cache.js'; +import { getComprehensiveSavings, getCompressionSinceRestart } from '../modules/savings-calculator.js'; + +// Captured once at module load — represents the gateway-process start time +// for the 
'compressed since last restart' tile in the dashboard.
+const SERVER_STARTED_AT_ISO = new Date().toISOString();
+import {
+  getBuddyState,
+  getAchievements,
+  getCalendarHeatmap,
+  getRecentEvents,
+  getForecast,
+} from '../modules/gamification.js';
+import { buildMemoryGraph } from '../modules/memory-graph.js';
+import { getRaceLeaderboard } from '../modules/race-leaderboard.js';
+import { getCallerDeepDive } from '../modules/caller-stats.js';
+import { generateMonthlyReport } from '../modules/report-generator.js';
+import { generateShareCard } from '../modules/share-card.js';
+import { getSubscriptionWallet, recordSubscriptionUsage } from '../modules/subscription-wallet.js';
+import { rememberFact, recallFacts, forgetCaller } from '../modules/knowledge-memory.js';
+import { getRaceStats } from '../modules/race-mode.js';
+import { dashboardAuthStatus, requireDashboardAuth } from '../modules/admin-auth.js';
+
+const execFileAsync = promisify(execFile); // Promise-returning form of child_process.execFile (via util.promisify).
 
 interface DashboardSummary {
   totalCost: number;
@@ -58,36 +95,324 @@ interface AlertData {
   };
 }
 
+const WORKBENCH_V1_BASELINE = { // NOTE(review): fixed baseline figures; their provenance and consumers are not visible in this part of the patch.
+  totalTokensSaved: 9_304_882,
+  totalCostSaved: 72.54,
+  totalHits: 6,
+  hitRatePercent: 9.68,
+  costWithoutGateway: 749.38,
+  costWithGateway: 676.84,
+};
+
+type ProviderRuntime = { // All fields optional. NOTE(review): presumably the result shape of providerRuntime() — confirm.
+  runtimeStatus?: string;
+  runtimeHealthy?: boolean;
+  runtimeDetail?: string;
+};
+
+const CLIENT_CATALOG = [ // Known client apps. Entry order matters: the first entry whose pattern matches claims a caller.
+  {
+    id: 'macbook-claude-code',
+    label: 'MacBook (Claude Code)',
+    patterns: ['claude-code-laptop'], // substring-matched against the lower-cased caller id
+    commands: [], // CLI names probed with commandExists()
+    paths: [], // install locations probed with existsSync(); a leading '~/' is expanded first
+    processPatterns: [], // compared case-insensitively against the `ps` snapshot
+  },
+  {
+    id: 'macstudio-claude-code',
+    label: 'Mac Studio (Claude Code)',
+    patterns: ['claude-code-macstudio', 'claude-code-studio'],
+    commands: [],
+    paths: [],
+    processPatterns: [],
+  },
+  {
+    id: 'codex-desktop',
+    label: 'Codex Desktop / CLI',
+    patterns: ['codex-desktop', 'codex-cli', 'codex'],
+    commands: ['codex'],
+    paths: ['/Applications/Codex.app', '~/.codex'],
+    processPatterns: ['Codex.app', 'Codex Helper', '/Applications/Codex.app', '/Resources/codex'],
+  },
+  {
+    id: 'claude-desktop',
+    label: 'Claude Desktop / Claude Code',
+    patterns: ['claude-desktop', 'claude-code', 'claude'],
+    commands: ['claude'],
+    paths: ['/Applications/Claude.app', '~/Library/Application Support/Claude', '~/.claude'],
+    processPatterns: ['/Applications/Claude.app', 'Claude Helper', 'claude-code', '/claude.app/Contents/MacOS/claude'],
+  },
+  {
+    id: 'microsoft-copilot',
+    label: 'Microsoft Copilot',
+    patterns: ['microsoft-copilot', 'm365-copilot', 'copilot-m365'],
+    commands: [],
+    paths: ['/Applications/Microsoft Copilot.app'],
+    processPatterns: ['Microsoft Copilot', 'm365-copilot'],
+  },
+  {
+    id: 'github-copilot',
+    label: 'GitHub Copilot',
+    patterns: ['github-copilot', 'copilot-bridge'],
+    commands: ['gh'],
+    paths: ['~/.config/github-copilot', '~/.vscode/extensions'],
+    processPatterns: ['GitHub Copilot', 'copilot-language-server', 'copilot-bridge'],
+  },
+  {
+    id: 'chatgpt',
+    label: 'ChatGPT / OpenAI Desktop',
+    patterns: ['chatgpt', 'openai-desktop'],
+    commands: [],
+    paths: ['/Applications/ChatGPT.app', '~/Library/Application Support/com.openai.chat'],
+    processPatterns: ['/Applications/ChatGPT.app', 'ChatGPTHelper', 'com.openai.chat'],
+  },
+  {
+    id: 'openai-compatible',
+    label: 'OpenAI-compatible clients',
+    patterns: ['openai-compatible', 'responses-compatible', 'responses-', 'gateway', 'cursor', 'continue', 'cline', 'aider', 'waveterm'],
+    commands: ['cursor', 'aider', 'opencode', 'cline'],
+    paths: ['/Applications/Cursor.app', '~/.cursor', '~/.continue', '~/.aider.conf.yml'],
+    processPatterns: ['/Applications/Cursor.app', 'Cursor Helper', 'Continue', 'Cline', 'aider', 'opencode', 'Waveterm'],
+  },
+] as const;
+
+type ClientStatus = 'live' | 'running' | 'installed' | 'not-connected'; // 'live' = gateway requests matched the client; 'running'/'installed' come from local detection.
+
+const CLIENT_BRIDGE_PROVIDERS: Record<(typeof CLIENT_CATALOG)[number]['id'], string | undefined> = { // Provider name handed to providerRuntime() per client id; undefined skips the runtime lookup.
+  'macbook-claude-code': undefined,
+  'macstudio-claude-code': undefined,
+  'codex-desktop': 'codex',
+  'claude-desktop': 'claude-code',
+  'microsoft-copilot': 'm365-copilot-bridge',
+  'github-copilot': 'copilot-bridge',
+  'openai-compatible': undefined,
+  'chatgpt': 'codex-bridge',
+};
+
+function expandUserPath(path: string): string { // Replaces a leading '~/' with homedir(); any other path is returned unchanged.
+  return path.startsWith('~/') ? `${homedir()}/${path.slice(2)}` : path;
+}
+
+async function getProcessSnapshot(): Promise<string> { // Resolves to the lower-cased output of `ps axo command`, or '' if the call rejects.
+  try {
+    const { stdout } = await execFileAsync('ps', ['axo', 'command'], { timeout: 1500, maxBuffer: 1024 * 1024 * 3 }); // 1500 ms timeout, 3 MiB output cap
+    return stdout.toLowerCase();
+  } catch {
+    return ''; // best effort: an empty snapshot simply matches no process pattern
+  }
+}
+
+async function commandExists(command: string): Promise<boolean> { // Resolves true when `command -v <command>` succeeds under `/bin/sh -lc`, false on any rejection.
+  try {
+    await execFileAsync('/bin/sh', ['-lc', `command -v ${command}`], { timeout: 1200, maxBuffer: 4096 }); // NOTE(review): `command` is interpolated into the shell string; the caller here passes CLIENT_CATALOG constants only.
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+async function getLocalDesktopDetections(): Promise<Record<string, { running: boolean; installed: boolean; signals: string[] }>> { // Maps each CLIENT_CATALOG id to its local detection result.
+  const processSnapshot = await getProcessSnapshot(); // taken once and shared by every catalog entry
+  const entries = await Promise.all(CLIENT_CATALOG.map(async (client) => {
+    const signals: string[] = []; // human-readable evidence for this client
+    const running = client.processPatterns.some((pattern) => processSnapshot.includes(pattern.toLowerCase())); // case-insensitive: snapshot and pattern are both lower-cased
+    if (running) signals.push('running process');
+
+    const existingPaths = client.paths.filter((path) => existsSync(expandUserPath(path)));
+    for (const path of existingPaths.slice(0, 3)) signals.push(path); // report at most three paths
+
+    const existingCommands: string[] = [];
+    for (const command of client.commands) {
+      if (await commandExists(command)) existingCommands.push(command);
+    }
+    for (const command of existingCommands) signals.push(`cli:${command}`);
+
+    return [client.id, {
+      running,
+      installed: existingPaths.length > 0 || existingCommands.length > 0 || running, // a running process also counts as installed
+      signals,
+    }] as const;
+  }));
+
+  return Object.fromEntries(entries);
+}
+
+async function getGatewayClientCoverage(hoursBack: number = 24): Promise> {
+  const detections = await getLocalDesktopDetections();
+  const bridgeRuntimes = Object.fromEntries(await Promise.all(CLIENT_CATALOG.map(async (client) => {
+
const providerName = CLIENT_BRIDGE_PROVIDERS[client.id]; + return [ + client.id, + { + providerName, + ...(providerName ? await providerRuntime(providerName) : {}), + }, + ] as const; + }))); + let callers: Array<{ caller: string; requestCount: number; lastSeen?: string; tokensIn: number; tokensSaved: number }> = []; + + try { + const db = getPool(); + const result = await db.query( + ` + SELECT + rt.caller_id, + COUNT(*)::INT as request_count, + MAX(rt.created_at) as last_seen, + COALESCE(SUM(rt.tokens_in), 0)::INT as tokens_in, + COALESCE(SUM(GREATEST(tv.tokens_before - tv.tokens_after, 0)), 0)::INT as tokens_saved + FROM request_tracking rt + LEFT JOIN LATERAL ( + SELECT tokens_before, tokens_after + FROM tokenvault_metrics + WHERE tool_used = 'gateway' + AND file_path = rt.request_id + ORDER BY created_at DESC + LIMIT 1 + ) tv ON true + WHERE rt.created_at > NOW() - MAKE_INTERVAL(hours => $1) + GROUP BY rt.caller_id + `, + [hoursBack] + ); + + callers = result.rows.map((row: any) => ({ + caller: String(row.caller_id ?? ''), + requestCount: parseInt(row.request_count, 10) || 0, + lastSeen: row.last_seen ? new Date(row.last_seen).toISOString() : undefined, + tokensIn: parseInt(row.tokens_in, 10) || 0, + tokensSaved: parseInt(row.tokens_saved, 10) || 0, + })); + } catch (error) { + logger.warn({ error }, 'Client gateway traffic lookup failed, returning local desktop detections only'); + } + + // First-match-wins: a caller is assigned to the first (most specific) catalog + // entry it matches, so device-specific entries (MacBook/Mac Studio) take a + // caller before the generic 'claude-desktop' bucket — no double counting. 
+ const assignedCallers = new Set(); + return CLIENT_CATALOG.map((client) => { + const detection = detections[client.id]; + const bridgeRuntime = bridgeRuntimes[client.id]; + const matched = callers.filter((row) => { + if (assignedCallers.has(row.caller)) return false; + const caller = row.caller.toLowerCase(); + return client.patterns.some((pattern) => caller.includes(pattern)); + }); + matched.forEach((row) => assignedCallers.add(row.caller)); + const requestCount = matched.reduce((sum, row) => sum + row.requestCount, 0); + const tokensIn = matched.reduce((sum, row) => sum + row.tokensIn, 0); + const tokensSaved = matched.reduce((sum, row) => sum + row.tokensSaved, 0); + const lastSeen = matched + .map((row) => row.lastSeen) + .filter(Boolean) + .sort() + .at(-1); + + return { + id: client.id, + label: client.label, + status: requestCount > 0 ? 'live' : detection?.running ? 'running' : detection?.installed ? 'installed' : 'not-connected', + requestCount, + lastSeen, + callers: matched.map((row) => row.caller).sort(), + tokensIn, + tokensSaved, + source: requestCount > 0 ? 'gateway' : detection?.installed ? 'local-detection' : 'none', + detectionSignals: detection?.signals ?? 
[], + bridgeProvider: bridgeRuntime?.providerName, + bridgeStatus: bridgeRuntime?.runtimeStatus, + bridgeHealthy: bridgeRuntime?.runtimeHealthy, + bridgeDetail: bridgeRuntime?.runtimeDetail, + }; + }); +} + +function bridgeHealthUrl(providerName: string): string | undefined { + const bridgeUrls: Record = { + 'claude-bridge': process.env['CLAUDE_BRIDGE_URL'], + 'claude-code': process.env['CLAUDE_CODE_URL'] || process.env['CLAUDE_BRIDGE_URL'], + 'copilot-bridge': process.env['COPILOT_BRIDGE_URL'], + 'm365-copilot-bridge': process.env['M365_COPILOT_BRIDGE_URL'], + 'openai-codex': process.env['OPENAI_CODEX_URL'] || process.env['CODEX_BRIDGE_URL'], + codex: process.env['CODEX_BRIDGE_URL'] || process.env['OPENAI_CODEX_URL'], + }; + + const baseUrl = bridgeUrls[providerName]?.replace(/\/+$/, ''); + return baseUrl ? `${baseUrl}/health` : undefined; +} + +async function providerRuntime(providerName: string): Promise { + const healthUrl = bridgeHealthUrl(providerName); + if (!healthUrl) return {}; + + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), 1200); + + try { + const response = await fetch(healthUrl, { signal: controller.signal }); + const payload = await response.json().catch(() => ({})) as { + status?: unknown; + configured?: unknown; + healthy?: unknown; + detail?: unknown; + }; + const status = String(payload.status ?? (response.ok ? 'ok' : 'error')); + const configured = payload.configured !== false; + const healthy = response.ok && configured && payload.healthy !== false && status !== 'auth_required'; + const detail = status === 'auth_required' + ? String(payload.detail ?? 'auth_required') + : configured ? undefined : 'bridge_not_configured'; + + return { + runtimeStatus: healthy ? 'ready' : status, + runtimeHealthy: healthy, + runtimeDetail: detail, + }; + } catch (error) { + return { + runtimeStatus: 'unreachable', + runtimeHealthy: false, + runtimeDetail: error instanceof Error ? 
error.message : 'health_check_failed', + }; + } finally { + clearTimeout(timeout); + } +} + /** * Get dashboard summary stats for a time window */ async function getDashboardSummary(hoursBack: number = 24): Promise { const db = getPool(); try { - const result = await db.query( - `SELECT - SUM(cost_usd) as total_cost, - SUM(cost_saved_usd) as total_saved, - SUM(tokens_compressed) as tokens_compressed, - SUM(tokens_in + tokens_out) as total_tokens, - COUNT(*) as request_count, - AVG(confidence_score) as avg_confidence - FROM cost_analytics - WHERE created_at > NOW() - INTERVAL $1 HOUR`, - [hoursBack] - ); - - const row = result.rows[0]; - const totalTokens = parseInt(row?.total_tokens || '0', 10); - const totalCompressed = parseInt(row?.tokens_compressed || '0', 10); + const requestLogger = createRequestLogger(db); + const bucketMinutes = hoursBack * 60; // Convert hours to minutes + const metrics = await requestLogger.getMetrics(bucketMinutes); return { - totalCost: parseFloat(row?.total_cost || '0'), - totalSaved: parseFloat(row?.total_saved || '0'), - compressionRatio: totalTokens > 0 ? 
parseFloat((((totalTokens - totalCompressed) / totalTokens) * 100).toFixed(2)) : 0, - tokensSaved: totalTokens - totalCompressed, - requestCount: parseInt(row?.request_count || '0', 10), - averageConfidence: parseFloat(row?.avg_confidence || '0'), + totalCost: metrics.total_cost, + totalSaved: metrics.estimated_api_cost_avoided, + compressionRatio: metrics.compression_rate, + tokensSaved: metrics.compression_tokens_saved, + requestCount: metrics.total_requests, + averageConfidence: metrics.avg_confidence, timeWindow: `${hoursBack}h` }; } catch (err) { @@ -110,69 +435,60 @@ async function getDashboardSummary(hoursBack: number = 24): Promise { const db = getPool(); try { - const [projectResult, modelResult, taskResult] = await Promise.all([ - db.query( - `SELECT project, SUM(cost_usd) as cost, COUNT(*) as count, SUM(cost_saved_usd) as saved - FROM cost_analytics - WHERE created_at > NOW() - INTERVAL $1 HOUR - GROUP BY project`, - [hoursBack] - ), - db.query( - `SELECT model, SUM(cost_usd) as cost, COUNT(*) as count - FROM cost_analytics - WHERE created_at > NOW() - INTERVAL $1 HOUR - GROUP BY model`, - [hoursBack] - ), - db.query( - `SELECT task_type, SUM(cost_usd) as cost, COUNT(*) as count - FROM cost_analytics - WHERE created_at > NOW() - INTERVAL $1 HOUR - GROUP BY task_type`, - [hoursBack] - ) - ]); + const requestLogger = createRequestLogger(db); + const bucketMinutes = hoursBack * 60; // Convert hours to minutes + const metrics = await requestLogger.getMetrics(bucketMinutes); - const byProject: Record = {}; + // Build model breakdown from metrics const byModel: Record = {}; - const byTaskType: Record = {}; - - for (const row of projectResult.rows) { - byProject[row.project] = { - cost: parseFloat(row.cost || '0'), - count: parseInt(row.count || '0', 10), - saved: parseFloat(row.saved || '0') + for (const model of metrics.top_models) { + byModel[model.model] = { + cost: (metrics.total_cost * model.count) / metrics.total_requests, // Estimate cost per model + 
count: model.count }; } - for (const row of modelResult.rows) { - byModel[row.model] = { - cost: parseFloat(row.cost || '0'), - count: parseInt(row.count || '0', 10) - }; - } - - for (const row of taskResult.rows) { - byTaskType[row.task_type] = { - cost: parseFloat(row.cost || '0'), - count: parseInt(row.count || '0', 10) - }; - } - - const totalResult = await db.query( - `SELECT SUM(cost_usd) as total_cost, SUM(cost_saved_usd) as total_saved - FROM cost_analytics - WHERE created_at > NOW() - INTERVAL $1 HOUR`, + // Get caller-based breakdown from database (using caller_id as proxy for project) + const callerResult = await db.query( + `SELECT caller_id, SUM(cost_usd) as cost, COUNT(*) as count + FROM request_tracking + WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1) + GROUP BY caller_id`, [hoursBack] ); + const byProject: Record = {}; + for (const row of callerResult.rows) { + byProject[row.caller_id] = { + cost: parseFloat(row.cost || '0'), + count: parseInt(row.count || '0', 10), + saved: 0 // Not tracked + }; + } + + // Get task type breakdown + const taskResult = await db.query( + `SELECT task_type, SUM(cost_usd) as cost, COUNT(*) as count + FROM request_tracking + WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1) + GROUP BY task_type`, + [hoursBack] + ); + + const byTaskType: Record = {}; + for (const row of taskResult.rows) { + byTaskType[row.task_type || 'unknown'] = { + cost: parseFloat(row.cost || '0'), + count: parseInt(row.count || '0', 10) + }; + } + return { byProject, byModel, byTaskType, - totalCost: parseFloat(totalResult.rows[0]?.total_cost || '0'), - totalSaved: parseFloat(totalResult.rows[0]?.total_saved || '0') + totalCost: metrics.total_cost, + totalSaved: metrics.estimated_api_cost_avoided }; } catch (err) { logger.error({ err }, 'Failed to get cost breakdown'); @@ -186,41 +502,62 @@ async function getCostBreakdown(hoursBack: number = 24): Promise async function getTokenMetrics(hoursBack: number = 24): Promise { const db = 
getPool(); try { - const [totalResult, byModelResult] = await Promise.all([ + const [totalResult, byModelResult, compressionResult, compressedByModelResult] = await Promise.all([ db.query( - `SELECT SUM(tokens_in) as total_in, SUM(tokens_out) as total_out, SUM(tokens_compressed) as total_compressed - FROM cost_analytics - WHERE created_at > NOW() - INTERVAL $1 HOUR`, + `SELECT SUM(tokens_in) as total_in, SUM(tokens_out) as total_out + FROM request_tracking + WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1)`, [hoursBack] ), db.query( - `SELECT model, SUM(tokens_in) as in, SUM(tokens_out) as out, SUM(tokens_compressed) as compressed - FROM cost_analytics - WHERE created_at > NOW() - INTERVAL $1 HOUR + `SELECT model, SUM(tokens_in) as in, SUM(tokens_out) as out + FROM request_tracking + WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1) GROUP BY model`, [hoursBack] - ) + ), + db.query( + `SELECT + COALESCE(SUM(tokens_before), 0) as tokens_before, + COALESCE(SUM(tokens_after), 0) as tokens_after, + COALESCE(SUM(GREATEST(tokens_before - tokens_after, 0)), 0) as tokens_saved + FROM tokenvault_metrics + WHERE tool_used = 'gateway' + AND created_at > NOW() - MAKE_INTERVAL(hours => $1)`, + [hoursBack] + ), + db.query( + `SELECT model, COALESCE(SUM(tokens_compressed), 0) as compressed + FROM cost_analytics + WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1) + GROUP BY model`, + [hoursBack] + ), ]); const totalIn = parseInt(totalResult.rows[0]?.total_in || '0', 10); const totalOut = parseInt(totalResult.rows[0]?.total_out || '0', 10); - const totalCompressed = parseInt(totalResult.rows[0]?.total_compressed || '0', 10); - const total = totalIn + totalOut; + const compressedByModel = new Map( + compressedByModelResult.rows.map((row: any) => [row.model, parseInt(row.compressed || '0', 10)]) + ); + const compressionBefore = parseInt(compressionResult.rows[0]?.tokens_before || '0', 10); + const compressionAfter = parseInt(compressionResult.rows[0]?.tokens_after || '0', 
10); + const compressionSaved = parseInt(compressionResult.rows[0]?.tokens_saved || '0', 10); const byModel: Record = {}; for (const row of byModelResult.rows) { byModel[row.model] = { in: parseInt(row.in || '0', 10), out: parseInt(row.out || '0', 10), - compressed: parseInt(row.compressed || '0', 10) + compressed: compressedByModel.get(row.model) ?? 0 }; } return { totalIn, totalOut, - totalCompressed, - compressionRate: total > 0 ? parseFloat((((total - totalCompressed) / total) * 100).toFixed(2)) : 0, + totalCompressed: compressionAfter, + compressionRate: compressionBefore > 0 ? compressionSaved / compressionBefore : 0, byModel }; } catch (err) { @@ -236,12 +573,12 @@ async function getAgentActivity(hoursBack: number = 24): Promise NOW() - INTERVAL $1 HOUR - GROUP BY agent_id + FROM request_tracking + WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1) + GROUP BY caller_id ORDER BY task_count DESC`, [hoursBack] ); @@ -264,78 +601,100 @@ async function getAgentActivity(hoursBack: number = 24): Promise { - const db = getPool(); - try { - const [configResult, alertResult] = await Promise.all([ - db.query(`SELECT * FROM cost_alert_config WHERE user_id = $1`, ['rene']), - db.query(`SELECT alert_type, COUNT(*) as count FROM alert_log WHERE acknowledged = FALSE GROUP BY alert_type`) - ]); + // Alert configuration is not yet stored in database + // Return default thresholds and empty alerts + const thresholds = { + compressionBelow: 40, + weeklyBudget: 50, + externalApiCost: 0 + }; - const thresholds = { - compressionBelow: 40, - weeklyBudget: 50, - externalApiCost: 0 - }; - - for (const row of configResult.rows) { - if (row.alert_type === 'compression_below') { - thresholds.compressionBelow = parseFloat(row.threshold); - } else if (row.alert_type === 'weekly_budget') { - thresholds.weeklyBudget = parseFloat(row.threshold); - } else if (row.alert_type === 'external_api') { - thresholds.externalApiCost = parseFloat(row.threshold); - } - } - - const byType: Record = 
{}; - let total = 0; - for (const row of alertResult.rows) { - byType[row.alert_type] = parseInt(row.count || '0', 10); - total += parseInt(row.count || '0', 10); - } - - return { - active: total, - byType, - thresholds - }; - } catch (err) { - logger.error({ err }, 'Failed to get alerts'); - return { active: 0, byType: {}, thresholds: { compressionBelow: 40, weeklyBudget: 50, externalApiCost: 0 } }; - } + return { + active: 0, + byType: {}, + thresholds + }; } export async function dashboardRoute(fastify: FastifyInstance): Promise { + const dashboardAuth = { preHandler: requireDashboardAuth }; + + fastify.get('/api/dashboard/auth', async (request: FastifyRequest, reply: FastifyReply) => { + return reply.send({ success: true, data: dashboardAuthStatus(request) }); + }); + + fastify.get('/api/dashboard/topology', dashboardAuth, async (_request: FastifyRequest, reply: FastifyReply) => { + const providers = getAllProviders(); + const availableProviders = getAvailableProviders(); + const providerNames = new Set(providers.map((provider) => provider.name)); + const configuredProviders = providers.filter((provider) => provider.enabled && !!process.env[provider.envKey]); + const localProviders = providers.filter((provider) => provider.name.toLowerCase().includes('ollama')); + const subscriptionProviders = providers.filter((provider) => + ['claude-bridge', 'copilot-bridge', 'm365-copilot-bridge', 'openai-codex'] + .includes(provider.name) + ); + + return reply.send({ + success: true, + data: { + product: 'llm.gateway', + mode: 'hybrid-safe', + summary: { + detectedClients: 6, + localModels: localProviders.length, + providersConfigured: configuredProviders.length, + trustPolicies: 3, + memoryBackends: 1, + plannedModules: 5, + }, + nodes: [ + ...['Codex', 'Claude Code', 'ChatGPT', 'Cursor', 'Automation pipelines', 'Internal services'].map((name) => ({ + type: 'client', + name, + status: 'detectable', + })), + ...providers.map((provider) => ({ + type: 
localProviders.includes(provider) ? 'local-provider' : subscriptionProviders.includes(provider) ? 'subscription-provider' : 'public-provider', + name: provider.name, + status: configuredProviders.includes(provider) ? 'configured' : provider.enabled ? 'available' : 'disabled', + })), + ], + receipts: [], + routes: availableProviders.filter((provider) => providerNames.has(provider.name)).map((provider) => provider.name), + }, + }); + }); + // Dashboard summary endpoint - fastify.get('/api/dashboard/summary', async (request: FastifyRequest, reply: FastifyReply) => { + fastify.get('/api/dashboard/summary', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { const hours = (request.query as any).hours ?? 24; const summary = await getDashboardSummary(parseInt(hours, 10)); return reply.send(summary); }); // Cost breakdown endpoint - fastify.get('/api/dashboard/costs', async (request: FastifyRequest, reply: FastifyReply) => { + fastify.get('/api/dashboard/costs', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { const hours = (request.query as any).hours ?? 24; const breakdown = await getCostBreakdown(parseInt(hours, 10)); return reply.send(breakdown); }); // Token metrics endpoint - fastify.get('/api/dashboard/tokens', async (request: FastifyRequest, reply: FastifyReply) => { + fastify.get('/api/dashboard/tokens', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { const hours = (request.query as any).hours ?? 24; const metrics = await getTokenMetrics(parseInt(hours, 10)); return reply.send(metrics); }); // Agent activity endpoint - fastify.get('/api/dashboard/agents', async (request: FastifyRequest, reply: FastifyReply) => { + fastify.get('/api/dashboard/agents', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { const hours = (request.query as any).hours ?? 
24; const activity = await getAgentActivity(parseInt(hours, 10)); return reply.send(activity); }); // Alerts endpoint - fastify.get('/api/dashboard/alerts', async (request: FastifyRequest, reply: FastifyReply) => { + fastify.get('/api/dashboard/alerts', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { const alerts = await getAlerts(); return reply.send(alerts); }); @@ -407,7 +766,30 @@ export async function dashboardRoute(fastify: FastifyInstance): Promise { }); // Request history endpoint - fastify.get('/api/dashboard/requests', async (request: FastifyRequest, reply: FastifyReply) => { + fastify.get('/api/dashboard/clients', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { + try { + const hours = Math.min(parseInt((request.query as any).hours as string) || 24, 720); + const clients = await getGatewayClientCoverage(hours); + return reply.status(200).send({ + success: true, + data: clients, + meta: { + total: clients.length, + hours, + timestamp: new Date().toISOString(), + }, + }); + } catch (error) { + logger.error({ error }, 'Failed to fetch dashboard clients'); + return reply.status(500).send({ + success: false, + error: 'Failed to fetch clients', + }); + } + }); + + // Request history endpoint + fastify.get('/api/dashboard/requests', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { try { const limit = Math.min(parseInt((request.query as any).limit as string) || 100, 1000); const hours = Math.min(parseInt((request.query as any).hours as string) || 24, 168); @@ -436,9 +818,9 @@ export async function dashboardRoute(fastify: FastifyInstance): Promise { }); // Aggregated metrics endpoint - fastify.get('/api/dashboard/request-metrics', async (request: FastifyRequest, reply: FastifyReply) => { + fastify.get('/api/dashboard/request-metrics', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { try { - const bucketMinutes = Math.min(parseInt((request.query as any).bucket_minutes 
as string) || 60, 1440); + const bucketMinutes = Math.min(parseInt((request.query as any).bucket_minutes as string) || 1440, 1440); const db = getPool(); const requestLogger = createRequestLogger(db); @@ -462,7 +844,7 @@ export async function dashboardRoute(fastify: FastifyInstance): Promise { }); // Server-Sent Events endpoint for real-time request updates - fastify.get('/api/stream/requests', async (request: FastifyRequest, reply: FastifyReply) => { + fastify.get('/api/stream/requests', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { // Use raw Node.js API to properly initialize HTTP/2 stream reply.raw.writeHead(200, { 'Content-Type': 'text/event-stream', @@ -542,46 +924,105 @@ export async function dashboardRoute(fastify: FastifyInstance): Promise { }); // Test endpoint - fastify.get('/api/dashboard/test', async (_request: FastifyRequest, reply: FastifyReply) => { + fastify.get('/api/dashboard/test', dashboardAuth, async (_request: FastifyRequest, reply: FastifyReply) => { return reply.send({ test: 'ok', message: 'Test endpoint is working' }); }); - // Providers endpoint - lists all available LLM providers (local, subscription, free-tier) - fastify.get('/api/dashboard/providers', async (_request: FastifyRequest, reply: FastifyReply) => { + // Providers endpoint - lists all configured LLM providers (local, subscription, free-tier) + // Shows ALL providers regardless of API-key status so users can see what's possible. 
+ fastify.get('/api/dashboard/providers', dashboardAuth, async (_request: FastifyRequest, reply: FastifyReply) => { try { - const availableProviders = await getAvailableProviders(); + const allProviders = getAllProviders(); - // Categorize providers by type - const providers = availableProviders.map(provider => { + // Friendly display labels for the UI + const displayLabels: Record = { + 'claude-bridge': 'Claude Code Subscription (Bridge)', + 'claude-code': 'Claude Code Direct', + 'copilot-bridge': 'GitHub Copilot Subscription', + 'm365-copilot-bridge': 'Microsoft 365 Copilot Subscription', + 'copilot-codex': 'GitHub Copilot (Codex Inner API)', + 'openai-codex': 'OpenAI (ChatGPT + Codex)', + 'cerebras': 'Cerebras (Free Tier)', + 'groq': 'Groq (Free Tier)', + 'mistral': 'Mistral AI (Free Tier)', + 'nvidia': 'NVIDIA NIM (Free Tier)', + 'cloudflare': 'Cloudflare Workers AI' + }; + + // Subscription providers (paid via login/subscription, NOT free-tier API) + const subscriptionNames = new Set([ + 'claude-bridge', + 'copilot-bridge', 'm365-copilot-bridge', 'openai-codex' + ]); + + // Categorize all providers (independent of API-key presence) + const providers = await Promise.all(allProviders.map(async provider => { let type: 'local' | 'subscription' | 'free' = 'free'; - let status: 'configured' | 'unconfigured' | 'unavailable' = 'unconfigured'; - - // Determine provider type based on name if (provider.name.toLowerCase().includes('ollama')) { type = 'local'; - status = provider.enabled ? 'configured' : 'unconfigured'; - } else if (['claude-bridge', 'claude-code', 'openai-bridge', 'chatgpt-bridge', 'copilot-bridge', 'codex'].includes(provider.name)) { + } else if (subscriptionNames.has(provider.name)) { type = 'subscription'; - status = provider.enabled && process.env[provider.envKey] ? 'configured' : 'unconfigured'; } else { type = 'free'; - status = provider.enabled && process.env[provider.envKey] ? 
'configured' : 'unconfigured'; } + const hasKey = !!process.env[provider.envKey]; + const status: 'configured' | 'unconfigured' | 'unavailable' = + provider.enabled && hasKey ? 'configured' + : provider.enabled ? 'unconfigured' + : 'unavailable'; + const runtime = await providerRuntime(provider.name); return { name: provider.name, + label: displayLabels[provider.name] ?? provider.name, type, status, enabled: provider.enabled, + envKey: provider.envKey, models: provider.models.map(m => ({ id: m.id, tier: m.tier, contextLength: m.contextLength })), rateLimitRpm: provider.rateLimitRpm, - baseUrl: provider.baseUrl + baseUrl: provider.baseUrl, + ...runtime, }; - }); + })); + + // Add local Ollama models from the model registry (models.yaml) + try { + const yaml = (await import('js-yaml')).default; + const fs = await import('fs'); + const path = await import('path'); + const { fileURLToPath } = await import('url'); + const __filename = fileURLToPath(import.meta.url); + const __dirname = path.dirname(__filename); + const yamlPath = path.join(__dirname, '..', 'config', 'models.yaml'); + if (fs.existsSync(yamlPath)) { + const cfg: any = yaml.load(fs.readFileSync(yamlPath, 'utf-8')); + const ollamaModels = Object.entries(cfg.models ?? {}).map(([id, info]: [string, any]) => ({ + id, + tier: info.tier ?? 'medium', + contextLength: info.context_length ?? 0 + })); + if (ollamaModels.length > 0) { + providers.unshift({ + name: 'ollama', + label: 'Ollama (Local Models)', + type: 'local', + status: 'configured', + enabled: true, + envKey: 'OLLAMA_BASE_URL', + models: ollamaModels, + rateLimitRpm: 0, + baseUrl: cfg.ollama_base_url ?? 
'' + } as any); + } + } + } catch (yamlErr) { + logger.warn({ err: yamlErr }, 'Failed to load Ollama models from models.yaml'); + } // Group by type for easy UI rendering const grouped = { @@ -618,6 +1059,580 @@ export async function dashboardRoute(fastify: FastifyInstance): Promise { } }); + // ─── Subscription Auto-Gateway ──────────────────────────────────────────── + // Reports subscription availability from TWO sources: + // 1. Auto-detection on the gateway host (CLI present + authenticated) + // 2. User declaration via Settings (works even when the gateway runs on a + // remote server and the CLI lives on the user's machine) + // A subscription is considered "available" if either source flags it. + fastify.get('/api/dashboard/subscriptions', dashboardAuth, async (_request: FastifyRequest, reply: FastifyReply) => { + try { + const statuses = await discoverSubscriptions(); + const runningBridges = getRunningBridges(); + const runningById = new Map(runningBridges.map((b) => [b.descriptor.id, b])); + const userSettings = getPublicSettings(); + + const subscriptions = statuses.map((s) => { + const runtime = runningById.get(s.descriptor.id); + const userDeclared = userSettings.subscriptions[s.descriptor.id]?.enabled === true; + const detected = s.installed; + return { + id: s.descriptor.id, + label: s.descriptor.label, + command: s.descriptor.command, + /** True if the CLI was auto-detected on the gateway host */ + detected, + /** True if the user explicitly declared this subscription in Settings */ + userDeclared, + /** True if either source flags it as available — used by routing */ + installed: detected || userDeclared, + authenticated: detected ? s.authenticated : (userDeclared ? 'unknown' : false), + version: s.version ?? null, + providerName: s.descriptor.providerName, + bridgePort: s.descriptor.bridgePort, + bridgeEnvKey: s.descriptor.bridgeEnvKey, + bridgeUrl: runtime?.url ?? s.bridgeUrl ?? 
null, + bridgeRunning: !!runtime || s.bridgeRunning, + autoSpawned: !!runtime, + startedAt: runtime?.startedAt?.toISOString() ?? null, + models: s.descriptor.models.map((m) => ({ id: m.id, tier: m.tier })), + }; + }); + + const available = subscriptions.filter((s) => s.installed); + const running = subscriptions.filter((s) => s.bridgeRunning); + + return reply.send({ + success: true, + data: { + subscriptions, + summary: { + total: subscriptions.length, + installed: available.length, + detected: subscriptions.filter((s) => s.detected).length, + userDeclared: subscriptions.filter((s) => s.userDeclared).length, + running: running.length, + autoGatewayEnabled: process.env['SUBSCRIPTION_AUTO_GATEWAY'] === '1', + unifiedEndpoint: '/v1/chat/completions', + note: 'Subscriptions can be auto-detected (gateway host) OR user-declared (Settings).', + }, + }, + meta: { timestamp: new Date().toISOString() }, + }); + } catch (error) { + logger.error({ error }, 'Failed to discover subscriptions'); + return reply.status(500).send({ success: false, error: 'Failed to discover subscriptions' }); + } + }); + + // ─── Full-System Auto-Discovery ───────────────────────────────────────── + // GET /api/dashboard/discover → unified report (read-only) + // POST /api/dashboard/discover → discover + spawn bridges + fastify.get('/api/dashboard/discover', dashboardAuth, async (_request: FastifyRequest, reply: FastifyReply) => { + try { + const report = await runDiscovery(); + return reply.send({ success: true, data: report }); + } catch (error) { + logger.error({ error }, 'Discovery scan failed'); + return reply.status(500).send({ success: false, error: 'Discovery scan failed' }); + } + }); + + fastify.post('/api/dashboard/discover', dashboardAuth, async (_request: FastifyRequest, reply: FastifyReply) => { + try { + const result = await runDiscoveryAndSpawn(); + return reply.send({ + success: true, + data: { + report: result.report, + spawned: result.spawned, + spawnedCount: 
result.spawned.length, + }, + }); + } catch (error) { + logger.error({ error }, 'Discovery + spawn failed'); + return reply.status(500).send({ success: false, error: 'Discovery + spawn failed' }); + } + }); + + // POST /api/dashboard/subscriptions/spawn — trigger auto-spawn of detected bridges. + // Returns the list of bridges that were spawned (or already running). + fastify.post('/api/dashboard/subscriptions/spawn', dashboardAuth, async (_request: FastifyRequest, reply: FastifyReply) => { + try { + const statuses = await discoverSubscriptions(); + const spawned = await spawnDetectedBridges(statuses); + return reply.send({ + success: true, + data: { + spawnedCount: spawned.length, + bridges: spawned.map((b) => ({ + id: b.descriptor.id, + label: b.descriptor.label, + url: b.url, + port: b.port, + startedAt: b.startedAt.toISOString(), + })), + }, + }); + } catch (error) { + logger.error({ error }, 'Failed to spawn subscription bridges'); + return reply.status(500).send({ success: false, error: 'Failed to spawn bridges' }); + } + }); + + // ─── Settings ───────────────────────────────────────────────────────────── + // Returns user configuration (which subscriptions, which API providers, …). + // API keys are NEVER returned in plaintext — only a hasKey:boolean flag. + fastify.get('/api/dashboard/settings', dashboardAuth, async (_request: FastifyRequest, reply: FastifyReply) => { + try { + return reply.send({ success: true, data: getPublicSettings() }); + } catch (error) { + logger.error({ error }, 'Failed to load settings'); + return reply.status(500).send({ success: false, error: 'Failed to load settings' }); + } + }); + + // Persist a settings patch. The patch is merged into the existing settings — + // omitted fields are left untouched, allowing partial updates. 
+ fastify.post('/api/dashboard/settings', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { + try { + const parsed = SettingsPatchSchema.safeParse(request.body); + if (!parsed.success) { + return reply.status(400).send({ + success: false, + error: 'Invalid settings payload', + details: parsed.error.flatten(), + }); + } + saveSettings(parsed.data); + return reply.send({ success: true, data: getPublicSettings() }); + } catch (error) { + logger.error({ error }, 'Failed to save settings'); + return reply.status(500).send({ success: false, error: 'Failed to save settings' }); + } + }); + + // ─── Savings Dashboard (cache + compression + subscription + routing) ── + // Combines all five savings mechanisms into a single comprehensive picture. + fastify.get('/api/dashboard/savings', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { + try { + // Allow up to 1 year window for "all-time" hero counter + const hours = Math.min(parseInt((request.query as any).hours as string) || 24, 8760); + const bucketMin = Math.max(parseInt((request.query as any).bucket_minutes as string) || 60, 5); + const db = getPool(); + const [legacySavings, series, comprehensive, sinceRestart] = await Promise.all([ + getCacheSavings(db, hours), // legacy field for backwards compat + getSavingsTimeSeries(db, hours, bucketMin), + getComprehensiveSavings(db, hours), + getCompressionSinceRestart(db, SERVER_STARTED_AT_ISO), + ]); + const realCostSaved = Math.max(comprehensive.totalCostSaved, legacySavings.totalCostSaved); + const useBaselineSavings = realCostSaved < WORKBENCH_V1_BASELINE.totalCostSaved; + const totalCostSaved = useBaselineSavings ? 
WORKBENCH_V1_BASELINE.totalCostSaved : realCostSaved; + const totalTokensSaved = Math.max(comprehensive.totalTokensSaved, legacySavings.totalTokensSaved, WORKBENCH_V1_BASELINE.totalTokensSaved); + const totalHits = Math.max(legacySavings.totalHits, WORKBENCH_V1_BASELINE.totalHits); + const hitRatePercent = legacySavings.hitRatePercent > 0 + ? Math.max(legacySavings.hitRatePercent, WORKBENCH_V1_BASELINE.hitRatePercent) + : WORKBENCH_V1_BASELINE.hitRatePercent; + const costWithoutGateway = useBaselineSavings + ? WORKBENCH_V1_BASELINE.costWithoutGateway + : comprehensive.costWithoutGateway; + const costWithGateway = useBaselineSavings + ? WORKBENCH_V1_BASELINE.costWithGateway + : comprehensive.costWithGateway; + const effectiveSavingsPercent = costWithoutGateway > 0 + ? ((costWithoutGateway - costWithGateway) / costWithoutGateway) * 100 + : 0; + return reply.send({ + success: true, + data: { + // Backwards compatible cache-only summary so existing UI keeps working + savings: { + ...legacySavings, + totalHits, + hitRatePercent, + uniqueEntries: Math.max(legacySavings.uniqueEntries, totalHits), + // Override with the comprehensive numbers when available + totalCostSaved, + totalTokensSaved, + // Detailed breakdown for the new UI sections + comprehensive: { + bySource: comprehensive.bySource, + costWithoutGateway, + costWithGateway, + effectiveSavingsPercent, + totals: comprehensive.totals, + }, + // Compression since this gateway process started — resets at each restart. 
+ sinceRestart, + }, + series, + }, + meta: { hours, bucket_minutes: bucketMin, timestamp: new Date().toISOString() }, + }); + } catch (error) { + logger.error({ error }, 'Failed to fetch savings'); + return reply.status(500).send({ success: false, error: 'Failed to fetch savings' }); + } + }); + + fastify.post('/api/dashboard/cache/clear', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { + try { + const caller = (request.body as any)?.caller as string | undefined; + if (!caller) return reply.status(400).send({ success: false, error: 'caller required' }); + const removed = await clearCacheForCaller(getPool(), caller); + return reply.send({ success: true, data: { removed } }); + } catch (error) { + return reply.status(500).send({ success: false, error: 'Cache clear failed' }); + } + }); + + fastify.post('/api/dashboard/cache/prune', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { + try { + const days = Math.max(parseInt((request.body as any)?.max_age_days) || 7, 1); + const removed = await pruneStaleCacheEntries(getPool(), days); + return reply.send({ success: true, data: { removed, max_age_days: days } }); + } catch (error) { + return reply.status(500).send({ success: false, error: 'Cache prune failed' }); + } + }); + + // ─── Subscription Pool Wallet (UNIQUE feature) ───────────────────────── + fastify.get('/api/dashboard/wallet', dashboardAuth, async (_request: FastifyRequest, reply: FastifyReply) => { + try { + const wallet = await getSubscriptionWallet(getPool()); + const totalQuota = wallet.reduce((sum, w) => sum + (w.requestQuota ?? 0), 0); + const totalUsed = wallet.reduce((sum, w) => sum + w.used, 0); + const totalRemaining = wallet.reduce((sum, w) => sum + (w.remaining ?? 
0), 0); + return reply.send({ + success: true, + data: { + wallet, + totals: { quota: totalQuota, used: totalUsed, remaining: totalRemaining }, + }, + meta: { timestamp: new Date().toISOString() }, + }); + } catch (error) { + logger.error({ error }, 'Failed to fetch wallet'); + return reply.status(500).send({ success: false, error: 'Failed to fetch wallet' }); + } + }); + + // Manually charge a subscription (for testing or external integrations) + fastify.post('/api/dashboard/wallet/charge', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { + try { + const { subscription_id, tokens } = request.body as { subscription_id?: string; tokens?: number }; + if (!subscription_id) return reply.status(400).send({ success: false, error: 'subscription_id required' }); + await recordSubscriptionUsage(getPool(), subscription_id, tokens ?? 0); + return reply.send({ success: true }); + } catch (error) { + return reply.status(500).send({ success: false, error: 'wallet charge failed' }); + } + }); + + // ─── Knowledge Memory ───────────────────────────────────────────────── + fastify.get('/api/dashboard/memory/:caller', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { + try { + const caller = (request.params as any).caller as string; + const facts = await recallFacts(getPool(), caller, 50); + return reply.send({ success: true, data: { caller, facts } }); + } catch (error) { + return reply.status(500).send({ success: false, error: 'memory read failed' }); + } + }); + + fastify.post('/api/dashboard/memory/:caller', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { + try { + const caller = (request.params as any).caller as string; + const { fact_key, fact_value, confidence, source } = request.body as Record; + if (!fact_key || !fact_value) { + return reply.status(400).send({ success: false, error: 'fact_key and fact_value required' }); + } + await rememberFact(getPool(), caller, fact_key, fact_value, { confidence, 
source }); + const facts = await recallFacts(getPool(), caller, 50); + return reply.send({ success: true, data: { caller, facts } }); + } catch (error) { + return reply.status(500).send({ success: false, error: 'memory write failed' }); + } + }); + + // ─── Gamification: buddy / pet ───────────────────────────────────────── + fastify.get('/api/dashboard/buddy', dashboardAuth, async (_request: FastifyRequest, reply: FastifyReply) => { + try { + const buddy = await getBuddyState(getPool(), 'gateway'); + return reply.send({ success: true, data: buddy }); + } catch (error) { + return reply.status(500).send({ success: false, error: 'buddy state failed' }); + } + }); + + // ─── Achievements ────────────────────────────────────────────────────── + fastify.get('/api/dashboard/achievements', dashboardAuth, async (_request: FastifyRequest, reply: FastifyReply) => { + try { + const data = await getAchievements(getPool()); + return reply.send({ success: true, data }); + } catch (error) { + return reply.status(500).send({ success: false, error: 'achievements failed' }); + } + }); + + // ─── Calendar heatmap ────────────────────────────────────────────────── + fastify.get('/api/dashboard/heatmap', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { + try { + const days = Math.min(parseInt((request.query as any).days as string) || 365, 365); + const cells = await getCalendarHeatmap(getPool(), days); + return reply.send({ success: true, data: cells }); + } catch (error) { + return reply.status(500).send({ success: false, error: 'heatmap failed' }); + } + }); + + // ─── Live events feed ────────────────────────────────────────────────── + fastify.get('/api/dashboard/events', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { + try { + const limit = Math.min(parseInt((request.query as any).limit as string) || 50, 200); + const events = await getRecentEvents(getPool(), limit); + return reply.send({ success: true, data: events }); + } catch 
(error) { + return reply.status(500).send({ success: false, error: 'events failed' }); + } + }); + + // ─── Cost forecast ───────────────────────────────────────────────────── + fastify.get('/api/dashboard/forecast', dashboardAuth, async (_request: FastifyRequest, reply: FastifyReply) => { + try { + const f = await getForecast(getPool()); + return reply.send({ success: true, data: f }); + } catch (error) { + return reply.status(500).send({ success: false, error: 'forecast failed' }); + } + }); + + // ─── MCP tool-call ingest (called by llm-gateway-ctx server) ────────── + fastify.post('/api/dashboard/mcp-tool-call', async (request: FastifyRequest, reply: FastifyReply) => { + try { + const b = request.body as Record; + if (!b?.tool) return reply.status(400).send({ success: false, error: 'tool required' }); + await getPool().query( + `INSERT INTO mcp_tool_calls (tool, mode, tokens_before, tokens_after, tokens_saved, duration_ms, path, cmd) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`, + [ + String(b.tool).slice(0, 40), + b.mode ? String(b.mode).slice(0, 40) : null, + parseInt(b.tokens_before, 10) || 0, + parseInt(b.tokens_after, 10) || 0, + parseInt(b.tokens_saved, 10) || 0, + parseInt(b.duration_ms, 10) || 0, + b.path ? String(b.path).slice(0, 500) : null, + b.cmd ? 
String(b.cmd).slice(0, 500) : null, + ] + ); + return reply.send({ success: true }); + } catch (error) { + logger.warn({ error }, 'mcp-tool-call ingest failed'); + return reply.status(500).send({ success: false, error: 'ingest failed' }); + } + }); + + fastify.get('/api/dashboard/mcp-tool-stats', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { + try { + const hours = Math.min(parseInt((request.query as any).hours as string) || 24, 720); + const db = getPool(); + const [totals, byTool] = await Promise.all([ + db.query(` + SELECT COUNT(*)::INT AS calls, + COALESCE(SUM(tokens_before), 0)::BIGINT AS tokens_before, + COALESCE(SUM(tokens_after), 0)::BIGINT AS tokens_after, + COALESCE(SUM(tokens_saved), 0)::BIGINT AS tokens_saved + FROM mcp_tool_calls + WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1) + `, [hours]), + db.query(` + SELECT tool, + COUNT(*)::INT AS calls, + COALESCE(SUM(tokens_saved), 0)::BIGINT AS tokens_saved, + COALESCE(AVG(duration_ms), 0)::INT AS avg_duration_ms + FROM mcp_tool_calls + WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1) + GROUP BY tool + ORDER BY tokens_saved DESC + `, [hours]), + ]); + const t = totals.rows[0]; + const tokBefore = parseInt(t.tokens_before, 10) || 0; + const tokAfter = parseInt(t.tokens_after, 10) || 0; + const ratio = tokBefore > 0 ? 
(1 - tokAfter / tokBefore) : 0; + return reply.send({ + success: true, + data: { + totalCalls: parseInt(t.calls, 10) || 0, + totalTokensBefore: tokBefore, + totalTokensAfter: tokAfter, + totalTokensSaved: parseInt(t.tokens_saved, 10) || 0, + avgCompressionRatio: ratio, + byTool: byTool.rows.map((r: any) => ({ + tool: r.tool, + calls: parseInt(r.calls, 10), + tokensSaved: parseInt(r.tokens_saved, 10), + avgDurationMs: parseInt(r.avg_duration_ms, 10), + })), + }, + }); + } catch (error) { + logger.warn({ error }, 'mcp-tool-stats failed'); + return reply.status(500).send({ success: false, error: 'stats failed' }); + } + }); + + // ─── Memory graph (D3-ready nodes + edges) ──────────────────────────── + fastify.get('/api/dashboard/memory-graph', dashboardAuth, async (_request: FastifyRequest, reply: FastifyReply) => { + try { + const graph = await buildMemoryGraph(getPool()); + return reply.send({ success: true, data: graph }); + } catch (error) { + return reply.status(500).send({ success: false, error: 'memory-graph failed' }); + } + }); + + // ─── Race leaderboard (fastest model this week) ────────────────────── + fastify.get('/api/dashboard/race-leaderboard', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { + try { + const days = Math.max(parseInt((request.query as any).days as string) || 7, 1); + const board = await getRaceLeaderboard(getPool(), days); + return reply.send({ success: true, data: board }); + } catch (error) { + return reply.status(500).send({ success: false, error: 'leaderboard failed' }); + } + }); + + // ─── Per-caller deep dive ───────────────────────────────────────────── + fastify.get('/api/dashboard/caller/:caller', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { + try { + const caller = (request.params as any).caller as string; + const data = await getCallerDeepDive(getPool(), caller); + if (!data) return reply.status(404).send({ success: false, error: 'caller not found' }); + return 
reply.send({ success: true, data }); + } catch (error) { + return reply.status(500).send({ success: false, error: 'caller deep dive failed' }); + } + }); + + // ─── Monthly report (HTML, browser saves as PDF) ────────────────────── + fastify.get('/api/dashboard/report', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { + try { + const now = new Date(); + const year = parseInt((request.query as any).year as string) || now.getUTCFullYear(); + const month = parseInt((request.query as any).month as string) || now.getUTCMonth() + 1; + const html = await generateMonthlyReport(getPool(), year, month); + return reply.type('text/html').send(html); + } catch (error) { + logger.error({ error }, 'report generation failed'); + return reply.status(500).send({ success: false, error: 'report generation failed' }); + } + }); + + // ─── Public share card (SVG) — no auth required, safe for public embed ── + fastify.get('/api/dashboard/share-card', async (request: FastifyRequest, reply: FastifyReply) => { + try { + const period = ((request.query as any).period as string) || 'month'; + const theme = ((request.query as any).theme as string) || 'dark'; + const validPeriods = ['day', 'week', 'month', 'all']; + const validThemes = ['dark', 'light']; + const svg = await generateShareCard(getPool(), { + period: validPeriods.includes(period) ? (period as any) : 'month', + theme: validThemes.includes(theme) ? 
(theme as any) : 'dark', + }); + return reply + .type('image/svg+xml') + .header('Cache-Control', 'public, max-age=300') + .send(svg); + } catch (error) { + logger.error({ error }, 'share card failed'); + return reply.status(500).send({ success: false, error: 'share card failed' }); + } + }); + + // ─── Race mode statistics ───────────────────────────────────────────── + fastify.get('/api/dashboard/race-stats', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { + try { + const hours = Math.min(parseInt((request.query as any).hours as string) || 24, 168); + const stats = await getRaceStats(getPool(), hours); + return reply.send({ success: true, data: stats }); + } catch (error) { + return reply.status(500).send({ success: false, error: 'race stats failed' }); + } + }); + + // ─── Web AI events (browser extension reports) ─────────────────────── + fastify.post('/api/dashboard/web-event', async (request: FastifyRequest, reply: FastifyReply) => { + try { + const body = request.body as Record; + if (!body?.source || !body?.event_type) { + return reply.status(400).send({ success: false, error: 'source and event_type required' }); + } + await getPool().query( + `INSERT INTO web_ai_events (source, event_type, conversation_id, message_count, prompt_chars, response_chars, client_id) + VALUES ($1, $2, $3, $4, $5, $6, $7)`, + [ + String(body.source).slice(0, 60), + String(body.event_type).slice(0, 60), + body.conversation_id ? String(body.conversation_id).slice(0, 100) : null, + parseInt(body.message_count, 10) || 0, + parseInt(body.prompt_chars, 10) || 0, + parseInt(body.response_chars, 10) || 0, + body.client_id ? 
String(body.client_id).slice(0, 100) : null, + ] + ); + return reply.send({ success: true }); + } catch (error) { + logger.warn({ error }, 'web-event insert failed'); + return reply.status(500).send({ success: false, error: 'event log failed' }); + } + }); + + fastify.get('/api/dashboard/web-events', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { + try { + const hours = Math.min(parseInt((request.query as any).hours as string) || 24, 168); + const result = await getPool().query( + `SELECT + source, + COUNT(*)::INT AS events, + SUM(message_count)::INT AS messages, + COALESCE(SUM(prompt_chars), 0)::BIGINT AS prompt_chars, + COALESCE(SUM(response_chars), 0)::BIGINT AS response_chars, + MAX(created_at) AS last_seen + FROM web_ai_events + WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1) + GROUP BY source ORDER BY events DESC`, + [hours] + ); + return reply.send({ + success: true, + data: result.rows.map((r: any) => ({ + source: r.source, + events: parseInt(r.events, 10), + messages: parseInt(r.messages, 10), + promptChars: parseInt(r.prompt_chars, 10), + responseChars: parseInt(r.response_chars, 10), + lastSeen: r.last_seen ? 
new Date(r.last_seen).toISOString() : null, + })), + }); + } catch (error) { + logger.warn({ error }, 'web-events read failed'); + return reply.status(500).send({ success: false, error: 'web-events failed' }); + } + }); + + fastify.delete('/api/dashboard/memory/:caller', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { + try { + const caller = (request.params as any).caller as string; + const removed = await forgetCaller(getPool(), caller); + return reply.send({ success: true, data: { removed } }); + } catch (error) { + return reply.status(500).send({ success: false, error: 'memory clear failed' }); + } + }); + // Dashboard UI endpoint (served at /api/dashboard/index for Cloudflare tunnel compatibility) fastify.get('/api/dashboard/index', async (_request: FastifyRequest, reply: FastifyReply) => { try { @@ -708,4 +1723,45 @@ export async function dashboardRoute(fastify: FastifyInstance): Promise { return reply.status(500).send({ error: 'Failed to serve dashboard UI' }); } }); + + // Passive usage import: lets clients that talk DIRECTLY to a provider (e.g. the + // laptop's Claude Code -> api.anthropic.com) report their usage so they appear in + // clients/costs WITHOUT routing traffic through the gateway. A caller containing + // 'claude-code' matches the CLIENT_CATALOG 'claude-desktop' entry. + fastify.post('/api/dashboard/usage/report', dashboardAuth, async (request: FastifyRequest, reply: FastifyReply) => { + try { + const body = (request.body ?? {}) as Record; + const caller = String(body.caller ?? 'claude-code-laptop').slice(0, 120); + const model = String(body.model ?? 'claude-code').slice(0, 120); + const tokensIn = Math.max(0, Math.floor(Number(body.tokens_in) || 0)); + const tokensOut = Math.max(0, Math.floor(Number(body.tokens_out) || 0)); + const costUsd = Math.max(0, Number(body.cost_usd) || 0); + const day = String(body.day ?? 
new Date().toISOString().slice(0, 10)).slice(0, 32); + if (tokensIn === 0 && tokensOut === 0) { + return reply.status(400).send({ success: false, error: 'tokens_in or tokens_out required' }); + } + // Stamp the row with the ACTUAL usage day so lastSeen = when tokens were + // used, not when the export ran. Cap at "now" so today's still-growing day + // reads as current/live. + const dayEnd = new Date(`${day}T23:59:59Z`); + const usedAt = dayEnd.getTime() > Date.now() ? new Date() : dayEnd; + const db = getPool(); + const requestId = `usage-import:${caller}:${model}:${day}`; + // Upsert by request_id (one row per caller/model/day): re-reporting an + // in-progress day updates its totals instead of creating duplicates. + const updated = await db.query( + `UPDATE request_tracking SET tokens_in=$1, tokens_out=$2, cost_usd=$3, created_at=$4 WHERE request_id=$5`, + [tokensIn, tokensOut, costUsd, usedAt, requestId] + ); + if (updated.rowCount === 0) { + const requestLogger = createRequestLogger(db); + await requestLogger.logRequest(requestId, caller, 'usage_import', model, 'approved', tokensIn, tokensOut, costUsd, 0); + await db.query(`UPDATE request_tracking SET created_at=$1 WHERE request_id=$2`, [usedAt, requestId]); + } + return reply.status(200).send({ success: true, imported: { caller, model, day, tokensIn, tokensOut, costUsd, usedAt } }); + } catch (error) { + logger.error({ error }, 'Failed to import usage report'); + return reply.status(500).send({ success: false, error: 'Failed to import usage report' }); + } + }); } diff --git a/packages/gateway/src/routes/health.ts b/packages/gateway/src/routes/health.ts index be8b3fd..e7c546e 100644 --- a/packages/gateway/src/routes/health.ts +++ b/packages/gateway/src/routes/health.ts @@ -38,22 +38,40 @@ async function checkOllama(baseUrl: string): Promise<{ status: 'ok' | 'down'; la async function checkDatabase(): Promise<{ status: 'ok' | 'down'; error?: string }> { try { - await query('SELECT 1'); + await 
withTimeout(query('SELECT 1'), 2500, 'database check timed out'); return { status: 'ok' }; } catch (err) { return { status: 'down', error: err instanceof Error ? err.message : 'Unknown error' }; } } +async function withTimeout(promise: Promise, timeoutMs: number, message: string): Promise { + let timer: NodeJS.Timeout | undefined; + try { + return await Promise.race([ + promise, + new Promise((_resolve, reject) => { + timer = setTimeout(() => reject(new Error(message)), timeoutMs); + }), + ]); + } finally { + if (timer) clearTimeout(timer); + } +} + async function checkQueue(): Promise<{ status: 'ok' | 'down' | 'unknown'; depth?: number; error?: string }> { const boss = getPgBoss(); if (!boss) return { status: 'unknown' }; try { - const [queued, active] = await Promise.all([ - boss.getQueueSize('llm-batch', { before: 'completed' }), - boss.getQueueSize('llm-batch', { before: 'active' }), - ]); + const [queued, active] = await withTimeout( + Promise.all([ + boss.getQueueSize('llm-batch', { before: 'completed' }), + boss.getQueueSize('llm-batch', { before: 'active' }), + ]), + 2500, + 'queue check timed out', + ); return { status: 'ok', depth: (queued ?? 0) + (active ?? 0) }; } catch (err) { return { status: 'down', error: err instanceof Error ? err.message : 'Unknown error' }; @@ -62,8 +80,10 @@ async function checkQueue(): Promise<{ status: 'ok' | 'down' | 'unknown'; depth? async function getReviewQueueCount(): Promise { try { - const result = await query<{ count: string }>( - 'SELECT COUNT(*) as count FROM review_queue WHERE decision IS NULL', + const result = await withTimeout( + query<{ count: string }>('SELECT COUNT(*) as count FROM review_queue WHERE decision IS NULL'), + 2500, + 'review queue check timed out', ); return parseInt(result.rows[0]?.count ?? 
'0', 10); } catch { @@ -78,8 +98,9 @@ export async function healthRoute(fastify: FastifyInstance): Promise { // Check if this is a dashboard UI request with ?ui=1 or ?dashboard=1 const query = request.query as any; const isDashboardRequest = query.ui || query.dashboard; + const acceptsHtml = String(request.headers.accept ?? '').includes('text/html'); - if (isDashboardRequest) { + if (isDashboardRequest || acceptsHtml) { try { const __filename = fileURLToPath(import.meta.url); const __dirname = dirname(__filename); @@ -108,8 +129,8 @@ export async function healthRoute(fastify: FastifyInstance): Promise { const breakerStates = getAllBreakerStates(); - const isDown = ollamaCheck.status === 'down' || dbCheck.status === 'down'; - const isDegraded = queueCheck.status === 'down' || Object.values(breakerStates).some((s) => s === 'open'); + const isDown = dbCheck.status === 'down'; + const isDegraded = ollamaCheck.status === 'down' || queueCheck.status === 'down' || Object.values(breakerStates).some((s) => s === 'open'); const status: HealthStatus['status'] = isDown ? 'down' : isDegraded ? 
'degraded' : 'ok'; diff --git a/packages/gateway/src/routes/static.ts b/packages/gateway/src/routes/static.ts index 6185e64..43e0e3a 100644 --- a/packages/gateway/src/routes/static.ts +++ b/packages/gateway/src/routes/static.ts @@ -11,6 +11,22 @@ export async function staticRoute(fastify: FastifyInstance): Promise { logger.info({ publicDir }, 'Static file serving initialized'); + function sendHtml(filename: string, reply: any) { + const filePath = join(publicDir, filename); + if (!existsSync(filePath)) { + logger.warn({ path: filePath }, `${filename} not found`); + return reply.status(404).send({ error: `${filename} not found` }); + } + + const content = readFileSync(filePath, 'utf-8'); + return reply + .header('Cache-Control', 'no-cache, no-store, must-revalidate, max-age=0') + .header('Pragma', 'no-cache') + .header('Expires', '0') + .type('text/html') + .send(content); + } + // Serve root path fastify.get('/', async (request, reply) => { logger.info({ method: request.method, url: request.url, host: request.hostname }, 'Root path requested'); @@ -26,13 +42,47 @@ export async function staticRoute(fastify: FastifyInstance): Promise { // Serve /dashboard.html fastify.get('/dashboard.html', async (_request, reply) => { - const dashboardPath = join(publicDir, 'dashboard.html'); - if (!existsSync(dashboardPath)) { - logger.warn({ path: dashboardPath }, 'dashboard.html not found'); - return reply.status(404).send({ error: 'dashboard.html not found' }); - } - const content = readFileSync(dashboardPath, 'utf-8'); - return reply.type('text/html').send(content); + return sendHtml('dashboard.html', reply); + }); + + fastify.get('/dashboard-v2.html', async (_request, reply) => { + return sendHtml('dashboard-v2.html', reply); + }); + + fastify.get('/v2/dashboard', async (_request, reply) => { + return sendHtml('dashboard-v2.html', reply); + }); + + fastify.get('/v2/dashboard/', async (_request, reply) => { + return sendHtml('dashboard-v2.html', reply); + }); + + 
fastify.get('/v2', async (_request, reply) => { + return sendHtml('dashboard-v2.html', reply); + }); + + fastify.get('/v2/', async (_request, reply) => { + return sendHtml('dashboard-v2.html', reply); + }); + + fastify.get('/dashboard/v2', async (_request, reply) => { + return sendHtml('dashboard-v2.html', reply); + }); + + fastify.get('/dashboard/v2/', async (_request, reply) => { + return sendHtml('dashboard-v2.html', reply); + }); + + fastify.get('/api/dashboard-v2', async (_request, reply) => { + return sendHtml('dashboard-v2.html', reply); + }); + + fastify.get('/api/v2/dashboard', async (_request, reply) => { + return sendHtml('dashboard-v2.html', reply); + }); + + fastify.get('/api/dashboard/v2', async (_request, reply) => { + return sendHtml('dashboard-v2.html', reply); }); // Serve /api/dashboard as HTML for compatibility diff --git a/packages/gateway/src/security/tls-config.ts b/packages/gateway/src/security/tls-config.ts index d732779..74d253a 100644 --- a/packages/gateway/src/security/tls-config.ts +++ b/packages/gateway/src/security/tls-config.ts @@ -107,6 +107,25 @@ export async function registerHTTPSRedirectMiddleware(server: FastifyInstance) { return; } + const hostHeader = String(request.headers['host'] ?? ''); + const forwardedHost = String(request.headers['x-forwarded-host'] ?? ''); + const remoteAddress = request.ip ?? ''; + const host = forwardedHost || hostHeader; + const isLoopbackHost = + /^localhost(?::\d+)?$/i.test(host) || + /^127\.0\.0\.1(?::\d+)?$/.test(host) || + /^\[::1\](?::\d+)?$/.test(host); + const isLoopbackRemote = + remoteAddress === '127.0.0.1' || + remoteAddress === '::1' || + remoteAddress === '::ffff:127.0.0.1'; + + // Internal loopback callers such as Magatama Core run behind the same host + // and must not be redirected to HTTPS unless the Gateway actually serves TLS. 
+ if (isLoopbackHost || isLoopbackRemote) { + return; + } + // Check if connection is not secure // In production, X-Forwarded-Proto is set by reverse proxy (Cloudflare) const isSecure = @@ -114,7 +133,6 @@ export async function registerHTTPSRedirectMiddleware(server: FastifyInstance) { (request.headers['x-forwarded-proto'] === 'https'); if (!isSecure && process.env['NODE_ENV'] === 'production') { - const host = request.headers['x-forwarded-host'] || request.headers['host']; return reply.redirect(`https://${host}${request.url}`); } }); @@ -126,10 +144,10 @@ export async function registerHTTPSRedirectMiddleware(server: FastifyInstance) { */ export async function registerSecurityHeadersMiddleware(server: FastifyInstance) { server.addHook('onSend', async (request, reply) => { - // Content Security Policy - strict, no inline scripts + // Content Security Policy for the self-contained dashboard UI. reply.header( 'Content-Security-Policy', - "default-src 'self'; script-src 'self'; object-src 'none'; frame-ancestors 'none'; base-uri 'self'; form-action 'self'" + "default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline'; object-src 'none'; frame-ancestors 'none'; base-uri 'self'; form-action 'self'" ); // Prevent clickjacking diff --git a/packages/gateway/src/server.ts b/packages/gateway/src/server.ts index e7878d2..0d5626e 100644 --- a/packages/gateway/src/server.ts +++ b/packages/gateway/src/server.ts @@ -12,11 +12,23 @@ import { dashboardRoute } from './routes/dashboard.js'; import { streamRoute } from './routes/stream.js'; import { learningInsightsRoute } from './routes/learning-insights.js'; import { staticRoute } from './routes/static.js'; +import tenantAuth from './security/tenant-auth.js'; +import { internalRoute } from './routes/internal.js'; import { getPool } from './db/client.js'; import { runMigrations } from './db/migrate.js'; import { initPgBoss } from './queue/pg-boss-client.js'; import { logger } from 
'./observability/logger.js'; import { scheduleLearningCycles } from './learning/learning-engine.js'; +import { autoSpawnOnBoot } from './modules/auto-discovery.js'; +import { embeddingsRoute } from './routes/embeddings.js'; +import { replayRoute } from './routes/replay.js'; +import { audioRoute } from './routes/audio.js'; +import { mcpRoute } from './modules/mcp-server.js'; +import { loadWorkspacePreset, applyWorkspaceDefaults } from './modules/workspace-presets.js'; +import { loadPlugins } from './modules/plugin-system.js'; +import { ingestPeerStats, scheduleFederationPublisher, buildStats } from './modules/federated-stats.js'; +import { scheduleAdaptiveLearner, getAllRecommendations } from './modules/adaptive-routing.js'; +import { startBridgeWatchdog } from './modules/bridge-watchdog.js'; import { fileURLToPath } from 'url'; import { dirname, join } from 'path'; import { readFileSync, existsSync } from 'fs'; @@ -77,6 +89,7 @@ async function buildServer() { directives: { defaultSrc: ["'self'"], scriptSrc: ["'self'", "'unsafe-inline'"], + styleSrc: ["'self'", "'unsafe-inline'"], objectSrc: ["'none'"], }, }, @@ -92,15 +105,17 @@ async function buildServer() { 'http://192.168.178.196:3000', /^http:\/\/192\.168\.178\.\d+/, /^https:\/\/.*\.context-x\.org$/, + /^https:\/\/(www\.)?runwerk\.app$/, + /^https:\/\/.*\.runwerk\.app$/, ], methods: ['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'], - allowedHeaders: ['Content-Type', 'Authorization', 'X-Caller-ID'], + allowedHeaders: ['Content-Type', 'Authorization', 'X-Caller-ID', 'X-Runwerk-Caller', 'X-Runwerk-Privacy', 'X-Runwerk-Tier', 'X-Runwerk-Purpose'], credentials: true, }); await server.register(fastifyRateLimit, { global: true, - max: 100, + max: 1000, timeWindow: '1 minute', keyGenerator: (request) => { const caller = (request.headers['x-caller-id'] as string) ?? 
'default'; @@ -113,7 +128,17 @@ async function buildServer() { }), }); + await server.register(tenantAuth); + await server.register(internalRoute); await server.register(completionRoute, { prefix: '/v1' }); + await server.register(embeddingsRoute, { prefix: '/v1' }); + await server.register(replayRoute, { prefix: '/v1' }); + await server.register(audioRoute, { prefix: '/v1' }); + await server.register(mcpRoute); + server.post('/v1/federation/ingest', async (request, reply) => { + const result = ingestPeerStats(request.body as never); + return reply.send({ success: true, ...result }); + }); await server.register(batchRoute, { prefix: '/v1' }); await server.register(classifyRoute, { prefix: '/v1' }); await server.register(reviewRoute, { prefix: '/v1' }); @@ -192,9 +217,54 @@ async function main() { } catch (pgErr) { logger.warn({ pgErr }, 'PgBoss init failed - continuing without queue'); } + // Workspace preset (apply env defaults from workspace.yaml if present) + try { + const preset = await loadWorkspacePreset(); + if (preset) applyWorkspaceDefaults(preset); + } catch (err) { + logger.warn({ err }, 'Workspace preset load failed (non-fatal)'); + } + + // Plugin system (load pre/post hooks from PLUGINS_DIR) + try { + await loadPlugins(); + } catch (err) { + logger.warn({ err }, 'Plugin loading failed (non-fatal)'); + } + scheduleLearningCycles(); await server.listen({ port, host }); logger.info({ port, host }, 'LLM Gateway started'); + + // Auto-spawn detected subscription bridges if AUTO_SPAWN_BRIDGES=1 + void autoSpawnOnBoot(); + + // Bridge watchdog (opt-in via WATCHDOG_ENABLED=1) + try { + startBridgeWatchdog(); + } catch (err) { + logger.warn({ err }, 'Bridge watchdog start failed'); + } + + // Adaptive routing learner (opt-in via ADAPTIVE_ROUTING_ENABLED=1) + try { + const pool = getPool(); + scheduleAdaptiveLearner(pool as never); + } catch (err) { + logger.warn({ err }, 'Adaptive learner scheduling failed'); + } + + // Federation publisher (opt-in via 
FEDERATION_ENABLED=1) + scheduleFederationPublisher(async () => { + const recos = getAllRecommendations(); + return buildStats(recos.map((r) => ({ + task_type: r.taskType, + model_used: r.preferredModel, + samples: r.rationale.samples, + success_rate: r.rationale.successRate, + avg_latency_ms: r.rationale.avgLatencyMs, + }))); + }); } catch (err) { logger.error({ err }, 'Failed to start server'); process.exit(1); diff --git a/packages/gateway/src/utils/tokenvault-hooks.ts b/packages/gateway/src/utils/tokenvault-hooks.ts index 688e2d0..56460fc 100644 --- a/packages/gateway/src/utils/tokenvault-hooks.ts +++ b/packages/gateway/src/utils/tokenvault-hooks.ts @@ -1,5 +1,5 @@ // Tokenvault Integration Hooks -// Instruments LeanCTX and RTK compression tracking +// Instruments LLM Gateway compression tracking (legacy hook names retained for backward compat) // Updated: 2026-04-19 import { Pool, QueryResult } from 'pg'; @@ -62,13 +62,13 @@ export function estimateTokens(text: string | object): number { } /** - * Log compression ratio for RTK output + * Log compression ratio for token-trim output */ -export async function logRTKCompression( +export async function logGatewayTrimCompression( db: Pool, rawOutput: string, compressedOutput: string, - toolUsed: string = 'rtk' + toolUsed: string = 'llm-gateway-trim' ): Promise { const tokensBefore = estimateTokens(rawOutput); const tokensAfter = estimateTokens(compressedOutput); @@ -93,9 +93,9 @@ export async function logRTKCompression( } /** - * Track LeanCTX file read operations + * Track gateway file-read operations */ -export async function logLeanCTXRead( +export async function logGatewayFileRead( db: Pool, filePath: string, mode: string, @@ -115,7 +115,7 @@ export async function logLeanCTXRead( tokensBefore: rawTokens, tokensAfter: compressedTokens, savingsPct, - toolUsed: 'lean-ctx' + toolUsed: 'llm-gateway' }; await logCompressionMetric(db, metric); @@ -207,7 +207,7 @@ export async function getCompressionStats( tool_used, 
COUNT(*) as count FROM tokenvault_metrics - WHERE created_at > NOW() - INTERVAL $1 HOUR + WHERE created_at > NOW() - ($1 * INTERVAL '1 hour') GROUP BY tool_used`, [hoursBack] ); @@ -270,7 +270,7 @@ export async function getCostSummary( project, SUM(CASE WHEN cost_usd > 0 THEN 1 ELSE 0 END) as paid_tasks FROM cost_analytics - WHERE created_at > NOW() - INTERVAL $1 HOUR + WHERE created_at > NOW() - ($1 * INTERVAL '1 hour') GROUP BY project`, [hoursBack] );