/**
 * Savings Calculator
 *
 * Comprehensive savings accounting across ALL gateway mechanisms — not just
 * cache hits. Lean-CTX measures file-context compression; we measure five
 * orthogonal sources of value, reported as `ComprehensiveSavings.bySource`:
 *
 *   1. cache              — response cache hits (exact + semantic match)
 *   2. compression        — compression pipeline (verbatim_compact, etc.)
 *                           plus MCP tool-call compression
 *   3. subscriptionBridge — calls served via a flat-rate subscription vs.
 *                           what they would have cost via paid API
 *   4. localRouting       — calls served by a local model, valued against a
 *                           paid reference model
 *   5. raceMode           — cost of the race candidates that were not selected
 *
 * The dashboard now surfaces all five so the savings counter reflects the
 * gateway's true value rather than only cache hits.
 */

import type { Pool } from 'pg';
import { logger } from '../observability/logger.js';

/** USD price per 1k tokens, split by direction. */
interface ModelPrice {
  readonly in: number;
  readonly out: number;
}

// Conservative API pricing snapshot (USD per 1k tokens). Used to compute
// "what would this have cost via direct API". Update as pricing evolves.
const API_PRICING = {
  // Anthropic
  'claude-opus-4-1': { in: 0.015, out: 0.075 },
  'claude-sonnet-4-1': { in: 0.003, out: 0.015 },
  'claude-haiku-3': { in: 0.00025, out: 0.00125 },
  // OpenAI
  'gpt-5.1-codex': { in: 0.005, out: 0.020 },
  'gpt-5.1-codex-mini': { in: 0.0015, out: 0.006 },
  'gpt-4-turbo': { in: 0.010, out: 0.030 },
  'gpt-4': { in: 0.030, out: 0.060 },
  'gpt-3.5-turbo': { in: 0.0005, out: 0.0015 },
  // Google
  'gemini-1.5-pro': { in: 0.00125, out: 0.005 },
  'gemini-1.5-flash': { in: 0.000075, out: 0.0003 },
} as const;

/** String-keyed view of the pricing table for lookups by runtime model name. */
const PRICE_BY_MODEL: Readonly<Record<string, ModelPrice>> = API_PRICING;

/**
 * Pricing keys ordered longest-first so that a prefix search always lands on
 * the most specific entry (`gpt-5.1-codex-mini` before `gpt-5.1-codex`).
 */
const PRICING_KEYS_LONGEST_FIRST: readonly string[] = Object.keys(API_PRICING).sort(
  (a, b) => b.length - a.length
);

/** Flat price applied to compressed-away tokens (USD per 1k tokens). */
const COMPRESSION_USD_PER_1K = 0.001;

/**
 * Blended price applied to MCP tool-call token savings (USD per 1M tokens):
 * Sonnet-equivalent pricing ($3/MTok input, $15/MTok output) weighted 60/40
 * in/out for tool returns → $7.80 per MTok, i.e. ~$0.0078 per 1k tokens.
 * NOTE(review): an earlier comment quoted ~$0.0046 per 1k, which does not
 * match these weights — confirm which figure the dashboard is meant to use.
 */
const MCP_USD_PER_MTOK = 3.0 * 0.6 + 15.0 * 0.4;

/** Models that go through a flat-rate subscription bridge → marginal cost = $0 */
const SUBSCRIPTION_MODEL_PATTERNS = [
  /^claude-/i, // Claude Code subscription
  /^gpt-5\.1-codex/i, // Codex CLI subscription
  /^gpt-(4|3\.5)/i, // ChatGPT Plus / Copilot subscription
  /^gemini-/i, // Gemini Advanced
  /^github-copilot/i, // GitHub Copilot
  /^microsoft.365/i, // M365 Copilot (the unescaped "." accepts any separator character)
];

/**
 * Find the list price for a model name.
 *
 * The name is lower-cased and matched against the pricing table by longest
 * prefix, so an exact name matches itself and a versioned name such as
 * `claude-sonnet-4-1-something` resolves to `claude-sonnet-4-1`. Only the
 * table's own keys are considered, so names like `constructor` never resolve
 * to an inherited object property.
 *
 * @param model Model identifier as recorded for the request.
 * @returns The per-1k-token price, or `null` when no table entry is a prefix.
 */
function lookupApiPrice(model: string): ModelPrice | null {
  const normalized = model.toLowerCase();
  for (const key of PRICING_KEYS_LONGEST_FIRST) {
    if (normalized.startsWith(key)) return PRICE_BY_MODEL[key] ?? null;
  }
  return null;
}

/** True when the model name matches one of the subscription-bridge patterns. */
function isSubscriptionModel(model: string): boolean {
  return SUBSCRIPTION_MODEL_PATTERNS.some((p) => p.test(model));
}

/** True when the model name starts with a known local model family prefix. */
function isLocalModel(model: string): boolean {
  return /^(qwen|llama|mistral|magatama|phi|nomic|gemma)/i.test(model);
}

/** Parse a database value as a base-10 integer, falling back to 0. */
function toInt(value: unknown): number {
  const parsed = parseInt(String(value ?? '0'), 10);
  return Number.isNaN(parsed) ? 0 : parsed;
}

/** Parse a database value as a float, falling back to 0 for NaN/non-finite. */
function toFloat(value: unknown): number {
  const parsed = parseFloat(String(value ?? '0'));
  return Number.isFinite(parsed) ? parsed : 0;
}

/** Price a request's tokens at the given per-1k-token rates. */
function priceTokens(tokensIn: number, tokensOut: number, price: ModelPrice): number {
  return (tokensIn / 1000) * price.in + (tokensOut / 1000) * price.out;
}

export interface ComprehensiveSavings {
  /** Total saved across all five mechanisms. */
  totalCostSaved: number;
  totalTokensSaved: number;
  /** Per-source breakdown for the dashboard. */
  bySource: {
    cache: { tokens: number; cost: number; hits: number };
    compression: { tokens: number; cost: number; calls: number };
    subscriptionBridge: { tokens: number; cost: number; calls: number };
    localRouting: { tokens: number; cost: number; calls: number };
    raceMode: { tokens: number; cost: number; calls: number };
  };
  /** How much you would have paid for the same volume at API list prices. */
  costWithoutGateway: number;
  /** What you actually paid (real $). */
  costWithGateway: number;
  /** Time window. */
  hoursBack: number;
  /** Inputs that gave us this number. */
  totals: { requests: number; tokensIn: number; tokensOut: number };
}

/**
 * Compute comprehensive savings across all mechanisms.
 *
 * Strategy:
 *   For each request, determine where it went and price it both ways:
 *     - "Would-be cost" = API list price for the model that handled it
 *     - "Actual cost"   = the recorded cost_usd for the request
 *     - "Saved"         = would-be − actual (never negative)
 *
 * Database errors are logged and never thrown. The compression, race-mode and
 * MCP aggregations are individually optional: a failure in one is logged at
 * debug level and the others still run. If a core query fails, whatever was
 * gathered before the failure is returned, and the headline totals are always
 * derived from the returned breakdown so the two cannot disagree.
 *
 * @param db Postgres pool used for all aggregation queries.
 * @param hoursBack Size of the look-back window in hours (default 24).
 * @returns The savings snapshot for the window.
 */
export async function getComprehensiveSavings(
  db: Pool,
  hoursBack: number = 24
): Promise<ComprehensiveSavings> {
  const savings: ComprehensiveSavings = {
    totalCostSaved: 0,
    totalTokensSaved: 0,
    bySource: {
      cache: { tokens: 0, cost: 0, hits: 0 },
      compression: { tokens: 0, cost: 0, calls: 0 },
      subscriptionBridge: { tokens: 0, cost: 0, calls: 0 },
      localRouting: { tokens: 0, cost: 0, calls: 0 },
      raceMode: { tokens: 0, cost: 0, calls: 0 },
    },
    costWithoutGateway: 0,
    costWithGateway: 0,
    hoursBack,
    totals: { requests: 0, tokensIn: 0, tokensOut: 0 },
  };
  const { bySource } = savings;

  let totalReq = 0;
  let totalIn = 0;
  let totalOut = 0;
  let withGateway = 0;
  let withoutGateway = 0;

  try {
    // 1) Cache hits
    const cacheRow = await db.query(
      `SELECT
         COALESCE(SUM(hit_count), 0)::INT AS hits,
         COALESCE(SUM(cost_saved), 0)::NUMERIC AS cost,
         COALESCE(SUM(tokens_saved), 0)::BIGINT AS tokens
       FROM response_cache
       WHERE last_hit_at > NOW() - MAKE_INTERVAL(hours => $1)`,
      [hoursBack]
    );
    bySource.cache = {
      hits: toInt(cacheRow.rows[0]?.hits),
      cost: toFloat(cacheRow.rows[0]?.cost),
      tokens: toInt(cacheRow.rows[0]?.tokens),
    };

    // 2-4) All requests in the window, classified by routing
    const reqRows = await db.query(
      `SELECT model, tokens_in, tokens_out, cost_usd, fallback_used
       FROM request_tracking
       WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1)`,
      [hoursBack]
    );

    for (const r of reqRows.rows) {
      const model = String(r.model ?? '');
      const tokensIn = toInt(r.tokens_in);
      const tokensOut = toInt(r.tokens_out);
      const actualCost = toFloat(r.cost_usd);

      totalReq += 1;
      totalIn += tokensIn;
      totalOut += tokensOut;
      withGateway += actualCost;

      // Determine "would-be cost" — what this request would have cost at API
      // list prices for the model that handled it (or its closest paid sibling).
      const apiPrice = lookupApiPrice(model);
      const local = isLocalModel(model);
      let wouldBeCost = 0;
      if (apiPrice) {
        wouldBeCost = priceTokens(tokensIn, tokensOut, apiPrice);
      } else if (local) {
        // Local model — compare against a paid reference model as opportunity cost
        wouldBeCost = priceTokens(tokensIn, tokensOut, API_PRICING['gpt-3.5-turbo']);
      }
      withoutGateway += wouldBeCost;

      // Bucket the savings into a source; subscription takes precedence over
      // local, and requests matching neither contribute to no bucket.
      const bucket = isSubscriptionModel(model)
        ? bySource.subscriptionBridge
        : local
          ? bySource.localRouting
          : null;
      if (bucket) {
        bucket.calls += 1;
        bucket.tokens += tokensIn + tokensOut;
        bucket.cost += Math.max(0, wouldBeCost - actualCost);
      }
    }

    // 5) Compression savings — pull from tokenvault_metrics if available
    try {
      const compRow = await db.query(
        `SELECT
           COUNT(*)::INT AS calls,
           COALESCE(SUM(GREATEST(tokens_before - tokens_after, 0)), 0)::BIGINT AS tokens_saved
         FROM tokenvault_metrics
         WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1)
           AND tool_used = 'gateway'`,
        [hoursBack]
      );
      const tokensCompressed = toInt(compRow.rows[0]?.tokens_saved);
      // Conservative pricing: assume average input pricing of $0.001/1k tokens
      bySource.compression = {
        calls: toInt(compRow.rows[0]?.calls),
        tokens: tokensCompressed,
        cost: (tokensCompressed / 1000) * COMPRESSION_USD_PER_1K,
      };
    } catch (err) {
      logger.debug({ err }, 'savings: compression aggregation skipped (table missing)');
    }

    // 6) Race mode — picked the faster/cheaper candidate, "saved" the loser cost
    try {
      const raceRow = await db.query(
        `SELECT
           COUNT(DISTINCT call_id)::INT AS races,
           COALESCE(SUM(cost_usd) FILTER (WHERE selected = false), 0)::NUMERIC AS not_picked_cost
         FROM race_mode_results
         WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1)`,
        [hoursBack]
      );
      bySource.raceMode = {
        calls: toInt(raceRow.rows[0]?.races),
        cost: toFloat(raceRow.rows[0]?.not_picked_cost),
        tokens: 0,
      };
    } catch (err) {
      logger.debug({ err }, 'savings: race aggregation skipped (table missing)');
    }

    // 7) MCP tool-call compression — drop-in Lean-CTX replacement
    try {
      const mcpRow = await db.query(
        `SELECT
           COUNT(*)::INT AS calls,
           COALESCE(SUM(tokens_saved), 0)::BIGINT AS tokens_saved
         FROM mcp_tool_calls
         WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1)`,
        [hoursBack]
      );
      const mcpTokens = toInt(mcpRow.rows[0]?.tokens_saved);
      const mcpCalls = toInt(mcpRow.rows[0]?.calls);
      const mcpCost = (mcpTokens / 1_000_000) * MCP_USD_PER_MTOK;

      // Folded into the compression bucket rather than a separate source.
      bySource.compression.tokens += mcpTokens;
      bySource.compression.cost += mcpCost;
      bySource.compression.calls += mcpCalls;
    } catch (err) {
      logger.debug({ err }, 'savings: mcp tool aggregation skipped (table missing)');
    }
  } catch (err) {
    logger.warn({ err }, 'savings-calculator: comprehensive computation failed');
  }

  // Derive the headline numbers from whatever was gathered, so the totals
  // always equal the sum of the breakdown even after a partial failure.
  savings.totalCostSaved =
    bySource.cache.cost +
    bySource.compression.cost +
    bySource.subscriptionBridge.cost +
    bySource.localRouting.cost +
    bySource.raceMode.cost;
  savings.totalTokensSaved = bySource.cache.tokens + bySource.compression.tokens;
  savings.costWithoutGateway = withoutGateway;
  savings.costWithGateway = withGateway;
  savings.totals = { requests: totalReq, tokensIn: totalIn, tokensOut: totalOut };

  return savings;
}