// llm-gateway/packages/gateway/src/modules/savings-calculator.ts

/**
 * Savings Calculator
 *
 * Comprehensive savings accounting across ALL gateway mechanisms — not just
 * cache hits. Lean-CTX measures file-context compression; we measure five
 * orthogonal sources of value:
 *
 *   1. Response cache (exact + semantic match)
 *   2. Compression pipeline (verbatim_compact, etc.)
 *   3. Subscription-bridge implicit savings (calls via flat-rate Pro plan
 *      vs. what they would have cost via paid API)
 *   4. Model-tier routing (cheaper model used when sufficient)
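 *   5. Pool routing (avoided quota-out on a sub by switching to alternate)
 *
 * Note: the per-source breakdown returned by getComprehensiveSavings() uses
 * the buckets `cache`, `compression`, `subscriptionBridge`, `localRouting`
 * and `raceMode`, so items 4 and 5 above do not map one-to-one onto the
 * reported buckets.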
 *
 * The dashboard now surfaces all five so the savings counter reflects the
 * gateway's true value rather than only cache hits.
 */
import type { Pool } from 'pg';
import { logger } from '../observability/logger.js';

// Conservative API pricing snapshot (USD per 1k tokens). Used to compute
// "what would this have cost via direct API". Update as pricing evolves.
//
// Each entry maps a model id to its price per 1k input tokens (`in`) and per
// 1k output tokens (`out`). Keys must stay lower-case: lookupApiPrice()
// lower-cases the requested model name before matching, and it also uses
// these keys as prefixes so that versioned names resolve to their base model.
const API_PRICING = {
  // Anthropic
  'claude-opus-4-1':       { in: 0.015,  out: 0.075 },
  'claude-sonnet-4-1':     { in: 0.003,  out: 0.015 },
  'claude-haiku-3':        { in: 0.00025, out: 0.00125 },
  // OpenAI
  'gpt-5.1-codex':         { in: 0.005,  out: 0.020 },
  'gpt-5.1-codex-mini':    { in: 0.0015, out: 0.006 },
  'gpt-4-turbo':           { in: 0.010,  out: 0.030 },
  'gpt-4':                 { in: 0.030,  out: 0.060 },
  'gpt-3.5-turbo':         { in: 0.0005, out: 0.0015 },
  // Google
  'gemini-1.5-pro':        { in: 0.00125, out: 0.005 },
  'gemini-1.5-flash':      { in: 0.000075, out: 0.0003 },
} as const;

/**
 * Models that go through a flat-rate subscription bridge → marginal cost = $0.
 *
 * Every pattern is anchored at the start of the model name and is
 * case-insensitive. isSubscriptionModel() reports a match when any single
 * pattern matches.
 */
const SUBSCRIPTION_MODEL_PATTERNS = [
  /^claude-/i,         // Claude Code subscription
  /^gpt-5\.1-codex/i,  // Codex CLI subscription
  /^gpt-(4|3\.5)/i,    // ChatGPT Plus / Copilot subscription
  /^gemini-/i,         // Gemini Advanced
  /^github-copilot/i,  // GitHub Copilot
  /^microsoft.365/i,   // M365 Copilot (the unescaped `.` matches any single character, e.g. "-" or " ")
];

/**
 * Resolve the API list price (USD per 1k tokens) for a model name.
 *
 * Matching is case-insensitive and happens in two steps:
 *   1. exact match against the keys of API_PRICING;
 *   2. otherwise the LONGEST key that is a prefix of the name, so that
 *      "gpt-5.1-codex-mini-high" resolves to "gpt-5.1-codex-mini" rather than
 *      to the shorter "gpt-5.1-codex" that happens to be listed first.
 *
 * @param model Model identifier as recorded for the request.
 * @returns The `{ in, out }` price entry, or `null` when no key matches.
 */
function lookupApiPrice(model: string): { in: number; out: number } | null {
  const table: Readonly<Record<string, { in: number; out: number }>> = API_PRICING;
  const name = model.toLowerCase();

  // Own-property check on purpose: the `in` operator would also accept
  // inherited keys such as "constructor" or "__proto__" and hand back a
  // value that is not a price entry.
  if (Object.prototype.hasOwnProperty.call(table, name)) {
    return table[name] ?? null;
  }

  // Prefix fallback (claude-sonnet-4-1-something → claude-sonnet-4-1).
  // Keep the longest matching key so the result does not depend on the
  // order in which the pricing table is written.
  let bestKey: string | null = null;
  for (const key of Object.keys(table)) {
    if (name.startsWith(key) && (bestKey === null || key.length > bestKey.length)) {
      bestKey = key;
    }
  }
  return bestKey === null ? null : (table[bestKey] ?? null);
}

/**
 * Tell whether a model name matches at least one entry of
 * SUBSCRIPTION_MODEL_PATTERNS.
 *
 * @param model Model identifier to classify.
 * @returns `true` on the first matching pattern, `false` when none match.
 */
function isSubscriptionModel(model: string): boolean {
  for (const pattern of SUBSCRIPTION_MODEL_PATTERNS) {
    if (pattern.test(model)) {
      return true;
    }
  }
  return false;
}

/**
 * Tell whether a model name starts with one of the recognised local model
 * family prefixes (matched case-insensitively).
 *
 * @param model Model identifier to classify.
 * @returns `true` when the name begins with a listed family prefix.
 */
function isLocalModel(model: string): boolean {
  const localFamilyPrefix = /^(qwen|llama|mistral|magatama|phi|nomic|gemma)/i;
  return localFamilyPrefix.exec(model) !== null;
}

/** Figures attributed to one non-cache savings source within the window. */
type SourceSavings = { tokens: number; cost: number; calls: number };

/**
 * Result of getComprehensiveSavings(): aggregate savings for a time window
 * plus the per-source breakdown shown on the dashboard. All costs are in USD.
 */
export interface ComprehensiveSavings {
  /** Total saved across all five mechanisms (sum of the `bySource` costs). */
  totalCostSaved: number;
  /** Tokens saved; counts the `cache` and `compression` sources only. */
  totalTokensSaved: number;
  /** Per-source breakdown for the dashboard. */
  bySource: {
    /** Response-cache savings; `hits` is the summed hit count. */
    cache: { tokens: number; cost: number; hits: number };
    /** Compression-pipeline savings, including MCP tool-call compression. */
    compression: SourceSavings;
    /** Requests whose model matched a subscription pattern. */
    subscriptionBridge: SourceSavings;
    /** Requests handled by a local model. */
    localRouting: SourceSavings;
    /** Race-mode savings: cost of the candidates that were not selected. `tokens` is reported as 0. */
    raceMode: SourceSavings;
  };
  /** How much you would have paid for the same volume at API list prices. */
  costWithoutGateway: number;
  /** What you actually paid (real $). */
  costWithGateway: number;
  /** Time window, in hours before now. */
  hoursBack: number;
  /** Inputs that gave us this number. */
  totals: { requests: number; tokensIn: number; tokensOut: number };
}

/** Parse a database value as a base-10 integer; null, undefined and unparseable values yield 0. */
function toIntOrZero(value: unknown): number {
  const parsed = parseInt(String(value ?? '0'), 10);
  return Number.isNaN(parsed) ? 0 : parsed;
}

/** Parse a database value as a float; null, undefined and unparseable values yield 0. */
function toFloatOrZero(value: unknown): number {
  const parsed = parseFloat(String(value ?? '0'));
  return Number.isNaN(parsed) ? 0 : parsed;
}

/** Cost in USD of `tokensIn` + `tokensOut` at a per-1k-token rate. */
function costAtRate(
  tokensIn: number,
  tokensOut: number,
  rate: { in: number; out: number }
): number {
  return (tokensIn / 1000) * rate.in + (tokensOut / 1000) * rate.out;
}

/**
 * Compute comprehensive savings across all mechanisms for the last
 * `hoursBack` hours.
 *
 * Strategy:
 *   Every `request_tracking` row in the window is priced both ways:
 *     - "Would-be cost"  = API list price for the model that handled it;
 *                          local models without a list price are valued at
 *                          the gpt-3.5-turbo rate, unknown models at $0
 *     - "Actual cost"    = the row's recorded cost_usd (missing → 0)
 *     - "Saved"          = max(0, would-be − actual), credited to the
 *                          subscription-bridge or local-routing bucket
 *   Cache, compression, race-mode and MCP tool-call figures come from their
 *   own tables. The last three are optional: a failing query is logged at
 *   debug level and that source simply contributes nothing.
 *
 * Error handling: database errors are logged and swallowed, so the function
 * resolves with whatever could be gathered. The aggregate fields are derived
 * after the queries, so `totalCostSaved` / `totalTokensSaved` always equal
 * the sum of the buckets actually filled, even when a later query failed.
 *
 * @param db        Connection pool used for every query.
 * @param hoursBack Size of the look-back window in hours (default 24). It is
 *                  bound to `MAKE_INTERVAL(hours => $1)`.
 *                  NOTE(review): that argument is an integer in PostgreSQL,
 *                  so fractional values presumably fail the query — confirm
 *                  callers pass whole hours.
 * @returns Savings totals plus the per-source breakdown for the window.
 */
export async function getComprehensiveSavings(
  db: Pool,
  hoursBack: number = 24
): Promise<ComprehensiveSavings> {
  // Starts all-zero and is filled in as each source is aggregated.
  const result: ComprehensiveSavings = {
    totalCostSaved: 0,
    totalTokensSaved: 0,
    bySource: {
      cache: { tokens: 0, cost: 0, hits: 0 },
      compression: { tokens: 0, cost: 0, calls: 0 },
      subscriptionBridge: { tokens: 0, cost: 0, calls: 0 },
      localRouting: { tokens: 0, cost: 0, calls: 0 },
      raceMode: { tokens: 0, cost: 0, calls: 0 },
    },
    costWithoutGateway: 0,
    costWithGateway: 0,
    hoursBack,
    totals: { requests: 0, tokensIn: 0, tokensOut: 0 },
  };
  const { bySource } = result;

  // Conservative pricing for compressed-away tokens: $0.001 per 1k tokens.
  const compressionUsdPer1k = 0.001;
  // Tool-call savings cost-equivalence: Sonnet-equivalent pricing
  // ($3/MTok input, $15/MTok output, weighted 60/40 in/out for tool returns)
  // → $7.80 per MTok, i.e. ~$0.0078 per 1k tokens.
  // NOTE(review): an earlier comment quoted ~$0.0046 per 1k tokens for this
  // weighting; the formula yields $0.0078 — confirm which one is intended.
  const mcpUsdPerMTok = 3.0 * 0.6 + 15.0 * 0.4;

  // Every aggregate query takes the window size as its only parameter and
  // yields a single row; values may arrive as strings, hence the parsing.
  const firstRow = async (sql: string): Promise<Record<string, unknown> | undefined> => {
    const { rows } = await db.query(sql, [hoursBack]);
    return rows[0];
  };

  let requestCount = 0;
  let tokensInTotal = 0;
  let tokensOutTotal = 0;
  let costWithGateway = 0;
  let costWithoutGateway = 0;

  try {
    // 1) Cache hits.
    // NOTE(review): rows are selected by last_hit_at but the summed columns
    // look like per-entry running counters — verify they are window-scoped.
    const cacheRow = await firstRow(
      `SELECT
         COALESCE(SUM(hit_count), 0)::INT  AS hits,
         COALESCE(SUM(cost_saved), 0)::NUMERIC AS cost,
         COALESCE(SUM(tokens_saved), 0)::BIGINT AS tokens
       FROM response_cache
       WHERE last_hit_at > NOW() - MAKE_INTERVAL(hours => $1)`
    );
    bySource.cache = {
      hits: toIntOrZero(cacheRow?.hits),
      cost: toFloatOrZero(cacheRow?.cost),
      tokens: toIntOrZero(cacheRow?.tokens),
    };

    // 2) All requests in the window, classified by routing.
    const { rows: requestRows } = await db.query(
      `SELECT model, tokens_in, tokens_out, cost_usd, fallback_used
       FROM request_tracking
       WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1)`,
      [hoursBack]
    );

    for (const row of requestRows) {
      const model = String(row.model ?? '');
      const tokensIn = toIntOrZero(row.tokens_in);
      const tokensOut = toIntOrZero(row.tokens_out);
      const actualCost = toFloatOrZero(row.cost_usd);

      requestCount += 1;
      tokensInTotal += tokensIn;
      tokensOutTotal += tokensOut;
      costWithGateway += actualCost;

      // "Would-be cost" — what this request would have cost at API list
      // prices for the model that handled it.
      const listPrice = lookupApiPrice(model);
      let wouldBeCost = 0;
      if (listPrice) {
        wouldBeCost = costAtRate(tokensIn, tokensOut, listPrice);
      } else if (isLocalModel(model)) {
        // Local model — compare against a paid API as opportunity cost.
        wouldBeCost = costAtRate(tokensIn, tokensOut, API_PRICING['gpt-3.5-turbo']);
      }
      costWithoutGateway += wouldBeCost;

      // Credit the saving to at most one source; subscription patterns are
      // checked before local-model prefixes.
      const bucket = isSubscriptionModel(model)
        ? bySource.subscriptionBridge
        : isLocalModel(model)
          ? bySource.localRouting
          : null;
      if (bucket) {
        bucket.calls += 1;
        bucket.tokens += tokensIn + tokensOut;
        bucket.cost += Math.max(0, wouldBeCost - actualCost);
      }
    }

    // 3) Compression savings — pull from tokenvault_metrics if available.
    try {
      const compRow = await firstRow(
        `SELECT
           COUNT(*)::INT AS calls,
           COALESCE(SUM(GREATEST(tokens_before - tokens_after, 0)), 0)::BIGINT AS tokens_saved
         FROM tokenvault_metrics
         WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1)
           AND tool_used = 'gateway'`
      );
      const tokensCompressed = toIntOrZero(compRow?.tokens_saved);
      bySource.compression = {
        calls: toIntOrZero(compRow?.calls),
        tokens: tokensCompressed,
        cost: (tokensCompressed / 1000) * compressionUsdPer1k,
      };
    } catch (err) {
      logger.debug({ err }, 'savings: compression aggregation skipped (table missing)');
    }

    // 4) Race mode — the cost of the candidates that were not selected is
    //    reported as the saving.
    try {
      const raceRow = await firstRow(
        `SELECT
           COUNT(DISTINCT call_id)::INT AS races,
           COALESCE(SUM(cost_usd) FILTER (WHERE selected = false), 0)::NUMERIC AS not_picked_cost
         FROM race_mode_results
         WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1)`
      );
      bySource.raceMode = {
        calls: toIntOrZero(raceRow?.races),
        cost: toFloatOrZero(raceRow?.not_picked_cost),
        tokens: 0,
      };
    } catch (err) {
      logger.debug({ err }, 'savings: race aggregation skipped (table missing)');
    }

    // 5) MCP tool-call compression — folded into the compression bucket
    //    rather than reported as a separate source.
    try {
      const mcpRow = await firstRow(
        `SELECT COUNT(*)::INT AS calls,
                COALESCE(SUM(tokens_saved), 0)::BIGINT AS tokens_saved
         FROM mcp_tool_calls
         WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1)`
      );
      const mcpTokens = toIntOrZero(mcpRow?.tokens_saved);
      const mcpCalls = toIntOrZero(mcpRow?.calls);
      const mcpCost = (mcpTokens / 1_000_000) * mcpUsdPerMTok;
      bySource.compression.tokens += mcpTokens;
      bySource.compression.cost += mcpCost;
      bySource.compression.calls += mcpCalls;
    } catch (err) {
      logger.debug({ err }, 'savings: mcp tool aggregation skipped (table missing)');
    }
  } catch (err) {
    logger.warn({ err }, 'savings-calculator: comprehensive computation failed');
  }

  // Derived outside the try block on purpose: if a query above failed, the
  // totals still agree with whatever per-source figures were collected.
  result.totalCostSaved =
    bySource.cache.cost +
    bySource.compression.cost +
    bySource.subscriptionBridge.cost +
    bySource.localRouting.cost +
    bySource.raceMode.cost;

  // Only cache and compression remove tokens; the other sources change price.
  result.totalTokensSaved = bySource.cache.tokens + bySource.compression.tokens;

  result.costWithoutGateway = costWithoutGateway;
  result.costWithGateway = costWithGateway;
  result.totals = {
    requests: requestCount,
    tokensIn: tokensInTotal,
    tokensOut: tokensOutTotal,
  };

  return result;
}