llm-gateway/packages/gateway/src/modules/savings-calculator.ts
2026-05-03 09:53:40 +02:00

268 lines
10 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Savings Calculator
*
* Comprehensive savings accounting across ALL gateway mechanisms — not just
* cache hits. Lean-CTX measures file-context compression; we measure five
* orthogonal sources of value:
*
* 1. Response cache (exact + semantic match)
* 2. Compression pipeline (verbatim_compact, etc.)
* 3. Subscription-bridge implicit savings (calls via flat-rate Pro plan
* vs. what they would have cost via paid API)
* 4. Local-model routing (calls served by a local model, valued against a
* paid-API reference price)
* 5. Race mode (cost of the race candidates that were not selected)
*
* The dashboard now surfaces all five so the savings counter reflects the
* gateway's true value rather than only cache hits.
*/
import type { Pool } from 'pg';
import { logger } from '../observability/logger.js';
/**
 * Conservative snapshot of API list prices, in USD per 1k tokens
 * (`in` = input/prompt tokens, `out` = output/completion tokens).
 *
 * Used to answer "what would this have cost via direct API".
 * Update as pricing evolves. Key insertion order is observable through
 * `Object.keys`, so do not reorder entries casually.
 */
const API_PRICING = {
  /* ---- Anthropic ---- */
  'claude-opus-4-1': { in: 0.015, out: 0.075 },
  'claude-sonnet-4-1': { in: 0.003, out: 0.015 },
  'claude-haiku-3': { in: 0.00025, out: 0.00125 },

  /* ---- OpenAI ---- */
  'gpt-5.1-codex': { in: 0.005, out: 0.02 },
  'gpt-5.1-codex-mini': { in: 0.0015, out: 0.006 },
  'gpt-4-turbo': { in: 0.01, out: 0.03 },
  'gpt-4': { in: 0.03, out: 0.06 },
  'gpt-3.5-turbo': { in: 0.0005, out: 0.0015 },

  /* ---- Google ---- */
  'gemini-1.5-pro': { in: 0.00125, out: 0.005 },
  'gemini-1.5-flash': { in: 0.000075, out: 0.0003 },
} as const;
/**
 * Name patterns for models that go through a flat-rate subscription
 * bridge, i.e. whose marginal cost is treated as $0.
 *
 * Every pattern is anchored at the start of the model name and is
 * case-insensitive.
 */
const SUBSCRIPTION_MODEL_PATTERNS: readonly RegExp[] = [
  // Claude Code subscription
  /^claude-/i,
  // Codex CLI subscription
  /^gpt-5\.1-codex/i,
  // ChatGPT Plus / Copilot subscription
  /^gpt-(4|3\.5)/i,
  // Gemini Advanced
  /^gemini-/i,
  // GitHub Copilot
  /^github-copilot/i,
  // M365 Copilot (the unescaped `.` accepts any single separator character)
  /^microsoft.365/i,
];
/**
 * Resolve the API list price for a model name.
 *
 * Matching is case-insensitive on the model name. The longest pricing key
 * that is a prefix of the name wins, so an exact key always beats a shorter
 * family prefix and a suffixed id such as `gpt-5.1-codex-mini-2025` resolves
 * to `gpt-5.1-codex-mini` rather than `gpt-5.1-codex`, regardless of the
 * order in which keys appear in the table.
 *
 * Only the table's own entries are considered; inherited object properties
 * (e.g. a model literally named "constructor") never match.
 *
 * @param model   Model name as recorded for the request.
 * @param pricing Price table to search (lower-case keys, USD per 1k tokens).
 *                Defaults to the module-level `API_PRICING` snapshot.
 * @returns The matching `{ in, out }` entry, or `null` when no key matches.
 */
function lookupApiPrice(
  model: string,
  pricing: Readonly<Record<string, { in: number; out: number }>> = API_PRICING
): { in: number; out: number } | null {
  const name = model.toLowerCase();
  let best: { in: number; out: number } | null = null;
  let bestLength = 0;
  for (const [key, price] of Object.entries(pricing)) {
    // Strictly longer only: an exact match is simply the longest possible prefix.
    if (key.length > bestLength && name.startsWith(key)) {
      best = price;
      bestLength = key.length;
    }
  }
  return best;
}
/**
 * Tells whether a model name matches one of the flat-rate subscription
 * patterns in `SUBSCRIPTION_MODEL_PATTERNS`.
 *
 * @param model Model name to classify.
 * @returns `true` as soon as any pattern matches, `false` otherwise.
 */
function isSubscriptionModel(model: string): boolean {
  for (const pattern of SUBSCRIPTION_MODEL_PATTERNS) {
    if (pattern.test(model)) {
      return true;
    }
  }
  return false;
}
/**
 * Tells whether a model name starts with one of the known local model
 * family prefixes (case-insensitive).
 *
 * @param model Model name to classify.
 * @returns `true` when the name begins with a local-family prefix.
 */
function isLocalModel(model: string): boolean {
  const localFamilyPrefix = /^(qwen|llama|mistral|magatama|phi|nomic|gemma)/i;
  return localFamilyPrefix.exec(model) !== null;
}
/**
* Savings report for one look-back window, as produced by
* `getComprehensiveSavings`: headline totals, a per-source breakdown, the
* with/without-gateway cost comparison and the raw request totals.
* All cost figures are in USD.
*/
export interface ComprehensiveSavings {
/** Total saved across all five mechanisms (sum of every `bySource.*.cost`). */
totalCostSaved: number;
/**
* Tokens saved. Only the `cache` and `compression` buckets contribute; the
* `tokens` fields of the other buckets are not part of this figure.
*/
totalTokensSaved: number;
/** Per-source breakdown for the dashboard. */
bySource: {
/** Response-cache savings aggregated from the `response_cache` table. */
cache: { tokens: number; cost: number; hits: number };
/**
* Compression savings: `tokenvault_metrics` rows plus MCP tool-call savings
* from `mcp_tool_calls`, merged into this one bucket.
*/
compression: { tokens: number; cost: number; calls: number };
/**
* Requests whose model name matches a subscription pattern. `tokens` is the
* input + output volume handled; `cost` is the would-be list price minus the
* recorded cost, floored at 0 per request.
*/
subscriptionBridge: { tokens: number; cost: number; calls: number };
/**
* Non-subscription requests whose model name looks like a local model.
* Same `tokens` / `cost` semantics as `subscriptionBridge`.
*/
localRouting: { tokens: number; cost: number; calls: number };
/**
* Race mode: `calls` is the number of distinct races, `cost` the summed cost
* of the candidates that were not selected. `tokens` is always 0.
*/
raceMode: { tokens: number; cost: number; calls: number };
};
/**
* How much you would have paid for the same volume at API list prices.
* Requests whose model has no known list price and is not a local model
* contribute 0.
*/
costWithoutGateway: number;
/** What you actually paid (real $): the sum of the recorded per-request cost. */
costWithGateway: number;
/** Time window: the `hoursBack` argument, echoed back unchanged. */
hoursBack: number;
/** Inputs that gave us this number: request count and token volumes in the window. */
totals: { requests: number; tokensIn: number; tokensOut: number };
}
/**
 * Compute comprehensive savings across all mechanisms.
 *
 * Strategy:
 * For each request in the window, price it both ways:
 *   - "Would-be cost" = API list price for the model that handled it
 *                       (local models are priced as gpt-3.5-turbo)
 *   - "Actual cost"   = the request's recorded cost_usd
 *   - "Saved"         = max(0, would-be - actual)
 * Cache, compression (tokenvault + MCP tool calls) and race-mode figures are
 * aggregated separately from their own tables.
 *
 * Failure handling: the compression, race-mode and MCP sections are
 * optional; if one of their queries fails the section is skipped and logged
 * at debug level. If the cache or request query fails, a warning is logged
 * and whatever was accumulated up to that point is returned (the headline
 * totals stay at zero). Query errors are not propagated to the caller.
 *
 * @param db        Pool the aggregation queries run on.
 * @param hoursBack Look-back window in hours (default 24). Fractional values
 *                  are supported. Non-finite or non-positive values produce
 *                  an all-zero report without querying the database.
 * @returns The savings report for the window.
 */
export async function getComprehensiveSavings(
  db: Pool,
  hoursBack: number = 24
): Promise<ComprehensiveSavings> {
  const result: ComprehensiveSavings = {
    totalCostSaved: 0,
    totalTokensSaved: 0,
    bySource: {
      cache: { tokens: 0, cost: 0, hits: 0 },
      compression: { tokens: 0, cost: 0, calls: 0 },
      subscriptionBridge: { tokens: 0, cost: 0, calls: 0 },
      localRouting: { tokens: 0, cost: 0, calls: 0 },
      raceMode: { tokens: 0, cost: 0, calls: 0 },
    },
    costWithoutGateway: 0,
    costWithGateway: 0,
    hoursBack,
    totals: { requests: 0, tokensIn: 0, tokensOut: 0 },
  };

  // An empty or nonsensical window can never match a row; skip the round-trips.
  if (!Number.isFinite(hoursBack) || hoursBack <= 0) {
    logger.warn({ hoursBack }, 'savings-calculator: invalid hoursBack, returning empty savings');
    return result;
  }

  // The window is sent as seconds: MAKE_INTERVAL's `hours` argument is an
  // integer (a fractional hoursBack would make every query fail), whereas
  // `secs` is double precision.
  const windowSecs = hoursBack * 3600;

  try {
    // 1) Cache hits
    const cacheRow = await db.query(
      `SELECT
         COALESCE(SUM(hit_count), 0)::INT AS hits,
         COALESCE(SUM(cost_saved), 0)::NUMERIC AS cost,
         COALESCE(SUM(tokens_saved), 0)::BIGINT AS tokens
       FROM response_cache
       WHERE last_hit_at > NOW() - MAKE_INTERVAL(secs => $1)`,
      [windowSecs]
    );
    result.bySource.cache = {
      hits: parseInt(cacheRow.rows[0]?.hits ?? '0', 10),
      cost: parseFloat(cacheRow.rows[0]?.cost ?? '0'),
      tokens: parseInt(cacheRow.rows[0]?.tokens ?? '0', 10),
    };

    // 2-4) All requests in the window, classified by routing.
    // NOTE(review): fallback_used is selected but not read below.
    const reqRows = await db.query(
      `SELECT model, tokens_in, tokens_out, cost_usd, fallback_used
       FROM request_tracking
       WHERE created_at > NOW() - MAKE_INTERVAL(secs => $1)`,
      [windowSecs]
    );
    let totalReq = 0, totalIn = 0, totalOut = 0;
    let withGateway = 0, withoutGateway = 0;
    for (const r of reqRows.rows) {
      const model = String(r.model ?? '');
      const tokensIn = parseInt(r.tokens_in, 10) || 0;
      const tokensOut = parseInt(r.tokens_out, 10) || 0;
      const actualCost = parseFloat(r.cost_usd) || 0;
      totalReq += 1;
      totalIn += tokensIn;
      totalOut += tokensOut;
      withGateway += actualCost;

      // Determine "would-be cost": what this request would have cost at API
      // list prices for the model that handled it (or its closest paid sibling).
      const apiPrice = lookupApiPrice(model);
      let wouldBeCost = 0;
      if (apiPrice) {
        wouldBeCost = (tokensIn / 1000) * apiPrice.in + (tokensOut / 1000) * apiPrice.out;
      } else if (isLocalModel(model)) {
        // Local model: compare against a paid API reference as opportunity cost.
        const ref = API_PRICING['gpt-3.5-turbo'];
        wouldBeCost = (tokensIn / 1000) * ref.in + (tokensOut / 1000) * ref.out;
      }
      withoutGateway += wouldBeCost;

      // Bucket the savings into a source. Subscription takes precedence over
      // local; requests matching neither are counted in the totals only.
      if (isSubscriptionModel(model)) {
        result.bySource.subscriptionBridge.calls += 1;
        result.bySource.subscriptionBridge.tokens += tokensIn + tokensOut;
        result.bySource.subscriptionBridge.cost += Math.max(0, wouldBeCost - actualCost);
      } else if (isLocalModel(model)) {
        result.bySource.localRouting.calls += 1;
        result.bySource.localRouting.tokens += tokensIn + tokensOut;
        result.bySource.localRouting.cost += Math.max(0, wouldBeCost - actualCost);
      }
    }

    // 5) Compression savings: pull from tokenvault_metrics if available
    try {
      const compRow = await db.query(
        `SELECT
           COUNT(*)::INT AS calls,
           COALESCE(SUM(GREATEST(tokens_before - tokens_after, 0)), 0)::BIGINT AS tokens_saved
         FROM tokenvault_metrics
         WHERE created_at > NOW() - MAKE_INTERVAL(secs => $1)
           AND tool_used = 'gateway'`,
        [windowSecs]
      );
      const tokensCompressed = parseInt(compRow.rows[0]?.tokens_saved ?? '0', 10);
      // Conservative pricing: assume average input pricing of $0.001/1k tokens
      const compCost = (tokensCompressed / 1000) * 0.001;
      result.bySource.compression = {
        calls: parseInt(compRow.rows[0]?.calls ?? '0', 10),
        tokens: tokensCompressed,
        cost: compCost,
      };
    } catch (err) {
      logger.debug({ err }, 'savings: compression aggregation skipped (table missing)');
    }

    // 6) Race mode: picked the faster/cheaper candidate, "saved" the loser cost
    try {
      const raceRow = await db.query(
        `SELECT
           COUNT(DISTINCT call_id)::INT AS races,
           COALESCE(SUM(cost_usd) FILTER (WHERE selected = false), 0)::NUMERIC AS not_picked_cost
         FROM race_mode_results
         WHERE created_at > NOW() - MAKE_INTERVAL(secs => $1)`,
        [windowSecs]
      );
      result.bySource.raceMode = {
        calls: parseInt(raceRow.rows[0]?.races ?? '0', 10),
        cost: parseFloat(raceRow.rows[0]?.not_picked_cost ?? '0'),
        tokens: 0,
      };
    } catch (err) {
      logger.debug({ err }, 'savings: race aggregation skipped (table missing)');
    }

    // 7) MCP tool-call compression: drop-in Lean-CTX replacement
    try {
      const mcpRow = await db.query(
        `SELECT COUNT(*)::INT AS calls,
                COALESCE(SUM(tokens_saved), 0)::BIGINT AS tokens_saved
         FROM mcp_tool_calls
         WHERE created_at > NOW() - MAKE_INTERVAL(secs => $1)`,
        [windowSecs]
      );
      const mcpTokens = parseInt(mcpRow.rows[0]?.tokens_saved ?? '0', 10);
      const mcpCalls = parseInt(mcpRow.rows[0]?.calls ?? '0', 10);
      // Tool-call savings cost-equivalence: Sonnet-equivalent pricing
      // ($3/MTok input, $15/MTok output, weighted 60/40 in/out for tool returns).
      // 3.0 * 0.6 + 15.0 * 0.4 = $7.80 per MTok, i.e. $0.0078 per 1k tokens.
      // NOTE(review): confirm this blended rate is the intended dashboard scale.
      const mcpCost = (mcpTokens / 1_000_000) * (3.0 * 0.6 + 15.0 * 0.4);
      // Folded into the compression bucket (added on top of step 5's figures)
      result.bySource.compression.tokens += mcpTokens;
      result.bySource.compression.cost += mcpCost;
      result.bySource.compression.calls += mcpCalls;
    } catch (err) {
      logger.debug({ err }, 'savings: mcp tool aggregation skipped (table missing)');
    }

    result.totalCostSaved =
      result.bySource.cache.cost +
      result.bySource.compression.cost +
      result.bySource.subscriptionBridge.cost +
      result.bySource.localRouting.cost +
      result.bySource.raceMode.cost;
    // Only cache and compression represent tokens that were actually avoided.
    result.totalTokensSaved =
      result.bySource.cache.tokens +
      result.bySource.compression.tokens;
    result.costWithoutGateway = withoutGateway;
    result.costWithGateway = withGateway;
    result.totals = { requests: totalReq, tokensIn: totalIn, tokensOut: totalOut };
  } catch (err) {
    logger.warn({ err }, 'savings-calculator: comprehensive computation failed');
  }
  return result;
}