268 lines
10 KiB
TypeScript
268 lines
10 KiB
TypeScript
/**
 * Savings Calculator
 *
 * Comprehensive savings accounting across ALL gateway mechanisms — not just
 * cache hits. Lean-CTX measures file-context compression; we measure five
 * orthogonal sources of value:
 *
 *   1. Response cache (exact + semantic match)
 *   2. Compression pipeline (verbatim_compact, etc.)
 *   3. Subscription-bridge implicit savings (calls via flat-rate Pro plan
 *      vs. what they would have cost via paid API)
 *   4. Model-tier routing (cheaper model used when sufficient)
 *   5. Pool routing (avoided quota-out on a sub by switching to alternate)
 *
 * The dashboard now surfaces all five so the savings counter reflects the
 * gateway's true value rather than only cache hits.
 */
|
||
import type { Pool } from 'pg';
|
||
import { logger } from '../observability/logger.js';
|
||
|
||
// Conservative API pricing snapshot (USD per 1k tokens). Used to compute
|
||
// "what would this have cost via direct API". Update as pricing evolves.
|
||
const API_PRICING = {
|
||
// Anthropic
|
||
'claude-opus-4-1': { in: 0.015, out: 0.075 },
|
||
'claude-sonnet-4-1': { in: 0.003, out: 0.015 },
|
||
'claude-haiku-3': { in: 0.00025, out: 0.00125 },
|
||
// OpenAI
|
||
'gpt-5.1-codex': { in: 0.005, out: 0.020 },
|
||
'gpt-5.1-codex-mini': { in: 0.0015, out: 0.006 },
|
||
'gpt-4-turbo': { in: 0.010, out: 0.030 },
|
||
'gpt-4': { in: 0.030, out: 0.060 },
|
||
'gpt-3.5-turbo': { in: 0.0005, out: 0.0015 },
|
||
// Google
|
||
'gemini-1.5-pro': { in: 0.00125, out: 0.005 },
|
||
'gemini-1.5-flash': { in: 0.000075, out: 0.0003 },
|
||
} as const;
|
||
|
||
/** Models that go through a flat-rate subscription bridge → marginal cost = $0 */
|
||
const SUBSCRIPTION_MODEL_PATTERNS = [
|
||
/^claude-/i, // Claude Code subscription
|
||
/^gpt-5\.1-codex/i, // Codex CLI subscription
|
||
/^gpt-(4|3\.5)/i, // ChatGPT Plus / Copilot subscription
|
||
/^gemini-/i, // Gemini Advanced
|
||
/^github-copilot/i, // GitHub Copilot
|
||
/^microsoft.365/i, // M365 Copilot
|
||
];
|
||
|
||
/**
 * Resolve the API list price for a model name.
 *
 * Resolution order:
 *   1. Exact (case-insensitive) match against the `API_PRICING` keys.
 *   2. Longest key that is a prefix of the name, so a versioned name such as
 *      `claude-sonnet-4-1-20250101` resolves to `claude-sonnet-4-1`, and
 *      `gpt-5.1-codex-mini-x` resolves to `gpt-5.1-codex-mini` rather than
 *      the shorter `gpt-5.1-codex`.
 *
 * @param model Model identifier as recorded on the request.
 * @returns The `{ in, out }` price in USD per 1k tokens, or `null` when no
 *          pricing entry applies.
 */
function lookupApiPrice(model: string): { in: number; out: number } | null {
  // Widen the literal-keyed table to a string-indexed view so it can be probed
  // with an arbitrary model name without resorting to `any`.
  const table: Readonly<Record<string, { in: number; out: number }>> = API_PRICING;
  const wanted = model.toLowerCase();

  // Own-property check: the `in` operator would also accept inherited names
  // such as "constructor" or "__proto__" and hand back a non-price value.
  if (Object.prototype.hasOwnProperty.call(table, wanted)) {
    return table[wanted] ?? null;
  }

  // Prefix fallback. Pick the most specific (longest) key instead of the first
  // one in insertion order, otherwise a short key shadows a longer sibling.
  let bestKey: string | null = null;
  for (const key of Object.keys(table)) {
    if (wanted.startsWith(key) && (bestKey === null || key.length > bestKey.length)) {
      bestKey = key;
    }
  }
  return bestKey === null ? null : (table[bestKey] ?? null);
}
|
||
|
||
/**
 * Report whether a model name matches any of the subscription-bridge
 * patterns in `SUBSCRIPTION_MODEL_PATTERNS`.
 *
 * @param model Model identifier as recorded on the request.
 * @returns `true` as soon as one pattern matches, `false` if none do.
 */
function isSubscriptionModel(model: string): boolean {
  for (const pattern of SUBSCRIPTION_MODEL_PATTERNS) {
    if (pattern.test(model)) {
      return true;
    }
  }
  return false;
}
|
||
|
||
/**
 * Report whether a model name starts (case-insensitively) with one of the
 * model-family prefixes this file classifies as local: qwen, llama, mistral,
 * magatama, phi, nomic or gemma.
 *
 * @param model Model identifier as recorded on the request.
 * @returns `true` when the name begins with one of those prefixes.
 */
function isLocalModel(model: string): boolean {
  const localFamilyPrefix = /^(qwen|llama|mistral|magatama|phi|nomic|gemma)/i;
  return localFamilyPrefix.exec(model) !== null;
}
|
||
|
||
/**
 * Result of a savings computation over one look-back window.
 *
 * All `cost` fields are USD amounts; all `tokens` fields are token counts.
 */
export interface ComprehensiveSavings {
  /** Sum of the `cost` field of every bucket in `bySource`. */
  totalCostSaved: number;

  /** Tokens saved; the aggregation sums the cache and compression buckets. */
  totalTokensSaved: number;

  /** Per-source breakdown for the dashboard. */
  bySource: {
    /** Response-cache savings; `hits` is the summed hit count. */
    cache: { tokens: number; cost: number; hits: number };
    /** Compression savings (the aggregation also folds MCP tool-call savings in here). */
    compression: { tokens: number; cost: number; calls: number };
    /** Requests whose model matched a subscription-bridge pattern. */
    subscriptionBridge: { tokens: number; cost: number; calls: number };
    /** Requests handled by a model classified as local. */
    localRouting: { tokens: number; cost: number; calls: number };
    /** Race-mode results; the aggregation leaves `tokens` at 0 for this bucket. */
    raceMode: { tokens: number; cost: number; calls: number };
  };

  /** What the same request volume would have cost at API list prices. */
  costWithoutGateway: number;

  /** What was actually paid (real $). */
  costWithGateway: number;

  /** Length of the look-back window, in hours. */
  hoursBack: number;

  /** Raw request volume the figures above were derived from. */
  totals: { requests: number; tokensIn: number; tokensOut: number };
}
|
||
|
||
/** Window (hours) substituted when the caller passes a non-finite `hoursBack`. */
const FALLBACK_HOURS_BACK = 24;

/** Flat price (USD per 1k tokens) applied to tokens removed by compression. */
const COMPRESSION_USD_PER_1K_TOKENS = 0.001;

/**
 * Blended price (USD per 1M tokens) applied to MCP tool-call token savings:
 * $3/MTok input and $15/MTok output, weighted 60/40 → $7.80/MTok, i.e.
 * about $0.0078 per 1k tokens.
 *
 * NOTE(review): the previous comment quoted "~$0.0046 per 1k tokens", which
 * does not match this formula. The formula is kept unchanged; confirm which
 * figure is intended.
 */
const MCP_BLENDED_USD_PER_MTOK = 3.0 * 0.6 + 15.0 * 0.4;

/** Shape of the per-source buckets that carry a `calls` counter. */
type CallBucket = ComprehensiveSavings['bySource']['compression'];

/** Columns read from each `request_tracking` row (driver values are untyped). */
interface RequestRow {
  model: unknown;
  tokens_in: unknown;
  tokens_out: unknown;
  cost_usd: unknown;
}

/** Build a result object with every counter at zero for the given window. */
function zeroedSavings(hoursBack: number): ComprehensiveSavings {
  return {
    totalCostSaved: 0,
    totalTokensSaved: 0,
    bySource: {
      cache: { tokens: 0, cost: 0, hits: 0 },
      compression: { tokens: 0, cost: 0, calls: 0 },
      subscriptionBridge: { tokens: 0, cost: 0, calls: 0 },
      localRouting: { tokens: 0, cost: 0, calls: 0 },
      raceMode: { tokens: 0, cost: 0, calls: 0 },
    },
    costWithoutGateway: 0,
    costWithGateway: 0,
    hoursBack,
    totals: { requests: 0, tokensIn: 0, tokensOut: 0 },
  };
}

/**
 * Turn the caller's window into a value the `MAKE_INTERVAL(hours => $1)`
 * queries accept: non-finite input falls back to the default window, fractions
 * are rounded up to whole hours, and negative values are clamped to 0.
 */
function normalizeHoursBack(hoursBack: number): number {
  if (!Number.isFinite(hoursBack)) return FALLBACK_HOURS_BACK;
  return Math.max(0, Math.ceil(hoursBack));
}

/** Parse a driver value as a base-10 integer; missing or unparsable → 0. */
function toInt(value: unknown): number {
  return parseInt(String(value ?? '0'), 10) || 0;
}

/** Parse a driver value as a float; missing or unparsable → 0. */
function toFloat(value: unknown): number {
  return parseFloat(String(value ?? '0')) || 0;
}

/** Price a token volume at a per-1k-token `{ in, out }` rate. */
function listPriceCost(
  tokensIn: number,
  tokensOut: number,
  price: { in: number; out: number }
): number {
  return (tokensIn / 1000) * price.in + (tokensOut / 1000) * price.out;
}

/** Run a best-effort loader; on failure log at debug level and use the fallback. */
async function bestEffort<T>(
  skipMessage: string,
  fallback: T,
  load: () => Promise<T>
): Promise<T> {
  try {
    return await load();
  } catch (err) {
    logger.debug({ err }, skipMessage);
    return fallback;
  }
}

/** Sum hit counts and recorded savings from `response_cache` for the window. */
async function loadCacheSavings(
  db: Pool,
  hours: number
): Promise<ComprehensiveSavings['bySource']['cache']> {
  const { rows } = await db.query(
    `SELECT
       COALESCE(SUM(hit_count), 0)::INT AS hits,
       COALESCE(SUM(cost_saved), 0)::NUMERIC AS cost,
       COALESCE(SUM(tokens_saved), 0)::BIGINT AS tokens
     FROM response_cache
     WHERE last_hit_at > NOW() - MAKE_INTERVAL(hours => $1)`,
    [hours]
  );
  const row = rows[0];
  return { hits: toInt(row?.hits), cost: toFloat(row?.cost), tokens: toInt(row?.tokens) };
}

/**
 * Fold the window's requests into `savings`: request/token totals, actual vs.
 * list-price cost, and the subscription-bridge / local-routing buckets.
 */
function accumulateRequests(rows: readonly RequestRow[], savings: ComprehensiveSavings): void {
  const { subscriptionBridge, localRouting } = savings.bySource;

  for (const row of rows) {
    const model = String(row.model ?? '');
    const tokensIn = toInt(row.tokens_in);
    const tokensOut = toInt(row.tokens_out);
    const actualCost = toFloat(row.cost_usd);

    savings.totals.requests += 1;
    savings.totals.tokensIn += tokensIn;
    savings.totals.tokensOut += tokensOut;
    savings.costWithGateway += actualCost;

    // "Would-be cost": list price of the model that handled the request. A
    // local model has no list price, so gpt-3.5-turbo pricing stands in as
    // the opportunity cost. Anything else is priced at 0.
    const apiPrice = lookupApiPrice(model);
    let wouldBeCost = 0;
    if (apiPrice) {
      wouldBeCost = listPriceCost(tokensIn, tokensOut, apiPrice);
    } else if (isLocalModel(model)) {
      wouldBeCost = listPriceCost(tokensIn, tokensOut, API_PRICING['gpt-3.5-turbo']);
    }
    savings.costWithoutGateway += wouldBeCost;

    // Subscription patterns are checked first; a request matching neither
    // classification only contributes to the totals above.
    const bucket = isSubscriptionModel(model)
      ? subscriptionBridge
      : isLocalModel(model)
        ? localRouting
        : null;
    if (bucket) {
      bucket.calls += 1;
      bucket.tokens += tokensIn + tokensOut;
      // Never report a negative saving when the actual cost exceeds list price.
      bucket.cost += Math.max(0, wouldBeCost - actualCost);
    }
  }
}

/** Tokens removed by the compression pipeline, read from `tokenvault_metrics`. */
async function loadCompressionSavings(db: Pool, hours: number): Promise<CallBucket> {
  const { rows } = await db.query(
    `SELECT
       COUNT(*)::INT AS calls,
       COALESCE(SUM(GREATEST(tokens_before - tokens_after, 0)), 0)::BIGINT AS tokens_saved
     FROM tokenvault_metrics
     WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1)
       AND tool_used = 'gateway'`,
    [hours]
  );
  const tokens = toInt(rows[0]?.tokens_saved);
  return {
    calls: toInt(rows[0]?.calls),
    tokens,
    cost: (tokens / 1000) * COMPRESSION_USD_PER_1K_TOKENS,
  };
}

/**
 * Race-mode figures from `race_mode_results`: number of distinct races and
 * the summed `cost_usd` of the candidates that were not selected.
 */
async function loadRaceSavings(db: Pool, hours: number): Promise<CallBucket> {
  const { rows } = await db.query(
    `SELECT
       COUNT(DISTINCT call_id)::INT AS races,
       COALESCE(SUM(cost_usd) FILTER (WHERE selected = false), 0)::NUMERIC AS not_picked_cost
     FROM race_mode_results
     WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1)`,
    [hours]
  );
  return {
    calls: toInt(rows[0]?.races),
    cost: toFloat(rows[0]?.not_picked_cost),
    tokens: 0,
  };
}

/** MCP tool-call token savings from `mcp_tool_calls`, priced at the blended rate. */
async function loadMcpSavings(db: Pool, hours: number): Promise<CallBucket> {
  const { rows } = await db.query(
    `SELECT COUNT(*)::INT AS calls,
            COALESCE(SUM(tokens_saved), 0)::BIGINT AS tokens_saved
     FROM mcp_tool_calls
     WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1)`,
    [hours]
  );
  const tokens = toInt(rows[0]?.tokens_saved);
  return {
    calls: toInt(rows[0]?.calls),
    tokens,
    cost: (tokens / 1_000_000) * MCP_BLENDED_USD_PER_MTOK,
  };
}

/**
 * Compute comprehensive savings across all mechanisms.
 *
 * Strategy:
 *   For each request, determine where it went and price it both ways:
 *     - "Would-be cost" = API list price for the model that handled it
 *     - "Actual cost"   = the `cost_usd` recorded on the request
 *     - "Saved"         = would-be − actual (never below 0)
 *   Cache, compression, race-mode and MCP tool-call savings are read from
 *   their own tables and added on top.
 *
 * The compression, race-mode and MCP queries are best-effort: if one fails
 * (e.g. the table does not exist) it is logged at debug level and its
 * contribution stays at zero. If the cache or request query fails, the error
 * is logged as a warning and an all-zero result is returned, so the totals
 * never disagree with the per-source breakdown.
 *
 * @param db        PostgreSQL pool used for all aggregation queries.
 * @param hoursBack Look-back window in hours (default 24). Fractions are
 *                  rounded up to whole hours, negatives are treated as 0 and
 *                  non-finite values fall back to 24; the effective window is
 *                  reported in the result's `hoursBack`.
 * @returns The savings breakdown for the window. This function does not
 *          reject on query errors.
 */
export async function getComprehensiveSavings(
  db: Pool,
  hoursBack: number = 24
): Promise<ComprehensiveSavings> {
  const windowHours = normalizeHoursBack(hoursBack);

  try {
    const savings = zeroedSavings(windowHours);
    const { bySource } = savings;

    // Response cache.
    bySource.cache = await loadCacheSavings(db, windowHours);

    // All requests in the window, classified by routing.
    const requests = await db.query(
      `SELECT model, tokens_in, tokens_out, cost_usd, fallback_used
       FROM request_tracking
       WHERE created_at > NOW() - MAKE_INTERVAL(hours => $1)`,
      [windowHours]
    );
    accumulateRequests(requests.rows, savings);

    // Compression pipeline (best-effort).
    bySource.compression = await bestEffort(
      'savings: compression aggregation skipped (table missing)',
      bySource.compression,
      () => loadCompressionSavings(db, windowHours)
    );

    // Race mode (best-effort).
    bySource.raceMode = await bestEffort(
      'savings: race aggregation skipped (table missing)',
      bySource.raceMode,
      () => loadRaceSavings(db, windowHours)
    );

    // MCP tool-call compression (best-effort); reported inside the
    // compression bucket rather than as a bucket of its own.
    const mcp = await bestEffort<CallBucket | null>(
      'savings: mcp tool aggregation skipped (table missing)',
      null,
      () => loadMcpSavings(db, windowHours)
    );
    if (mcp) {
      bySource.compression.tokens += mcp.tokens;
      bySource.compression.cost += mcp.cost;
      bySource.compression.calls += mcp.calls;
    }

    savings.totalCostSaved =
      bySource.cache.cost +
      bySource.compression.cost +
      bySource.subscriptionBridge.cost +
      bySource.localRouting.cost +
      bySource.raceMode.cost;

    savings.totalTokensSaved = bySource.cache.tokens + bySource.compression.tokens;

    return savings;
  } catch (err) {
    logger.warn({ err }, 'savings-calculator: comprehensive computation failed');
    // Discard any partially filled buckets rather than return figures whose
    // totals were never computed.
    return zeroedSavings(windowHours);
  }
}
|