llm-gateway/packages/gateway/src/modules/race-leaderboard.ts
2026-05-03 09:53:40 +02:00

112 lines
4.3 KiB
TypeScript

/**
* Race Mode Leaderboard
*
* Aggregates `race_mode_results` into a per-model leaderboard over a
* rolling window (one week by default): which model finished first most
* often, which had the highest average confidence, and which was fastest
* on average. Used by the dashboard's leaderboard tab and, in the future,
* by the router to bias against perpetually losing models.
*/
import type { Pool } from 'pg';
import { logger } from '../observability/logger.js';
/**
* One model's aggregated race statistics for the requested window, as
* produced by `getRaceLeaderboard`.
*/
export interface LeaderboardEntry {
/** Candidate model identifier (`candidate_model` column). */
model: string;
/** Number of `race_mode_results` rows for this model in the window. */
participations: number;
/** Rows where the model's `selected` flag was true. */
selectedCount: number;
/** Rows where the model's `finished_first` flag was true. */
firstFinishedCount: number;
/** Win rate = selectedCount / participations, rounded to 3 decimals (0 when there are no participations). */
winRate: number;
/** Speed rate = firstFinishedCount / participations, rounded to 3 decimals (0 when there are no participations). */
speedRate: number;
/** Mean of `latency_ms`; 0 when the aggregate had no latency values. */
avgLatencyMs: number;
/** Mean of `confidence`, or null when the aggregate had no confidence values. */
avgConfidence: number | null;
/** Sum of `cost_usd` for this model; 0 when there is none. */
totalCost: number;
/**
* Composite score used to order the leaderboard:
* speedRate * 0.6 + confidenceScore * 0.4, rounded to 3 decimals, where
* confidenceScore is avgConfidence / 10, or 0.5 when avgConfidence is null.
*/
rank: number;
/** 1-based position after sorting entries by `rank` descending. */
rankPosition: number;
/** 'gold' / 'silver' / 'bronze' for positions 1 to 3; null for everyone else. */
badge: 'gold' | 'silver' | 'bronze' | null;
}
/** Window (in days) used when the caller passes a non-finite `daysBack`. */
const DEFAULT_LEADERBOARD_DAYS = 7;

/** Badges indexed by zero-based leaderboard position. */
const PODIUM_BADGES = ['gold', 'silver', 'bronze'] as const;

/**
 * One row of the per-model aggregate query. node-postgres may return
 * NUMERIC columns as strings, so every numeric column is parsed defensively.
 */
type RaceAggregateRow = {
  model: string;
  participations: number | string | null;
  selected_count: number | string | null;
  first_finished_count: number | string | null;
  avg_latency: number | string | null;
  avg_confidence: number | string | null;
  total_cost: number | string | null;
};

/** Parses an integer column value; anything unparsable becomes 0. */
function toCount(value: unknown): number {
  const parsed = typeof value === 'number' ? Math.trunc(value) : parseInt(String(value), 10);
  return Number.isFinite(parsed) ? parsed : 0;
}

/** Parses a decimal column value; anything unparsable becomes 0. */
function toFiniteNumber(value: unknown): number {
  const parsed = typeof value === 'number' ? value : parseFloat(String(value));
  return Number.isFinite(parsed) ? parsed : 0;
}

/** Parses a nullable decimal column; NULL, empty or unparsable values become null. */
function toNullableNumber(value: unknown): number | null {
  if (value === null || value === undefined || value === '') return null;
  const parsed = typeof value === 'number' ? value : parseFloat(String(value));
  return Number.isFinite(parsed) ? parsed : null;
}

/** Rounds to three decimal places. */
function roundTo3(value: number): number {
  return Number(value.toFixed(3));
}

/** Converts one aggregate row into an entry; position and badge are assigned later. */
function toLeaderboardEntry(row: RaceAggregateRow): LeaderboardEntry {
  const participations = toCount(row.participations);
  const selectedCount = toCount(row.selected_count);
  const firstFinishedCount = toCount(row.first_finished_count);
  const avgConfidence = toNullableNumber(row.avg_confidence);

  const winRate = participations > 0 ? selectedCount / participations : 0;
  const speedRate = participations > 0 ? firstFinishedCount / participations : 0;

  // Composite rank: 60% speed + 40% confidence. A model without any
  // confidence data gets a neutral 0.5 confidence score (weights stay 60/40).
  // NOTE(review): the division by 10 assumes confidence is stored on a
  // 0-10 scale — confirm against the writer of race_mode_results.
  const confidenceScore = avgConfidence !== null ? avgConfidence / 10 : 0.5;
  const rank = speedRate * 0.6 + confidenceScore * 0.4;

  return {
    model: row.model,
    participations,
    selectedCount,
    firstFinishedCount,
    winRate: roundTo3(winRate),
    speedRate: roundTo3(speedRate),
    avgLatencyMs: toFiniteNumber(row.avg_latency),
    avgConfidence,
    totalCost: toFiniteNumber(row.total_cost),
    rank: roundTo3(rank),
    rankPosition: 0,
    badge: null,
  };
}

/**
 * Builds the race-mode leaderboard for the last `daysBack` days.
 *
 * Runs two aggregate queries against `race_mode_results` (per-model stats
 * and the number of distinct races), ranks models by the composite score,
 * assigns podium badges, and picks the fastest and most reliable models.
 *
 * @param db - Postgres pool used for both queries.
 * @param daysBack - Size of the window in days. It is truncated to a whole
 *   number and clamped to be non-negative because the value is bound to
 *   `MAKE_INTERVAL(days => $1)`; a non-finite value falls back to 7.
 * @returns The leaderboard. `daysCovered` is the normalized window that was
 *   actually queried. `fastestThisWeek` ignores models that have no latency
 *   data, and `mostReliable` only considers models with at least two
 *   participations. On any failure a warning is logged and an empty
 *   leaderboard is returned instead of throwing (best-effort for callers).
 */
export async function getRaceLeaderboard(
  db: Pool,
  daysBack: number = 7
): Promise<{
  totalRaces: number;
  daysCovered: number;
  entries: LeaderboardEntry[];
  fastestThisWeek: { model: string; latencyMs: number } | null;
  mostReliable: { model: string; winRate: number } | null;
}> {
  // Normalize up front so a fractional or NaN window does not make the
  // query fail and silently yield an empty leaderboard.
  const windowDays = Number.isFinite(daysBack)
    ? Math.max(0, Math.trunc(daysBack))
    : DEFAULT_LEADERBOARD_DAYS;

  try {
    // The two aggregates are independent, so run them concurrently.
    const [statsResult, totalsResult] = await Promise.all([
      db.query(`
        SELECT candidate_model AS model,
               COUNT(*)::INT AS participations,
               SUM(CASE WHEN selected THEN 1 ELSE 0 END)::INT AS selected_count,
               SUM(CASE WHEN finished_first THEN 1 ELSE 0 END)::INT AS first_finished_count,
               COALESCE(AVG(latency_ms), 0)::NUMERIC(10,1) AS avg_latency,
               AVG(confidence)::NUMERIC(4,2) AS avg_confidence,
               COALESCE(SUM(cost_usd), 0)::NUMERIC AS total_cost
        FROM race_mode_results
        WHERE created_at > NOW() - MAKE_INTERVAL(days => $1)
        GROUP BY candidate_model
        ORDER BY first_finished_count DESC, avg_confidence DESC NULLS LAST
      `, [windowDays]),
      db.query(`
        SELECT COUNT(DISTINCT call_id)::INT AS total_races
        FROM race_mode_results
        WHERE created_at > NOW() - MAKE_INTERVAL(days => $1)
      `, [windowDays]),
    ]);

    const entries: LeaderboardEntry[] = statsResult.rows.map(
      (row: RaceAggregateRow): LeaderboardEntry => toLeaderboardEntry(row)
    );

    // Sort by composite rank (descending). Array.prototype.sort is stable,
    // so ties keep the SQL ordering (first-finished count, then confidence).
    entries.sort((a, b) => b.rank - a.rank);
    entries.forEach((entry, index) => {
      entry.rankPosition = index + 1;
      entry.badge = PODIUM_BADGES[index] ?? null;
    });

    // The query coalesces a missing latency average to 0, so a model with no
    // latency data must not be reported as the fastest one.
    let fastest: LeaderboardEntry | undefined;
    let reliable: LeaderboardEntry | undefined;
    for (const entry of entries) {
      if (
        entry.avgLatencyMs > 0 &&
        (fastest === undefined || entry.avgLatencyMs < fastest.avgLatencyMs)
      ) {
        fastest = entry;
      }
      // Require at least two participations so a single lucky win does not
      // count as "most reliable".
      if (
        entry.participations >= 2 &&
        (reliable === undefined || entry.winRate > reliable.winRate)
      ) {
        reliable = entry;
      }
    }

    return {
      totalRaces: toCount(totalsResult.rows[0]?.total_races),
      daysCovered: windowDays,
      entries,
      fastestThisWeek: fastest ? { model: fastest.model, latencyMs: fastest.avgLatencyMs } : null,
      mostReliable: reliable ? { model: reliable.model, winRate: reliable.winRate } : null,
    };
  } catch (err) {
    // Deliberate best-effort: callers get an empty leaderboard, not an error.
    logger.warn({ err }, 'race-leaderboard: aggregation failed');
    return { totalRaces: 0, daysCovered: windowDays, entries: [], fastestThisWeek: null, mostReliable: null };
  }
}