// llm-gateway/packages/gateway/src/modules/race-leaderboard.ts

/**
 * Race Mode Leaderboard
 *
 * Aggregates `race_mode_results` to produce a weekly model leaderboard:
 * who finished first most often, who had highest confidence, who was
 * fastest on average. Used by the dashboard for the leaderboard tab and
 * by the router (future) to bias against perpetually losing models.
 */
import type { Pool } from 'pg';
import { logger } from '../observability/logger.js';

/**
 * One model's aggregated standing on the race-mode leaderboard, as produced
 * by `getRaceLeaderboard` from rows of `race_mode_results`.
 */
export interface LeaderboardEntry {
  /** Model identifier, taken from the `candidate_model` column. */
  model: string;
  /** Number of result rows recorded for this model within the window. */
  participations: number;
  /** Number of those rows flagged `selected`. */
  selectedCount: number;
  /** Number of those rows flagged `finished_first`. */
  firstFinishedCount: number;
  /** Win rate = selectedCount / participations, rounded to 3 decimals. */
  winRate: number;
  /** Speed rate = firstFinishedCount / participations, rounded to 3 decimals. */
  speedRate: number;
  /** Average of `latency_ms` over the window; 0 when no latency was recorded. */
  avgLatencyMs: number;
  /**
   * Average of `confidence` over the window, or null when none was recorded.
   * NOTE(review): scoring divides this by 10, so a 0-10 scale is presumably
   * expected — confirm against the writer of `race_mode_results`.
   */
  avgConfidence: number | null;
  /** Sum of `cost_usd` over the window. */
  totalCost: number;
  /**
   * Composite score used for ordering: 0.6 * speedRate + 0.4 * (avgConfidence / 10),
   * with the confidence term replaced by 0.5 when avgConfidence is null.
   * Higher is better; rounded to 3 decimals.
   */
  rank: number;
  /** 1-based position after sorting entries by `rank` descending. */
  rankPosition: number;
  /** Medal for positions 1-3 ('gold', 'silver', 'bronze'); null otherwise. */
  badge: 'gold' | 'silver' | 'bronze' | null;
}

/**
 * Shape of one per-model aggregate row. The driver may hand back numeric
 * columns either as numbers or as numeric strings, so both are accepted.
 */
interface RaceAggregateRow {
  model: string;
  participations: number | string;
  selected_count: number | string;
  first_finished_count: number | string;
  avg_latency: number | string | null;
  avg_confidence: number | string | null;
  total_cost: number | string | null;
}

/** Parses a driver value as a base-10 integer, falling back to 0 when unparsable. */
function parseCountOrZero(value: unknown): number {
  return Number.parseInt(String(value), 10) || 0;
}

/** Parses a driver value as a float, falling back to 0 when unparsable. */
function parseFloatOrZero(value: unknown): number {
  return Number.parseFloat(String(value)) || 0;
}

/** Converts one aggregate row into an unranked leaderboard entry (position/badge unset). */
function toLeaderboardEntry(row: RaceAggregateRow): LeaderboardEntry {
  const participations = parseCountOrZero(row.participations);
  const selectedCount = parseCountOrZero(row.selected_count);
  const firstFinished = parseCountOrZero(row.first_finished_count);

  // Explicit null check (not truthiness) so a genuine average of 0 is kept
  // as 0 instead of being treated as "no confidence data".
  const parsedConfidence =
    row.avg_confidence == null ? Number.NaN : Number.parseFloat(String(row.avg_confidence));
  const avgConfidence = Number.isFinite(parsedConfidence) ? parsedConfidence : null;

  const winRate = participations > 0 ? selectedCount / participations : 0;
  const speedRate = participations > 0 ? firstFinished / participations : 0;

  // Composite score: 60% speed rate + 40% confidence score. The confidence
  // score is avgConfidence / 10; when no confidence was recorded a neutral
  // 0.5 is substituted (the 60/40 weights themselves do not change).
  const confScore = avgConfidence !== null ? avgConfidence / 10 : 0.5;
  const rank = speedRate * 0.6 + confScore * 0.4;

  return {
    model: row.model,
    participations,
    selectedCount,
    firstFinishedCount: firstFinished,
    winRate: parseFloat(winRate.toFixed(3)),
    speedRate: parseFloat(speedRate.toFixed(3)),
    avgLatencyMs: parseFloatOrZero(row.avg_latency),
    avgConfidence,
    totalCost: parseFloatOrZero(row.total_cost),
    rank: parseFloat(rank.toFixed(3)),
    rankPosition: 0,
    badge: null,
  };
}

/**
 * Builds the race-mode leaderboard for the trailing `daysBack` days.
 *
 * Runs two aggregations over `race_mode_results` (per-model stats and the
 * count of distinct `call_id`s), scores every model, sorts by composite
 * score descending and assigns positions plus gold/silver/bronze badges to
 * the top three.
 *
 * @param db - Postgres pool used to run the aggregation queries.
 * @param daysBack - Size of the trailing window in days (default 7).
 * @returns Leaderboard entries ordered best-first, the number of distinct
 *   races in the window, and two highlights: the model with the lowest
 *   recorded average latency, and the model with the highest win rate among
 *   those with at least 2 participations. Never rejects: on any failure a
 *   warning is logged and an empty leaderboard is returned.
 */
export async function getRaceLeaderboard(
  db: Pool,
  daysBack: number = 7
): Promise<{
  totalRaces: number;
  daysCovered: number;
  entries: LeaderboardEntry[];
  fastestThisWeek: { model: string; latencyMs: number } | null;
  mostReliable: { model: string; winRate: number } | null;
}> {
  try {
    // The two aggregations are independent, so issue them concurrently.
    const [perModel, totals] = await Promise.all([
      db.query(`
      SELECT candidate_model AS model,
             COUNT(*)::INT AS participations,
             SUM(CASE WHEN selected THEN 1 ELSE 0 END)::INT AS selected_count,
             SUM(CASE WHEN finished_first THEN 1 ELSE 0 END)::INT AS first_finished_count,
             COALESCE(AVG(latency_ms), 0)::NUMERIC(10,1) AS avg_latency,
             AVG(confidence)::NUMERIC(4,2) AS avg_confidence,
             COALESCE(SUM(cost_usd), 0)::NUMERIC AS total_cost
      FROM race_mode_results
      WHERE created_at > NOW() - MAKE_INTERVAL(days => $1)
      GROUP BY candidate_model
      ORDER BY first_finished_count DESC, avg_confidence DESC NULLS LAST
    `, [daysBack]),
      db.query(`
      SELECT COUNT(DISTINCT call_id)::INT AS total_races
      FROM race_mode_results
      WHERE created_at > NOW() - MAKE_INTERVAL(days => $1)
    `, [daysBack]),
    ]);

    const entries: LeaderboardEntry[] = perModel.rows.map(
      (row: RaceAggregateRow) => toLeaderboardEntry(row)
    );

    // Sort by composite score (stable, so SQL order breaks ties), then
    // hand out positions and medals for the top three.
    entries.sort((a, b) => b.rank - a.rank);
    const medals = ['gold', 'silver', 'bronze'] as const;
    entries.forEach((entry, index) => {
      entry.rankPosition = index + 1;
      entry.badge = medals[index] ?? null;
    });

    // The SQL coalesces a missing average latency to 0; such models have no
    // timing data and must not be reported as the fastest.
    const fastest = entries
      .filter((e) => e.avgLatencyMs > 0)
      .sort((a, b) => a.avgLatencyMs - b.avgLatencyMs)[0];
    // Require at least two participations so a single lucky win does not
    // top the reliability highlight.
    const reliable = entries
      .filter((e) => e.participations >= 2)
      .sort((a, b) => b.winRate - a.winRate)[0];

    return {
      totalRaces: parseCountOrZero(totals.rows[0]?.total_races ?? '0'),
      daysCovered: daysBack,
      entries,
      fastestThisWeek: fastest ? { model: fastest.model, latencyMs: fastest.avgLatencyMs } : null,
      mostReliable: reliable ? { model: reliable.model, winRate: reliable.winRate } : null,
    };
  } catch (err) {
    // Best-effort: the leaderboard is auxiliary, so degrade to an empty
    // result rather than failing the caller.
    logger.warn({ err }, 'race-leaderboard: aggregation failed');
    return { totalRaces: 0, daysCovered: daysBack, entries: [], fastestThisWeek: null, mostReliable: null };
  }
}