"""Evaluation endpoints for retrieval quality metrics.""" from fastapi import APIRouter, HTTPException, Depends from pydantic import BaseModel from typing import List, Optional import logging from app.config import settings from app.db import get_session from app.services.evaluation_service import EvaluationService logger = logging.getLogger(__name__) router = APIRouter() class EvalQuery(BaseModel): query: str ground_truth_doc_ids: List[str] # Expected relevant documents class EvalRequest(BaseModel): domain: str = settings.LIGHTRAG_DOMAIN eval_set: str # e.g. "transceiver-50qa" queries: List[EvalQuery] metrics: List[str] = ["precision@5", "recall@10", "mrr@5", "ndcg@10"] compare_to: Optional[str] = "baseline_fts" class MetricResult(BaseModel): metric: str value: float baseline_value: Optional[float] = None improvement_pct: Optional[float] = None class EvalResponse(BaseModel): eval_set: str domain: str metrics: List[MetricResult] total_queries: int latency_p95_ms: float entity_extraction_accuracy: float @router.post("/eval", response_model=EvalResponse) async def evaluate_retrieval( req: EvalRequest, session = Depends(get_session) ): """ Evaluate retrieval quality using evaluation set. Metrics: - Precision@K: % of top-K results that are relevant - Recall@K: % of relevant documents that appear in top-K - MRR@K: Mean Reciprocal Rank - NDCG@K: Normalized Discounted Cumulative Gain - Entity Extraction Accuracy: % of expected entities found """ if not req.queries: raise HTTPException(status_code=400, detail="No evaluation queries provided") try: evaluator = EvaluationService(session) result = await evaluator.evaluate( domain=req.domain, eval_set=req.eval_set, queries=[{"query": q.query, "ground_truth_doc_ids": q.ground_truth_doc_ids} for q in req.queries], metrics=req.metrics, compare_to=req.compare_to ) return EvalResponse( eval_set=result["eval_set"], domain=result["domain"], metrics=[ MetricResult( metric=m["metric"], value=m["value"], baseline_value=m.get("baseline_value"), improvement_pct=m.get("improvement_pct") ) for m in result["metrics"] ], total_queries=result["total_queries"], latency_p95_ms=result.get("latency_p95_ms", 0), entity_extraction_accuracy=result.get("entity_extraction_accuracy", 0) ) except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) except Exception as e: logger.error(f"Evaluation error: {e}", exc_info=True) raise HTTPException(status_code=500, detail=str(e)) @router.get("/eval/datasets") async def list_eval_datasets(domain: Optional[str] = None): """List available evaluation datasets.""" datasets = { "transceiver": [ { "name": "transceiver-50qa", "queries": 50, "domains": ["transceiver", "standard", "vendor"], "created": "2024-12-01" } ], "switch": [], "standard": [] } if domain: return datasets.get(domain, []) return datasets @router.get("/eval/baseline/{eval_set}") async def get_baseline(eval_set: str, metric: str = "precision@5"): """Get baseline metric values (FTS) for comparison.""" baselines = { "transceiver-50qa": { "precision@5": 0.65, "recall@10": 0.72, "mrr@5": 0.58, "ndcg@10": 0.70 } } if eval_set not in baselines: raise HTTPException(status_code=404, detail=f"Baseline for {eval_set} not found") baseline = baselines[eval_set] if metric not in baseline: raise HTTPException(status_code=404, detail=f"Metric {metric} not in baseline") return { "eval_set": eval_set, "metric": metric, "baseline_value": baseline[metric], "method": "bm25_fts" } @router.post("/eval/create-dataset") async def create_evaluation_dataset(req: EvalRequest): """ Create a new evaluation dataset from queries. Stores for future runs and comparison tracking. """ if not req.queries or len(req.queries) < 10: raise HTTPException(status_code=400, detail="Need at least 10 evaluation queries") # TODO: Store eval dataset to database return { "eval_set": req.eval_set, "domain": req.domain, "queries": len(req.queries), "status": "created" }