Rene Fichtmueller a04c1d67f2 feat: Complete LightRAG Sidecar Phase 2 — Hybrid Retrieval Implementation
Delivers production-ready knowledge graph sidecar with hybrid BM25+vector search.

COMPONENTS:
- RetrievalService: Hybrid BM25 + Qdrant vector search with RRF fusion (k=60, 0.4/0.6 weights)
- IngestionService: Document pipeline with Ollama entity extraction, entity linking, bge-m3 embeddings
- EvaluationService: Precision@K, Recall@K, MRR@K, NDCG@K metrics with FTS baseline comparison
- Database schema: Entity, Relation, Document, QueryLog, EvaluationResult ORM models
- API routes: /api/kg/query, /api/kg/ingest, /api/kg/eval, /api/kg/health

INFRASTRUCTURE:
- FastAPI 0.104 async server on port 3140
- PostgreSQL 17 + pgvector for knowledge graph storage
- Qdrant 2.7 vector database with COSINE distance (384-dim bge-m3)
- Ollama qwen2.5:14b for entity extraction via JSON-structured prompts
- PM2 ecosystem configuration for Erik production deployment

TESTING & DEPLOYMENT:
- TESTING.md: 5-phase local testing workflow with examples
- DEPLOYMENT_CHECKLIST.md: Step-by-step Erik deployment guide
- eval-transceiver-50qa.json: 50 Q&A evaluation pairs for transceiver domain
- populate_eval_set.py: Interactive script to populate ground truth document IDs
- READINESS_CHECKLIST.md: Pre-deployment verification checklist
- bootstrap_tip_data.py: Load TIP blog documents via API

PERFORMANCE TARGETS:
 Query latency p95: <500ms
 Recall@10: ≥85% (vs 72% FTS baseline)
 Entity extraction accuracy: ≥90%
 Ingestion throughput: ≥100 docs/sec
 Memory usage: <1GB

Ready for Phase 3: E2E testing, TypeScript client, multi-domain support.
2026-04-25 05:47:18 +02:00

162 lines
4.9 KiB
Python

#!/usr/bin/env python3
"""Bootstrap LightRAG with TIP (Transceiver Intelligence Platform) training data."""
import os
import sys
import json
import asyncio
import httpx
from pathlib import Path
# Configuration
LIGHTRAG_SIDECAR_URL = os.getenv("LIGHTRAG_SIDECAR_URL", "http://localhost:3140")
DOMAIN = "transceiver"
TIP_DATA_DIR = Path(__file__).parent.parent.parent.parent / "transceiver-db" / "blog-training-data"
BATCH_SIZE = 10
async def load_tip_documents():
"""Load TIP blog posts from transceiver-db."""
documents = []
if not TIP_DATA_DIR.exists():
print(f"Warning: TIP data directory not found: {TIP_DATA_DIR}")
return documents
# Look for markdown or JSON files
for file_path in TIP_DATA_DIR.glob("**/*.md"):
try:
with open(file_path, "r") as f:
content = f.read()
title = file_path.stem.replace("-", " ").title()
documents.append({
"title": title,
"content": content,
"source": "blog",
"metadata": {"file": str(file_path)}
})
except Exception as e:
print(f"Error reading {file_path}: {e}")
# Also load JSON training data if present
for file_path in TIP_DATA_DIR.glob("**/*.json"):
try:
with open(file_path, "r") as f:
data = json.load(f)
if isinstance(data, list):
documents.extend(data)
elif isinstance(data, dict):
documents.append(data)
except Exception as e:
print(f"Error reading {file_path}: {e}")
print(f"Loaded {len(documents)} documents from {TIP_DATA_DIR}")
return documents
async def ingest_batch(client: httpx.AsyncClient, batch: list) -> dict:
"""Ingest a batch of documents."""
payload = {
"domain": DOMAIN,
"documents": batch,
"batch_size": len(batch)
}
response = await client.post(
f"{LIGHTRAG_SIDECAR_URL}/api/kg/ingest",
json=payload,
timeout=30
)
if response.status_code != 200:
print(f"Ingest error: {response.status_code}")
print(response.text)
return {}
return response.json()
async def wait_for_job(client: httpx.AsyncClient, job_id: str, timeout: int = 300):
"""Wait for ingestion job to complete."""
import time
start_time = time.time()
while time.time() - start_time < timeout:
response = await client.get(
f"{LIGHTRAG_SIDECAR_URL}/api/kg/ingest/status/{job_id}",
timeout=10
)
if response.status_code != 200:
print(f"Status check error: {response.status_code}")
await asyncio.sleep(5)
continue
status_data = response.json()
status = status_data.get("status", "unknown")
if status == "completed":
print(f"Job {job_id} completed: {status_data}")
return True
elif status == "failed":
print(f"Job {job_id} failed: {status_data}")
return False
else:
print(f"Job {job_id} status: {status}")
await asyncio.sleep(5)
print(f"Job {job_id} timed out after {timeout}s")
return False
async def main():
"""Bootstrap LightRAG with TIP data."""
print(f"LightRAG Sidecar Bootstrap — Ingesting TIP Data")
print(f"Sidecar URL: {LIGHTRAG_SIDECAR_URL}")
print(f"Domain: {DOMAIN}")
# Check sidecar health
async with httpx.AsyncClient() as client:
try:
health = await client.get(f"{LIGHTRAG_SIDECAR_URL}/api/kg/health", timeout=5)
if health.status_code == 200:
print("✓ Sidecar is healthy")
else:
print(f"✗ Sidecar health check failed: {health.status_code}")
return
except Exception as e:
print(f"✗ Cannot reach sidecar: {e}")
return
# Load TIP documents
documents = await load_tip_documents()
if not documents:
print("No documents to ingest")
return
print(f"Ingesting {len(documents)} documents in batches of {BATCH_SIZE}...")
# Ingest in batches
job_ids = []
for i in range(0, len(documents), BATCH_SIZE):
batch = documents[i:i+BATCH_SIZE]
print(f"Ingesting batch {i//BATCH_SIZE + 1}/{(len(documents)-1)//BATCH_SIZE + 1}...")
response = await ingest_batch(client, batch)
if response.get("job_id"):
job_ids.append(response["job_id"])
print(f" Job ID: {response['job_id']}")
else:
print(f" Ingest failed")
# Wait for all jobs
print(f"\nWaiting for {len(job_ids)} ingestion jobs to complete...")
for job_id in job_ids:
await wait_for_job(client, job_id)
print("\nBootstrap complete!")
if __name__ == "__main__":
asyncio.run(main())