#!/usr/bin/env python3 """Bootstrap LightRAG with TIP (Transceiver Intelligence Platform) training data.""" import os import sys import json import asyncio import httpx from pathlib import Path # Configuration LIGHTRAG_SIDECAR_URL = os.getenv("LIGHTRAG_SIDECAR_URL", "http://localhost:3140") DOMAIN = "transceiver" TIP_DATA_DIR = Path(__file__).parent.parent.parent.parent / "transceiver-db" / "blog-training-data" BATCH_SIZE = 10 async def load_tip_documents(): """Load TIP blog posts from transceiver-db.""" documents = [] if not TIP_DATA_DIR.exists(): print(f"Warning: TIP data directory not found: {TIP_DATA_DIR}") return documents # Look for markdown or JSON files for file_path in TIP_DATA_DIR.glob("**/*.md"): try: with open(file_path, "r") as f: content = f.read() title = file_path.stem.replace("-", " ").title() documents.append({ "title": title, "content": content, "source": "blog", "metadata": {"file": str(file_path)} }) except Exception as e: print(f"Error reading {file_path}: {e}") # Also load JSON training data if present for file_path in TIP_DATA_DIR.glob("**/*.json"): try: with open(file_path, "r") as f: data = json.load(f) if isinstance(data, list): documents.extend(data) elif isinstance(data, dict): documents.append(data) except Exception as e: print(f"Error reading {file_path}: {e}") print(f"Loaded {len(documents)} documents from {TIP_DATA_DIR}") return documents async def ingest_batch(client: httpx.AsyncClient, batch: list) -> dict: """Ingest a batch of documents.""" payload = { "domain": DOMAIN, "documents": batch, "batch_size": len(batch) } response = await client.post( f"{LIGHTRAG_SIDECAR_URL}/api/kg/ingest", json=payload, timeout=30 ) if response.status_code != 200: print(f"Ingest error: {response.status_code}") print(response.text) return {} return response.json() async def wait_for_job(client: httpx.AsyncClient, job_id: str, timeout: int = 300): """Wait for ingestion job to complete.""" import time start_time = time.time() while time.time() - start_time < timeout: response = await client.get( f"{LIGHTRAG_SIDECAR_URL}/api/kg/ingest/status/{job_id}", timeout=10 ) if response.status_code != 200: print(f"Status check error: {response.status_code}") await asyncio.sleep(5) continue status_data = response.json() status = status_data.get("status", "unknown") if status == "completed": print(f"Job {job_id} completed: {status_data}") return True elif status == "failed": print(f"Job {job_id} failed: {status_data}") return False else: print(f"Job {job_id} status: {status}") await asyncio.sleep(5) print(f"Job {job_id} timed out after {timeout}s") return False async def main(): """Bootstrap LightRAG with TIP data.""" print(f"LightRAG Sidecar Bootstrap — Ingesting TIP Data") print(f"Sidecar URL: {LIGHTRAG_SIDECAR_URL}") print(f"Domain: {DOMAIN}") # Check sidecar health async with httpx.AsyncClient() as client: try: health = await client.get(f"{LIGHTRAG_SIDECAR_URL}/api/kg/health", timeout=5) if health.status_code == 200: print("✓ Sidecar is healthy") else: print(f"✗ Sidecar health check failed: {health.status_code}") return except Exception as e: print(f"✗ Cannot reach sidecar: {e}") return # Load TIP documents documents = await load_tip_documents() if not documents: print("No documents to ingest") return print(f"Ingesting {len(documents)} documents in batches of {BATCH_SIZE}...") # Ingest in batches job_ids = [] for i in range(0, len(documents), BATCH_SIZE): batch = documents[i:i+BATCH_SIZE] print(f"Ingesting batch {i//BATCH_SIZE + 1}/{(len(documents)-1)//BATCH_SIZE + 1}...") response = await ingest_batch(client, batch) if response.get("job_id"): job_ids.append(response["job_id"]) print(f" Job ID: {response['job_id']}") else: print(f" Ingest failed") # Wait for all jobs print(f"\nWaiting for {len(job_ids)} ingestion jobs to complete...") for job_id in job_ids: await wait_for_job(client, job_id) print("\nBootstrap complete!") if __name__ == "__main__": asyncio.run(main())