llm-gateway/packages/lightrag-sidecar/scripts/bootstrap_tip_data.py

#!/usr/bin/env python3
"""Bootstrap LightRAG with TIP (Transceiver Intelligence Platform) training data."""

import os
import sys
import json
import asyncio
import httpx
from pathlib import Path

# Configuration
LIGHTRAG_SIDECAR_URL = os.getenv("LIGHTRAG_SIDECAR_URL", "http://localhost:3140")
DOMAIN = "transceiver"
TIP_DATA_DIR = Path(__file__).parent.parent.parent.parent / "transceiver-db" / "blog-training-data"
BATCH_SIZE = 10


async def load_tip_documents():
    """Load TIP blog posts from transceiver-db."""
    documents = []

    if not TIP_DATA_DIR.exists():
        print(f"Warning: TIP data directory not found: {TIP_DATA_DIR}")
        return documents

    # Look for markdown or JSON files
    for file_path in TIP_DATA_DIR.glob("**/*.md"):
        try:
            with open(file_path, "r") as f:
                content = f.read()
                title = file_path.stem.replace("-", " ").title()
                documents.append({
                    "title": title,
                    "content": content,
                    "source": "blog",
                    "metadata": {"file": str(file_path)}
                })
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

    # Also load JSON training data if present
    for file_path in TIP_DATA_DIR.glob("**/*.json"):
        try:
            with open(file_path, "r") as f:
                data = json.load(f)
                if isinstance(data, list):
                    documents.extend(data)
                elif isinstance(data, dict):
                    documents.append(data)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

    print(f"Loaded {len(documents)} documents from {TIP_DATA_DIR}")
    return documents


async def ingest_batch(client: httpx.AsyncClient, batch: list) -> dict:
    """Ingest a batch of documents."""
    payload = {
        "domain": DOMAIN,
        "documents": batch,
        "batch_size": len(batch)
    }

    response = await client.post(
        f"{LIGHTRAG_SIDECAR_URL}/api/kg/ingest",
        json=payload,
        timeout=30
    )

    if response.status_code != 200:
        print(f"Ingest error: {response.status_code}")
        print(response.text)
        return {}

    return response.json()


async def wait_for_job(client: httpx.AsyncClient, job_id: str, timeout: int = 300):
    """Wait for ingestion job to complete."""
    import time
    start_time = time.time()

    while time.time() - start_time < timeout:
        response = await client.get(
            f"{LIGHTRAG_SIDECAR_URL}/api/kg/ingest/status/{job_id}",
            timeout=10
        )

        if response.status_code != 200:
            print(f"Status check error: {response.status_code}")
            await asyncio.sleep(5)
            continue

        status_data = response.json()
        status = status_data.get("status", "unknown")

        if status == "completed":
            print(f"Job {job_id} completed: {status_data}")
            return True
        elif status == "failed":
            print(f"Job {job_id} failed: {status_data}")
            return False
        else:
            print(f"Job {job_id} status: {status}")
            await asyncio.sleep(5)

    print(f"Job {job_id} timed out after {timeout}s")
    return False


async def main():
    """Bootstrap LightRAG with TIP data."""
    print(f"LightRAG Sidecar Bootstrap — Ingesting TIP Data")
    print(f"Sidecar URL: {LIGHTRAG_SIDECAR_URL}")
    print(f"Domain: {DOMAIN}")

    # Check sidecar health
    async with httpx.AsyncClient() as client:
        try:
            health = await client.get(f"{LIGHTRAG_SIDECAR_URL}/api/kg/health", timeout=5)
            if health.status_code == 200:
                print("✓ Sidecar is healthy")
            else:
                print(f"✗ Sidecar health check failed: {health.status_code}")
                return
        except Exception as e:
            print(f"✗ Cannot reach sidecar: {e}")
            return

        # Load TIP documents
        documents = await load_tip_documents()
        if not documents:
            print("No documents to ingest")
            return

        print(f"Ingesting {len(documents)} documents in batches of {BATCH_SIZE}...")

        # Ingest in batches
        job_ids = []
        for i in range(0, len(documents), BATCH_SIZE):
            batch = documents[i:i+BATCH_SIZE]
            print(f"Ingesting batch {i//BATCH_SIZE + 1}/{(len(documents)-1)//BATCH_SIZE + 1}...")

            response = await ingest_batch(client, batch)
            if response.get("job_id"):
                job_ids.append(response["job_id"])
                print(f"  Job ID: {response['job_id']}")
            else:
                print(f"  Ingest failed")

        # Wait for all jobs
        print(f"\nWaiting for {len(job_ids)} ingestion jobs to complete...")
        for job_id in job_ids:
            await wait_for_job(client, job_id)

        print("\nBootstrap complete!")


if __name__ == "__main__":
    asyncio.run(main())