llm-gateway/packages/fine-tuner/src/converter.py

"""
converter.py - Convert fine-tuned LoRA adapter to GGUF and register with Ollama.

Pipeline:
  1. Merge LoRA adapter weights into the base model.
  2. Save the merged full-precision HuggingFace model.
  3. Convert to GGUF via llama.cpp convert_hf_to_gguf.py.
  4. Quantize with llama-quantize (Q5_K_M by default).
  5. Create an Ollama Modelfile.
  6. Register the model with Ollama via POST /api/create.
  7. Run a lightweight evaluation to confirm the model is responsive.

All subprocess calls use a fixed argument list — no shell=True, no
string interpolation of user-controlled data into shell commands.
"""

from __future__ import annotations

import json
import logging
import subprocess
import time
from pathlib import Path
from typing import Optional

import requests

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Step 1 — Merge LoRA adapter into base model
# ---------------------------------------------------------------------------

def merge_lora_and_save(
    base_model_path: str,
    adapter_path: str,
    output_path: str,
) -> None:
    """
    Fold a LoRA adapter into its base model and write the merged checkpoint.

    The tokenizer is read from ``adapter_path``; the base model is read from
    ``base_model_path``.  Both the base model and the adapter are requested
    in ``torch.float16``.  The merged weights (safetensors) and the tokenizer
    files are written to ``output_path``, which is created if it is missing.
    The resulting directory is what the GGUF conversion step consumes.

    NOTE(review): the previous docstring stated the base model is loaded in
    float32, but the code asks for float16 — confirm which one is intended.

    Args:
        base_model_path: Path or hub id of the base model.
        adapter_path: Directory holding the LoRA adapter and tokenizer.
        output_path: Directory that receives the merged model.
    """
    # Heavy ML dependencies are imported lazily so that importing this
    # module does not require them.
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer
    import torch

    logger.info(
        "merge_lora_and_save: base=%s adapter=%s → output=%s",
        base_model_path,
        adapter_path,
        output_path,
    )

    half_precision = torch.float16

    # trust_remote_code=True lets the checkpoint ship its own Python code;
    # only use with model sources you trust.
    tokenizer = AutoTokenizer.from_pretrained(adapter_path, trust_remote_code=True)
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=half_precision,
        trust_remote_code=True,
    )
    peft_model = PeftModel.from_pretrained(
        base_model,
        adapter_path,
        torch_dtype=half_precision,
    )

    logger.info("Merging LoRA weights into base model...")
    merged_model = peft_model.merge_and_unload()

    target_dir = Path(output_path)
    target_dir.mkdir(parents=True, exist_ok=True)
    destination = str(target_dir)

    merged_model.save_pretrained(destination, safe_serialization=True)
    tokenizer.save_pretrained(destination)
    logger.info("Merged model saved to %s", target_dir)


# ---------------------------------------------------------------------------
# Step 2 — Convert HuggingFace model to GGUF
# ---------------------------------------------------------------------------

def convert_to_gguf(
    model_path: str,
    output_gguf_path: str,
    quantization: str = "Q5_K_M",
    convert_script: str = "/opt/homebrew/lib/python3.12/site-packages/llama_cpp/convert_hf_to_gguf.py",
    quantize_binary: str = "/opt/homebrew/bin/llama-quantize",
) -> bool:
    """
    Convert a HuggingFace model directory to a quantized GGUF file.

    Steps:
    1. Run convert_hf_to_gguf.py → unquantized f16 GGUF, written next to the
       requested output as ``<output without suffix>_f16.gguf``.
    2. Run llama-quantize → ``quantization`` (Q5_K_M by default).  If the
       quantize binary does not exist, the f16 file is moved to
       ``output_gguf_path`` unquantized and the call still succeeds.

    The intermediate f16 file is removed on every exit path (success,
    failure, or timeout) so that failed runs do not leave large files behind.

    Args:
        model_path: Directory containing the merged HuggingFace model.
        output_gguf_path: Destination of the final GGUF file; its parent
            directory is created if needed.
        quantization: Quantization type passed to llama-quantize.
        convert_script: Location of convert_hf_to_gguf.py.  If it does not
            exist, the directory of the installed ``llama_cpp`` package is
            tried as a fallback.
        quantize_binary: Location of the llama-quantize executable.

    Returns:
        True on success, False if the script cannot be found, a subprocess
        fails, cannot be started, or times out, or the fallback move fails.

    All subprocess calls use explicit argument lists (no shell=True).
    """
    model_path_obj = Path(model_path)
    output_path_obj = Path(output_gguf_path)
    output_path_obj.parent.mkdir(parents=True, exist_ok=True)

    # Intermediate unquantized GGUF
    f16_path = Path(str(output_path_obj.with_suffix("")) + "_f16.gguf")
    f16_gguf = str(f16_path)

    # --- Locate the conversion script ---
    convert_script_path = Path(convert_script)
    if not convert_script_path.exists():
        # Try to find it relative to the llama-cpp-python package
        import importlib.util

        spec = importlib.util.find_spec("llama_cpp")
        if not (spec and spec.origin):
            logger.error(
                "convert_to_gguf: convert_hf_to_gguf.py not found at %s", convert_script
            )
            return False

        alt_script = Path(spec.origin).parent / "convert_hf_to_gguf.py"
        if not alt_script.exists():
            logger.error(
                "convert_to_gguf: convert_hf_to_gguf.py not found at %s or %s",
                convert_script,
                alt_script,
            )
            return False
        convert_script_path = alt_script

    try:
        # --- Conversion step ---
        cmd_convert = [
            "python3",
            str(convert_script_path),
            str(model_path_obj),
            "--outfile",
            f16_gguf,
            "--outtype",
            "f16",
        ]

        logger.info("convert_to_gguf: running conversion: %s", " ".join(cmd_convert))
        try:
            result = subprocess.run(
                cmd_convert,
                capture_output=True,
                text=True,
                timeout=3600,  # 1 hour — large models take time
                check=False,
            )
        except subprocess.TimeoutExpired:
            logger.error("convert_to_gguf: conversion timed out after 3600s")
            return False
        except FileNotFoundError as exc:
            logger.error("convert_to_gguf: python3 not found: %s", exc)
            return False
        except OSError as exc:
            logger.error("convert_to_gguf: could not start conversion: %s", exc)
            return False

        if result.returncode != 0:
            logger.error(
                "convert_to_gguf: conversion failed (rc=%d):\n%s\n%s",
                result.returncode,
                result.stdout[-2000:],
                result.stderr[-2000:],
            )
            return False
        logger.info("convert_to_gguf: f16 GGUF written to %s", f16_gguf)

        # --- Quantization step ---
        quantize_bin = Path(quantize_binary)
        if not quantize_bin.exists():
            logger.warning(
                "convert_to_gguf: llama-quantize not found at %s, skipping quantization",
                quantize_binary,
            )
            # Use f16 as the output without quantization
            try:
                f16_path.rename(output_gguf_path)
            except OSError as exc:
                logger.error(
                    "convert_to_gguf: could not move %s to %s: %s",
                    f16_gguf,
                    output_gguf_path,
                    exc,
                )
                return False
            return True

        cmd_quantize = [
            str(quantize_bin),
            f16_gguf,
            output_gguf_path,
            quantization,
        ]

        logger.info("convert_to_gguf: quantizing: %s", " ".join(cmd_quantize))
        try:
            result = subprocess.run(
                cmd_quantize,
                capture_output=True,
                text=True,
                timeout=3600,
                check=False,
            )
        except subprocess.TimeoutExpired:
            logger.error("convert_to_gguf: quantization timed out after 3600s")
            return False
        except OSError as exc:
            # e.g. the file exists but is not executable
            logger.error(
                "convert_to_gguf: could not start %s: %s", quantize_binary, exc
            )
            return False

        if result.returncode != 0:
            logger.error(
                "convert_to_gguf: quantization failed (rc=%d):\n%s\n%s",
                result.returncode,
                result.stdout[-2000:],
                result.stderr[-2000:],
            )
            return False
        logger.info("convert_to_gguf: quantized GGUF written to %s", output_gguf_path)
        return True
    finally:
        # Clean up the intermediate f16 file on every exit path.  After the
        # "skip quantization" rename it no longer exists, which is fine.
        try:
            f16_path.unlink(missing_ok=True)
        except OSError as exc:
            logger.warning(
                "convert_to_gguf: could not remove intermediate file %s: %s",
                f16_gguf,
                exc,
            )


# ---------------------------------------------------------------------------
# Step 3 — Create Ollama Modelfile
# ---------------------------------------------------------------------------

def create_ollama_modelfile(
    gguf_path: str,
    base_name: str,
    task_type: Optional[str],
    temperature: float = 0.3,
    num_ctx: int = 8192,
) -> str:
    """
    Generate Ollama Modelfile content for the fine-tuned model.

    The ``FROM`` line always carries an absolute GGUF path: a relative
    ``gguf_path`` is anchored at the current working directory so that the
    Modelfile does not depend on the working directory of whoever reads it.
    Symlinks are not resolved.

    Args:
        gguf_path: Location of the GGUF file (absolute or relative).
        base_name: Currently unused; kept for interface compatibility.
        task_type: Task label inserted into the system prompt; ``None`` or
            an empty string is rendered as ``"general"``.
        temperature: Value for ``PARAMETER temperature``.
        num_ctx: Value for ``PARAMETER num_ctx``.

    Returns:
        The Modelfile text (ChatML-style template, ``<|im_end|>`` stop token,
        and a task-specific system prompt).
    """
    task_label = task_type or "general"
    absolute_gguf = Path(gguf_path).absolute()

    system_prompt = (
        f"You are a fine-tuned assistant specialised in {task_label} tasks. "
        "Provide accurate, detailed, professional responses. "
        "Your outputs have been optimised through automated training on "
        "high-quality examples from the LLM Gateway learning corpus."
    )

    # Go-template body; kept as plain (non-f) strings so the braces can be
    # written literally.
    template = (
        "{{- if .System}}<|im_start|>system\n{{.System}}<|im_end|>\n{{- end}}\n"
        "{{- range .Messages}}\n"
        "<|im_start|>{{.Role}}\n{{.Content}}<|im_end|>\n"
        "{{- end}}\n"
        "<|im_start|>assistant\n"
    )

    return (
        f"FROM {absolute_gguf}\n"
        f"PARAMETER temperature {temperature}\n"
        f"PARAMETER num_ctx {num_ctx}\n"
        'PARAMETER stop "<|im_end|>"\n'
        f'TEMPLATE """\n{template}"""\n'
        f'SYSTEM "{system_prompt}"\n'
    )


# ---------------------------------------------------------------------------
# Step 4 — Register with Ollama
# ---------------------------------------------------------------------------

def register_with_ollama(
    modelfile_content: str,
    model_name: str,
    ollama_url: str,
    timeout_s: int = 600,
) -> bool:
    """
    Register a model with Ollama via POST /api/create (streaming response).

    The response is consumed line by line; each line is expected to be a
    JSON object carrying a ``status`` and, on failure, an ``error`` field.
    Status changes are logged as progress.

    Args:
        modelfile_content: Full Modelfile text to submit.
        model_name: Name under which the model is registered.
        ollama_url: Base URL of the Ollama server; a trailing slash is
            tolerated.
        timeout_s: Timeout handed to ``requests.post``.

    Returns:
        True only when the stream ends with a ``"success"`` status.  False
        when Ollama reports an error, the stream ends without confirming
        success (including an empty body), the request times out, or any
        other HTTP/connection error occurs.
    """
    # Avoid "//api/create" when the base URL is configured with a trailing slash.
    url = f"{ollama_url.rstrip('/')}/api/create"
    payload = {"name": model_name, "modelfile": modelfile_content}

    logger.info("register_with_ollama: model=%s url=%s", model_name, url)

    try:
        with requests.post(
            url,
            json=payload,
            stream=True,
            timeout=timeout_s,
            headers={"Content-Type": "application/json"},
        ) as resp:
            resp.raise_for_status()

            last_status = ""
            for line in resp.iter_lines():
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except (json.JSONDecodeError, UnicodeDecodeError):
                    # Skip anything that is not a well-formed JSON line.
                    continue
                if not isinstance(data, dict):
                    # A bare JSON scalar/array carries no status; ignore it.
                    continue

                status = data.get("status", "")
                if status != last_status:
                    logger.info("Ollama create: %s", status)
                    last_status = status

                if data.get("error"):
                    logger.error("Ollama create error: %s", data["error"])
                    return False

            # Absence of an error is not confirmation: require the final
            # "success" status before reporting the model as registered.
            if last_status != "success":
                logger.error(
                    "register_with_ollama: stream ended without success "
                    "confirmation (last status=%r)",
                    last_status,
                )
                return False

            logger.info("register_with_ollama: model=%s registered successfully", model_name)
            return True

    except requests.exceptions.Timeout:
        logger.error("register_with_ollama: timed out after %ds", timeout_s)
        return False
    except requests.exceptions.RequestException as exc:
        logger.error("register_with_ollama: HTTP error: %s", exc)
        return False


# ---------------------------------------------------------------------------
# Step 5 — Evaluate deployed model
# ---------------------------------------------------------------------------

def evaluate_model(
    model_name: str,
    task_type: str,
    gateway_url: str,
    n_samples: int = 20,
    timeout_s: int = 60,
) -> float:
    """
    Score the freshly deployed model by sending evaluation prompts through
    the gateway.

    Prompts are looked up in ``EVAL_PROMPTS`` under ``task_type``; when that
    entry is missing or empty the ``"general"`` prompts are used.  At most
    ``n_samples`` prompts are sent, with a short pause after each call.

    Returns:
        The mean of the confidences returned by the gateway helper, rounded
        to three decimals, ignoring calls that returned ``None``.  ``0.0``
        when no call produced a confidence.
    """
    from .evaluator import EVAL_PROMPTS, _call_gateway

    general_prompts = EVAL_PROMPTS.get("general", [])
    prompts = EVAL_PROMPTS.get(task_type, general_prompts)
    if not prompts:
        logger.warning(
            "evaluate_model: no eval prompts for task_type=%s, using general", task_type
        )
        prompts = general_prompts

    collected: list[float] = []
    for prompt in prompts[:n_samples]:
        outcome = _call_gateway(
            gateway_url=gateway_url,
            model=model_name,
            prompt=prompt,
            task_type=task_type,
            timeout_s=timeout_s,
        )
        if outcome is not None:
            collected.append(outcome)
        # Pause between requests so the gateway is not flooded.
        time.sleep(0.3)

    if collected:
        mean_confidence = round(sum(collected) / len(collected), 3)
        logger.info(
            "evaluate_model: model=%s task=%s avg_confidence=%.3f (n=%d)",
            model_name,
            task_type,
            mean_confidence,
            len(collected),
        )
        return mean_confidence

    logger.warning(
        "evaluate_model: model=%s task=%s — no successful responses", model_name, task_type
    )
    return 0.0


# ---------------------------------------------------------------------------
# Convenience: full pipeline
# ---------------------------------------------------------------------------

def run_conversion_and_registration(
    base_model_path: str,
    adapter_path: str,
    task_type: Optional[str],
    output_base_dir: str,
    ollama_url: str,
    gateway_url: str,
    quantization: str = "Q5_K_M",
    min_confidence_to_deploy: float = 0.0,
    convert_script: str = "/opt/homebrew/lib/python3.12/site-packages/llama_cpp/convert_hf_to_gguf.py",
    quantize_binary: str = "/opt/homebrew/bin/llama-quantize",
) -> dict:
    """
    End-to-end: merge → GGUF → Ollama registration → evaluation.

    The model is named ``llm-gateway-<task>-ft`` (``<task>`` is ``general``
    when ``task_type`` is empty).  The merged model is written to
    ``<output_base_dir>/merged`` and the GGUF to
    ``<output_base_dir>/<model name>.gguf``.

    Args:
        base_model_path: Base model to merge the adapter into.
        adapter_path: LoRA adapter directory.
        task_type: Task label, or ``None`` for the general model.
        output_base_dir: Directory receiving the merged model and the GGUF.
        ollama_url: Base URL of the Ollama server.
        gateway_url: Gateway URL used for the evaluation prompts.
        quantization: Quantization type for the GGUF.
        min_confidence_to_deploy: Minimum evaluation confidence required for
            the run to count as successful.  With the default of ``0.0`` any
            non-negative confidence passes.
        convert_script: Location of convert_hf_to_gguf.py.
        quantize_binary: Location of llama-quantize.

    Returns:
        A dict with keys ``success``, ``model_name``, ``confidence`` and
        ``error``.  Any exception raised by a step is caught, logged, and
        reported through ``error``.  When the confidence threshold is not
        met, ``success`` is False and ``confidence`` holds the measured
        value; the model has already been registered with Ollama at that
        point and is not removed here.
    """
    task_label = task_type or "general"
    model_name = f"llm-gateway-{task_label}-ft"

    base = Path(output_base_dir)
    merged_dir = str(base / "merged")
    gguf_path = str(base / f"{model_name}.gguf")

    result: dict = {
        "success": False,
        "model_name": model_name,
        "confidence": 0.0,
        "error": None,
    }

    try:
        logger.info("Pipeline step 1/4: merging LoRA adapter")
        merge_lora_and_save(base_model_path, adapter_path, merged_dir)

        logger.info("Pipeline step 2/4: converting to GGUF (%s)", quantization)
        ok = convert_to_gguf(
            merged_dir,
            gguf_path,
            quantization=quantization,
            convert_script=convert_script,
            quantize_binary=quantize_binary,
        )
        if not ok:
            result["error"] = "GGUF conversion failed"
            return result

        logger.info("Pipeline step 3/4: registering with Ollama")
        modelfile = create_ollama_modelfile(gguf_path, model_name, task_type)
        registered = register_with_ollama(modelfile, model_name, ollama_url)
        if not registered:
            result["error"] = "Ollama registration failed"
            return result

        logger.info("Pipeline step 4/4: evaluating deployed model")
        confidence = evaluate_model(model_name, task_label, gateway_url)
        result["confidence"] = confidence

        # Honour the caller's quality gate; previously this parameter was
        # accepted but never consulted.
        if confidence < min_confidence_to_deploy:
            result["error"] = (
                f"Evaluation confidence {confidence:.3f} is below the required "
                f"minimum {min_confidence_to_deploy:.3f}"
            )
            logger.warning(
                "Conversion pipeline: model=%s confidence=%.3f below minimum %.3f",
                model_name,
                confidence,
                min_confidence_to_deploy,
            )
            return result

        result["success"] = True

        logger.info(
            "Conversion pipeline complete: model=%s confidence=%.3f",
            model_name,
            confidence,
        )

    except Exception as exc:
        # Top-level boundary: report every failure through the result dict
        # instead of propagating, keeping the traceback in the log.
        logger.error("run_conversion_and_registration: unexpected error: %s", exc, exc_info=True)
        result["error"] = str(exc)

    return result