""" converter.py - Convert fine-tuned LoRA adapter to GGUF and register with Ollama. Pipeline: 1. Merge LoRA adapter weights into the base model. 2. Save the merged full-precision HuggingFace model. 3. Convert to GGUF via llama.cpp convert_hf_to_gguf.py. 4. Quantize with llama-quantize (Q5_K_M by default). 5. Create an Ollama Modelfile. 6. Register the model with Ollama via POST /api/create. 7. Run a lightweight evaluation to confirm the model is responsive. All subprocess calls use a fixed argument list — no shell=True, no string interpolation of user-controlled data into shell commands. """ from __future__ import annotations import json import logging import subprocess import time from pathlib import Path from typing import Optional import requests logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Step 1 — Merge LoRA adapter into base model # --------------------------------------------------------------------------- def merge_lora_and_save( base_model_path: str, adapter_path: str, output_path: str, ) -> None: """ Merge LoRA adapter weights into the base model and save the result. The merged model is saved in standard HuggingFace format and can then be converted to GGUF. The base model is loaded in float32 for maximum compatibility with MPS and llama.cpp. """ from peft import PeftModel from transformers import AutoModelForCausalLM, AutoTokenizer import torch logger.info( "merge_lora_and_save: base=%s adapter=%s → output=%s", base_model_path, adapter_path, output_path, ) tokenizer = AutoTokenizer.from_pretrained( adapter_path, trust_remote_code=True, ) base_model = AutoModelForCausalLM.from_pretrained( base_model_path, torch_dtype=torch.float16, trust_remote_code=True, ) model = PeftModel.from_pretrained( base_model, adapter_path, torch_dtype=torch.float16, ) logger.info("Merging LoRA weights into base model...") merged = model.merge_and_unload() out = Path(output_path) out.mkdir(parents=True, exist_ok=True) merged.save_pretrained(str(out), safe_serialization=True) tokenizer.save_pretrained(str(out)) logger.info("Merged model saved to %s", out) # --------------------------------------------------------------------------- # Step 2 — Convert HuggingFace model to GGUF # --------------------------------------------------------------------------- def convert_to_gguf( model_path: str, output_gguf_path: str, quantization: str = "Q5_K_M", convert_script: str = "/opt/homebrew/lib/python3.12/site-packages/llama_cpp/convert_hf_to_gguf.py", quantize_binary: str = "/opt/homebrew/bin/llama-quantize", ) -> bool: """ Convert a HuggingFace model directory to a quantized GGUF file. Steps: 1. Run convert_hf_to_gguf.py → unquantized fp16 GGUF. 2. Run llama-quantize → Q5_K_M (or requested quantization). Returns True on success, False on any failure. All subprocess calls use explicit argument lists (no shell=True). """ model_path_obj = Path(model_path) output_path_obj = Path(output_gguf_path) output_path_obj.parent.mkdir(parents=True, exist_ok=True) # Intermediate unquantized GGUF f16_gguf = str(output_path_obj.with_suffix("")) + "_f16.gguf" # --- Conversion step --- convert_script_path = Path(convert_script) if not convert_script_path.exists(): # Try to find it relative to the llama-cpp-python package import importlib.util spec = importlib.util.find_spec("llama_cpp") if spec and spec.origin: pkg_dir = Path(spec.origin).parent alt_script = pkg_dir / "convert_hf_to_gguf.py" if alt_script.exists(): convert_script_path = alt_script else: logger.error( "convert_to_gguf: convert_hf_to_gguf.py not found at %s or %s", convert_script, alt_script, ) return False else: logger.error( "convert_to_gguf: convert_hf_to_gguf.py not found at %s", convert_script ) return False cmd_convert = [ "python3", str(convert_script_path), str(model_path_obj), "--outfile", f16_gguf, "--outtype", "f16", ] logger.info("convert_to_gguf: running conversion: %s", " ".join(cmd_convert)) try: result = subprocess.run( cmd_convert, capture_output=True, text=True, timeout=3600, # 1 hour — large models take time check=False, ) if result.returncode != 0: logger.error( "convert_to_gguf: conversion failed (rc=%d):\n%s\n%s", result.returncode, result.stdout[-2000:], result.stderr[-2000:], ) return False logger.info("convert_to_gguf: f16 GGUF written to %s", f16_gguf) except subprocess.TimeoutExpired: logger.error("convert_to_gguf: conversion timed out after 3600s") return False except FileNotFoundError as exc: logger.error("convert_to_gguf: python3 not found: %s", exc) return False # --- Quantization step --- quantize_bin = Path(quantize_binary) if not quantize_bin.exists(): logger.warning( "convert_to_gguf: llama-quantize not found at %s, skipping quantization", quantize_binary, ) # Use f16 as the output without quantization Path(f16_gguf).rename(output_gguf_path) return True cmd_quantize = [ str(quantize_bin), f16_gguf, output_gguf_path, quantization, ] logger.info("convert_to_gguf: quantizing: %s", " ".join(cmd_quantize)) try: result = subprocess.run( cmd_quantize, capture_output=True, text=True, timeout=3600, check=False, ) if result.returncode != 0: logger.error( "convert_to_gguf: quantization failed (rc=%d):\n%s\n%s", result.returncode, result.stdout[-2000:], result.stderr[-2000:], ) return False logger.info("convert_to_gguf: quantized GGUF written to %s", output_gguf_path) except subprocess.TimeoutExpired: logger.error("convert_to_gguf: quantization timed out after 3600s") return False # Clean up intermediate f16 file try: Path(f16_gguf).unlink(missing_ok=True) except OSError: pass return True # --------------------------------------------------------------------------- # Step 3 — Create Ollama Modelfile # --------------------------------------------------------------------------- def create_ollama_modelfile( gguf_path: str, base_name: str, task_type: Optional[str], temperature: float = 0.3, num_ctx: int = 8192, ) -> str: """ Generate Ollama Modelfile content for the fine-tuned model. The Modelfile uses the absolute GGUF path so Ollama can locate it regardless of working directory. """ task_label = task_type or "general" model_name = f"llm-gateway-{task_label}-ft" system_prompt = ( f"You are a fine-tuned assistant specialised in {task_label} tasks. " "Provide accurate, detailed, professional responses. " "Your outputs have been optimised through automated training on " "high-quality examples from the LLM Gateway learning corpus." ) modelfile = ( f"FROM {gguf_path}\n" f"PARAMETER temperature {temperature}\n" f"PARAMETER num_ctx {num_ctx}\n" f"PARAMETER stop \"<|im_end|>\"\n" f"TEMPLATE \"\"\"\n" f"{{{{- if .System}}}}<|im_start|>system\n{{{{.System}}}}<|im_end|>\n{{{{- end}}}}\n" f"{{{{- range .Messages}}}}\n" f"<|im_start|>{{{{.Role}}}}\n{{{{.Content}}}}<|im_end|>\n" f"{{{{- end}}}}\n" f"<|im_start|>assistant\n\"\"\"\n" f"SYSTEM \"{system_prompt}\"\n" ) return modelfile # --------------------------------------------------------------------------- # Step 4 — Register with Ollama # --------------------------------------------------------------------------- def register_with_ollama( modelfile_content: str, model_name: str, ollama_url: str, timeout_s: int = 600, ) -> bool: """ Register a model with Ollama via POST /api/create (streaming response). Streams the response to capture progress lines. Returns True when Ollama confirms success, False on any error. """ url = f"{ollama_url}/api/create" payload = {"name": model_name, "modelfile": modelfile_content} logger.info("register_with_ollama: model=%s url=%s", model_name, url) try: with requests.post( url, json=payload, stream=True, timeout=timeout_s, headers={"Content-Type": "application/json"}, ) as resp: resp.raise_for_status() last_status = "" for line in resp.iter_lines(): if not line: continue try: data = json.loads(line) except json.JSONDecodeError: continue status = data.get("status", "") if status != last_status: logger.info("Ollama create: %s", status) last_status = status if data.get("error"): logger.error("Ollama create error: %s", data["error"]) return False logger.info("register_with_ollama: model=%s registered successfully", model_name) return True except requests.exceptions.Timeout: logger.error("register_with_ollama: timed out after %ds", timeout_s) return False except requests.exceptions.RequestException as exc: logger.error("register_with_ollama: HTTP error: %s", exc) return False # --------------------------------------------------------------------------- # Step 5 — Evaluate deployed model # --------------------------------------------------------------------------- def evaluate_model( model_name: str, task_type: str, gateway_url: str, n_samples: int = 20, timeout_s: int = 60, ) -> float: """ Run evaluation prompts through the gateway using the newly deployed model. Returns average confidence score across all successful responses. Returns 0.0 if no successful responses were obtained. """ from .evaluator import EVAL_PROMPTS, _call_gateway prompts = EVAL_PROMPTS.get(task_type, EVAL_PROMPTS.get("general", [])) if not prompts: logger.warning( "evaluate_model: no eval prompts for task_type=%s, using general", task_type ) prompts = EVAL_PROMPTS.get("general", []) # Limit to n_samples selected_prompts = prompts[:n_samples] scores: list[float] = [] for prompt in selected_prompts: confidence = _call_gateway( gateway_url=gateway_url, model=model_name, prompt=prompt, task_type=task_type, timeout_s=timeout_s, ) if confidence is not None: scores.append(confidence) time.sleep(0.3) # avoid overwhelming the gateway if not scores: logger.warning( "evaluate_model: model=%s task=%s — no successful responses", model_name, task_type ) return 0.0 avg = round(sum(scores) / len(scores), 3) logger.info( "evaluate_model: model=%s task=%s avg_confidence=%.3f (n=%d)", model_name, task_type, avg, len(scores), ) return avg # --------------------------------------------------------------------------- # Convenience: full pipeline # --------------------------------------------------------------------------- def run_conversion_and_registration( base_model_path: str, adapter_path: str, task_type: Optional[str], output_base_dir: str, ollama_url: str, gateway_url: str, quantization: str = "Q5_K_M", min_confidence_to_deploy: float = 0.0, convert_script: str = "/opt/homebrew/lib/python3.12/site-packages/llama_cpp/convert_hf_to_gguf.py", quantize_binary: str = "/opt/homebrew/bin/llama-quantize", ) -> dict: """ End-to-end: merge → GGUF → Ollama registration → evaluation. Returns a result dict with keys: success, model_name, confidence, error. """ task_label = task_type or "general" model_name = f"llm-gateway-{task_label}-ft" base = Path(output_base_dir) merged_dir = str(base / "merged") gguf_path = str(base / f"{model_name}.gguf") result: dict = { "success": False, "model_name": model_name, "confidence": 0.0, "error": None, } try: logger.info("Pipeline step 1/4: merging LoRA adapter") merge_lora_and_save(base_model_path, adapter_path, merged_dir) logger.info("Pipeline step 2/4: converting to GGUF (%s)", quantization) ok = convert_to_gguf( merged_dir, gguf_path, quantization=quantization, convert_script=convert_script, quantize_binary=quantize_binary, ) if not ok: result["error"] = "GGUF conversion failed" return result logger.info("Pipeline step 3/4: registering with Ollama") modelfile = create_ollama_modelfile(gguf_path, model_name, task_type) registered = register_with_ollama(modelfile, model_name, ollama_url) if not registered: result["error"] = "Ollama registration failed" return result logger.info("Pipeline step 4/4: evaluating deployed model") confidence = evaluate_model(model_name, task_label, gateway_url) result["success"] = True result["confidence"] = confidence logger.info( "Conversion pipeline complete: model=%s confidence=%.3f", model_name, confidence, ) except Exception as exc: logger.error("run_conversion_and_registration: unexpected error: %s", exc, exc_info=True) result["error"] = str(exc) return result