- ADR-0001: Multi-Agent Coworking Architecture with LLM Gateway Orchestrator - ADR-0002: Tier Assignment Strategy for Model Selection (cost-first escalation) - ADR-0003: Confidence Gate Thresholds & Learning Cycle Intervals (6h/12h/24h cycles) - ADR-0004: External Provider Fallback Chain Ordering (Cerebras → Groq → Mistral) - Enhanced client SDK: Offline Ollama fallback, health checks, exponential backoff retry - Integration tests: claude-code-integration.test.ts (14 test cases) - PHASE_2F_DEPLOYMENT.md: Pre-deployment checklist, automated deploy, rollback plan - Post-deployment verification procedures for health, client fallback, metrics
459 lines
14 KiB
Python
459 lines
14 KiB
Python
"""
|
|
converter.py - Convert fine-tuned LoRA adapter to GGUF and register with Ollama.
|
|
|
|
Pipeline:
|
|
1. Merge LoRA adapter weights into the base model.
|
|
2. Save the merged full-precision HuggingFace model.
|
|
3. Convert to GGUF via llama.cpp convert_hf_to_gguf.py.
|
|
4. Quantize with llama-quantize (Q5_K_M by default).
|
|
5. Create an Ollama Modelfile.
|
|
6. Register the model with Ollama via POST /api/create.
|
|
7. Run a lightweight evaluation to confirm the model is responsive.
|
|
|
|
All subprocess calls use a fixed argument list — no shell=True, no
|
|
string interpolation of user-controlled data into shell commands.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import subprocess
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import requests
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 1 — Merge LoRA adapter into base model
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def merge_lora_and_save(
    base_model_path: str,
    adapter_path: str,
    output_path: str,
) -> None:
    """
    Fold a LoRA adapter into its base model and write the merged model to disk.

    The tokenizer is read from ``adapter_path`` and the base weights from
    ``base_model_path``; both are loaded with ``trust_remote_code=True``.
    The adapter is applied with PEFT, merged via ``merge_and_unload()``, and
    the result is saved (safetensors serialization) together with the
    tokenizer into ``output_path``, which is created if it does not exist.

    Args:
        base_model_path: Location/identifier of the base model.
        adapter_path: Location of the LoRA adapter (also the tokenizer source).
        output_path: Directory that receives the merged model and tokenizer.

    Any exception raised by transformers/peft or the filesystem propagates
    to the caller.
    """
    # Heavy ML dependencies are imported lazily, only when a merge is requested.
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer
    import torch

    logger.info(
        "merge_lora_and_save: base=%s adapter=%s → output=%s",
        base_model_path,
        adapter_path,
        output_path,
    )

    # NOTE(review): weights are loaded as float16 here; confirm that half
    # precision is the intended dtype for the merge on the target hardware.
    half_precision = torch.float16

    tok = AutoTokenizer.from_pretrained(adapter_path, trust_remote_code=True)
    foundation = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=half_precision,
        trust_remote_code=True,
    )
    adapted = PeftModel.from_pretrained(
        foundation,
        adapter_path,
        torch_dtype=half_precision,
    )

    logger.info("Merging LoRA weights into base model...")
    fused = adapted.merge_and_unload()

    out = Path(output_path)
    out.mkdir(parents=True, exist_ok=True)
    destination = str(out)

    fused.save_pretrained(destination, safe_serialization=True)
    tok.save_pretrained(destination)
    logger.info("Merged model saved to %s", out)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 2 — Convert HuggingFace model to GGUF
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def convert_to_gguf(
    model_path: str,
    output_gguf_path: str,
    quantization: str = "Q5_K_M",
    convert_script: str = "/opt/homebrew/lib/python3.12/site-packages/llama_cpp/convert_hf_to_gguf.py",
    quantize_binary: str = "/opt/homebrew/bin/llama-quantize",
) -> bool:
    """
    Convert a HuggingFace model directory to a quantized GGUF file.

    Steps:
      1. Run convert_hf_to_gguf.py → unquantized f16 GGUF written next to the
         final output as ``<output stem>_f16.gguf``.
      2. Run llama-quantize → ``quantization`` (Q5_K_M by default).

    If ``quantize_binary`` does not exist, quantization is skipped and the
    f16 GGUF is moved to ``output_gguf_path`` instead.

    Args:
        model_path: Directory containing the HuggingFace model.
        output_gguf_path: Destination of the final GGUF file; its parent
            directory is created if needed.
        quantization: Quantization type passed to llama-quantize.
        convert_script: Path to convert_hf_to_gguf.py. When missing, the
            installed ``llama_cpp`` package directory is searched as well.
        quantize_binary: Path to the llama-quantize executable.

    Returns:
        True on success, False when a tool is missing, fails, times out or
        cannot be launched, or when the f16 file cannot be moved into place.

    All subprocess calls use explicit argument lists (no shell=True).
    """
    model_dir = Path(model_path)
    final_gguf = Path(output_gguf_path)
    final_gguf.parent.mkdir(parents=True, exist_ok=True)

    # Intermediate unquantized GGUF
    f16_gguf = str(final_gguf.with_suffix("")) + "_f16.gguf"

    # --- Conversion step ---
    script_path = _find_convert_script(convert_script)
    if script_path is None:
        return False

    cmd_convert = [
        "python3",
        str(script_path),
        str(model_dir),
        "--outfile",
        f16_gguf,
        "--outtype",
        "f16",
    ]
    logger.info("convert_to_gguf: running conversion: %s", " ".join(cmd_convert))
    if not _run_gguf_tool(cmd_convert, "conversion"):
        # A failed or interrupted conversion may leave a partial file behind.
        _discard_file(f16_gguf)
        return False
    logger.info("convert_to_gguf: f16 GGUF written to %s", f16_gguf)

    # --- Quantization step ---
    quantize_bin = Path(quantize_binary)
    if not quantize_bin.exists():
        logger.warning(
            "convert_to_gguf: llama-quantize not found at %s, skipping quantization",
            quantize_binary,
        )
        # Use f16 as the output without quantization. replace() overwrites an
        # existing destination on every platform.
        try:
            Path(f16_gguf).replace(output_gguf_path)
        except OSError as exc:
            logger.error(
                "convert_to_gguf: could not move %s to %s: %s",
                f16_gguf,
                output_gguf_path,
                exc,
            )
            return False
        return True

    cmd_quantize = [
        str(quantize_bin),
        f16_gguf,
        output_gguf_path,
        quantization,
    ]
    logger.info("convert_to_gguf: quantizing: %s", " ".join(cmd_quantize))
    try:
        if not _run_gguf_tool(cmd_quantize, "quantization"):
            return False
        logger.info("convert_to_gguf: quantized GGUF written to %s", output_gguf_path)
        return True
    finally:
        # The intermediate file is large; remove it whether or not
        # quantization succeeded so failed runs do not leak disk space.
        _discard_file(f16_gguf)


def _find_convert_script(convert_script: str) -> Optional[Path]:
    """Return the path to convert_hf_to_gguf.py, or None (after logging) if not found."""
    configured = Path(convert_script)
    if configured.exists():
        return configured

    # Try to find it relative to the llama-cpp-python package
    import importlib.util

    spec = importlib.util.find_spec("llama_cpp")
    if not (spec and spec.origin):
        logger.error(
            "convert_to_gguf: convert_hf_to_gguf.py not found at %s", convert_script
        )
        return None

    packaged = Path(spec.origin).parent / "convert_hf_to_gguf.py"
    if packaged.exists():
        return packaged

    logger.error(
        "convert_to_gguf: convert_hf_to_gguf.py not found at %s or %s",
        convert_script,
        packaged,
    )
    return None


def _run_gguf_tool(cmd: list[str], step: str, timeout_s: int = 3600) -> bool:
    """Run one external tool; log and return False on timeout, launch error or non-zero exit."""
    try:
        completed = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout_s,  # 1 hour by default — large models take time
            check=False,
        )
    except subprocess.TimeoutExpired:
        logger.error("convert_to_gguf: %s timed out after %ds", step, timeout_s)
        return False
    except FileNotFoundError as exc:
        logger.error("convert_to_gguf: %s not found: %s", cmd[0], exc)
        return False
    except OSError as exc:
        # e.g. the executable exists but is not runnable.
        logger.error("convert_to_gguf: could not execute %s: %s", cmd[0], exc)
        return False

    if completed.returncode != 0:
        # Only the tail of each stream is logged to keep log lines bounded.
        logger.error(
            "convert_to_gguf: %s failed (rc=%d):\n%s\n%s",
            step,
            completed.returncode,
            completed.stdout[-2000:],
            completed.stderr[-2000:],
        )
        return False
    return True


def _discard_file(path: str) -> None:
    """Best-effort removal of an intermediate file; failures are logged, never raised."""
    try:
        Path(path).unlink(missing_ok=True)
    except OSError as exc:
        logger.warning("convert_to_gguf: could not remove %s: %s", path, exc)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 3 — Create Ollama Modelfile
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def create_ollama_modelfile(
    gguf_path: str,
    base_name: str,
    task_type: Optional[str],
    temperature: float = 0.3,
    num_ctx: int = 8192,
) -> str:
    """
    Generate Ollama Modelfile content for the fine-tuned model.

    The ``FROM`` line always carries an absolute path: a relative
    ``gguf_path`` is anchored at the current working directory of this
    process, so the reference does not depend on where the Modelfile is
    later interpreted.

    Args:
        gguf_path: Path to the GGUF weights file.
        base_name: Currently unused; kept so existing callers keep working.
        task_type: Task label inserted into the system prompt; ``None`` or
            an empty string falls back to ``"general"``.
        temperature: Value written as ``PARAMETER temperature``.
        num_ctx: Value written as ``PARAMETER num_ctx``.

    Returns:
        The Modelfile text (FROM, PARAMETER, TEMPLATE and SYSTEM directives),
        terminated by a newline.
    """
    task_label = task_type or "general"

    # absolute() only prefixes the working directory; it does not resolve
    # symlinks, so an already-absolute path is emitted unchanged.
    absolute_gguf = Path(gguf_path).absolute()

    system_prompt = (
        f"You are a fine-tuned assistant specialised in {task_label} tasks. "
        "Provide accurate, detailed, professional responses. "
        "Your outputs have been optimised through automated training on "
        "high-quality examples from the LLM Gateway learning corpus."
    )

    # Go-template chat layout using <|im_start|>/<|im_end|> markers. Kept as a
    # plain (non-f) string so the template braces need no escaping.
    chat_template = (
        "{{- if .System}}<|im_start|>system\n{{.System}}<|im_end|>\n{{- end}}\n"
        "{{- range .Messages}}\n"
        "<|im_start|>{{.Role}}\n{{.Content}}<|im_end|>\n"
        "{{- end}}\n"
        "<|im_start|>assistant\n"
    )

    return (
        f"FROM {absolute_gguf}\n"
        f"PARAMETER temperature {temperature}\n"
        f"PARAMETER num_ctx {num_ctx}\n"
        'PARAMETER stop "<|im_end|>"\n'
        f'TEMPLATE """\n{chat_template}"""\n'
        f'SYSTEM "{system_prompt}"\n'
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 4 — Register with Ollama
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def register_with_ollama(
    modelfile_content: str,
    model_name: str,
    ollama_url: str,
    timeout_s: int = 600,
) -> bool:
    """
    Register a model with Ollama via POST /api/create (streaming response).

    The response is consumed line by line; each line is expected to be a JSON
    object. Status changes are logged as they arrive.

    Args:
        modelfile_content: Modelfile text sent as the ``modelfile`` field.
        model_name: Name sent as the ``name`` field.
        ollama_url: Base URL of the Ollama server; a trailing slash is tolerated.
        timeout_s: Timeout value handed to ``requests.post``.

    Returns:
        True only when the stream contained a ``"success"`` status and no
        ``error`` field. False on an HTTP error, a timeout, a reported error,
        or a stream that ends without ever reporting success.
    """
    # Strip a trailing slash so the request path never becomes "//api/create".
    url = f"{ollama_url.rstrip('/')}/api/create"
    payload = {"name": model_name, "modelfile": modelfile_content}

    logger.info("register_with_ollama: model=%s url=%s", model_name, url)

    saw_success = False
    last_status = ""
    try:
        with requests.post(
            url,
            json=payload,
            stream=True,
            timeout=timeout_s,
            headers={"Content-Type": "application/json"},
        ) as resp:
            resp.raise_for_status()

            for line in resp.iter_lines():
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    # Skip anything that is not a complete JSON document.
                    continue
                if not isinstance(data, dict):
                    continue

                status = data.get("status", "")
                if status != last_status:
                    logger.info("Ollama create: %s", status)
                    last_status = status
                if status == "success":
                    saw_success = True

                if data.get("error"):
                    logger.error("Ollama create error: %s", data["error"])
                    return False

    except requests.exceptions.Timeout:
        logger.error("register_with_ollama: timed out after %ds", timeout_s)
        return False
    except requests.exceptions.RequestException as exc:
        logger.error("register_with_ollama: HTTP error: %s", exc)
        return False

    if not saw_success:
        # An empty or truncated stream is not proof that the model exists.
        logger.error(
            "register_with_ollama: model=%s stream ended without a success status "
            "(last status: %r)",
            model_name,
            last_status,
        )
        return False

    logger.info("register_with_ollama: model=%s registered successfully", model_name)
    return True
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 5 — Evaluate deployed model
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def evaluate_model(
    model_name: str,
    task_type: str,
    gateway_url: str,
    n_samples: int = 20,
    timeout_s: int = 60,
) -> float:
    """
    Run evaluation prompts through the gateway using the newly deployed model.

    Prompts come from ``EVAL_PROMPTS[task_type]``; when that entry is missing
    or empty, the ``"general"`` prompts are used and a warning is logged.
    At most ``n_samples`` prompts are sent (a non-positive value selects none),
    with a short pause between consecutive calls.

    Args:
        model_name: Model name passed to the gateway call.
        task_type: Key into ``EVAL_PROMPTS``; also forwarded to the gateway call.
        gateway_url: Gateway URL forwarded to ``_call_gateway``.
        n_samples: Upper bound on the number of prompts evaluated.
        timeout_s: Timeout forwarded to ``_call_gateway`` for each prompt.

    Returns:
        The mean of the non-``None`` values returned by ``_call_gateway``,
        rounded to three decimals, or 0.0 when no value was obtained.
    """
    from .evaluator import EVAL_PROMPTS, _call_gateway

    prompts = EVAL_PROMPTS.get(task_type)
    if not prompts:
        logger.warning(
            "evaluate_model: no eval prompts for task_type=%s, using general", task_type
        )
        prompts = EVAL_PROMPTS.get("general", [])

    # Limit to n_samples; clamp so a negative value cannot slice from the end.
    selected_prompts = prompts[: max(n_samples, 0)]
    scores: list[float] = []

    for index, prompt in enumerate(selected_prompts):
        if index:
            time.sleep(0.3)  # avoid overwhelming the gateway between calls
        confidence = _call_gateway(
            gateway_url=gateway_url,
            model=model_name,
            prompt=prompt,
            task_type=task_type,
            timeout_s=timeout_s,
        )
        # A None result is treated as "no usable response" and is skipped.
        if confidence is not None:
            scores.append(confidence)

    if not scores:
        logger.warning(
            "evaluate_model: model=%s task=%s — no successful responses", model_name, task_type
        )
        return 0.0

    avg = round(sum(scores) / len(scores), 3)
    logger.info(
        "evaluate_model: model=%s task=%s avg_confidence=%.3f (n=%d)",
        model_name,
        task_type,
        avg,
        len(scores),
    )
    return avg
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Convenience: full pipeline
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def run_conversion_and_registration(
    base_model_path: str,
    adapter_path: str,
    task_type: Optional[str],
    output_base_dir: str,
    ollama_url: str,
    gateway_url: str,
    quantization: str = "Q5_K_M",
    min_confidence_to_deploy: float = 0.0,
    convert_script: str = "/opt/homebrew/lib/python3.12/site-packages/llama_cpp/convert_hf_to_gguf.py",
    quantize_binary: str = "/opt/homebrew/bin/llama-quantize",
) -> dict:
    """
    End-to-end: merge → GGUF → Ollama registration → evaluation.

    The model is named ``llm-gateway-<task>-ft`` (``<task>`` defaults to
    ``general``). The merged model is written to ``<output_base_dir>/merged``
    and the GGUF to ``<output_base_dir>/<model name>.gguf``.

    Args:
        base_model_path: Base model handed to the merge step.
        adapter_path: LoRA adapter handed to the merge step.
        task_type: Task label; ``None``/empty means ``"general"``.
        output_base_dir: Directory receiving the merged model and GGUF file.
        ollama_url: Ollama server base URL used for registration.
        gateway_url: Gateway URL used for evaluation.
        quantization: Quantization type for the GGUF conversion.
        min_confidence_to_deploy: Minimum average evaluation confidence
            required for the run to count as successful. With the default of
            0.0 the gate never rejects a non-negative score.
        convert_script: Path to convert_hf_to_gguf.py.
        quantize_binary: Path to the llama-quantize executable.

    Returns:
        A dict with keys ``success``, ``model_name``, ``confidence`` and
        ``error``. ``success`` is True only when every step completed and the
        confidence gate passed; otherwise ``error`` describes the failure.
        Exceptions from any step are caught, logged and reported via ``error``.

    Note:
        The confidence gate runs after registration, so a model that fails
        the gate has already been registered with Ollama; this function does
        not remove it.
    """
    task_label = task_type or "general"
    model_name = f"llm-gateway-{task_label}-ft"

    base = Path(output_base_dir)
    merged_dir = str(base / "merged")
    gguf_path = str(base / f"{model_name}.gguf")

    result: dict = {
        "success": False,
        "model_name": model_name,
        "confidence": 0.0,
        "error": None,
    }

    try:
        logger.info("Pipeline step 1/4: merging LoRA adapter")
        merge_lora_and_save(base_model_path, adapter_path, merged_dir)

        logger.info("Pipeline step 2/4: converting to GGUF (%s)", quantization)
        ok = convert_to_gguf(
            merged_dir,
            gguf_path,
            quantization=quantization,
            convert_script=convert_script,
            quantize_binary=quantize_binary,
        )
        if not ok:
            result["error"] = "GGUF conversion failed"
            return result

        logger.info("Pipeline step 3/4: registering with Ollama")
        modelfile = create_ollama_modelfile(gguf_path, model_name, task_type)
        registered = register_with_ollama(modelfile, model_name, ollama_url)
        if not registered:
            result["error"] = "Ollama registration failed"
            return result

        logger.info("Pipeline step 4/4: evaluating deployed model")
        confidence = evaluate_model(model_name, task_label, gateway_url)
        # Record the score even when the gate below rejects the model.
        result["confidence"] = confidence

        if confidence < min_confidence_to_deploy:
            result["error"] = (
                f"Evaluation confidence {confidence:.3f} is below the required "
                f"minimum {min_confidence_to_deploy:.3f}"
            )
            logger.warning(
                "Conversion pipeline rejected: model=%s confidence=%.3f < minimum=%.3f",
                model_name,
                confidence,
                min_confidence_to_deploy,
            )
            return result

        result["success"] = True

        logger.info(
            "Conversion pipeline complete: model=%s confidence=%.3f",
            model_name,
            confidence,
        )

    except Exception as exc:
        # Top-level pipeline boundary: report the failure through the result
        # dict instead of raising, and keep the traceback in the log.
        logger.error("run_conversion_and_registration: unexpected error: %s", exc, exc_info=True)
        result["error"] = str(exc)

    return result
|