Rene Fichtmueller 2ca77d0aee feat: Phase 2F — Multi-Agent Integration (ADRs + Client Fallback + Tests)
- ADR-0001: Multi-Agent Coworking Architecture with LLM Gateway Orchestrator
- ADR-0002: Tier Assignment Strategy for Model Selection (cost-first escalation)
- ADR-0003: Confidence Gate Thresholds & Learning Cycle Intervals (6h/12h/24h cycles)
- ADR-0004: External Provider Fallback Chain Ordering (Cerebras → Groq → Mistral)
- Enhanced client SDK: Offline Ollama fallback, health checks, exponential backoff retry
- Integration tests: claude-code-integration.test.ts (14 test cases)
- PHASE_2F_DEPLOYMENT.md: Pre-deployment checklist, automated deploy, rollback plan
- Post-deployment verification procedures for health, client fallback, metrics
2026-04-19 21:39:44 +02:00

459 lines
14 KiB
Python

"""
converter.py - Convert fine-tuned LoRA adapter to GGUF and register with Ollama.
Pipeline:
1. Merge LoRA adapter weights into the base model.
2. Save the merged full-precision HuggingFace model.
3. Convert to GGUF via llama.cpp convert_hf_to_gguf.py.
4. Quantize with llama-quantize (Q5_K_M by default).
5. Create an Ollama Modelfile.
6. Register the model with Ollama via POST /api/create.
7. Run a lightweight evaluation to confirm the model is responsive.
All subprocess calls use a fixed argument list — no shell=True, no
string interpolation of user-controlled data into shell commands.
"""
from __future__ import annotations
import json
import logging
import subprocess
import time
from pathlib import Path
from typing import Optional
import requests
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Step 1 — Merge LoRA adapter into base model
# ---------------------------------------------------------------------------
def merge_lora_and_save(
    base_model_path: str,
    adapter_path: str,
    output_path: str,
) -> None:
    """
    Fold a LoRA adapter into its base model and write the merged checkpoint.

    The tokenizer is read from ``adapter_path`` and the base weights from
    ``base_model_path``. The weights are requested as ``torch.float16``.
    The merged model and the tokenizer are written to ``output_path``
    (created if missing) with safetensors serialization, in the standard
    HuggingFace layout that the GGUF conversion step consumes.

    Args:
        base_model_path: Location of the base model, passed to
            ``AutoModelForCausalLM.from_pretrained``.
        adapter_path: Location of the LoRA adapter; the tokenizer is also
            loaded from here.
        output_path: Directory that receives the merged model and tokenizer.
    """
    # The ML stack is imported at call time, so it is only loaded when a
    # merge is actually requested.
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer
    import torch

    logger.info(
        "merge_lora_and_save: base=%s adapter=%s → output=%s",
        base_model_path,
        adapter_path,
        output_path,
    )

    half_precision = torch.float16

    # NOTE(review): trust_remote_code=True executes Python shipped with the
    # checkpoint — only point this at model sources you trust.
    tok = AutoTokenizer.from_pretrained(adapter_path, trust_remote_code=True)
    backbone = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=half_precision,
        trust_remote_code=True,
    )
    peft_model = PeftModel.from_pretrained(
        backbone,
        adapter_path,
        torch_dtype=half_precision,
    )

    logger.info("Merging LoRA weights into base model...")
    merged_model = peft_model.merge_and_unload()

    target_dir = Path(output_path)
    target_dir.mkdir(parents=True, exist_ok=True)
    destination = str(target_dir)
    merged_model.save_pretrained(destination, safe_serialization=True)
    tok.save_pretrained(destination)
    logger.info("Merged model saved to %s", target_dir)
# ---------------------------------------------------------------------------
# Step 2 — Convert HuggingFace model to GGUF
# ---------------------------------------------------------------------------
def _locate_convert_script(convert_script: str) -> Optional[Path]:
    """Return the path of convert_hf_to_gguf.py, or None (after logging) if absent."""
    candidate = Path(convert_script)
    if candidate.exists():
        return candidate

    # Fall back to a copy sitting next to the importable llama_cpp package.
    import importlib.util

    spec = importlib.util.find_spec("llama_cpp")
    if not (spec and spec.origin):
        logger.error(
            "convert_to_gguf: convert_hf_to_gguf.py not found at %s", convert_script
        )
        return None

    alt_script = Path(spec.origin).parent / "convert_hf_to_gguf.py"
    if alt_script.exists():
        return alt_script

    logger.error(
        "convert_to_gguf: convert_hf_to_gguf.py not found at %s or %s",
        convert_script,
        alt_script,
    )
    return None


def _run_gguf_tool(cmd: list[str], step: str, timeout_s: int = 3600) -> bool:
    """Run one external tool with an argument list; True only on exit status 0."""
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout_s,  # large models take a long time
            check=False,
        )
    except subprocess.TimeoutExpired:
        logger.error("convert_to_gguf: %s timed out after %ds", step, timeout_s)
        return False
    except OSError as exc:
        # Covers a missing executable as well as permission / exec-format errors.
        logger.error("convert_to_gguf: could not execute %s: %s", cmd[0], exc)
        return False

    if proc.returncode != 0:
        # Only the tail of each stream is logged to keep the log readable.
        logger.error(
            "convert_to_gguf: %s failed (rc=%d):\n%s\n%s",
            step,
            proc.returncode,
            proc.stdout[-2000:],
            proc.stderr[-2000:],
        )
        return False
    return True


def convert_to_gguf(
    model_path: str,
    output_gguf_path: str,
    quantization: str = "Q5_K_M",
    convert_script: str = "/opt/homebrew/lib/python3.12/site-packages/llama_cpp/convert_hf_to_gguf.py",
    quantize_binary: str = "/opt/homebrew/bin/llama-quantize",
) -> bool:
    """
    Convert a HuggingFace model directory to a (quantized) GGUF file.

    Steps:
    1. Run convert_hf_to_gguf.py with ``--outtype f16`` to produce an
       intermediate ``<output stem>_f16.gguf`` next to the output file.
    2. Run llama-quantize on that file to produce ``output_gguf_path``.
       If the quantize binary does not exist, the f16 file is moved to
       ``output_gguf_path`` instead and the call still succeeds.

    Args:
        model_path: Directory containing the HuggingFace model.
        output_gguf_path: Destination GGUF file; its parent directory is
            created if needed.
        quantization: Quantization type passed to llama-quantize.
        convert_script: Path to convert_hf_to_gguf.py. When it does not
            exist, a copy beside the installed ``llama_cpp`` package is tried.
        quantize_binary: Path to the llama-quantize executable.

    Returns:
        True on success, False on any failure. Failures are logged, and
        filesystem and process-launch errors are reported as False rather
        than raised. The intermediate f16 file is removed only after a
        successful quantization.

    All subprocess calls use explicit argument lists (no shell=True).
    """
    model_dir = Path(model_path)
    final_gguf = Path(output_gguf_path)
    try:
        final_gguf.parent.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        logger.error(
            "convert_to_gguf: cannot create output directory %s: %s",
            final_gguf.parent,
            exc,
        )
        return False

    # Intermediate unquantized GGUF
    f16_gguf = str(final_gguf.with_suffix("")) + "_f16.gguf"

    # --- Conversion step ---
    script_path = _locate_convert_script(convert_script)
    if script_path is None:
        return False

    cmd_convert = [
        "python3",
        str(script_path),
        str(model_dir),
        "--outfile",
        f16_gguf,
        "--outtype",
        "f16",
    ]
    logger.info("convert_to_gguf: running conversion: %s", " ".join(cmd_convert))
    if not _run_gguf_tool(cmd_convert, "conversion"):
        return False
    logger.info("convert_to_gguf: f16 GGUF written to %s", f16_gguf)

    # --- Quantization step ---
    quantize_bin = Path(quantize_binary)
    if not quantize_bin.exists():
        logger.warning(
            "convert_to_gguf: llama-quantize not found at %s, skipping quantization",
            quantize_binary,
        )
        # Use the f16 file as the output. The move fails if the converter
        # exited 0 without writing its output, which is reported as False.
        try:
            Path(f16_gguf).replace(output_gguf_path)
        except OSError as exc:
            logger.error(
                "convert_to_gguf: could not move %s to %s: %s",
                f16_gguf,
                output_gguf_path,
                exc,
            )
            return False
        return True

    cmd_quantize = [
        str(quantize_bin),
        f16_gguf,
        output_gguf_path,
        quantization,
    ]
    logger.info("convert_to_gguf: quantizing: %s", " ".join(cmd_quantize))
    if not _run_gguf_tool(cmd_quantize, "quantization"):
        return False
    logger.info("convert_to_gguf: quantized GGUF written to %s", output_gguf_path)

    # Clean up the intermediate f16 file; a leftover file is not fatal.
    try:
        Path(f16_gguf).unlink(missing_ok=True)
    except OSError as exc:
        logger.warning(
            "convert_to_gguf: could not remove intermediate file %s: %s",
            f16_gguf,
            exc,
        )
    return True
# ---------------------------------------------------------------------------
# Step 3 — Create Ollama Modelfile
# ---------------------------------------------------------------------------
def create_ollama_modelfile(
    gguf_path: str,
    base_name: str,
    task_type: Optional[str],
    temperature: float = 0.3,
    num_ctx: int = 8192,
) -> str:
    """
    Generate Ollama Modelfile content for the fine-tuned model.

    The ``FROM`` line always carries an absolute GGUF path so Ollama can
    locate the weights regardless of working directory: an absolute
    ``gguf_path`` is used verbatim, a relative one is anchored at the
    current working directory.

    Args:
        gguf_path: Path to the GGUF weights file.
        base_name: Accepted for interface compatibility; not used in the
            generated content.
        task_type: Task label inserted into the system prompt; ``None`` or
            an empty string falls back to ``"general"``.
        temperature: Value for ``PARAMETER temperature``.
        num_ctx: Value for ``PARAMETER num_ctx``.

    Returns:
        The Modelfile text: FROM, PARAMETER lines, a ChatML-style TEMPLATE
        using ``<|im_start|>`` / ``<|im_end|>`` markers, and a SYSTEM prompt.
    """
    task_label = task_type or "general"

    # Keep an already-absolute path byte-for-byte; only relative paths are
    # rewritten (without resolving symlinks).
    weights = Path(gguf_path)
    from_path = gguf_path if weights.is_absolute() else str(weights.absolute())

    system_prompt = (
        f"You are a fine-tuned assistant specialised in {task_label} tasks. "
        "Provide accurate, detailed, professional responses. "
        "Your outputs have been optimised through automated training on "
        "high-quality examples from the LLM Gateway learning corpus."
    )

    # Go-template body; kept as a plain (non-f) string so the braces are literal.
    template = (
        "{{- if .System}}<|im_start|>system\n{{.System}}<|im_end|>\n{{- end}}\n"
        "{{- range .Messages}}\n"
        "<|im_start|>{{.Role}}\n{{.Content}}<|im_end|>\n"
        "{{- end}}\n"
        "<|im_start|>assistant\n"
    )

    return (
        f"FROM {from_path}\n"
        f"PARAMETER temperature {temperature}\n"
        f"PARAMETER num_ctx {num_ctx}\n"
        'PARAMETER stop "<|im_end|>"\n'
        f'TEMPLATE """\n{template}"""\n'
        f'SYSTEM "{system_prompt}"\n'
    )
# ---------------------------------------------------------------------------
# Step 4 — Register with Ollama
# ---------------------------------------------------------------------------
def register_with_ollama(
    modelfile_content: str,
    model_name: str,
    ollama_url: str,
    timeout_s: int = 600,
) -> bool:
    """
    Register a model with Ollama via POST /api/create (streaming response).

    The response is consumed line by line as JSON objects. Each change of
    the ``status`` field is logged as progress.

    Args:
        modelfile_content: Modelfile text sent as the ``modelfile`` field.
        model_name: Name sent as the ``name`` field.
        ollama_url: Base URL of the Ollama server; a trailing slash is
            tolerated.
        timeout_s: Timeout handed to ``requests.post``.

    Returns:
        True only when the stream ends with ``status == "success"``.
        False when Ollama reports an ``error`` field, the stream ends
        without a success status, the request times out, or any other
        ``requests`` error occurs (including a non-2xx response).
    """
    url = f"{ollama_url.rstrip('/')}/api/create"
    payload = {"name": model_name, "modelfile": modelfile_content}
    logger.info("register_with_ollama: model=%s url=%s", model_name, url)

    last_status = ""
    try:
        with requests.post(
            url,
            json=payload,
            stream=True,
            timeout=timeout_s,
            headers={"Content-Type": "application/json"},
        ) as resp:
            resp.raise_for_status()
            for line in resp.iter_lines():
                if not line:
                    continue  # keep-alive blank line
                try:
                    data = json.loads(line)
                except ValueError:
                    # JSONDecodeError and UnicodeDecodeError are both
                    # ValueError; a malformed progress line is not fatal.
                    logger.debug("register_with_ollama: skipping unparsable line: %r", line)
                    continue
                if not isinstance(data, dict):
                    continue
                if data.get("error"):
                    logger.error("Ollama create error: %s", data["error"])
                    return False
                status = data.get("status", "")
                if status != last_status:
                    logger.info("Ollama create: %s", status)
                    last_status = status
    except requests.exceptions.Timeout:
        logger.error("register_with_ollama: timed out after %ds", timeout_s)
        return False
    except requests.exceptions.RequestException as exc:
        logger.error("register_with_ollama: HTTP error: %s", exc)
        return False

    # A stream that simply stops is not a confirmation: require the final
    # "success" status before declaring the model registered.
    if last_status != "success":
        logger.error(
            "register_with_ollama: stream ended without success (last status: %r)",
            last_status,
        )
        return False

    logger.info("register_with_ollama: model=%s registered successfully", model_name)
    return True
# ---------------------------------------------------------------------------
# Step 5 — Evaluate deployed model
# ---------------------------------------------------------------------------
def evaluate_model(
    model_name: str,
    task_type: str,
    gateway_url: str,
    n_samples: int = 20,
    timeout_s: int = 60,
) -> float:
    """
    Score the newly deployed model by sending evaluation prompts to the gateway.

    Prompts are looked up in ``EVAL_PROMPTS`` under ``task_type``, falling
    back to the ``"general"`` entry, and at most ``n_samples`` of them are
    sent through ``_call_gateway``. Calls that return ``None`` are ignored.

    Args:
        model_name: Model identifier forwarded to the gateway.
        task_type: Key used to select the prompt set; also forwarded.
        gateway_url: Gateway endpoint forwarded to ``_call_gateway``.
        n_samples: Upper bound on the number of prompts used.
        timeout_s: Per-call timeout forwarded to ``_call_gateway``.

    Returns:
        The mean of the returned confidence values rounded to three
        decimals, or 0.0 when no call produced a value.
    """
    from .evaluator import EVAL_PROMPTS, _call_gateway

    general_prompts = EVAL_PROMPTS.get("general", [])
    candidates = EVAL_PROMPTS.get(task_type, general_prompts)
    if not candidates:
        logger.warning(
            "evaluate_model: no eval prompts for task_type=%s, using general", task_type
        )
        candidates = general_prompts

    confidences: list[float] = []
    # Limit to n_samples
    for eval_prompt in candidates[:n_samples]:
        outcome = _call_gateway(
            gateway_url=gateway_url,
            model=model_name,
            prompt=eval_prompt,
            task_type=task_type,
            timeout_s=timeout_s,
        )
        if outcome is not None:
            confidences.append(outcome)
        time.sleep(0.3)  # avoid overwhelming the gateway

    if confidences:
        mean_confidence = round(sum(confidences) / len(confidences), 3)
        logger.info(
            "evaluate_model: model=%s task=%s avg_confidence=%.3f (n=%d)",
            model_name,
            task_type,
            mean_confidence,
            len(confidences),
        )
        return mean_confidence

    logger.warning(
        "evaluate_model: model=%s task=%s — no successful responses", model_name, task_type
    )
    return 0.0
# ---------------------------------------------------------------------------
# Convenience: full pipeline
# ---------------------------------------------------------------------------
def run_conversion_and_registration(
    base_model_path: str,
    adapter_path: str,
    task_type: Optional[str],
    output_base_dir: str,
    ollama_url: str,
    gateway_url: str,
    quantization: str = "Q5_K_M",
    min_confidence_to_deploy: float = 0.0,
    convert_script: str = "/opt/homebrew/lib/python3.12/site-packages/llama_cpp/convert_hf_to_gguf.py",
    quantize_binary: str = "/opt/homebrew/bin/llama-quantize",
) -> dict:
    """
    End-to-end: merge → GGUF → Ollama registration → evaluation.

    The model is named ``llm-gateway-<task>-ft`` (task defaults to
    ``"general"``). The merged model is written to ``<output_base_dir>/merged``
    and the GGUF file to ``<output_base_dir>/<model name>.gguf``.

    Args:
        base_model_path: Base model passed to the merge step.
        adapter_path: LoRA adapter passed to the merge step.
        task_type: Task label; ``None`` means ``"general"``.
        output_base_dir: Directory receiving the merged model and GGUF file.
        ollama_url: Base URL of the Ollama server used for registration.
        gateway_url: Gateway URL used for the evaluation step.
        quantization: Quantization type for the GGUF conversion.
        min_confidence_to_deploy: Minimum evaluation confidence required for
            the run to count as successful.
        convert_script: Path to convert_hf_to_gguf.py.
        quantize_binary: Path to the llama-quantize executable.

    Returns:
        A dict with keys ``success``, ``model_name``, ``confidence``, ``error``.
        ``success`` is True only if every step succeeded and the measured
        confidence is at least ``min_confidence_to_deploy``. Exceptions are
        caught, logged, and reported through ``error``.
    """
    task_label = task_type or "general"
    model_name = f"llm-gateway-{task_label}-ft"
    base = Path(output_base_dir)
    merged_dir = str(base / "merged")
    gguf_path = str(base / f"{model_name}.gguf")
    result: dict = {
        "success": False,
        "model_name": model_name,
        "confidence": 0.0,
        "error": None,
    }
    try:
        logger.info("Pipeline step 1/4: merging LoRA adapter")
        merge_lora_and_save(base_model_path, adapter_path, merged_dir)

        logger.info("Pipeline step 2/4: converting to GGUF (%s)", quantization)
        ok = convert_to_gguf(
            merged_dir,
            gguf_path,
            quantization=quantization,
            convert_script=convert_script,
            quantize_binary=quantize_binary,
        )
        if not ok:
            result["error"] = "GGUF conversion failed"
            return result

        logger.info("Pipeline step 3/4: registering with Ollama")
        modelfile = create_ollama_modelfile(gguf_path, model_name, task_type)
        registered = register_with_ollama(modelfile, model_name, ollama_url)
        if not registered:
            result["error"] = "Ollama registration failed"
            return result

        logger.info("Pipeline step 4/4: evaluating deployed model")
        confidence = evaluate_model(model_name, task_label, gateway_url)
        result["confidence"] = confidence

        # Enforce the confidence gate. The evaluation needs the model to be
        # registered first, so a rejected model is still present in Ollama;
        # this function does not remove it.
        if confidence < min_confidence_to_deploy:
            logger.warning(
                "run_conversion_and_registration: model=%s confidence=%.3f "
                "is below min_confidence_to_deploy=%.3f",
                model_name,
                confidence,
                min_confidence_to_deploy,
            )
            result["error"] = (
                f"confidence {confidence:.3f} below required minimum "
                f"{min_confidence_to_deploy:.3f}"
            )
            return result

        result["success"] = True
        logger.info(
            "Conversion pipeline complete: model=%s confidence=%.3f",
            model_name,
            confidence,
        )
    except Exception as exc:
        # Top-level pipeline boundary: report the failure through the result
        # dict instead of propagating, and keep the traceback in the log.
        logger.error("run_conversion_and_registration: unexpected error: %s", exc, exc_info=True)
        result["error"] = str(exc)
    return result