llm-gateway/packages/fine-tuner/scripts/merge_and_convert.py
Rene Fichtmueller 2ca77d0aee feat: Phase 2F — Multi-Agent Integration (ADRs + Client Fallback + Tests)
- ADR-0001: Multi-Agent Coworking Architecture with LLM Gateway Orchestrator
- ADR-0002: Tier Assignment Strategy for Model Selection (cost-first escalation)
- ADR-0003: Confidence Gate Thresholds & Learning Cycle Intervals (6h/12h/24h cycles)
- ADR-0004: External Provider Fallback Chain Ordering (Cerebras → Groq → Mistral)
- Enhanced client SDK: Offline Ollama fallback, health checks, exponential backoff retry
- Integration tests: claude-code-integration.test.ts (14 test cases)
- PHASE_2F_DEPLOYMENT.md: Pre-deployment checklist, automated deploy, rollback plan
- Post-deployment verification procedures for health, client fallback, metrics
2026-04-19 21:39:44 +02:00

149 lines
5.2 KiB
Python

#!/usr/bin/env python3
"""
Merge fo-blog-v6 LoRA adapter (checkpoint-102) into base model,
then convert to GGUF Q4_K_M and register in Ollama.
Usage:
python3 scripts/merge_and_convert.py
"""
import subprocess
import sys
from pathlib import Path
# --- Directory layout, resolved relative to this script's location ----------
FINE_TUNER = Path(__file__).parent.parent  # two levels above this script
REPO_ROOT = FINE_TUNER.parent.parent  # llm-gateway root

# --- Inputs -----------------------------------------------------------------
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
ADAPTER_DIR = FINE_TUNER.joinpath("adapters", "fo-blog-v6", "checkpoint-102")

# --- Outputs ----------------------------------------------------------------
GGUF_DIR = FINE_TUNER.joinpath("models", "fo-blog-v6")
MERGED_DIR = GGUF_DIR / "merged"
GGUF_F16 = GGUF_DIR / "fo-blog-v6-f16.gguf"  # intermediate, removed after quantizing
GGUF_Q4 = GGUF_DIR / "fo-blog-v6.gguf"  # final quantized artifact

# --- External tools ---------------------------------------------------------
# NOTE(review): these are absolute Homebrew paths; the Cellar path embeds a
# llama.cpp version directory and presumably needs updating after an upgrade.
CONVERT_SCRIPT = "/opt/homebrew/Cellar/llama.cpp/8680/bin/convert_hf_to_gguf.py"
QUANTIZE_BIN = "/opt/homebrew/bin/llama-quantize"
PYTHON_BIN = "/opt/homebrew/bin/python3.13"

# Ollama Modelfile; ``{gguf_path}`` is filled in by ``str.format`` at
# registration time. Single-quoted delimiters let the SYSTEM block keep its
# own triple double quotes unescaped.
MODELFILE_TEMPLATE = '''FROM {gguf_path}
SYSTEM """You are an expert technical writer specializing in optical networking and transceiver technology.
Write clear, authoritative blog posts for network engineers and IT professionals.
Focus on practical insights, real-world use cases, and technical accuracy.
Use precise terminology. Keep paragraphs concise (3-5 sentences).
Structure posts with: intro hook, technical body, practical takeaways."""
PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER top_k 40
PARAMETER repeat_penalty 1.1
PARAMETER num_predict 2048
'''
def _merge_is_complete(weight_files, total_bytes):
    """Report whether MERGED_DIR already holds a finished merge.

    Args:
        weight_files: The ``*.safetensors`` paths currently in MERGED_DIR.
        total_bytes: Their combined size in bytes.

    Returns:
        True when the weights exceed the 10 GB size floor and are either a
        single ``model.safetensors`` or a shard set that is fully listed in
        ``model.safetensors.index.json``.
    """
    import json  # local import, like the other lazy imports in this script

    if not weight_files or total_bytes <= 10_000_000_000:  # > 10GB = complete
        return False
    if (MERGED_DIR / "model.safetensors").exists():
        return True

    # Sharded layout: the index maps tensor names to shard file names.
    # NOTE(review): this relies on transformers writing the index only after
    # every shard is on disk — confirm for the installed version.
    index_path = MERGED_DIR / "model.safetensors.index.json"
    try:
        index = json.loads(index_path.read_text(encoding="utf-8"))
        shard_names = set(index["weight_map"].values())
    except (OSError, ValueError, KeyError, TypeError, AttributeError):
        return False
    return bool(shard_names) and shard_names <= {f.name for f in weight_files}


def merge_adapter():
    """Merge the LoRA adapter into the base model and save it to MERGED_DIR.

    The step is skipped when MERGED_DIR already contains a complete merge.
    Both output layouts of ``save_pretrained`` are recognised: one
    ``model.safetensors`` file, or several shards plus an index file.
    Incomplete leftovers are deleted and the merge is redone.

    Raises:
        FileNotFoundError: If ADAPTER_DIR does not exist (checked before the
            base model is loaded, so the failure is immediate).
    """
    print("\n[1/3] Merging LoRA adapter into base model...")

    MERGED_DIR.mkdir(parents=True, exist_ok=True)

    # Check if already merged
    weight_files = sorted(MERGED_DIR.glob("*.safetensors"))
    if weight_files:
        size = sum(f.stat().st_size for f in weight_files)
        if _merge_is_complete(weight_files, size):
            print(f" Already merged ({size/1e9:.1f} GB) — skipping merge step")
            return
        print(f" Found incomplete merge ({size/1e9:.2f} GB) — redoing")
        for stale in weight_files:
            stale.unlink()
        # Drop a stale index too so it cannot vouch for the new, unfinished shards.
        (MERGED_DIR / "model.safetensors.index.json").unlink(missing_ok=True)

    # Fail fast: without this, a missing adapter is only noticed after the
    # base model has been loaded.
    if not ADAPTER_DIR.is_dir():
        raise FileNotFoundError(f"LoRA adapter directory not found: {ADAPTER_DIR}")

    # Heavy third-party imports are deferred until a merge is actually needed.
    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer

    print(f" Loading base: {BASE_MODEL}")
    # NOTE(review): trust_remote_code=True allows code shipped with the model
    # repo to run locally; kept as in the original — confirm it is required.
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float16,
        device_map="cpu",  # CPU for merge — avoids MPS memory pressure on save
        trust_remote_code=True,
    )

    print(f" Loading adapter: {ADAPTER_DIR}")
    model = PeftModel.from_pretrained(model, str(ADAPTER_DIR))

    print(" Merging and unloading...")
    model = model.merge_and_unload()

    print(f" Saving merged model to: {MERGED_DIR}")
    model.save_pretrained(str(MERGED_DIR), safe_serialization=True)
    tokenizer.save_pretrained(str(MERGED_DIR))

    # Sum over all weight files: the checkpoint may have been sharded, in
    # which case a plain model.safetensors does not exist.
    merged_bytes = sum(f.stat().st_size for f in MERGED_DIR.glob("*.safetensors"))
    print(f" Done — {merged_bytes / 1e9:.2f} GB")
def _scratch_path(final_path):
    """Return a sibling path (same suffix) to write to before renaming to *final_path*."""
    return final_path.with_name(f"{final_path.stem}.partial{final_path.suffix}")


def convert_gguf() -> Path:
    """Convert the merged model to a Q4_K_M-quantized GGUF file.

    The merged model is first converted to an fp16 GGUF, which is then
    quantized to Q4_K_M; the fp16 intermediate is removed afterwards. Each
    output is written to a scratch file and renamed only after its tool
    exits successfully, so an interrupted run cannot leave a truncated file
    that a later run would mistake for a finished one.

    Returns:
        GGUF_Q4, the path of the quantized model.

    Raises:
        subprocess.CalledProcessError: If the convert script or the quantize
            binary exits with a non-zero status.
    """
    print("\n[2/3] Converting to GGUF...")

    # Check the final artifact first. A finished run deletes the fp16 file,
    # so testing for fp16 first would regenerate it on every re-run.
    if GGUF_Q4.exists():
        print(f" Q4_K_M GGUF exists ({GGUF_Q4.stat().st_size / 1e9:.2f} GB) — skipping")
        return GGUF_Q4

    # fp16 first
    if not GGUF_F16.exists():
        print(" Converting to fp16 GGUF...")
        f16_scratch = _scratch_path(GGUF_F16)
        subprocess.run([
            PYTHON_BIN, CONVERT_SCRIPT,
            str(MERGED_DIR),
            "--outfile", str(f16_scratch),
            "--outtype", "f16",
        ], check=True)
        f16_scratch.replace(GGUF_F16)  # publish only a fully written file
        print(f" fp16 GGUF: {GGUF_F16.stat().st_size / 1e9:.2f} GB")
    else:
        print(f" fp16 GGUF exists ({GGUF_F16.stat().st_size / 1e9:.2f} GB) — skipping")

    # Quantize to Q4_K_M
    print(" Quantizing to Q4_K_M...")
    q4_scratch = _scratch_path(GGUF_Q4)
    subprocess.run([
        QUANTIZE_BIN, str(GGUF_F16), str(q4_scratch), "Q4_K_M",
    ], check=True)
    q4_scratch.replace(GGUF_Q4)
    print(f" Q4_K_M GGUF: {GGUF_Q4.stat().st_size / 1e9:.2f} GB")

    GGUF_F16.unlink(missing_ok=True)
    print(" Removed fp16 intermediate")

    return GGUF_Q4
def register_ollama(gguf_path: Path):
    """Write a Modelfile next to *gguf_path* and register it in Ollama.

    The model is created under the name ``fo-blog-v6`` and then looked up in
    the output of ``ollama list``. A failed lookup is reported on stdout but
    does not raise, so the caller's summary is still printed.

    Args:
        gguf_path: Path to the GGUF file the Modelfile's FROM line points at.

    Raises:
        subprocess.CalledProcessError: If ``ollama create`` exits non-zero.
    """
    print("\n[3/3] Registering in Ollama...")

    modelfile_path = gguf_path.parent / "Modelfile"
    # Explicit encoding: the resolved path may contain non-ASCII characters
    # and the platform default encoding is not guaranteed to be UTF-8.
    modelfile_path.write_text(
        MODELFILE_TEMPLATE.format(gguf_path=gguf_path.resolve()),
        encoding="utf-8",
    )
    print(f" Modelfile: {modelfile_path}")

    print(" Running: ollama create fo-blog-v6 ...")
    subprocess.run(["ollama", "create", "fo-blog-v6", "-f", str(modelfile_path)], check=True)

    print(" Verifying...")
    result = subprocess.run(["ollama", "list"], capture_output=True, text=True)

    # Compare against the first column (NAME[:TAG]) of each row instead of a
    # substring search over the whole output, which would also accept names
    # such as "fo-blog-v60".
    listed_names = {
        row.split()[0].split(":", 1)[0]
        for row in result.stdout.splitlines()
        if row.strip()
    }
    if result.returncode == 0 and "fo-blog-v6" in listed_names:
        print(" ✓ fo-blog-v6 registered in Ollama")
    else:
        print(" ✗ fo-blog-v6 NOT found in ollama list — check manually")
        print(result.stdout)
        if result.stderr:
            # Show why `ollama list` itself failed, if it did.
            print(result.stderr)
def main():
    """Run merge, GGUF conversion and Ollama registration, then print a summary."""
    rule = "=" * 60

    # Opening banner
    print(rule)
    print(" fo-blog-v6 Merge + GGUF Conversion")
    print(rule)

    # The three pipeline stages, in order
    merge_adapter()
    quantized_model = convert_gguf()
    register_ollama(quantized_model)

    # Closing summary with follow-up instructions
    print("\n" + rule)
    print(" DONE!")
    print(f" GGUF: {quantized_model}")
    print(" Next: update Erik ecosystem.config.js")
    print(" OLLAMA_LLM_MODEL=fo-blog-v6, BLOG_LLM_PROVIDER=ollama")
    print(rule)


if __name__ == "__main__":
    main()