- ADR-0001: Multi-Agent Coworking Architecture with LLM Gateway Orchestrator - ADR-0002: Tier Assignment Strategy for Model Selection (cost-first escalation) - ADR-0003: Confidence Gate Thresholds & Learning Cycle Intervals (6h/12h/24h cycles) - ADR-0004: External Provider Fallback Chain Ordering (Cerebras → Groq → Mistral) - Enhanced client SDK: Offline Ollama fallback, health checks, exponential backoff retry - Integration tests: claude-code-integration.test.ts (14 test cases) - PHASE_2F_DEPLOYMENT.md: Pre-deployment checklist, automated deploy, rollback plan - Post-deployment verification procedures for health, client fallback, metrics
149 lines
5.2 KiB
Python
149 lines
5.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Merge fo-blog-v6 LoRA adapter (checkpoint-102) into base model,
|
|
then convert to GGUF Q4_K_M and register in Ollama.
|
|
|
|
Usage:
|
|
python3 scripts/merge_and_convert.py
|
|
"""
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# --- Project layout ---------------------------------------------------------
# All locations are derived from this script's own path so the tool does not
# depend on the current working directory.
_SCRIPTS_DIR = Path(__file__).parent
FINE_TUNER = _SCRIPTS_DIR.parent
REPO_ROOT = FINE_TUNER.parent.parent  # llm-gateway root

# --- Inputs and outputs of the fo-blog-v6 pipeline --------------------------
_ADAPTERS_ROOT = FINE_TUNER / "adapters"
_MODELS_ROOT = FINE_TUNER / "models"

ADAPTER_DIR = _ADAPTERS_ROOT / "fo-blog-v6" / "checkpoint-102"
GGUF_DIR = _MODELS_ROOT / "fo-blog-v6"
MERGED_DIR = GGUF_DIR / "merged"
GGUF_F16 = GGUF_DIR / "fo-blog-v6-f16.gguf"  # fp16 intermediate
GGUF_Q4 = GGUF_DIR / "fo-blog-v6.gguf"  # final quantized artifact

# --- Base model and external tooling ----------------------------------------
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
# NOTE(review): the converter path is pinned to Homebrew Cellar version 8680;
# presumably it needs updating whenever llama.cpp is upgraded - confirm.
CONVERT_SCRIPT = "/opt/homebrew/Cellar/llama.cpp/8680/bin/convert_hf_to_gguf.py"
QUANTIZE_BIN = "/opt/homebrew/bin/llama-quantize"
PYTHON_BIN = "/opt/homebrew/bin/python3.13"

# Ollama Modelfile; `{gguf_path}` is filled in with str.format() at
# registration time. Single-quote delimiters let the embedded triple
# double-quotes of the SYSTEM prompt appear unescaped.
MODELFILE_TEMPLATE = '''FROM {gguf_path}

SYSTEM """You are an expert technical writer specializing in optical networking and transceiver technology.
Write clear, authoritative blog posts for network engineers and IT professionals.
Focus on practical insights, real-world use cases, and technical accuracy.
Use precise terminology. Keep paragraphs concise (3-5 sentences).
Structure posts with: intro hook, technical body, practical takeaways."""

PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER top_k 40
PARAMETER repeat_penalty 1.1
PARAMETER num_predict 2048
'''
|
|
|
|
def merge_adapter():
    """Merge the LoRA adapter into the base model and save it to MERGED_DIR.

    The step is skipped when MERGED_DIR already holds more than 10 GB of
    safetensors weights. A smaller amount is treated as an interrupted
    merge: the stale weight files are deleted and the merge is redone.

    Sizes are summed over every ``*.safetensors`` file rather than read from
    a single ``model.safetensors``, because ``save_pretrained()`` can split a
    large checkpoint into several shard files, in which case no file with
    that exact name exists.

    Raises:
        FileNotFoundError: if ADAPTER_DIR does not exist when a merge is
            actually required.
    """
    print("\n[1/3] Merging LoRA adapter into base model...")

    MERGED_DIR.mkdir(parents=True, exist_ok=True)

    # Check if already merged (single-file or sharded layout).
    existing = sorted(MERGED_DIR.glob("*.safetensors"))
    if existing:
        size = sum(weights.stat().st_size for weights in existing)
        if size > 10_000_000_000:  # > 10GB = complete
            print(f" Already merged ({size/1e9:.1f} GB) — skipping merge step")
            return
        print(f" Found incomplete merge ({size/1e9:.2f} GB) — redoing")
        for stale in existing:
            stale.unlink()
        # A leftover shard index would point at files that no longer exist.
        (MERGED_DIR / "model.safetensors.index.json").unlink(missing_ok=True)

    # Fail before loading the multi-GB base model if the adapter is missing.
    if not ADAPTER_DIR.is_dir():
        raise FileNotFoundError(f"LoRA adapter directory not found: {ADAPTER_DIR}")

    # Heavy imports are deferred so the skip path above stays fast and does
    # not require the ML stack to be importable.
    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer

    print(f" Loading base: {BASE_MODEL}")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float16,
        device_map="cpu",  # CPU for merge — avoids MPS memory pressure on save
        trust_remote_code=True,
    )

    print(f" Loading adapter: {ADAPTER_DIR}")
    model = PeftModel.from_pretrained(model, str(ADAPTER_DIR))

    print(" Merging and unloading...")
    model = model.merge_and_unload()

    print(f" Saving merged model to: {MERGED_DIR}")
    model.save_pretrained(str(MERGED_DIR), safe_serialization=True)
    tokenizer.save_pretrained(str(MERGED_DIR))

    saved_bytes = sum(
        weights.stat().st_size for weights in MERGED_DIR.glob("*.safetensors")
    )
    print(f" Done — {saved_bytes / 1e9:.2f} GB")
|
|
|
|
|
|
def _run_producing(cmd, output):
    """Run *cmd*; delete *output* if the command fails or is interrupted.

    Prevents a truncated artifact from being mistaken for a finished one by
    the ``exists()`` checks on the next run. The original exception is
    re-raised unchanged.
    """
    try:
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, KeyboardInterrupt):
        output.unlink(missing_ok=True)
        raise


def convert_gguf():
    """Convert the merged model to a Q4_K_M-quantized GGUF file.

    The merged model in MERGED_DIR is first converted to an fp16 GGUF with
    CONVERT_SCRIPT, then quantized with QUANTIZE_BIN. The fp16 file is an
    intermediate and is removed after a successful quantization.

    The quantized file is checked first: if it already exists nothing is
    done, so re-running the script does not regenerate the (already deleted)
    fp16 intermediate.

    Returns:
        Path of the quantized GGUF file (GGUF_Q4).

    Raises:
        subprocess.CalledProcessError: if the converter or quantizer exits
            with a non-zero status.
    """
    print("\n[2/3] Converting to GGUF...")

    if GGUF_Q4.exists():
        print(f" Q4_K_M GGUF exists ({GGUF_Q4.stat().st_size / 1e9:.2f} GB) — skipping")
        return GGUF_Q4

    GGUF_DIR.mkdir(parents=True, exist_ok=True)

    # fp16 first
    if not GGUF_F16.exists():
        print(" Converting to fp16 GGUF...")
        _run_producing(
            [
                PYTHON_BIN, CONVERT_SCRIPT,
                str(MERGED_DIR),
                "--outfile", str(GGUF_F16),
                "--outtype", "f16",
            ],
            GGUF_F16,
        )
        print(f" fp16 GGUF: {GGUF_F16.stat().st_size / 1e9:.2f} GB")
    else:
        print(f" fp16 GGUF exists ({GGUF_F16.stat().st_size / 1e9:.2f} GB) — skipping")

    # Quantize to Q4_K_M
    print(" Quantizing to Q4_K_M...")
    _run_producing(
        [QUANTIZE_BIN, str(GGUF_F16), str(GGUF_Q4), "Q4_K_M"],
        GGUF_Q4,
    )
    print(f" Q4_K_M GGUF: {GGUF_Q4.stat().st_size / 1e9:.2f} GB")
    GGUF_F16.unlink(missing_ok=True)
    print(" Removed fp16 intermediate")

    return GGUF_Q4
|
|
|
|
|
|
def register_ollama(gguf_path: Path):
    """Write a Modelfile next to *gguf_path* and register it in Ollama.

    The Modelfile is rendered from MODELFILE_TEMPLATE with the absolute GGUF
    path, then ``ollama create fo-blog-v6`` is run. Registration is verified
    afterwards with ``ollama list``; the verification is best-effort and only
    prints a warning when the model cannot be confirmed.

    Args:
        gguf_path: Path to the quantized GGUF file to register.

    Raises:
        subprocess.CalledProcessError: if ``ollama create`` fails.
    """
    print("\n[3/3] Registering in Ollama...")

    modelfile_path = gguf_path.parent / "Modelfile"
    modelfile_path.write_text(
        MODELFILE_TEMPLATE.format(gguf_path=gguf_path.resolve()),
        encoding="utf-8",
    )
    print(f" Modelfile: {modelfile_path}")

    print(" Running: ollama create fo-blog-v6 ...")
    subprocess.run(["ollama", "create", "fo-blog-v6", "-f", str(modelfile_path)], check=True)

    print(" Verifying...")
    result = subprocess.run(["ollama", "list"], capture_output=True, text=True)

    # Compare against the first column with any ":tag" suffix removed, so a
    # similarly named model (e.g. "fo-blog-v6-old") is not a false positive.
    registered = {
        row.split()[0].split(":", 1)[0]
        for row in result.stdout.splitlines()
        if row.strip()
    }

    if "fo-blog-v6" in registered:
        print(" ✓ fo-blog-v6 registered in Ollama")
    else:
        print(" ✗ fo-blog-v6 NOT found in ollama list — check manually")
        print(result.stdout)
        # Surface the real cause when `ollama list` itself failed.
        if result.returncode != 0 and result.stderr:
            print(result.stderr)
|
|
|
|
|
|
def main() -> int:
    """Run the merge, GGUF conversion and Ollama registration in order.

    Returns:
        0 when every step succeeded, 1 when an external command failed or a
        required file/executable was missing. The failing tool has already
        printed its own diagnostics, so a short message is shown instead of
        a traceback.
    """
    print("=" * 60)
    print(" fo-blog-v6 Merge + GGUF Conversion")
    print("=" * 60)

    try:
        merge_adapter()
        gguf_path = convert_gguf()
        register_ollama(gguf_path)
    except subprocess.CalledProcessError as err:
        print(
            f"\nERROR: command failed with exit code {err.returncode}: {err.cmd}",
            file=sys.stderr,
        )
        return 1
    except FileNotFoundError as err:
        print(f"\nERROR: required file or executable not found: {err}", file=sys.stderr)
        return 1

    print("\n" + "=" * 60)
    print(" DONE!")
    print(f" GGUF: {gguf_path}")
    print(" Next: update Erik ecosystem.config.js")
    print(" OLLAMA_LLM_MODEL=fo-blog-v6, BLOG_LLM_PROVIDER=ollama")
    print("=" * 60)
    return 0


if __name__ == "__main__":
    sys.exit(main())
|