#!/usr/bin/env python3
"""
Merge the fo-blog-v6 LoRA adapter (checkpoint-102) into its base model,
convert the merged model to a Q4_K_M GGUF file, and register it in Ollama.

Every stage is resumable: a stage whose output already exists is skipped,
and GGUF outputs are written to a ``.partial`` file that is renamed only on
success, so an interrupted run never leaves a truncated file that a later
run would mistake for a finished one.

Usage:
    python3 scripts/merge_and_convert.py
"""

import subprocess
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).parent.parent.parent.parent  # llm-gateway root
FINE_TUNER = Path(__file__).parent.parent

MODEL_NAME = "fo-blog-v6"

ADAPTER_DIR = FINE_TUNER / "adapters" / MODEL_NAME / "checkpoint-102"
MERGED_DIR = FINE_TUNER / "models" / MODEL_NAME / "merged"
GGUF_DIR = FINE_TUNER / "models" / MODEL_NAME
GGUF_F16 = GGUF_DIR / "fo-blog-v6-f16.gguf"
GGUF_Q4 = GGUF_DIR / "fo-blog-v6.gguf"

BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
CONVERT_SCRIPT = "/opt/homebrew/Cellar/llama.cpp/8680/bin/convert_hf_to_gguf.py"
QUANTIZE_BIN = "/opt/homebrew/bin/llama-quantize"
PYTHON_BIN = "/opt/homebrew/bin/python3.13"

# Total weight size above which a merged checkpoint is treated as complete.
# This is a heuristic: the fp16 7B model is expected to be well above 10 GB.
MIN_MERGED_BYTES = 10_000_000_000

MODELFILE_TEMPLATE = """FROM {gguf_path}

SYSTEM \"\"\"You are an expert technical writer specializing in optical networking and transceiver technology.
Write clear, authoritative blog posts for network engineers and IT professionals.
Focus on practical insights, real-world use cases, and technical accuracy.
Use precise terminology. Keep paragraphs concise (3-5 sentences).
Structure posts with: intro hook, technical body, practical takeaways.\"\"\"

PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER top_k 40
PARAMETER repeat_penalty 1.1
PARAMETER num_predict 2048
"""


def merged_weights_size(merged_dir: Path = MERGED_DIR) -> int:
    """Return the combined size in bytes of all ``*.safetensors`` files.

    Large models are saved as several shards
    (``model-00001-of-0000N.safetensors``) rather than one
    ``model.safetensors``, so every weight file in the directory is counted.
    Returns 0 when the directory does not exist or holds no weight files.
    """
    if not merged_dir.is_dir():
        return 0
    return sum(shard.stat().st_size for shard in merged_dir.glob("*.safetensors"))


def _partial_path(final_path: Path) -> Path:
    """Return the temporary sibling path used while *final_path* is being built."""
    return final_path.with_name(final_path.name + ".partial")


def _run_then_publish(cmd: list, partial: Path, final_path: Path) -> None:
    """Run *cmd* (which must write to *partial*) and rename to *final_path*.

    The rename happens only if the command exits successfully. On any failure
    or interruption the partial file is removed and the error is re-raised.

    Raises:
        subprocess.CalledProcessError: the command exited with non-zero status.
        FileNotFoundError: the command succeeded but produced no output file.
    """
    partial.unlink(missing_ok=True)  # discard leftovers from an earlier crash
    try:
        subprocess.run(cmd, check=True)
        partial.replace(final_path)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave a truncated artifact.
        partial.unlink(missing_ok=True)
        raise


def merge_adapter() -> None:
    """Merge the LoRA adapter into the base model and save it to MERGED_DIR.

    Skips the merge when MERGED_DIR already holds more than MIN_MERGED_BYTES
    of safetensors weights. A smaller, presumably interrupted, save is
    deleted and redone.

    Raises:
        FileNotFoundError: ADAPTER_DIR does not exist.
    """
    print("\n[1/3] Merging LoRA adapter into base model...")

    MERGED_DIR.mkdir(parents=True, exist_ok=True)

    # Check if already merged (before the slow torch/transformers imports).
    existing = merged_weights_size(MERGED_DIR)
    if existing > MIN_MERGED_BYTES:
        print(f" Already merged ({existing/1e9:.1f} GB) — skipping merge step")
        return
    if existing:
        print(f" Found incomplete merge ({existing/1e9:.2f} GB) — redoing")
        for stale in MERGED_DIR.glob("*.safetensors"):
            stale.unlink()
        (MERGED_DIR / "model.safetensors.index.json").unlink(missing_ok=True)

    # Fail fast instead of after loading the multi-gigabyte base model.
    if not ADAPTER_DIR.is_dir():
        raise FileNotFoundError(f"LoRA adapter directory not found: {ADAPTER_DIR}")

    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer

    print(f" Loading base: {BASE_MODEL}")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float16,
        device_map="cpu",  # CPU for merge — avoids MPS memory pressure on save
        trust_remote_code=True,
    )

    print(f" Loading adapter: {ADAPTER_DIR}")
    model = PeftModel.from_pretrained(model, str(ADAPTER_DIR))

    print(" Merging and unloading...")
    model = model.merge_and_unload()

    print(f" Saving merged model to: {MERGED_DIR}")
    model.save_pretrained(str(MERGED_DIR), safe_serialization=True)
    tokenizer.save_pretrained(str(MERGED_DIR))
    print(f" Done — {merged_weights_size(MERGED_DIR) / 1e9:.2f} GB")


def convert_gguf(
    merged_dir: Path = MERGED_DIR,
    gguf_f16: Path = GGUF_F16,
    gguf_q4: Path = GGUF_Q4,
) -> Path:
    """Convert the merged model to a Q4_K_M-quantized GGUF file.

    Args:
        merged_dir: Directory holding the merged Hugging Face checkpoint.
        gguf_f16: Path of the intermediate fp16 GGUF (deleted after quantizing).
        gguf_q4: Path of the final quantized GGUF.

    Returns:
        The path of the quantized GGUF (``gguf_q4``).

    Raises:
        subprocess.CalledProcessError: the convert or quantize tool failed.
    """
    print("\n[2/3] Converting to GGUF...")

    # Check the final artifact first: the fp16 intermediate is deleted after a
    # successful run, so testing it first would rebuild it on every re-run.
    if gguf_q4.exists():
        print(f" Q4_K_M GGUF exists ({gguf_q4.stat().st_size / 1e9:.2f} GB) — skipping")
        return gguf_q4

    gguf_q4.parent.mkdir(parents=True, exist_ok=True)
    gguf_f16.parent.mkdir(parents=True, exist_ok=True)

    # fp16 first
    if gguf_f16.exists():
        print(f" fp16 GGUF exists ({gguf_f16.stat().st_size / 1e9:.2f} GB) — skipping")
    else:
        print(" Converting to fp16 GGUF...")
        f16_partial = _partial_path(gguf_f16)
        _run_then_publish(
            [
                PYTHON_BIN, CONVERT_SCRIPT,
                str(merged_dir),
                "--outfile", str(f16_partial),
                "--outtype", "f16",
            ],
            f16_partial,
            gguf_f16,
        )
        print(f" fp16 GGUF: {gguf_f16.stat().st_size / 1e9:.2f} GB")

    # Quantize to Q4_K_M
    print(" Quantizing to Q4_K_M...")
    q4_partial = _partial_path(gguf_q4)
    _run_then_publish(
        [QUANTIZE_BIN, str(gguf_f16), str(q4_partial), "Q4_K_M"],
        q4_partial,
        gguf_q4,
    )
    print(f" Q4_K_M GGUF: {gguf_q4.stat().st_size / 1e9:.2f} GB")
    gguf_f16.unlink(missing_ok=True)
    print(" Removed fp16 intermediate")

    return gguf_q4


def register_ollama(gguf_path: Path) -> None:
    """Write a Modelfile next to *gguf_path* and register the model in Ollama.

    After ``ollama create`` succeeds, ``ollama list`` is queried as a
    best-effort verification; a failed verification is reported but does not
    raise.

    Raises:
        subprocess.CalledProcessError: ``ollama create`` failed.
    """
    print("\n[3/3] Registering in Ollama...")

    modelfile_path = gguf_path.parent / "Modelfile"
    modelfile_path.write_text(
        MODELFILE_TEMPLATE.format(gguf_path=gguf_path.resolve()),
        encoding="utf-8",
    )
    print(f" Modelfile: {modelfile_path}")

    print(f" Running: ollama create {MODEL_NAME} ...")
    subprocess.run(
        ["ollama", "create", MODEL_NAME, "-f", str(modelfile_path)],
        check=True,
    )

    print(" Verifying...")
    listing = subprocess.run(
        ["ollama", "list"], capture_output=True, text=True, check=False
    )
    if listing.returncode != 0:
        # Distinguish "could not ask Ollama" from "model is missing".
        print(f" ✗ 'ollama list' failed (exit {listing.returncode}) — check manually")
        print(listing.stderr)
    elif MODEL_NAME in listing.stdout:
        print(f" ✓ {MODEL_NAME} registered in Ollama")
    else:
        print(f" ✗ {MODEL_NAME} NOT found in ollama list — check manually")
        print(listing.stdout)


def main() -> int:
    """Run merge, GGUF conversion and Ollama registration; return an exit code."""
    banner = "=" * 60
    print(banner)
    print(f" {MODEL_NAME} Merge + GGUF Conversion")
    print(banner)

    try:
        merge_adapter()
        gguf_path = convert_gguf()
        register_ollama(gguf_path)
    except subprocess.CalledProcessError as err:
        failed_cmd = " ".join(str(part) for part in err.cmd)
        print(
            f"\nERROR: command exited with status {err.returncode}: {failed_cmd}",
            file=sys.stderr,
        )
        return err.returncode or 1

    print("\n" + banner)
    print(" DONE!")
    print(f" GGUF: {gguf_path}")
    print(" Next: update Erik ecosystem.config.js")
    print(f" OLLAMA_LLM_MODEL={MODEL_NAME}, BLOG_LLM_PROVIDER=ollama")
    print(banner)
    return 0


if __name__ == "__main__":
    sys.exit(main())