# llm-gateway/packages/fine-tuner/scripts/merge_and_convert.py

#!/usr/bin/env python3
"""
Merge fo-blog-v6 LoRA adapter (checkpoint-102) into base model,
then convert to GGUF Q4_K_M and register in Ollama.

Usage:
  python3 scripts/merge_and_convert.py
"""
import subprocess
import sys
from pathlib import Path

# Directory layout, anchored on this script's own location.
FINE_TUNER = Path(__file__).parent.parent
REPO_ROOT = FINE_TUNER.parent.parent  # llm-gateway root

# Input: the trained LoRA adapter checkpoint.
ADAPTER_DIR = FINE_TUNER / "adapters" / "fo-blog-v6" / "checkpoint-102"

# Outputs: merged HF checkpoint, fp16 intermediate GGUF, final quantized GGUF.
GGUF_DIR = FINE_TUNER / "models" / "fo-blog-v6"
MERGED_DIR = GGUF_DIR / "merged"
GGUF_F16 = GGUF_DIR / "fo-blog-v6-f16.gguf"
GGUF_Q4 = GGUF_DIR / "fo-blog-v6.gguf"

# Base model identifier handed to ``from_pretrained``.
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"

# External tools, pinned to absolute Homebrew paths.
PYTHON_BIN = "/opt/homebrew/bin/python3.13"
CONVERT_SCRIPT = "/opt/homebrew/Cellar/llama.cpp/8680/bin/convert_hf_to_gguf.py"
QUANTIZE_BIN = "/opt/homebrew/bin/llama-quantize"

# Ollama Modelfile body; ``{gguf_path}`` is filled in with ``str.format``.
MODELFILE_TEMPLATE = """FROM {gguf_path}

SYSTEM \"\"\"You are an expert technical writer specializing in optical networking and transceiver technology.
Write clear, authoritative blog posts for network engineers and IT professionals.
Focus on practical insights, real-world use cases, and technical accuracy.
Use precise terminology. Keep paragraphs concise (3-5 sentences).
Structure posts with: intro hook, technical body, practical takeaways.\"\"\"

PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER top_k 40
PARAMETER repeat_penalty 1.1
PARAMETER num_predict 2048
"""

def merge_adapter():
    """Merge the LoRA adapter into the base model and save it to ``MERGED_DIR``.

    The step is skipped when ``MERGED_DIR`` already holds more than 10 GB of
    ``*.safetensors`` weights. A smaller total is treated as an interrupted
    save: the weight files (and any shard index) are deleted and the merge is
    redone.

    The size is summed over every ``*.safetensors`` file in the directory
    rather than read from a single ``model.safetensors``, because
    ``save_pretrained`` may split a large checkpoint into
    ``model-0000X-of-0000Y.safetensors`` shards.

    Raises:
        FileNotFoundError: If ``ADAPTER_DIR`` does not exist.
    """
    print("\n[1/3] Merging LoRA adapter into base model...")

    MERGED_DIR.mkdir(parents=True, exist_ok=True)

    # Check if already merged (single-file or sharded layout).
    weight_files = sorted(MERGED_DIR.glob("*.safetensors"))
    if weight_files:
        size = sum(f.stat().st_size for f in weight_files)
        if size > 10_000_000_000:  # > 10GB = complete
            print(f"  Already merged ({size/1e9:.1f} GB) — skipping merge step")
            return
        print(f"  Found incomplete merge ({size/1e9:.2f} GB) — redoing")
        for stale in weight_files:
            stale.unlink()
        # A leftover index would point at shards that no longer exist.
        (MERGED_DIR / "model.safetensors.index.json").unlink(missing_ok=True)

    # Fail fast, before loading the multi-GB base model.
    if not ADAPTER_DIR.is_dir():
        raise FileNotFoundError(f"LoRA adapter directory not found: {ADAPTER_DIR}")

    # Heavy imports are deferred so a skipped merge does not pay for them.
    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer

    print(f"  Loading base: {BASE_MODEL}")
    # NOTE(review): trust_remote_code=True lets the model repo run its own
    # Python code — confirm it is actually required for this base model.
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float16,
        device_map="cpu",   # CPU for merge — avoids MPS memory pressure on save
        trust_remote_code=True,
    )

    print(f"  Loading adapter: {ADAPTER_DIR}")
    model = PeftModel.from_pretrained(model, str(ADAPTER_DIR))

    print("  Merging and unloading...")
    model = model.merge_and_unload()

    print(f"  Saving merged model to: {MERGED_DIR}")
    model.save_pretrained(str(MERGED_DIR), safe_serialization=True)
    tokenizer.save_pretrained(str(MERGED_DIR))

    # Sum all shards: a single model.safetensors is not guaranteed to exist.
    saved_bytes = sum(f.stat().st_size for f in MERGED_DIR.glob("*.safetensors"))
    print(f"  Done — {saved_bytes / 1e9:.2f} GB")


def convert_gguf():
    """Convert the merged model in ``MERGED_DIR`` to a Q4_K_M GGUF file.

    Runs ``CONVERT_SCRIPT`` to produce an fp16 GGUF, quantizes it with
    ``QUANTIZE_BIN``, then deletes the fp16 intermediate.

    The function is safe to re-run:

    * The final ``GGUF_Q4`` file is checked first. The fp16 intermediate is
      removed after a successful quantization, so testing for it first would
      regenerate it on every later run and then leave it on disk.
    * Each tool writes to a ``*.partial.gguf`` file that is renamed to its
      final name only after the tool exits successfully, so an interrupted
      run never leaves a truncated file that looks complete.

    Returns:
        Path of the quantized GGUF file (``GGUF_Q4``).

    Raises:
        subprocess.CalledProcessError: If conversion or quantization exits
            with a non-zero status.
    """
    print("\n[2/3] Converting to GGUF...")

    # Final artifact present: nothing to do, and no need for the fp16 file.
    if GGUF_Q4.exists():
        print(f"  Q4_K_M GGUF exists ({GGUF_Q4.stat().st_size / 1e9:.2f} GB) — skipping")
        return GGUF_Q4

    GGUF_Q4.parent.mkdir(parents=True, exist_ok=True)

    # fp16 first
    if GGUF_F16.exists():
        print(f"  fp16 GGUF exists ({GGUF_F16.stat().st_size / 1e9:.2f} GB) — skipping")
    else:
        print("  Converting to fp16 GGUF...")
        partial_f16 = GGUF_F16.with_name(GGUF_F16.stem + ".partial.gguf")
        partial_f16.unlink(missing_ok=True)  # drop leftovers from a failed run
        subprocess.run([
            PYTHON_BIN, CONVERT_SCRIPT,
            str(MERGED_DIR),
            "--outfile", str(partial_f16),
            "--outtype", "f16",
        ], check=True)
        partial_f16.replace(GGUF_F16)  # publish only after success
        print(f"  fp16 GGUF: {GGUF_F16.stat().st_size / 1e9:.2f} GB")

    # Quantize to Q4_K_M
    print("  Quantizing to Q4_K_M...")
    partial_q4 = GGUF_Q4.with_name(GGUF_Q4.stem + ".partial.gguf")
    partial_q4.unlink(missing_ok=True)
    subprocess.run([
        QUANTIZE_BIN, str(GGUF_F16), str(partial_q4), "Q4_K_M",
    ], check=True)
    partial_q4.replace(GGUF_Q4)
    print(f"  Q4_K_M GGUF: {GGUF_Q4.stat().st_size / 1e9:.2f} GB")

    GGUF_F16.unlink(missing_ok=True)
    print("  Removed fp16 intermediate")

    return GGUF_Q4


def register_ollama(gguf_path: Path):
    """Generate a Modelfile beside the GGUF and create the Ollama model from it.

    The Modelfile is rendered from ``MODELFILE_TEMPLATE`` with the resolved
    path of ``gguf_path`` and written into the same directory as the GGUF.
    ``ollama create`` must succeed (a non-zero exit raises
    ``subprocess.CalledProcessError``); the follow-up ``ollama list`` check
    only prints its outcome and never raises on a missing model.

    Args:
        gguf_path: Path to the GGUF file the Modelfile's ``FROM`` line points at.
    """
    print("\n[3/3] Registering in Ollama...")

    modelfile_path = gguf_path.parent / "Modelfile"
    modelfile_body = MODELFILE_TEMPLATE.format(gguf_path=gguf_path.resolve())
    modelfile_path.write_text(modelfile_body)
    print(f"  Modelfile: {modelfile_path}")

    print("  Running: ollama create fo-blog-v6 ...")
    create_cmd = ["ollama", "create", "fo-blog-v6", "-f", str(modelfile_path)]
    subprocess.run(create_cmd, check=True)

    print("  Verifying...")
    listing = subprocess.run(["ollama", "list"], capture_output=True, text=True)
    if "fo-blog-v6" not in listing.stdout:
        # Report only: show the raw listing so the operator can investigate.
        print("  ✗ fo-blog-v6 NOT found in ollama list — check manually")
        print(listing.stdout)
        return
    print("  ✓ fo-blog-v6 registered in Ollama")

if __name__ == "__main__":
    # Horizontal rule reused for the opening and closing banners.
    banner = "=" * 60

    print(banner)
    print("  fo-blog-v6 Merge + GGUF Conversion")
    print(banner)

    # Pipeline: merge adapter -> convert/quantize to GGUF -> register in Ollama.
    merge_adapter()
    gguf_path = convert_gguf()
    register_ollama(gguf_path)

    # Closing summary with the follow-up configuration hint.
    for line in (
        "\n" + banner,
        "  DONE!",
        f"  GGUF: {gguf_path}",
        "  Next: update Erik ecosystem.config.js",
        "  OLLAMA_LLM_MODEL=fo-blog-v6, BLOG_LLM_PROVIDER=ollama",
        banner,
    ):
        print(line)