transceiver-db/scripts/tip-publish-hf-datasets.py

#!/usr/bin/env python3
"""Publish private TIP selflearning datasets to Hugging Face."""

from __future__ import annotations

import json
import os
import subprocess
from pathlib import Path

from huggingface_hub import HfApi

# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Folder holding the top-level manifest.json and one sub-directory per lane.
RUNPOD_DIR = ROOT / "training-data" / "runpod"
# Lane name -> Hugging Face dataset repo id. Each target can be overridden
# through the corresponding TIP_HF_DATASET_* environment variable.
LANES = {
    "tip_llm": os.getenv("TIP_HF_DATASET_TIP_LLM", "renefichtmueller/tip-llm-sft"),
    "blog_llm": os.getenv("TIP_HF_DATASET_BLOG_LLM", "renefichtmueller/blog-llm-sft"),
}


def keychain(service: str) -> str | None:
    """Look up a generic password via ``security find-generic-password``.

    Args:
        service: Service name passed to the ``-s`` option of the command.

    Returns:
        The command's output with surrounding whitespace stripped, or
        ``None`` when the output is empty or the lookup fails for any reason
        (tool not installed, entry not found, ...).
    """
    command = ["security", "find-generic-password", "-s", service, "-w"]
    try:
        raw_output = subprocess.check_output(
            command,
            stderr=subprocess.DEVNULL,
            text=True,
        )
    except Exception:
        # Best-effort lookup: any failure simply means "no secret here".
        return None
    secret = raw_output.strip()
    return secret if secret else None


def hf_token() -> str:
    """Return the Hugging Face access token to use for publishing.

    Sources are tried in order and the first non-empty value wins:
    the ``HF_TOKEN`` and ``HUGGINGFACE_TOKEN`` environment variables, then
    the ``magatama.huggingface.token`` and ``tip.huggingface.token`` entries
    looked up through :func:`keychain`.

    Raises:
        SystemExit: If none of the sources yields a token.
    """
    for env_name in ("HF_TOKEN", "HUGGINGFACE_TOKEN"):
        env_value = os.getenv(env_name)
        if env_value:
            return env_value

    # The keychain is only consulted when no environment variable is set.
    for service_name in ("magatama.huggingface.token", "tip.huggingface.token"):
        stored = keychain(service_name)
        if stored:
            return stored

    raise SystemExit("No Hugging Face token found.")


def _lane_files(lane: str) -> tuple[str, ...]:
    """Return the file names uploaded for *lane*, in upload order."""
    return (
        f"{lane}-sft-train.jsonl",
        f"{lane}-sft-eval.jsonl",
        f"{lane}-sft-all.jsonl",
        "manifest.json",
    )


def _dataset_card(repo_id: str, lane: str, manifest: dict, lane_manifest: dict) -> str:
    """Render the README.md text for one lane from the manifest figures."""
    return (
        f"# {repo_id}\n\n"
        "Private TIP selflearning dataset generated from the Gitea/local learning pool.\n\n"
        f"- Lane: `{lane}`\n"
        f"- Version: `{manifest['version']}`\n"
        f"- Generated: `{manifest['generated_at']}`\n"
        f"- Training pairs after dedupe: `{lane_manifest['training_pairs']}`\n"
        f"- Train/Eval split: `{lane_manifest['train_pairs']}` / `{lane_manifest['eval_pairs']}`\n"
        f"- Duplicates removed: `{lane_manifest['duplicates_removed']}`\n"
    )


def main() -> None:
    """Publish every lane in ``LANES`` to its private Hugging Face dataset repo.

    For each lane the train/eval/all JSONL files and the lane's
    ``manifest.json`` are uploaded from ``RUNPOD_DIR / lane``, followed by a
    generated ``README.md``. A JSON summary of what was published is printed
    to stdout on success.

    Raises:
        SystemExit: If no token is available or a dataset file is missing.
        KeyError: If the top-level manifest lacks a lane or a required field.
    """
    api = HfApi(token=hf_token())
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    manifest = json.loads((RUNPOD_DIR / "manifest.json").read_text(encoding="utf-8"))

    # Validate and prepare every lane locally before the first network call,
    # so a missing file or manifest key cannot leave a half-published state
    # (first lane uploaded, second lane failing).
    plans = []
    missing = []
    for lane, repo_id in LANES.items():
        lane_dir = RUNPOD_DIR / lane
        lane_manifest = manifest["lanes"][lane]
        paths = [lane_dir / name for name in _lane_files(lane)]
        missing.extend(str(path) for path in paths if not path.is_file())
        card = _dataset_card(repo_id, lane, manifest, lane_manifest)
        summary = {
            "repo_id": repo_id,
            "training_pairs": lane_manifest["training_pairs"],
            "train_pairs": lane_manifest["train_pairs"],
            "eval_pairs": lane_manifest["eval_pairs"],
        }
        plans.append((lane, repo_id, paths, card, summary))
    if missing:
        raise SystemExit("Missing dataset files:\n" + "\n".join(missing))

    published = {}
    for lane, repo_id, paths, card, summary in plans:
        api.create_repo(repo_id, repo_type="dataset", private=True, exist_ok=True)
        for path in paths:
            api.upload_file(
                repo_id=repo_id,
                repo_type="dataset",
                path_or_fileobj=str(path),
                path_in_repo=path.name,
                commit_message=f"Update {lane} selflearning dataset",
            )
        api.upload_file(
            repo_id=repo_id,
            repo_type="dataset",
            path_or_fileobj=card.encode(),
            path_in_repo="README.md",
            commit_message=f"Document {lane} selflearning dataset",
        )
        published[lane] = summary
    print(json.dumps({"success": True, "published": published}, indent=2))


if __name__ == "__main__":
    main()