transceiver-db/scripts/tip-publish-hf-datasets.py
2026-04-25 12:21:56 +02:00

72 lines
2.8 KiB
Python

#!/usr/bin/env python3
"""Publish private TIP selflearning datasets to Hugging Face."""
from __future__ import annotations
import json
import os
import subprocess
from pathlib import Path
from huggingface_hub import HfApi
# Two levels above this file, i.e. the parent of the directory the script is in.
ROOT = Path(__file__).resolve().parents[1]

# Export directory: holds manifest.json and one sub-directory per lane.
RUNPOD_DIR = ROOT.joinpath("training-data", "runpod")

# Lane name -> Hugging Face dataset repo id. Each id can be overridden with the
# environment variable named in the lookup; the second argument is the default.
LANES = {
    "tip_llm": os.environ.get(
        "TIP_HF_DATASET_TIP_LLM", "renefichtmueller/tip-llm-sft"
    ),
    "blog_llm": os.environ.get(
        "TIP_HF_DATASET_BLOG_LLM", "renefichtmueller/blog-llm-sft"
    ),
}
def keychain(service: str) -> str | None:
try:
return subprocess.check_output(
["security", "find-generic-password", "-s", service, "-w"],
stderr=subprocess.DEVNULL,
text=True,
).strip() or None
except Exception:
return None
def hf_token() -> str:
    """Return the Hugging Face access token used for publishing.

    Sources are tried in this order, stopping at the first non-blank value:

    1. the ``HF_TOKEN`` environment variable,
    2. the ``HUGGINGFACE_TOKEN`` environment variable,
    3. the ``magatama.huggingface.token`` keychain entry,
    4. the ``tip.huggingface.token`` keychain entry.

    Every candidate is stripped of surrounding whitespace, which is what
    ``keychain`` already does with its own output. A blank or newline-padded
    environment variable therefore neither shadows a later source nor yields
    a malformed token.

    Returns:
        The first non-blank token found.

    Raises:
        SystemExit: If none of the sources yields a token.
    """
    # Callables keep the lookups lazy: the keychain subprocess is only spawned
    # when the environment did not already provide a token.
    sources = (
        lambda: os.getenv("HF_TOKEN"),
        lambda: os.getenv("HUGGINGFACE_TOKEN"),
        lambda: keychain("magatama.huggingface.token"),
        lambda: keychain("tip.huggingface.token"),
    )
    for read in sources:
        token = (read() or "").strip()
        if token:
            return token
    raise SystemExit("No Hugging Face token found.")
def _lane_file_names(lane: str) -> tuple[str, ...]:
    """Return the file names published for *lane*, in upload order."""
    return (
        f"{lane}-sft-train.jsonl",
        f"{lane}-sft-eval.jsonl",
        f"{lane}-sft-all.jsonl",
        "manifest.json",
    )


def _dataset_card(repo_id: str, lane: str, manifest: dict, lane_manifest: dict) -> str:
    """Render the README.md text for one lane from the manifest values."""
    return (
        f"# {repo_id}\n\n"
        "Private TIP selflearning dataset generated from the Gitea/local learning pool.\n\n"
        f"- Lane: `{lane}`\n"
        f"- Version: `{manifest['version']}`\n"
        f"- Generated: `{manifest['generated_at']}`\n"
        f"- Training pairs after dedupe: `{lane_manifest['training_pairs']}`\n"
        f"- Train/Eval split: `{lane_manifest['train_pairs']}` / `{lane_manifest['eval_pairs']}`\n"
        f"- Duplicates removed: `{lane_manifest['duplicates_removed']}`\n"
    )


def main() -> None:
    """Publish every lane in ``LANES`` to its Hugging Face dataset repo.

    Steps:

    1. Read ``manifest.json`` from ``RUNPOD_DIR``.
    2. Validate all lanes up front: look up each lane's manifest entry, render
       its README, and check that every file to upload exists.
    3. For each lane, create the dataset repo if needed, confirm it is
       private, upload the data files from the lane directory, then upload
       the README.
    4. Print a JSON summary of what was published.

    Raises:
        SystemExit: If no token is available, a file to upload is missing, or
            a target repo exists but is not private.
        KeyError: If the manifest lacks a lane entry or one of the fields
            used in the README or the summary.
    """
    api = HfApi(token=hf_token())
    manifest = json.loads((RUNPOD_DIR / "manifest.json").read_text(encoding="utf-8"))

    # Pre-flight: resolve everything that can fail locally before the first
    # network call, so a bad export cannot leave some repos updated and
    # others not.
    plan = []
    missing = []
    for lane, repo_id in LANES.items():
        lane_manifest = manifest["lanes"][lane]
        paths = [RUNPOD_DIR / lane / name for name in _lane_file_names(lane)]
        missing.extend(str(path) for path in paths if not path.is_file())
        card = _dataset_card(repo_id, lane, manifest, lane_manifest)
        summary = {
            "repo_id": repo_id,
            "training_pairs": lane_manifest["training_pairs"],
            "train_pairs": lane_manifest["train_pairs"],
            "eval_pairs": lane_manifest["eval_pairs"],
        }
        plan.append((lane, repo_id, paths, card, summary))
    if missing:
        raise SystemExit("Missing dataset file(s): " + ", ".join(missing))

    published = {}
    for lane, repo_id, paths, card, summary in plan:
        api.create_repo(repo_id, repo_type="dataset", private=True, exist_ok=True)
        # With exist_ok=True an already existing repo keeps its current
        # visibility, so check it explicitly before sending private data.
        if not api.repo_info(repo_id, repo_type="dataset").private:
            raise SystemExit(
                f"Refusing to upload: dataset repo {repo_id} exists but is not private."
            )
        for path in paths:
            api.upload_file(
                repo_id=repo_id,
                repo_type="dataset",
                path_or_fileobj=str(path),
                path_in_repo=path.name,
                commit_message=f"Update {lane} selflearning dataset",
            )
        api.upload_file(
            repo_id=repo_id,
            repo_type="dataset",
            path_or_fileobj=card.encode(),
            path_in_repo="README.md",
            commit_message=f"Document {lane} selflearning dataset",
        )
        published[lane] = summary

    print(json.dumps({"success": True, "published": published}, indent=2))


if __name__ == "__main__":
    main()