#!/usr/bin/env python3
"""Publish private TIP selflearning datasets to Hugging Face."""

from __future__ import annotations

import json
import os
import subprocess
from pathlib import Path

from huggingface_hub import HfApi

ROOT = Path(__file__).resolve().parents[1]
RUNPOD_DIR = ROOT / "training-data" / "runpod"

# Lane name -> target dataset repo id. Each repo id can be overridden through
# the corresponding environment variable.
LANES = {
    "tip_llm": os.getenv("TIP_HF_DATASET_TIP_LLM", "renefichtmueller/tip-llm-sft"),
    "blog_llm": os.getenv("TIP_HF_DATASET_BLOG_LLM", "renefichtmueller/blog-llm-sft"),
}

# Keys read from the manifest while publishing; checked up front so a
# malformed manifest is rejected before anything is uploaded.
_MANIFEST_KEYS = ("version", "generated_at", "lanes")
_LANE_MANIFEST_KEYS = (
    "training_pairs",
    "train_pairs",
    "eval_pairs",
    "duplicates_removed",
)


def keychain(service: str) -> str | None:
    """Return the secret stored for *service* by the `security` CLI, or None.

    The lookup is best-effort: if the `security` executable is unavailable,
    exits non-zero, produces undecodable output, or prints only whitespace,
    ``None`` is returned instead of raising.
    """
    try:
        secret = subprocess.check_output(
            ["security", "find-generic-password", "-s", service, "-w"],
            stderr=subprocess.DEVNULL,
            text=True,
        )
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
        # OSError: executable missing / not runnable.
        # SubprocessError: non-zero exit (e.g. no matching entry).
        return None
    return secret.strip() or None


def hf_token() -> str:
    """Return the Hugging Face token to authenticate with.

    Sources are tried in order and evaluation stops at the first non-empty
    value: ``HF_TOKEN``, ``HUGGINGFACE_TOKEN``, then the keychain services
    ``magatama.huggingface.token`` and ``tip.huggingface.token``.

    Raises:
        SystemExit: If none of the sources yields a token.
    """
    token = (
        os.getenv("HF_TOKEN")
        or os.getenv("HUGGINGFACE_TOKEN")
        or keychain("magatama.huggingface.token")
        or keychain("tip.huggingface.token")
    )
    if not token:
        raise SystemExit("No Hugging Face token found.")
    return token


def _lane_files(lane: str) -> tuple[str, ...]:
    """Return the file names uploaded for *lane*, in upload order."""
    return (
        f"{lane}-sft-train.jsonl",
        f"{lane}-sft-eval.jsonl",
        f"{lane}-sft-all.jsonl",
        "manifest.json",
    )


def _dataset_card(repo_id: str, lane: str, manifest: dict, lane_manifest: dict) -> str:
    """Render the README.md text describing one published lane."""
    return (
        f"# {repo_id}\n\n"
        "Private TIP selflearning dataset generated from the Gitea/local learning pool.\n\n"
        f"- Lane: `{lane}`\n"
        f"- Version: `{manifest['version']}`\n"
        f"- Generated: `{manifest['generated_at']}`\n"
        f"- Training pairs after dedupe: `{lane_manifest['training_pairs']}`\n"
        f"- Train/Eval split: `{lane_manifest['train_pairs']}` / `{lane_manifest['eval_pairs']}`\n"
        f"- Duplicates removed: `{lane_manifest['duplicates_removed']}`\n"
    )


def _check_inputs(manifest: dict) -> None:
    """Verify that everything needed to publish every lane is present.

    Checks the manifest keys read during publishing and the existence of each
    local file that will be uploaded. All problems are collected and reported
    together.

    Raises:
        SystemExit: If any manifest key or local file is missing.
    """
    problems: list[str] = [
        f"manifest.json is missing top-level key {key!r}"
        for key in _MANIFEST_KEYS
        if key not in manifest
    ]
    lanes = manifest.get("lanes", {})
    for lane in LANES:
        lane_manifest = lanes.get(lane)
        if lane_manifest is None:
            problems.append(f"manifest.json has no entry for lane {lane!r}")
        else:
            problems.extend(
                f"manifest.json lane {lane!r} is missing key {key!r}"
                for key in _LANE_MANIFEST_KEYS
                if key not in lane_manifest
            )
        for name in _lane_files(lane):
            path = RUNPOD_DIR / lane / name
            if not path.is_file():
                problems.append(f"missing dataset file: {path}")
    if problems:
        details = "\n".join(f"  - {problem}" for problem in problems)
        raise SystemExit(f"Refusing to publish, inputs are incomplete:\n{details}")


def main() -> None:
    """Upload every lane's dataset files and README to its private HF repo.

    Prints a JSON summary of the published lanes on success.

    Raises:
        SystemExit: If no token is available, or if the manifest or any local
            dataset file is incomplete (checked before the first upload so a
            failure cannot leave a repo half-updated).
    """
    api = HfApi(token=hf_token())
    manifest = json.loads((RUNPOD_DIR / "manifest.json").read_text(encoding="utf-8"))
    _check_inputs(manifest)

    published = {}
    for lane, repo_id in LANES.items():
        lane_dir = RUNPOD_DIR / lane
        lane_manifest = manifest["lanes"][lane]
        api.create_repo(repo_id, repo_type="dataset", private=True, exist_ok=True)

        # NOTE(review): the manifest.json uploaded here is the one inside the
        # lane directory, not the top-level manifest parsed above - confirm
        # that this is intended.
        for name in _lane_files(lane):
            api.upload_file(
                repo_id=repo_id,
                repo_type="dataset",
                path_or_fileobj=str(lane_dir / name),
                path_in_repo=name,
                commit_message=f"Update {lane} selflearning dataset",
            )

        card = _dataset_card(repo_id, lane, manifest, lane_manifest)
        api.upload_file(
            repo_id=repo_id,
            repo_type="dataset",
            path_or_fileobj=card.encode(),
            path_in_repo="README.md",
            commit_message=f"Document {lane} selflearning dataset",
        )
        published[lane] = {
            "repo_id": repo_id,
            "training_pairs": lane_manifest["training_pairs"],
            "train_pairs": lane_manifest["train_pairs"],
            "eval_pairs": lane_manifest["eval_pairs"],
        }

    print(json.dumps({"success": True, "published": published}, indent=2))


if __name__ == "__main__":
    main()