72 lines
2.8 KiB
Python
72 lines
2.8 KiB
Python
#!/usr/bin/env python3
|
|
"""Publish private TIP selflearning datasets to Hugging Face."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import subprocess
|
|
from pathlib import Path
|
|
|
|
from huggingface_hub import HfApi
|
|
|
|
# Directory two levels above this file (the parent of the script's folder);
# all data paths below are resolved relative to it.
ROOT = Path(__file__).resolve().parent.parent

# Folder holding the top-level manifest.json and one sub-folder per lane.
RUNPOD_DIR = ROOT.joinpath("training-data", "runpod")

# Lane name -> Hugging Face dataset repo id. Each target can be overridden
# through its own environment variable; the literal is the fallback.
LANES = {
    "tip_llm": os.environ.get(
        "TIP_HF_DATASET_TIP_LLM",
        "renefichtmueller/tip-llm-sft",
    ),
    "blog_llm": os.environ.get(
        "TIP_HF_DATASET_BLOG_LLM",
        "renefichtmueller/blog-llm-sft",
    ),
}
|
|
|
|
|
|
def keychain(service: str) -> str | None:
    """Read a generic password through the ``security`` command-line tool.

    Executes ``security find-generic-password -s <service> -w`` and returns
    what the command wrote to standard output, with surrounding whitespace
    removed. Standard error is discarded.

    Args:
        service: Value passed to the command's ``-s`` option.

    Returns:
        The stripped command output, or ``None`` when the output is empty or
        when running the command raised any exception (for example the
        binary is missing or it exited with a non-zero status).
    """
    command = ["security", "find-generic-password", "-s", service, "-w"]
    try:
        completed = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            check=True,
        )
    except Exception:
        # Best-effort lookup: any failure just means "no secret available".
        return None
    secret = completed.stdout.strip()
    return secret if secret else None
|
|
|
|
|
|
def hf_token() -> str:
    """Return the Hugging Face access token, or exit if none is configured.

    Sources are tried in this order and the first non-empty value wins:

    1. ``HF_TOKEN`` environment variable
    2. ``HUGGINGFACE_TOKEN`` environment variable
    3. ``keychain("magatama.huggingface.token")``
    4. ``keychain("tip.huggingface.token")``

    Returns:
        The first non-empty token found.

    Raises:
        SystemExit: If every source is empty or missing.
    """
    for variable in ("HF_TOKEN", "HUGGINGFACE_TOKEN"):
        candidate = os.getenv(variable)
        if candidate:
            return candidate

    # The keychain is only consulted after both environment variables failed.
    for service in ("magatama.huggingface.token", "tip.huggingface.token"):
        candidate = keychain(service)
        if candidate:
            return candidate

    raise SystemExit("No Hugging Face token found.")
|
|
|
|
|
|
def _lane_file_names(lane: str) -> tuple[str, ...]:
    """Return the file names uploaded for *lane*, in upload order."""
    return (
        f"{lane}-sft-train.jsonl",
        f"{lane}-sft-eval.jsonl",
        f"{lane}-sft-all.jsonl",
        "manifest.json",
    )


def _dataset_card(repo_id: str, lane: str, manifest: dict, lane_manifest: dict) -> str:
    """Render the README.md text for one lane from the manifest values."""
    return (
        f"# {repo_id}\n\n"
        "Private TIP selflearning dataset generated from the Gitea/local learning pool.\n\n"
        f"- Lane: `{lane}`\n"
        f"- Version: `{manifest['version']}`\n"
        f"- Generated: `{manifest['generated_at']}`\n"
        f"- Training pairs after dedupe: `{lane_manifest['training_pairs']}`\n"
        f"- Train/Eval split: `{lane_manifest['train_pairs']}` / `{lane_manifest['eval_pairs']}`\n"
        f"- Duplicates removed: `{lane_manifest['duplicates_removed']}`\n"
    )


def main() -> None:
    """Publish every lane in ``LANES`` as a private Hugging Face dataset.

    For each lane this creates the dataset repo if needed (``private=True``,
    ``exist_ok=True``), uploads the lane's three JSONL files plus its
    ``manifest.json`` from ``RUNPOD_DIR / <lane>``, uploads a generated
    ``README.md`` card, and finally prints a JSON summary to stdout.

    All local inputs (manifest keys and files on disk) are checked for every
    lane *before* the first network call, so a missing file or manifest key
    cannot leave an earlier lane published while a later one fails.

    Raises:
        SystemExit: If no token is available (from ``hf_token``) or if any
            file that should be uploaded does not exist.
        KeyError: If the manifest lacks a lane entry or one of the fields
            used in the dataset card or summary.
    """
    api = HfApi(token=hf_token())
    # JSON is read as UTF-8 explicitly instead of the platform default.
    manifest = json.loads((RUNPOD_DIR / "manifest.json").read_text(encoding="utf-8"))

    # Phase 1: resolve everything locally; nothing is uploaded yet.
    plan = []
    published = {}
    missing = []
    for lane, repo_id in LANES.items():
        lane_dir = RUNPOD_DIR / lane
        lane_manifest = manifest["lanes"][lane]
        uploads = [(name, lane_dir / name) for name in _lane_file_names(lane)]
        missing.extend(str(path) for _, path in uploads if not path.is_file())
        card = _dataset_card(repo_id, lane, manifest, lane_manifest)
        published[lane] = {
            "repo_id": repo_id,
            "training_pairs": lane_manifest["training_pairs"],
            "train_pairs": lane_manifest["train_pairs"],
            "eval_pairs": lane_manifest["eval_pairs"],
        }
        plan.append((lane, repo_id, uploads, card))

    if missing:
        raise SystemExit("Missing dataset files:\n" + "\n".join(missing))

    # Phase 2: push each lane to its dataset repo.
    for lane, repo_id, uploads, card in plan:
        api.create_repo(repo_id, repo_type="dataset", private=True, exist_ok=True)
        for name, path in uploads:
            api.upload_file(
                repo_id=repo_id,
                repo_type="dataset",
                path_or_fileobj=str(path),
                path_in_repo=name,
                commit_message=f"Update {lane} selflearning dataset",
            )
        api.upload_file(
            repo_id=repo_id,
            repo_type="dataset",
            path_or_fileobj=card.encode(),
            path_in_repo="README.md",
            commit_message=f"Document {lane} selflearning dataset",
        )

    print(json.dumps({"success": True, "published": published}, indent=2))
|
|
|
|
|
|
# Script entry point: publish only when run directly, not when imported.
if __name__ == "__main__":
    main()
|