- Add scripts/seed-tip-llm-capabilities.ts: generator for 34 SFT pairs covering all 5 TIP_LLM capabilities (transceiver research, switch research, Blog_LLM data evaluation, crawler/scraper design, Hype Cycle)
- Add training-data/tip-llm-capabilities-v1.jsonl: generated output (34 pairs)
- Update tip-learning-pool-build.ts: expanded 5-capability system prompt replaces single-line prompt; register capabilities file in files.tip_llm
- Regenerate tip_llm runpod outputs: 12141 raw pairs → 11872 training pairs (up from 10654 before capabilities addition)
- Publish tip_llm (11872 pairs) + blog_llm (11408 pairs) to HuggingFace
29 lines
1.1 KiB
JSON
29 lines
1.1 KiB
JSON
{
  "raw_pairs": 12141,
  "duplicates_removed": 269,
  "training_pairs": 11872,
  "train_pairs": 10684,
  "eval_pairs": 1188,
  "sources": {
    "external:vendor-deep-dives.jsonl": 11200,
    "external:technical-deep-dives.jsonl": 84,
    "external:rir-infrastructure-data.jsonl": 150,
    "external:market-business-analysis-part1.jsonl": 10,
    "external:synthesized-training-samples.jsonl": 219,
    "external:nanog-ripe-labs-content.jsonl": 34,
    "external:academic-research-synthesis.jsonl": 109,
    "training-data/tip-llm-capabilities-v1.jsonl": 34,
    "external:market-business-analysis-part6.jsonl": 5,
    "external:market-business-analysis-part5.jsonl": 7,
    "external:market-business-analysis-part4.jsonl": 5,
    "external:market-business-analysis-part2.jsonl": 8,
    "external:market-business-analysis-part3.jsonl": 7
  },
  "files": {
    "train": "training-data/runpod/tip_llm/tip_llm-sft-train.jsonl",
    "eval": "training-data/runpod/tip_llm/tip_llm-sft-eval.jsonl",
    "all": "training-data/runpod/tip_llm/tip_llm-sft-all.jsonl",
    "manifest": "training-data/runpod/tip_llm/manifest.json"
  }
}