llm-gateway/packages/fine-tuner/scripts/download-datasets.sh
Rene Fichtmueller 2ca77d0aee feat: Phase 2F — Multi-Agent Integration (ADRs + Client Fallback + Tests)
- ADR-0001: Multi-Agent Coworking Architecture with LLM Gateway Orchestrator
- ADR-0002: Tier Assignment Strategy for Model Selection (cost-first escalation)
- ADR-0003: Confidence Gate Thresholds & Learning Cycle Intervals (6h/12h/24h cycles)
- ADR-0004: External Provider Fallback Chain Ordering (Cerebras → Groq → Mistral)
- Enhanced client SDK: Offline Ollama fallback, health checks, exponential backoff retry
- Integration tests: claude-code-integration.test.ts (14 test cases)
- PHASE_2F_DEPLOYMENT.md: Pre-deployment checklist, automated deploy, rollback plan
- Post-deployment verification procedures for health, client fallback, metrics
2026-04-19 21:39:44 +02:00

83 lines
2.5 KiB
Bash
Executable File

#!/bin/bash
# download-datasets.sh — Download top training datasets for MAGATAMA Ops AI
# Total estimated: ~300K+ samples across security, ops, networking
#
# Usage: download-datasets.sh   (takes no arguments)
#
# Activates the virtualenv at <script dir>/../.venv, then exports each dataset
# listed in main() as a JSON Lines file into <script dir>/../data/external/
# and prints a size-sorted listing plus totals. The Python `datasets` package
# must be importable inside that virtualenv.
set -euo pipefail

# Resolve the script's directory to an ABSOLUTE path once, before any `cd`.
# A relative "$(dirname "$0")" silently changes meaning after the working
# directory changes, which made the venv activation (and the final summary)
# look in the wrong place whenever the script was started via a relative path.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
readonly DATA_DIR="${SCRIPT_DIR}/../data/external"
readonly VENV_ACTIVATE="${SCRIPT_DIR}/../.venv/bin/activate"

# Python program shared by every download. Parameters arrive via sys.argv so
# no shell value is ever interpolated into Python source. It is run with
# `python3 -c` (not a here-doc on stdin) so stdin stays attached to the caller.
readonly DOWNLOAD_PY='
import sys
from datasets import load_dataset
repo, config, split, out = sys.argv[1:5]
ds = load_dataset(repo, config or None, split=split)
ds.to_json(out)
print(f" → {len(ds)} samples saved")
'

#######################################
# Download one dataset split and write it as JSON Lines.
# Arguments:
#   $1 - dataset repository id passed to load_dataset
#   $2 - dataset config name, or '' to use the default (None)
#   $3 - split to load (e.g. train, test)
#   $4 - output file name, relative to the current directory
# Outputs:
#   Writes the sample count line to stdout.
# Returns:
#   Exit status of python3 (non-zero aborts the script under `set -e`).
#######################################
download_dataset() {
  local repo=$1
  local config=$2
  local split=$3
  local out=$4
  python3 -c "$DOWNLOAD_PY" "$repo" "$config" "$split" "$out"
}

#######################################
# Entry point: prepare the data directory, activate the venv, fetch all
# datasets in order, then print a summary of the resulting files.
# Globals:
#   DATA_DIR, VENV_ACTIVATE (read)
#######################################
main() {
  mkdir -p "$DATA_DIR"
  cd -- "$DATA_DIR"

  echo "═══ MAGATAMA Training Data Downloader ═══"
  echo ""

  # Activate venv
  if [[ ! -f "$VENV_ACTIVATE" ]]; then
    printf 'error: virtualenv activate script not found: %s\n' \
      "$VENV_ACTIVATE" >&2
    exit 1
  fi
  # shellcheck source=/dev/null
  source "$VENV_ACTIVATE"

  # ─── TIER 1: MUST HAVE ───
  echo "[1/7] Fenrir v2.0 — 83K Security+DevOps+IR (Apache 2.0)"
  download_dataset 'AlicanKiraz0/Cybersecurity-Dataset-Fenrir-v2.0' '' \
    'train' 'fenrir-v2.jsonl'

  echo "[2/7] Trendyol Cybersecurity — 53K Security Domains (Apache 2.0)"
  download_dataset 'Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset' '' \
    'train' 'trendyol-security.jsonl'

  echo "[3/7] NeurAlchemy Prompt Injection — 22K, 29 Attack Categories (Apache 2.0)"
  download_dataset 'neuralchemy/Prompt-injection-dataset' 'full' \
    'train' 'neuralchemy-prompt-injection.jsonl'

  echo "[4/7] Code Vulnerability DPO — 4.7K Vuln→Fix Pairs (Apache 2.0)"
  download_dataset 'CyberNative/Code_Vulnerability_Security_DPO' '' \
    'train' 'code-vuln-dpo.jsonl'

  # ─── TIER 2: HIGH VALUE ───
  echo "[5/7] MITRE ATT&CK TTP Mapping — 21K Expert-Annotated (CC BY 4.0)"
  download_dataset 'tumeteor/Security-TTP-Mapping' '' \
    'train' 'mitre-ttp-mapping.jsonl'

  echo "[6/7] deepset Prompt Injections — 662 Baseline (Apache 2.0)"
  download_dataset 'deepset/prompt-injections' '' \
    'train' 'deepset-injections.jsonl'

  # This dataset is loaded from its 'test' split, unlike the others.
  echo "[7/7] NotInject — 339 False Positive Calibration"
  download_dataset 'leolee99/NotInject' '' \
    'test' 'notinject-calibration.jsonl'

  echo ""
  echo "═══ DOWNLOAD COMPLETE ═══"
  echo ""

  # Expand the glob into an array instead of parsing `ls` output for the
  # count. Every download above succeeded to reach this point (set -e), so
  # the glob matches at least the files written by this run.
  local -a jsonl_files=("$DATA_DIR"/*.jsonl)
  ls -lhS "${jsonl_files[@]}"
  echo ""
  echo "Total files: ${#jsonl_files[@]}"

  # Assign separately so a failing du/cut pipeline is not masked.
  local total_size
  total_size="$(du -sh "$DATA_DIR" | cut -f1)"
  echo "Total size: ${total_size}"
}

main "$@"