#!/bin/bash
# download-datasets.sh — Download top training datasets for MAGATAMA Ops AI
# Total estimated: ~300K+ samples across security, ops, networking
#
# Usage:   download-datasets.sh
# Layout:  expects a Python virtualenv at <script dir>/../.venv that provides
#          the `datasets` package; writes JSONL files to
#          <script dir>/../data/external.

set -euo pipefail

# Resolve the script directory to an absolute path up front. The script later
# changes into DATA_DIR, so any path derived from a relative "$0" after that
# point would resolve against the wrong directory.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
readonly DATA_DIR="${SCRIPT_DIR}/../data/external"
readonly VENV_ACTIVATE="${SCRIPT_DIR}/../.venv/bin/activate"

# Python program shared by every download. Values arrive through argv instead
# of being spliced into the source text, so no quoting issues can arise.
# argv: <repo> <split> <outfile> <config-or-empty>
readonly PY_DOWNLOAD='
import sys
from datasets import load_dataset

repo, split, outfile, config = sys.argv[1:5]
# An empty config means "use the default configuration of the dataset".
positional = (repo, config) if config else (repo,)
ds = load_dataset(*positional, split=split)
ds.to_json(outfile)
print(f" → {len(ds)} samples saved")
'

#######################################
# Print a message to stderr and abort the script.
# Arguments: $* - message text
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Activate the project virtualenv.
# Globals:   VENV_ACTIVATE (read)
# Returns:   exits via die() when the activate script is missing
#######################################
activate_venv() {
  [[ -f "$VENV_ACTIVATE" ]] || die "virtualenv not found: $VENV_ACTIVATE"
  # Some activate scripts (notably older virtualenv releases) read variables
  # that may be unset, which aborts under nounset; relax it for this one line.
  set +u
  # shellcheck disable=SC1090,SC1091
  source "$VENV_ACTIVATE"
  set -u
}

#######################################
# Download one Hugging Face dataset split and save it as JSON Lines in the
# current working directory.
# Arguments: $1 - dataset repository id
#            $2 - split name
#            $3 - output file name
#            $4 - optional dataset configuration name
# Outputs:   prints the number of saved samples to stdout
#######################################
download_dataset() {
  local repo=$1 split=$2 outfile=$3 config=${4:-}
  python3 -c "$PY_DOWNLOAD" "$repo" "$split" "$outfile" "$config"
}

#######################################
# List the downloaded JSONL files (largest first) with count and total size.
# Globals:   DATA_DIR (read)
#######################################
print_summary() {
  local -a files=()
  shopt -s nullglob
  files=("$DATA_DIR"/*.jsonl)
  shopt -u nullglob

  # Guard the expansion: with no matches, `ls` would list the current
  # directory instead, and empty arrays trip nounset on older bash.
  if (( ${#files[@]} > 0 )); then
    ls -lhS -- "${files[@]}"
  fi
  echo ""
  echo "Total files: ${#files[@]}"
  echo "Total size: $(du -sh "$DATA_DIR" | cut -f1)"
}

main() {
  mkdir -p -- "$DATA_DIR"
  cd -- "$DATA_DIR" || die "cannot cd to $DATA_DIR"

  echo "═══ MAGATAMA Training Data Downloader ═══"
  echo ""

  # Activate venv
  activate_venv

  # ─── TIER 1: MUST HAVE ───
  echo "[1/7] Fenrir v2.0 — 83K Security+DevOps+IR (Apache 2.0)"
  download_dataset 'AlicanKiraz0/Cybersecurity-Dataset-Fenrir-v2.0' \
    train 'fenrir-v2.jsonl'

  echo "[2/7] Trendyol Cybersecurity — 53K Security Domains (Apache 2.0)"
  download_dataset 'Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset' \
    train 'trendyol-security.jsonl'

  echo "[3/7] NeurAlchemy Prompt Injection — 22K, 29 Attack Categories (Apache 2.0)"
  download_dataset 'neuralchemy/Prompt-injection-dataset' \
    train 'neuralchemy-prompt-injection.jsonl' full

  echo "[4/7] Code Vulnerability DPO — 4.7K Vuln→Fix Pairs (Apache 2.0)"
  download_dataset 'CyberNative/Code_Vulnerability_Security_DPO' \
    train 'code-vuln-dpo.jsonl'

  # ─── TIER 2: HIGH VALUE ───
  echo "[5/7] MITRE ATT&CK TTP Mapping — 21K Expert-Annotated (CC BY 4.0)"
  download_dataset 'tumeteor/Security-TTP-Mapping' \
    train 'mitre-ttp-mapping.jsonl'

  echo "[6/7] deepset Prompt Injections — 662 Baseline (Apache 2.0)"
  download_dataset 'deepset/prompt-injections' \
    train 'deepset-injections.jsonl'

  echo "[7/7] NotInject — 339 False Positive Calibration"
  # This dataset is read from its "test" split rather than "train".
  download_dataset 'leolee99/NotInject' \
    test 'notinject-calibration.jsonl'

  echo ""
  echo "═══ DOWNLOAD COMPLETE ═══"
  echo ""
  print_summary
}

main "$@"