From b8ec33a09b1e75592cacef2179d0ad4d81c87f3b Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Tue, 28 Apr 2026 23:36:13 +0200 Subject: [PATCH] init: TIPLLM training data repository structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Auto-generated training data from TIP intelligent crawlers. Crawler → LLM extraction → Validation → SFT pairs → Fine-tuning → Smarter TIPLLM --- README.md | 36 ++++++++++++++++++++++++++++++++++++ crawl-extractions/.gitkeep | 1 + qa-pairs/.gitkeep | 1 + stats/dataset-stats.json | 8 ++++++++ validated-specs/.gitkeep | 1 + 5 files changed, 47 insertions(+) create mode 100644 README.md create mode 100644 crawl-extractions/.gitkeep create mode 100644 qa-pairs/.gitkeep create mode 100644 stats/dataset-stats.json create mode 100644 validated-specs/.gitkeep diff --git a/README.md b/README.md new file mode 100644 index 0000000..3003a14 --- /dev/null +++ b/README.md @@ -0,0 +1,36 @@ +# TIP Training Data + +Auto-generated training dataset for TIPLLM fine-tuning. +Generated by TIP Intelligent Crawlers — validated, structured, SFT-ready. 
+ +## Structure + +| Directory | Content | +|-----------|---------| +| `crawl-extractions/` | Raw LLM extractions from vendor product pages (JSONL) | +| `validated-specs/` | Validated transceiver specs with confidence ≥ 0.7 (JSONL) | +| `qa-pairs/` | SFT question-answer training pairs (JSONL) | +| `raw-html/` | Cached HTML snippets for offline re-training (gzipped) | +| `stats/` | Dataset statistics and coverage reports | + +## SFT Format + +Each line in the `qa-pairs/` JSONL files is a single JSON object in the SFT format (pretty-printed here for readability): +```json +{ + "id": "uuid", + "source": "crawler:vendor-name:url", + "kind": "sft-jsonl", + "crawled_at": "2026-04-28T...", + "confidence": 0.92, + "messages": [ + {"role": "system", "content": "You are TIP_LLM..."}, + {"role": "user", "content": "Extract transceiver specs from: ..."}, + {"role": "assistant", "content": "{\"part_number\": \"...\", ...}"} + ] +} +``` + +## Stats + +Dataset statistics are updated automatically after each crawler run. diff --git a/crawl-extractions/.gitkeep b/crawl-extractions/.gitkeep new file mode 100644 index 0000000..887dacf --- /dev/null +++ b/crawl-extractions/.gitkeep @@ -0,0 +1 @@ +# crawl-extractions — raw LLM extractions from vendor product pages diff --git a/qa-pairs/.gitkeep b/qa-pairs/.gitkeep new file mode 100644 index 0000000..94c6654 --- /dev/null +++ b/qa-pairs/.gitkeep @@ -0,0 +1 @@ +# qa-pairs — SFT training pairs for TIPLLM fine-tuning diff --git a/stats/dataset-stats.json b/stats/dataset-stats.json new file mode 100644 index 0000000..105f9ac --- /dev/null +++ b/stats/dataset-stats.json @@ -0,0 +1,8 @@ +{ + "total_extractions": 0, + "validated_specs": 0, + "qa_pairs": 0, + "vendors_covered": [], + "confidence_distribution": {"high": 0, "medium": 0, "low": 0}, + "last_updated": "2026-04-28T00:00:00Z" +} diff --git a/validated-specs/.gitkeep b/validated-specs/.gitkeep new file mode 100644 index 0000000..3d4dc80 --- /dev/null +++ b/validated-specs/.gitkeep @@ -0,0 +1 @@ +# validated-specs — confidence >= 0.7 validated transceiver specs