From b8ec33a09b1e75592cacef2179d0ad4d81c87f3b Mon Sep 17 00:00:00 2001
From: Rene Fichtmueller <renefichtmueller@MacStudio-von-Rene-8.local>
Date: Tue, 28 Apr 2026 23:36:13 +0200
Subject: [PATCH] init: TIPLLM training data repository structure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Auto-generated training data from TIP intelligent crawlers.
Crawler → LLM extraction → Validation → SFT pairs → Fine-tuning → Smarter TIPLLM
---
 README.md                  | 36 ++++++++++++++++++++++++++++++++++++
 crawl-extractions/.gitkeep |  1 +
 qa-pairs/.gitkeep          |  1 +
 stats/dataset-stats.json   |  8 ++++++++
 validated-specs/.gitkeep   |  1 +
 5 files changed, 47 insertions(+)
 create mode 100644 README.md
 create mode 100644 crawl-extractions/.gitkeep
 create mode 100644 qa-pairs/.gitkeep
 create mode 100644 stats/dataset-stats.json
 create mode 100644 validated-specs/.gitkeep

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3003a14
--- /dev/null
+++ b/README.md
@@ -0,0 +1,36 @@
+# TIP Training Data
+
+Auto-generated training dataset for TIPLLM fine-tuning.
+Generated by TIP Intelligent Crawlers — validated, structured, SFT-ready.
+
+## Structure
+
+| Directory | Content |
+|-----------|---------|
+| `crawl-extractions/` | Raw LLM extractions from vendor product pages (JSONL) |
+| `validated-specs/` | Validated transceiver specs with confidence ≥ 0.7 (JSONL) |
+| `qa-pairs/` | SFT question-answer training pairs (JSONL) |
+| `raw-html/` | Cached HTML snippets for offline re-training (gzipped; directory not yet present in the repository) |
+| `stats/` | Dataset statistics and coverage reports |
+
+## SFT Format
+
+Each line of a JSONL file in `qa-pairs/` is one JSON object in the SFT format below (pretty-printed here for readability; stored on a single line):
+```json
+{
+  "id": "uuid",
+  "source": "crawler:vendor-name:url",
+  "kind": "sft-jsonl",
+  "crawled_at": "2026-04-28T...",
+  "confidence": 0.92,
+  "messages": [
+    {"role": "system", "content": "You are TIP_LLM..."},
+    {"role": "user", "content": "Extract transceiver specs from: ..."},
+    {"role": "assistant", "content": "{\"part_number\": \"...\", ...}"}
+  ]
+}
+```
+
+## Stats
+
+`stats/dataset-stats.json` is updated automatically after each crawler run.
diff --git a/crawl-extractions/.gitkeep b/crawl-extractions/.gitkeep
new file mode 100644
index 0000000..887dacf
--- /dev/null
+++ b/crawl-extractions/.gitkeep
@@ -0,0 +1 @@
+# crawl-extractions — raw LLM extractions from vendor product pages
diff --git a/qa-pairs/.gitkeep b/qa-pairs/.gitkeep
new file mode 100644
index 0000000..94c6654
--- /dev/null
+++ b/qa-pairs/.gitkeep
@@ -0,0 +1 @@
+# qa-pairs — SFT training pairs for TIPLLM fine-tuning
diff --git a/stats/dataset-stats.json b/stats/dataset-stats.json
new file mode 100644
index 0000000..105f9ac
--- /dev/null
+++ b/stats/dataset-stats.json
@@ -0,0 +1,8 @@
+{
+  "total_extractions": 0,
+  "validated_specs": 0,
+  "qa_pairs": 0,
+  "vendors_covered": [],
+  "confidence_distribution": {"high": 0, "medium": 0, "low": 0},
+  "last_updated": "2026-04-28T00:00:00Z"
+}
diff --git a/validated-specs/.gitkeep b/validated-specs/.gitkeep
new file mode 100644
index 0000000..3d4dc80
--- /dev/null
+++ b/validated-specs/.gitkeep
@@ -0,0 +1 @@
+# validated-specs — confidence >= 0.7 validated transceiver specs