From 2052d87ba11274507aaf48dc2542cd6f5d66c2e4 Mon Sep 17 00:00:00 2001
From: Rene Fichtmueller
Date: Thu, 26 Mar 2026 06:28:48 +1300
Subject: [PATCH] =?UTF-8?q?feat:=20initial=20release=20=E2=80=94=20AI=20do?=
=?UTF-8?q?cument=20intelligence=20for=20Paperless-ngx?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
PaperCortex adds semantic search, auto-classification, receipt extraction,
bank statement matching, and DATEV export to Paperless-ngx — powered
entirely by local AI through Ollama. Exposes everything as an MCP Server
for Claude Code and AI agent integration.
- MCP Server with 5 tools (search, classify, receipt, query, export)
- Local Ollama embeddings for semantic document search
- Receipt data extraction (vendor, amount, date, tax, line items)
- DATEV Buchungsstapel CSV export for German accounting
- Bank CSV transaction matching
- Paperless-ngx REST API client
- Docker deployment
- Zero cloud dependencies — 100% self-hosted
---
.env.example | 20 +
.gitignore | 35 ++
Dockerfile | 34 ++
LICENSE | 21 +
README.md | 737 +++++++++++++++++++++++++++++++
docker-compose.yml | 36 ++
docs/architecture.md | 64 +++
docs/receipts.md | 101 +++++
docs/setup.md | 107 +++++
package.json | 57 +++
src/embeddings/ollama.ts | 148 +++++++
src/embeddings/store.ts | 231 ++++++++++
src/mcp-server/index.ts | 249 +++++++++++
src/mcp-server/tools/classify.ts | 117 +++++
src/mcp-server/tools/export.ts | 116 +++++
src/mcp-server/tools/query.ts | 110 +++++
src/mcp-server/tools/receipt.ts | 76 ++++
src/mcp-server/tools/search.ts | 87 ++++
src/paperless/client.ts | 182 ++++++++
src/paperless/types.ts | 126 ++++++
src/receipt/datev.ts | 171 +++++++
src/receipt/extractor.ts | 170 +++++++
src/receipt/matcher.ts | 231 ++++++++++
src/skill/SKILL.md | 72 +++
tsconfig.json | 24 +
25 files changed, 3322 insertions(+)
create mode 100644 .env.example
create mode 100644 .gitignore
create mode 100644 Dockerfile
create mode 100644 LICENSE
create mode 100644 README.md
create mode 100644 docker-compose.yml
create mode 100644 docs/architecture.md
create mode 100644 docs/receipts.md
create mode 100644 docs/setup.md
create mode 100644 package.json
create mode 100644 src/embeddings/ollama.ts
create mode 100644 src/embeddings/store.ts
create mode 100644 src/mcp-server/index.ts
create mode 100644 src/mcp-server/tools/classify.ts
create mode 100644 src/mcp-server/tools/export.ts
create mode 100644 src/mcp-server/tools/query.ts
create mode 100644 src/mcp-server/tools/receipt.ts
create mode 100644 src/mcp-server/tools/search.ts
create mode 100644 src/paperless/client.ts
create mode 100644 src/paperless/types.ts
create mode 100644 src/receipt/datev.ts
create mode 100644 src/receipt/extractor.ts
create mode 100644 src/receipt/matcher.ts
create mode 100644 src/skill/SKILL.md
create mode 100644 tsconfig.json
diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..670512c
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,20 @@
+# PaperCortex Configuration
+# Copy this file to .env and fill in your values
+
+# Paperless-ngx connection
+PAPERLESS_URL=http://localhost:8000
+PAPERLESS_TOKEN=your-paperless-api-token-here
+
+# Ollama connection
+OLLAMA_URL=http://localhost:11434
+OLLAMA_MODEL=qwen2.5:14b
+OLLAMA_EMBEDDING_MODEL=nomic-embed-text
+
+# Vector store
+VECTOR_DB_PATH=./data/vectors.db
+
+# MCP Server
+MCP_SERVER_PORT=3100
+
+# Logging
+LOG_LEVEL=info
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..68acf58
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,35 @@
+# Dependencies
+node_modules/
+
+# Build output
+dist/
+
+# Environment files
+.env
+.env.local
+.env.*.local
+
+# Data directory (vectors, cache)
+data/
+
+# OS files
+.DS_Store
+Thumbs.db
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# Logs
+logs/
+*.log
+npm-debug.log*
+
+# Test coverage
+coverage/
+
+# Temporary files
+tmp/
+temp/
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..149e5f8
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,34 @@
+FROM node:22-alpine AS builder
+# Build stage: installs all dependencies and runs the build (output in /app/dist).
+WORKDIR /app
+# package-lock.json is optional in COPY (glob); `npm ci` fails without one, so fall back.
+COPY package.json package-lock.json* ./
+RUN if [ -f package-lock.json ]; then npm ci; else npm install; fi
+
+COPY tsconfig.json ./
+COPY src/ ./src/
+RUN npm run build
+
+# --- Production image ---
+FROM node:22-alpine
+
+WORKDIR /app
+# Create an unprivileged system user and group (id 1001) to run the server.
+RUN addgroup -g 1001 -S papercortex && \
+    adduser -S papercortex -u 1001
+# Runtime dependencies only; same lockfile fallback as in the build stage.
+COPY package.json package-lock.json* ./
+RUN if [ -f package-lock.json ]; then npm ci --omit=dev; else npm install --omit=dev; fi && npm cache clean --force
+
+COPY --from=builder /app/dist ./dist
+# Data directory that holds VECTOR_DB_PATH, owned by the runtime user.
+RUN mkdir -p /app/data && chown papercortex:papercortex /app/data
+
+USER papercortex
+
+ENV NODE_ENV=production
+ENV VECTOR_DB_PATH=/app/data/vectors.db
+
+EXPOSE 3100
+
+CMD ["node", "dist/mcp-server/index.js"]
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..a2b6e2c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 PaperCortex Contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f8fa4e7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,737 @@
+
+
+<h1 align="center">PaperCortex</h1>
+
+ AI-Powered Document Intelligence for Paperless-ngx
+ Semantic search, auto-classification, receipt extraction, and accounting export — 100% local, 100% private.
+
+
+
+
+
+
+
+
+
+
+
+
+ Quick Start · Features · MCP Tools · Receipts · Docs
+
+
+
+---
+
+## What is PaperCortex?
+
+**PaperCortex** turns your [Paperless-ngx](https://github.com/paperless-ngx/paperless-ngx) document archive into an intelligent, queryable knowledge base — powered entirely by local AI running on your own hardware.
+
+If you use Paperless-ngx to store invoices, receipts, contracts, tax documents, letters, or any other scanned paperwork, PaperCortex adds the intelligence layer that Paperless-ngx is missing:
+
+- **Ask questions in plain English** — "Show me all invoices from Amazon over 100 EUR in 2025"
+- **Find documents by meaning**, not just keywords — searching for "office rent" finds "Büromiete" and "monthly lease payment"
+- **Auto-tag and classify** every new document the moment it arrives
+- **Extract structured data from receipts** — vendor, date, amount, tax rate, line items
+- **Match receipts to bank transactions** automatically
+- **Export to DATEV** for your German tax advisor — or plain CSV for any accounting software
+
+Everything runs locally through [Ollama](https://ollama.com). No document content ever leaves your network. No cloud APIs. No subscriptions. No data harvesting.
+
+PaperCortex exposes all capabilities as an **[MCP (Model Context Protocol)](https://modelcontextprotocol.io) Server**, making it a first-class tool for [Claude Code](https://docs.anthropic.com/en/docs/claude-code), AI coding agents, and automated workflows.
+
+---
+
+## The Problem
+
+Paperless-ngx is an outstanding document management system with 37,000+ GitHub stars. It handles scanning, OCR, storage, and basic tagging beautifully. But once your documents are in Paperless-ngx, finding and working with them has real limitations:
+
+| What you want to do | Paperless-ngx alone | With PaperCortex |
+|---|---|---|
+| Find a document by what it's about | Keyword search only — misses synonyms, translations, related concepts | **Semantic search** understands meaning across languages |
+| Classify incoming documents | Manual rules or basic auto-matching | **LLM-powered classification** understands document content |
+| Extract data from a receipt | Read it yourself and type it in | **Automatic extraction** of vendor, amount, date, tax, line items |
+| Answer "How much did I spend on X?" | Export everything, open spreadsheet, filter manually | **Natural language query** returns the answer instantly |
+| Send receipt data to accounting | Manual data entry or copy-paste | **One-click DATEV/CSV export** ready for your tax advisor |
+| Use documents in AI workflows | No API integration for AI agents | **Full MCP Server** for Claude Code and any MCP-compatible agent |
+| Keep data private | Self-hosted (good!) | Self-hosted AI too — **zero cloud dependency** |
+
+---
+
+## Features
+
+### Semantic Document Search
+
+Traditional keyword search fails when you don't remember the exact words. PaperCortex generates vector embeddings for every document using local Ollama models and stores them in a lightweight SQLite vector database.
+
+**Search by meaning, not by memory:**
+- Search for `"electricity bill"` → finds documents containing "Stromrechnung", "utility payment", "power invoice"
+- Search for `"office supplies"` → finds "Büroausstattung", "paper and toner", "desk accessories order"
+- Search for `"tax deductible travel"` → finds flight bookings, hotel receipts, train tickets, taxi invoices
+
+**Supported embedding models:**
+- `nomic-embed-text` (recommended — fast, accurate, 768 dimensions)
+- `mxbai-embed-large` (higher accuracy, slower)
+- Any Ollama-compatible embedding model
+
+### Automatic Document Classification
+
+Every new document arriving in Paperless-ngx gets analyzed by a local LLM that reads the OCR content and assigns:
+
+- **Document type** — Invoice, Receipt, Contract, Letter, Statement, Tax Document, Certificate
+- **Tags** — Contextual tags based on content (e.g., "office", "travel", "insurance", "subscription")
+- **Correspondent** — Identifies the sender/vendor from document content
+- **Date extraction** — Finds the document date (not just the scan date)
+- **Language detection** — Identifies the document language
+
+Classification runs asynchronously in the background. New documents are processed within minutes of arriving in Paperless-ngx.
+
+### Receipt Intelligence
+
+PaperCortex includes a dedicated receipt processing pipeline optimized for expense management:
+
+**Data extraction from receipts and invoices:**
+- Vendor / merchant name and address
+- Date of purchase
+- Total amount (gross and net)
+- Tax rate and tax amount (supports multiple VAT rates)
+- Currency
+- Individual line items with quantities and prices
+- Payment method
+- Invoice/receipt number
+
+**Works with:**
+- Scanned paper receipts (via Paperless-ngx OCR)
+- Digital PDF invoices
+- Photographed receipts (mobile upload to Paperless-ngx)
+- Multi-page invoices
+- Receipts in German, English, French, Spanish, and other languages
+
+### Bank Statement Matching
+
+Import your bank statement as CSV and let PaperCortex automatically match transactions to receipts:
+
+- **Fuzzy matching** on amount, date, and vendor name
+- **Confidence scoring** — high/medium/low match indicators
+- **Unmatched detection** — highlights receipts without matching transactions and vice versa
+- **Multi-currency support** — handles EUR, USD, GBP, CHF, and 20+ currencies
+
+### DATEV Export
+
+For German businesses and freelancers, PaperCortex generates DATEV-compatible export files that your Steuerberater can import directly:
+
+- **DATEV CSV format** (Buchungsstapel) — the standard German accounting import format
+- **SKR03 / SKR04** account mapping
+- **Automatic account assignment** based on document classification
+- **Beleglink** — links each DATEV entry back to the original document in Paperless-ngx
+- **Period exports** — monthly, quarterly, or annual
+
+Also supports plain CSV export for use with any accounting software worldwide.
+
+### Natural Language Queries
+
+Ask questions about your document archive in plain language:
+
+```
+"How much did I spend on hotels in Q1 2025?"
+"Show me all contracts expiring this year"
+"What was my highest single expense last month?"
+"Find all invoices from Deutsche Telekom"
+"Which receipts don't have a matching bank transaction?"
+"Summarize my office supply spending trend over the last 12 months"
+```
+
+PaperCortex translates natural language into document queries, retrieves relevant documents via semantic search, and uses the local LLM to synthesize answers with source references.
+
+### MCP Server Integration
+
+PaperCortex implements the [Model Context Protocol (MCP)](https://modelcontextprotocol.io) — the open standard for connecting AI agents to external tools. This means any MCP-compatible AI agent can use your document archive as a knowledge source.
+
+**Compatible with:**
+- [Claude Code](https://docs.anthropic.com/en/docs/claude-code) (Anthropic)
+- [Claude Desktop](https://claude.ai)
+- Any MCP-compatible AI agent or IDE plugin
+- Custom AI workflows via the MCP SDK
+
+---
+
+## Feature Comparison
+
+| Feature | PaperCortex | paperless-ai | Veryfi | Taggun | Rossum |
+|---|:---:|:---:|:---:|:---:|:---:|
+| Fully self-hosted | :white_check_mark: | :white_check_mark: | :x: | :x: | :x: |
+| Local AI (no cloud API) | :white_check_mark: | :x: OpenAI | :x: | :x: | :x: |
+| Semantic search | :white_check_mark: | :x: | :x: | :x: | :x: |
+| Auto-classification | :white_check_mark: | :white_check_mark: | :x: | :x: | :white_check_mark: |
+| Receipt data extraction | :white_check_mark: | :x: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
+| Bank statement matching | :white_check_mark: | :x: | :x: | :x: | :x: |
+| DATEV export | :white_check_mark: | :x: | :x: | :x: | :x: |
+| CSV accounting export | :white_check_mark: | :x: | :white_check_mark: | :x: | :white_check_mark: |
+| MCP Server | :white_check_mark: | :x: | :x: | :x: | :x: |
+| Natural language queries | :white_check_mark: | :x: | :x: | :x: | :x: |
+| Multi-language documents | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
+| Free and open source | :white_check_mark: | :white_check_mark: | :x: $$$ | :x: $$$ | :x: $$$$ |
+| Privacy — data stays local | :white_check_mark: | :warning: API calls | :x: | :x: | :x: |
+| Works with Paperless-ngx | :white_check_mark: | :white_check_mark: | :x: | :x: | :x: |
+
+---
+
+## Architecture
+
+```
+┌─────────────────────┐         ┌──────────────────────────┐         ┌────────────────────┐
+│                     │         │                          │         │                    │
+│   Claude Code /     │   MCP   │       PaperCortex        │  REST   │   Paperless-ngx    │
+│   AI Agents /       ├────────►│                          ├────────►│                    │
+│   Automation        │         │   ┌──────────────────┐   │   API   │   OCR + Storage +  │
+│                     │         │   │  MCP Server      │   │         │   Tagging          │
+└─────────────────────┘         │   │  (stdio / HTTP)  │   │         │                    │
+                                │   └──────────────────┘   │         └────────────────────┘
+                                │                          │
+                                │   ┌──────────────────┐   │         ┌────────────────────┐
+                                │   │  Intelligence    │   │         │                    │
+                                │   │  Layer           │   │   LLM   │   Ollama           │
+                                │   │                  ├────────────►│                    │
+                                │   │  - Classifier    │   │   API   │   qwen2.5 / llama3 │
+                                │   │  - Extractor     │   │         │   nomic-embed-text │
+                                │   │  - Query Engine  │   │         │                    │
+                                │   └──────────────────┘   │         └────────────────────┘
+                                │                          │
+                                │   ┌──────────────────┐   │
+                                │   │  Vector Store    │   │
+                                │   │  (SQLite)        │   │
+                                │   └──────────────────┘   │
+                                │                          │
+                                └──────────────────────────┘
+```
+
+### How It Works
+
+1. **Documents arrive** in Paperless-ngx through scanning, email, or manual upload
+2. **PaperCortex polls** the Paperless-ngx API for new and updated documents
+3. **Embedding generation** — Ollama creates vector embeddings from OCR text
+4. **Classification** — the local LLM analyzes content and assigns types, tags, and metadata
+5. **Storage** — embeddings and extracted data are stored in a local SQLite vector database
+6. **Query interface** — the MCP Server exposes search, classify, extract, query, and export tools
+7. **AI agents connect** via MCP and interact with your documents using natural language
+
+All processing happens on your hardware. The only network traffic is between PaperCortex and your local Paperless-ngx and Ollama instances.
+
+---
+
+## Quick Start
+
+### Prerequisites
+
+- **[Docker](https://docs.docker.com/get-docker/)** and Docker Compose
+- **[Paperless-ngx](https://github.com/paperless-ngx/paperless-ngx)** — running instance with API access
+- **[Ollama](https://ollama.com)** — running locally or on your network
+
+**Pull the required Ollama models:**
+
+```bash
+ollama pull qwen2.5:14b # LLM for classification, extraction, queries
+ollama pull nomic-embed-text # Embedding model for semantic search
+```
+
+### Option 1: Docker Compose (Recommended)
+
+```bash
+git clone https://github.com/renefichtmueller/PaperCortex.git
+cd PaperCortex
+cp .env.example .env
+```
+
+Edit `.env` with your configuration:
+
+```env
+PAPERLESS_URL=http://your-paperless-instance:8000
+PAPERLESS_TOKEN=your-paperless-api-token
+OLLAMA_URL=http://your-ollama-host:11434
+OLLAMA_MODEL=qwen2.5:14b
+OLLAMA_EMBEDDING_MODEL=nomic-embed-text
+```
+
+Start PaperCortex:
+
+```bash
+docker compose up -d
+```
+
+PaperCortex will begin indexing your existing documents automatically.
+
+### Option 2: Manual Installation
+
+```bash
+git clone https://github.com/renefichtmueller/PaperCortex.git
+cd PaperCortex
+npm install
+cp .env.example .env
+# Edit .env with your settings
+npm run build
+npm start
+```
+
+### Option 3: npx (MCP Server only)
+
+```bash
+npx papercortex --paperless-url http://localhost:8000 --paperless-token YOUR_TOKEN
+```
+
+---
+
+## MCP Server Tools
+
+PaperCortex exposes five MCP tools that AI agents can call:
+
+### `papercortex_search` — Semantic Document Search
+
+Find documents by meaning, not just keywords.
+
+```json
+{
+ "tool": "papercortex_search",
+ "arguments": {
+ "query": "electricity bills from last winter",
+ "limit": 10,
+ "date_from": "2024-12-01",
+ "date_to": "2025-02-28"
+ }
+}
+```
+
+**Returns:** Ranked list of documents with relevance scores, titles, dates, and Paperless-ngx document IDs.
+
+### `papercortex_classify` — Auto-Classification
+
+Analyze a document and assign type, tags, and metadata.
+
+```json
+{
+ "tool": "papercortex_classify",
+ "arguments": {
+ "document_id": 1234,
+ "apply": true
+ }
+}
+```
+
+**Returns:** Suggested document type, tags, correspondent, and confidence scores. Set `apply: true` to write classifications back to Paperless-ngx.
+
+### `papercortex_receipt` — Receipt Data Extraction
+
+Extract structured financial data from receipts and invoices.
+
+```json
+{
+ "tool": "papercortex_receipt",
+ "arguments": {
+ "document_id": 5678
+ }
+}
+```
+
+**Returns:**
+```json
+{
+ "vendor": "Amazon EU S.a.r.l.",
+ "date": "2025-03-15",
+ "total_gross": 119.99,
+ "total_net": 100.83,
+ "tax_rate": 19,
+ "tax_amount": 19.16,
+ "currency": "EUR",
+ "items": [
+ { "description": "USB-C Hub", "quantity": 1, "price": 49.99 },
+ { "description": "Monitor Arm", "quantity": 1, "price": 70.00 }
+ ],
+ "invoice_number": "INV-DE-2025-1234567"
+}
+```
+
+### `papercortex_query` — Natural Language Questions
+
+Ask questions about your entire document archive.
+
+```json
+{
+ "tool": "papercortex_query",
+ "arguments": {
+ "question": "How much did I spend on business travel in Q1 2025?"
+ }
+}
+```
+
+**Returns:** A natural language answer with source document references and a breakdown of the calculation.
+
+### `papercortex_export` — Accounting Export
+
+Export extracted receipt data in accounting-ready formats.
+
+```json
+{
+ "tool": "papercortex_export",
+ "arguments": {
+ "format": "datev",
+ "date_from": "2025-01-01",
+ "date_to": "2025-03-31",
+ "account_plan": "SKR03"
+ }
+}
+```
+
+**Supported formats:** `datev` (German standard), `csv` (universal), `json` (programmatic).
+
+---
+
+## Claude Code Integration
+
+### Register as MCP Server
+
+Add to your `~/.claude.json` (user scope) or to a `.mcp.json` file in your project root (project scope):
+
+```json
+{
+ "mcpServers": {
+ "papercortex": {
+ "command": "node",
+ "args": ["./dist/mcp-server/index.js"],
+ "cwd": "/path/to/PaperCortex",
+ "env": {
+ "PAPERLESS_URL": "http://localhost:8000",
+ "PAPERLESS_TOKEN": "your-token",
+ "OLLAMA_URL": "http://localhost:11434"
+ }
+ }
+ }
+}
+```
+
+### Example Conversations
+
+Once connected, you can ask Claude Code about your documents naturally:
+
+```
+You: Search my documents for anything related to the office lease renewal
+
+Claude: I found 4 relevant documents:
+ 1. "Mietvertrag Verlängerung 2025" (Score: 0.94) — Document #1234
+ 2. "Office Lease Agreement Amendment" (Score: 0.91) — Document #1235
+ 3. "Nebenkostenabrechnung 2024" (Score: 0.78) — Document #1240
+ 4. "Facilities Management Invoice" (Score: 0.72) — Document #1251
+```
+
+```
+You: Extract the receipt data from document #5678 and export it for DATEV
+
+Claude: Extracted receipt data:
+ Vendor: Deutsche Bahn AG
+ Date: 2025-03-20
+ Amount: 89.90 EUR (net: 75.55 EUR, 19% VAT: 14.35 EUR)
+ Description: ICE Frankfurt-Berlin, 1st class
+
+ DATEV export saved to: exports/datev_2025_03.csv
+```
+
+```
+You: How much did I spend on cloud services this year?
+
+Claude: Based on 23 matching documents, your cloud service spending in 2025:
+ - AWS: 2,340.00 EUR (12 invoices)
+ - Hetzner: 456.00 EUR (3 invoices)
+ - Cloudflare: 240.00 EUR (3 invoices)
+ - Vercel: 180.00 EUR (3 invoices)
+ - GitHub: 132.00 EUR (2 invoices)
+ Total: 3,348.00 EUR
+```
+
+---
+
+## Receipt Workflow
+
+### End-to-End Receipt Processing
+
+```
+┌──────────┐    ┌─────────────┐    ┌──────────────┐    ┌──────────┐    ┌──────────┐
+│  Scan /  │    │ Paperless-  │    │ PaperCortex  │    │  Match   │    │  Export  │
+│  Photo / ├───►│ ngx         ├───►│ Receipt      ├───►│  Bank    ├───►│  DATEV / │
+│  Email   │    │ OCR+Store   │    │ Extraction   │    │  CSV     │    │  CSV     │
+└──────────┘    └─────────────┘    └──────────────┘    └──────────┘    └──────────┘
+```
+
+### CLI Commands
+
+```bash
+# Process all unprocessed receipts
+npm run receipt:process
+
+# Extract data from a specific document
+npm run receipt:extract -- --document-id 1234
+
+# Import bank statement and match transactions
+npm run receipt:match -- --bank-csv ./bank_export_2025_q1.csv
+
+# Export matched data as DATEV
+npm run receipt:export -- --format datev --period 2025-Q1
+
+# Export as plain CSV
+npm run receipt:export -- --format csv --period 2025-03
+```
+
+### DATEV Integration Details
+
+The DATEV export generates a `Buchungsstapel` CSV file following the official DATEV format specification:
+
+- **Header row** with advisor number, client number, fiscal year start, and export period
+- **Transaction rows** with amount, debit/credit account, tax code, date, and booking text
+- **Beleglink** — each row includes a reference to the source document in Paperless-ngx
+- **Account mapping** — automatic assignment based on vendor and document type (configurable)
+- **SKR03 and SKR04** chart of accounts supported
+
+---
+
+## Privacy and Security
+
+### Why Local AI Matters
+
+Your documents contain some of the most sensitive data in your life:
+
+- **Tax returns** with income, deductions, and financial details
+- **Contracts** with confidential terms and personal information
+- **Medical bills** with health information
+- **Bank statements** with account numbers and transaction history
+- **Personal correspondence** with private content
+
+Cloud-based document AI services require uploading this data to external servers for processing. Even with encryption and privacy policies, you are trusting a third party with your most sensitive information.
+
+**PaperCortex takes a fundamentally different approach:**
+
+- All AI processing runs on **your hardware** via Ollama
+- Document content is sent only to **your local Ollama instance**
+- Embeddings and extracted data are stored in a **local SQLite database**
+- The only network traffic is between PaperCortex, your Paperless-ngx instance, and your Ollama server
+- **No telemetry, no analytics, no external API calls**
+
+**Your documents stay in your network. Period.**
+
+### Security Best Practices
+
+- Store the Paperless-ngx API token in environment variables, never in source code
+- Run PaperCortex on the same network as Paperless-ngx and Ollama
+- Use Docker networks to isolate services
+- Regularly update Ollama and PaperCortex for security patches
+
+---
+
+## Configuration Reference
+
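+All configuration is done through environment variables. See `.env.example` for a starter template; it covers the core settings, and the remaining variables documented below can be added to `.env` as needed.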
+
+### Core Settings
+
+| Variable | Default | Description |
+|---|---|---|
+| `PAPERLESS_URL` | `http://localhost:8000` | Paperless-ngx instance URL |
+| `PAPERLESS_TOKEN` | *(required)* | Paperless-ngx API authentication token |
+| `OLLAMA_URL` | `http://localhost:11434` | Ollama API endpoint |
+| `OLLAMA_MODEL` | `qwen2.5:14b` | LLM model for classification and extraction |
+| `OLLAMA_EMBEDDING_MODEL` | `nomic-embed-text` | Embedding model for semantic search |
+| `VECTOR_DB_PATH` | `./data/vectors.db` | Path to the SQLite vector database |
+
+### Processing Settings
+
+| Variable | Default | Description |
+|---|---|---|
+| `POLL_INTERVAL` | `300` | Seconds between polling Paperless-ngx for new documents |
+| `BATCH_SIZE` | `10` | Number of documents to process per batch |
+| `EMBEDDING_DIMENSIONS` | `768` | Vector dimensions (must match embedding model) |
+| `CLASSIFICATION_CONFIDENCE` | `0.7` | Minimum confidence to auto-apply classifications |
+
+### Export Settings
+
+| Variable | Default | Description |
+|---|---|---|
+| `DATEV_ADVISOR_NUMBER` | *(optional)* | Steuerberater number for DATEV export header |
+| `DATEV_CLIENT_NUMBER` | *(optional)* | Mandantennummer for DATEV export header |
+| `DATEV_FISCAL_YEAR_START` | `01-01` | Fiscal year start (MM-DD) |
+| `DEFAULT_ACCOUNT_PLAN` | `SKR03` | Default chart of accounts (`SKR03` or `SKR04`) |
+| `EXPORT_DIR` | `./exports` | Directory for generated export files |
+
+### MCP Server Settings
+
+| Variable | Default | Description |
+|---|---|---|
+| `MCP_TRANSPORT` | `stdio` | MCP transport mode (`stdio` or `http`) |
+| `MCP_PORT` | `3100` | Port for HTTP transport mode |
+| `MCP_AUTH_TOKEN` | *(optional)* | Bearer token for HTTP transport authentication |
+
+---
+
+## Supported Models
+
+PaperCortex works with any Ollama-compatible model. Recommended configurations:
+
+### For Classification and Extraction
+
+| Model | VRAM | Speed | Quality | Recommended For |
+|---|---|---|---|---|
+| `qwen2.5:7b` | 5 GB | Fast | Good | Raspberry Pi, low-end servers |
+| `qwen2.5:14b` | 10 GB | Medium | Very Good | Most homelab setups |
+| `qwen2.5:32b` | 20 GB | Slow | Excellent | High-accuracy requirements |
+| `llama3.1:8b` | 5 GB | Fast | Good | Alternative to Qwen |
+| `mistral:7b` | 5 GB | Fast | Good | European language focus |
+
+### For Embeddings
+
+| Model | Dimensions | Speed | Quality |
+|---|---|---|---|
+| `nomic-embed-text` | 768 | Very Fast | Very Good |
+| `mxbai-embed-large` | 1024 | Fast | Excellent |
+| `all-minilm` | 384 | Fastest | Good |
+
+---
+
+## Project Structure
+
+```
+PaperCortex/
+├── src/
+│   ├── mcp-server/              # MCP Server for AI agent integration
+│   │   ├── index.ts             # Server entry point and tool registration
+│   │   └── tools/
+│   │       ├── search.ts        # Semantic document search tool
+│   │       ├── classify.ts      # Auto-classification tool
+│   │       ├── receipt.ts       # Receipt data extraction tool
+│   │       ├── query.ts         # Natural language query tool
+│   │       └── export.ts        # DATEV/CSV export tool
+│   ├── embeddings/
+│   │   ├── ollama.ts            # Ollama embedding API client
+│   │   └── store.ts             # SQLite vector store (brute-force cosine similarity search)
+│   ├── paperless/
+│   │   ├── client.ts            # Paperless-ngx REST API client
+│   │   └── types.ts             # TypeScript type definitions
+│   └── receipt/
+│       ├── extractor.ts         # Receipt OCR content parsing and extraction
+│       ├── matcher.ts           # Bank CSV transaction matching engine
+│       └── datev.ts             # DATEV Buchungsstapel CSV formatter
+├── docs/
+│   ├── architecture.md          # Detailed architecture documentation
+│   ├── setup.md                 # Step-by-step installation guide
+│   └── receipts.md              # Receipt workflow documentation
+├── docker-compose.yml           # Production deployment
+├── Dockerfile                   # Container build
+├── .env.example                 # Configuration template (no secrets!)
+├── package.json
+├── tsconfig.json
+└── LICENSE                      # MIT
+```
+
+---
+
+## Roadmap
+
+- [x] Core MCP Server with 5 tools
+- [x] Paperless-ngx API client
+- [x] Ollama embedding generation
+- [x] SQLite vector store
+- [x] Receipt data extraction
+- [x] DATEV export
+- [x] Docker deployment
+- [ ] Bank CSV matching engine
+- [ ] Web dashboard UI
+- [ ] Webhook support (instant processing on document arrival)
+- [ ] Multi-user support with separate vector stores
+- [ ] Additional export formats (SKR04 mapping, FiBu, CSV+)
+- [ ] Ollama vision model support for direct image analysis
+- [ ] Automated document workflow triggers
+- [ ] Plugin system for custom extractors
+- [ ] Prometheus metrics endpoint
+
+---
+
+## Contributing
+
+Contributions are welcome! PaperCortex is early-stage and there are many ways to help:
+
+### Getting Started
+
+```bash
+git clone https://github.com/renefichtmueller/PaperCortex.git
+cd PaperCortex
+npm install
+cp .env.example .env
+# Edit .env with your local Paperless-ngx and Ollama settings
+npm run dev
+```
+
+### How to Contribute
+
+1. **Fork** the repository
+2. **Create** a feature branch (`git checkout -b feat/amazing-feature`)
+3. **Write tests** for your changes
+4. **Commit** using conventional commits (`feat:`, `fix:`, `docs:`, `refactor:`)
+5. **Push** and open a Pull Request
+
+### Areas Where Help is Needed
+
+| Area | Description | Difficulty |
+|---|---|---|
+| **Bank CSV Parsers** | Add parsers for different bank export formats (Sparkasse, ING, N26, Revolut, etc.) | Easy |
+| **Export Formats** | Additional accounting export formats beyond DATEV | Medium |
+| **Web Dashboard** | Build a simple web UI for browsing indexed documents and extracted data | Medium |
+| **Multi-language** | Improve extraction accuracy for non-English/German receipts | Medium |
+| **Vision Models** | Use Ollama vision models to extract data directly from receipt images | Hard |
+| **Webhooks** | React to Paperless-ngx document events in real-time | Medium |
+
+---
+
+## Frequently Asked Questions
+
+**Q: Does PaperCortex modify my documents in Paperless-ngx?**
+A: By default, PaperCortex only reads documents. When you use the `classify` tool with `apply: true`, it can write tags, document types, and correspondents back to Paperless-ngx. Extraction results and embeddings are stored in PaperCortex's own database.
+
+**Q: How much disk space does the vector database need?**
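+A: It depends on the embedding model: a 768-dimension vector stored as 32-bit floats is about 3 KB per document, so a collection of 10,000 documents needs roughly 30 MB of vector storage (less with smaller models such as `all-minilm`).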
+
+**Q: Can I use OpenAI instead of Ollama?**
+A: PaperCortex is designed for local-first operation with Ollama. Support for OpenAI-compatible APIs (including local alternatives like LM Studio, vLLM, or LocalAI) is on the roadmap.
+
+**Q: What Paperless-ngx version is required?**
+A: PaperCortex works with Paperless-ngx 2.0 and later (REST API v3+).
+
+**Q: Can I run PaperCortex on a Raspberry Pi?**
+A: PaperCortex itself is lightweight. The bottleneck is Ollama — you'll need a model that fits in your available RAM. `qwen2.5:7b` works on 8GB devices.
+
+**Q: Is DATEV export only for Germany?**
+A: The DATEV format is the German standard, but PaperCortex also exports plain CSV that works with any accounting software worldwide.
+
+---
+
+## License
+
+MIT License — see [LICENSE](LICENSE) for details.
+
+Free to use, modify, and distribute. Commercial use welcome.
+
+---
+
+## Acknowledgments
+
+Built on the shoulders of giants:
+
+- **[Paperless-ngx](https://github.com/paperless-ngx/paperless-ngx)** — The incredible open-source document management system (37k+ stars)
+- **[Ollama](https://ollama.com)** — Making local AI accessible to everyone
+- **[Model Context Protocol](https://modelcontextprotocol.io)** — The open standard for AI tool integration by Anthropic
+- **[better-sqlite3](https://github.com/WiseLibs/better-sqlite3)** — Fast, reliable SQLite bindings for Node.js
+
+---
+
+## Star History
+
+If PaperCortex is useful to you, please consider giving it a star — it helps others discover the project!
+
+---
+
+
+ Your documents. Your AI. Your hardware.
+ No cloud required.
+
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..81fba7b
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,36 @@
+# PaperCortex plus a bundled Ollama instance.
+services:
+  papercortex:
+    build: .
+    container_name: papercortex
+    restart: unless-stopped
+    ports:
+      - "3100:3100"
+    volumes:
+      - papercortex-data:/app/data
+    env_file:
+      - .env
+    environment:
+      - NODE_ENV=production
+      # Inside the Compose network Ollama is reachable by its service name;
+      # "localhost" would point back at this container. Values listed under
+      # `environment` take precedence over the same keys in `env_file`.
+      - OLLAMA_URL=http://ollama:11434
+      # NOTE: PAPERLESS_URL in .env must likewise be reachable from inside
+      # the container (a host name or IP, not localhost).
+    depends_on:
+      - ollama
+
+  ollama:
+    image: ollama/ollama:latest
+    container_name: papercortex-ollama
+    restart: unless-stopped
+    ports:
+      - "11434:11434"
+    volumes:
+      - ollama-models:/root/.ollama
+    # Uncomment for NVIDIA GPU support:
+    # deploy:
+    #   resources:
+    #     reservations:
+    #       devices:
+    #         - driver: nvidia
+    #           count: all
+    #           capabilities: [gpu]
+
+volumes:
+  papercortex-data:
+  ollama-models:
diff --git a/docs/architecture.md b/docs/architecture.md
new file mode 100644
index 0000000..420cc56
--- /dev/null
+++ b/docs/architecture.md
@@ -0,0 +1,64 @@
+# Architecture
+
+## Overview
+
+PaperCortex is structured as three layers:
+
+1. **MCP Server Layer** -- Exposes tools via the Model Context Protocol for AI agent integration.
+2. **Intelligence Layer** -- Embedding generation, classification, receipt extraction, and query answering.
+3. **Data Layer** -- Paperless-ngx API client and local SQLite vector store.
+
+## Components
+
+### MCP Server (`src/mcp-server/`)
+
+The entry point for all AI agent interactions. Implements the MCP standard using `@modelcontextprotocol/sdk` and communicates via stdio transport.
+
+Each tool is implemented as a separate handler module under `src/mcp-server/tools/`.
+
+### Embeddings (`src/embeddings/`)
+
+- **ollama.ts** -- Client for the Ollama API. Handles embedding generation and LLM completions.
+- **store.ts** -- SQLite-backed vector store using `better-sqlite3`. Stores document embeddings and supports cosine similarity search.
+
+Current implementation uses brute-force search, which is performant up to ~100k documents. For larger archives, consider migrating to `sqlite-vss` or a dedicated vector database.
+
+### Paperless Integration (`src/paperless/`)
+
+- **client.ts** -- REST API client for Paperless-ngx. Supports document CRUD, search, tags, correspondents, and document types.
+- **types.ts** -- TypeScript type definitions matching the Paperless-ngx API v3+ schema.
+
+### Receipt Processing (`src/receipt/`)
+
+- **extractor.ts** -- Uses LLM to extract structured data from receipt OCR text.
+- **matcher.ts** -- Matches extracted receipts against bank CSV transaction exports.
+- **datev.ts** -- Generates DATEV Buchungsstapel format CSV for German accounting software.
+
+## Data Flow
+
+```
+Paperless-ngx --(REST API)--> PaperCortex --(Ollama API)--> Ollama
+ |
+ v
+ SQLite Vector DB
+ |
+ v
+ MCP Server (stdio)
+ |
+ v
+ Claude Code / AI Agents
+```
+
+## Security Model
+
+- All data stays local -- no external API calls except to Paperless-ngx and Ollama (both self-hosted).
+- API tokens are read from environment variables, never hardcoded.
+- The SQLite database is stored on the local filesystem with configurable path.
+- MCP Server communicates via stdio (no network port required for MCP).
+
+## Future Considerations
+
+- **Webhook support** -- Listen for Paperless-ngx webhooks to auto-process new documents.
+- **Plugin system** -- Allow custom extractors and exporters.
+- **Web dashboard** -- Optional UI for monitoring and manual review.
+- **Multi-user** -- Support multiple Paperless-ngx instances and user isolation.
diff --git a/docs/receipts.md b/docs/receipts.md
new file mode 100644
index 0000000..2bf2bcc
--- /dev/null
+++ b/docs/receipts.md
@@ -0,0 +1,101 @@
+# Receipt Workflow
+
+## Overview
+
+PaperCortex provides a complete receipt-to-accounting pipeline:
+
+1. **Scan** -- Upload receipts to Paperless-ngx (scan, email, photo)
+2. **Extract** -- AI extracts structured data (vendor, date, amounts, line items)
+3. **Match** -- Reconcile against bank CSV exports
+4. **Export** -- Generate DATEV-compatible CSV for accounting software
+
+## Receipt Extraction
+
+### Via MCP Server (Claude Code)
+
+```
+Extract receipt data from document #1234
+```
+
+### Via CLI
+
+```bash
+npm run receipt:extract -- --document-id 1234
+```
+
+### Extracted Fields
+
+| Field | Description | Example |
+|---|---|---|
+| vendor | Company name | "IKEA Deutschland GmbH" |
+| vendorAddress | Full address | "Am Wanderweg 1, 65719 Hofheim" |
+| vendorTaxId | Tax ID / VAT number | "DE 129 341 800" |
+| date | Receipt date | "2024-03-15" |
+| currency | ISO 4217 code | "EUR" |
+| subtotal | Before tax | 84.03 |
+| taxRate | Tax percentage | 19 |
+| taxAmount | Tax amount | 15.97 |
+| totalAmount | Total with tax | 100.00 |
+| paymentMethod | How it was paid | "card" |
+| lineItems | Individual items | Array of items |
+| category | Expense category | "office_supplies" |
+
+## Bank Statement Matching
+
+Match receipts against bank CSV exports to verify which receipts correspond to which bank transactions.
+
+### Supported Bank Formats
+
+- Sparkasse (semicolon-separated, German format)
+- ING (semicolon-separated)
+- DKB (semicolon-separated)
+- Volksbank (semicolon-separated)
+- Generic CSV
+
+### Matching Algorithm
+
+1. **Amount match** -- Exact or close amount (within 1.00 tolerance)
+2. **Date proximity** -- Same day, within 3 days, or within 7 days
+3. **Vendor name** -- Partial match in transaction description
+
+Results include a confidence score (0.0 - 1.0) and match reasons.
+
+## DATEV Export
+
+### Format
+
+PaperCortex generates DATEV Buchungsstapel (posting batch) format CSV, compatible with:
+
+- DATEV Unternehmen Online
+- lexoffice
+- sevDesk
+- FastBill
+- Any DATEV-import-capable software
+
+### Account Mapping (SKR03)
+
+| Category | Account | Description |
+|---|---|---|
+| office_supplies | 4930 | Buerokosten |
+| travel | 4660 | Reisekosten |
+| food | 4650 | Bewirtungskosten |
+| telephone | 4920 | Telefon |
+| postage | 4910 | Porto |
+| rent | 4210 | Miete |
+| advertising | 4600 | Werbekosten |
+| software | 4964 | Software |
+| consulting | 4950 | Rechts- und Beratungskosten |
+| default | 4900 | Sonstige Aufwendungen |
+
+### Export via CLI
+
+```bash
+# Export all receipts from March 2024 as DATEV CSV
+npm run receipt:export -- --format datev --year 2024 --month 03
+```
+
+### Export via MCP Server
+
+```
+Export documents #100, #101, #102 as DATEV CSV
+```
diff --git a/docs/setup.md b/docs/setup.md
new file mode 100644
index 0000000..c348cbd
--- /dev/null
+++ b/docs/setup.md
@@ -0,0 +1,107 @@
+# Setup Guide
+
+## Prerequisites
+
+- **Node.js** 20+ (or Docker)
+- **Paperless-ngx** instance with API access
+- **Ollama** with required models
+
+## Step 1: Install Ollama Models
+
+```bash
+# Required: LLM for classification and extraction
+ollama pull qwen2.5:14b
+
+# Required: Embedding model for semantic search
+ollama pull nomic-embed-text
+```
+
+Verify Ollama is running:
+```bash
+curl http://localhost:11434/api/tags
+```
+
+## Step 2: Get Paperless-ngx API Token
+
+1. Open your Paperless-ngx web UI
+2. Click your username in the top-right corner and open **My Profile**
+3. Generate (or copy) the API auth token shown there
+4. Copy the token for the next step
+
+## Step 3: Configure PaperCortex
+
+```bash
+git clone https://github.com/YOUR_USERNAME/PaperCortex.git
+cd PaperCortex
+cp .env.example .env
+```
+
+Edit `.env` with your values:
+```env
+PAPERLESS_URL=http://localhost:8000
+PAPERLESS_TOKEN=
+OLLAMA_URL=http://localhost:11434
+```
+
+## Step 4: Run
+
+### Option A: Docker (Recommended)
+
+```bash
+docker compose up -d
+```
+
+### Option B: Manual
+
+```bash
+npm install
+npm run build
+npm start
+```
+
+### Option C: Development
+
+```bash
+npm install
+npm run dev
+```
+
+## Step 5: Register MCP Server
+
+Add to your Claude Code configuration (`~/.claude.json`):
+
+```json
+{
+ "mcpServers": {
+ "papercortex": {
+ "command": "node",
+ "args": ["/absolute/path/to/PaperCortex/dist/mcp-server/index.js"],
+ "env": {
+ "PAPERLESS_URL": "http://localhost:8000",
+ "PAPERLESS_TOKEN": "your-token",
+ "OLLAMA_URL": "http://localhost:11434"
+ }
+ }
+ }
+}
+```
+
+## Step 6: Populate Vector Store
+
+On first run, you need to embed your existing documents. This will be automated in a future release. For now, the vector store is populated as documents are queried or classified.
+
+## Troubleshooting
+
+### "Connection refused" to Paperless-ngx
+- Verify the URL in `.env` is reachable
+- Check that the API token is valid
+- Ensure Paperless-ngx is running
+
+### "Connection refused" to Ollama
+- Run `ollama serve` if not already running
+- Check the port (default: 11434)
+- Verify models are pulled: `ollama list`
+
+### Slow first query
+- The first embedding generation may take longer as Ollama loads the model into memory
+- Subsequent queries will be faster once the model is loaded
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..7a38ac0
--- /dev/null
+++ b/package.json
@@ -0,0 +1,57 @@
+{
+ "name": "papercortex",
+ "version": "0.1.0",
+ "description": "Self-hosted AI intelligence layer for Paperless-ngx with semantic search, receipt extraction, and MCP Server integration",
+ "main": "dist/mcp-server/index.js",
+ "type": "module",
+ "scripts": {
+ "build": "tsc",
+ "start": "node dist/mcp-server/index.js",
+ "dev": "tsx watch src/mcp-server/index.ts",
+ "lint": "eslint src/",
+ "test": "vitest",
+ "test:coverage": "vitest --coverage",
+ "receipt:extract": "tsx src/receipt/extractor.ts",
+ "receipt:match": "tsx src/receipt/matcher.ts",
+ "receipt:export": "tsx src/receipt/datev.ts"
+ },
+ "keywords": [
+ "paperless-ngx",
+ "ollama",
+ "mcp",
+ "mcp-server",
+ "semantic-search",
+ "document-ai",
+ "receipt-extraction",
+ "datev",
+ "self-hosted",
+ "local-ai",
+ "embeddings",
+ "vector-search"
+ ],
+ "author": "",
+ "license": "MIT",
+ "repository": {
+ "type": "git",
+ "url": ""
+ },
+ "engines": {
+ "node": ">=20.0.0"
+ },
+ "dependencies": {
+ "@modelcontextprotocol/sdk": "^1.12.0",
+ "better-sqlite3": "^11.8.0",
+ "csv-parse": "^5.6.0",
+ "csv-stringify": "^6.5.0",
+ "dotenv": "^16.4.0",
+ "zod": "^3.24.0"
+ },
+ "devDependencies": {
+ "@types/better-sqlite3": "^7.6.12",
+ "@types/node": "^22.10.0",
+ "eslint": "^9.17.0",
+ "tsx": "^4.19.0",
+ "typescript": "^5.7.0",
+ "vitest": "^3.0.0"
+ }
+}
diff --git a/src/embeddings/ollama.ts b/src/embeddings/ollama.ts
new file mode 100644
index 0000000..b2ad772
--- /dev/null
+++ b/src/embeddings/ollama.ts
@@ -0,0 +1,148 @@
+/**
+ * Ollama embedding and LLM integration.
+ *
+ * Generates vector embeddings and LLM completions using a local Ollama instance.
+ * Client methods perform network I/O against Ollama; results are returned as new objects and inputs are never mutated.
+ *
+ * @example
+ * ```ts
+ * const ollama = createOllamaClient({
+ *   baseUrl: "http://localhost:11434",
+ *   model: "qwen2.5:14b",
+ *   embeddingModel: "nomic-embed-text",
+ * });
+ * const embedding = await ollama.embed("Office rent invoice March 2024");
+ * const answer = await ollama.complete("Classify this document: ...");
+ * ```
+ */
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+/** Connection and model settings accepted by `createOllamaClient`. */
+export interface OllamaConfig {
+ /** Base URL of the Ollama server; trailing slashes are stripped before request paths are appended. */
+ readonly baseUrl: string;
+ /** Model name sent with `/api/generate` completion requests. */
+ readonly model: string;
+ /** Model name sent with `/api/embeddings` requests. */
+ readonly embeddingModel: string;
+ /** Abort timeout in milliseconds applied to embedding and completion requests; defaults to 120_000 when omitted. */
+ readonly timeout?: number;
+}
+
+/** Result of a single embedding request. */
+export interface EmbeddingResult {
+ /** Embedding values as returned by Ollama. */
+ readonly vector: readonly number[];
+ /** Name of the embedding model the request was sent with. */
+ readonly model: string;
+ /** Length of `vector`. */
+ readonly dimensions: number;
+}
+
+/** Result of a non-streaming completion request. */
+export interface CompletionResult {
+ /** Generated text (the `response` field of Ollama's reply). */
+ readonly text: string;
+ /** Model name echoed back in Ollama's reply. */
+ readonly model: string;
+ /** Ollama's `total_duration` value, passed through unchanged (presumably nanoseconds -- TODO confirm against the Ollama API docs). */
+ readonly totalDuration: number;
+}
+
+/** Public surface of the client returned by `createOllamaClient`. */
+export interface OllamaClient {
+  /** Generate an embedding vector for the given text. */
+  embed(text: string): Promise<EmbeddingResult>;
+
+  /** Generate a chat/instruct completion, optionally steered by a system prompt. */
+  complete(prompt: string, systemPrompt?: string): Promise<CompletionResult>;
+
+  /**
+   * Check if the Ollama server is reachable and list the model names it reports.
+   * Failures are reported as `{ ok: false, models: [] }` rather than a rejection.
+   */
+  healthCheck(): Promise<{ ok: boolean; models: readonly string[] }>;
+}
+
+// ---------------------------------------------------------------------------
+// Implementation
+// ---------------------------------------------------------------------------
+
+/**
+ * Create an Ollama client for embeddings and completions.
+ *
+ * Every request, including the health check, is bounded by `config.timeout`
+ * milliseconds (default 120_000). The timer keeps running until the response
+ * body has been read, so a stalled body is aborted as well.
+ *
+ * @param config - Server URL, model names and optional timeout.
+ * @returns A client bound to that configuration.
+ */
+export function createOllamaClient(config: OllamaConfig): OllamaClient {
+  const { baseUrl, model, embeddingModel, timeout = 120_000 } = config;
+
+  // Strip trailing slashes once so "http://host/" and "http://host" behave the same.
+  const root = baseUrl.replace(/\/+$/, "");
+
+  /** Run `task` with an AbortSignal that fires after `timeout` milliseconds. */
+  async function withTimeout<T>(
+    task: (signal: AbortSignal) => Promise<T>,
+  ): Promise<T> {
+    const controller = new AbortController();
+    const timer = setTimeout(() => controller.abort(), timeout);
+
+    try {
+      return await task(controller.signal);
+    } finally {
+      clearTimeout(timer);
+    }
+  }
+
+  /**
+   * POST a JSON body and return the parsed JSON reply.
+   *
+   * @throws Error with the HTTP status and response text on a non-2xx reply.
+   */
+  function post<T>(path: string, body: unknown): Promise<T> {
+    return withTimeout(async (signal) => {
+      const response = await fetch(`${root}${path}`, {
+        method: "POST",
+        headers: { "Content-Type": "application/json" },
+        body: JSON.stringify(body),
+        signal,
+      });
+
+      if (!response.ok) {
+        // The error body is best-effort context only; never let it mask the status.
+        const text = await response.text().catch(() => "");
+        throw new Error(`Ollama API error: ${response.status} -- ${text}`);
+      }
+
+      return (await response.json()) as T;
+    });
+  }
+
+  return {
+    async embed(text) {
+      // TODO: implement chunking for texts exceeding model context window
+      // TODO: add retry logic with exponential backoff
+
+      interface OllamaEmbedResponse {
+        embedding: number[];
+      }
+
+      const result = await post<OllamaEmbedResponse>("/api/embeddings", {
+        model: embeddingModel,
+        prompt: text,
+      });
+
+      // The reply is untyped JSON: fail with a clear message instead of a
+      // TypeError on `.length` when the field is missing.
+      if (!Array.isArray(result.embedding)) {
+        throw new Error(
+          `Ollama returned no embedding for model "${embeddingModel}"`,
+        );
+      }
+
+      return {
+        vector: result.embedding,
+        model: embeddingModel,
+        dimensions: result.embedding.length,
+      };
+    },
+
+    async complete(prompt, systemPrompt) {
+      // TODO: implement streaming support for long completions
+      // TODO: add structured output parsing (JSON mode)
+
+      interface OllamaGenerateResponse {
+        response: string;
+        model: string;
+        total_duration: number;
+      }
+
+      const result = await post<OllamaGenerateResponse>("/api/generate", {
+        model,
+        prompt,
+        system: systemPrompt ?? "",
+        stream: false,
+      });
+
+      return {
+        text: result.response,
+        model: result.model,
+        totalDuration: result.total_duration,
+      };
+    },
+
+    async healthCheck() {
+      interface OllamaTagsResponse {
+        models: Array<{ name: string }>;
+      }
+
+      try {
+        return await withTimeout(
+          async (signal): Promise<{ ok: boolean; models: readonly string[] }> => {
+            const response = await fetch(`${root}/api/tags`, { signal });
+            if (!response.ok) return { ok: false, models: [] };
+
+            const data = (await response.json()) as OllamaTagsResponse;
+            return {
+              ok: true,
+              models: data.models.map((m) => m.name),
+            };
+          },
+        );
+      } catch {
+        // Unreachable server, timeout or malformed reply all mean "not healthy".
+        return { ok: false, models: [] };
+      }
+    },
+  };
+}
diff --git a/src/embeddings/store.ts b/src/embeddings/store.ts
new file mode 100644
index 0000000..dc3a012
--- /dev/null
+++ b/src/embeddings/store.ts
@@ -0,0 +1,231 @@
+/**
+ * Local SQLite-backed vector store for document embeddings.
+ *
+ * Stores embedding vectors alongside document metadata in a SQLite database
+ * using better-sqlite3. Supports cosine similarity search for semantic
+ * document retrieval.
+ *
+ * @example
+ * ```ts
+ * const store = createVectorStore({ dbPath: "./data/vectors.db" });
+ * store.upsert({
+ *   documentId: 42,
+ *   vector: [...],
+ *   content: "...",
+ *   title: "...",
+ *   tags: [],
+ *   createdAt: new Date().toISOString(),
+ * });
+ * const results = store.search(queryVector, { limit: 10 });
+ * ```
+ */
+
+import Database from "better-sqlite3";
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+/** Settings for `createVectorStore`. */
+export interface VectorStoreConfig {
+ /** Path handed to the better-sqlite3 `Database` constructor. */
+ readonly dbPath: string;
+}
+
+/** One document's embedding plus the metadata stored with it. */
+export interface DocumentEmbedding {
+ /** Primary key of the stored row; upserting an existing ID replaces that row's data. */
+ readonly documentId: number;
+ /** Embedding values; persisted as 32-bit floats. */
+ readonly vector: readonly number[];
+ /** Text stored alongside the vector and returned with search hits. */
+ readonly content: string;
+ /** Title returned with search hits. */
+ readonly title: string;
+ /** Tag names; stored as a JSON array string. */
+ readonly tags: readonly string[];
+ /** Creation timestamp; written on first insert and left untouched by later upserts of the same ID. */
+ readonly createdAt: string;
+}
+
+/** A single hit returned by `VectorStore.search`. */
+export interface SearchResult {
+ readonly documentId: number;
+ readonly title: string;
+ readonly content: string;
+ /** Cosine similarity between the query vector and the stored vector. */
+ readonly score: number;
+ readonly tags: readonly string[];
+}
+
+/** Optional filters for `VectorStore.search`. */
+export interface SearchOptions {
+ /** Maximum number of hits to return (default 10). */
+ readonly limit?: number;
+ /** Hits scoring below this cosine similarity are dropped (default 0.5). */
+ readonly minScore?: number;
+ /** When non-empty, keep only hits that carry at least one of these tags. */
+ readonly tagFilter?: readonly string[];
+}
+
+/** Synchronous API of the SQLite-backed embedding store. */
+export interface VectorStore {
+ /** Insert or update a document embedding, keyed by `documentId`. */
+ upsert(embedding: DocumentEmbedding): void;
+
+ /**
+  * Search for similar documents using cosine similarity.
+  * Hits are returned in descending score order, after the `minScore`,
+  * `tagFilter` and `limit` options have been applied.
+  */
+ search(queryVector: readonly number[], options?: SearchOptions): readonly SearchResult[];
+
+ /** Remove an embedding by document ID. */
+ remove(documentId: number): void;
+
+ /** Get the total count of stored embeddings. */
+ count(): number;
+
+ /** Check if a document has been embedded. */
+ has(documentId: number): boolean;
+
+ /** Close the database connection. */
+ close(): void;
+}
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+/**
+ * Cosine similarity of two equal-length vectors.
+ *
+ * @param a - First vector.
+ * @param b - Second vector; must have the same length as `a`.
+ * @returns A value in [-1, 1] (1 = same direction), or 0 when either vector
+ *   has zero magnitude.
+ * @throws Error when the two vectors differ in length.
+ */
+function cosineSimilarity(a: readonly number[], b: readonly number[]): number {
+  if (a.length !== b.length) {
+    throw new Error(
+      `Vector dimension mismatch: ${a.length} vs ${b.length}`,
+    );
+  }
+
+  let dot = 0;
+  let squaresA = 0;
+  let squaresB = 0;
+
+  // Single pass accumulating the dot product and both squared magnitudes.
+  for (const [index, x] of a.entries()) {
+    const y = b[index];
+    dot += x * y;
+    squaresA += x * x;
+    squaresB += y * y;
+  }
+
+  const magnitude = Math.sqrt(squaresA) * Math.sqrt(squaresB);
+
+  // A zero vector has no direction; report "no similarity" instead of NaN.
+  return magnitude === 0 ? 0 : dot / magnitude;
+}
+
+// ---------------------------------------------------------------------------
+// Implementation
+// ---------------------------------------------------------------------------
+
+/**
+ * Create a local vector store backed by SQLite.
+ *
+ * Opens (or creates) the database at `config.dbPath`, switches it to WAL
+ * journaling, ensures the `embeddings` table exists and prepares the
+ * statements used by the returned store.
+ *
+ * @param config - Location of the SQLite database file.
+ * @returns A synchronous {@link VectorStore}; call `close()` when done.
+ *
+ * TODO: Consider migrating to sqlite-vss or DuckDB for ANN search at scale.
+ * The current brute-force approach works well for <100k documents.
+ */
+export function createVectorStore(config: VectorStoreConfig): VectorStore {
+  const db = new Database(config.dbPath);
+
+  // Enable WAL mode for better concurrent read performance
+  db.pragma("journal_mode = WAL");
+
+  // Create tables if they don't exist
+  db.exec(`
+    CREATE TABLE IF NOT EXISTS embeddings (
+      document_id INTEGER PRIMARY KEY,
+      vector BLOB NOT NULL,
+      content TEXT NOT NULL,
+      title TEXT NOT NULL,
+      tags TEXT NOT NULL DEFAULT '[]',
+      created_at TEXT NOT NULL,
+      updated_at TEXT NOT NULL DEFAULT (datetime('now'))
+    );
+
+    CREATE INDEX IF NOT EXISTS idx_embeddings_created
+      ON embeddings (created_at);
+  `);
+
+  // Prepared statements for performance
+  const upsertStmt = db.prepare(`
+    INSERT INTO embeddings (document_id, vector, content, title, tags, created_at, updated_at)
+    VALUES (?, ?, ?, ?, ?, ?, datetime('now'))
+    ON CONFLICT(document_id) DO UPDATE SET
+      vector = excluded.vector,
+      content = excluded.content,
+      title = excluded.title,
+      tags = excluded.tags,
+      updated_at = datetime('now')
+  `);
+
+  const getAllStmt = db.prepare(`
+    SELECT document_id, vector, content, title, tags FROM embeddings
+  `);
+
+  const removeStmt = db.prepare(`
+    DELETE FROM embeddings WHERE document_id = ?
+  `);
+
+  const countStmt = db.prepare(`
+    SELECT COUNT(*) as count FROM embeddings
+  `);
+
+  const hasStmt = db.prepare(`
+    SELECT 1 FROM embeddings WHERE document_id = ? LIMIT 1
+  `);
+
+  /**
+   * Decode a BLOB written by `upsert` back into plain numbers.
+   *
+   * A Node `Buffer` may be a view into a larger ArrayBuffer, so reading
+   * `blob.buffer` directly can pick up unrelated bytes. Copying exactly the
+   * view's byte range also sidesteps Float32Array's 4-byte alignment rule.
+   */
+  function decodeVector(blob: Buffer): number[] {
+    const bytes = blob.buffer.slice(
+      blob.byteOffset,
+      blob.byteOffset + blob.byteLength,
+    );
+    return Array.from(new Float32Array(bytes));
+  }
+
+  return {
+    upsert(embedding) {
+      // Vectors are stored as raw float32 bytes (4 bytes per dimension).
+      const vectorBlob = Buffer.from(new Float32Array(embedding.vector).buffer);
+      upsertStmt.run(
+        embedding.documentId,
+        vectorBlob,
+        embedding.content,
+        embedding.title,
+        JSON.stringify(embedding.tags),
+        embedding.createdAt,
+      );
+    },
+
+    search(queryVector, options = {}) {
+      const { limit = 10, minScore = 0.5, tagFilter } = options;
+
+      // TODO: Implement ANN (approximate nearest neighbor) for large datasets
+      // Current approach: brute-force scan -- fine for <100k documents
+
+      interface EmbeddingRow {
+        document_id: number;
+        vector: Buffer;
+        content: string;
+        title: string;
+        tags: string;
+      }
+
+      const rows = getAllStmt.all() as EmbeddingRow[];
+
+      const scored = rows
+        .map((row) => {
+          const storedVector = decodeVector(row.vector);
+          const tags: string[] = JSON.parse(row.tags);
+          const score = cosineSimilarity(queryVector, storedVector);
+
+          return {
+            documentId: row.document_id,
+            title: row.title,
+            content: row.content,
+            score,
+            tags,
+          };
+        })
+        .filter((result) => result.score >= minScore)
+        .filter((result) => {
+          // No tag filter means every hit qualifies; otherwise any one match is enough.
+          if (!tagFilter || tagFilter.length === 0) return true;
+          return tagFilter.some((tag) => result.tags.includes(tag));
+        })
+        .sort((a, b) => b.score - a.score)
+        .slice(0, limit);
+
+      return scored;
+    },
+
+    remove(documentId) {
+      removeStmt.run(documentId);
+    },
+
+    count() {
+      const row = countStmt.get() as { count: number };
+      return row.count;
+    },
+
+    has(documentId) {
+      return hasStmt.get(documentId) !== undefined;
+    },
+
+    close() {
+      db.close();
+    },
+  };
+}
diff --git a/src/mcp-server/index.ts b/src/mcp-server/index.ts
new file mode 100644
index 0000000..fcc9b6d
--- /dev/null
+++ b/src/mcp-server/index.ts
@@ -0,0 +1,249 @@
+/**
+ * PaperCortex MCP Server entry point.
+ *
+ * Exposes document intelligence tools via the Model Context Protocol (MCP)
+ * for integration with Claude Code and other AI agents.
+ *
+ * @see https://modelcontextprotocol.io
+ */
+
+import { Server } from "@modelcontextprotocol/sdk/server/index.js";
+import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
+import {
+ CallToolRequestSchema,
+ ListToolsRequestSchema,
+} from "@modelcontextprotocol/sdk/types.js";
+import { config } from "dotenv";
+
+import { createOllamaClient } from "../embeddings/ollama.js";
+import { createVectorStore } from "../embeddings/store.js";
+import { createPaperlessClient } from "../paperless/client.js";
+import { handleClassify } from "./tools/classify.js";
+import { handleExport } from "./tools/export.js";
+import { handleQuery } from "./tools/query.js";
+import { handleReceipt } from "./tools/receipt.js";
+import { handleSearch } from "./tools/search.js";
+
+// ---------------------------------------------------------------------------
+// Configuration
+// ---------------------------------------------------------------------------
+
+config(); // Load .env
+
+/**
+ * Read a mandatory environment variable.
+ *
+ * @param key - Name of the variable.
+ * @returns Its value.
+ * @throws Error when the variable is unset or empty.
+ */
+function requireEnv(key: string): string {
+  const value = process.env[key];
+  if (value) return value;
+
+  throw new Error(`Missing required environment variable: ${key}`);
+}
+
+// ---------------------------------------------------------------------------
+// Service initialization
+// ---------------------------------------------------------------------------
+
+// Paperless-ngx connection: both settings are mandatory, so startup fails
+// fast with a descriptive error when either is missing or empty.
+const paperless = createPaperlessClient({
+  baseUrl: requireEnv("PAPERLESS_URL"),
+  token: requireEnv("PAPERLESS_TOKEN"),
+});
+
+// Optional settings use `||` rather than `??` on purpose: a blank `KEY=` line
+// in .env yields an empty string, which should fall back to the default just
+// as requireEnv() treats empty values as missing.
+const ollama = createOllamaClient({
+  baseUrl: process.env["OLLAMA_URL"] || "http://localhost:11434",
+  model: process.env["OLLAMA_MODEL"] || "qwen2.5:14b",
+  embeddingModel: process.env["OLLAMA_EMBEDDING_MODEL"] || "nomic-embed-text",
+});
+
+const vectorStore = createVectorStore({
+  dbPath: process.env["VECTOR_DB_PATH"] || "./data/vectors.db",
+});
+
+// ---------------------------------------------------------------------------
+// Shared context for tool handlers
+// ---------------------------------------------------------------------------
+
+/**
+ * Bundle of service clients handed to every tool handler.
+ * Member types are derived from the module-level instances via `typeof`.
+ */
+export interface ToolContext {
+ readonly paperless: typeof paperless;
+ readonly ollama: typeof ollama;
+ readonly vectorStore: typeof vectorStore;
+}
+
+// Single shared context built from the module-level service instances.
+const ctx: ToolContext = { paperless, ollama, vectorStore };
+
+// ---------------------------------------------------------------------------
+// MCP Server setup
+// ---------------------------------------------------------------------------
+
+// MCP server instance; only the `tools` capability is advertised.
+// NOTE(review): the version string duplicates the "version" field in
+// package.json -- keep the two in sync.
+const server = new Server(
+ {
+ name: "papercortex",
+ version: "0.1.0",
+ },
+ {
+ capabilities: {
+ tools: {},
+ },
+ },
+);
+
+/**
+ * List all available PaperCortex tools.
+ *
+ * Returns a static list of five tool descriptors. Each `inputSchema` is a
+ * JSON-Schema object describing the arguments the tool accepts. The `name`
+ * values are the strings the CallTool handler switches on, so the two must
+ * be kept in step.
+ */
+server.setRequestHandler(ListToolsRequestSchema, async () => ({
+ tools: [
+ // Semantic search over embedded documents.
+ {
+ name: "papercortex_search",
+ description:
+ "Semantic search across all documents in Paperless-ngx. " +
+ "Finds documents by meaning, not just keywords.",
+ inputSchema: {
+ type: "object" as const,
+ properties: {
+ query: {
+ type: "string",
+ description: "Natural language search query",
+ },
+ limit: {
+ type: "number",
+ description: "Maximum number of results (default: 10)",
+ },
+ tags: {
+ type: "array",
+ items: { type: "string" },
+ description: "Filter by tag names",
+ },
+ },
+ required: ["query"],
+ },
+ },
+ // LLM-based classification of a single document.
+ {
+ name: "papercortex_classify",
+ description:
+ "Auto-classify a document using local AI. " +
+ "Suggests tags, document type, and correspondent.",
+ inputSchema: {
+ type: "object" as const,
+ properties: {
+ documentId: {
+ type: "number",
+ description: "Paperless-ngx document ID",
+ },
+ // NOTE(review): the classify handler currently only reports that tag
+ // application is not implemented when this flag is set.
+ applyTags: {
+ type: "boolean",
+ description: "Automatically apply suggested tags (default: false)",
+ },
+ },
+ required: ["documentId"],
+ },
+ },
+ // Structured receipt extraction for a single document.
+ {
+ name: "papercortex_receipt",
+ description:
+ "Extract structured data from a receipt document: " +
+ "vendor, date, amounts, tax, line items.",
+ inputSchema: {
+ type: "object" as const,
+ properties: {
+ documentId: {
+ type: "number",
+ description: "Paperless-ngx document ID of the receipt",
+ },
+ },
+ required: ["documentId"],
+ },
+ },
+ // Natural-language question answering over documents.
+ {
+ name: "papercortex_query",
+ description:
+ "Ask natural language questions about your documents. " +
+ 'Example: "How much did I spend on office supplies in Q1 2024?"',
+ inputSchema: {
+ type: "object" as const,
+ properties: {
+ question: {
+ type: "string",
+ description: "Natural language question about your documents",
+ },
+ maxDocuments: {
+ type: "number",
+ description:
+ "Maximum documents to include in context (default: 5)",
+ },
+ },
+ required: ["question"],
+ },
+ },
+ // DATEV / generic CSV export of receipt data.
+ {
+ name: "papercortex_export",
+ description:
+ "Export receipt data as DATEV-compatible CSV for German accounting, " +
+ "or as generic CSV.",
+ inputSchema: {
+ type: "object" as const,
+ properties: {
+ documentIds: {
+ type: "array",
+ items: { type: "number" },
+ description: "Document IDs to export",
+ },
+ format: {
+ type: "string",
+ enum: ["datev", "csv"],
+ description: "Export format (default: datev)",
+ },
+ },
+ required: ["documentIds"],
+ },
+ },
+ ],
+}));
+
+/**
+ * Route tool calls to their respective handlers.
+ *
+ * Unknown tool names and any error thrown by a handler are reported to the
+ * caller as a text result with `isError: true` instead of rejecting.
+ */
+server.setRequestHandler(CallToolRequestSchema, async (request) => {
+  const { name, arguments: rawArgs } = request.params;
+
+  // `arguments` may be omitted by the client; give handlers an empty object
+  // so they can report a missing field instead of failing on `undefined`.
+  const args: Record<string, unknown> = rawArgs ?? {};
+
+  try {
+    switch (name) {
+      case "papercortex_search":
+        return await handleSearch(ctx, args);
+      case "papercortex_classify":
+        return await handleClassify(ctx, args);
+      case "papercortex_receipt":
+        return await handleReceipt(ctx, args);
+      case "papercortex_query":
+        return await handleQuery(ctx, args);
+      case "papercortex_export":
+        return await handleExport(ctx, args);
+      default:
+        return {
+          content: [
+            { type: "text" as const, text: `Unknown tool: ${name}` },
+          ],
+          isError: true,
+        };
+    }
+  } catch (error) {
+    const message =
+      error instanceof Error ? error.message : "Unknown error occurred";
+    return {
+      content: [{ type: "text" as const, text: `Error: ${message}` }],
+      isError: true,
+    };
+  }
+});
+
+// ---------------------------------------------------------------------------
+// Start server
+// ---------------------------------------------------------------------------
+
+/**
+ * Connect the MCP server to a stdio transport.
+ * The startup message is written to stderr via `console.error`.
+ */
+async function main(): Promise<void> {
+  const transport = new StdioServerTransport();
+  await server.connect(transport);
+  console.error("PaperCortex MCP Server running on stdio");
+}
+
+// Any startup failure is fatal: log it and exit with a non-zero status.
+main().catch((error: unknown) => {
+  console.error("Fatal error starting PaperCortex:", error);
+  process.exit(1);
+});
diff --git a/src/mcp-server/tools/classify.ts b/src/mcp-server/tools/classify.ts
new file mode 100644
index 0000000..cf45bf3
--- /dev/null
+++ b/src/mcp-server/tools/classify.ts
@@ -0,0 +1,117 @@
+/**
+ * Auto-classification tool for the PaperCortex MCP Server.
+ *
+ * Uses local LLM to analyze document content and suggest appropriate
+ * tags, document types, and correspondents.
+ */
+
+import type { ToolContext } from "../index.js";
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+/** Arguments accepted by the `papercortex_classify` tool. */
+interface ClassifyArgs {
+ /** ID passed to `paperless.getDocument`. */
+ readonly documentId: number;
+ /** Request tag write-back; treated as `false` when omitted. */
+ readonly applyTags?: boolean;
+}
+
+/** Shape the model is asked to answer with (see the system prompt's JSON template). */
+interface ClassificationResult {
+ readonly suggestedTags: readonly string[];
+ /** Suggested document type; reported as "unknown" when null. */
+ readonly suggestedType: string | null;
+ /** Suggested correspondent; reported as "unknown" when null. */
+ readonly suggestedCorrespondent: string | null;
+ readonly summary: string;
+ /** Language code; the prompt asks for ISO 639-1. */
+ readonly language: string;
+ /** Model-reported confidence; the prompt asks for 0.0 to 1.0 and it is displayed as a percentage. */
+ readonly confidence: number;
+}
+
+// ---------------------------------------------------------------------------
+// Prompts
+// ---------------------------------------------------------------------------
+
+// System prompt sent with every classification request. It asks the model to
+// answer with a bare JSON object whose keys mirror `ClassificationResult`.
+const CLASSIFY_SYSTEM_PROMPT = `You are a document classification assistant. Analyze the document content and provide structured classification.
+
+Respond with valid JSON only:
+{
+ "suggestedTags": ["tag1", "tag2"],
+ "suggestedType": "invoice|contract|receipt|letter|report|tax_document|bank_statement|insurance|warranty|manual|other",
+ "suggestedCorrespondent": "Company or person name",
+ "summary": "One sentence summary",
+ "language": "ISO 639-1 code",
+ "confidence": 0.0 to 1.0
+}`;
+
+// ---------------------------------------------------------------------------
+// Handler
+// ---------------------------------------------------------------------------
+
+/**
+ * Handle a `papercortex_classify` tool call.
+ *
+ * 1. Fetch document content from Paperless-ngx.
+ * 2. Send the title and the first 4000 characters of content to Ollama.
+ * 3. Parse and normalize the model's JSON answer.
+ * 4. Report the suggestions (tag write-back is not implemented yet).
+ *
+ * @param ctx - Shared service clients.
+ * @param args - Raw tool arguments; expected to match `ClassifyArgs`.
+ * @returns One text item: the classification report, or an explanatory
+ *   message when the arguments are invalid, the document has no text, or the
+ *   model's answer cannot be parsed.
+ *
+ * TODO: Match suggested tags against existing Paperless-ngx tags
+ * TODO: Create new tags automatically when confidence is high
+ * TODO: Learn from user corrections to improve classification
+ */
+export async function handleClassify(
+  ctx: ToolContext,
+  args: Record<string, unknown>,
+): Promise<{ content: Array<{ type: "text"; text: string }> }> {
+  const { documentId, applyTags = false } = args as unknown as ClassifyArgs;
+
+  // Arguments arrive unvalidated from the MCP client; reject anything that
+  // is not an integer ID before calling Paperless-ngx with it.
+  if (typeof documentId !== "number" || !Number.isInteger(documentId)) {
+    return classifyText(
+      "Error: documentId must be an integer Paperless-ngx document ID.",
+    );
+  }
+
+  // Fetch document from Paperless-ngx
+  const document = await ctx.paperless.getDocument(documentId);
+
+  if (!document.content || document.content.trim().length === 0) {
+    return classifyText(
+      `Document #${documentId} has no text content. OCR may not have completed.`,
+    );
+  }
+
+  // Classify using Ollama; content is truncated to keep the prompt bounded.
+  const prompt = `Classify this document:\n\nTitle: ${document.title}\n\nContent:\n${document.content.slice(0, 4000)}`;
+  const completion = await ctx.ollama.complete(prompt, CLASSIFY_SYSTEM_PROMPT);
+
+  const classification = parseClassification(completion.text);
+  if (classification === null) {
+    return classifyText(
+      `Classification failed: LLM did not return valid JSON.\nRaw response: ${completion.text.slice(0, 500)}`,
+    );
+  }
+
+  // Optionally apply tags
+  let appliedNote = "";
+  if (applyTags && classification.suggestedTags.length > 0) {
+    // TODO: Look up tag IDs from Paperless-ngx, create missing tags
+    appliedNote =
+      "\n\nNote: Tag application is not yet implemented. " +
+      "Tags need to be matched against existing Paperless-ngx tags.";
+  }
+
+  const output =
+    `Classification for Document #${documentId} "${document.title}":\n\n` +
+    `Type: ${classification.suggestedType ?? "unknown"}\n` +
+    `Correspondent: ${classification.suggestedCorrespondent ?? "unknown"}\n` +
+    `Tags: ${classification.suggestedTags.join(", ") || "none"}\n` +
+    `Language: ${classification.language}\n` +
+    `Summary: ${classification.summary}\n` +
+    `Confidence: ${(classification.confidence * 100).toFixed(0)}%` +
+    appliedNote;
+
+  return classifyText(output);
+}
+
+/** Wrap a message in the single-text-item result shape used by this tool. */
+function classifyText(
+  text: string,
+): { content: Array<{ type: "text"; text: string }> } {
+  return { content: [{ type: "text", text }] };
+}
+
+/**
+ * Parse the model's answer into a well-formed `ClassificationResult`.
+ *
+ * Returns `null` when no JSON object can be found or parsed. Missing or
+ * mistyped fields are replaced with neutral defaults so the caller never
+ * dereferences `undefined`.
+ */
+function parseClassification(text: string): ClassificationResult | null {
+  // Models often wrap the object in Markdown fences or add prose around it,
+  // so parse only the outermost {...} span.
+  const start = text.indexOf("{");
+  const end = text.lastIndexOf("}");
+  if (start === -1 || end <= start) return null;
+
+  let raw: unknown;
+  try {
+    raw = JSON.parse(text.slice(start, end + 1));
+  } catch {
+    return null;
+  }
+  if (typeof raw !== "object" || raw === null) return null;
+
+  const fields = raw as Record<string, unknown>;
+  const textOrNull = (value: unknown): string | null =>
+    typeof value === "string" && value.trim() !== "" ? value : null;
+
+  const rawTags = fields["suggestedTags"];
+  const rawConfidence = fields["confidence"];
+
+  return {
+    suggestedTags: Array.isArray(rawTags)
+      ? rawTags.filter((tag): tag is string => typeof tag === "string")
+      : [],
+    suggestedType: textOrNull(fields["suggestedType"]),
+    suggestedCorrespondent: textOrNull(fields["suggestedCorrespondent"]),
+    summary: textOrNull(fields["summary"]) ?? "",
+    language: textOrNull(fields["language"]) ?? "unknown",
+    // Clamp to the documented 0..1 range; anything non-numeric counts as 0.
+    confidence:
+      typeof rawConfidence === "number" && Number.isFinite(rawConfidence)
+        ? Math.min(1, Math.max(0, rawConfidence))
+        : 0,
+  };
+}
diff --git a/src/mcp-server/tools/export.ts b/src/mcp-server/tools/export.ts
new file mode 100644
index 0000000..69019fa
--- /dev/null
+++ b/src/mcp-server/tools/export.ts
@@ -0,0 +1,116 @@
+/**
+ * DATEV/CSV export tool for the PaperCortex MCP Server.
+ *
+ * Exports receipt data in accounting-compatible formats.
+ */
+
+import { createReceiptExtractor } from "../../receipt/extractor.js";
+import { createDatevExporter } from "../../receipt/datev.js";
+import type { ToolContext } from "../index.js";
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+interface ExportArgs { // Arguments accepted by the `papercortex_export` tool.
+ readonly documentIds: readonly number[]; // documents to extract and export; the handler rejects an empty list
+ readonly format?: "datev" | "csv"; // output flavor; the handler defaults to "datev"
+}
+
+// ---------------------------------------------------------------------------
+// Handler
+// ---------------------------------------------------------------------------
+
+/**
+ * Handle a `papercortex_export` tool call.
+ *
+ * 1. Extract receipt data from all specified documents.
+ * 2. Format as DATEV or generic CSV (free-text CSV fields are quoted on demand).
+ * 3. Return the CSV content wrapped in a fenced `csv` block.
+ *
+ * TODO: Add file output option (save to disk)
+ * TODO: Add date range filtering
+ * TODO: Add DATEV header metadata (consultant/client numbers from config)
+ */
+export async function handleExport(
+  ctx: ToolContext,
+  args: Record<string, unknown>,
+): Promise<{ content: Array<{ type: "text"; text: string }> }> {
+  const { documentIds, format = "datev" } = args as unknown as ExportArgs;
+
+  if (!Array.isArray(documentIds) || documentIds.length === 0) {
+    return {
+      content: [
+        {
+          type: "text",
+          text: "Error: at least one document ID is required for export.",
+        },
+      ],
+    };
+  }
+
+  // Extract receipt data from all documents
+  const extractor = createReceiptExtractor({
+    ollama: ctx.ollama,
+    paperless: ctx.paperless,
+  });
+
+  const receipts = await extractor.extractBatch(documentIds);
+
+  if (format === "datev") {
+    // TODO: Read consultant/client numbers from configuration
+    const exporter = createDatevExporter({
+      consultantNumber: 0,
+      clientNumber: 0,
+    });
+
+    const receiptsForExport = receipts.map((r) => ({
+      documentId: r.documentId,
+      vendor: r.vendor,
+      date: r.date,
+      totalAmount: r.totalAmount,
+      taxRate: r.taxRate,
+      category: r.category,
+    }));
+
+    const csv = exporter.generateCsv(receiptsForExport);
+
+    return {
+      content: [
+        {
+          type: "text",
+          text:
+            `DATEV export for ${receipts.length} receipt(s):\n\n` +
+            "```csv\n" +
+            csv +
+            "\n```\n\n" +
+            "Copy this CSV content into a file and import into your " +
+            "DATEV-compatible accounting software.",
+        },
+      ],
+    };
+  }
+
+  // Generic CSV format; LLM-extracted text is quoted when it contains ';', '"' or a line break
+  const header = "Document ID;Vendor;Date;Amount;Tax Rate;Tax Amount;Currency;Category";
+  const esc = (v: string): string => (/[;"\r\n]/.test(v) ? `"${v.replace(/"/g, '""')}"` : v);
+  const rows = receipts.map(
+    (r) =>
+      `${r.documentId};${esc(r.vendor)};${esc(r.date)};${r.totalAmount.toFixed(2)};` +
+      `${r.taxRate ?? ""};${r.taxAmount?.toFixed(2) ?? ""};${esc(r.currency)};${esc(r.category ?? "")}`,
+  );
+  const csv = [header, ...rows].join("\n");
+
+  return {
+    content: [
+      {
+        type: "text",
+        text:
+          `CSV export for ${receipts.length} receipt(s):\n\n` +
+          "```csv\n" +
+          csv +
+          "\n```",
+      },
+    ],
+  };
+}
diff --git a/src/mcp-server/tools/query.ts b/src/mcp-server/tools/query.ts
new file mode 100644
index 0000000..eeb2420
--- /dev/null
+++ b/src/mcp-server/tools/query.ts
@@ -0,0 +1,110 @@
+/**
+ * Natural language query tool for the PaperCortex MCP Server.
+ *
+ * Answers questions about documents using RAG (Retrieval-Augmented Generation):
+ * retrieves relevant documents via semantic search, then generates an answer
+ * using the local LLM with document context.
+ */
+
+import type { ToolContext } from "../index.js";
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+interface QueryArgs { // Arguments accepted by the `papercortex_query` tool.
+ readonly question: string; // natural-language question; the handler rejects empty/whitespace-only input
+ readonly maxDocuments?: number; // cap on retrieved context documents; the handler defaults to 5
+}
+
+// ---------------------------------------------------------------------------
+// Prompts
+// ---------------------------------------------------------------------------
+
+const QUERY_SYSTEM_PROMPT = `You are a document analysis assistant. Answer the user's question based ONLY on the provided document excerpts. If the documents don't contain enough information to answer, say so clearly.
+
+Be precise with numbers, dates, and amounts. Cite document IDs when referencing specific documents.`;
+
+// ---------------------------------------------------------------------------
+// Handler
+// ---------------------------------------------------------------------------
+
+/**
+ * Handle a `papercortex_query` tool call.
+ *
+ * Uses RAG (Retrieval-Augmented Generation):
+ * 1. Embed the question and retrieve relevant documents.
+ * 2. Build a context from retrieved documents (first 2000 characters of each).
+ * 3. Generate an answer using the local LLM.
+ * A missing/empty question or an empty search result yields an explanatory text response.
+ * TODO: Add conversation history for follow-up questions
+ * TODO: Add source citation with page numbers
+ * TODO: Implement query decomposition for complex questions
+ */
+export async function handleQuery(
+  ctx: ToolContext,
+  args: Record<string, unknown>,
+): Promise<{ content: Array<{ type: "text"; text: string }> }> {
+  const { question, maxDocuments = 5 } = args as unknown as QueryArgs;
+
+  if (typeof question !== "string" || question.trim().length === 0) {
+    return {
+      content: [{ type: "text", text: "Error: question cannot be empty." }],
+    };
+  }
+
+  // Step 1: Retrieve relevant documents
+  const queryEmbedding = await ctx.ollama.embed(question);
+  const relevantDocs = ctx.vectorStore.search(queryEmbedding.vector, {
+    limit: maxDocuments,
+    minScore: 0.3,
+  });
+
+  if (relevantDocs.length === 0) {
+    return {
+      content: [
+        {
+          type: "text",
+          text:
+            `I couldn't find any relevant documents to answer: "${question}"\n\n` +
+            "The vector store may need to be populated first, or your documents " +
+            "may not contain information related to this question.",
+        },
+      ],
+    };
+  }
+
+  // Step 2: Build context from retrieved documents
+  const context = relevantDocs
+    .map(
+      (doc) =>
+        `--- Document #${doc.documentId}: ${doc.title} (relevance: ${doc.score.toFixed(2)}) ---\n` +
+        doc.content.slice(0, 2000),
+    )
+    .join("\n\n");
+
+  // Step 3: Generate answer with context
+  const prompt =
+    `Based on the following documents, answer this question: "${question}"\n\n` +
+    `Documents:\n${context}`;
+
+  const completion = await ctx.ollama.complete(prompt, QUERY_SYSTEM_PROMPT);
+
+  const sourcesNote = relevantDocs
+    .map(
+      (doc) =>
+        ` - Document #${doc.documentId}: ${doc.title} (score: ${doc.score.toFixed(2)})`,
+    )
+    .join("\n");
+
+  return {
+    content: [
+      {
+        type: "text",
+        text:
+          `${completion.text}\n\n` +
+          `---\nSources (${relevantDocs.length} documents):\n${sourcesNote}`,
+      },
+    ],
+  };
+}
diff --git a/src/mcp-server/tools/receipt.ts b/src/mcp-server/tools/receipt.ts
new file mode 100644
index 0000000..2509fc9
--- /dev/null
+++ b/src/mcp-server/tools/receipt.ts
@@ -0,0 +1,76 @@
+/**
+ * Receipt extraction tool for the PaperCortex MCP Server.
+ *
+ * Extracts structured receipt data from Paperless-ngx documents
+ * using local LLM analysis.
+ */
+
+import { createReceiptExtractor } from "../../receipt/extractor.js";
+import type { ToolContext } from "../index.js";
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+interface ReceiptArgs { // Arguments accepted by the `papercortex_receipt` tool.
+ readonly documentId: number; // ID handed to the receipt extractor's `extract` call
+}
+
+// ---------------------------------------------------------------------------
+// Handler
+// ---------------------------------------------------------------------------
+
+/**
+ * Handle a `papercortex_receipt` tool call.
+ *
+ * 1. Fetch document from Paperless-ngx.
+ * 2. Extract receipt data using LLM.
+ * 3. Return structured receipt information.
+ * A non-integer `documentId` yields an error text response; extractor failures propagate as exceptions.
+ * TODO: Cache extraction results to avoid re-processing
+ * TODO: Add confidence thresholds and human review flags
+ * TODO: Store extracted data back as Paperless-ngx custom fields
+ */
+export async function handleReceipt(
+  ctx: ToolContext,
+  args: Record<string, unknown>,
+): Promise<{ content: Array<{ type: "text"; text: string }> }> {
+  const { documentId } = args as unknown as ReceiptArgs;
+  if (!Number.isInteger(documentId)) {
+    return { content: [{ type: "text", text: "Error: documentId must be an integer." }] };
+  }
+
+  const { ollama, paperless } = ctx;
+  const extractor = createReceiptExtractor({ ollama, paperless });
+  const receipt = await extractor.extract(documentId);
+
+  // Format line items table
+  const lineItemsTable =
+    receipt.lineItems.length > 0
+      ? receipt.lineItems
+          .map(
+            (item, i) =>
+              ` ${i + 1}. ${item.description} | ` +
+              `${item.quantity}x ${item.unitPrice.toFixed(2)} = ${item.totalPrice.toFixed(2)}`,
+          )
+          .join("\n")
+      : " No line items extracted";
+
+  const output =
+    `Receipt Data for Document #${documentId}:\n\n` +
+    `Vendor: ${receipt.vendor}\n` +
+    `Address: ${receipt.vendorAddress ?? "N/A"}\n` +
+    `Tax ID: ${receipt.vendorTaxId ?? "N/A"}\n` +
+    `Date: ${receipt.date}\n` +
+    `Currency: ${receipt.currency}\n` +
+    `\nAmounts:\n` +
+    ` Subtotal: ${receipt.subtotal?.toFixed(2) ?? "N/A"}\n` +
+    ` Tax (${receipt.taxRate ?? "?"}%): ${receipt.taxAmount?.toFixed(2) ?? "N/A"}\n` +
+    ` Total: ${receipt.totalAmount.toFixed(2)}\n` +
+    `\nPayment: ${receipt.paymentMethod ?? "N/A"}\n` +
+    `Category: ${receipt.category ?? "uncategorized"}\n` +
+    `Confidence: ${(receipt.confidence * 100).toFixed(0)}%\n` +
+    `\nLine Items:\n${lineItemsTable}`;
+
+  return { content: [{ type: "text", text: output }] };
+}
diff --git a/src/mcp-server/tools/search.ts b/src/mcp-server/tools/search.ts
new file mode 100644
index 0000000..5a4e869
--- /dev/null
+++ b/src/mcp-server/tools/search.ts
@@ -0,0 +1,87 @@
+/**
+ * Semantic search tool for the PaperCortex MCP Server.
+ *
+ * Performs vector similarity search across all embedded documents,
+ * returning the most semantically relevant results.
+ */
+
+import type { ToolContext } from "../index.js";
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+interface SearchArgs { // Arguments accepted by the `papercortex_search` tool.
+ readonly query: string; // search text that gets embedded; the handler rejects empty/whitespace-only input
+ readonly limit?: number; // maximum number of results; the handler defaults to 10
+ readonly tags?: readonly string[]; // optional tag filter values, copied and forwarded to the vector store as `tagFilter`
+}
+
+// ---------------------------------------------------------------------------
+// Handler
+// ---------------------------------------------------------------------------
+
+/**
+ * Handle a `papercortex_search` tool call.
+ *
+ * 1. Generate an embedding for the search query via Ollama.
+ * 2. Search the local vector store for similar documents.
+ * 3. Return ranked results with scores, metadata and a preview of up to 200 characters.
+ * A missing/empty query or an empty result set yields an explanatory text response.
+ * TODO: Add hybrid search (combine vector + keyword for better recall)
+ * TODO: Add date range filtering
+ * TODO: Add result caching for repeated queries
+ */
+export async function handleSearch(
+  ctx: ToolContext,
+  args: Record<string, unknown>,
+): Promise<{ content: Array<{ type: "text"; text: string }> }> {
+  const { query, limit = 10, tags } = args as unknown as SearchArgs;
+
+  if (typeof query !== "string" || query.trim().length === 0) {
+    return {
+      content: [{ type: "text", text: "Error: search query cannot be empty." }],
+    };
+  }
+
+  // Generate embedding for the query
+  const queryEmbedding = await ctx.ollama.embed(query);
+
+  // Search vector store
+  const results = ctx.vectorStore.search(queryEmbedding.vector, {
+    limit,
+    minScore: 0.4,
+    tagFilter: tags ? [...tags] : undefined,
+  });
+
+  if (results.length === 0) {
+    return {
+      content: [
+        {
+          type: "text",
+          text: `No documents found matching "${query}". The vector store may need to be populated first.`,
+        },
+      ],
+    };
+  }
+
+  // Format results; the ellipsis is only shown when the preview was actually cut
+  const formatted = results
+    .map(
+      (r, i) =>
+        `${i + 1}. [Document #${r.documentId}] (score: ${r.score.toFixed(3)})\n` +
+        ` Title: ${r.title}\n` +
+        ` Tags: ${r.tags.length > 0 ? r.tags.join(", ") : "none"}\n` +
+        ` Preview: ${r.content.slice(0, 200).replace(/\n/g, " ")}${r.content.length > 200 ? "..." : ""}`,
+    )
+    .join("\n\n");
+
+  return {
+    content: [
+      {
+        type: "text",
+        text: `Found ${results.length} documents matching "${query}":\n\n${formatted}`,
+      },
+    ],
+  };
+}
diff --git a/src/paperless/client.ts b/src/paperless/client.ts
new file mode 100644
index 0000000..36db72a
--- /dev/null
+++ b/src/paperless/client.ts
@@ -0,0 +1,182 @@
+/**
+ * Paperless-ngx REST API client.
+ *
+ * Provides typed access to documents, correspondents, tags, and document types.
+ * All methods return immutable result objects.
+ *
+ * @example
+ * ```ts
+ * const client = createPaperlessClient({
+ * baseUrl: "http://localhost:8000",
+ * token: "your-api-token",
+ * });
+ * const docs = await client.getDocuments({ query: "invoice" });
+ * ```
+ */
+
+import type {
+ Correspondent,
+ DocumentSearchParams,
+ DocumentType,
+ PaginatedResponse,
+ PaperlessConfig,
+ PaperlessDocument,
+ Tag,
+} from "./types.js";
+
+// ---------------------------------------------------------------------------
+// Client interface
+// ---------------------------------------------------------------------------
+
+export interface PaperlessClient {
+  /** Fetch a single document by ID. */
+  getDocument(id: number): Promise<PaperlessDocument>;
+
+  /** Search / list documents with optional filters. */
+  getDocuments(
+    params?: DocumentSearchParams,
+  ): Promise<PaginatedResponse<PaperlessDocument>>;
+
+  /** Fetch correspondents (a single page of the paginated listing). */
+  getCorrespondents(): Promise<PaginatedResponse<Correspondent>>;
+
+  /** Fetch tags (a single page of the paginated listing). */
+  getTags(): Promise<PaginatedResponse<Tag>>;
+
+  /** Fetch document types (a single page of the paginated listing). */
+  getDocumentTypes(): Promise<PaginatedResponse<DocumentType>>;
+
+  /** Download the original file content of a document. */
+  downloadDocument(id: number): Promise<ArrayBuffer>;
+
+  /** Update tags on a document (immutable -- returns the updated doc). */
+  updateDocumentTags(
+    id: number,
+    tagIds: readonly number[],
+  ): Promise<PaperlessDocument>;
+}
+
+// ---------------------------------------------------------------------------
+// Implementation
+// ---------------------------------------------------------------------------
+
+/**
+ * Create a new Paperless-ngx API client.
+ * Every call aborts after `config.timeout` ms (default 30 000) and rejects on non-2xx responses.
+ * @param config - Connection configuration (URL + token).
+ * @returns A {@link PaperlessClient} instance.
+ */
+export function createPaperlessClient(config: PaperlessConfig): PaperlessClient {
+  const { baseUrl, token, timeout = 30_000 } = config;
+
+  const headers: Record<string, string> = {
+    Authorization: `Token ${token}`,
+    "Content-Type": "application/json",
+    Accept: "application/json; version=3",
+  };
+
+  /**
+   * Internal JSON fetch wrapper: applies default headers, the abort timeout and error handling.
+   */
+  async function request<T>(
+    path: string,
+    options: RequestInit = {},
+  ): Promise<T> {
+    const url = `${baseUrl.replace(/\/+$/, "")}/api${path}`;
+    const controller = new AbortController();
+    const timer = setTimeout(() => controller.abort(), timeout);
+
+    try {
+      const response = await fetch(url, {
+        ...options,
+        headers: { ...headers, ...((options.headers as Record<string, string>) ?? {}) },
+        signal: controller.signal,
+      });
+
+      if (!response.ok) {
+        const body = await response.text().catch(() => "");
+        throw new Error(
+          `Paperless API error: ${response.status} ${response.statusText} -- ${body}`,
+        );
+      }
+
+      return (await response.json()) as T;
+    } finally {
+      clearTimeout(timer);
+    }
+  }
+
+  /**
+   * Build query string from search params (null/undefined dropped, arrays comma-joined).
+   */
+  function buildQuery(params?: DocumentSearchParams): string {
+    if (!params) return "";
+    const entries = Object.entries(params).filter(
+      ([, v]) => v !== undefined && v !== null,
+    );
+    if (entries.length === 0) return "";
+    const searchParams = new URLSearchParams();
+    for (const [key, value] of entries) {
+      if (Array.isArray(value)) {
+        searchParams.set(key, value.join(","));
+      } else {
+        searchParams.set(key, String(value));
+      }
+    }
+    return `?${searchParams.toString()}`;
+  }
+
+  return {
+    async getDocument(id) {
+      return request<PaperlessDocument>(`/documents/${id}/`);
+    },
+
+    async getDocuments(params) {
+      return request<PaginatedResponse<PaperlessDocument>>(
+        `/documents/${buildQuery(params)}`,
+      );
+    },
+
+    async getCorrespondents() {
+      return request<PaginatedResponse<Correspondent>>("/correspondents/");
+    },
+
+    async getTags() {
+      return request<PaginatedResponse<Tag>>("/tags/");
+    },
+
+    async getDocumentTypes() {
+      return request<PaginatedResponse<DocumentType>>("/document_types/");
+    },
+
+    async downloadDocument(id) {
+      const url = `${baseUrl.replace(/\/+$/, "")}/api/documents/${id}/download/`;
+      const controller = new AbortController();
+      const timer = setTimeout(() => controller.abort(), timeout);
+
+      try {
+        const response = await fetch(url, {
+          headers: { Authorization: `Token ${token}` },
+          signal: controller.signal,
+        });
+
+        if (!response.ok) {
+          throw new Error(
+            `Paperless download error: ${response.status} ${response.statusText}`,
+          );
+        }
+
+        return await response.arrayBuffer();
+      } finally {
+        clearTimeout(timer);
+      }
+    },
+
+    async updateDocumentTags(id, tagIds) {
+      return request<PaperlessDocument>(`/documents/${id}/`, {
+        method: "PATCH",
+        body: JSON.stringify({ tags: [...tagIds] }),
+      });
+    },
+  };
+}
diff --git a/src/paperless/types.ts b/src/paperless/types.ts
new file mode 100644
index 0000000..1088dda
--- /dev/null
+++ b/src/paperless/types.ts
@@ -0,0 +1,126 @@
+/**
+ * TypeScript type definitions for the Paperless-ngx REST API.
+ *
+ * Based on Paperless-ngx API v3+.
+ * @see https://docs.paperless-ngx.com/api/
+ */
+
+// ---------------------------------------------------------------------------
+// Pagination
+// ---------------------------------------------------------------------------
+
+/** Generic paginated response envelope from Paperless-ngx; `T` is the element type of `results`. */
+export interface PaginatedResponse<T> {
+  readonly count: number;
+  readonly next: string | null; // presumably the URL of the next page, null on the last one -- confirm against the API docs
+  readonly previous: string | null; // presumably the URL of the previous page -- confirm against the API docs
+  readonly results: readonly T[];
+}
+
+// ---------------------------------------------------------------------------
+// Core entities
+// ---------------------------------------------------------------------------
+
+export interface PaperlessDocument {
+ readonly id: number;
+ readonly correspondent: number | null;
+ readonly document_type: number | null;
+ readonly storage_path: number | null;
+ readonly title: string;
+ readonly content: string;
+ readonly tags: readonly number[];
+ readonly created: string;
+ readonly created_date: string;
+ readonly modified: string;
+ readonly added: string;
+ readonly archive_serial_number: number | null;
+ readonly original_file_name: string;
+ readonly archived_file_name: string | null;
+ readonly owner: number | null;
+ readonly notes: readonly DocumentNote[];
+ readonly custom_fields: readonly CustomFieldValue[];
+}
+
+export interface DocumentNote {
+ readonly id: number;
+ readonly note: string;
+ readonly created: string;
+ readonly user: number;
+}
+
+export interface CustomFieldValue {
+ readonly field: number;
+ readonly value: string | number | boolean | null;
+}
+
+export interface Correspondent {
+ readonly id: number;
+ readonly slug: string;
+ readonly name: string;
+ readonly match: string;
+ readonly matching_algorithm: number;
+ readonly is_insensitive: boolean;
+ readonly document_count: number;
+ readonly last_correspondence: string | null;
+}
+
+export interface DocumentType {
+ readonly id: number;
+ readonly slug: string;
+ readonly name: string;
+ readonly match: string;
+ readonly matching_algorithm: number;
+ readonly is_insensitive: boolean;
+ readonly document_count: number;
+}
+
+export interface Tag {
+ readonly id: number;
+ readonly slug: string;
+ readonly name: string;
+ readonly color: string;
+ readonly text_color: string;
+ readonly match: string;
+ readonly matching_algorithm: number;
+ readonly is_insensitive: boolean;
+ readonly is_inbox_tag: boolean;
+ readonly document_count: number;
+}
+
+export interface StoragePath {
+ readonly id: number;
+ readonly slug: string;
+ readonly name: string;
+ readonly path: string;
+ readonly match: string;
+ readonly matching_algorithm: number;
+ readonly is_insensitive: boolean;
+ readonly document_count: number;
+}
+
+// ---------------------------------------------------------------------------
+// Search & filter
+// ---------------------------------------------------------------------------
+
+export interface DocumentSearchParams { // Filters serialized into the `/documents/` query string by the client's `buildQuery`.
+ readonly query?: string;
+ readonly correspondent__id?: number;
+ readonly document_type__id?: number;
+ readonly tags__id__all?: readonly number[]; // array values are sent comma-joined
+ readonly tags__id__none?: readonly number[]; // array values are sent comma-joined
+ readonly created__date__gt?: string; // presumably an ISO date string -- confirm against the Paperless-ngx API
+ readonly created__date__lt?: string; // presumably an ISO date string -- confirm against the Paperless-ngx API
+ readonly ordering?: string;
+ readonly page?: number;
+ readonly page_size?: number;
+}
+
+// ---------------------------------------------------------------------------
+// API client configuration
+// ---------------------------------------------------------------------------
+
+export interface PaperlessConfig { // Connection settings consumed by `createPaperlessClient`.
+ readonly baseUrl: string; // server root; the client strips trailing slashes and appends `/api`
+ readonly token: string; // sent as `Authorization: Token <token>`
+ readonly timeout?: number; // per-request abort timeout in milliseconds; the client defaults to 30_000
+}
diff --git a/src/receipt/datev.ts b/src/receipt/datev.ts
new file mode 100644
index 0000000..6b31516
--- /dev/null
+++ b/src/receipt/datev.ts
@@ -0,0 +1,171 @@
+/**
+ * DATEV export formatter.
+ *
+ * Generates DATEV-compatible CSV files for import into German accounting
+ * software (DATEV Unternehmen Online, lexoffice, sevDesk, etc.).
+ *
+ * Implements the DATEV "Buchungsstapel" (posting batch) format v7.0+.
+ *
+ * @see https://developer.datev.de/datev/platform/en/dtvf/formate
+ *
+ * @example
+ * ```ts
+ * const exporter = createDatevExporter({ consultantNumber: 12345, clientNumber: 67890 });
+ * const csv = exporter.generateCsv(receiptData);
+ * writeFileSync("./export.csv", csv);
+ * ```
+ */
+
+import { stringify } from "csv-stringify/sync";
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+export interface DatevConfig {
+ /** DATEV consultant number (Beraternummer). */
+ readonly consultantNumber: number;
+ /** DATEV client number (Mandantennummer). */
+ readonly clientNumber: number;
+ /** Fiscal year start (1-12, default: 1 for January). */
+ readonly fiscalYearStart?: number;
+ /** Default debit account length (SKR03/SKR04). */
+ readonly accountLength?: 4 | 5;
+}
+
+export interface DatevBookingEntry {
+ readonly amount: number;
+ readonly debitAccount: string;
+ readonly creditAccount: string;
+ readonly taxCode: string;
+ readonly date: string;
+ readonly description: string;
+ readonly documentNumber: string;
+ readonly costCenter?: string;
+}
+
+export interface ReceiptForExport {
+ readonly documentId: number;
+ readonly vendor: string;
+ readonly date: string;
+ readonly totalAmount: number;
+ readonly taxRate: number | null;
+ readonly category: string | null;
+}
+
+export interface DatevExporter {
+ /** Generate DATEV CSV from receipt data. */
+ generateCsv(receipts: readonly ReceiptForExport[]): string;
+
+ /** Map a receipt to a DATEV booking entry. */
+ mapToBooking(receipt: ReceiptForExport): DatevBookingEntry;
+}
+
+// ---------------------------------------------------------------------------
+// Constants
+// ---------------------------------------------------------------------------
+
+/**
+ * Map expense categories to SKR03 accounts; `default` is the fallback for unknown categories.
+ * TODO: Add SKR04 mapping support
+ * TODO: Make configurable via user settings
+ */
+const SKR03_ACCOUNT_MAP: Record<string, string> = {
+  office_supplies: "4930",
+  travel: "4660",
+  food: "4650",
+  telephone: "4920",
+  postage: "4910",
+  insurance: "4360",
+  rent: "4210",
+  advertising: "4600",
+  software: "4964",
+  hardware: "4980",
+  consulting: "4950",
+  training: "4945",
+  vehicle: "4500",
+  default: "4900",
+};
+
+/**
+ * Map whole-percent tax rates to DATEV tax codes (Steuerschluessel).
+ */
+const TAX_CODE_MAP: Record<number, string> = {
+  19: "9", // 19% USt (standard)
+  7: "8", // 7% USt (reduced)
+  0: "0", // Tax-free
+};
+
+// ---------------------------------------------------------------------------
+// Implementation
+// ---------------------------------------------------------------------------
+
+/**
+ * Create a DATEV-format exporter for receipt data.
+ * Unknown categories use the `default` account; missing or unmapped tax rates use the 19% code.
+ * TODO: Implement DATEV header line with metadata (consultant, client, date range)
+ * TODO: Add validation for account numbers against SKR03/SKR04
+ * TODO: Support DATEV XML format (Buchungsdaten v5.0)
+ */
+export function createDatevExporter(config: DatevConfig): DatevExporter {
+  const {
+    consultantNumber: _consultantNumber,
+    clientNumber: _clientNumber,
+    fiscalYearStart: _fiscalYearStart = 1,
+    accountLength: _accountLength = 4,
+  } = config;
+
+  function mapToBooking(receipt: ReceiptForExport): DatevBookingEntry {
+    const category = receipt.category ?? "default";
+    const debitAccount =
+      SKR03_ACCOUNT_MAP[category] ?? SKR03_ACCOUNT_MAP["default"];
+
+    const taxRate = receipt.taxRate ?? 19;
+    const taxCode = TAX_CODE_MAP[taxRate] ?? TAX_CODE_MAP[19];
+
+    // Emit the document date as DDMM (day then month, no separator, no year). ISO
+    // dates with or without a time suffix are accepted; anything else passes through.
+    const iso = /^\d{4}-(\d{1,2})-(\d{1,2})(?!\d)/.exec(receipt.date.trim());
+    const datevDate = iso
+      ? `${iso[2]}`.padStart(2, "0") + `${iso[1]}`.padStart(2, "0")
+      : receipt.date;
+
+    return {
+      amount: receipt.totalAmount,
+      debitAccount,
+      creditAccount: "1200", // Bank account (SKR03 default)
+      taxCode,
+      date: datevDate,
+      description: receipt.vendor.slice(0, 60), // DATEV max 60 chars
+      documentNumber: `PC-${receipt.documentId}`,
+      costCenter: undefined,
+    };
+  }
+
+  function generateCsv(receipts: readonly ReceiptForExport[]): string {
+    const bookings = receipts.map(mapToBooking);
+
+    // DATEV Buchungsstapel columns
+    const rows = bookings.map((b) => [
+      b.amount.toFixed(2).replace(".", ","), // Umsatz (amount with comma)
+      "S", // Soll/Haben (S = Soll/Debit)
+      b.taxCode, // BU-Schluessel (tax code)
+      b.debitAccount, // Gegenkonto (offset account)
+      b.date, // Belegdatum (document date)
+      b.documentNumber, // Belegfeld 1 (document number)
+      "", // Belegfeld 2
+      b.description, // Buchungstext (description)
+      "", // Umsatzsteuer-ID
+      b.creditAccount, // Konto (account)
+      b.costCenter ?? "", // Kostenstelle (cost center)
+    ]);
+
+    return stringify(rows, {
+      delimiter: ";",
+      quoted: true,
+      record_delimiter: "\r\n",
+    });
+  }
+
+  return { generateCsv, mapToBooking };
+}
diff --git a/src/receipt/extractor.ts b/src/receipt/extractor.ts
new file mode 100644
index 0000000..36877fb
--- /dev/null
+++ b/src/receipt/extractor.ts
@@ -0,0 +1,170 @@
+/**
+ * Receipt data extraction using local LLM via Ollama.
+ *
+ * Extracts structured data from receipt documents: vendor, date, amounts,
+ * tax breakdown, line items, and payment method. Uses the Paperless-ngx
+ * OCR content and enriches it with LLM analysis.
+ *
+ * @example
+ * ```ts
+ * const extractor = createReceiptExtractor({ ollama, paperless });
+ * const receipt = await extractor.extract(documentId);
+ * console.log(receipt.vendor, receipt.totalAmount, receipt.taxAmount);
+ * ```
+ */
+
+import type { OllamaClient } from "../embeddings/ollama.js";
+import type { PaperlessClient } from "../paperless/client.js";
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+export interface ReceiptData {
+ readonly documentId: number;
+ readonly vendor: string;
+ readonly vendorAddress: string | null;
+ readonly vendorTaxId: string | null;
+ readonly date: string;
+ readonly currency: string;
+ readonly subtotal: number | null;
+ readonly taxRate: number | null;
+ readonly taxAmount: number | null;
+ readonly totalAmount: number;
+ readonly paymentMethod: string | null;
+ readonly lineItems: readonly LineItem[];
+ readonly category: string | null;
+ readonly confidence: number;
+ readonly rawText: string;
+}
+
+export interface LineItem {
+ readonly description: string;
+ readonly quantity: number;
+ readonly unitPrice: number;
+ readonly totalPrice: number;
+ readonly taxRate: number | null;
+}
+
+export interface ReceiptExtractorConfig {
+ readonly ollama: OllamaClient;
+ readonly paperless: PaperlessClient;
+}
+
+export interface ReceiptExtractor {
+  /** Extract structured receipt data from a Paperless-ngx document. */
+  extract(documentId: number): Promise<ReceiptData>;
+
+  /** Batch-extract receipts from multiple documents; results follow the order of `documentIds`. */
+  extractBatch(documentIds: readonly number[]): Promise<ReceiptData[]>;
+}
+
+// ---------------------------------------------------------------------------
+// Prompts
+// ---------------------------------------------------------------------------
+
+const EXTRACTION_SYSTEM_PROMPT = `You are a receipt data extraction assistant. Given the OCR text of a receipt, extract structured data in JSON format.
+
+Extract the following fields:
+- vendor: Company/store name
+- vendorAddress: Full address if visible
+- vendorTaxId: Tax ID / VAT number if visible (e.g., USt-IdNr, Steuernummer)
+- date: Date in ISO 8601 format (YYYY-MM-DD)
+- currency: ISO 4217 currency code (e.g., EUR, USD)
+- subtotal: Amount before tax (null if not distinguishable)
+- taxRate: Tax percentage as a plain number (e.g., 19 for 19%, not 0.19)
+- taxAmount: Tax amount
+- totalAmount: Total amount including tax
+- paymentMethod: Payment method if visible (cash, card, etc.)
+- lineItems: Array of { description, quantity, unitPrice, totalPrice, taxRate }
+- category: Suggested expense category (office_supplies, travel, food, etc.)
+- confidence: Your confidence in the extraction (0.0 to 1.0)
+
+Respond ONLY with valid JSON. No explanation, no markdown.`; // taxRate wording must stay in whole percent: the DATEV tax-code map is keyed by 19 / 7 / 0
+
+// ---------------------------------------------------------------------------
+// Implementation
+// ---------------------------------------------------------------------------
+
+/**
+ * Create a receipt data extractor.
+ * `extract` throws when the document has no OCR text or the LLM reply is not a JSON object.
+ * TODO: Add support for image-based receipts (pass images to multimodal LLM)
+ * TODO: Add receipt template matching for common vendors
+ * TODO: Add currency conversion support
+ */
+export function createReceiptExtractor(
+  config: ReceiptExtractorConfig,
+): ReceiptExtractor {
+  const { ollama, paperless } = config;
+
+  async function extractSingle(documentId: number): Promise<ReceiptData> {
+    // Fetch the document content from Paperless-ngx
+    const document = await paperless.getDocument(documentId);
+    const ocrText = document.content;
+
+    if (!ocrText || ocrText.trim().length === 0) {
+      throw new Error(
+        `Document ${documentId} has no OCR content. Ensure Paperless-ngx has processed the document.`,
+      );
+    }
+
+    // Send to Ollama for structured extraction
+    const prompt = `Extract receipt data from the following OCR text:\n\n---\n${ocrText}\n---`;
+    const completion = await ollama.complete(prompt, EXTRACTION_SYSTEM_PROMPT);
+
+    // Parse LLM response
+    // TODO: Add robust JSON extraction (handle markdown code blocks, partial JSON)
+    // TODO: Validate against Zod schema for type safety
+    let parsed: Record<string, unknown>;
+    try {
+      parsed = JSON.parse(completion.text);
+      if (parsed === null || typeof parsed !== "object") throw new TypeError("not an object");
+    } catch { // also reached for valid JSON that is not an object (null, number, string)
+      throw new Error(
+        `Failed to parse receipt extraction result for document ${documentId}. LLM response was not valid JSON.`,
+      );
+    }
+
+    // Missing or mistyped fields fall back to defaults instead of failing the extraction
+    return {
+      documentId,
+      vendor: String(parsed.vendor ?? "Unknown"),
+      vendorAddress: parsed.vendorAddress ? String(parsed.vendorAddress) : null,
+      vendorTaxId: parsed.vendorTaxId ? String(parsed.vendorTaxId) : null,
+      date: String(parsed.date ?? new Date().toISOString().split("T")[0]),
+      currency: String(parsed.currency ?? "EUR"),
+      subtotal: typeof parsed.subtotal === "number" ? parsed.subtotal : null,
+      taxRate: typeof parsed.taxRate === "number" ? parsed.taxRate : null,
+      taxAmount: typeof parsed.taxAmount === "number" ? parsed.taxAmount : null,
+      totalAmount: typeof parsed.totalAmount === "number" ? parsed.totalAmount : 0,
+      paymentMethod: parsed.paymentMethod ? String(parsed.paymentMethod) : null,
+      lineItems: Array.isArray(parsed.lineItems)
+        ? parsed.lineItems.map((item: Record<string, unknown>) => ({
+            description: String(item.description ?? ""),
+            quantity: Number(item.quantity ?? 1),
+            unitPrice: Number(item.unitPrice ?? 0),
+            totalPrice: Number(item.totalPrice ?? 0),
+            taxRate: typeof item.taxRate === "number" ? item.taxRate : null,
+          }))
+        : [],
+      category: parsed.category ? String(parsed.category) : null,
+      confidence: typeof parsed.confidence === "number" ? parsed.confidence : 0.5,
+      rawText: ocrText,
+    };
+  }
+
+  return {
+    extract: extractSingle,
+    // Documents are processed one at a time; the first failure rejects the whole batch
+    async extractBatch(documentIds) {
+      // TODO: Add concurrency control (process N at a time)
+      // TODO: Add progress reporting callback
+      const results: ReceiptData[] = [];
+      for (const id of documentIds) {
+        const result = await extractSingle(id);
+        results.push(result);
+      }
+      return results;
+    },
+  };
+}
diff --git a/src/receipt/matcher.ts b/src/receipt/matcher.ts
new file mode 100644
index 0000000..61fd4ee
--- /dev/null
+++ b/src/receipt/matcher.ts
@@ -0,0 +1,231 @@
+/**
+ * Bank CSV transaction matching for receipts.
+ *
+ * Matches extracted receipt data against bank CSV exports to reconcile
+ * transactions. Rows are read with one generic semicolon-delimited column
+ * mapping; bank-specific formats (Sparkasse, Volksbank, ING, DKB) are still TODO.
+ *
+ * @example
+ * ```ts
+ * const matcher = createTransactionMatcher();
+ * const bankTxns = matcher.parseBankCsv("./bank_export.csv");
+ * const matches = matcher.matchReceipts(receipts, bankTxns);
+ * ```
+ */
+
+import { parse } from "csv-parse/sync";
+import { readFileSync } from "node:fs";
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+export interface BankTransaction { // one CSV record as mapped by parseBankCsv
+ readonly date: string; // copied verbatim from Buchungstag / Date / Datum; "" if none is present
+ readonly description: string; // Verwendungszweck / Description / Buchungstext; "" if none is present
+ readonly amount: number; // parsed from Betrag / Amount; NaN when the cell is not numeric
+ readonly currency: string; // Waehrung / Currency, defaulting to "EUR"
+ readonly iban: string | null; // null when the record has no IBAN field
+ readonly bic: string | null; // null when the record has no BIC field
+ readonly reference: string | null; // Kundenreferenz / Reference, else null
+ readonly rawLine: string; // JSON.stringify of the parsed record, not the original CSV text
+}
+
+export interface ReceiptMatchCandidate { // receipt fields consumed by matchReceipts
+ readonly documentId: number;
+ readonly vendor: string; // its first 8 characters, lowercased, are searched in the description
+ readonly date: string; // compared against the transaction date
+ readonly totalAmount: number; // compared against the absolute value of the transaction amount
+ readonly currency: string; // NOTE(review): not compared by matchReceipts
+}
+
+export interface MatchResult { // one receipt paired with one bank transaction
+ readonly receipt: ReceiptMatchCandidate;
+ readonly transaction: BankTransaction;
+ readonly confidence: number; // sum of amount, date and vendor scores; matchReceipts only emits values > 0.5
+ readonly matchReasons: readonly string[]; // tags such as "exact_amount_match", "same_day", "vendor_in_description"
+}
+
+export interface UnmatchedItem { // NOTE(review): not referenced elsewhere in this file
+ readonly type: "receipt" | "transaction"; // presumably tells which kind of value `item` holds
+ readonly item: ReceiptMatchCandidate | BankTransaction;
+}
+
+export interface MatchSummary { // result of matchReceipts
+ readonly matched: readonly MatchResult[]; // at most one entry per receipt and per transaction
+ readonly unmatchedReceipts: readonly ReceiptMatchCandidate[];
+ readonly unmatchedTransactions: readonly BankTransaction[];
+ readonly matchRate: number; // matched.length / receipts.length, or 0 when no receipts were given
+}
+
+export interface TransactionMatcher {
+ /** Synchronously read and parse a bank CSV export; the implementation in this file does not act on `format` yet. */
+ parseBankCsv(filePath: string, format?: BankCsvFormat): readonly BankTransaction[];
+
+ /** Pair each receipt with at most one bank transaction and report what is left over on both sides. */
+ matchReceipts(
+ receipts: readonly ReceiptMatchCandidate[],
+ transactions: readonly BankTransaction[],
+ ): MatchSummary;
+}
+
+export type BankCsvFormat = "auto" | "sparkasse" | "ing" | "dkb" | "volksbank" | "generic"; // not yet acted on by parseBankCsv
+
+// ---------------------------------------------------------------------------
+// Implementation
+// ---------------------------------------------------------------------------
+
+/**
+ * Create a transaction matcher for bank CSV reconciliation.
+ *
+ * TODO: Add ML-based fuzzy matching for vendor names
+ * TODO: Add support for MT940/CAMT.053 bank statement formats
+ * TODO: Add date tolerance configuration (match within N days)
+ */
+export function createTransactionMatcher(): TransactionMatcher {
+ /**
+  * Read a semicolon-delimited bank CSV export and map each record to a transaction.
+  *
+  * @param filePath - CSV file, read synchronously as UTF-8.
+  * @param format - Accepted for API compatibility; format detection is not implemented yet.
+  * @returns One transaction per record; `amount` is NaN when the cell is not numeric.
+  */
+ function parseBankCsv(
+   filePath: string,
+   format: BankCsvFormat = "auto",
+ ): readonly BankTransaction[] {
+   // TODO: Implement format auto-detection based on header patterns
+   // TODO: Add support for different CSV delimiters (semicolon for German exports)
+   // TODO: Handle different date formats (DD.MM.YYYY, YYYY-MM-DD, MM/DD/YYYY)
+   void format; // Reserved for future use; an unused dummy local would be reported by noUnusedLocals.
+
+   // The later of "," / "." is the decimal mark; a dot before exactly three final
+   // digits is still read as a thousands separator ("1.234" -> 1234).
+   const parseAmount = (text: string): number => {
+     const value = text.trim();
+     const dotIsDecimal = value.lastIndexOf(".") > value.lastIndexOf(",") && !/\.\d{3}$/.test(value);
+     return parseFloat(dotIsDecimal ? value.replace(/,/g, "") : value.replace(/\./g, "").replace(",", "."));
+   };
+
+   const records = parse(readFileSync(filePath, "utf-8"), {
+     bom: true, // strip a UTF-8 byte order mark so the first header keeps its real name
+     columns: true,
+     skip_empty_lines: true,
+     delimiter: ";",
+     relaxColumnCount: true,
+   }) as Record<string, string>[];
+
+   // Generic column mapping -- override per format
+   // TODO: Implement format-specific column mappings
+   return records.map((record): BankTransaction => ({
+     date: record["Buchungstag"] ?? record["Date"] ?? record["Datum"] ?? "",
+     description: record["Verwendungszweck"] ?? record["Description"] ?? record["Buchungstext"] ?? "",
+     amount: parseAmount(record["Betrag"] ?? record["Amount"] ?? "0"),
+     currency: record["Waehrung"] ?? record["Currency"] ?? "EUR",
+     iban: record["IBAN"] ?? null,
+     bic: record["BIC"] ?? null,
+     reference: record["Kundenreferenz"] ?? record["Reference"] ?? null,
+     rawLine: JSON.stringify(record),
+   }));
+ }
+
+ /**
+  * Match receipts against bank transactions by amount, date and vendor.
+  *
+  * Greedy and one-to-one: receipts are taken in input order and each claims the
+  * best-scoring transaction that is still free. Scores: +0.5 for an identical
+  * amount or +0.3 within 1.00; +0.3 / +0.15 / +0.05 for dates under 1 / 3 / 7
+  * days apart; +0.2 when the first 8 characters of the vendor occur in the
+  * description. A pairing is accepted only if its total exceeds 0.5.
+  *
+  * @returns Matched pairs, leftovers on both sides, and the matched / receipts ratio.
+  */
+ function matchReceipts(
+   receipts: readonly ReceiptMatchCandidate[],
+   transactions: readonly BankTransaction[],
+ ): MatchSummary {
+   const MS_PER_DAY = 1000 * 60 * 60 * 24;
+   const matched: MatchResult[] = [];
+   // Keyed by position, not documentId, so duplicate ids cannot hide a leftover receipt.
+   const matchedReceiptIndices = new Set<number>();
+   const matchedTxnIndices = new Set<number>();
+
+   // TODO: Implement smarter matching with vendor name fuzzy matching
+   // TODO: Add configurable date tolerance window
+   // TODO: Handle split transactions (one receipt, multiple bank entries)
+
+   // Epoch ms for German DD.MM.YY(YY) dates, else whatever Date parses; NaN if invalid.
+   const toTime = (value: string): number => {
+     const german = /^(\d{1,2})\.(\d{1,2})\.(\d{2}|\d{4})$/.exec(value.trim());
+     if (!german) return new Date(value).getTime();
+     const [, day, month, year] = german;
+     const fullYear = year.length === 2 ? 2000 + Number(year) : Number(year);
+     return Date.UTC(fullYear, Number(month) - 1, Number(day));
+   };
+
+   for (const [receiptIndex, receipt] of receipts.entries()) {
+     const receiptTime = toTime(receipt.date); // constant per receipt, so parsed once
+     const vendorNeedle = receipt.vendor.toLowerCase().slice(0, 8);
+     let bestMatch: { index: number; confidence: number; reasons: string[] } | null = null;
+
+     for (let i = 0; i < transactions.length; i++) {
+       if (matchedTxnIndices.has(i)) continue;
+
+       const txn = transactions[i];
+       const reasons: string[] = [];
+       let confidence = 0;
+
+       // Whole-cent comparison (sign ignored): float noise must not make a one-cent gap look exact.
+       const diffCents = Math.round(Math.abs(Math.abs(txn.amount) - receipt.totalAmount) * 100);
+       if (diffCents === 0) {
+         confidence += 0.5;
+         reasons.push("exact_amount_match");
+       } else if (diffCents < 100) {
+         confidence += 0.3;
+         reasons.push("close_amount_match");
+       }
+
+       // Date matching; an unparseable date gives NaN, which fails every comparison.
+       const daysDiff = Math.abs(receiptTime - toTime(txn.date)) / MS_PER_DAY;
+       if (daysDiff < 1) {
+         confidence += 0.3;
+         reasons.push("same_day");
+       } else if (daysDiff < 3) {
+         confidence += 0.15;
+         reasons.push("within_3_days");
+       } else if (daysDiff < 7) {
+         confidence += 0.05;
+         reasons.push("within_7_days");
+       }
+
+       // Vendor prefix in description; blank prefixes are skipped because includes("") is always true.
+       if (vendorNeedle.trim() !== "" && txn.description.toLowerCase().includes(vendorNeedle)) {
+         confidence += 0.2;
+         reasons.push("vendor_in_description");
+       }
+
+       if (confidence > 0.5 && (!bestMatch || confidence > bestMatch.confidence)) {
+         bestMatch = { index: i, confidence, reasons };
+       }
+     }
+
+     if (bestMatch) {
+       matched.push({
+         receipt,
+         transaction: transactions[bestMatch.index],
+         confidence: bestMatch.confidence,
+         matchReasons: bestMatch.reasons,
+       });
+       matchedReceiptIndices.add(receiptIndex);
+       matchedTxnIndices.add(bestMatch.index);
+     }
+   }
+
+   const unmatchedReceipts = receipts.filter((_, i) => !matchedReceiptIndices.has(i));
+   const unmatchedTransactions = transactions.filter((_, i) => !matchedTxnIndices.has(i));
+   const matchRate = receipts.length > 0 ? matched.length / receipts.length : 0;
+   return { matched, unmatchedReceipts, unmatchedTransactions, matchRate };
+ }
+
+ return { parseBankCsv, matchReceipts };
+}
diff --git a/src/skill/SKILL.md b/src/skill/SKILL.md
new file mode 100644
index 0000000..d5cc5eb
--- /dev/null
+++ b/src/skill/SKILL.md
@@ -0,0 +1,72 @@
+# PaperCortex -- Document Intelligence Skill
+
+> A Claude Code skill for interacting with your Paperless-ngx document archive through AI-powered semantic search, classification, receipt extraction, and accounting export.
+
+## Prerequisites
+
+- PaperCortex MCP Server running (see project README)
+- Paperless-ngx instance with API access
+- Ollama with `qwen2.5:14b` and `nomic-embed-text` models
+
+## Available Tools
+
+### papercortex_search
+Search documents by meaning, not just keywords.
+
+```
+Search for: "office lease agreements from last year"
+Search for: "tax-relevant receipts over 500 EUR"
+Search for: "correspondence with insurance companies"
+```
+
+### papercortex_classify
+Auto-classify a document with AI-suggested tags, type, and correspondent.
+
+```
+Classify document #1234
+Classify document #1234 and apply suggested tags
+```
+
+### papercortex_receipt
+Extract structured data from receipt documents.
+
+```
+Extract receipt from document #5678
+```
+
+Returns: vendor, date, amounts, tax breakdown, line items, category.
+
+### papercortex_query
+Ask natural language questions about your document archive.
+
+```
+"How much did I spend on office supplies in Q1 2024?"
+"Which invoices are still unpaid?"
+"Summarize all contracts expiring this year"
+```
+
+### papercortex_export
+Export receipt data for accounting software.
+
+```
+Export documents #100, #101, #102 as DATEV CSV
+Export documents #200, #201 as generic CSV
+```
+
+## Workflow Examples
+
+### Monthly Bookkeeping
+1. Search for all receipts from the current month
+2. Extract data from each receipt
+3. Export as DATEV CSV
+4. Import into accounting software
+
+### Document Organization
+1. Find unclassified documents (no tags)
+2. Auto-classify each document
+3. Review and approve suggested tags
+
+### Expense Analysis
+1. Query: "What were my top 5 expense categories last quarter?"
+2. Drill into specific categories with follow-up queries
+3. Export relevant receipts for documentation
diff --git a/tsconfig.json b/tsconfig.json
new file mode 100644
index 0000000..a80386d
--- /dev/null
+++ b/tsconfig.json
@@ -0,0 +1,24 @@
+{
+ "compilerOptions": {
+ "target": "ES2022",
+ "module": "ESNext",
+ "moduleResolution": "bundler",
+ "lib": ["ES2022"],
+ "outDir": "./dist",
+ "rootDir": "./src",
+ "strict": true,
+ "esModuleInterop": true,
+ "skipLibCheck": true,
+ "forceConsistentCasingInFileNames": true,
+ "resolveJsonModule": true,
+ "declaration": true,
+ "declarationMap": true,
+ "sourceMap": true,
+ "noUnusedLocals": true,
+ "noUnusedParameters": true,
+ "noImplicitReturns": true,
+ "noFallthroughCasesInSwitch": true
+ },
+ "include": ["src/**/*"],
+ "exclude": ["node_modules", "dist", "**/*.test.ts"]
+}