commit 2052d87ba11274507aaf48dc2542cd6f5d66c2e4 Author: Rene Fichtmueller Date: Thu Mar 26 06:28:48 2026 +1300 feat: initial release — AI document intelligence for Paperless-ngx PaperCortex adds semantic search, auto-classification, receipt extraction, bank statement matching, and DATEV export to Paperless-ngx — powered entirely by local AI through Ollama. Exposes everything as an MCP Server for Claude Code and AI agent integration. - MCP Server with 5 tools (search, classify, receipt, query, export) - Local Ollama embeddings for semantic document search - Receipt data extraction (vendor, amount, date, tax, line items) - DATEV Buchungsstapel CSV export for German accounting - Bank CSV transaction matching - Paperless-ngx REST API client - Docker deployment - Zero cloud dependencies — 100% self-hosted diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..670512c --- /dev/null +++ b/.env.example @@ -0,0 +1,20 @@ +# PaperCortex Configuration +# Copy this file to .env and fill in your values + +# Paperless-ngx connection +PAPERLESS_URL=http://localhost:8000 +PAPERLESS_TOKEN=your-paperless-api-token-here + +# Ollama connection +OLLAMA_URL=http://localhost:11434 +OLLAMA_MODEL=qwen2.5:14b +OLLAMA_EMBEDDING_MODEL=nomic-embed-text + +# Vector store +VECTOR_DB_PATH=./data/vectors.db + +# MCP Server +MCP_SERVER_PORT=3100 + +# Logging +LOG_LEVEL=info diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..68acf58 --- /dev/null +++ b/.gitignore @@ -0,0 +1,35 @@ +# Dependencies +node_modules/ + +# Build output +dist/ + +# Environment files +.env +.env.local +.env.*.local + +# Data directory (vectors, cache) +data/ + +# OS files +.DS_Store +Thumbs.db + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# Logs +logs/ +*.log +npm-debug.log* + +# Test coverage +coverage/ + +# Temporary files +tmp/ +temp/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..149e5f8 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,34 @@ +FROM node:22-alpine 
AS builder + +WORKDIR /app + +COPY package.json package-lock.json* ./ +RUN npm ci + +COPY tsconfig.json ./ +COPY src/ ./src/ +RUN npm run build + +# --- Production image --- +FROM node:22-alpine + +WORKDIR /app + +RUN addgroup -g 1001 -S papercortex && \ + adduser -S papercortex -u 1001 + +COPY package.json package-lock.json* ./ +RUN npm ci --omit=dev && npm cache clean --force + +COPY --from=builder /app/dist ./dist + +RUN mkdir -p /app/data && chown papercortex:papercortex /app/data + +USER papercortex + +ENV NODE_ENV=production +ENV VECTOR_DB_PATH=/app/data/vectors.db + +EXPOSE 3100 + +CMD ["node", "dist/mcp-server/index.js"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a2b6e2c --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 PaperCortex Contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..f8fa4e7 --- /dev/null +++ b/README.md @@ -0,0 +1,737 @@ +

+ PaperCortex Logo +

PaperCortex

+

+ AI-Powered Document Intelligence for Paperless-ngx
+ Semantic search, auto-classification, receipt extraction, and accounting export — 100% local, 100% private. +

+

+ Docker + MIT License + TypeScript + Ollama + MCP Server + Paperless-ngx + DATEV Export + Privacy First +

+

+ Quick Start · Features · MCP Tools · Receipts · Docs +

+

+ +--- + +## What is PaperCortex? + +**PaperCortex** turns your [Paperless-ngx](https://github.com/paperless-ngx/paperless-ngx) document archive into an intelligent, queryable knowledge base — powered entirely by local AI running on your own hardware. + +If you use Paperless-ngx to store invoices, receipts, contracts, tax documents, letters, or any other scanned paperwork, PaperCortex adds the intelligence layer that Paperless-ngx is missing: + +- **Ask questions in plain English** — "Show me all invoices from Amazon over 100 EUR in 2025" +- **Find documents by meaning**, not just keywords — searching for "office rent" finds "Bueromiete" and "monthly lease payment" +- **Auto-tag and classify** every new document the moment it arrives +- **Extract structured data from receipts** — vendor, date, amount, tax rate, line items +- **Match receipts to bank transactions** automatically +- **Export to DATEV** for your German tax advisor — or plain CSV for any accounting software + +Everything runs locally through [Ollama](https://ollama.com). No document content ever leaves your network. No cloud APIs. No subscriptions. No data harvesting. + +PaperCortex exposes all capabilities as an **[MCP (Model Context Protocol)](https://modelcontextprotocol.io) Server**, making it a first-class tool for [Claude Code](https://docs.anthropic.com/en/docs/claude-code), AI coding agents, and automated workflows. + +--- + +## The Problem + +Paperless-ngx is an outstanding document management system with 37,000+ GitHub stars. It handles scanning, OCR, storage, and basic tagging beautifully. 
But once your documents are in Paperless-ngx, finding and working with them has real limitations: + +| What you want to do | Paperless-ngx alone | With PaperCortex | +|---|---|---| +| Find a document by what it's about | Keyword search only — misses synonyms, translations, related concepts | **Semantic search** understands meaning across languages | +| Classify incoming documents | Manual rules or basic auto-matching | **LLM-powered classification** understands document content | +| Extract data from a receipt | Read it yourself and type it in | **Automatic extraction** of vendor, amount, date, tax, line items | +| Answer "How much did I spend on X?" | Export everything, open spreadsheet, filter manually | **Natural language query** returns the answer instantly | +| Send receipt data to accounting | Manual data entry or copy-paste | **One-click DATEV/CSV export** ready for your tax advisor | +| Use documents in AI workflows | No API integration for AI agents | **Full MCP Server** for Claude Code and any MCP-compatible agent | +| Keep data private | Self-hosted (good!) | Self-hosted AI too — **zero cloud dependency** | + +--- + +## Features + +### Semantic Document Search + +Traditional keyword search fails when you don't remember the exact words. PaperCortex generates vector embeddings for every document using local Ollama models and stores them in a lightweight SQLite vector database. 
+ +**Search by meaning, not by memory:** +- Search for `"electricity bill"` → finds documents containing "Stromrechnung", "utility payment", "power invoice" +- Search for `"office supplies"` → finds "Bueroausstattung", "paper and toner", "desk accessories order" +- Search for `"tax deductible travel"` → finds flight bookings, hotel receipts, train tickets, taxi invoices + +**Supported embedding models:** +- `nomic-embed-text` (recommended — fast, accurate, 768 dimensions) +- `mxbai-embed-large` (higher accuracy, slower) +- Any Ollama-compatible embedding model + +### Automatic Document Classification + +Every new document arriving in Paperless-ngx gets analyzed by a local LLM that reads the OCR content and assigns: + +- **Document type** — Invoice, Receipt, Contract, Letter, Statement, Tax Document, Certificate +- **Tags** — Contextual tags based on content (e.g., "office", "travel", "insurance", "subscription") +- **Correspondent** — Identifies the sender/vendor from document content +- **Date extraction** — Finds the document date (not just the scan date) +- **Language detection** — Identifies the document language + +Classification runs asynchronously in the background. New documents are processed within minutes of arriving in Paperless-ngx. 
+ +### Receipt Intelligence + +PaperCortex includes a dedicated receipt processing pipeline optimized for expense management: + +**Data extraction from receipts and invoices:** +- Vendor / merchant name and address +- Date of purchase +- Total amount (gross and net) +- Tax rate and tax amount (supports multiple VAT rates) +- Currency +- Individual line items with quantities and prices +- Payment method +- Invoice/receipt number + +**Works with:** +- Scanned paper receipts (via Paperless-ngx OCR) +- Digital PDF invoices +- Photographed receipts (mobile upload to Paperless-ngx) +- Multi-page invoices +- Receipts in German, English, French, Spanish, and other languages + +### Bank Statement Matching + +Import your bank statement as CSV and let PaperCortex automatically match transactions to receipts: + +- **Fuzzy matching** on amount, date, and vendor name +- **Confidence scoring** — high/medium/low match indicators +- **Unmatched detection** — highlights receipts without matching transactions and vice versa +- **Multi-currency support** — handles EUR, USD, GBP, CHF, and 20+ currencies + +### DATEV Export + +For German businesses and freelancers, PaperCortex generates DATEV-compatible export files that your Steuerberater can import directly: + +- **DATEV CSV format** (Buchungsstapel) — the standard German accounting import format +- **SKR03 / SKR04** account mapping +- **Automatic account assignment** based on document classification +- **Beleglink** — links each DATEV entry back to the original document in Paperless-ngx +- **Period exports** — monthly, quarterly, or annual + +Also supports plain CSV export for use with any accounting software worldwide. + +### Natural Language Queries + +Ask questions about your document archive in plain language: + +``` +"How much did I spend on hotels in Q1 2025?" +"Show me all contracts expiring this year" +"What was my highest single expense last month?" 
+"Find all invoices from Deutsche Telekom" +"Which receipts don't have a matching bank transaction?" +"Summarize my office supply spending trend over the last 12 months" +``` + +PaperCortex translates natural language into document queries, retrieves relevant documents via semantic search, and uses the local LLM to synthesize answers with source references. + +### MCP Server Integration + +PaperCortex implements the [Model Context Protocol (MCP)](https://modelcontextprotocol.io) — the open standard for connecting AI agents to external tools. This means any MCP-compatible AI agent can use your document archive as a knowledge source. + +**Compatible with:** +- [Claude Code](https://docs.anthropic.com/en/docs/claude-code) (Anthropic) +- [Claude Desktop](https://claude.ai) +- Any MCP-compatible AI agent or IDE plugin +- Custom AI workflows via the MCP SDK + +--- + +## Feature Comparison + +| Feature | PaperCortex | paperless-ai | Veryfi | Taggun | Rossum | +|---|:---:|:---:|:---:|:---:|:---:| +| Fully self-hosted | :white_check_mark: | :white_check_mark: | :x: | :x: | :x: | +| Local AI (no cloud API) | :white_check_mark: | :x: OpenAI | :x: | :x: | :x: | +| Semantic search | :white_check_mark: | :x: | :x: | :x: | :x: | +| Auto-classification | :white_check_mark: | :white_check_mark: | :x: | :x: | :white_check_mark: | +| Receipt data extraction | :white_check_mark: | :x: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| Bank statement matching | :white_check_mark: | :x: | :x: | :x: | :x: | +| DATEV export | :white_check_mark: | :x: | :x: | :x: | :x: | +| CSV accounting export | :white_check_mark: | :x: | :white_check_mark: | :x: | :white_check_mark: | +| MCP Server | :white_check_mark: | :x: | :x: | :x: | :x: | +| Natural language queries | :white_check_mark: | :x: | :x: | :x: | :x: | +| Multi-language documents | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| Free and open source | 
:white_check_mark: | :white_check_mark: | :x: $$$ | :x: $$$ | :x: $$$$ | +| Privacy — data stays local | :white_check_mark: | :warning: API calls | :x: | :x: | :x: | +| Works with Paperless-ngx | :white_check_mark: | :white_check_mark: | :x: | :x: | :x: | + +--- + +## Architecture + +``` +┌─────────────────────┐ ┌──────────────────────────┐ ┌────────────────────┐ +│ │ │ │ │ │ +│ Claude Code / │ MCP │ PaperCortex │ REST │ Paperless-ngx │ +│ AI Agents / ├────────►│ ├────────►│ │ +│ Automation │ │ ┌──────────────────┐ │ API │ OCR + Storage + │ +│ │ │ │ MCP Server │ │ │ Tagging │ +└─────────────────────┘ │ │ (stdio / HTTP) │ │ │ │ + │ └──────────────────┘ │ └────────────────────┘ + │ │ + │ ┌──────────────────┐ │ ┌────────────────────┐ + │ │ Intelligence │ │ │ │ + │ │ Layer │ │ LLM │ Ollama │ + │ │ ├────────────►│ │ + │ │ - Classifier │ │ API │ qwen2.5 / llama3 │ + │ │ - Extractor │ │ │ nomic-embed-text │ + │ │ - Query Engine │ │ │ │ + │ └──────────────────┘ │ └────────────────────┘ + │ │ + │ ┌──────────────────┐ │ + │ │ Vector Store │ │ + │ │ (SQLite + HNSW) │ │ + │ └──────────────────┘ │ + │ │ + └──────────────────────────┘ +``` + +### How It Works + +1. **Documents arrive** in Paperless-ngx through scanning, email, or manual upload +2. **PaperCortex polls** the Paperless-ngx API for new and updated documents +3. **Embedding generation** — Ollama creates vector embeddings from OCR text +4. **Classification** — the local LLM analyzes content and assigns types, tags, and metadata +5. **Storage** — embeddings and extracted data are stored in a local SQLite vector database +6. **Query interface** — the MCP Server exposes search, classify, extract, query, and export tools +7. **AI agents connect** via MCP and interact with your documents using natural language + +All processing happens on your hardware. The only network traffic is between PaperCortex and your local Paperless-ngx and Ollama instances. 
+ +--- + +## Quick Start + +### Prerequisites + +- **[Docker](https://docs.docker.com/get-docker/)** and Docker Compose +- **[Paperless-ngx](https://github.com/paperless-ngx/paperless-ngx)** — running instance with API access +- **[Ollama](https://ollama.com)** — running locally or on your network + +**Pull the required Ollama models:** + +```bash +ollama pull qwen2.5:14b # LLM for classification, extraction, queries +ollama pull nomic-embed-text # Embedding model for semantic search +``` + +### Option 1: Docker Compose (Recommended) + +```bash +git clone https://github.com/renefichtmueller/PaperCortex.git +cd PaperCortex +cp .env.example .env +``` + +Edit `.env` with your configuration: + +```env +PAPERLESS_URL=http://your-paperless-instance:8000 +PAPERLESS_TOKEN=your-paperless-api-token +OLLAMA_URL=http://your-ollama-host:11434 +OLLAMA_MODEL=qwen2.5:14b +OLLAMA_EMBEDDING_MODEL=nomic-embed-text +``` + +Start PaperCortex: + +```bash +docker compose up -d +``` + +PaperCortex will begin indexing your existing documents automatically. + +### Option 2: Manual Installation + +```bash +git clone https://github.com/renefichtmueller/PaperCortex.git +cd PaperCortex +npm install +cp .env.example .env +# Edit .env with your settings +npm run build +npm start +``` + +### Option 3: npx (MCP Server only) + +```bash +npx papercortex --paperless-url http://localhost:8000 --paperless-token YOUR_TOKEN +``` + +--- + +## MCP Server Tools + +PaperCortex exposes five MCP tools that AI agents can call: + +### `papercortex_search` — Semantic Document Search + +Find documents by meaning, not just keywords. + +```json +{ + "tool": "papercortex_search", + "arguments": { + "query": "electricity bills from last winter", + "limit": 10, + "date_from": "2024-12-01", + "date_to": "2025-02-28" + } +} +``` + +**Returns:** Ranked list of documents with relevance scores, titles, dates, and Paperless-ngx document IDs. 
+ +### `papercortex_classify` — Auto-Classification + +Analyze a document and assign type, tags, and metadata. + +```json +{ + "tool": "papercortex_classify", + "arguments": { + "document_id": 1234, + "apply": true + } +} +``` + +**Returns:** Suggested document type, tags, correspondent, and confidence scores. Set `apply: true` to write classifications back to Paperless-ngx. + +### `papercortex_receipt` — Receipt Data Extraction + +Extract structured financial data from receipts and invoices. + +```json +{ + "tool": "papercortex_receipt", + "arguments": { + "document_id": 5678 + } +} +``` + +**Returns:** +```json +{ + "vendor": "Amazon EU S.a.r.l.", + "date": "2025-03-15", + "total_gross": 119.99, + "total_net": 100.83, + "tax_rate": 19, + "tax_amount": 19.16, + "currency": "EUR", + "items": [ + { "description": "USB-C Hub", "quantity": 1, "price": 49.99 }, + { "description": "Monitor Arm", "quantity": 1, "price": 70.00 } + ], + "invoice_number": "INV-DE-2025-1234567" +} +``` + +### `papercortex_query` — Natural Language Questions + +Ask questions about your entire document archive. + +```json +{ + "tool": "papercortex_query", + "arguments": { + "question": "How much did I spend on business travel in Q1 2025?" + } +} +``` + +**Returns:** A natural language answer with source document references and a breakdown of the calculation. + +### `papercortex_export` — Accounting Export + +Export extracted receipt data in accounting-ready formats. + +```json +{ + "tool": "papercortex_export", + "arguments": { + "format": "datev", + "date_from": "2025-01-01", + "date_to": "2025-03-31", + "account_plan": "SKR03" + } +} +``` + +**Supported formats:** `datev` (German standard), `csv` (universal), `json` (programmatic). 
+ +--- + +## Claude Code Integration + +### Register as MCP Server + +Add to your `~/.claude.json` (user scope) or to a `.mcp.json` file in the project root (project scope): + +```json +{ + "mcpServers": { + "papercortex": { + "command": "node", + "args": ["./dist/mcp-server/index.js"], + "cwd": "/path/to/PaperCortex", + "env": { + "PAPERLESS_URL": "http://localhost:8000", + "PAPERLESS_TOKEN": "your-token", + "OLLAMA_URL": "http://localhost:11434" + } + } + } +} +``` + +### Example Conversations + +Once connected, you can ask Claude Code about your documents naturally: + +``` +You: Search my documents for anything related to the office lease renewal + +Claude: I found 4 relevant documents: + 1. "Mietvertrag Verlängerung 2025" (Score: 0.94) — Document #1234 + 2. "Office Lease Agreement Amendment" (Score: 0.91) — Document #1235 + 3. "Nebenkostenabrechnung 2024" (Score: 0.78) — Document #1240 + 4. "Facilities Management Invoice" (Score: 0.72) — Document #1251 +``` + +``` +You: Extract the receipt data from document #5678 and export it for DATEV + +Claude: Extracted receipt data: + Vendor: Deutsche Bahn AG + Date: 2025-03-20 + Amount: 89.90 EUR (net: 75.55 EUR, 19% VAT: 14.35 EUR) + Description: ICE Frankfurt-Berlin, 1st class + + DATEV export saved to: exports/datev_2025_03.csv +``` + +``` +You: How much did I spend on cloud services this year? 
+ +Claude: Based on 23 matching documents, your cloud service spending in 2025: + - AWS: 2,340.00 EUR (12 invoices) + - Hetzner: 456.00 EUR (3 invoices) + - Cloudflare: 240.00 EUR (3 invoices) + - Vercel: 180.00 EUR (3 invoices) + - GitHub: 132.00 EUR (2 invoices) + Total: 3,348.00 EUR +``` + +--- + +## Receipt Workflow + +### End-to-End Receipt Processing + +``` +┌──────────┐ ┌─────────────┐ ┌──────────────┐ ┌──────────┐ ┌──────────┐ +│ Scan / │ │ Paperless- │ │ PaperCortex │ │ Match │ │ Export │ +│ Photo / ├───►│ ngx ├───►│ Receipt ├───►│ Bank ├───►│ DATEV / │ +│ Email │ │ OCR+Store │ │ Extraction │ │ CSV │ │ CSV │ +└──────────┘ └─────────────┘ └──────────────┘ └──────────┘ └──────────┘ +``` + +### CLI Commands + +```bash +# Process all unprocessed receipts +npm run receipt:process + +# Extract data from a specific document +npm run receipt:extract -- --document-id 1234 + +# Import bank statement and match transactions +npm run receipt:match -- --bank-csv ./bank_export_2025_q1.csv + +# Export matched data as DATEV +npm run receipt:export -- --format datev --period 2025-Q1 + +# Export as plain CSV +npm run receipt:export -- --format csv --period 2025-03 +``` + +### DATEV Integration Details + +The DATEV export generates a `Buchungsstapel` CSV file following the official DATEV format specification: + +- **Header row** with advisor number, client number, fiscal year start, and export period +- **Transaction rows** with amount, debit/credit account, tax code, date, and booking text +- **Beleglink** — each row includes a reference to the source document in Paperless-ngx +- **Account mapping** — automatic assignment based on vendor and document type (configurable) +- **SKR03 and SKR04** chart of accounts supported + +--- + +## Privacy and Security + +### Why Local AI Matters + +Your documents contain some of the most sensitive data in your life: + +- **Tax returns** with income, deductions, and financial details +- **Contracts** with confidential terms and personal 
information +- **Medical bills** with health information +- **Bank statements** with account numbers and transaction history +- **Personal correspondence** with private content + +Cloud-based document AI services require uploading this data to external servers for processing. Even with encryption and privacy policies, you are trusting a third party with your most sensitive information. + +**PaperCortex takes a fundamentally different approach:** + +- All AI processing runs on **your hardware** via Ollama +- Document content is sent only to **your local Ollama instance** +- Embeddings and extracted data are stored in a **local SQLite database** +- The only network traffic is between PaperCortex, your Paperless-ngx instance, and your Ollama server +- **No telemetry, no analytics, no external API calls** + +**Your documents stay in your network. Period.** + +### Security Best Practices + +- Store the Paperless-ngx API token in environment variables, never in source code +- Run PaperCortex on the same network as Paperless-ngx and Ollama +- Use Docker networks to isolate services +- Regularly update Ollama and PaperCortex for security patches + +--- + +## Configuration Reference + +All configuration is done through environment variables. See `.env.example` for a complete template. 
+ +### Core Settings + +| Variable | Default | Description | +|---|---|---| +| `PAPERLESS_URL` | `http://localhost:8000` | Paperless-ngx instance URL | +| `PAPERLESS_TOKEN` | *(required)* | Paperless-ngx API authentication token | +| `OLLAMA_URL` | `http://localhost:11434` | Ollama API endpoint | +| `OLLAMA_MODEL` | `qwen2.5:14b` | LLM model for classification and extraction | +| `OLLAMA_EMBEDDING_MODEL` | `nomic-embed-text` | Embedding model for semantic search | +| `VECTOR_DB_PATH` | `./data/vectors.db` | Path to the SQLite vector database | + +### Processing Settings + +| Variable | Default | Description | +|---|---|---| +| `POLL_INTERVAL` | `300` | Seconds between polling Paperless-ngx for new documents | +| `BATCH_SIZE` | `10` | Number of documents to process per batch | +| `EMBEDDING_DIMENSIONS` | `768` | Vector dimensions (must match embedding model) | +| `CLASSIFICATION_CONFIDENCE` | `0.7` | Minimum confidence to auto-apply classifications | + +### Export Settings + +| Variable | Default | Description | +|---|---|---| +| `DATEV_ADVISOR_NUMBER` | *(optional)* | Steuerberater number for DATEV export header | +| `DATEV_CLIENT_NUMBER` | *(optional)* | Mandantennummer for DATEV export header | +| `DATEV_FISCAL_YEAR_START` | `01-01` | Fiscal year start (MM-DD) | +| `DEFAULT_ACCOUNT_PLAN` | `SKR03` | Default chart of accounts (`SKR03` or `SKR04`) | +| `EXPORT_DIR` | `./exports` | Directory for generated export files | + +### MCP Server Settings + +| Variable | Default | Description | +|---|---|---| +| `MCP_TRANSPORT` | `stdio` | MCP transport mode (`stdio` or `http`) | +| `MCP_PORT` | `3100` | Port for HTTP transport mode | +| `MCP_AUTH_TOKEN` | *(optional)* | Bearer token for HTTP transport authentication | + +--- + +## Supported Models + +PaperCortex works with any Ollama-compatible model. 
Recommended configurations: + +### For Classification and Extraction + +| Model | VRAM | Speed | Quality | Recommended For | +|---|---|---|---|---| +| `qwen2.5:7b` | 5 GB | Fast | Good | Raspberry Pi, low-end servers | +| `qwen2.5:14b` | 10 GB | Medium | Very Good | Most homelab setups | +| `qwen2.5:32b` | 20 GB | Slow | Excellent | High-accuracy requirements | +| `llama3.1:8b` | 5 GB | Fast | Good | Alternative to Qwen | +| `mistral:7b` | 5 GB | Fast | Good | European language focus | + +### For Embeddings + +| Model | Dimensions | Speed | Quality | +|---|---|---|---| +| `nomic-embed-text` | 768 | Very Fast | Very Good | +| `mxbai-embed-large` | 1024 | Fast | Excellent | +| `all-minilm` | 384 | Fastest | Good | + +--- + +## Project Structure + +``` +PaperCortex/ +├── src/ +│ ├── mcp-server/ # MCP Server for AI agent integration +│ │ ├── index.ts # Server entry point and tool registration +│ │ └── tools/ +│ │ ├── search.ts # Semantic document search tool +│ │ ├── classify.ts # Auto-classification tool +│ │ ├── receipt.ts # Receipt data extraction tool +│ │ ├── query.ts # Natural language query tool +│ │ └── export.ts # DATEV/CSV export tool +│ ├── embeddings/ +│ │ ├── ollama.ts # Ollama API client (embeddings and LLM completions) +│ │ └── store.ts # SQLite vector store (brute-force cosine similarity search) +│ ├── paperless/ +│ │ ├── client.ts # Paperless-ngx REST API client +│ │ └── types.ts # TypeScript type definitions +│ └── receipt/ +│ ├── extractor.ts # Receipt OCR content parsing and extraction +│ ├── matcher.ts # Bank CSV transaction matching engine +│ └── datev.ts # DATEV Buchungsstapel CSV formatter +├── docs/ +│ ├── architecture.md # Detailed architecture documentation +│ ├── setup.md # Step-by-step installation guide +│ └── receipts.md # Receipt workflow documentation +├── docker-compose.yml # Production deployment +├── Dockerfile # Container build +├── .env.example # Configuration template (no secrets!) 
+├── package.json +├── tsconfig.json +└── LICENSE # MIT +``` + +--- + +## Roadmap + +- [x] Core MCP Server with 5 tools +- [x] Paperless-ngx API client +- [x] Ollama embedding generation +- [x] SQLite vector store +- [x] Receipt data extraction +- [x] DATEV export +- [x] Docker deployment +- [ ] Bank CSV matching engine +- [ ] Web dashboard UI +- [ ] Webhook support (instant processing on document arrival) +- [ ] Multi-user support with separate vector stores +- [ ] Additional export formats (SKR04 mapping, FiBu, CSV+) +- [ ] Ollama vision model support for direct image analysis +- [ ] Automated document workflow triggers +- [ ] Plugin system for custom extractors +- [ ] Prometheus metrics endpoint + +--- + +## Contributing + +Contributions are welcome! PaperCortex is early-stage and there are many ways to help: + +### Getting Started + +```bash +git clone https://github.com/renefichtmueller/PaperCortex.git +cd PaperCortex +npm install +cp .env.example .env +# Edit .env with your local Paperless-ngx and Ollama settings +npm run dev +``` + +### How to Contribute + +1. **Fork** the repository +2. **Create** a feature branch (`git checkout -b feat/amazing-feature`) +3. **Write tests** for your changes +4. **Commit** using conventional commits (`feat:`, `fix:`, `docs:`, `refactor:`) +5. **Push** and open a Pull Request + +### Areas Where Help is Needed + +| Area | Description | Difficulty | +|---|---|---| +| **Bank CSV Parsers** | Add parsers for different bank export formats (Sparkasse, ING, N26, Revolut, etc.) 
| Easy | +| **Export Formats** | Additional accounting export formats beyond DATEV | Medium | +| **Web Dashboard** | Build a simple web UI for browsing indexed documents and extracted data | Medium | +| **Multi-language** | Improve extraction accuracy for non-English/German receipts | Medium | +| **Vision Models** | Use Ollama vision models to extract data directly from receipt images | Hard | +| **Webhooks** | React to Paperless-ngx document events in real time | Medium | + +--- + +## Frequently Asked Questions + +**Q: Does PaperCortex modify my documents in Paperless-ngx?** +A: By default, PaperCortex only reads documents. When you use the `papercortex_classify` tool with `apply: true`, it can write tags, document types, and correspondents back to Paperless-ngx. Extraction results and embeddings are stored in PaperCortex's own database. + +**Q: How much disk space does the vector database need?** +A: It depends on the embedding model's dimensions. Assuming vectors are stored as 32-bit floats, the default `nomic-embed-text` (768 dimensions) needs about 3 KB per document, so a collection of 10,000 documents needs roughly 30 MB of vector storage. `all-minilm` (384 dimensions) needs about half that, and `mxbai-embed-large` (1024 dimensions) about 4 KB per document. + +**Q: Can I use OpenAI instead of Ollama?** +A: PaperCortex is designed for local-first operation with Ollama. Support for OpenAI-compatible APIs (including local alternatives like LM Studio, vLLM, or LocalAI) is on the roadmap. + +**Q: What Paperless-ngx version is required?** +A: PaperCortex works with Paperless-ngx 2.0 and later (REST API v3+). + +**Q: Can I run PaperCortex on a Raspberry Pi?** +A: PaperCortex itself is lightweight. The bottleneck is Ollama — you'll need a model that fits in your available RAM. `qwen2.5:7b` works on 8 GB devices. + +**Q: Is DATEV export only for Germany?** +A: The DATEV format is the German standard, but PaperCortex also exports plain CSV that works with any accounting software worldwide. + +--- + +## License + +MIT License — see [LICENSE](LICENSE) for details. + +Free to use, modify, and distribute. Commercial use welcome. 
+ +--- + +## Acknowledgments + +Built on the shoulders of giants: + +- **[Paperless-ngx](https://github.com/paperless-ngx/paperless-ngx)** — The incredible open-source document management system (37k+ stars) +- **[Ollama](https://ollama.com)** — Making local AI accessible to everyone +- **[Model Context Protocol](https://modelcontextprotocol.io)** — The open standard for AI tool integration by Anthropic +- **[better-sqlite3](https://github.com/WiseLibs/better-sqlite3)** — Fast, reliable SQLite bindings for Node.js + +--- + +## Star History + +If PaperCortex is useful to you, please consider giving it a star — it helps others discover the project! + +--- + +

+ Your documents. Your AI. Your hardware.
+ No cloud required. +

diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..81fba7b --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,36 @@ +services: + papercortex: + build: . + container_name: papercortex + restart: unless-stopped + ports: + - "3100:3100" + volumes: + - papercortex-data:/app/data + env_file: + - .env + environment: + - NODE_ENV=production + depends_on: + - ollama + + ollama: + image: ollama/ollama:latest + container_name: papercortex-ollama + restart: unless-stopped + ports: + - "11434:11434" + volumes: + - ollama-models:/root/.ollama + # Uncomment for NVIDIA GPU support: + # deploy: + # resources: + # reservations: + # devices: + # - driver: nvidia + # count: all + # capabilities: [gpu] + +volumes: + papercortex-data: + ollama-models: diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..420cc56 --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,64 @@ +# Architecture + +## Overview + +PaperCortex is structured as three layers: + +1. **MCP Server Layer** -- Exposes tools via the Model Context Protocol for AI agent integration. +2. **Intelligence Layer** -- Embedding generation, classification, receipt extraction, and query answering. +3. **Data Layer** -- Paperless-ngx API client and local SQLite vector store. + +## Components + +### MCP Server (`src/mcp-server/`) + +The entry point for all AI agent interactions. Implements the MCP standard using `@modelcontextprotocol/sdk` and communicates via stdio transport. + +Each tool is implemented as a separate handler module under `src/mcp-server/tools/`. + +### Embeddings (`src/embeddings/`) + +- **ollama.ts** -- Client for the Ollama API. Handles embedding generation and LLM completions. +- **store.ts** -- SQLite-backed vector store using `better-sqlite3`. Stores document embeddings and supports cosine similarity search. + +Current implementation uses brute-force search, which is performant up to ~100k documents. 
For larger archives, consider migrating to `sqlite-vss` or a dedicated vector database. + +### Paperless Integration (`src/paperless/`) + +- **client.ts** -- REST API client for Paperless-ngx. Supports document CRUD, search, tags, correspondents, and document types. +- **types.ts** -- TypeScript type definitions matching the Paperless-ngx API v3+ schema. + +### Receipt Processing (`src/receipt/`) + +- **extractor.ts** -- Uses LLM to extract structured data from receipt OCR text. +- **matcher.ts** -- Matches extracted receipts against bank CSV transaction exports. +- **datev.ts** -- Generates DATEV Buchungsstapel format CSV for German accounting software. + +## Data Flow + +``` +Paperless-ngx --(REST API)--> PaperCortex --(Ollama API)--> Ollama + | + v + SQLite Vector DB + | + v + MCP Server (stdio) + | + v + Claude Code / AI Agents +``` + +## Security Model + +- All data stays local -- no external API calls except to Paperless-ngx and Ollama (both self-hosted). +- API tokens are read from environment variables, never hardcoded. +- The SQLite database is stored on the local filesystem with configurable path. +- MCP Server communicates via stdio (no network port required for MCP). + +## Future Considerations + +- **Webhook support** -- Listen for Paperless-ngx webhooks to auto-process new documents. +- **Plugin system** -- Allow custom extractors and exporters. +- **Web dashboard** -- Optional UI for monitoring and manual review. +- **Multi-user** -- Support multiple Paperless-ngx instances and user isolation. diff --git a/docs/receipts.md b/docs/receipts.md new file mode 100644 index 0000000..2bf2bcc --- /dev/null +++ b/docs/receipts.md @@ -0,0 +1,101 @@ +# Receipt Workflow + +## Overview + +PaperCortex provides a complete receipt-to-accounting pipeline: + +1. **Scan** -- Upload receipts to Paperless-ngx (scan, email, photo) +2. **Extract** -- AI extracts structured data (vendor, date, amounts, line items) +3. **Match** -- Reconcile against bank CSV exports +4. 
**Export** -- Generate DATEV-compatible CSV for accounting software + +## Receipt Extraction + +### Via MCP Server (Claude Code) + +``` +Extract receipt data from document #1234 +``` + +### Via CLI + +```bash +npm run receipt:extract -- --document-id 1234 +``` + +### Extracted Fields + +| Field | Description | Example | +|---|---|---| +| vendor | Company name | "IKEA Deutschland GmbH" | +| vendorAddress | Full address | "Am Wanderweg 1, 65719 Hofheim" | +| vendorTaxId | Tax ID / VAT number | "DE 129 341 800" | +| date | Receipt date | "2024-03-15" | +| currency | ISO 4217 code | "EUR" | +| subtotal | Before tax | 84.03 | +| taxRate | Tax percentage | 19 | +| taxAmount | Tax amount | 15.97 | +| totalAmount | Total with tax | 100.00 | +| paymentMethod | How it was paid | "card" | +| lineItems | Individual items | Array of items | +| category | Expense category | "office_supplies" | + +## Bank Statement Matching + +Match receipts against bank CSV exports to verify which receipts correspond to which bank transactions. + +### Supported Bank Formats + +- Sparkasse (semicolon-separated, German format) +- ING (semicolon-separated) +- DKB (semicolon-separated) +- Volksbank (semicolon-separated) +- Generic CSV + +### Matching Algorithm + +1. **Amount match** -- Exact or close amount (within 1.00 tolerance) +2. **Date proximity** -- Same day, within 3 days, or within 7 days +3. **Vendor name** -- Partial match in transaction description + +Results include a confidence score (0.0 - 1.0) and match reasons. 
+ +## DATEV Export + +### Format + +PaperCortex generates DATEV Buchungsstapel (posting batch) format CSV, compatible with: + +- DATEV Unternehmen Online +- lexoffice +- sevDesk +- FastBill +- Any DATEV-import-capable software + +### Account Mapping (SKR03) + +| Category | Account | Description | +|---|---|---| +| office_supplies | 4930 | Buerokosten | +| travel | 4660 | Reisekosten | +| food | 4650 | Bewirtungskosten | +| telephone | 4920 | Telefon | +| postage | 4910 | Porto | +| rent | 4210 | Miete | +| advertising | 4600 | Werbekosten | +| software | 4964 | Software | +| consulting | 4950 | Rechts- und Beratungskosten | +| default | 4900 | Sonstige Aufwendungen | + +### Export via CLI + +```bash +# Export all receipts from March 2024 as DATEV CSV +npm run receipt:export -- --format datev --year 2024 --month 03 +``` + +### Export via MCP Server + +``` +Export documents #100, #101, #102 as DATEV CSV +``` diff --git a/docs/setup.md b/docs/setup.md new file mode 100644 index 0000000..c348cbd --- /dev/null +++ b/docs/setup.md @@ -0,0 +1,107 @@ +# Setup Guide + +## Prerequisites + +- **Node.js** 20+ (or Docker) +- **Paperless-ngx** instance with API access +- **Ollama** with required models + +## Step 1: Install Ollama Models + +```bash +# Required: LLM for classification and extraction +ollama pull qwen2.5:14b + +# Required: Embedding model for semantic search +ollama pull nomic-embed-text +``` + +Verify Ollama is running: +```bash +curl http://localhost:11434/api/tags +``` + +## Step 2: Get Paperless-ngx API Token + +1. Open your Paperless-ngx web UI +2. Go to Settings > API +3. Generate a new API token +4. 
Copy the token for the next step + +## Step 3: Configure PaperCortex + +```bash +git clone https://github.com/YOUR_USERNAME/PaperCortex.git +cd PaperCortex +cp .env.example .env +``` + +Edit `.env` with your values: +```env +PAPERLESS_URL=http://localhost:8000 +PAPERLESS_TOKEN=your-paperless-api-token-here +OLLAMA_URL=http://localhost:11434 +``` + +## Step 4: Run + +### Option A: Docker (Recommended) + +```bash +docker compose up -d +``` + +### Option B: Manual + +```bash +npm install +npm run build +npm start +``` + +### Option C: Development + +```bash +npm install +npm run dev +``` + +## Step 5: Register MCP Server + +Add to your Claude Code configuration (`~/.claude.json`): + +```json +{ + "mcpServers": { + "papercortex": { + "command": "node", + "args": ["/absolute/path/to/PaperCortex/dist/mcp-server/index.js"], + "env": { + "PAPERLESS_URL": "http://localhost:8000", + "PAPERLESS_TOKEN": "your-token", + "OLLAMA_URL": "http://localhost:11434" + } + } + } +} +``` + +## Step 6: Populate Vector Store + +On first run, you need to embed your existing documents. This will be automated in a future release. For now, the vector store is populated as documents are queried or classified. 
+ +## Troubleshooting + +### "Connection refused" to Paperless-ngx +- Verify the URL in `.env` is reachable +- Check that the API token is valid +- Ensure Paperless-ngx is running + +### "Connection refused" to Ollama +- Run `ollama serve` if not already running +- Check the port (default: 11434) +- Verify models are pulled: `ollama list` + +### Slow first query +- The first embedding generation may take longer as Ollama loads the model into memory +- Subsequent queries will be faster once the model is loaded diff --git a/package.json b/package.json new file mode 100644 index 0000000..7a38ac0 --- /dev/null +++ b/package.json @@ -0,0 +1,57 @@ +{ + "name": "papercortex", + "version": "0.1.0", + "description": "Self-hosted AI intelligence layer for Paperless-ngx with semantic search, receipt extraction, and MCP Server integration", + "main": "dist/mcp-server/index.js", + "type": "module", + "scripts": { + "build": "tsc", + "start": "node dist/mcp-server/index.js", + "dev": "tsx watch src/mcp-server/index.ts", + "lint": "eslint src/", + "test": "vitest", + "test:coverage": "vitest --coverage", + "receipt:extract": "tsx src/receipt/extractor.ts", + "receipt:match": "tsx src/receipt/matcher.ts", + "receipt:export": "tsx src/receipt/datev.ts" + }, + "keywords": [ + "paperless-ngx", + "ollama", + "mcp", + "mcp-server", + "semantic-search", + "document-ai", + "receipt-extraction", + "datev", + "self-hosted", + "local-ai", + "embeddings", + "vector-search" + ], + "author": "", + "license": "MIT", + "repository": { + "type": "git", + "url": "" + }, + "engines": { + "node": ">=20.0.0" + }, + "dependencies": { + "@modelcontextprotocol/sdk": "^1.12.0", + "better-sqlite3": "^11.8.0", + "csv-parse": "^5.6.0", + "csv-stringify": "^6.5.0", + "dotenv": "^16.4.0", + "zod": "^3.24.0" + }, + "devDependencies": { + "@types/better-sqlite3": "^7.6.12", + "@types/node": "^22.10.0", + "eslint": "^9.17.0", + "tsx": "^4.19.0", + "typescript": "^5.7.0", + "vitest": "^3.0.0" + } +} diff --git 
a/src/embeddings/ollama.ts b/src/embeddings/ollama.ts new file mode 100644 index 0000000..b2ad772 --- /dev/null +++ b/src/embeddings/ollama.ts @@ -0,0 +1,148 @@ +/** + * Ollama embedding and LLM integration. + * + * Generates vector embeddings and LLM completions using a local Ollama instance. + * Client methods perform HTTP requests; each call returns a new result object and mutates no input. + * + * @example + * ```ts + * const ollama = createOllamaClient({ baseUrl: "http://localhost:11434", model: "qwen2.5:14b", embeddingModel: "nomic-embed-text" }); + * const embedding = await ollama.embed("Office rent invoice March 2024"); + * const answer = await ollama.complete("Classify this document: ..."); + * ``` + */ + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +export interface OllamaConfig { + readonly baseUrl: string; + readonly model: string; + readonly embeddingModel: string; + readonly timeout?: number; // Per-request timeout in milliseconds (default: 120000) +} + +export interface EmbeddingResult { + readonly vector: readonly number[]; + readonly model: string; + readonly dimensions: number; +} + +export interface CompletionResult { + readonly text: string; + readonly model: string; + readonly totalDuration: number; +} + +export interface OllamaClient { + /** Generate an embedding vector for the given text. */ + embed(text: string): Promise<EmbeddingResult>; + + /** Generate a chat/instruct completion. */ + complete(prompt: string, systemPrompt?: string): Promise<CompletionResult>; + + /** Check if the Ollama server is reachable and list its models; resolves to { ok: false } on any failure or timeout. */ + healthCheck(): Promise<{ ok: boolean; models: readonly string[] }>; +} + +// --------------------------------------------------------------------------- +// Implementation +// --------------------------------------------------------------------------- + +/** + * Create an Ollama client for embeddings and completions. 
+ */ +export function createOllamaClient(config: OllamaConfig): OllamaClient { + const { baseUrl, model, embeddingModel, timeout = 120_000 } = config; + + async function post<T>(path: string, body: unknown): Promise<T> { + const url = `${baseUrl.replace(/\/+$/, "")}${path}`; + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), timeout); + + try { + const response = await fetch(url, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(body), + signal: controller.signal, + }); + + if (!response.ok) { + const text = await response.text().catch(() => ""); + throw new Error(`Ollama API error: ${response.status} -- ${text}`); + } + + return (await response.json()) as T; + } finally { + clearTimeout(timer); + } + } + + return { + async embed(text) { + // TODO: implement chunking for texts exceeding model context window + // TODO: add retry logic with exponential backoff + + interface OllamaEmbedResponse { + embedding: number[]; + } + + const result = await post<OllamaEmbedResponse>("/api/embeddings", { + model: embeddingModel, + prompt: text, + }); + + return { + vector: result.embedding, + model: embeddingModel, + dimensions: result.embedding.length, + }; + }, + + async complete(prompt, systemPrompt) { + // TODO: implement streaming support for long completions + // TODO: add structured output parsing (JSON mode) + + interface OllamaGenerateResponse { + response: string; + model: string; + total_duration: number; + } + + const result = await post<OllamaGenerateResponse>("/api/generate", { + model, + prompt, + system: systemPrompt ?? 
"", + stream: false, + }); + + return { + text: result.response, + model: result.model, + totalDuration: result.total_duration, + }; + }, + + async healthCheck() { + try { + const url = `${baseUrl.replace(/\/+$/, "")}/api/tags`; + const response = await fetch(url, { signal: AbortSignal.timeout(timeout) }); // bound the probe so an unresponsive server cannot hang the caller + if (!response.ok) return { ok: false, models: [] }; + + interface OllamaTagsResponse { + models: Array<{ name: string }>; + } + + const data = (await response.json()) as OllamaTagsResponse; + return { + ok: true, + models: data.models.map((m) => m.name), + }; + } catch { + return { ok: false, models: [] }; + } + }, + }; +} diff --git a/src/embeddings/store.ts b/src/embeddings/store.ts new file mode 100644 index 0000000..dc3a012 --- /dev/null +++ b/src/embeddings/store.ts @@ -0,0 +1,231 @@ +/** + * Local SQLite-backed vector store for document embeddings. + * + * Stores embedding vectors alongside document metadata in a SQLite database + * using better-sqlite3. Supports cosine similarity search for semantic + * document retrieval. + * + * @example + * ```ts + * const store = createVectorStore({ dbPath: "./data/vectors.db" }); + * store.upsert({ documentId: 42, vector: [...], content: "..." 
}); + * const results = store.search(queryVector, { limit: 10 }); + * ``` + */ + +import Database from "better-sqlite3"; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +export interface VectorStoreConfig { + readonly dbPath: string; +} + +export interface DocumentEmbedding { + readonly documentId: number; + readonly vector: readonly number[]; + readonly content: string; + readonly title: string; + readonly tags: readonly string[]; + readonly createdAt: string; +} + +export interface SearchResult { + readonly documentId: number; + readonly title: string; + readonly content: string; + readonly score: number; + readonly tags: readonly string[]; +} + +export interface SearchOptions { + readonly limit?: number; + readonly minScore?: number; + readonly tagFilter?: readonly string[]; +} + +export interface VectorStore { + /** Insert or update a document embedding. */ + upsert(embedding: DocumentEmbedding): void; + + /** Search for similar documents using cosine similarity. */ + search(queryVector: readonly number[], options?: SearchOptions): readonly SearchResult[]; + + /** Remove an embedding by document ID. */ + remove(documentId: number): void; + + /** Get the total count of stored embeddings. */ + count(): number; + + /** Check if a document has been embedded. */ + has(documentId: number): boolean; + + /** Close the database connection. */ + close(): void; +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/** + * Compute cosine similarity between two vectors. + * Returns a value between -1 and 1 (1 = identical direction). 
+ */ +function cosineSimilarity(a: readonly number[], b: readonly number[]): number { + if (a.length !== b.length) { + throw new Error( + `Vector dimension mismatch: ${a.length} vs ${b.length}`, + ); + } + + let dotProduct = 0; + let normA = 0; + let normB = 0; + + for (let i = 0; i < a.length; i++) { + dotProduct += a[i] * b[i]; + normA += a[i] * a[i]; + normB += b[i] * b[i]; + } + + const denominator = Math.sqrt(normA) * Math.sqrt(normB); + if (denominator === 0) return 0; + + return dotProduct / denominator; +} + +// --------------------------------------------------------------------------- +// Implementation +// --------------------------------------------------------------------------- + +/** + * Create a local vector store backed by SQLite. + * + * TODO: Consider migrating to sqlite-vss or DuckDB for ANN search at scale. + * The current brute-force approach works well for <100k documents. + */ +export function createVectorStore(config: VectorStoreConfig): VectorStore { + const db = new Database(config.dbPath); + + // Enable WAL mode for better concurrent read performance + db.pragma("journal_mode = WAL"); + + // Create tables if they don't exist + db.exec(` + CREATE TABLE IF NOT EXISTS embeddings ( + document_id INTEGER PRIMARY KEY, + vector BLOB NOT NULL, + content TEXT NOT NULL, + title TEXT NOT NULL, + tags TEXT NOT NULL DEFAULT '[]', + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL DEFAULT (datetime('now')) + ); + + CREATE INDEX IF NOT EXISTS idx_embeddings_created + ON embeddings (created_at); + `); + + // Prepared statements for performance + const upsertStmt = db.prepare(` + INSERT INTO embeddings (document_id, vector, content, title, tags, created_at, updated_at) + VALUES (?, ?, ?, ?, ?, ?, datetime('now')) + ON CONFLICT(document_id) DO UPDATE SET + vector = excluded.vector, + content = excluded.content, + title = excluded.title, + tags = excluded.tags, + updated_at = datetime('now') + `); + + const getAllStmt = db.prepare(` + SELECT 
document_id, vector, content, title, tags FROM embeddings + `); + + const removeStmt = db.prepare(` + DELETE FROM embeddings WHERE document_id = ? + `); + + const countStmt = db.prepare(` + SELECT COUNT(*) as count FROM embeddings + `); + + const hasStmt = db.prepare(` + SELECT 1 FROM embeddings WHERE document_id = ? LIMIT 1 + `); + + return { + upsert(embedding) { + const vectorBlob = Buffer.from(new Float32Array(embedding.vector).buffer); + upsertStmt.run( + embedding.documentId, + vectorBlob, + embedding.content, + embedding.title, + JSON.stringify(embedding.tags), + embedding.createdAt, + ); + }, + + search(queryVector, options = {}) { + const { limit = 10, minScore = 0.5, tagFilter } = options; + + // TODO: Implement ANN (approximate nearest neighbor) for large datasets + // Current approach: brute-force scan -- fine for <100k documents + + interface EmbeddingRow { + document_id: number; + vector: Buffer; + content: string; + title: string; + tags: string; + } + + const rows = getAllStmt.all() as EmbeddingRow[]; + + const scored = rows + .map((row) => { + const storedVector = Array.from(new Float32Array(row.vector.buffer.slice(row.vector.byteOffset, row.vector.byteOffset + row.vector.byteLength))); // copy exactly this row's bytes: a Buffer may be a view into a larger ArrayBuffer + const tags: string[] = JSON.parse(row.tags); + const score = cosineSimilarity(queryVector, storedVector); + + return { + documentId: row.document_id, + title: row.title, + content: row.content, + score, + tags, + }; + }) + .filter((result) => result.score >= minScore) + .filter((result) => { + if (!tagFilter || tagFilter.length === 0) return true; + return tagFilter.some((tag) => result.tags.includes(tag)); + }) + .sort((a, b) => b.score - a.score) + .slice(0, limit); + + return scored; + }, + + remove(documentId) { + removeStmt.run(documentId); + }, + + count() { + const row = countStmt.get() as { count: number }; + return row.count; + }, + + has(documentId) { + return 
hasStmt.get(documentId) !== undefined; + }, + + close() { + db.close(); + }, + }; +} diff --git a/src/mcp-server/index.ts b/src/mcp-server/index.ts new file mode 100644 
index 0000000..fcc9b6d --- /dev/null +++ b/src/mcp-server/index.ts @@ -0,0 +1,249 @@ +/** + * PaperCortex MCP Server entry point. + * + * Exposes document intelligence tools via the Model Context Protocol (MCP) + * for integration with Claude Code and other AI agents. + * + * @see https://modelcontextprotocol.io + */ + +import { Server } from "@modelcontextprotocol/sdk/server/index.js"; +import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"; +import { + CallToolRequestSchema, + ListToolsRequestSchema, +} from "@modelcontextprotocol/sdk/types.js"; +import { config } from "dotenv"; + +import { createOllamaClient } from "../embeddings/ollama.js"; +import { createVectorStore } from "../embeddings/store.js"; +import { createPaperlessClient } from "../paperless/client.js"; +import { handleClassify } from "./tools/classify.js"; +import { handleExport } from "./tools/export.js"; +import { handleQuery } from "./tools/query.js"; +import { handleReceipt } from "./tools/receipt.js"; +import { handleSearch } from "./tools/search.js"; + +// --------------------------------------------------------------------------- +// Configuration +// --------------------------------------------------------------------------- + +config(); // Load .env into process.env before any variable is read below + +function requireEnv(key: string): string { + const value = process.env[key]; + if (!value) { + throw new Error(`Missing required environment variable: ${key}`); + } + return value; +} + +// --------------------------------------------------------------------------- +// Service initialization (optional settings fall back to defaults when unset OR empty, matching requireEnv) +// --------------------------------------------------------------------------- + +const paperless = createPaperlessClient({ + baseUrl: requireEnv("PAPERLESS_URL"), + token: requireEnv("PAPERLESS_TOKEN"), +}); + +const ollama = createOllamaClient({ + baseUrl: process.env["OLLAMA_URL"] || 
"http://localhost:11434", + model: process.env["OLLAMA_MODEL"] || "qwen2.5:14b", + embeddingModel: process.env["OLLAMA_EMBEDDING_MODEL"] || 
"nomic-embed-text", +}); + +const vectorStore = createVectorStore({ + dbPath: process.env["VECTOR_DB_PATH"] || "./data/vectors.db", +}); + +// --------------------------------------------------------------------------- +// Shared context for tool handlers +// --------------------------------------------------------------------------- + +export interface ToolContext { + readonly paperless: typeof paperless; + readonly ollama: typeof ollama; + readonly vectorStore: typeof vectorStore; +} + +const ctx: ToolContext = { paperless, ollama, vectorStore }; + +// --------------------------------------------------------------------------- +// MCP Server setup +// --------------------------------------------------------------------------- + +const server = new Server( + { + name: "papercortex", + version: "0.1.0", + }, + { + capabilities: { + tools: {}, + }, + }, +); + +/** + * List all available PaperCortex tools. + */ +server.setRequestHandler(ListToolsRequestSchema, async () => ({ + tools: [ + { + name: "papercortex_search", + description: + "Semantic search across all documents in Paperless-ngx. " + + "Finds documents by meaning, not just keywords.", + inputSchema: { + type: "object" as const, + properties: { + query: { + type: "string", + description: "Natural language search query", + }, + limit: { + type: "number", + description: "Maximum number of results (default: 10)", + }, + tags: { + type: "array", + items: { type: "string" }, + description: "Filter by tag names", + }, + }, + required: ["query"], + }, + }, + { + name: "papercortex_classify", + description: + "Auto-classify a document using local AI. 
" + + "Suggests tags, document type, and correspondent.", + inputSchema: { + type: "object" as const, + properties: { + documentId: { + type: "number", + description: "Paperless-ngx document ID", + }, + applyTags: { + type: "boolean", + description: "Automatically apply suggested tags (default: false)", + }, + }, + required: ["documentId"], + }, + }, + { + name: "papercortex_receipt", + description: + "Extract structured data from a receipt document: " + + "vendor, date, amounts, tax, line items.", + inputSchema: { + type: "object" as const, + properties: { + documentId: { + type: "number", + description: "Paperless-ngx document ID of the receipt", + }, + }, + required: ["documentId"], + }, + }, + { + name: "papercortex_query", + description: + "Ask natural language questions about your documents. " + + 'Example: "How much did I spend on office supplies in Q1 2024?"', + inputSchema: { + type: "object" as const, + properties: { + question: { + type: "string", + description: "Natural language question about your documents", + }, + maxDocuments: { + type: "number", + description: + "Maximum documents to include in context (default: 5)", + }, + }, + required: ["question"], + }, + }, + { + name: "papercortex_export", + description: + "Export receipt data as DATEV-compatible CSV for German accounting, " + + "or as generic CSV.", + inputSchema: { + type: "object" as const, + properties: { + documentIds: { + type: "array", + items: { type: "number" }, + description: "Document IDs to export", + }, + format: { + type: "string", + enum: ["datev", "csv"], + description: "Export format (default: datev)", + }, + }, + required: ["documentIds"], + }, + }, + ], +})); + +/** + * Route tool calls to their respective handlers. 
+ */ +server.setRequestHandler(CallToolRequestSchema, async (request) => { + const { name, arguments: args } = request.params; + + try { + switch (name) { + case "papercortex_search": + return await handleSearch(ctx, args as Record<string, unknown>); + case "papercortex_classify": + return await handleClassify(ctx, args as Record<string, unknown>); + case "papercortex_receipt": + return await handleReceipt(ctx, args as Record<string, unknown>); + case "papercortex_query": + return await handleQuery(ctx, args as Record<string, unknown>); + case "papercortex_export": + return await handleExport(ctx, args as Record<string, unknown>); + default: + return { + content: [ + { type: "text" as const, text: `Unknown tool: ${name}` }, + ], + isError: true, + }; + } + } catch (error) { + const message = + error instanceof Error ? error.message : "Unknown error occurred"; + return { + content: [{ type: "text" as const, text: `Error: ${message}` }], + isError: true, + }; + } +}); + +// --------------------------------------------------------------------------- +// Start server +// --------------------------------------------------------------------------- + +async function main(): Promise<void> { + const transport = new StdioServerTransport(); + await server.connect(transport); + console.error("PaperCortex MCP Server running on stdio"); // stderr, not stdout: this server is connected over StdioServerTransport +} + +main().catch((error) => { + console.error("Fatal error starting PaperCortex:", error); + process.exit(1); +}); diff --git a/src/mcp-server/tools/classify.ts b/src/mcp-server/tools/classify.ts new file mode 100644 index 0000000..cf45bf3 --- /dev/null +++ b/src/mcp-server/tools/classify.ts @@ -0,0 +1,117 @@ +/** + * Auto-classification tool for the PaperCortex MCP Server. + * + * Uses local LLM to analyze document content and suggest appropriate + * tags, document types, and correspondents. 
+ */ + +import type { ToolContext } from "../index.js"; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +interface ClassifyArgs { + readonly documentId: number; + readonly applyTags?: boolean; +} + +interface ClassificationResult { + readonly suggestedTags: readonly string[]; + readonly suggestedType: string | null; + readonly suggestedCorrespondent: string | null; + readonly summary: string; + readonly language: string; + readonly confidence: number; +} + +// --------------------------------------------------------------------------- +// Prompts +// --------------------------------------------------------------------------- + +const CLASSIFY_SYSTEM_PROMPT = `You are a document classification assistant. Analyze the document content and provide structured classification. + +Respond with valid JSON only: +{ + "suggestedTags": ["tag1", "tag2"], + "suggestedType": "invoice|contract|receipt|letter|report|tax_document|bank_statement|insurance|warranty|manual|other", + "suggestedCorrespondent": "Company or person name", + "summary": "One sentence summary", + "language": "ISO 639-1 code", + "confidence": 0.0 to 1.0 +}`; + +// --------------------------------------------------------------------------- +// Handler +// --------------------------------------------------------------------------- + +/** + * Handle a `papercortex_classify` tool call. + * + * 1. Fetch document content from Paperless-ngx. + * 2. Send content to Ollama for classification. + * 3. Optionally apply suggested tags back to Paperless-ngx. 
+ * + * TODO: Match suggested tags against existing Paperless-ngx tags + * TODO: Create new tags automatically when confidence is high + * TODO: Learn from user corrections to improve classification + */ +export async function handleClassify( + ctx: ToolContext, + args: Record<string, unknown>, +): Promise<{ content: Array<{ type: "text"; text: string }> }> { + const { documentId, applyTags = false } = args as unknown as ClassifyArgs; + + // Fetch document from Paperless-ngx + const document = await ctx.paperless.getDocument(documentId); + + if (!document.content || document.content.trim().length === 0) { + return { + content: [ + { + type: "text", + text: `Document #${documentId} has no text content. OCR may not have completed.`, + }, + ], + }; + } + + // Classify using Ollama (only the first 4000 characters of the content are sent) + const prompt = `Classify this document:\n\nTitle: ${document.title}\n\nContent:\n${document.content.slice(0, 4000)}`; + const completion = await ctx.ollama.complete(prompt, CLASSIFY_SYSTEM_PROMPT); + + let classification: ClassificationResult; + try { + classification = JSON.parse(completion.text.replace(/^\s*```(?:json)?\s*|\s*```\s*$/g, "")) as ClassificationResult; // tolerate a Markdown code fence around the JSON + } catch { + return { + content: [ + { + type: "text", + text: `Classification failed: LLM did not return valid JSON.\nRaw response: ${completion.text.slice(0, 500)}`, + }, + ], + }; + } + + // Optionally apply tags (the parsed JSON is unvalidated, so suggestedTags may be absent at runtime) + let appliedNote = ""; + if (applyTags && (classification.suggestedTags?.length ?? 0) > 0) { + // TODO: Look up tag IDs from Paperless-ngx, create missing tags + appliedNote = + "\n\nNote: Tag application is not yet implemented. " + + "Tags need to be matched against existing Paperless-ngx tags."; + } + + const output = + `Classification for Document #${documentId} "${document.title}":\n\n` + + `Type: ${classification.suggestedType ?? 
"unknown"}\n` + + `Correspondent: ${classification.suggestedCorrespondent ?? 
"unknown"}\n` + + `Tags: ${(classification.suggestedTags ?? []).join(", ") || "none"}\n` + + `Language: ${classification.language}\n` + + `Summary: ${classification.summary}\n` + + `Confidence: ${(classification.confidence * 100).toFixed(0)}%` + + appliedNote; + + return { content: [{ type: "text", text: output }] }; +} diff --git a/src/mcp-server/tools/export.ts b/src/mcp-server/tools/export.ts new file mode 100644 index 0000000..69019fa --- /dev/null +++ b/src/mcp-server/tools/export.ts @@ -0,0 +1,116 @@ +/** + * DATEV/CSV export tool for the PaperCortex MCP Server. + * + * Exports receipt data in accounting-compatible formats. + */ + +import { createReceiptExtractor } from "../../receipt/extractor.js"; +import { createDatevExporter } from "../../receipt/datev.js"; +import type { ToolContext } from "../index.js"; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +interface ExportArgs { + readonly documentIds: readonly number[]; + readonly format?: "datev" | "csv"; +} + +// --------------------------------------------------------------------------- +// Handler +// --------------------------------------------------------------------------- + +/** + * Handle a `papercortex_export` tool call. + * + * 1. Extract receipt data from all specified documents. + * 2. Format as DATEV or generic CSV. + * 3. Return the CSV content. 
+ * + * TODO: Add file output option (save to disk) + * TODO: Add date range filtering + * TODO: Add DATEV header metadata (consultant/client numbers from config) + */ +export async function handleExport( + ctx: ToolContext, + args: Record<string, unknown>, +): Promise<{ content: Array<{ type: "text"; text: string }> }> { + const { documentIds, format = "datev" } = args as unknown as ExportArgs; + + if (!documentIds || documentIds.length === 0) { + return { + content: [ + { + type: "text", + text: "Error: at least one document ID is required for export.", + }, + ], + }; + } + + // Extract receipt data from all documents + const extractor = createReceiptExtractor({ + ollama: ctx.ollama, + paperless: ctx.paperless, + }); + + const receipts = await extractor.extractBatch(documentIds); + + if (format === "datev") { + // TODO: Read consultant/client numbers from configuration + const exporter = createDatevExporter({ + consultantNumber: 0, + clientNumber: 0, + }); + + const receiptsForExport = receipts.map((r) => ({ + documentId: r.documentId, + vendor: r.vendor, + date: r.date, + totalAmount: r.totalAmount, + taxRate: r.taxRate, + category: r.category, + })); + + const csv = exporter.generateCsv(receiptsForExport); + + return { + content: [ + { + type: "text", + text: + `DATEV export for ${receipts.length} receipt(s):\n\n` + + "```csv\n" + + csv + + "\n```\n\n" + + "Copy this CSV content into a file and import into your " + + "DATEV-compatible accounting software.", + }, + ], + }; + } + + // Generic CSV format (semicolon-separated; NOTE(review): field values are not quoted or escaped, so a ";" inside a vendor name would shift columns) + const header = "Document ID;Vendor;Date;Amount;Tax Rate;Tax Amount;Currency;Category"; + const rows = receipts.map( + (r) => + `${r.documentId};${r.vendor};${r.date};${r.totalAmount.toFixed(2)};` + + `${r.taxRate ?? ""};${r.taxAmount?.toFixed(2) ?? ""};${r.currency};${r.category ?? 
""}`, + ); + + const csv = [header, ...rows].join("\n"); + + return { + content: [ + { + type: "text", + text: + `CSV export for ${receipts.length} receipt(s):\n\n` + + "```csv\n" + + csv + + "\n```", + }, + ], + }; +} diff --git a/src/mcp-server/tools/query.ts b/src/mcp-server/tools/query.ts new file mode 100644 index 0000000..eeb2420 --- /dev/null +++ b/src/mcp-server/tools/query.ts @@ -0,0 +1,110 @@ +/** + * Natural language query tool for the PaperCortex MCP Server. + * + * Answers questions about documents using RAG (Retrieval-Augmented Generation): + * retrieves relevant documents via semantic search, then generates an answer + * using the local LLM with document context. + */ + +import type { ToolContext } from "../index.js"; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +interface QueryArgs { + readonly question: string; + readonly maxDocuments?: number; +} + +// --------------------------------------------------------------------------- +// Prompts +// --------------------------------------------------------------------------- + +const QUERY_SYSTEM_PROMPT = `You are a document analysis assistant. Answer the user's question based ONLY on the provided document excerpts. If the documents don't contain enough information to answer, say so clearly. + +Be precise with numbers, dates, and amounts. Cite document IDs when referencing specific documents.`; + +// --------------------------------------------------------------------------- +// Handler +// --------------------------------------------------------------------------- + +/** + * Handle a `papercortex_query` tool call. + * + * Uses RAG (Retrieval-Augmented Generation): + * 1. Embed the question and retrieve relevant documents. + * 2. Build a context from retrieved documents. + * 3. Generate an answer using the local LLM. 
/**
 * Handle a `papercortex_query` tool call.
 *
 * Uses RAG (Retrieval-Augmented Generation):
 * 1. Embed the question and retrieve relevant documents (min score 0.3).
 * 2. Build a context from the retrieved documents (first 2000 characters each).
 * 3. Generate an answer using the local LLM and append a source list.
 *
 * TODO: Add conversation history for follow-up questions
 * TODO: Add source citation with page numbers
 * TODO: Implement query decomposition for complex questions
 *
 * @param ctx - Tool context providing `ollama` (embed/complete) and `vectorStore` (search).
 * @param args - Raw tool arguments: `question` (required string) and
 *   `maxDocuments` (optional positive integer, default 5).
 * @returns A single text content item: the answer with sources, a "nothing
 *   found" notice, or an `Error: …` message for an empty or non-string question.
 */
export async function handleQuery(
  ctx: ToolContext,
  args: Record<string, unknown>,
): Promise<{ content: Array<{ type: "text"; text: string }> }> {
  // Arguments arrive as untyped JSON; a non-string question is treated as
  // missing rather than crashing on `.trim()`.
  const question = typeof args.question === "string" ? args.question : "";

  if (question.trim().length === 0) {
    return {
      content: [{ type: "text", text: "Error: question cannot be empty." }],
    };
  }

  // Fall back to the default when the requested limit is not a positive integer.
  const requestedLimit = args.maxDocuments;
  const maxDocuments =
    typeof requestedLimit === "number" &&
    Number.isInteger(requestedLimit) &&
    requestedLimit > 0
      ? requestedLimit
      : 5;

  // Step 1: Retrieve relevant documents
  const queryEmbedding = await ctx.ollama.embed(question);
  const relevantDocs = ctx.vectorStore.search(queryEmbedding.vector, {
    limit: maxDocuments,
    minScore: 0.3,
  });

  if (relevantDocs.length === 0) {
    return {
      content: [
        {
          type: "text",
          text:
            `I couldn't find any relevant documents to answer: "${question}"\n\n` +
            "The vector store may need to be populated first, or your documents " +
            "may not contain information related to this question.",
        },
      ],
    };
  }

  // Step 2: Build context from retrieved documents
  const context = relevantDocs
    .map(
      (doc) =>
        `--- Document #${doc.documentId}: ${doc.title} (relevance: ${doc.score.toFixed(2)}) ---\n` +
        doc.content.slice(0, 2000),
    )
    .join("\n\n");

  // Step 3: Generate answer with context
  const prompt =
    `Based on the following documents, answer this question: "${question}"\n\n` +
    `Documents:\n${context}`;

  const completion = await ctx.ollama.complete(prompt, QUERY_SYSTEM_PROMPT);

  const sourcesNote = relevantDocs
    .map(
      (doc) =>
        ` - Document #${doc.documentId}: ${doc.title} (score: ${doc.score.toFixed(2)})`,
    )
    .join("\n");

  return {
    content: [
      {
        type: "text",
        text:
          `${completion.text}\n\n` +
          `---\nSources (${relevantDocs.length} documents):\n${sourcesNote}`,
      },
    ],
  };
}
/**
 * Handle a `papercortex_receipt` tool call.
 *
 * 1. Validate the document ID.
 * 2. Extract receipt data for that document via the receipt extractor.
 * 3. Return the structured receipt information as formatted text.
 *
 * TODO: Cache extraction results to avoid re-processing
 * TODO: Add confidence thresholds and human review flags
 * TODO: Store extracted data back as Paperless-ngx custom fields
 *
 * @param ctx - Tool context; `ctx.ollama` and `ctx.paperless` are handed to the extractor.
 * @param args - Raw tool arguments: `documentId` (required positive integer).
 * @returns A single text content item: the formatted receipt, or an
 *   `Error: …` message when `documentId` is unusable.
 */
export async function handleReceipt(
  ctx: ToolContext,
  args: Record<string, unknown>,
): Promise<{ content: Array<{ type: "text"; text: string }> }> {
  const documentId: unknown = args.documentId;

  // Arguments arrive as untyped JSON; reject anything that is not a usable ID
  // before it reaches the extractor.
  if (
    typeof documentId !== "number" ||
    !Number.isInteger(documentId) ||
    documentId <= 0
  ) {
    return {
      content: [
        { type: "text", text: "Error: documentId must be a positive integer." },
      ],
    };
  }

  const extractor = createReceiptExtractor({
    ollama: ctx.ollama,
    paperless: ctx.paperless,
  });

  const receipt = await extractor.extract(documentId);

  // Format line items table
  const lineItemsTable =
    receipt.lineItems.length > 0
      ? receipt.lineItems
          .map(
            (item, i) =>
              ` ${i + 1}. ${item.description} | ` +
              `${item.quantity}x ${item.unitPrice.toFixed(2)} = ${item.totalPrice.toFixed(2)}`,
          )
          .join("\n")
      : " No line items extracted";

  const output =
    `Receipt Data for Document #${documentId}:\n\n` +
    `Vendor: ${receipt.vendor}\n` +
    `Address: ${receipt.vendorAddress ?? "N/A"}\n` +
    `Tax ID: ${receipt.vendorTaxId ?? "N/A"}\n` +
    `Date: ${receipt.date}\n` +
    `Currency: ${receipt.currency}\n` +
    `\nAmounts:\n` +
    ` Subtotal: ${receipt.subtotal?.toFixed(2) ?? "N/A"}\n` +
    ` Tax (${receipt.taxRate ?? "?"}%): ${receipt.taxAmount?.toFixed(2) ?? "N/A"}\n` +
    ` Total: ${receipt.totalAmount.toFixed(2)}\n` +
    `\nPayment: ${receipt.paymentMethod ?? "N/A"}\n` +
    `Category: ${receipt.category ?? "uncategorized"}\n` +
    `Confidence: ${(receipt.confidence * 100).toFixed(0)}%\n` +
    `\nLine Items:\n${lineItemsTable}`;

  return { content: [{ type: "text", text: output }] };
}
/**
 * Handle a `papercortex_search` tool call.
 *
 * 1. Generate an embedding for the search query via Ollama.
 * 2. Search the local vector store for similar documents (min score 0.4).
 * 3. Return ranked results with scores, tags and a short content preview.
 *
 * TODO: Add hybrid search (combine vector + keyword for better recall)
 * TODO: Add date range filtering
 * TODO: Add result caching for repeated queries
 *
 * @param ctx - Tool context providing `ollama.embed` and `vectorStore.search`.
 * @param args - Raw tool arguments: `query` (required string), `limit`
 *   (optional positive integer, default 10) and `tags` (optional string array).
 * @returns A single text content item: the ranked hits, a "no documents"
 *   notice, or an `Error: …` message for an empty or non-string query.
 */
export async function handleSearch(
  ctx: ToolContext,
  args: Record<string, unknown>,
): Promise<{ content: Array<{ type: "text"; text: string }> }> {
  // Arguments arrive as untyped JSON; a non-string query is treated as missing.
  const query = typeof args.query === "string" ? args.query : "";

  if (query.trim().length === 0) {
    return {
      content: [{ type: "text", text: "Error: search query cannot be empty." }],
    };
  }

  // Fall back to the default when the requested limit is not a positive integer.
  const requestedLimit = args.limit;
  const limit =
    typeof requestedLimit === "number" &&
    Number.isInteger(requestedLimit) &&
    requestedLimit > 0
      ? requestedLimit
      : 10;

  // Only a real array is used as a tag filter; spreading a bare string would
  // silently turn it into a per-character filter.
  const tagFilter = Array.isArray(args.tags)
    ? args.tags.filter((tag): tag is string => typeof tag === "string")
    : undefined;

  // Generate embedding for the query
  const queryEmbedding = await ctx.ollama.embed(query);

  // Search vector store
  const results = ctx.vectorStore.search(queryEmbedding.vector, {
    limit,
    minScore: 0.4,
    tagFilter,
  });

  if (results.length === 0) {
    return {
      content: [
        {
          type: "text",
          text: `No documents found matching "${query}". The vector store may need to be populated first.`,
        },
      ],
    };
  }

  // Format results
  const previewLength = 200;
  const formatted = results
    .map((r, i) => {
      // Mark the preview with an ellipsis only when content was actually cut.
      const wasCut = r.content.length > previewLength;
      const preview =
        r.content.slice(0, previewLength).replace(/\n/g, " ") +
        (wasCut ? "..." : "");
      return (
        `${i + 1}. [Document #${r.documentId}] (score: ${r.score.toFixed(3)})\n` +
        ` Title: ${r.title}\n` +
        ` Tags: ${r.tags.length > 0 ? r.tags.join(", ") : "none"}\n` +
        ` Preview: ${preview}`
      );
    })
    .join("\n\n");

  return {
    content: [
      {
        type: "text",
        text: `Found ${results.length} documents matching "${query}":\n\n${formatted}`,
      },
    ],
  };
}
// ---------------------------------------------------------------------------
// Client interface
// ---------------------------------------------------------------------------

/** Typed access to the Paperless-ngx REST API endpoints used by PaperCortex. */
export interface PaperlessClient {
  /** Fetch a single document by ID. */
  getDocument(id: number): Promise<PaperlessDocument>;

  /** Search / list documents with optional filters. */
  getDocuments(
    params?: DocumentSearchParams,
  ): Promise<PaginatedResponse<PaperlessDocument>>;

  /** Fetch the first page of correspondents. */
  getCorrespondents(): Promise<PaginatedResponse<Correspondent>>;

  /** Fetch the first page of tags. */
  getTags(): Promise<PaginatedResponse<Tag>>;

  /** Fetch the first page of document types. */
  getDocumentTypes(): Promise<PaginatedResponse<DocumentType>>;

  /** Download the file content of a document. */
  downloadDocument(id: number): Promise<ArrayBuffer>;

  /** Replace the tags on a document and return the updated document. */
  updateDocumentTags(
    id: number,
    tagIds: readonly number[],
  ): Promise<PaperlessDocument>;
}

// ---------------------------------------------------------------------------
// Implementation
// ---------------------------------------------------------------------------

/**
 * Create a new Paperless-ngx API client.
 *
 * Every call is bounded by `config.timeout` milliseconds (default 30 000),
 * covering both the request and reading the response body. A timeout is
 * reported as an `Error` naming the URL and the limit instead of a bare
 * abort error.
 *
 * @param config - Connection configuration (URL + token, optional timeout).
 * @returns A {@link PaperlessClient} instance.
 * @throws Error from any client method when the API answers with a non-2xx
 *   status, the request times out, or the network call fails.
 */
export function createPaperlessClient(config: PaperlessConfig): PaperlessClient {
  const { baseUrl, token, timeout = 30_000 } = config;

  // Strip trailing slashes once so paths can be appended verbatim.
  const apiRoot = `${baseUrl.replace(/\/+$/, "")}/api`;

  const jsonHeaders: Record<string, string> = {
    Authorization: `Token ${token}`,
    "Content-Type": "application/json",
    Accept: "application/json; version=3",
  };

  /**
   * Run one HTTP call under the configured timeout.
   *
   * `read` consumes the response inside the timed section, so a stalled body
   * download is aborted as well.
   */
  async function send<T>(
    path: string,
    init: RequestInit,
    read: (response: Response) => Promise<T>,
  ): Promise<T> {
    const url = `${apiRoot}${path}`;
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), timeout);

    try {
      const response = await fetch(url, { ...init, signal: controller.signal });
      return await read(response);
    } catch (error: unknown) {
      // Only our own timer aborts this controller, so an aborted signal means
      // the deadline passed.
      if (controller.signal.aborted) {
        throw new Error(`Paperless request timed out after ${timeout}ms: ${url}`);
      }
      throw error;
    } finally {
      clearTimeout(timer);
    }
  }

  /**
   * JSON request helper: sends the default headers (overridable per call) and
   * parses the response body as `T`.
   */
  async function request<T>(path: string, options: RequestInit = {}): Promise<T> {
    const extraHeaders =
      (options.headers as Record<string, string> | undefined) ?? {};

    return send<T>(
      path,
      { ...options, headers: { ...jsonHeaders, ...extraHeaders } },
      async (response) => {
        if (!response.ok) {
          // The body is best-effort context; a failed read must not mask the status.
          const body = await response.text().catch(() => "");
          throw new Error(
            `Paperless API error: ${response.status} ${response.statusText} -- ${body}`,
          );
        }
        return (await response.json()) as T;
      },
    );
  }

  /**
   * Build a query string from search params. `undefined`/`null` values are
   * dropped and array values are joined with commas.
   */
  function buildQuery(params?: DocumentSearchParams): string {
    if (!params) return "";
    const searchParams = new URLSearchParams();
    for (const [key, value] of Object.entries(params)) {
      if (value === undefined || value === null) continue;
      searchParams.set(
        key,
        Array.isArray(value) ? value.join(",") : String(value),
      );
    }
    const encoded = searchParams.toString();
    return encoded.length > 0 ? `?${encoded}` : "";
  }

  return {
    async getDocument(id) {
      return request<PaperlessDocument>(`/documents/${id}/`);
    },

    async getDocuments(params) {
      return request<PaginatedResponse<PaperlessDocument>>(
        `/documents/${buildQuery(params)}`,
      );
    },

    async getCorrespondents() {
      return request<PaginatedResponse<Correspondent>>("/correspondents/");
    },

    async getTags() {
      return request<PaginatedResponse<Tag>>("/tags/");
    },

    async getDocumentTypes() {
      return request<PaginatedResponse<DocumentType>>("/document_types/");
    },

    async downloadDocument(id) {
      // Binary download: only the auth header is sent, no JSON content negotiation.
      return send<ArrayBuffer>(
        `/documents/${id}/download/`,
        { headers: { Authorization: `Token ${token}` } },
        async (response) => {
          if (!response.ok) {
            throw new Error(
              `Paperless download error: ${response.status} ${response.statusText}`,
            );
          }
          return response.arrayBuffer();
        },
      );
    },

    async updateDocumentTags(id, tagIds) {
      return request<PaperlessDocument>(`/documents/${id}/`, {
        method: "PATCH",
        body: JSON.stringify({ tags: [...tagIds] }),
      });
    },
  };
}
note: string; + readonly created: string; + readonly user: number; +} + +export interface CustomFieldValue { + readonly field: number; + readonly value: string | number | boolean | null; +} + +export interface Correspondent { + readonly id: number; + readonly slug: string; + readonly name: string; + readonly match: string; + readonly matching_algorithm: number; + readonly is_insensitive: boolean; + readonly document_count: number; + readonly last_correspondence: string | null; +} + +export interface DocumentType { + readonly id: number; + readonly slug: string; + readonly name: string; + readonly match: string; + readonly matching_algorithm: number; + readonly is_insensitive: boolean; + readonly document_count: number; +} + +export interface Tag { + readonly id: number; + readonly slug: string; + readonly name: string; + readonly color: string; + readonly text_color: string; + readonly match: string; + readonly matching_algorithm: number; + readonly is_insensitive: boolean; + readonly is_inbox_tag: boolean; + readonly document_count: number; +} + +export interface StoragePath { + readonly id: number; + readonly slug: string; + readonly name: string; + readonly path: string; + readonly match: string; + readonly matching_algorithm: number; + readonly is_insensitive: boolean; + readonly document_count: number; +} + +// --------------------------------------------------------------------------- +// Search & filter +// --------------------------------------------------------------------------- + +export interface DocumentSearchParams { + readonly query?: string; + readonly correspondent__id?: number; + readonly document_type__id?: number; + readonly tags__id__all?: readonly number[]; + readonly tags__id__none?: readonly number[]; + readonly created__date__gt?: string; + readonly created__date__lt?: string; + readonly ordering?: string; + readonly page?: number; + readonly page_size?: number; +} + +// 
--------------------------------------------------------------------------- +// API client configuration +// --------------------------------------------------------------------------- + +export interface PaperlessConfig { + readonly baseUrl: string; + readonly token: string; + readonly timeout?: number; +} diff --git a/src/receipt/datev.ts b/src/receipt/datev.ts new file mode 100644 index 0000000..6b31516 --- /dev/null +++ b/src/receipt/datev.ts @@ -0,0 +1,171 @@ +/** + * DATEV export formatter. + * + * Generates DATEV-compatible CSV files for import into German accounting + * software (DATEV Unternehmen Online, lexoffice, sevDesk, etc.). + * + * Implements the DATEV "Buchungsstapel" (posting batch) format v7.0+. + * + * @see https://developer.datev.de/datev/platform/en/dtvf/formate + * + * @example + * ```ts + * const exporter = createDatevExporter({ consultantNumber: 12345, clientNumber: 67890 }); + * const csv = exporter.generateCsv(receiptData); + * writeFileSync("./export.csv", csv); + * ``` + */ + +import { stringify } from "csv-stringify/sync"; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +export interface DatevConfig { + /** DATEV consultant number (Beraternummer). */ + readonly consultantNumber: number; + /** DATEV client number (Mandantennummer). */ + readonly clientNumber: number; + /** Fiscal year start (1-12, default: 1 for January). */ + readonly fiscalYearStart?: number; + /** Default debit account length (SKR03/SKR04). 
*/ + readonly accountLength?: 4 | 5; +} + +export interface DatevBookingEntry { + readonly amount: number; + readonly debitAccount: string; + readonly creditAccount: string; + readonly taxCode: string; + readonly date: string; + readonly description: string; + readonly documentNumber: string; + readonly costCenter?: string; +} + +export interface ReceiptForExport { + readonly documentId: number; + readonly vendor: string; + readonly date: string; + readonly totalAmount: number; + readonly taxRate: number | null; + readonly category: string | null; +} + +export interface DatevExporter { + /** Generate DATEV CSV from receipt data. */ + generateCsv(receipts: readonly ReceiptForExport[]): string; + + /** Map a receipt to a DATEV booking entry. */ + mapToBooking(receipt: ReceiptForExport): DatevBookingEntry; +} + +// --------------------------------------------------------------------------- +// Constants +// --------------------------------------------------------------------------- + +/** + * Map expense categories to SKR03 accounts. + * TODO: Add SKR04 mapping support + * TODO: Make configurable via user settings + */ +const SKR03_ACCOUNT_MAP: Record = { + office_supplies: "4930", + travel: "4660", + food: "4650", + telephone: "4920", + postage: "4910", + insurance: "4360", + rent: "4210", + advertising: "4600", + software: "4964", + hardware: "4980", + consulting: "4950", + training: "4945", + vehicle: "4500", + default: "4900", +}; + +/** + * Map tax rates to DATEV tax codes (Steuerschluessel). + */ +const TAX_CODE_MAP: Record = { + 19: "9", // 19% USt (standard) + 7: "8", // 7% USt (reduced) + 0: "0", // Tax-free +}; + +// --------------------------------------------------------------------------- +// Implementation +// --------------------------------------------------------------------------- + +/** + * Create a DATEV-format exporter for receipt data. 
/**
 * Create a DATEV-format exporter for receipt data.
 *
 * TODO: Implement DATEV header line with metadata (consultant, client, date range)
 * TODO: Add validation for account numbers against SKR03/SKR04
 * TODO: Support DATEV XML format (Buchungsdaten v5.0)
 *
 * @param config - Exporter configuration. The values are accepted but not yet
 *   written to the output (see the header-line TODO).
 * @returns An exporter exposing `mapToBooking` and `generateCsv`.
 */
export function createDatevExporter(config: DatevConfig): DatevExporter {
  // Destructured for the future header line; intentionally unused for now.
  const {
    consultantNumber: _consultantNumber,
    clientNumber: _clientNumber,
    fiscalYearStart: _fiscalYearStart = 1,
    accountLength: _accountLength = 4,
  } = config;

  /**
   * Convert a receipt date to the four-digit day+month form (DDMM) used in the
   * document-date column.
   *
   * Accepts ISO dates with or without a time suffix (`YYYY-MM-DD…`) and German
   * dates (`DD.MM.YYYY`); single-digit parts are zero-padded. Anything else is
   * passed through unchanged, as before.
   */
  function toDatevDate(date: string): string {
    const value = date.trim();

    const iso = /^\d{4}-(\d{1,2})-(\d{1,2})(?!\d)/.exec(value);
    if (iso) {
      const [, month = "", day = ""] = iso;
      return day.padStart(2, "0") + month.padStart(2, "0");
    }

    const german = /^(\d{1,2})\.(\d{1,2})\.\d{4}$/.exec(value);
    if (german) {
      const [, day = "", month = ""] = german;
      return day.padStart(2, "0") + month.padStart(2, "0");
    }

    return date;
  }

  /**
   * Map one receipt to a booking entry: expense account from the category,
   * tax code from the tax rate, bank account "1200" as the credit side.
   */
  function mapToBooking(receipt: ReceiptForExport): DatevBookingEntry {
    // Own-property checks keep LLM-supplied keys such as "constructor" or
    // "toString" from resolving to Object.prototype members.
    const category = receipt.category ?? "default";
    const debitAccount = Object.prototype.hasOwnProperty.call(
      SKR03_ACCOUNT_MAP,
      category,
    )
      ? SKR03_ACCOUNT_MAP[category]
      : SKR03_ACCOUNT_MAP["default"];

    // Unknown or missing rates fall back to the 19% code.
    const taxRate = receipt.taxRate ?? 19;
    const taxCode = Object.prototype.hasOwnProperty.call(
      TAX_CODE_MAP,
      String(taxRate),
    )
      ? TAX_CODE_MAP[taxRate]
      : TAX_CODE_MAP[19];

    return {
      amount: receipt.totalAmount,
      debitAccount,
      creditAccount: "1200", // Bank account (SKR03 default)
      taxCode,
      date: toDatevDate(receipt.date),
      description: receipt.vendor.slice(0, 60), // DATEV max 60 chars
      documentNumber: `PC-${receipt.documentId}`,
      costCenter: undefined,
    };
  }

  /**
   * Render the receipts as semicolon-delimited, fully quoted CSV rows with
   * CRLF line endings. No header row is emitted.
   */
  function generateCsv(receipts: readonly ReceiptForExport[]): string {
    const bookings = receipts.map(mapToBooking);

    // DATEV Buchungsstapel columns
    // NOTE(review): negative totals (refunds) are written with their sign and
    // a fixed "S" flag — confirm against the DATEV format specification.
    const rows = bookings.map((b) => [
      b.amount.toFixed(2).replace(".", ","), // Umsatz (amount with comma)
      "S", // Soll/Haben (S = Soll/Debit)
      b.taxCode, // BU-Schluessel (tax code)
      b.debitAccount, // Gegenkonto (offset account)
      b.date, // Belegdatum (document date)
      b.documentNumber, // Belegfeld 1 (document number)
      "", // Belegfeld 2
      b.description, // Buchungstext (description)
      "", // Umsatzsteuer-ID
      b.creditAccount, // Konto (account)
      b.costCenter ?? "", // Kostenstelle (cost center)
    ]);

    return stringify(rows, {
      delimiter: ";",
      quoted: true,
      record_delimiter: "\r\n",
    });
  }

  return { generateCsv, mapToBooking };
}
paperless: PaperlessClient; +} + +export interface ReceiptExtractor { + /** Extract structured receipt data from a Paperless-ngx document. */ + extract(documentId: number): Promise; + + /** Batch-extract receipts from multiple documents. */ + extractBatch(documentIds: readonly number[]): Promise; +} + +// --------------------------------------------------------------------------- +// Prompts +// --------------------------------------------------------------------------- + +const EXTRACTION_SYSTEM_PROMPT = `You are a receipt data extraction assistant. Given the OCR text of a receipt, extract structured data in JSON format. + +Extract the following fields: +- vendor: Company/store name +- vendorAddress: Full address if visible +- vendorTaxId: Tax ID / VAT number if visible (e.g., USt-IdNr, Steuernummer) +- date: Date in ISO 8601 format (YYYY-MM-DD) +- currency: ISO 4217 currency code (e.g., EUR, USD) +- subtotal: Amount before tax (null if not distinguishable) +- taxRate: Tax percentage as decimal (e.g., 19 for 19%) +- taxAmount: Tax amount +- totalAmount: Total amount including tax +- paymentMethod: Payment method if visible (cash, card, etc.) +- lineItems: Array of { description, quantity, unitPrice, totalPrice, taxRate } +- category: Suggested expense category (office_supplies, travel, food, etc.) +- confidence: Your confidence in the extraction (0.0 to 1.0) + +Respond ONLY with valid JSON. No explanation, no markdown.`; + +// --------------------------------------------------------------------------- +// Implementation +// --------------------------------------------------------------------------- + +/** + * Create a receipt data extractor. 
/**
 * Create a receipt data extractor.
 *
 * TODO: Add support for image-based receipts (pass images to multimodal LLM)
 * TODO: Add receipt template matching for common vendors
 * TODO: Add currency conversion support
 *
 * @param config - The Ollama client used for completion and the Paperless
 *   client used to fetch document content.
 * @returns An extractor exposing `extract` (single document) and
 *   `extractBatch` (sequential, order-preserving).
 */
export function createReceiptExtractor(
  config: ReceiptExtractorConfig,
): ReceiptExtractor {
  const { ollama, paperless } = config;

  /** Narrow an unknown value to a plain (non-array) object. */
  function isRecord(value: unknown): value is Record<string, unknown> {
    return typeof value === "object" && value !== null && !Array.isArray(value);
  }

  /**
   * Pull a JSON object out of an LLM reply.
   *
   * Tries, in order: the whole reply, the contents of the first Markdown code
   * fence, and the span from the first `{` to the last `}`. Returns `null`
   * when none of them parses to an object.
   */
  function parseModelJson(raw: string): Record<string, unknown> | null {
    const candidates: string[] = [raw.trim()];

    const fenced = /```(?:json)?\s*([\s\S]*?)```/i.exec(raw);
    if (fenced?.[1]) candidates.push(fenced[1].trim());

    const firstBrace = raw.indexOf("{");
    const lastBrace = raw.lastIndexOf("}");
    if (firstBrace !== -1 && lastBrace > firstBrace) {
      candidates.push(raw.slice(firstBrace, lastBrace + 1));
    }

    for (const candidate of candidates) {
      try {
        const value: unknown = JSON.parse(candidate);
        if (isRecord(value)) return value;
      } catch {
        // Not valid JSON in this form; fall through to the next candidate.
      }
    }
    return null;
  }

  /** Accept finite numbers and numeric strings; everything else is `null`. */
  function toNumber(value: unknown): number | null {
    if (typeof value === "number") return Number.isFinite(value) ? value : null;
    if (typeof value === "string" && value.trim() !== "") {
      const parsed = Number(value);
      return Number.isFinite(parsed) ? parsed : null;
    }
    return null;
  }

  /** Stringify truthy values; empty or missing values become `null`. */
  function toOptionalString(value: unknown): string | null {
    return value ? String(value) : null;
  }

  /**
   * Extract receipt data for one document.
   *
   * @throws Error when the document has no OCR content or the LLM reply
   *   contains no parseable JSON object. Errors from the Paperless and Ollama
   *   clients propagate unchanged.
   */
  async function extractSingle(documentId: number): Promise<ReceiptData> {
    // Fetch the document content from Paperless-ngx
    const document = await paperless.getDocument(documentId);
    const ocrText = document.content;

    if (!ocrText || ocrText.trim().length === 0) {
      throw new Error(
        `Document ${documentId} has no OCR content. Ensure Paperless-ngx has processed the document.`,
      );
    }

    // Send to Ollama for structured extraction
    const prompt = `Extract receipt data from the following OCR text:\n\n---\n${ocrText}\n---`;
    const completion = await ollama.complete(prompt, EXTRACTION_SYSTEM_PROMPT);

    // Parse LLM response
    // TODO: Validate against Zod schema for type safety
    const parsed = parseModelJson(completion.text);
    if (parsed === null) {
      throw new Error(
        `Failed to parse receipt extraction result for document ${documentId}. ` +
          `LLM response was not valid JSON.`,
      );
    }

    // Fall back to today's date when the model supplied no usable date string.
    const date =
      typeof parsed.date === "string" && parsed.date.trim() !== ""
        ? parsed.date.trim()
        : new Date().toISOString().slice(0, 10);

    // Entries that are not objects cannot be line items and are skipped.
    const lineItems = Array.isArray(parsed.lineItems)
      ? parsed.lineItems.filter(isRecord).map((item) => ({
          description: String(item.description ?? ""),
          quantity: toNumber(item.quantity) ?? 1,
          unitPrice: toNumber(item.unitPrice) ?? 0,
          totalPrice: toNumber(item.totalPrice) ?? 0,
          taxRate: toNumber(item.taxRate),
        }))
      : [];

    // Confidence is documented as 0.0–1.0; clamp whatever the model returns.
    const confidence = Math.min(1, Math.max(0, toNumber(parsed.confidence) ?? 0.5));

    return {
      documentId,
      vendor: String(parsed.vendor ?? "Unknown"),
      vendorAddress: toOptionalString(parsed.vendorAddress),
      vendorTaxId: toOptionalString(parsed.vendorTaxId),
      date,
      currency: String(parsed.currency ?? "EUR"),
      subtotal: toNumber(parsed.subtotal),
      taxRate: toNumber(parsed.taxRate),
      taxAmount: toNumber(parsed.taxAmount),
      totalAmount: toNumber(parsed.totalAmount) ?? 0,
      paymentMethod: toOptionalString(parsed.paymentMethod),
      lineItems,
      category: toOptionalString(parsed.category),
      confidence,
      rawText: ocrText,
    };
  }

  return {
    extract: extractSingle,

    async extractBatch(documentIds) {
      // Processed one at a time, in order; the first failure rejects the batch.
      // TODO: Add concurrency control (process N at a time)
      // TODO: Add progress reporting callback
      const results: ReceiptData[] = [];
      for (const id of documentIds) {
        results.push(await extractSingle(id));
      }
      return results;
    },
  };
}
+ * + * @example + * ```ts + * const matcher = createTransactionMatcher(); + * const bankTxns = await matcher.parseBankCsv("./bank_export.csv"); + * const matches = matcher.matchReceipts(receipts, bankTxns); + * ``` + */ + +import { parse } from "csv-parse/sync"; +import { readFileSync } from "node:fs"; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +export interface BankTransaction { + readonly date: string; + readonly description: string; + readonly amount: number; + readonly currency: string; + readonly iban: string | null; + readonly bic: string | null; + readonly reference: string | null; + readonly rawLine: string; +} + +export interface ReceiptMatchCandidate { + readonly documentId: number; + readonly vendor: string; + readonly date: string; + readonly totalAmount: number; + readonly currency: string; +} + +export interface MatchResult { + readonly receipt: ReceiptMatchCandidate; + readonly transaction: BankTransaction; + readonly confidence: number; + readonly matchReasons: readonly string[]; +} + +export interface UnmatchedItem { + readonly type: "receipt" | "transaction"; + readonly item: ReceiptMatchCandidate | BankTransaction; +} + +export interface MatchSummary { + readonly matched: readonly MatchResult[]; + readonly unmatchedReceipts: readonly ReceiptMatchCandidate[]; + readonly unmatchedTransactions: readonly BankTransaction[]; + readonly matchRate: number; +} + +export interface TransactionMatcher { + /** Parse a bank CSV export file into structured transactions. */ + parseBankCsv(filePath: string, format?: BankCsvFormat): readonly BankTransaction[]; + + /** Match receipts against bank transactions. 
export type BankCsvFormat = "auto" | "sparkasse" | "ing" | "dkb" | "volksbank" | "generic";

// ---------------------------------------------------------------------------
// Implementation
// ---------------------------------------------------------------------------

/**
 * Create a transaction matcher for bank CSV reconciliation.
 *
 * The returned object exposes two operations:
 * - `parseBankCsv` reads a bank CSV export from disk and maps each row onto a
 *   `BankTransaction` using a generic German/English column mapping.
 * - `matchReceipts` greedily pairs receipts with bank transactions using an
 *   additive confidence score (amount, date proximity, vendor name).
 *
 * TODO: Add ML-based fuzzy matching for vendor names
 * TODO: Add support for MT940/CAMT.053 bank statement formats
 * TODO: Add date tolerance configuration (match within N days)
 *
 * @returns A matcher exposing `parseBankCsv` and `matchReceipts`.
 */
export function createTransactionMatcher(): TransactionMatcher {
  const MS_PER_DAY = 24 * 60 * 60 * 1000;
  /** A candidate must score strictly above this value to count as a match. */
  const MIN_MATCH_CONFIDENCE = 0.5;
  /** Number of leading vendor characters searched for in the description. */
  const VENDOR_PREFIX_LENGTH = 8;

  /**
   * Pick the column delimiter from the first non-empty line of the file.
   * Semicolon wins (typical for German exports), then tab, then comma.
   */
  function detectDelimiter(raw: string): string {
    const headerLine =
      raw.split(/\r?\n/, 50).find((line) => line.trim() !== "") ?? "";
    if (headerLine.includes(";")) return ";";
    if (headerLine.includes("\t")) return "\t";
    if (headerLine.includes(",")) return ",";
    return ";";
  }

  /**
   * Parse an amount written in German ("1.234,56") or English ("1,234.56",
   * "12.50") notation. Returns NaN when the text is not numeric.
   */
  function parseAmount(value: string): number {
    const compact = value.replace(/\s/g, "");
    const lastComma = compact.lastIndexOf(",");
    const lastDot = compact.lastIndexOf(".");

    if (lastComma !== -1 && lastDot !== -1) {
      // Both separators present: whichever comes last is the decimal mark.
      return lastComma > lastDot
        ? parseFloat(compact.replace(/\./g, "").replace(",", "."))
        : parseFloat(compact.replace(/,/g, ""));
    }
    if (lastComma !== -1) {
      // Comma only: treat it as the decimal mark (German notation).
      return parseFloat(compact.replace(",", "."));
    }
    if (/^[+-]?\d{1,3}(\.\d{3})+$/.test(compact)) {
      // Dots only, in groups of three: thousands separators ("1.234").
      return parseFloat(compact.replace(/\./g, ""));
    }
    return parseFloat(compact);
  }

  /**
   * Convert a date string to epoch milliseconds. `DD.MM.YYYY` and `DD.MM.YY`
   * are parsed explicitly (as UTC midnight) because `Date` does not accept
   * them; everything else is delegated to `Date`. Returns NaN if unparseable.
   */
  function toTimestamp(value: string): number {
    const german = /^(\d{1,2})\.(\d{1,2})\.(\d{2}|\d{4})$/.exec(value.trim());
    if (german) {
      const [, day, month, year] = german;
      // Two-digit years are interpreted as 20YY.
      const fullYear = year.length === 2 ? 2000 + Number(year) : Number(year);
      return Date.UTC(fullYear, Number(month) - 1, Number(day));
    }
    return new Date(value).getTime();
  }

  /**
   * Parse a bank CSV export into transactions.
   *
   * @param filePath - Path of the CSV file, read synchronously as UTF-8.
   * @param format - Requested bank format. Currently accepted but not yet used:
   *   every format goes through the same generic column mapping.
   * @returns One transaction per CSV record. `amount` is NaN when the amount
   *   column holds non-numeric text.
   */
  function parseBankCsv(
    filePath: string,
    format: BankCsvFormat = "auto",
  ): readonly BankTransaction[] {
    // TODO: Implement format auto-detection based on header patterns
    // TODO: Implement format-specific column mappings
    void format; // Acknowledge format parameter for future use

    // Drop a leading UTF-8 BOM so the first header name is not polluted by it.
    const raw = readFileSync(filePath, "utf-8").replace(/^\uFEFF/, "");

    const records = parse(raw, {
      columns: true,
      skip_empty_lines: true,
      delimiter: detectDelimiter(raw),
      relaxColumnCount: true,
    }) as Record<string, string>[];

    return records.map((record): BankTransaction => {
      // Generic column mapping: German header names first, English fallbacks.
      return {
        date: record["Buchungstag"] ?? record["Date"] ?? record["Datum"] ?? "",
        description:
          record["Verwendungszweck"] ??
          record["Description"] ??
          record["Buchungstext"] ??
          "",
        amount: parseAmount(record["Betrag"] ?? record["Amount"] ?? "0"),
        currency: record["Waehrung"] ?? record["Currency"] ?? "EUR",
        iban: record["IBAN"] ?? null,
        bic: record["BIC"] ?? null,
        reference: record["Kundenreferenz"] ?? record["Reference"] ?? null,
        rawLine: JSON.stringify(record),
      };
    });
  }

  /**
   * Match receipts against bank transactions by amount, date proximity and
   * vendor name.
   *
   * Scoring per receipt/transaction pair:
   * - amount: +0.5 if within 0.01, +0.3 if within 1.0 (sign of the bank amount
   *   is ignored)
   * - date: +0.3 under 1 day apart, +0.15 under 3 days, +0.05 under 7 days;
   *   skipped when either date cannot be parsed
   * - vendor: +0.2 if the first 8 characters of the vendor name occur in the
   *   transaction description (case-insensitive); skipped for empty vendors
   *
   * Receipts are processed in input order; each takes its highest-scoring
   * still-unused transaction, provided the score is above 0.5. Each
   * transaction is used at most once.
   *
   * @param receipts - Receipts to reconcile.
   * @param transactions - Bank transactions to match against.
   * @returns Matched pairs, the leftovers on both sides, and the share of
   *   receipts that were matched (0 when there are no receipts).
   */
  function matchReceipts(
    receipts: readonly ReceiptMatchCandidate[],
    transactions: readonly BankTransaction[],
  ): MatchSummary {
    const matched: MatchResult[] = [];
    // Track receipts by position so duplicate document ids cannot hide an
    // unmatched receipt from the summary.
    const matchedReceiptIndices = new Set<number>();
    const matchedTxnIndices = new Set<number>();

    // Per-transaction values do not depend on the receipt: compute them once.
    const txnTimestamps = transactions.map((txn) => toTimestamp(txn.date));
    const txnDescriptions = transactions.map((txn) =>
      txn.description.toLowerCase(),
    );

    // TODO: Implement smarter matching with vendor name fuzzy matching
    // TODO: Add configurable date tolerance window
    // TODO: Handle split transactions (one receipt, multiple bank entries)

    receipts.forEach((receipt, receiptIndex) => {
      const receiptTimestamp = toTimestamp(receipt.date);
      const vendorPrefix = receipt.vendor
        .trim()
        .toLowerCase()
        .slice(0, VENDOR_PREFIX_LENGTH);

      let bestIndex = -1;
      let bestConfidence = 0;
      let bestReasons: string[] = [];

      for (let i = 0; i < transactions.length; i++) {
        if (matchedTxnIndices.has(i)) continue;

        const txn = transactions[i];
        const reasons: string[] = [];
        let confidence = 0;

        // Amount matching (exact or close)
        const amountDiff = Math.abs(Math.abs(txn.amount) - receipt.totalAmount);
        if (amountDiff < 0.01) {
          confidence += 0.5;
          reasons.push("exact_amount_match");
        } else if (amountDiff < 1.0) {
          confidence += 0.3;
          reasons.push("close_amount_match");
        }

        // Date matching (NaN from an unparseable date fails every comparison)
        const daysDiff =
          Math.abs(receiptTimestamp - txnTimestamps[i]) / MS_PER_DAY;
        if (daysDiff < 1) {
          confidence += 0.3;
          reasons.push("same_day");
        } else if (daysDiff < 3) {
          confidence += 0.15;
          reasons.push("within_3_days");
        } else if (daysDiff < 7) {
          confidence += 0.05;
          reasons.push("within_7_days");
        }

        // Vendor name in description. An empty prefix would match every
        // description, so it must not earn the bonus.
        if (vendorPrefix !== "" && txnDescriptions[i].includes(vendorPrefix)) {
          confidence += 0.2;
          reasons.push("vendor_in_description");
        }

        if (
          confidence > MIN_MATCH_CONFIDENCE &&
          (bestIndex === -1 || confidence > bestConfidence)
        ) {
          bestIndex = i;
          bestConfidence = confidence;
          bestReasons = reasons;
        }
      }

      if (bestIndex !== -1) {
        matched.push({
          receipt,
          transaction: transactions[bestIndex],
          confidence: bestConfidence,
          matchReasons: bestReasons,
        });
        matchedReceiptIndices.add(receiptIndex);
        matchedTxnIndices.add(bestIndex);
      }
    });

    const unmatchedReceipts = receipts.filter(
      (_, i) => !matchedReceiptIndices.has(i),
    );
    const unmatchedTransactions = transactions.filter(
      (_, i) => !matchedTxnIndices.has(i),
    );

    return {
      matched,
      unmatchedReceipts,
      unmatchedTransactions,
      matchRate: receipts.length > 0 ? matched.length / receipts.length : 0,
    };
  }

  return { parseBankCsv, matchReceipts };
}
+ +### papercortex_query +Ask natural language questions about your document archive. + +``` +"How much did I spend on office supplies in Q1 2024?" +"Which invoices are still unpaid?" +"Summarize all contracts expiring this year" +``` + +### papercortex_export +Export receipt data for accounting software. + +``` +Export documents #100, #101, #102 as DATEV CSV +Export documents #200, #201 as generic CSV +``` + +## Workflow Examples + +### Monthly Bookkeeping +1. Search for all receipts from the current month +2. Extract data from each receipt +3. Export as DATEV CSV +4. Import into accounting software + +### Document Organization +1. Find unclassified documents (no tags) +2. Auto-classify each document +3. Review and approve suggested tags + +### Expense Analysis +1. Query: "What were my top 5 expense categories last quarter?" +2. Drill into specific categories with follow-up queries +3. Export relevant receipts for documentation diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000..a80386d --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,24 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "ESNext", + "moduleResolution": "bundler", + "lib": ["ES2022"], + "outDir": "./dist", + "rootDir": "./src", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "resolveJsonModule": true, + "declaration": true, + "declarationMap": true, + "sourceMap": true, + "noUnusedLocals": true, + "noUnusedParameters": true, + "noImplicitReturns": true, + "noFallthroughCasesInSwitch": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist", "**/*.test.ts"] +}