feat: add crawlee python worker integration

2026-05-09 14:06:34 +02:00 · 2026-05-09 14:06:34 +02:00 · 60531b6250
commit 60531b6250
parent 6ee10bf301
15 changed files with 325 additions and 0 deletions
--- a/docs/TIP_CRAWLEE_RUNTIME.md
+++ b/docs/TIP_CRAWLEE_RUNTIME.md
@ -0,0 +1,58 @@
+# TIP Crawlee Runtime
+
+## Decision
+
+TIP standardizes on Crawlee as the crawler runtime.
+
+- Production TypeScript path: `packages/scraper` with `apify/crawlee` and Playwright.
+- Optional Python worker path: `packages/crawlee-python` with `apify/crawlee-python`.
+
+## TypeScript Core
+
+The TypeScript scraper remains the canonical production path because TIP already
+uses it for DB writes, price observations, stock observations, image verification
+and detail verification.
+
+Useful FS.com commands:
+
+```bash
+pnpm -C packages/scraper run scrape:fs:db-detail
+pnpm -C packages/scraper run scrape:fs:url-discovery
+```
+
+Erik safety defaults:
+
+- keep FS.com at browser concurrency `1`
+- use bounded run caps
+- treat no-text and max-retry URLs as retry/classification classes
+- keep Crawlee storage isolated with `makeCrawleeConfig(...)`
+
+## Python Worker
+
+The Python worker is optional and should be rolled out on Pi/Proxmox/residential
+nodes first. It writes JSONL evidence and does not write directly to the TIP DB.
+
+Install:
+
+```bash
+cd packages/crawlee-python
+python3 -m venv .venv
+. .venv/bin/activate
+python -m pip install -U pip
+python -m pip install -e ".[beautifulsoup]"
+```
+
+Smoke:
+
+```bash
+python -m tip_crawlee_worker \
+  --mode beautifulsoup \
+  --url https://crawlee.dev \
+  --out /tmp/tip-crawlee-python-smoke.jsonl \
+  --max-requests 1
+```
+
+## Training Pool
+
+Every crawler result, failure class, parser lesson and runtime safety lesson
+should be written to the TIPLLM training pool and synced through `sync/`.
--- a/packages/crawlee-python/README.md
+++ b/packages/crawlee-python/README.md
@ -0,0 +1,42 @@
+# TIP Crawlee Python Worker
+
+Optional Python crawler worker for Pi/Proxmox/residential nodes.
+
+The TypeScript scraper package remains the production crawler core. This package
+exists for isolated worker experiments where Python extraction libraries are a
+better fit. It writes JSONL artifacts; it does not write directly to TIP
+PostgreSQL.
+
+## Install
+
+```bash
+cd packages/crawlee-python
+python3 -m venv .venv
+. .venv/bin/activate
+python -m pip install -U pip
+python -m pip install -e ".[beautifulsoup]"
+```
+
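+For browser-based Python workers (note: the CLI currently only implements
+`--mode beautifulsoup`, so this extra is not used by the worker yet):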
+
+```bash
+python -m pip install -e ".[playwright]"
+python -m playwright install chromium
+```
+
+## Smoke Run
+
+```bash
+python -m tip_crawlee_worker \
+  --mode beautifulsoup \
+  --url https://crawlee.dev \
+  --out /tmp/tip-crawlee-python-smoke.jsonl \
+  --max-requests 1
+```
+
+## TIP Policy
+
+- Use this on Pi/Proxmox/residential nodes first, not as an Erik-heavy crawler.
+- Keep output as JSONL evidence until a deterministic importer validates it.
+- Record useful crawler outcomes in the TIPLLM training pool.
+- Use TIPLLM only for planning/extraction feedback; no external AI.
--- a/packages/crawlee-python/pyproject.toml
+++ b/packages/crawlee-python/pyproject.toml
@ -0,0 +1,27 @@
+[project]
+name = "tip-crawlee-python-worker"
+version = "0.1.0"
+description = "Optional Crawlee Python worker for TIP crawler nodes"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+  "crawlee>=1.0.0",
+]
+
+[project.optional-dependencies]
+beautifulsoup = [
+  "crawlee[beautifulsoup]>=1.0.0",
+]
+playwright = [
+  "crawlee[playwright]>=1.0.0",
+  "playwright>=1.50.0",
+]
+all = [
+  "crawlee[all]>=1.0.0",
+]
+
+[project.scripts]
+tip-crawlee-worker = "tip_crawlee_worker.__main__:main"
+
+[tool.ruff]
+line-length = 100
--- a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/PKG-INFO
+++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/PKG-INFO
@ -0,0 +1,57 @@
+Metadata-Version: 2.4
+Name: tip-crawlee-python-worker
+Version: 0.1.0
+Summary: Optional Crawlee Python worker for TIP crawler nodes
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: crawlee>=1.0.0
+Provides-Extra: beautifulsoup
+Requires-Dist: crawlee[beautifulsoup]>=1.0.0; extra == "beautifulsoup"
+Provides-Extra: playwright
+Requires-Dist: crawlee[playwright]>=1.0.0; extra == "playwright"
+Requires-Dist: playwright>=1.50.0; extra == "playwright"
+Provides-Extra: all
+Requires-Dist: crawlee[all]>=1.0.0; extra == "all"
+
+# TIP Crawlee Python Worker
+
+Optional Python crawler worker for Pi/Proxmox/residential nodes.
+
+The TypeScript scraper package remains the production crawler core. This package
+exists for isolated worker experiments where Python extraction libraries are a
+better fit. It writes JSONL artifacts; it does not write directly to TIP
+PostgreSQL.
+
+## Install
+
+```bash
+cd packages/crawlee-python
+python3 -m venv .venv
+. .venv/bin/activate
+python -m pip install -U pip
+python -m pip install -e ".[beautifulsoup]"
+```
+
+For browser-based Python workers:
+
+```bash
+python -m pip install -e ".[playwright]"
+python -m playwright install chromium
+```
+
+## Smoke Run
+
+```bash
+python -m tip_crawlee_worker \
+  --mode beautifulsoup \
+  --url https://crawlee.dev \
+  --out /tmp/tip-crawlee-python-smoke.jsonl \
+  --max-requests 1
+```
+
+## TIP Policy
+
+- Use this on Pi/Proxmox/residential nodes first, not as an Erik-heavy crawler.
+- Keep output as JSONL evidence until a deterministic importer validates it.
+- Record useful crawler outcomes in the TIPLLM training pool.
+- Use TIPLLM only for planning/extraction feedback; no external AI.
--- a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/SOURCES.txt
+++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/SOURCES.txt
@ -0,0 +1,10 @@
+README.md
+pyproject.toml
+tip_crawlee_python_worker.egg-info/PKG-INFO
+tip_crawlee_python_worker.egg-info/SOURCES.txt
+tip_crawlee_python_worker.egg-info/dependency_links.txt
+tip_crawlee_python_worker.egg-info/entry_points.txt
+tip_crawlee_python_worker.egg-info/requires.txt
+tip_crawlee_python_worker.egg-info/top_level.txt
+tip_crawlee_worker/__init__.py
+tip_crawlee_worker/__main__.py
--- a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/dependency_links.txt
+++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/dependency_links.txt
@ -0,0 +1 @@
+
--- a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/entry_points.txt
+++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/entry_points.txt
@ -0,0 +1,2 @@
+[console_scripts]
+tip-crawlee-worker = tip_crawlee_worker.__main__:main
--- a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/requires.txt
+++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/requires.txt
@ -0,0 +1,11 @@
+crawlee>=1.0.0
+
+[all]
+crawlee[all]>=1.0.0
+
+[beautifulsoup]
+crawlee[beautifulsoup]>=1.0.0
+
+[playwright]
+crawlee[playwright]>=1.0.0
+playwright>=1.50.0
--- a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/top_level.txt
+++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/top_level.txt
@ -0,0 +1 @@
+tip_crawlee_worker
--- a/packages/crawlee-python/tip_crawlee_worker/__init__.py
+++ b/packages/crawlee-python/tip_crawlee_worker/__init__.py
@ -0,0 +1,5 @@
+"""TIP optional Crawlee Python worker."""
+
+# Package version; keep in sync with `version` in pyproject.toml.
+__version__ = "0.1.0"
+
+# Only the version string is part of the public surface of this package.
+__all__ = ["__version__"]
--- a/packages/crawlee-python/tip_crawlee_worker/__main__.py
+++ b/packages/crawlee-python/tip_crawlee_worker/__main__.py
@ -0,0 +1,85 @@
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+from datetime import timedelta
+from pathlib import Path
+from typing import Any
+
+from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+
+
+def _positive_int(raw: str) -> int:
+    """argparse ``type`` callback: parse ``raw`` as an int and reject values below 1."""
+    try:
+        value = int(raw)
+    except ValueError:
+        raise argparse.ArgumentTypeError(f"invalid int value: {raw!r}") from None
+    if value < 1:
+        raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
+    return value
+
+
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    """Parse the worker's command-line options.
+
+    Args:
+        argv: Argument list to parse. ``None`` (the default) makes argparse read
+            ``sys.argv[1:]``, which is what the console entry point relies on.
+
+    Returns:
+        Namespace with ``mode``, ``url`` (list, one entry per ``--url``), ``out``,
+        ``max_requests``, ``timeout_seconds`` and ``label``.
+
+    Raises:
+        SystemExit: argparse exits with status 2 on missing/invalid options, including
+            non-positive ``--max-requests`` or ``--timeout-seconds``.
+    """
+    parser = argparse.ArgumentParser(description="TIP optional Crawlee Python worker")
+    parser.add_argument("--mode", choices=["beautifulsoup"], default="beautifulsoup")
+    parser.add_argument("--url", action="append", required=True, help="URL to crawl. Repeatable.")
+    parser.add_argument("--out", required=True, help="JSONL output path.")
+    # Both limits are fed straight into the crawler; zero or negative values would only
+    # produce confusing runtime behaviour, so reject them at the CLI boundary.
+    parser.add_argument("--max-requests", type=_positive_int, default=10)
+    parser.add_argument("--timeout-seconds", type=_positive_int, default=30)
+    parser.add_argument("--label", default="tip-crawlee-python")
+    return parser.parse_args(argv)
+
+
+def write_jsonl(path: Path, record: dict[str, Any]) -> None:
+    """Append ``record`` to ``path`` as a single JSON line.
+
+    Missing parent directories are created. Keys are emitted in sorted order and
+    non-ASCII characters are written as-is (UTF-8), followed by a newline.
+    """
+    target_dir = path.parent
+    target_dir.mkdir(parents=True, exist_ok=True)
+    with open(path, mode="a", encoding="utf-8") as sink:
+        serialized = json.dumps(record, sort_keys=True, ensure_ascii=False)
+        sink.write(f"{serialized}\n")
+
+
+async def run_beautifulsoup(args: argparse.Namespace) -> None:
+    """Crawl ``args.url`` with a ``BeautifulSoupCrawler`` and write one JSONL record per page.
+
+    Reads ``args.out``, ``args.timeout_seconds``, ``args.max_requests``, ``args.label``
+    and ``args.url`` from the parsed CLI namespace. Each handled page is appended to
+    ``args.out`` via ``write_jsonl`` with its URL, title, up to 25 headings (h1-h3),
+    up to 25 image ``src`` values and up to 100 link ``href`` values.
+    """
+    out = Path(args.out)
+    # write_jsonl appends, so drop any previous file to keep only this run's records.
+    # missing_ok avoids the exists()/unlink() race if the file disappears in between.
+    out.unlink(missing_ok=True)
+
+    crawler = BeautifulSoupCrawler(
+        max_request_retries=1,
+        request_handler_timeout=timedelta(seconds=args.timeout_seconds),
+        max_requests_per_crawl=args.max_requests,
+    )
+
+    @crawler.router.default_handler
+    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+        soup = context.soup
+        # <title> may be absent, or have no single string child; both yield None.
+        title_text = soup.title.string if soup.title else None
+        title = title_text.strip() if title_text else None
+        headings = [
+            heading.get_text(" ", strip=True)
+            for heading in soup.find_all(["h1", "h2", "h3"])
+        ][:25]
+        # Attribute values are recorded exactly as found in the markup (not resolved).
+        images = [
+            img.get("src")
+            for img in soup.find_all("img")
+            if img.get("src")
+        ][:25]
+        links = [
+            link.get("href")
+            for link in soup.find_all("a")
+            if link.get("href")
+        ][:100]
+        write_jsonl(
+            out,
+            {
+                "kind": "tip_crawlee_python_page",
+                "label": args.label,
+                "url": context.request.url,
+                "title": title,
+                "headings": headings,
+                "image_candidates": images,
+                "link_candidates": links,
+            },
+        )
+
+    await crawler.run(args.url)
+
+
+async def async_main() -> None:
+    """Parse CLI options and dispatch to the crawler implementation for the chosen mode."""
+    options = parse_args()
+    # Any mode other than "beautifulsoup" is a no-op here.
+    if options.mode != "beautifulsoup":
+        return
+    await run_beautifulsoup(options)
+
+
+def main() -> None:
+    """Synchronous entry point: run ``async_main()`` to completion via ``asyncio.run``."""
+    entry_coroutine = async_main()
+    asyncio.run(entry_coroutine)
+
+
+# Allows `python -m tip_crawlee_worker` to start the worker.
+if __name__ == "__main__":
+    main()
--- a/packages/crawlee-python/tip_crawlee_worker/__pycache__/__init__.cpython-314.pyc
+++ b/packages/crawlee-python/tip_crawlee_worker/__pycache__/__init__.cpython-314.pyc
--- a/packages/crawlee-python/tip_crawlee_worker/__pycache__/__main__.cpython-314.pyc
+++ b/packages/crawlee-python/tip_crawlee_worker/__pycache__/__main__.cpython-314.pyc
--- a/packages/scraper/package.json
+++ b/packages/scraper/package.json
@ -8,6 +8,8 @@
    "build": "tsc",
    "dev": "tsx src/index.ts",
    "scrape:fs": "tsx src/scrapers/fs-com.ts",
+    "scrape:fs:db-detail": "FS_DB_DETAIL_ONLY=1 TIP_FORCE_REVALIDATE=1 tsx src/scrapers/fs-com.ts",
+    "scrape:fs:url-discovery": "FS_URL_DISCOVERY_ONLY=1 TIP_FORCE_REVALIDATE=1 tsx src/scrapers/fs-com.ts",
    "scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
    "scrape:optcore": "tsx src/scrapers/optcore.ts",
    "scrape:news": "tsx src/scrapers/news.ts",
--- a/scripts/setup-crawlee-python-worker.sh
+++ b/scripts/setup-crawlee-python-worker.sh
@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# Create a virtualenv for the TIP Crawlee Python worker and install the package into it
+# in editable mode.
+#
+# Environment overrides:
+#   TIP_CRAWLEE_PY_VENV   venv location (default: /opt/tip-crawlee-python-venv)
+#   TIP_CRAWLEE_PY_EXTRA  optional-dependency extra to install (default: beautifulsoup)
+set -euo pipefail
+
+# Repository root = parent of the directory containing this script.
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+VENV_DIR="${TIP_CRAWLEE_PY_VENV:-/opt/tip-crawlee-python-venv}"
+EXTRA="${TIP_CRAWLEE_PY_EXTRA:-beautifulsoup}"
+
+VENV_PYTHON="$VENV_DIR/bin/python"
+PACKAGE_DIR="$ROOT_DIR/packages/crawlee-python"
+
+python3 -m venv "$VENV_DIR"
+"$VENV_PYTHON" -m pip install -U pip
+"$VENV_PYTHON" -m pip install -e "${PACKAGE_DIR}[${EXTRA}]"
+
+# Unquoted heredoc: variables expand and each "\\" prints as a single backslash.
+cat <<EOF
+TIP Crawlee Python worker installed.
+
+Venv:
+  $VENV_DIR
+
+Smoke test:
+  $VENV_PYTHON -m tip_crawlee_worker \\
+    --mode beautifulsoup \\
+    --url https://crawlee.dev \\
+    --out /tmp/tip-crawlee-python-smoke.jsonl \\
+    --max-requests 1
+EOF