8 changed files with 0 additions and 247 deletions
--- a/docs/TIP_CRAWLEE_RUNTIME.md
+++ b/docs/TIP_CRAWLEE_RUNTIME.md
@@ -1,58 +0,0 @@
-# TIP Crawlee Runtime
-
-## Decision
-
-TIP standardizes on Crawlee as the crawler runtime.
-
-- Production TypeScript path: `packages/scraper` with `apify/crawlee` and Playwright.
-- Optional Python worker path: `packages/crawlee-python` with `apify/crawlee-python`.
-
-## TypeScript Core
-
-The TypeScript scraper remains the canonical production path because TIP already
-uses it for DB writes, price observations, stock observations, image verification
-and detail verification.
-
-Useful FS.com commands:
-
-```bash
-pnpm -C packages/scraper run scrape:fs:db-detail
-pnpm -C packages/scraper run scrape:fs:url-discovery
-```
-
-Erik safety defaults:
-
-- keep FS.com at browser concurrency `1`
-- use bounded run caps
-- treat no-text and max-retry URLs as retry/classification classes
-- keep Crawlee storage isolated with `makeCrawleeConfig(...)`
-
-## Python Worker
-
-The Python worker is optional and should run first on Pi/Proxmox/residential
-nodes. It writes JSONL evidence and does not write directly into TIP DB.
-
-Install:
-
-```bash
-cd packages/crawlee-python
-python3 -m venv .venv
-. .venv/bin/activate
-python -m pip install -U pip
-python -m pip install -e ".[beautifulsoup]"
-```
-
-Smoke:
-
-```bash
-python -m tip_crawlee_worker \
-  --mode beautifulsoup \
-  --url https://crawlee.dev \
-  --out /tmp/tip-crawlee-python-smoke.jsonl \
-  --max-requests 1
-```
-
-## Training Pool
-
-Every crawler result, failure class, parser lesson and runtime safety lesson
-should be written to the TIPLLM training pool and synced through `sync/`.
--- a/packages/crawlee-python/.gitignore
+++ b/packages/crawlee-python/.gitignore
@@ -1,4 +0,0 @@
-*.egg-info/
-__pycache__/
-*.py[cod]
-.venv/
--- a/packages/crawlee-python/README.md
+++ b/packages/crawlee-python/README.md
@@ -1,42 +0,0 @@
-# TIP Crawlee Python Worker
-
-Optional Python crawler worker for Pi/Proxmox/residential nodes.
-
-The TypeScript scraper package remains the production crawler core. This package
-exists for isolated worker experiments where Python extraction libraries are a
-better fit. It writes JSONL artifacts; it does not write directly to TIP
-PostgreSQL.
-
-## Install
-
-```bash
-cd packages/crawlee-python
-python3 -m venv .venv
-. .venv/bin/activate
-python -m pip install -U pip
-python -m pip install -e ".[beautifulsoup]"
-```
-
-For browser-based Python workers:
-
-```bash
-python -m pip install -e ".[playwright]"
-python -m playwright install chromium
-```
-
-## Smoke Run
-
-```bash
-python -m tip_crawlee_worker \
-  --mode beautifulsoup \
-  --url https://crawlee.dev \
-  --out /tmp/tip-crawlee-python-smoke.jsonl \
-  --max-requests 1
-```
-
-## TIP Policy
-
-- Use this on Pi/Proxmox/residential nodes first, not as an Erik-heavy crawler.
-- Keep output as JSONL evidence until a deterministic importer validates it.
-- Record useful crawler outcomes in the TIPLLM training pool.
-- Use TIPLLM only for planning/extraction feedback; no external AI.
--- a/packages/crawlee-python/pyproject.toml
+++ b/packages/crawlee-python/pyproject.toml
@@ -1,27 +0,0 @@
-[project]
-name = "tip-crawlee-python-worker"
-version = "0.1.0"
-description = "Optional Crawlee Python worker for TIP crawler nodes"
-readme = "README.md"
-requires-python = ">=3.11"
-dependencies = [
-  "crawlee>=1.0.0",
-]
-
-[project.optional-dependencies]
-beautifulsoup = [
-  "crawlee[beautifulsoup]>=1.0.0",
-]
-playwright = [
-  "crawlee[playwright]>=1.0.0",
-  "playwright>=1.50.0",
-]
-all = [
-  "crawlee[all]>=1.0.0",
-]
-
-[project.scripts]
-tip-crawlee-worker = "tip_crawlee_worker.__main__:main"
-
-[tool.ruff]
-line-length = 100
--- a/packages/crawlee-python/tip_crawlee_worker/__init__.py
+++ b/packages/crawlee-python/tip_crawlee_worker/__init__.py
@@ -1,5 +0,0 @@
-"""TIP optional Crawlee Python worker."""
-
-__all__ = ["__version__"]
-
-__version__ = "0.1.0"
--- a/packages/crawlee-python/tip_crawlee_worker/__main__.py
+++ b/packages/crawlee-python/tip_crawlee_worker/__main__.py
@@ -1,85 +0,0 @@
-from __future__ import annotations
-
-import argparse
-import asyncio
-import json
-from datetime import timedelta
-from pathlib import Path
-from typing import Any
-
-from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="TIP optional Crawlee Python worker")
-    parser.add_argument("--mode", choices=["beautifulsoup"], default="beautifulsoup")
-    parser.add_argument("--url", action="append", required=True, help="URL to crawl. Repeatable.")
-    parser.add_argument("--out", required=True, help="JSONL output path.")
-    parser.add_argument("--max-requests", type=int, default=10)
-    parser.add_argument("--timeout-seconds", type=int, default=30)
-    parser.add_argument("--label", default="tip-crawlee-python")
-    return parser.parse_args()
-
-
-def write_jsonl(path: Path, record: dict[str, Any]) -> None:
-    path.parent.mkdir(parents=True, exist_ok=True)
-    with path.open("a", encoding="utf-8") as handle:
-        handle.write(json.dumps(record, ensure_ascii=False, sort_keys=True) + "\n")
-
-
-async def run_beautifulsoup(args: argparse.Namespace) -> None:
-    out = Path(args.out)
-    if out.exists():
-        out.unlink()
-
-    crawler = BeautifulSoupCrawler(
-        max_request_retries=1,
-        request_handler_timeout=timedelta(seconds=args.timeout_seconds),
-        max_requests_per_crawl=args.max_requests,
-    )
-
-    @crawler.router.default_handler
-    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
-        title = context.soup.title.string.strip() if context.soup.title and context.soup.title.string else None
-        headings = [
-            heading.get_text(" ", strip=True)
-            for heading in context.soup.find_all(["h1", "h2", "h3"])
-        ][:25]
-        images = [
-            img.get("src")
-            for img in context.soup.find_all("img")
-            if img.get("src")
-        ][:25]
-        links = [
-            link.get("href")
-            for link in context.soup.find_all("a")
-            if link.get("href")
-        ][:100]
-        write_jsonl(
-            out,
-            {
-                "kind": "tip_crawlee_python_page",
-                "label": args.label,
-                "url": context.request.url,
-                "title": title,
-                "headings": headings,
-                "image_candidates": images,
-                "link_candidates": links,
-            },
-        )
-
-    await crawler.run(args.url)
-
-
-async def async_main() -> None:
-    args = parse_args()
-    if args.mode == "beautifulsoup":
-        await run_beautifulsoup(args)
-
-
-def main() -> None:
-    asyncio.run(async_main())
-
-
-if __name__ == "__main__":
-    main()
--- a/packages/scraper/package.json
+++ b/packages/scraper/package.json
@@ -8,8 +8,6 @@
    "build": "tsc",
    "dev": "tsx src/index.ts",
    "scrape:fs": "tsx src/scrapers/fs-com.ts",
-    "scrape:fs:db-detail": "FS_DB_DETAIL_ONLY=1 TIP_FORCE_REVALIDATE=1 tsx src/scrapers/fs-com.ts",
-    "scrape:fs:url-discovery": "FS_URL_DISCOVERY_ONLY=1 TIP_FORCE_REVALIDATE=1 tsx src/scrapers/fs-com.ts",
    "scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
    "scrape:optcore": "tsx src/scrapers/optcore.ts",
    "scrape:news": "tsx src/scrapers/news.ts",
--- a/scripts/setup-crawlee-python-worker.sh
+++ b/scripts/setup-crawlee-python-worker.sh
@@ -1,24 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
-VENV_DIR="${TIP_CRAWLEE_PY_VENV:-/opt/tip-crawlee-python-venv}"
-EXTRA="${TIP_CRAWLEE_PY_EXTRA:-beautifulsoup}"
-
-python3 -m venv "$VENV_DIR"
-"$VENV_DIR/bin/python" -m pip install -U pip
-"$VENV_DIR/bin/python" -m pip install -e "$ROOT_DIR/packages/crawlee-python[$EXTRA]"
-
-cat <<EOF
-TIP Crawlee Python worker installed.
-
-Venv:
-  $VENV_DIR
-
-Smoke test:
-  $VENV_DIR/bin/python -m tip_crawlee_worker \\
-    --mode beautifulsoup \\
-    --url https://crawlee.dev \\
-    --out /tmp/tip-crawlee-python-smoke.jsonl \\
-    --max-requests 1
-EOF