diff --git a/docs/TIP_CRAWLEE_RUNTIME.md b/docs/TIP_CRAWLEE_RUNTIME.md new file mode 100644 index 0000000..72dc9bb --- /dev/null +++ b/docs/TIP_CRAWLEE_RUNTIME.md @@ -0,0 +1,58 @@ +# TIP Crawlee Runtime + +## Decision + +TIP standardizes on Crawlee as the crawler runtime. + +- Production TypeScript path: `packages/scraper` with `apify/crawlee` and Playwright. +- Optional Python worker path: `packages/crawlee-python` with `apify/crawlee-python`. + +## TypeScript Core + +The TypeScript scraper remains the canonical production path because TIP already +uses it for DB writes, price observations, stock observations, image verification +and detail verification. + +Useful FS.com commands: + +```bash +pnpm -C packages/scraper run scrape:fs:db-detail +pnpm -C packages/scraper run scrape:fs:url-discovery +``` + +Erik safety defaults: + +- keep FS.com at browser concurrency `1` +- use bounded run caps +- treat no-text and max-retry URLs as retry/classification classes +- keep Crawlee storage isolated with `makeCrawleeConfig(...)` + +## Python Worker + +The Python worker is optional and should run first on Pi/Proxmox/residential +nodes. It writes JSONL evidence and does not write directly into TIP DB. + +Install: + +```bash +cd packages/crawlee-python +python3 -m venv .venv +. .venv/bin/activate +python -m pip install -U pip +python -m pip install -e ".[beautifulsoup]" +``` + +Smoke: + +```bash +python -m tip_crawlee_worker \ + --mode beautifulsoup \ + --url https://crawlee.dev \ + --out /tmp/tip-crawlee-python-smoke.jsonl \ + --max-requests 1 +``` + +## Training Pool + +Every crawler result, failure class, parser lesson and runtime safety lesson +should be written to the TIPLLM training pool and synced through `sync/`. 
diff --git a/packages/crawlee-python/README.md b/packages/crawlee-python/README.md new file mode 100644 index 0000000..f114a7f --- /dev/null +++ b/packages/crawlee-python/README.md @@ -0,0 +1,42 @@ +# TIP Crawlee Python Worker + +Optional Python crawler worker for Pi/Proxmox/residential nodes. + +The TypeScript scraper package remains the production crawler core. This package +exists for isolated worker experiments where Python extraction libraries are a +better fit. It writes JSONL artifacts; it does not write directly to TIP +PostgreSQL. + +## Install + +```bash +cd packages/crawlee-python +python3 -m venv .venv +. .venv/bin/activate +python -m pip install -U pip +python -m pip install -e ".[beautifulsoup]" +``` + +For browser-based Python workers: + +```bash +python -m pip install -e ".[playwright]" +python -m playwright install chromium +``` + +## Smoke Run + +```bash +python -m tip_crawlee_worker \ + --mode beautifulsoup \ + --url https://crawlee.dev \ + --out /tmp/tip-crawlee-python-smoke.jsonl \ + --max-requests 1 +``` + +## TIP Policy + +- Use this on Pi/Proxmox/residential nodes first, not as an Erik-heavy crawler. +- Keep output as JSONL evidence until a deterministic importer validates it. +- Record useful crawler outcomes in the TIPLLM training pool. +- Use TIPLLM only for planning/extraction feedback; no external AI. 
diff --git a/packages/crawlee-python/pyproject.toml b/packages/crawlee-python/pyproject.toml new file mode 100644 index 0000000..4ee9c69 --- /dev/null +++ b/packages/crawlee-python/pyproject.toml @@ -0,0 +1,27 @@ +[project] +name = "tip-crawlee-python-worker" +version = "0.1.0" +description = "Optional Crawlee Python worker for TIP crawler nodes" +readme = "README.md" +requires-python = ">=3.11" +dependencies = [ + "crawlee>=1.0.0", +] + +[project.optional-dependencies] +beautifulsoup = [ + "crawlee[beautifulsoup]>=1.0.0", +] +playwright = [ + "crawlee[playwright]>=1.0.0", + "playwright>=1.50.0", +] +all = [ + "crawlee[all]>=1.0.0", +] + +[project.scripts] +tip-crawlee-worker = "tip_crawlee_worker.__main__:main" + +[tool.ruff] +line-length = 100 diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/PKG-INFO b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/PKG-INFO new file mode 100644 index 0000000..43c3ec9 --- /dev/null +++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/PKG-INFO @@ -0,0 +1,57 @@ +Metadata-Version: 2.4 +Name: tip-crawlee-python-worker +Version: 0.1.0 +Summary: Optional Crawlee Python worker for TIP crawler nodes +Requires-Python: >=3.11 +Description-Content-Type: text/markdown +Requires-Dist: crawlee>=1.0.0 +Provides-Extra: beautifulsoup +Requires-Dist: crawlee[beautifulsoup]>=1.0.0; extra == "beautifulsoup" +Provides-Extra: playwright +Requires-Dist: crawlee[playwright]>=1.0.0; extra == "playwright" +Requires-Dist: playwright>=1.50.0; extra == "playwright" +Provides-Extra: all +Requires-Dist: crawlee[all]>=1.0.0; extra == "all" + +# TIP Crawlee Python Worker + +Optional Python crawler worker for Pi/Proxmox/residential nodes. + +The TypeScript scraper package remains the production crawler core. This package +exists for isolated worker experiments where Python extraction libraries are a +better fit. It writes JSONL artifacts; it does not write directly to TIP +PostgreSQL. 
+ +## Install + +```bash +cd packages/crawlee-python +python3 -m venv .venv +. .venv/bin/activate +python -m pip install -U pip +python -m pip install -e ".[beautifulsoup]" +``` + +For browser-based Python workers: + +```bash +python -m pip install -e ".[playwright]" +python -m playwright install chromium +``` + +## Smoke Run + +```bash +python -m tip_crawlee_worker \ + --mode beautifulsoup \ + --url https://crawlee.dev \ + --out /tmp/tip-crawlee-python-smoke.jsonl \ + --max-requests 1 +``` + +## TIP Policy + +- Use this on Pi/Proxmox/residential nodes first, not as an Erik-heavy crawler. +- Keep output as JSONL evidence until a deterministic importer validates it. +- Record useful crawler outcomes in the TIPLLM training pool. +- Use TIPLLM only for planning/extraction feedback; no external AI. diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/SOURCES.txt b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/SOURCES.txt new file mode 100644 index 0000000..f3312c5 --- /dev/null +++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/SOURCES.txt @@ -0,0 +1,10 @@ +README.md +pyproject.toml +tip_crawlee_python_worker.egg-info/PKG-INFO +tip_crawlee_python_worker.egg-info/SOURCES.txt +tip_crawlee_python_worker.egg-info/dependency_links.txt +tip_crawlee_python_worker.egg-info/entry_points.txt +tip_crawlee_python_worker.egg-info/requires.txt +tip_crawlee_python_worker.egg-info/top_level.txt +tip_crawlee_worker/__init__.py +tip_crawlee_worker/__main__.py \ No newline at end of file diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/dependency_links.txt b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/entry_points.txt 
"""Command-line entry point of the optional TIP Crawlee Python worker.

Crawls every URL passed with ``--url`` using Crawlee's ``BeautifulSoupCrawler``
and appends one JSON object per fetched page to the JSONL file given with
``--out``. (Module path in the package: ``tip_crawlee_worker/__main__.py``.)
"""

from __future__ import annotations

import argparse
import asyncio
import json
from collections.abc import Sequence
from datetime import timedelta
from pathlib import Path
from typing import Any

# The crawler classes need the optional ``beautifulsoup`` extra. Guarding the
# import keeps ``--help`` and argument validation usable on a bare install;
# the saved error is re-raised with an install hint once a crawl is started.
try:
    from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
except ImportError as exc:
    BeautifulSoupCrawler = None
    BeautifulSoupCrawlingContext = None
    _CRAWLEE_IMPORT_ERROR: ImportError | None = exc
else:
    _CRAWLEE_IMPORT_ERROR = None

# Upper bounds on how many candidates of each kind are kept per page record.
_MAX_HEADINGS = 25
_MAX_IMAGES = 25
_MAX_LINKS = 100


def _positive_int(value: str) -> int:
    """Parse *value* as an integer >= 1 (argparse ``type=`` callable).

    Raises:
        argparse.ArgumentTypeError: if *value* is not an integer or is < 1.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"expected an integer, got {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"expected a positive integer, got {number}")
    return number


def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    """Parse the worker's command-line options.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the original behavior.

    Returns:
        Namespace with ``mode``, ``url`` (list of str), ``out``,
        ``max_requests``, ``timeout_seconds`` and ``label``.

    Raises:
        SystemExit: raised by argparse on invalid or missing options,
            including a non-positive ``--max-requests`` / ``--timeout-seconds``.
    """
    parser = argparse.ArgumentParser(description="TIP optional Crawlee Python worker")
    parser.add_argument("--mode", choices=["beautifulsoup"], default="beautifulsoup")
    parser.add_argument("--url", action="append", required=True, help="URL to crawl. Repeatable.")
    parser.add_argument("--out", required=True, help="JSONL output path.")
    # Both limits are validated up front so a typo fails fast instead of
    # starting a crawl with a zero or negative cap/timeout.
    parser.add_argument("--max-requests", type=_positive_int, default=10)
    parser.add_argument("--timeout-seconds", type=_positive_int, default=30)
    parser.add_argument("--label", default="tip-crawlee-python")
    return parser.parse_args(argv)


def write_jsonl(path: Path, record: dict[str, Any]) -> None:
    """Append *record* to *path* as a single JSON line.

    Missing parent directories are created. Keys are written in sorted order
    and non-ASCII characters are kept as-is (UTF-8 encoded).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as handle:
        handle.write(json.dumps(record, ensure_ascii=False, sort_keys=True) + "\n")


def _page_record(soup: Any, *, url: str, label: str) -> dict[str, Any]:
    """Build the JSONL record for one parsed page.

    *soup* is the parsed document exposed by the crawling context; only its
    ``title`` attribute and ``find_all`` method are used here.
    """
    title_tag = soup.title
    title = title_tag.string.strip() if title_tag and title_tag.string else None
    headings = [
        heading.get_text(" ", strip=True) for heading in soup.find_all(["h1", "h2", "h3"])
    ][:_MAX_HEADINGS]
    # ``src`` / ``href`` values are recorded exactly as they appear in the
    # markup; they are not resolved against the page URL.
    images = [img.get("src") for img in soup.find_all("img") if img.get("src")][:_MAX_IMAGES]
    links = [link.get("href") for link in soup.find_all("a") if link.get("href")][:_MAX_LINKS]
    return {
        "kind": "tip_crawlee_python_page",
        "label": label,
        "url": url,
        "title": title,
        "headings": headings,
        "image_candidates": images,
        "link_candidates": links,
    }


async def run_beautifulsoup(args: argparse.Namespace) -> None:
    """Crawl ``args.url`` with ``BeautifulSoupCrawler`` and write page records.

    Any existing file at ``args.out`` is removed first, so the output only
    contains records from this run.

    Raises:
        ImportError: if the Crawlee BeautifulSoup crawler could not be imported.
    """
    if BeautifulSoupCrawler is None:
        raise ImportError(
            "The Crawlee BeautifulSoup crawler is not available; install it with "
            'python -m pip install -e ".[beautifulsoup]"'
        ) from _CRAWLEE_IMPORT_ERROR

    out = Path(args.out)
    # missing_ok avoids the exists()/unlink() race of a two-step check.
    out.unlink(missing_ok=True)

    crawler = BeautifulSoupCrawler(
        max_request_retries=1,
        request_handler_timeout=timedelta(seconds=args.timeout_seconds),
        max_requests_per_crawl=args.max_requests,
    )

    # NOTE(review): only successfully handled pages are written. Requests that
    # still fail after the retry leave no trace in the JSONL output; consider
    # recording them as well (Crawlee failed-request hook - confirm the API).
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        record = _page_record(context.soup, url=context.request.url, label=args.label)
        write_jsonl(out, record)

    await crawler.run(args.url)


async def async_main() -> None:
    """Parse ``sys.argv`` and run the crawler for the selected mode."""
    args = parse_args()
    if args.mode == "beautifulsoup":
        await run_beautifulsoup(args)
        return
    # Unreachable while argparse restricts ``--mode`` to known choices; kept
    # so a newly added choice without a runner fails loudly.
    raise SystemExit(f"unsupported mode: {args.mode}")


def main() -> None:
    """Synchronous console-script entry point."""
    asyncio.run(async_main())


if __name__ == "__main__":
    main()
&& pwd)" +VENV_DIR="${TIP_CRAWLEE_PY_VENV:-/opt/tip-crawlee-python-venv}" +EXTRA="${TIP_CRAWLEE_PY_EXTRA:-beautifulsoup}" + +python3 -m venv "$VENV_DIR" +"$VENV_DIR/bin/python" -m pip install -U pip +"$VENV_DIR/bin/python" -m pip install -e "$ROOT_DIR/packages/crawlee-python[$EXTRA]" + +cat <