From 60531b62509204a5c12f2afb37bfe6741bdc8a52 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sat, 9 May 2026 14:06:34 +0200 Subject: [PATCH] feat: add crawlee python worker integration --- docs/TIP_CRAWLEE_RUNTIME.md | 58 ++++++++++++ packages/crawlee-python/README.md | 42 +++++++++ packages/crawlee-python/pyproject.toml | 27 ++++++ .../PKG-INFO | 57 ++++++++++++ .../SOURCES.txt | 10 +++ .../dependency_links.txt | 1 + .../entry_points.txt | 2 + .../requires.txt | 11 +++ .../top_level.txt | 1 + .../tip_crawlee_worker/__init__.py | 5 ++ .../tip_crawlee_worker/__main__.py | 85 ++++++++++++++++++ .../__pycache__/__init__.cpython-314.pyc | Bin 0 -> 331 bytes .../__pycache__/__main__.cpython-314.pyc | Bin 0 -> 6236 bytes packages/scraper/package.json | 2 + scripts/setup-crawlee-python-worker.sh | 24 +++++ 15 files changed, 325 insertions(+) create mode 100644 docs/TIP_CRAWLEE_RUNTIME.md create mode 100644 packages/crawlee-python/README.md create mode 100644 packages/crawlee-python/pyproject.toml create mode 100644 packages/crawlee-python/tip_crawlee_python_worker.egg-info/PKG-INFO create mode 100644 packages/crawlee-python/tip_crawlee_python_worker.egg-info/SOURCES.txt create mode 100644 packages/crawlee-python/tip_crawlee_python_worker.egg-info/dependency_links.txt create mode 100644 packages/crawlee-python/tip_crawlee_python_worker.egg-info/entry_points.txt create mode 100644 packages/crawlee-python/tip_crawlee_python_worker.egg-info/requires.txt create mode 100644 packages/crawlee-python/tip_crawlee_python_worker.egg-info/top_level.txt create mode 100644 packages/crawlee-python/tip_crawlee_worker/__init__.py create mode 100644 packages/crawlee-python/tip_crawlee_worker/__main__.py create mode 100644 packages/crawlee-python/tip_crawlee_worker/__pycache__/__init__.cpython-314.pyc create mode 100644 packages/crawlee-python/tip_crawlee_worker/__pycache__/__main__.cpython-314.pyc create mode 100755 scripts/setup-crawlee-python-worker.sh diff --git a/docs/TIP_CRAWLEE_RUNTIME.md b/docs/TIP_CRAWLEE_RUNTIME.md new file mode 100644 index 0000000..72dc9bb --- /dev/null +++ b/docs/TIP_CRAWLEE_RUNTIME.md @@ -0,0 +1,58 @@ +# TIP Crawlee Runtime + +## Decision + +TIP standardizes on Crawlee as the crawler runtime. + +- Production TypeScript path: `packages/scraper` with `apify/crawlee` and Playwright. +- Optional Python worker path: `packages/crawlee-python` with `apify/crawlee-python`. + +## TypeScript Core + +The TypeScript scraper remains the canonical production path because TIP already +uses it for DB writes, price observations, stock observations, image verification +and detail verification. + +Useful FS.com commands: + +```bash +pnpm -C packages/scraper run scrape:fs:db-detail +pnpm -C packages/scraper run scrape:fs:url-discovery +``` + +Erik safety defaults: + +- keep FS.com at browser concurrency `1` +- use bounded run caps +- treat no-text and max-retry URLs as retry/classification classes +- keep Crawlee storage isolated with `makeCrawleeConfig(...)` + +## Python Worker + +The Python worker is optional and should run first on Pi/Proxmox/residential +nodes. It writes JSONL evidence and does not write directly into TIP DB. + +Install: + +```bash +cd packages/crawlee-python +python3 -m venv .venv +. .venv/bin/activate +python -m pip install -U pip +python -m pip install -e ".[beautifulsoup]" +``` + +Smoke: + +```bash +python -m tip_crawlee_worker \ + --mode beautifulsoup \ + --url https://crawlee.dev \ + --out /tmp/tip-crawlee-python-smoke.jsonl \ + --max-requests 1 +``` + +## Training Pool + +Every crawler result, failure class, parser lesson and runtime safety lesson +should be written to the TIPLLM training pool and synced through `sync/`. diff --git a/packages/crawlee-python/README.md b/packages/crawlee-python/README.md new file mode 100644 index 0000000..f114a7f --- /dev/null +++ b/packages/crawlee-python/README.md @@ -0,0 +1,42 @@ +# TIP Crawlee Python Worker + +Optional Python crawler worker for Pi/Proxmox/residential nodes. + +The TypeScript scraper package remains the production crawler core. This package +exists for isolated worker experiments where Python extraction libraries are a +better fit. It writes JSONL artifacts; it does not write directly to TIP +PostgreSQL. + +## Install + +```bash +cd packages/crawlee-python +python3 -m venv .venv +. .venv/bin/activate +python -m pip install -U pip +python -m pip install -e ".[beautifulsoup]" +``` + +For browser-based Python workers: + +```bash +python -m pip install -e ".[playwright]" +python -m playwright install chromium +``` + +## Smoke Run + +```bash +python -m tip_crawlee_worker \ + --mode beautifulsoup \ + --url https://crawlee.dev \ + --out /tmp/tip-crawlee-python-smoke.jsonl \ + --max-requests 1 +``` + +## TIP Policy + +- Use this on Pi/Proxmox/residential nodes first, not as an Erik-heavy crawler. +- Keep output as JSONL evidence until a deterministic importer validates it. +- Record useful crawler outcomes in the TIPLLM training pool. +- Use TIPLLM only for planning/extraction feedback; no external AI. diff --git a/packages/crawlee-python/pyproject.toml b/packages/crawlee-python/pyproject.toml new file mode 100644 index 0000000..4ee9c69 --- /dev/null +++ b/packages/crawlee-python/pyproject.toml @@ -0,0 +1,27 @@ +[project] +name = "tip-crawlee-python-worker" +version = "0.1.0" +description = "Optional Crawlee Python worker for TIP crawler nodes" +readme = "README.md" +requires-python = ">=3.11" +dependencies = [ + "crawlee>=1.0.0", +] + +[project.optional-dependencies] +beautifulsoup = [ + "crawlee[beautifulsoup]>=1.0.0", +] +playwright = [ + "crawlee[playwright]>=1.0.0", + "playwright>=1.50.0", +] +all = [ + "crawlee[all]>=1.0.0", +] + +[project.scripts] +tip-crawlee-worker = "tip_crawlee_worker.__main__:main" + +[tool.ruff] +line-length = 100 diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/PKG-INFO b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/PKG-INFO new file mode 100644 index 0000000..43c3ec9 --- /dev/null +++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/PKG-INFO @@ -0,0 +1,57 @@ +Metadata-Version: 2.4 +Name: tip-crawlee-python-worker +Version: 0.1.0 +Summary: Optional Crawlee Python worker for TIP crawler nodes +Requires-Python: >=3.11 +Description-Content-Type: text/markdown +Requires-Dist: crawlee>=1.0.0 +Provides-Extra: beautifulsoup +Requires-Dist: crawlee[beautifulsoup]>=1.0.0; extra == "beautifulsoup" +Provides-Extra: playwright +Requires-Dist: crawlee[playwright]>=1.0.0; extra == "playwright" +Requires-Dist: playwright>=1.50.0; extra == "playwright" +Provides-Extra: all +Requires-Dist: crawlee[all]>=1.0.0; extra == "all" + +# TIP Crawlee Python Worker + +Optional Python crawler worker for Pi/Proxmox/residential nodes. + +The TypeScript scraper package remains the production crawler core. This package +exists for isolated worker experiments where Python extraction libraries are a +better fit. It writes JSONL artifacts; it does not write directly to TIP +PostgreSQL. + +## Install + +```bash +cd packages/crawlee-python +python3 -m venv .venv +. .venv/bin/activate +python -m pip install -U pip +python -m pip install -e ".[beautifulsoup]" +``` + +For browser-based Python workers: + +```bash +python -m pip install -e ".[playwright]" +python -m playwright install chromium +``` + +## Smoke Run + +```bash +python -m tip_crawlee_worker \ + --mode beautifulsoup \ + --url https://crawlee.dev \ + --out /tmp/tip-crawlee-python-smoke.jsonl \ + --max-requests 1 +``` + +## TIP Policy + +- Use this on Pi/Proxmox/residential nodes first, not as an Erik-heavy crawler. +- Keep output as JSONL evidence until a deterministic importer validates it. +- Record useful crawler outcomes in the TIPLLM training pool. +- Use TIPLLM only for planning/extraction feedback; no external AI. diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/SOURCES.txt b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/SOURCES.txt new file mode 100644 index 0000000..f3312c5 --- /dev/null +++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/SOURCES.txt @@ -0,0 +1,10 @@ +README.md +pyproject.toml +tip_crawlee_python_worker.egg-info/PKG-INFO +tip_crawlee_python_worker.egg-info/SOURCES.txt +tip_crawlee_python_worker.egg-info/dependency_links.txt +tip_crawlee_python_worker.egg-info/entry_points.txt +tip_crawlee_python_worker.egg-info/requires.txt +tip_crawlee_python_worker.egg-info/top_level.txt +tip_crawlee_worker/__init__.py +tip_crawlee_worker/__main__.py \ No newline at end of file diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/dependency_links.txt b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/entry_points.txt b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/entry_points.txt new file mode 100644 index 0000000..3cabb91 --- /dev/null +++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/entry_points.txt @@ -0,0 +1,2 @@ +[console_scripts] +tip-crawlee-worker = tip_crawlee_worker.__main__:main diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/requires.txt b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/requires.txt new file mode 100644 index 0000000..f13c8b8 --- /dev/null +++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/requires.txt @@ -0,0 +1,11 @@ +crawlee>=1.0.0 + +[all] +crawlee[all]>=1.0.0 + +[beautifulsoup] +crawlee[beautifulsoup]>=1.0.0 + +[playwright] +crawlee[playwright]>=1.0.0 +playwright>=1.50.0 diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/top_level.txt b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/top_level.txt new file mode 100644 index 0000000..1dabd13 --- /dev/null +++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/top_level.txt @@ -0,0 +1 @@ +tip_crawlee_worker diff --git a/packages/crawlee-python/tip_crawlee_worker/__init__.py b/packages/crawlee-python/tip_crawlee_worker/__init__.py new file mode 100644 index 0000000..6886fde --- /dev/null +++ b/packages/crawlee-python/tip_crawlee_worker/__init__.py @@ -0,0 +1,5 @@ +"""TIP optional Crawlee Python worker.""" + +__all__ = ["__version__"] + +__version__ = "0.1.0" diff --git a/packages/crawlee-python/tip_crawlee_worker/__main__.py b/packages/crawlee-python/tip_crawlee_worker/__main__.py new file mode 100644 index 0000000..954fad2 --- /dev/null +++ b/packages/crawlee-python/tip_crawlee_worker/__main__.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import argparse +import asyncio +import json +from datetime import timedelta +from pathlib import Path +from typing import Any + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="TIP optional Crawlee Python worker") + parser.add_argument("--mode", choices=["beautifulsoup"], default="beautifulsoup") + parser.add_argument("--url", action="append", required=True, help="URL to crawl. Repeatable.") + parser.add_argument("--out", required=True, help="JSONL output path.") + parser.add_argument("--max-requests", type=int, default=10) + parser.add_argument("--timeout-seconds", type=int, default=30) + parser.add_argument("--label", default="tip-crawlee-python") + return parser.parse_args() + + +def write_jsonl(path: Path, record: dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("a", encoding="utf-8") as handle: + handle.write(json.dumps(record, ensure_ascii=False, sort_keys=True) + "\n") + + +async def run_beautifulsoup(args: argparse.Namespace) -> None: + out = Path(args.out) + if out.exists(): + out.unlink() + + crawler = BeautifulSoupCrawler( + max_request_retries=1, + request_handler_timeout=timedelta(seconds=args.timeout_seconds), + max_requests_per_crawl=args.max_requests, + ) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + title = context.soup.title.string.strip() if context.soup.title and context.soup.title.string else None + headings = [ + heading.get_text(" ", strip=True) + for heading in context.soup.find_all(["h1", "h2", "h3"]) + ][:25] + images = [ + img.get("src") + for img in context.soup.find_all("img") + if img.get("src") + ][:25] + links = [ + link.get("href") + for link in context.soup.find_all("a") + if link.get("href") + ][:100] + write_jsonl( + out, + { + "kind": "tip_crawlee_python_page", + "label": args.label, + "url": context.request.url, + "title": title, + "headings": headings, + "image_candidates": images, + "link_candidates": links, + }, + ) + + await crawler.run(args.url) + + +async def async_main() -> None: + args = parse_args() + if args.mode == "beautifulsoup": + await run_beautifulsoup(args) + + +def main() -> None: + asyncio.run(async_main()) + + +if __name__ == "__main__": + main() diff --git a/packages/crawlee-python/tip_crawlee_worker/__pycache__/__init__.cpython-314.pyc b/packages/crawlee-python/tip_crawlee_worker/__pycache__/__init__.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e2015f96e3d78b32e7c7846251c48a67c1adaaa GIT binary patch literal 331 zcmXv~%SyvQ6rD-c7K>dexD0j`q?x)GH*w=eO3_e~;3t_? z!51>%8^Zkn^xn2L9HxvhL<)& z;WY>Om}#CBe1eu`J9h*9=Z-m{VLyVMQwAVP;Q(U2s5{p)l~)pPOvl@4van4K-z`e% UCy6%Q^Eb9?Z+d6zt~v6HU$QD=e*gdg literal 0 HcmV?d00001 diff --git a/packages/crawlee-python/tip_crawlee_worker/__pycache__/__main__.cpython-314.pyc b/packages/crawlee-python/tip_crawlee_worker/__pycache__/__main__.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41be146b197b96d07cc8b6f83176e473af971e4b GIT binary patch literal 6236 zcmbtYZ)_CD6`#G`+uQqpHa2JD10S}59&k3;5Kw}WU=#ldj&avuMTz6na<{fOKHptt z_X=1jF-a6`rK(L63FlMz(#nxiDU~Xf_ESPqDgD$ladXE8q(rGy@y&!#O1`x3&HeFV zOibFbZ|3dWH?wbMe(%rBE|1$mp!~MsKiY4A`W7FG#gZqu+kXaPiVPE(>ml8o!PO&e z>9H6V*763=TEP&Y<$J8%q9HOFp~u!O84{zdJ@#&g;b64b?PGN1OB>{hgU=HZ1Kk7UrOK^jUqO2q*V4#- zp~{=c3vs4XNhFenVra>P-UrWiLz_@zYTQuZLFiYEF=+Y2i8CRKS=Fv8X+s-H#|M(> zR0mZ~#Z_ts9^`6?(T-%oP){3C4nBiFEVy_)Jl+lfF+~Q+u;t*JpeXPH-Dy<;9UColp!NKhR*l+Hyiysop}> zggT-{$Bc=z8V3zqj;Q*$kxaF8#FcbR4R$1BYRjl*jHOS8shUdaEe2H*dQ{bZq0(^d zWDATxu8gXBOO#Df4X4f+W64B|p`{`PR^(KYj>FKA$b_OLB9Z3Q8PgSs6c$sBM5qsR za>Gxrf@+G~B@$PE3#&}kcGlhoK8a#gEH)t6`p7V$DKb>#ll}tiN(>;j5*fgEps=24QA5gVnvf+47e=wOsv@7vo9>Hp`pKWh2zoh^x zXH3MLoDsxea-Q{yQdB$+lQE1w-W*4G;Z zGL<%hDX_!lkcHO5yyzyVCOx2*avJdaN?O$o9rO{5GbxpB#FrJ}Fq$!R4(n<(nTY9= zws1JEoK)kJm4M>#{lE@!tXs%xN<~96J%{ONdO}SY{b+a8bSbe|1l}^9$!iJ2bTE@d zrjEAAo+z`$AoZf6!D7YiU;&<{$aQyh*4=c~-IUpTV#z%?)tM67!#`!2LI8SJQq2ejEJh7nW1 zy`yNF=+2@^8k_XQw5ai-ZqVI9aI7y;CzI<-CRB50)ChyhqD~Zm8d6XJ_u^lS( zC_HzZyT$6c*fMJ`_2t`-j66R-k8M@2OF#O#Rb@yjGWt=&Rb?v~uWT}w^$u6>^AUz721&C&;C!4_&;vY&p!EYuS7qTBmvj!Xk>~&L8PiwjnNsgPG zGHFd4BjJMpb4g7^lQ9TzognK{6MCAe5k-$`nrYXQ)QF6$XLOTu^o2xb5-^l$ofwa4 z)D)877EJ3Ysu`*&ysU%qTVv^olpf+~Bj`sFTBbOrBw}$@$0=;zp0^~^$vQ=_Pn_4g)uV1(%estpF*i!Y;&treDmY1Y~yF7911X1|EY7hOOBQF26=Wv_#lQw6& zV872E1b7E3a<_v_iHF-m3#1$&Yrz)KS5O!9V3?=dkXjG&TR=#!V_&%owh_FARiCHVn{eVpUm*>F^%?z^Wo!iZ+nq2DyIp4690it)Lme0H%z~F|Tia{I1 zL(EJmdYngtp}B*m6OsGkfAgVuGOEP&C!5QYzMU{+395Bg$1MU?W=F>|wx0{^@j<9! zta%5gVZeeOvgCpK5XW5Z%Ap&U5)@A$6d%1`Kbr8WQW~q}{~zneOyW6CCIH{HgHOp= zR+MWda^*3u2tFSN!rp05M?2yWK=7C@Gv3s#Zc5|ISM}0PPRa=6d1v7%%Tb< z#Xq_xvb9VLp`b+=v=9b`espE4iYyl8i)%XyyBr`#;S98156~I*p`f#{N@wd&(Aieh z0ai{d(^(u8*VI`m%CDjGIz8^wS(NSft+Qn>?NftcGU9 zRZ|*M6-*s;)2B^z+qa(_QJ0fer+MADd4e#K8JdUsAXc5 zcHo0F0%}B*c-)jmRU?9~&E$b%X#zcpBDR8)rM>VJa?q!s$&X|!Xhi#v^$ZG&Of}sU zP$8HF)H*$YZ+JLQ=$M^t59Z6%6w6Lc?a3-p+64& zeki-Y3;r|vyFTywTkl_cv%SNMy~EjF75+25>YVMO>w+umYgqI(WPPDUUnuKqS@g9m z`SxbUlhf8^kpvpwKlI+AY~7y4x;G;OqHpl0@ zm9wX2PR*rfU;W-n+;!6(vt2V?bI<&$@0!2u7K$xPIF?#fJZHcua!?VjnL8~oB0xK`5&qgAlcyf9jY zZ`%63gV?KbF7MaAjhULgIiG*N>itdcZOZwo=T2RG?ZRtW|IS7KPUKhTe4FOGbH0X4 zojKpuOZ%6-lH2nytJChd_uXcg@4fFN(DdGS4*0@7{Ylu2Hx64m4|1P~hrQ5TsSb38 zge!Y@0R3rAZD+Ia>0{MUf96EaXRWoJZNguC)mU#kya8B!As0LA&|yT3$WmzC6w?XB z5w!u6V%Ragg*YE6L+!d&}4;Yza& z==;(g^lA`0*1tacfM@CUqa`=&SotgrtwSmWOM%BqxS)b+zKJ#YgazJ}3Mi4!ttk+z z^XgJwiK)mM>55F)8(yck+6w