diff --git a/packages/crawlee-python/.gitignore b/packages/crawlee-python/.gitignore new file mode 100644 index 0000000..5cf763a --- /dev/null +++ b/packages/crawlee-python/.gitignore @@ -0,0 +1,4 @@ +*.egg-info/ +__pycache__/ +*.py[cod] +.venv/ diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/PKG-INFO b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/PKG-INFO deleted file mode 100644 index 43c3ec9..0000000 --- a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/PKG-INFO +++ /dev/null @@ -1,57 +0,0 @@ -Metadata-Version: 2.4 -Name: tip-crawlee-python-worker -Version: 0.1.0 -Summary: Optional Crawlee Python worker for TIP crawler nodes -Requires-Python: >=3.11 -Description-Content-Type: text/markdown -Requires-Dist: crawlee>=1.0.0 -Provides-Extra: beautifulsoup -Requires-Dist: crawlee[beautifulsoup]>=1.0.0; extra == "beautifulsoup" -Provides-Extra: playwright -Requires-Dist: crawlee[playwright]>=1.0.0; extra == "playwright" -Requires-Dist: playwright>=1.50.0; extra == "playwright" -Provides-Extra: all -Requires-Dist: crawlee[all]>=1.0.0; extra == "all" - -# TIP Crawlee Python Worker - -Optional Python crawler worker for Pi/Proxmox/residential nodes. - -The TypeScript scraper package remains the production crawler core. This package -exists for isolated worker experiments where Python extraction libraries are a -better fit. It writes JSONL artifacts; it does not write directly to TIP -PostgreSQL. - -## Install - -```bash -cd packages/crawlee-python -python3 -m venv .venv -. .venv/bin/activate -python -m pip install -U pip -python -m pip install -e ".[beautifulsoup]" -``` - -For browser-based Python workers: - -```bash -python -m pip install -e ".[playwright]" -python -m playwright install chromium -``` - -## Smoke Run - -```bash -python -m tip_crawlee_worker \ - --mode beautifulsoup \ - --url https://crawlee.dev \ - --out /tmp/tip-crawlee-python-smoke.jsonl \ - --max-requests 1 -``` - -## TIP Policy - -- Use this on Pi/Proxmox/residential nodes first, not as an Erik-heavy crawler. -- Keep output as JSONL evidence until a deterministic importer validates it. -- Record useful crawler outcomes in the TIPLLM training pool. -- Use TIPLLM only for planning/extraction feedback; no external AI. diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/SOURCES.txt b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/SOURCES.txt deleted file mode 100644 index f3312c5..0000000 --- a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/SOURCES.txt +++ /dev/null @@ -1,10 +0,0 @@ -README.md -pyproject.toml -tip_crawlee_python_worker.egg-info/PKG-INFO -tip_crawlee_python_worker.egg-info/SOURCES.txt -tip_crawlee_python_worker.egg-info/dependency_links.txt -tip_crawlee_python_worker.egg-info/entry_points.txt -tip_crawlee_python_worker.egg-info/requires.txt -tip_crawlee_python_worker.egg-info/top_level.txt -tip_crawlee_worker/__init__.py -tip_crawlee_worker/__main__.py \ No newline at end of file diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/dependency_links.txt b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/dependency_links.txt deleted file mode 100644 index 8b13789..0000000 --- a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/entry_points.txt b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/entry_points.txt deleted file mode 100644 index 3cabb91..0000000 --- a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/entry_points.txt +++ /dev/null @@ -1,2 +0,0 @@ -[console_scripts] -tip-crawlee-worker = tip_crawlee_worker.__main__:main diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/requires.txt b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/requires.txt deleted file mode 100644 index f13c8b8..0000000 --- a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/requires.txt +++ /dev/null @@ -1,11 +0,0 @@ -crawlee>=1.0.0 - -[all] -crawlee[all]>=1.0.0 - -[beautifulsoup] -crawlee[beautifulsoup]>=1.0.0 - -[playwright] -crawlee[playwright]>=1.0.0 -playwright>=1.50.0 diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/top_level.txt b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/top_level.txt deleted file mode 100644 index 1dabd13..0000000 --- a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -tip_crawlee_worker diff --git a/packages/crawlee-python/tip_crawlee_worker/__pycache__/__init__.cpython-314.pyc b/packages/crawlee-python/tip_crawlee_worker/__pycache__/__init__.cpython-314.pyc deleted file mode 100644 index 6e2015f..0000000 Binary files a/packages/crawlee-python/tip_crawlee_worker/__pycache__/__init__.cpython-314.pyc and /dev/null differ diff --git a/packages/crawlee-python/tip_crawlee_worker/__pycache__/__main__.cpython-314.pyc b/packages/crawlee-python/tip_crawlee_worker/__pycache__/__main__.cpython-314.pyc deleted file mode 100644 index 41be146..0000000 Binary files a/packages/crawlee-python/tip_crawlee_worker/__pycache__/__main__.cpython-314.pyc and /dev/null differ