From 60531b62509204a5c12f2afb37bfe6741bdc8a52 Mon Sep 17 00:00:00 2001
From: Rene Fichtmueller <renefichtmueller@MacStudio-von-Rene-8.local>
Date: Sat, 9 May 2026 14:06:34 +0200
Subject: [PATCH] feat: add crawlee python worker integration

---
 docs/TIP_CRAWLEE_RUNTIME.md                   |  58 ++++++++++++
 packages/crawlee-python/README.md             |  42 +++++++++
 packages/crawlee-python/pyproject.toml        |  27 ++++++
 .../PKG-INFO                                  |  57 ++++++++++++
 .../SOURCES.txt                               |  10 +++
 .../dependency_links.txt                      |   1 +
 .../entry_points.txt                          |   2 +
 .../requires.txt                              |  11 +++
 .../top_level.txt                             |   1 +
 .../tip_crawlee_worker/__init__.py            |   5 ++
 .../tip_crawlee_worker/__main__.py            |  85 ++++++++++++++++++
 .../__pycache__/__init__.cpython-314.pyc      | Bin 0 -> 331 bytes
 .../__pycache__/__main__.cpython-314.pyc      | Bin 0 -> 6236 bytes
 packages/scraper/package.json                 |   2 +
 scripts/setup-crawlee-python-worker.sh        |  24 +++++
 15 files changed, 325 insertions(+)
 create mode 100644 docs/TIP_CRAWLEE_RUNTIME.md
 create mode 100644 packages/crawlee-python/README.md
 create mode 100644 packages/crawlee-python/pyproject.toml
 create mode 100644 packages/crawlee-python/tip_crawlee_python_worker.egg-info/PKG-INFO
 create mode 100644 packages/crawlee-python/tip_crawlee_python_worker.egg-info/SOURCES.txt
 create mode 100644 packages/crawlee-python/tip_crawlee_python_worker.egg-info/dependency_links.txt
 create mode 100644 packages/crawlee-python/tip_crawlee_python_worker.egg-info/entry_points.txt
 create mode 100644 packages/crawlee-python/tip_crawlee_python_worker.egg-info/requires.txt
 create mode 100644 packages/crawlee-python/tip_crawlee_python_worker.egg-info/top_level.txt
 create mode 100644 packages/crawlee-python/tip_crawlee_worker/__init__.py
 create mode 100644 packages/crawlee-python/tip_crawlee_worker/__main__.py
 create mode 100644 packages/crawlee-python/tip_crawlee_worker/__pycache__/__init__.cpython-314.pyc
 create mode 100644 packages/crawlee-python/tip_crawlee_worker/__pycache__/__main__.cpython-314.pyc
 create mode 100755 scripts/setup-crawlee-python-worker.sh

diff --git a/docs/TIP_CRAWLEE_RUNTIME.md b/docs/TIP_CRAWLEE_RUNTIME.md
new file mode 100644
index 0000000..72dc9bb
--- /dev/null
+++ b/docs/TIP_CRAWLEE_RUNTIME.md
@@ -0,0 +1,58 @@
+# TIP Crawlee Runtime
+
+## Decision
+
+TIP standardizes on Crawlee as the crawler runtime.
+
+- Production TypeScript path: `packages/scraper` with `apify/crawlee` and Playwright.
+- Optional Python worker path: `packages/crawlee-python` with `apify/crawlee-python`.
+
+## TypeScript Core
+
+The TypeScript scraper remains the canonical production path because TIP already
+uses it for DB writes, price observations, stock observations, image verification
+and detail verification.
+
+Useful FS.com commands:
+
+```bash
+pnpm -C packages/scraper run scrape:fs:db-detail
+pnpm -C packages/scraper run scrape:fs:url-discovery
+```
+
+Erik safety defaults:
+
+- keep FS.com at browser concurrency `1`
+- use bounded run caps
+- treat no-text and max-retry URLs as retry/classification classes
+- keep Crawlee storage isolated with `makeCrawleeConfig(...)`
+
+## Python Worker
+
+The Python worker is optional and should run first on Pi/Proxmox/residential
+nodes. It writes JSONL evidence and does not write directly into the TIP DB.
+
+Install:
+
+```bash
+cd packages/crawlee-python
+python3 -m venv .venv
+. .venv/bin/activate
+python -m pip install -U pip
+python -m pip install -e ".[beautifulsoup]"
+```
+
+Smoke:
+
+```bash
+python -m tip_crawlee_worker \
+  --mode beautifulsoup \
+  --url https://crawlee.dev \
+  --out /tmp/tip-crawlee-python-smoke.jsonl \
+  --max-requests 1
+```
+
+## Training Pool
+
+Every crawler result, failure class, parser lesson and runtime safety lesson
+should be written to the TIPLLM training pool and synced through `sync/`.
diff --git a/packages/crawlee-python/README.md b/packages/crawlee-python/README.md
new file mode 100644
index 0000000..f114a7f
--- /dev/null
+++ b/packages/crawlee-python/README.md
@@ -0,0 +1,42 @@
+# TIP Crawlee Python Worker
+
+Optional Python crawler worker for Pi/Proxmox/residential nodes.
+
+The TypeScript scraper package remains the production crawler core. This package
+exists for isolated worker experiments where Python extraction libraries are a
+better fit. It writes JSONL artifacts; it does not write directly to TIP
+PostgreSQL.
+
+## Install
+
+```bash
+cd packages/crawlee-python
+python3 -m venv .venv
+. .venv/bin/activate
+python -m pip install -U pip
+python -m pip install -e ".[beautifulsoup]"
+```
+
+For browser-based Python workers:
+
+```bash
+python -m pip install -e ".[playwright]"
+python -m playwright install chromium
+```
+
+## Smoke Run
+
+```bash
+python -m tip_crawlee_worker \
+  --mode beautifulsoup \
+  --url https://crawlee.dev \
+  --out /tmp/tip-crawlee-python-smoke.jsonl \
+  --max-requests 1
+```
+
+## TIP Policy
+
+- Run this on Pi/Proxmox/residential nodes first; do not use it for heavy crawling on Erik.
+- Keep output as JSONL evidence until a deterministic importer validates it.
+- Record useful crawler outcomes in the TIPLLM training pool.
+- Use TIPLLM only for planning/extraction feedback; no external AI.
diff --git a/packages/crawlee-python/pyproject.toml b/packages/crawlee-python/pyproject.toml
new file mode 100644
index 0000000..4ee9c69
--- /dev/null
+++ b/packages/crawlee-python/pyproject.toml
@@ -0,0 +1,38 @@
+[build-system]
+requires = ["setuptools>=64"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "tip-crawlee-python-worker"
+version = "0.1.0"
+description = "Optional Crawlee Python worker for TIP crawler nodes"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+  "crawlee>=1.0.0",
+]
+
+# The CLI currently implements only `--mode beautifulsoup`; install with the
+# `beautifulsoup` extra, as the README does.
+[project.optional-dependencies]
+beautifulsoup = [
+  "crawlee[beautifulsoup]>=1.0.0",
+]
+playwright = [
+  "crawlee[playwright]>=1.0.0",
+  "playwright>=1.50.0",
+]
+all = [
+  "crawlee[all]>=1.0.0",
+]
+
+[project.scripts]
+tip-crawlee-worker = "tip_crawlee_worker.__main__:main"
+
+# Name the package explicitly instead of relying on setuptools' flat-layout
+# auto-discovery, which refuses to build if it finds a second top-level package.
+[tool.setuptools]
+packages = ["tip_crawlee_worker"]
+
+[tool.ruff]
+line-length = 100
diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/PKG-INFO b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/PKG-INFO
new file mode 100644
index 0000000..43c3ec9
--- /dev/null
+++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/PKG-INFO
@@ -0,0 +1,57 @@
+Metadata-Version: 2.4
+Name: tip-crawlee-python-worker
+Version: 0.1.0
+Summary: Optional Crawlee Python worker for TIP crawler nodes
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: crawlee>=1.0.0
+Provides-Extra: beautifulsoup
+Requires-Dist: crawlee[beautifulsoup]>=1.0.0; extra == "beautifulsoup"
+Provides-Extra: playwright
+Requires-Dist: crawlee[playwright]>=1.0.0; extra == "playwright"
+Requires-Dist: playwright>=1.50.0; extra == "playwright"
+Provides-Extra: all
+Requires-Dist: crawlee[all]>=1.0.0; extra == "all"
+
+# TIP Crawlee Python Worker
+
+Optional Python crawler worker for Pi/Proxmox/residential nodes.
+
+The TypeScript scraper package remains the production crawler core. This package
+exists for isolated worker experiments where Python extraction libraries are a
+better fit. It writes JSONL artifacts; it does not write directly to TIP
+PostgreSQL.
+
+## Install
+
+```bash
+cd packages/crawlee-python
+python3 -m venv .venv
+. .venv/bin/activate
+python -m pip install -U pip
+python -m pip install -e ".[beautifulsoup]"
+```
+
+For browser-based Python workers:
+
+```bash
+python -m pip install -e ".[playwright]"
+python -m playwright install chromium
+```
+
+## Smoke Run
+
+```bash
+python -m tip_crawlee_worker \
+  --mode beautifulsoup \
+  --url https://crawlee.dev \
+  --out /tmp/tip-crawlee-python-smoke.jsonl \
+  --max-requests 1
+```
+
+## TIP Policy
+
+- Use this on Pi/Proxmox/residential nodes first, not as an Erik-heavy crawler.
+- Keep output as JSONL evidence until a deterministic importer validates it.
+- Record useful crawler outcomes in the TIPLLM training pool.
+- Use TIPLLM only for planning/extraction feedback; no external AI.
diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/SOURCES.txt b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/SOURCES.txt
new file mode 100644
index 0000000..f3312c5
--- /dev/null
+++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/SOURCES.txt
@@ -0,0 +1,10 @@
+README.md
+pyproject.toml
+tip_crawlee_python_worker.egg-info/PKG-INFO
+tip_crawlee_python_worker.egg-info/SOURCES.txt
+tip_crawlee_python_worker.egg-info/dependency_links.txt
+tip_crawlee_python_worker.egg-info/entry_points.txt
+tip_crawlee_python_worker.egg-info/requires.txt
+tip_crawlee_python_worker.egg-info/top_level.txt
+tip_crawlee_worker/__init__.py
+tip_crawlee_worker/__main__.py
\ No newline at end of file
diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/dependency_links.txt b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/entry_points.txt b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/entry_points.txt
new file mode 100644
index 0000000..3cabb91
--- /dev/null
+++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/entry_points.txt
@@ -0,0 +1,2 @@
+[console_scripts]
+tip-crawlee-worker = tip_crawlee_worker.__main__:main
diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/requires.txt b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/requires.txt
new file mode 100644
index 0000000..f13c8b8
--- /dev/null
+++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/requires.txt
@@ -0,0 +1,11 @@
+crawlee>=1.0.0
+
+[all]
+crawlee[all]>=1.0.0
+
+[beautifulsoup]
+crawlee[beautifulsoup]>=1.0.0
+
+[playwright]
+crawlee[playwright]>=1.0.0
+playwright>=1.50.0
diff --git a/packages/crawlee-python/tip_crawlee_python_worker.egg-info/top_level.txt b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/top_level.txt
new file mode 100644
index 0000000..1dabd13
--- /dev/null
+++ b/packages/crawlee-python/tip_crawlee_python_worker.egg-info/top_level.txt
@@ -0,0 +1 @@
+tip_crawlee_worker
diff --git a/packages/crawlee-python/tip_crawlee_worker/__init__.py b/packages/crawlee-python/tip_crawlee_worker/__init__.py
new file mode 100644
index 0000000..6886fde
--- /dev/null
+++ b/packages/crawlee-python/tip_crawlee_worker/__init__.py
@@ -0,0 +1,5 @@
+"""TIP optional Crawlee Python worker."""
+
+__all__ = ["__version__"]
+
+__version__ = "0.1.0"  # duplicates `version` in pyproject.toml; keep the two in sync
diff --git a/packages/crawlee-python/tip_crawlee_worker/__main__.py b/packages/crawlee-python/tip_crawlee_worker/__main__.py
new file mode 100644
index 0000000..954fad2
--- /dev/null
+++ b/packages/crawlee-python/tip_crawlee_worker/__main__.py
@@ -0,0 +1,124 @@
+"""Command-line entry point for the optional TIP Crawlee Python worker.
+
+Crawls the given URLs with Crawlee's BeautifulSoupCrawler and appends one JSON
+record per handled page to a JSONL file.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+from datetime import timedelta
+from pathlib import Path
+from typing import Any
+
+from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+
+# Per-page caps on how many candidates of each kind are written to a record.
+MAX_HEADINGS = 25
+MAX_IMAGES = 25
+MAX_LINKS = 100
+
+
+def _positive_int(value: str) -> int:
+    """argparse ``type=`` callable: parse ``value`` as an int and require it to be >= 1."""
+    try:
+        number = int(value)
+    except ValueError:
+        raise argparse.ArgumentTypeError(f"expected an integer, got {value!r}") from None
+    if number < 1:
+        raise argparse.ArgumentTypeError(f"expected a positive integer, got {number}")
+    return number
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the worker's command-line options."""
+    parser = argparse.ArgumentParser(description="TIP optional Crawlee Python worker")
+    parser.add_argument("--mode", choices=["beautifulsoup"], default="beautifulsoup")
+    parser.add_argument("--url", action="append", required=True, help="URL to crawl. Repeatable.")
+    parser.add_argument("--out", required=True, help="JSONL output path.")
+    # Reject zero/negative limits at the CLI instead of handing them to the crawler.
+    parser.add_argument("--max-requests", type=_positive_int, default=10)
+    parser.add_argument("--timeout-seconds", type=_positive_int, default=30)
+    parser.add_argument("--label", default="tip-crawlee-python")
+    return parser.parse_args()
+
+
+def write_jsonl(path: Path, record: dict[str, Any]) -> None:
+    """Append ``record`` to ``path`` as a single JSON line.
+
+    Missing parent directories are created. Keys are sorted and non-ASCII
+    characters are written as-is in UTF-8.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("a", encoding="utf-8") as handle:
+        handle.write(json.dumps(record, ensure_ascii=False, sort_keys=True) + "\n")
+
+
+async def run_beautifulsoup(args: argparse.Namespace) -> None:
+    """Crawl ``args.url`` with ``BeautifulSoupCrawler``, writing one record per handled page.
+
+    A file already present at ``args.out`` is deleted first, so the output holds
+    only records from this run.
+    """
+    out = Path(args.out)
+    # missing_ok=True replaces a separate exists() check, which could race with deletion.
+    out.unlink(missing_ok=True)
+
+    crawler = BeautifulSoupCrawler(
+        max_request_retries=1,
+        request_handler_timeout=timedelta(seconds=args.timeout_seconds),
+        max_requests_per_crawl=args.max_requests,
+    )
+
+    @crawler.router.default_handler
+    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+        soup = context.soup
+        title_tag = soup.title
+        # No <title>, or a falsy .string, yields None.
+        title = title_tag.string.strip() if title_tag and title_tag.string else None
+        headings = [
+            heading.get_text(" ", strip=True)
+            for heading in soup.find_all(["h1", "h2", "h3"])
+        ][:MAX_HEADINGS]
+        images = [
+            img.get("src")
+            for img in soup.find_all("img")
+            if img.get("src")
+        ][:MAX_IMAGES]
+        links = [
+            link.get("href")
+            for link in soup.find_all("a")
+            if link.get("href")
+        ][:MAX_LINKS]
+        write_jsonl(
+            out,
+            {
+                "kind": "tip_crawlee_python_page",
+                "label": args.label,
+                "url": context.request.url,
+                "title": title,
+                "headings": headings,
+                "image_candidates": images,
+                "link_candidates": links,
+            },
+        )
+
+    await crawler.run(args.url)
+
+
+async def async_main() -> None:
+    """Parse CLI arguments and run the crawler for the selected mode."""
+    args = parse_args()
+    if args.mode == "beautifulsoup":
+        await run_beautifulsoup(args)
+
+
+def main() -> None:
+    """Synchronous entry point used by the ``tip-crawlee-worker`` console script."""
+    asyncio.run(async_main())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/packages/crawlee-python/tip_crawlee_worker/__pycache__/__init__.cpython-314.pyc b/packages/crawlee-python/tip_crawlee_worker/__pycache__/__init__.cpython-314.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6e2015f96e3d78b32e7c7846251c48a67c1adaaa
GIT binary patch
literal 331
zcmXv~%SyvQ6rD-c7K>dexD0j`q?x)GH*w=eO3<BfnI_kE$TTy|Of2m}N>_e~;3t_?
z!5<L%0XnhuZ0@-)&f#44PL7Gk`^ArV^7X4}{z3O-d+N!Gq{NYw#?;X?a6#JYljvaC
zf4CckN;{!SF2ls|xkL;{3pZ6|I9H}X6W3jU7c|!204xuO@l`y$k6LvbAXgbcL%5Ux
zCh$TZWZUT0uk6wKCuXq3XOT_atU@W#uv@f+Q<^1`S2>1>%8^Zkn^xn2L9HxvhL<)&
z;WY>Om}#CBe1eu`J9h*9=Z-m{VLyVMQwAVP;Q(U2s5{p)l~)pPOvl@4van4K-z`e%
UCy6%Q^Eb9?Z+d6zt~v6HU$QD=e*gdg

literal 0
HcmV?d00001

diff --git a/packages/crawlee-python/tip_crawlee_worker/__pycache__/__main__.cpython-314.pyc b/packages/crawlee-python/tip_crawlee_worker/__pycache__/__main__.cpython-314.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..41be146b197b96d07cc8b6f83176e473af971e4b
GIT binary patch
literal 6236
zcmbtYZ)_CD6`#G`+uQqpHa2JD10S}59&k3;5Kw}WU=#ldj&avuMTz6na<{fOKHptt
z_X=1jF-a6`rK(L63FlMz(#nxiDU~Xf_ESPqDgD$ladXE8q(rGy@y&!#O1`x3&HeFV
zOibFbZ|3dWH?wbMe(%rBE|1$mp!~MsKiY4A`W7FG#gZqu+kXaPiVPE(>ml8o!PO&e
z>9H6V*763=TEP&Y<$J8%q9HOFp~u!O84{zdJ@#&g;b64b<Lq`Bu5P#Ct|zk1Ph`nY
z;y%M8+ktj4+ABMOb}`x~yMgvFx<d8>?PGN1OB>{hgU=HZ1Kk7UrOK^jUqO2q*V4#-
zp~{=c3vs4XNhFenVra>P-UrWiLz_@zYTQuZLFiYEF=+Y2i8CRKS=Fv8X+s-H#|M(>
zR0mZ~#Z_ts9^`6?(T-%oP){3C4nBiFEVy_)Jl+lfF+~Q+u;t*JpeXPH<mcQNCv&o;
zg=-`s{wB(rB2|qvP5e-)(9x7ab+x%qnNanV5>-Dy<;9UColp!NKhR*l+Hyiysop}>
zggT-{$Bc=z8V3zqj;Q*$kxaF8#FcbR4R$1BYRjl*jHOS8shUdaEe2H*dQ{bZq0(^d
zWDATxu8gXBOO#Df4X4f+W64B|p`{`PR^(KYj>FKA$b_OLB9Z3Q8PgSs6c$sBM5qsR
za>Gxrf@+G~B@$PE3#&}kcGlhoK8a#gEH)t6`p7V$DKb>#ll}tiN(>;j5*fgEps=<n
zSj~QG`TIOATV%eOlX+P<03&0mf0(3I7G;|($@T-nI+6~MbIPuDM|2N~`H=`2cF3M}
z<-KL{TOL8)Cs)Xo2MTM$=4CH_Z>24QA5gVnvf+47e=wOsv@7vo9>Hp`pKWh2zoh^x
zXH3<jRAXY3VmLejfHB=Ci{RG*{9p~X=vY#Vs=8^5sUu1{ZcJLk;WUk#qLNCfiP-VU
z+7ohb&`1UuW}1Vtno<=*IT=@*Lp(@DQ4Ld~>MLoDsxea-Q{yQdB$+lQE1w-W*4G;Z
zGL<%hDX_!lkcHO5yyzyVCOx2*avJdaN?O$o9rO{5GbxpB#FrJ}Fq$!R4(n<(nTY9=
zws1JEoK)kJm4M>#{lE@!tXs%xN<~96J%{ONdO}SY{b+a8bSbe|1l}^9$!iJ2bTE@d
zrjEAAo+z`$AoZf6!D7YiU;&<{$aQyh*4=c~-IUpTV#z%?)tM6<S;4<3_~)gCr!qUA
z_&Aag{7b@#6~1jjy1XybsWAS@a(+{0e-Gn6TjFMFe)ge$xt(#JD&y|@L*r%2xa}qG
zygjq)DaLy`=d8#&1B=eU7tSpiVGFGQW$*J{h-7A)O>7!#`!2LI8SJQq2ejEJh7nW1
zy`yNF=+2@^8k_XQw5ai-ZqVI9aI7y;CzI<-CRB<m1>50)ChyhqD~Zm8d6XJ_u^lS(
zC_HzZyT$6c*fMJ`_2t`-j66R-k8<yUI<a|pn2!Ok0gTn9zu+s%`QUBP-;`V1L;MZ+
znu635T!c$o(tM;8f>M@2OF#O#Rb@yjGWt=&Rb?v~<X~NiEEau;Y};KXv#;O<A;3xa
zx3A>uWT}w^$u6>^AUz721&C&;C!4_&;vY&p!EYuS7qTBmvj!Xk>~&L8PiwjnNsgPG
zGHFd4BjJMpb4g7^lQ9TzognK{6MCAe5k-$`nrYXQ)QF6$XLOTu^o2xb5-^l$ofwa4
z)D)877EJ3Ysu`*&ysU%qTVv^olpf+~Bj`sFTBbOrBw}$@$0=;zp0^~^$vQ=_Pn_<D
zHw*_a0x?B$f^`1qxuaR3dQqsJk1ZU}2-QnMYtHVTt(~czYrEv1uFcv*OZL#hU{3PP
zx@KJO9>4g)uV1(%estpF*i!Y;&treDmY1Y~yF7911X1|EY7hOOBQF26=Wv_#lQw6&
zV872E1b7E3a<_v_iHF-m3#1$&Yrz)KS5O!9V3?=dkXjG&TR=#!V_&%owh_FARi<Iz
zGPaki*yD=5f*dP|u{Th^6ktOk3f};G4KcNV6ha2pjH*Q>CHVn<t$13DCPh5Oz)=JU
z$gY9uqHH}GD#fK?ZU|SE70>{eVpUm*>F^%?z^Wo!iZ+nq2DyIp4690i<thd|%Df-q
z{vh9Gkp)N-rMR!C9atpSU;SVY_&UatRi(H-T|~CN4EukYo8o(|4zwd{Be9WQaX$~1
zB;Lez26M3$;tvG84G^y)g?I(6K{be-{vsofdKxLjEpua;v>t)Lme0H%z~F|Tia{I1
zL(EJmdYngtp}B*m6OsGkfAgVuGOEP&C!5QYzMU{+395Bg$1MU?W=F>|wx0{^@j<9!
zta%5gVZeeOvgCpK5XW5Z%Ap&U5)@A$6d%1`Kbr8WQW~q}{~zneOyW6CCIH{HgHOp=
zR+MWda^*3u2tFS<VM$w>N!rp05M?2yWK=7C@Gv3s#Zc5|ISM}0PPRa=6d1v7%%Tb<
z#Xq_xvb9VLp`b+=v=9b`espE4iYyl8i)%XyyBr`#;S98156~I*p`f#{N@wd&(Aieh
z0ai{d(^(u8*VI`m%CDjGIz8^wS(NSft+Q<i^Fmez?6Lz75d+d%E=+dA?ACB&{Z+;O
zE3NE-<iI<~%f3OYTrp^CvF$4*DKcq<{jO(wt`#YvM2Q+<?rKAy$pw#tyRt%3rllV5
zyNQW*AEdbxqb9G@C=J0gB<NHfftNA%QW)0~G1FguD9oP|BPlpMnpT!>n>?NftcGU9
zRZ|*M6-*s;)2B^<Oe6}SUW+LPI6`k+ON^H>z+qa(_QJ0fer+MADd4e#K8JdUsAXc5
zcHo0F0%}B*c-)jmRU?9~&E$b%X#zcpBDR8)rM>VJa?q!s$&X|!Xhi#v^$ZG&Of}sU
zP$8HF)H*$YZ+JLQ=$M^t59Z6%6w6Lc?<yy5?)(vdxk?rnR(}Jc#lLStc>a3-p+64&
zeki-Y3;r|vyFTywTkl_cv%SNMy~EjF75+25>YVMO>w+umYgqI(WPPDUUnuKqS@g9m
z`SxbUlhf8^kpvpwKlI+AY~7y4x;<C-b}ZE$xozdE9N*L(nX@hP#9ftj1r}X_`GEx?
z;|hG~3SX-`atkG(uhVnK##ec#rDYeXslRyk!r82U_o9FI!ign++ij~*>G;OqHpl0@
zm9wX2PR*rfU;W-n+;!6(vt2V?bI<&$@0!2u7K$xPIF?<g^iGS*9#R{8-}Rm=TN7Ta
z2`@yKYFc5e3dc7!t#d-oyJ7b9%;|YC>#fJZHcua!?VjnL8~oB0xK`5&qgAlcyf9jY
zZ`%63gV?KbF7MaAjhULgIiG*N>itdcZOZwo=T2RG?ZRtW|IS7KPUKhTe4FOGbH0X4
zojKpuOZ%6-lH2nytJChd_uXcg@4fFN(DdGS4*0@7{Ylu2Hx64m4|1P~hrQ5TsSb38
zge!Y@0R3rAZD+Ia>0{MUf96EaXRWoJZNguC)mU#kya8B!As0LA&|yT3$WmzC6w?XB
z5w!u6V%Ragg*YE6<Oim!Jl?}e39tsIPH*989Vw&-CJ)HU!&u0J(w0BUQX0Xr@vAz1
zU?doPIuFi@2e3#r5MWv5TR_}H++BBgrVZSJx0|OrayG~L$#awE&%S;(Cpl*wGmc-n
z{^4oJ1vXy~hCXV_wstPIb}j|Ga`lbb`aO&Fd%m{`UU8}m@FKd;51$*(S{oOwjdv{&
z^jYM;%@eU{nLR#EoRzb?W_HcRE{<Io`@&J573!CS`g=M$yEofI?T>L+!d&}4;Yza&
z==;(g^lA`0*1tacfM@CUqa`=&SotgrtwSmWOM%BqxS)b+zKJ#YgazJ}3Mi4!ttk+z
z^XgJwiK)mM>55F)<s<~|02wNsAO}e?<UnT0t;#GpxU_^HWS0_i*A|Llpbw5sL1;|@
z?`PPi!pRvuRoNYc=}`1Di730JqX~G&X4(t{+jeL5+lQu(<ZRCKXU?6OJGNwNyzZ&L
zBz)o7mX)?$mA2jDiPUsW+ICOJO#Stp2e=Qp{lf3;2mZ&Mq2n;jx+fkyl9zUa<F?!)
zP))~J^WbhE+!-v|fmr|6aAnPlJ5^kH$l)PehQNk-)5exfOVUwP7#EId&Ay(uD*-P)
zDI$QWi{}uqBf)w8=<7${b-d&HQrOI-O{sWC8mclKk;q6Iu5|H&6j4O+CYZj0RV!4c
zguxJ9L*7fkS=<<lYbOiWCvd@qVGpljOkd#&tGQtRdfvIPq&|q_;VGGcBCkCvimD{w
z0vSGznOfOoq(gwAbX<LsPQnX3yXY7zmwArkZjgp=iT5kw`HDDikS#Yz%?+~gpXAsb
t5iWD@Y{W~LJFXBHyz@K<Msvpndj+0fa7|Ax@wL|k>8(yck+6w<?Y~-h=G*`P

literal 0
HcmV?d00001

diff --git a/packages/scraper/package.json b/packages/scraper/package.json
index 812525c..3c127e0 100644
--- a/packages/scraper/package.json
+++ b/packages/scraper/package.json
@@ -8,6 +8,8 @@
     "build": "tsc",
     "dev": "tsx src/index.ts",
     "scrape:fs": "tsx src/scrapers/fs-com.ts",
+    "scrape:fs:db-detail": "FS_DB_DETAIL_ONLY=1 TIP_FORCE_REVALIDATE=1 tsx src/scrapers/fs-com.ts",
+    "scrape:fs:url-discovery": "FS_URL_DISCOVERY_ONLY=1 TIP_FORCE_REVALIDATE=1 tsx src/scrapers/fs-com.ts",
     "scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
     "scrape:optcore": "tsx src/scrapers/optcore.ts",
     "scrape:news": "tsx src/scrapers/news.ts",
diff --git a/scripts/setup-crawlee-python-worker.sh b/scripts/setup-crawlee-python-worker.sh
new file mode 100755
index 0000000..b75576b
--- /dev/null
+++ b/scripts/setup-crawlee-python-worker.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+#
+# Creates a virtualenv for the TIP Crawlee Python worker and installs
+# packages/crawlee-python into it in editable mode.
+#
+# Environment overrides:
+#   TIP_CRAWLEE_PY_VENV   venv location (default: /opt/tip-crawlee-python-venv)
+#   TIP_CRAWLEE_PY_EXTRA  pip extra to install (default: beautifulsoup)
+set -euo pipefail
+
+repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+venv_dir="${TIP_CRAWLEE_PY_VENV:-/opt/tip-crawlee-python-venv}"
+extra="${TIP_CRAWLEE_PY_EXTRA:-beautifulsoup}"
+venv_python="$venv_dir/bin/python"
+
+python3 -m venv "$venv_dir"
+"$venv_python" -m pip install -U pip
+"$venv_python" -m pip install -e "$repo_root/packages/crawlee-python[$extra]"
+
+# Print the venv path and a ready-to-paste smoke-test command.
+printf '%s\n' \
+  'TIP Crawlee Python worker installed.' \
+  '' \
+  'Venv:' \
+  "  $venv_dir" \
+  '' \
+  'Smoke test:' \
+  "  $venv_python -m tip_crawlee_worker \\" \
+  '    --mode beautifulsoup \' \
+  '    --url https://crawlee.dev \' \
+  '    --out /tmp/tip-crawlee-python-smoke.jsonl \' \
+  '    --max-requests 1'