Compare commits

..

No commits in common. "49f0871720dbecdb7f42964892c21e9cdf0ba115" and "6ee10bf3011285171bbae5cea07807894a010901" have entirely different histories.

8 changed files with 0 additions and 247 deletions

View File

@ -1,58 +0,0 @@
# TIP Crawlee Runtime
## Decision
TIP standardizes on Crawlee as the crawler runtime.
- Production TypeScript path: `packages/scraper` with `apify/crawlee` and Playwright.
- Optional Python worker path: `packages/crawlee-python` with `apify/crawlee-python`.
## TypeScript Core
The TypeScript scraper remains the canonical production path because TIP already
uses it for DB writes, price observations, stock observations, image verification
and detail verification.
Useful FS.com commands:
```bash
pnpm -C packages/scraper run scrape:fs:db-detail
pnpm -C packages/scraper run scrape:fs:url-discovery
```
Erik safety defaults:
- keep FS.com at browser concurrency `1`
- use bounded run caps
- treat no-text and max-retry URLs as retry/classification classes
- keep Crawlee storage isolated with `makeCrawleeConfig(...)`
## Python Worker
The Python worker is optional. Run it on Pi/Proxmox/residential nodes first,
not as an Erik-heavy crawler. It writes JSONL evidence and does not write
directly to the TIP DB.
Install:
```bash
cd packages/crawlee-python
python3 -m venv .venv
. .venv/bin/activate
python -m pip install -U pip
python -m pip install -e ".[beautifulsoup]"
```
Smoke:
```bash
python -m tip_crawlee_worker \
--mode beautifulsoup \
--url https://crawlee.dev \
--out /tmp/tip-crawlee-python-smoke.jsonl \
--max-requests 1
```
## Training Pool
Every crawler result, failure class, parser lesson and runtime safety lesson
should be written to the TIPLLM training pool and synced through `sync/`.

View File

@ -1,4 +0,0 @@
*.egg-info/
__pycache__/
*.py[cod]
.venv/

View File

@ -1,42 +0,0 @@
# TIP Crawlee Python Worker
Optional Python crawler worker for Pi/Proxmox/residential nodes.
The TypeScript scraper package remains the production crawler core. This package
exists for isolated worker experiments where Python extraction libraries are a
better fit. It writes JSONL artifacts; it does not write directly to TIP
PostgreSQL.
## Install
```bash
cd packages/crawlee-python
python3 -m venv .venv
. .venv/bin/activate
python -m pip install -U pip
python -m pip install -e ".[beautifulsoup]"
```
For browser-based Python workers:
```bash
python -m pip install -e ".[playwright]"
python -m playwright install chromium
```
## Smoke Run
```bash
python -m tip_crawlee_worker \
--mode beautifulsoup \
--url https://crawlee.dev \
--out /tmp/tip-crawlee-python-smoke.jsonl \
--max-requests 1
```
## TIP Policy
- Use this on Pi/Proxmox/residential nodes first, not as an Erik-heavy crawler.
- Keep output as JSONL evidence until a deterministic importer validates it.
- Record useful crawler outcomes in the TIPLLM training pool.
- Use TIPLLM only for planning/extraction feedback; no external AI.

View File

@ -1,27 +0,0 @@
[project]
name = "tip-crawlee-python-worker"
version = "0.1.0"
description = "Optional Crawlee Python worker for TIP crawler nodes"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"crawlee>=1.0.0",
]
[project.optional-dependencies]
beautifulsoup = [
"crawlee[beautifulsoup]>=1.0.0",
]
playwright = [
"crawlee[playwright]>=1.0.0",
"playwright>=1.50.0",
]
all = [
"crawlee[all]>=1.0.0",
]
[project.scripts]
tip-crawlee-worker = "tip_crawlee_worker.__main__:main"
[tool.ruff]
line-length = 100

View File

@ -1,5 +0,0 @@
"""TIP optional Crawlee Python worker."""
# Public surface of the package: only the version string is exported.
__all__ = ["__version__"]
# Same value as the `version` field in this package's pyproject.toml.
__version__ = "0.1.0"

View File

@ -1,85 +0,0 @@
from __future__ import annotations
import argparse
import asyncio
import json
from datetime import timedelta
from pathlib import Path
from typing import Any
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="TIP optional Crawlee Python worker")
parser.add_argument("--mode", choices=["beautifulsoup"], default="beautifulsoup")
parser.add_argument("--url", action="append", required=True, help="URL to crawl. Repeatable.")
parser.add_argument("--out", required=True, help="JSONL output path.")
parser.add_argument("--max-requests", type=int, default=10)
parser.add_argument("--timeout-seconds", type=int, default=30)
parser.add_argument("--label", default="tip-crawlee-python")
return parser.parse_args()
def write_jsonl(path: Path, record: dict[str, Any]) -> None:
    """Append ``record`` to ``path`` as a single JSON line.

    Missing parent directories are created first. The record is serialized with
    sorted keys and without ASCII escaping, followed by a newline.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    with open(path, mode="a", encoding="utf-8") as sink:
        serialized = json.dumps(record, sort_keys=True, ensure_ascii=False)
        sink.write(f"{serialized}\n")
async def run_beautifulsoup(args: argparse.Namespace) -> None:
    """Crawl ``args.url`` with a ``BeautifulSoupCrawler`` and write one JSONL record per page.

    Any existing file at ``args.out`` is removed first, so the output only contains
    records from this run. Each handled page yields a ``tip_crawlee_python_page``
    record with its URL, title, up to 25 headings (h1-h3), up to 25 image ``src``
    values and up to 100 link ``href`` values.

    Args:
        args: Parsed CLI options; reads ``out``, ``url``, ``label``, ``max_requests``
            and ``timeout_seconds``.
    """
    out = Path(args.out)
    # Delete in one step rather than exists()-then-unlink(), which can raise if the
    # file disappears between the two calls.
    out.unlink(missing_ok=True)

    crawler = BeautifulSoupCrawler(
        max_request_retries=1,
        request_handler_timeout=timedelta(seconds=args.timeout_seconds),
        max_requests_per_crawl=args.max_requests,
    )

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        soup = context.soup
        title_tag = soup.title
        # A <title> with no single string child has ``.string`` of None; record None then.
        title = title_tag.string.strip() if title_tag and title_tag.string else None
        headings = [
            heading.get_text(" ", strip=True)
            for heading in soup.find_all(["h1", "h2", "h3"])
        ][:25]
        # Keep only non-empty attribute values; the walrus avoids a second lookup.
        images = [src for img in soup.find_all("img") if (src := img.get("src"))][:25]
        links = [href for link in soup.find_all("a") if (href := link.get("href"))][:100]
        write_jsonl(
            out,
            {
                "kind": "tip_crawlee_python_page",
                "label": args.label,
                "url": context.request.url,
                "title": title,
                "headings": headings,
                "image_candidates": images,
                "link_candidates": links,
            },
        )

    await crawler.run(args.url)
async def async_main() -> None:
    """Parse the CLI arguments and run the crawler selected by ``--mode``."""
    cli_args = parse_args()
    if cli_args.mode != "beautifulsoup":
        # No runner exists for any other mode; nothing to do.
        return
    await run_beautifulsoup(cli_args)
def main() -> None:
    """Synchronous entry point: run ``async_main`` to completion via ``asyncio.run``."""
    entry_coroutine = async_main()
    asyncio.run(entry_coroutine)
# Run the CLI only when this module is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

View File

@ -8,8 +8,6 @@
"build": "tsc", "build": "tsc",
"dev": "tsx src/index.ts", "dev": "tsx src/index.ts",
"scrape:fs": "tsx src/scrapers/fs-com.ts", "scrape:fs": "tsx src/scrapers/fs-com.ts",
"scrape:fs:db-detail": "FS_DB_DETAIL_ONLY=1 TIP_FORCE_REVALIDATE=1 tsx src/scrapers/fs-com.ts",
"scrape:fs:url-discovery": "FS_URL_DISCOVERY_ONLY=1 TIP_FORCE_REVALIDATE=1 tsx src/scrapers/fs-com.ts",
"scrape:cisco": "tsx src/scrapers/cisco-tmg.ts", "scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
"scrape:optcore": "tsx src/scrapers/optcore.ts", "scrape:optcore": "tsx src/scrapers/optcore.ts",
"scrape:news": "tsx src/scrapers/news.ts", "scrape:news": "tsx src/scrapers/news.ts",

View File

@ -1,24 +0,0 @@
#!/usr/bin/env bash
# Create a virtualenv, install the TIP Crawlee Python worker into it in editable
# mode, then print the venv location and a smoke-test command.
#
# Environment overrides:
#   TIP_CRAWLEE_PY_VENV   venv directory (default: /opt/tip-crawlee-python-venv)
#   TIP_CRAWLEE_PY_EXTRA  optional-dependency extra to install (default: beautifulsoup)
set -euo pipefail

# The repository root is the parent of the directory holding this script.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
repo_root="$(cd "$script_dir/.." && pwd)"

venv_dir="${TIP_CRAWLEE_PY_VENV:-/opt/tip-crawlee-python-venv}"
extra="${TIP_CRAWLEE_PY_EXTRA:-beautifulsoup}"
venv_python="$venv_dir/bin/python"
worker_pkg="$repo_root/packages/crawlee-python"

python3 -m venv "$venv_dir"
"$venv_python" -m pip install -U pip
"$venv_python" -m pip install -e "${worker_pkg}[${extra}]"

# Unquoted heredoc: variables expand, and each "\\" prints a single backslash.
cat <<EOF
TIP Crawlee Python worker installed.
Venv:
$venv_dir
Smoke test:
$venv_python -m tip_crawlee_worker \\
--mode beautifulsoup \\
--url https://crawlee.dev \\
--out /tmp/tip-crawlee-python-smoke.jsonl \\
--max-requests 1
EOF