diff --git a/packages/scraper/package.json b/packages/scraper/package.json
index 883f9d2..9c941f2 100644
--- a/packages/scraper/package.json
+++ b/packages/scraper/package.json
@@ -20,7 +20,8 @@
     "pg-boss": "^10.1.5",
     "dotenv": "^16.4.7",
     "cheerio": "^1.0.0",
-    "xml2js": "^0.6.2"
+    "xml2js": "^0.6.2",
+    "socks-proxy-agent": "^8.0.5"
   },
   "devDependencies": {
     "@types/pg": "^8.11.11",
diff --git a/packages/scraper/src/scrapers/atgbics.ts b/packages/scraper/src/scrapers/atgbics.ts
index 282e1a8..c0d357d 100644
--- a/packages/scraper/src/scrapers/atgbics.ts
+++ b/packages/scraper/src/scrapers/atgbics.ts
@@ -15,12 +15,23 @@
  *
  * Respects: robots.txt, rate limiting (2s between requests, max 50 pages)
  */
-import { PlaywrightCrawler } from "crawlee";
+import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";
 import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
 import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash";
 
 const BASE_URL = "https://www.atgbics.com";
 
+// SOCKS5 proxy pool — rotate across CT130/131/132 to avoid IP blocks
+const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
+  .split(",")
+  .map((u) => u.trim())
+  .filter(Boolean);
+
+function buildProxyConfiguration(): ProxyConfiguration | undefined {
+  if (PROXY_URLS.length === 0) return undefined;
+  return new ProxyConfiguration({ proxyUrls: PROXY_URLS });
+}
+
 const CATEGORY_URLS = [
   "/collections/sfp-transceivers/",
   "/collections/sfp-plus-transceivers/",
@@ -150,12 +161,14 @@ export async function scrapeAtgbics(): Promise {
   const products: AtgbicsProduct[] = [];
   let pagesScraped = 0;
 
+  const proxyConfiguration = buildProxyConfiguration();
   const crawler = new PlaywrightCrawler({
     maxConcurrency: 1,
     maxRequestsPerMinute: 20, // ~2s between requests at concurrency 1
     maxRequestsPerCrawl: MAX_PAGES,
     requestHandlerTimeoutSecs: 60,
     headless: true,
+    ...(proxyConfiguration ? { proxyConfiguration } : {}),
     launchContext: {
       launchOptions: {
         args: ["--disable-blink-features=AutomationControlled", "--no-sandbox"],
diff --git a/packages/scraper/src/scrapers/fs-com.ts b/packages/scraper/src/scrapers/fs-com.ts
index 9f48088..e31fd6c 100644
--- a/packages/scraper/src/scrapers/fs-com.ts
+++ b/packages/scraper/src/scrapers/fs-com.ts
@@ -6,7 +6,7 @@
  *
  * Respects: robots.txt, rate limiting (2s between requests)
  */
-import { PlaywrightCrawler } from "crawlee";
+import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";
 import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
 import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash";
 import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";
@@ -15,6 +15,17 @@ import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";
 // EUR is the primary price; USD is derived (EUR → USD), never the reverse.
 const BASE_URL = "https://de.fs.com";
 
+// SOCKS5 proxy pool — rotate across CT130/131/132 to avoid IP blocks
+const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
+  .split(",")
+  .map((u) => u.trim())
+  .filter(Boolean);
+
+function buildProxyConfiguration(): ProxyConfiguration | undefined {
+  if (PROXY_URLS.length === 0) return undefined;
+  return new ProxyConfiguration({ proxyUrls: PROXY_URLS });
+}
+
 const CATEGORY_URLS = [
   "/c/1g-sfp-81",
   "/c/10g-sfp-63",
@@ -99,11 +110,13 @@ export async function scrapeFs(): Promise {
   const products: FsProduct[] = [];
   let pagesScraped = 0;
 
+  const proxyConfiguration = buildProxyConfiguration();
   const crawler = new PlaywrightCrawler({
     maxConcurrency: 1,
     maxRequestsPerMinute: 15,
     requestHandlerTimeoutSecs: 60,
     headless: true,
+    ...(proxyConfiguration ? { proxyConfiguration } : {}),
     launchContext: {
       launchOptions: {
         args: ["--disable-blink-features=AutomationControlled", "--lang=de-DE"],
@@ -310,6 +323,7 @@ export async function scrapeFs(): Promise {
     maxRequestsPerMinute: 10,
     requestHandlerTimeoutSecs: 45,
     headless: true,
+    ...(proxyConfiguration ? { proxyConfiguration } : {}),
     launchContext: {
       launchOptions: {
         args: ["--disable-blink-features=AutomationControlled", "--lang=en-US"],
diff --git a/packages/scraper/src/scrapers/gbics.ts b/packages/scraper/src/scrapers/gbics.ts
--- a/packages/scraper/src/scrapers/gbics.ts
+++ b/packages/scraper/src/scrapers/gbics.ts
@@ -7,6 +7,8 @@
  */
 import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
 import { contentHash } from "../utils/hash";
+import { get as httpsGet } from "node:https";
+import { SocksProxyAgent } from "socks-proxy-agent";
 
 const BASE = "https://www.gbics.com";
 const HEADERS = {
@@ -15,6 +17,66 @@ const HEADERS = {
   "Accept-Language": "en-US,en;q=0.9",
 };
 
+// SOCKS5 proxy pool — rotate across CT130/131/132 to avoid IP blocks
+const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
+  .split(",")
+  .map((u) => u.trim())
+  .filter(Boolean);
+
+let proxyIndex = 0;
+
+/**
+ * Returns a SOCKS agent for the next proxy in round-robin order, or
+ * `undefined` when PROXY_URLS is empty (direct connection).
+ */
+function getNextProxyAgent(): SocksProxyAgent | undefined {
+  if (PROXY_URLS.length === 0) return undefined;
+  const url = PROXY_URLS[proxyIndex % PROXY_URLS.length];
+  proxyIndex++;
+  return url === undefined ? undefined : new SocksProxyAgent(url);
+}
+
+/**
+ * GETs `url` through `agent` and resolves with the body as UTF-8 text.
+ *
+ * Node's built-in fetch() cannot use a SocksProxyAgent: its `dispatcher`
+ * option expects an undici Dispatcher, not an http.Agent. Proxied requests
+ * therefore go through node:https, which accepts the agent directly.
+ * Rejects with `HTTP <status> for <url>` on a non-2xx final response.
+ */
+function fetchViaProxy(url: string, agent: SocksProxyAgent, redirectsLeft = 5): Promise<string> {
+  return new Promise<string>((resolve, reject) => {
+    const req = httpsGet(
+      url,
+      {
+        agent,
+        // node:https does not decompress responses, so request an unencoded body.
+        headers: { ...HEADERS, "Accept-Encoding": "identity" },
+        signal: AbortSignal.timeout(30000),
+      },
+      (res) => {
+        const status = res.statusCode ?? 0;
+        const location = res.headers.location;
+        if (status >= 300 && status < 400 && location && redirectsLeft > 0) {
+          res.resume(); // drain the redirect body so the socket is released
+          resolve(fetchViaProxy(new URL(location, url).toString(), agent, redirectsLeft - 1));
+          return;
+        }
+        if (status < 200 || status >= 300) {
+          res.resume();
+          reject(new Error(`HTTP ${status} for ${url}`));
+          return;
+        }
+        const chunks: Buffer[] = [];
+        res.on("data", (chunk: Buffer) => chunks.push(chunk));
+        res.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
+        res.on("error", reject);
+      },
+    );
+    req.on("error", reject);
+  });
+}
+
 const CATEGORIES = [
   { path: "/800g-osfp/", formFactor: "OSFP", speed: "800G", speedGbps: 800 },
   { path: "/400g-qsfp112/", formFactor: "QSFP112", speed: "400G", speedGbps: 400 },
@@ -194,7 +256,11 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product
 }
 
 async function fetchPage(url: string): Promise<string> {
+  // fetch() would reject a SocksProxyAgent passed as `dispatcher`, so proxied
+  // requests take the node:https path instead.
+  const agent = getNextProxyAgent();
+  if (agent) return fetchViaProxy(url, agent);
   const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(30000) });
   if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`);
   return resp.text();
 }