feat(scraper): add SOCKS5 proxy rotation for fs-com, atgbics, gbics scrapers

Routes requests through CT130/131/132 proxy pool (192.168.178.77/76/74:1080)
when PROXY_URLS env var is set. Uses ProxyConfiguration from crawlee for
PlaywrightCrawler scrapers and socks-proxy-agent for fetch-based scrapers.
This commit is contained in:
Rene Fichtmueller 2026-04-08 08:17:49 +02:00
parent 772ce2074d
commit 240e7f46f2
4 changed files with 50 additions and 4 deletions

View File

@ -20,7 +20,8 @@
"pg-boss": "^10.1.5", "pg-boss": "^10.1.5",
"dotenv": "^16.4.7", "dotenv": "^16.4.7",
"cheerio": "^1.0.0", "cheerio": "^1.0.0",
"xml2js": "^0.6.2" "xml2js": "^0.6.2",
"socks-proxy-agent": "^8.0.5"
}, },
"devDependencies": { "devDependencies": {
"@types/pg": "^8.11.11", "@types/pg": "^8.11.11",

View File

@ -15,12 +15,23 @@
* *
* Respects: robots.txt, rate limiting (2s between requests, max 50 pages) * Respects: robots.txt, rate limiting (2s between requests, max 50 pages)
*/ */
import { PlaywrightCrawler } from "crawlee"; import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db"; import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash"; import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash";
const BASE_URL = "https://www.atgbics.com"; const BASE_URL = "https://www.atgbics.com";
// SOCKS5 proxy pool — rotate across CT130/131/132 to avoid IP blocks.
// PROXY_URLS (env var) is a comma-separated list; entries are trimmed and
// empty entries are dropped, so an unset or blank variable yields [].
const PROXY_URLS: string[] = [];
for (const entry of (process.env["PROXY_URLS"] ?? "").split(",")) {
  const candidate = entry.trim();
  if (candidate) PROXY_URLS.push(candidate);
}

/**
 * Builds the crawlee proxy configuration for this scraper.
 *
 * @returns A `ProxyConfiguration` over every entry of `PROXY_URLS`, or
 *          `undefined` when no proxy URL is configured.
 */
function buildProxyConfiguration(): ProxyConfiguration | undefined {
  return PROXY_URLS.length > 0
    ? new ProxyConfiguration({ proxyUrls: PROXY_URLS })
    : undefined;
}
const CATEGORY_URLS = [ const CATEGORY_URLS = [
"/collections/sfp-transceivers/", "/collections/sfp-transceivers/",
"/collections/sfp-plus-transceivers/", "/collections/sfp-plus-transceivers/",
@ -150,12 +161,14 @@ export async function scrapeAtgbics(): Promise<void> {
const products: AtgbicsProduct[] = []; const products: AtgbicsProduct[] = [];
let pagesScraped = 0; let pagesScraped = 0;
const proxyConfiguration = buildProxyConfiguration();
const crawler = new PlaywrightCrawler({ const crawler = new PlaywrightCrawler({
maxConcurrency: 1, maxConcurrency: 1,
maxRequestsPerMinute: 20, // ~2s between requests at concurrency 1 maxRequestsPerMinute: 20, // ~2s between requests at concurrency 1
maxRequestsPerCrawl: MAX_PAGES, maxRequestsPerCrawl: MAX_PAGES,
requestHandlerTimeoutSecs: 60, requestHandlerTimeoutSecs: 60,
headless: true, headless: true,
...(proxyConfiguration ? { proxyConfiguration } : {}),
launchContext: { launchContext: {
launchOptions: { launchOptions: {
args: ["--disable-blink-features=AutomationControlled", "--no-sandbox"], args: ["--disable-blink-features=AutomationControlled", "--no-sandbox"],

View File

@ -6,7 +6,7 @@
* *
* Respects: robots.txt, rate limiting (2s between requests) * Respects: robots.txt, rate limiting (2s between requests)
*/ */
import { PlaywrightCrawler } from "crawlee"; import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db"; import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash"; import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash";
import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater"; import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";
@ -15,6 +15,17 @@ import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";
// EUR is the primary price; USD is derived (EUR → USD), never the reverse. // EUR is the primary price; USD is derived (EUR → USD), never the reverse.
const BASE_URL = "https://de.fs.com"; const BASE_URL = "https://de.fs.com";
// SOCKS5 proxy pool — rotate across CT130/131/132 to avoid IP blocks.
// PROXY_URLS (env var) is a comma-separated list; entries are trimmed and
// empty entries are dropped, so an unset or blank variable yields [].
const PROXY_URLS: string[] = [];
for (const entry of (process.env["PROXY_URLS"] ?? "").split(",")) {
  const candidate = entry.trim();
  if (candidate) PROXY_URLS.push(candidate);
}

/**
 * Builds the crawlee proxy configuration shared by the crawlers in this file.
 *
 * @returns A `ProxyConfiguration` over every entry of `PROXY_URLS`, or
 *          `undefined` when no proxy URL is configured.
 */
function buildProxyConfiguration(): ProxyConfiguration | undefined {
  return PROXY_URLS.length > 0
    ? new ProxyConfiguration({ proxyUrls: PROXY_URLS })
    : undefined;
}
const CATEGORY_URLS = [ const CATEGORY_URLS = [
"/c/1g-sfp-81", "/c/1g-sfp-81",
"/c/10g-sfp-63", "/c/10g-sfp-63",
@ -99,11 +110,13 @@ export async function scrapeFs(): Promise<void> {
const products: FsProduct[] = []; const products: FsProduct[] = [];
let pagesScraped = 0; let pagesScraped = 0;
const proxyConfiguration = buildProxyConfiguration();
const crawler = new PlaywrightCrawler({ const crawler = new PlaywrightCrawler({
maxConcurrency: 1, maxConcurrency: 1,
maxRequestsPerMinute: 15, maxRequestsPerMinute: 15,
requestHandlerTimeoutSecs: 60, requestHandlerTimeoutSecs: 60,
headless: true, headless: true,
...(proxyConfiguration ? { proxyConfiguration } : {}),
launchContext: { launchContext: {
launchOptions: { launchOptions: {
args: ["--disable-blink-features=AutomationControlled", "--lang=de-DE"], args: ["--disable-blink-features=AutomationControlled", "--lang=de-DE"],
@ -310,6 +323,7 @@ export async function scrapeFs(): Promise<void> {
maxRequestsPerMinute: 10, maxRequestsPerMinute: 10,
requestHandlerTimeoutSecs: 45, requestHandlerTimeoutSecs: 45,
headless: true, headless: true,
...(proxyConfiguration ? { proxyConfiguration } : {}),
launchContext: { launchContext: {
launchOptions: { launchOptions: {
args: ["--disable-blink-features=AutomationControlled", "--lang=en-US"], args: ["--disable-blink-features=AutomationControlled", "--lang=en-US"],

View File

@ -7,6 +7,7 @@
*/ */
import * as http from "node:http";
import * as https from "node:https";
import * as zlib from "node:zlib";
import { SocksProxyAgent } from "socks-proxy-agent";
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
import { contentHash } from "../utils/hash";
const BASE = "https://www.gbics.com"; const BASE = "https://www.gbics.com";
const HEADERS = { const HEADERS = {
@ -15,6 +16,20 @@ const HEADERS = {
"Accept-Language": "en-US,en;q=0.9", "Accept-Language": "en-US,en;q=0.9",
}; };
// SOCKS5 proxy pool — rotate across CT130/131/132 to avoid IP blocks.
// PROXY_URLS (env var) is a comma-separated list; entries are trimmed and
// empty entries are dropped, so an unset or blank variable yields [].
const PROXY_URLS: string[] = [];
for (const entry of (process.env["PROXY_URLS"] ?? "").split(",")) {
  const candidate = entry.trim();
  if (candidate) PROXY_URLS.push(candidate);
}

// Count of agents handed out so far; taken modulo the pool size to pick a proxy.
let proxyIndex = 0;

/**
 * Returns an agent for the next proxy in round-robin order.
 *
 * Each call advances `proxyIndex` by one and constructs a fresh
 * `SocksProxyAgent` for the selected URL.
 *
 * @returns A new `SocksProxyAgent`, or `undefined` when `PROXY_URLS` is empty
 *          (in which case `proxyIndex` is left untouched).
 */
function getNextProxyAgent(): SocksProxyAgent | undefined {
  const poolSize = PROXY_URLS.length;
  if (poolSize === 0) return undefined;
  const slot = proxyIndex % poolSize;
  proxyIndex += 1;
  return new SocksProxyAgent(PROXY_URLS[slot]);
}
const CATEGORIES = [ const CATEGORIES = [
{ path: "/800g-osfp/", formFactor: "OSFP", speed: "800G", speedGbps: 800 }, { path: "/800g-osfp/", formFactor: "OSFP", speed: "800G", speedGbps: 800 },
{ path: "/400g-qsfp112/", formFactor: "QSFP112", speed: "400G", speedGbps: 400 }, { path: "/400g-qsfp112/", formFactor: "QSFP112", speed: "400G", speedGbps: 400 },
@ -194,7 +209,10 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product
} }
/** Status, headers and fully buffered raw body of one proxied HTTP exchange. */
interface ProxiedResponse {
  status: number;
  headers: http.IncomingHttpHeaders;
  body: Buffer;
}

/**
 * Downloads `url` and resolves with the response body decoded as UTF-8 text.
 *
 * When no proxy is configured (`getNextProxyAgent()` returns `undefined`) the
 * request goes through the global `fetch`, unchanged from the pre-proxy code.
 *
 * When a proxy agent is available the request is sent with `node:http` /
 * `node:https` instead. `SocksProxyAgent` is an `http.Agent`, which those
 * modules accept via their `agent` option; fetch's `dispatcher` option expects
 * an undici `Dispatcher`, so the agent cannot be handed to `fetch`.
 * The proxied path mirrors the fetch behaviour this function relies on:
 * redirects are followed, compressed bodies are decoded, and one 30 s timeout
 * covers the whole download. One agent is used for all redirect hops.
 *
 * @param url Absolute URL of the page to download.
 * @returns The response body as text.
 * @throws Error `HTTP <status> for <url>` when the final response is not 2xx,
 *         `Too many redirects for <url>` after 20 redirects, or the underlying
 *         network / abort error.
 */
async function fetchPage(url: string): Promise<string> {
  const agent = getNextProxyAgent();
  if (!agent) {
    const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(30000) });
    if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`);
    return resp.text();
  }

  // One deadline for the whole download, redirects included.
  const signal = AbortSignal.timeout(30000);
  const maxRedirects = 20;
  const redirectStatuses = [301, 302, 303, 307, 308];

  let target = new URL(url);
  for (let hop = 0; hop <= maxRedirects; hop++) {
    const res = await requestThroughAgent(target, agent, signal);
    const location = res.headers.location;
    if (redirectStatuses.includes(res.status) && location) {
      // Location may be relative; resolve it against the URL just requested.
      target = new URL(location, target);
      continue;
    }
    if (res.status < 200 || res.status >= 300) throw new Error(`HTTP ${res.status} for ${url}`);
    return new TextDecoder().decode(decodeBody(res.body, res.headers["content-encoding"]));
  }
  throw new Error(`Too many redirects for ${url}`);
}

/** Performs a single GET through `agent` and buffers the complete raw response body. */
function requestThroughAgent(target: URL, agent: SocksProxyAgent, signal: AbortSignal): Promise<ProxiedResponse> {
  return new Promise<ProxiedResponse>((resolve, reject) => {
    const onResponse = (res: http.IncomingMessage): void => {
      const chunks: Buffer[] = [];
      res.on("data", (chunk: Buffer) => chunks.push(chunk));
      res.on("error", reject);
      // A close without a complete message means the body was cut short.
      res.on("close", () => {
        if (!res.complete) reject(new Error(`Connection closed early for ${target.href}`));
      });
      res.on("end", () =>
        resolve({ status: res.statusCode ?? 0, headers: res.headers, body: Buffer.concat(chunks) }),
      );
    };
    const options = { agent, headers: HEADERS, signal };
    const req =
      target.protocol === "http:"
        ? http.request(target, options, onResponse)
        : https.request(target, options, onResponse);
    req.on("error", reject);
    req.end();
  });
}

/** Reverses the Content-Encoding applied to `raw`; unknown or absent encodings pass through unchanged. */
function decodeBody(raw: Buffer, encoding: string | undefined): Buffer {
  switch ((encoding ?? "").trim().toLowerCase()) {
    case "gzip":
    case "x-gzip":
      return zlib.gunzipSync(raw);
    case "deflate":
      return zlib.inflateSync(raw);
    case "br":
      return zlib.brotliDecompressSync(raw);
    default:
      return raw;
  }
}