feat(scraper): add SOCKS5 proxy rotation for fs-com, atgbics, gbics scrapers
Routes requests through CT130/131/132 proxy pool (192.168.178.77/76/74:1080) when PROXY_URLS env var is set. Uses ProxyConfiguration from crawlee for PlaywrightCrawler scrapers and socks-proxy-agent for fetch-based scrapers.
This commit is contained in:
parent
772ce2074d
commit
240e7f46f2
@ -20,7 +20,8 @@
|
||||
"pg-boss": "^10.1.5",
|
||||
"dotenv": "^16.4.7",
|
||||
"cheerio": "^1.0.0",
|
||||
"xml2js": "^0.6.2"
|
||||
"xml2js": "^0.6.2",
|
||||
"socks-proxy-agent": "^8.0.5"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/pg": "^8.11.11",
|
||||
|
||||
@ -15,12 +15,23 @@
|
||||
*
|
||||
* Respects: robots.txt, rate limiting (2s between requests, max 50 pages)
|
||||
*/
|
||||
import { PlaywrightCrawler } from "crawlee";
|
||||
import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";
|
||||
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
|
||||
import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash";
|
||||
|
||||
const BASE_URL = "https://www.atgbics.com";
|
||||
|
||||
// SOCKS5 proxy pool — rotate across CT130/131/132 to avoid IP blocks.
// Read once at module load from the comma-separated PROXY_URLS env var;
// entries are trimmed and blank ones dropped, so an unset or empty variable
// yields an empty pool.
const PROXY_URLS: string[] = [];
for (const rawEntry of (process.env["PROXY_URLS"] ?? "").split(",")) {
  const proxyUrl = rawEntry.trim();
  if (proxyUrl) PROXY_URLS.push(proxyUrl);
}
|
||||
|
||||
/**
 * Build the crawlee proxy configuration for this scraper.
 *
 * @returns A `ProxyConfiguration` over every URL in `PROXY_URLS`, or
 *   `undefined` when the pool is empty so the caller can omit the option.
 */
function buildProxyConfiguration(): ProxyConfiguration | undefined {
  const hasProxies = PROXY_URLS.length > 0;
  return hasProxies ? new ProxyConfiguration({ proxyUrls: PROXY_URLS }) : undefined;
}
|
||||
|
||||
const CATEGORY_URLS = [
|
||||
"/collections/sfp-transceivers/",
|
||||
"/collections/sfp-plus-transceivers/",
|
||||
@ -150,12 +161,14 @@ export async function scrapeAtgbics(): Promise<void> {
|
||||
const products: AtgbicsProduct[] = [];
|
||||
let pagesScraped = 0;
|
||||
|
||||
const proxyConfiguration = buildProxyConfiguration();
|
||||
const crawler = new PlaywrightCrawler({
|
||||
maxConcurrency: 1,
|
||||
maxRequestsPerMinute: 20, // ~2s between requests at concurrency 1
|
||||
maxRequestsPerCrawl: MAX_PAGES,
|
||||
requestHandlerTimeoutSecs: 60,
|
||||
headless: true,
|
||||
...(proxyConfiguration ? { proxyConfiguration } : {}),
|
||||
launchContext: {
|
||||
launchOptions: {
|
||||
args: ["--disable-blink-features=AutomationControlled", "--no-sandbox"],
|
||||
|
||||
@ -6,7 +6,7 @@
|
||||
*
|
||||
* Respects: robots.txt, rate limiting (2s between requests)
|
||||
*/
|
||||
import { PlaywrightCrawler } from "crawlee";
|
||||
import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";
|
||||
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
|
||||
import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash";
|
||||
import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";
|
||||
@ -15,6 +15,17 @@ import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";
|
||||
// EUR is the primary price; USD is derived (EUR → USD), never the reverse.
|
||||
const BASE_URL = "https://de.fs.com";
|
||||
|
||||
// SOCKS5 proxy pool — rotate across CT130/131/132 to avoid IP blocks.
// Read once at module load from the comma-separated PROXY_URLS env var;
// entries are trimmed and blank ones dropped, so an unset or empty variable
// yields an empty pool.
const PROXY_URLS: string[] = [];
for (const rawEntry of (process.env["PROXY_URLS"] ?? "").split(",")) {
  const proxyUrl = rawEntry.trim();
  if (proxyUrl) PROXY_URLS.push(proxyUrl);
}
|
||||
|
||||
/**
 * Build the crawlee proxy configuration for this scraper.
 *
 * @returns A `ProxyConfiguration` over every URL in `PROXY_URLS`, or
 *   `undefined` when the pool is empty so the caller can omit the option.
 */
function buildProxyConfiguration(): ProxyConfiguration | undefined {
  const hasProxies = PROXY_URLS.length > 0;
  return hasProxies ? new ProxyConfiguration({ proxyUrls: PROXY_URLS }) : undefined;
}
|
||||
|
||||
const CATEGORY_URLS = [
|
||||
"/c/1g-sfp-81",
|
||||
"/c/10g-sfp-63",
|
||||
@ -99,11 +110,13 @@ export async function scrapeFs(): Promise<void> {
|
||||
const products: FsProduct[] = [];
|
||||
let pagesScraped = 0;
|
||||
|
||||
const proxyConfiguration = buildProxyConfiguration();
|
||||
const crawler = new PlaywrightCrawler({
|
||||
maxConcurrency: 1,
|
||||
maxRequestsPerMinute: 15,
|
||||
requestHandlerTimeoutSecs: 60,
|
||||
headless: true,
|
||||
...(proxyConfiguration ? { proxyConfiguration } : {}),
|
||||
launchContext: {
|
||||
launchOptions: {
|
||||
args: ["--disable-blink-features=AutomationControlled", "--lang=de-DE"],
|
||||
@ -310,6 +323,7 @@ export async function scrapeFs(): Promise<void> {
|
||||
maxRequestsPerMinute: 10,
|
||||
requestHandlerTimeoutSecs: 45,
|
||||
headless: true,
|
||||
...(proxyConfiguration ? { proxyConfiguration } : {}),
|
||||
launchContext: {
|
||||
launchOptions: {
|
||||
args: ["--disable-blink-features=AutomationControlled", "--lang=en-US"],
|
||||
|
||||
@ -7,6 +7,7 @@
|
||||
*/
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
||||
import { contentHash } from "../utils/hash";
|
||||
import { SocksProxyAgent } from "socks-proxy-agent";
|
||||
|
||||
const BASE = "https://www.gbics.com";
|
||||
const HEADERS = {
|
||||
@ -15,6 +16,20 @@ const HEADERS = {
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
};
|
||||
|
||||
// SOCKS5 proxy pool — rotate across CT130/131/132 to avoid IP blocks.
// Read once at module load from the comma-separated PROXY_URLS env var;
// entries are trimmed and blank ones dropped, so an unset or empty variable
// yields an empty pool.
const PROXY_URLS: string[] = [];
for (const rawEntry of (process.env["PROXY_URLS"] ?? "").split(",")) {
  const proxyUrl = rawEntry.trim();
  if (proxyUrl) PROXY_URLS.push(proxyUrl);
}
|
||||
|
||||
// Count of agents handed out so far; modulo the pool size it selects the next proxy.
let proxyIndex = 0;

/**
 * Pick the next proxy from `PROXY_URLS` in round-robin order.
 *
 * @returns A new `SocksProxyAgent` for the selected proxy URL, or `undefined`
 *   when no proxies are configured.
 */
function getNextProxyAgent(): SocksProxyAgent | undefined {
  const poolSize = PROXY_URLS.length;
  if (poolSize === 0) return undefined;

  const slot = proxyIndex % poolSize;
  proxyIndex += 1;
  return new SocksProxyAgent(PROXY_URLS[slot]);
}
|
||||
|
||||
const CATEGORIES = [
|
||||
{ path: "/800g-osfp/", formFactor: "OSFP", speed: "800G", speedGbps: 800 },
|
||||
{ path: "/400g-qsfp112/", formFactor: "QSFP112", speed: "400G", speedGbps: 400 },
|
||||
@ -194,7 +209,10 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product
|
||||
}
|
||||
|
||||
/**
 * Download a page and return its body decoded as UTF-8 text.
 *
 * When no proxy is configured this is a plain `fetch`. When a proxy agent is
 * available the request is sent through `node:http` / `node:https` instead:
 * `SocksProxyAgent` is an `http.Agent`, while Node's built-in `fetch` has no
 * `agent` option and its `dispatcher` option expects an undici `Dispatcher`,
 * so handing the agent to `fetch` makes the request fail rather than tunnel.
 *
 * @param url Absolute http(s) URL to download.
 * @returns The response body as text.
 * @throws Error `HTTP <status> for <url>` when the final response is not 2xx,
 *   or `Too many redirects for <url>`; network, proxy and timeout errors
 *   (30 s budget for the whole call) propagate unchanged.
 */
async function fetchPage(url: string): Promise<string> {
  const agent = getNextProxyAgent();
  const signal = AbortSignal.timeout(30000);

  if (!agent) {
    const resp = await fetch(url, { headers: HEADERS, signal });
    if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`);
    return resp.text();
  }

  // Loaded lazily so the direct (no-proxy) path needs nothing beyond fetch.
  const [http, https, zlib] = await Promise.all([
    import("node:http"),
    import("node:https"),
    import("node:zlib"),
  ]);

  type RawResponse = {
    status: number;
    location: string | undefined;
    encoding: string;
    body: Buffer;
  };

  // Issue a single GET through the proxy agent and buffer the whole response.
  const requestOnce = (target: URL): Promise<RawResponse> =>
    new Promise<RawResponse>((resolve, reject) => {
      const onResponse = (res: import("node:http").IncomingMessage): void => {
        const chunks: Buffer[] = [];
        res.on("data", (chunk: Buffer) => chunks.push(chunk));
        res.on("error", reject);
        res.on("end", () =>
          resolve({
            status: res.statusCode ?? 0,
            location: res.headers.location,
            encoding: (res.headers["content-encoding"] ?? "").trim().toLowerCase(),
            body: Buffer.concat(chunks),
          }),
        );
      };
      const options = { headers: HEADERS, agent, signal };
      const req =
        target.protocol === "http:"
          ? http.get(target, options, onResponse)
          : https.get(target, options, onResponse);
      req.on("error", reject);
    });

  // fetch follows redirects transparently; reproduce that for the proxied path.
  const MAX_REDIRECTS = 20;
  const REDIRECT_STATUSES = [301, 302, 303, 307, 308];
  let target = new URL(url);

  for (let hop = 0; hop <= MAX_REDIRECTS; hop++) {
    const { status, location, encoding, body } = await requestOnce(target);

    if (REDIRECT_STATUSES.includes(status) && location) {
      target = new URL(location, target); // Location may be relative
      continue;
    }
    if (status < 200 || status > 299) throw new Error(`HTTP ${status} for ${url}`);

    // Unlike fetch, node:http does not decode compressed bodies for us.
    switch (encoding) {
      case "gzip":
      case "x-gzip":
        return zlib.gunzipSync(body).toString("utf8");
      case "deflate":
        return zlib.inflateSync(body).toString("utf8");
      case "br":
        return zlib.brotliDecompressSync(body).toString("utf8");
      default:
        return body.toString("utf8");
    }
  }

  throw new Error(`Too many redirects for ${url}`);
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user