feat(scraper): add SOCKS5 proxy rotation for fs-com, atgbics, gbics scrapers
Routes requests through CT130/131/132 proxy pool (192.168.178.77/76/74:1080) when PROXY_URLS env var is set. Uses ProxyConfiguration from crawlee for PlaywrightCrawler scrapers and socks-proxy-agent for fetch-based scrapers.
This commit is contained in:
parent
3f44322a2b
commit
7050ff0802
@ -20,7 +20,8 @@
|
|||||||
"pg-boss": "^10.1.5",
|
"pg-boss": "^10.1.5",
|
||||||
"dotenv": "^16.4.7",
|
"dotenv": "^16.4.7",
|
||||||
"cheerio": "^1.0.0",
|
"cheerio": "^1.0.0",
|
||||||
"xml2js": "^0.6.2"
|
"xml2js": "^0.6.2",
|
||||||
|
"socks-proxy-agent": "^8.0.5"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/pg": "^8.11.11",
|
"@types/pg": "^8.11.11",
|
||||||
|
|||||||
@ -15,12 +15,23 @@
|
|||||||
*
|
*
|
||||||
* Respects: robots.txt, rate limiting (2s between requests, max 50 pages)
|
* Respects: robots.txt, rate limiting (2s between requests, max 50 pages)
|
||||||
*/
|
*/
|
||||||
import { PlaywrightCrawler } from "crawlee";
|
import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";
|
||||||
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
|
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
|
||||||
import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash";
|
import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash";
|
||||||
|
|
||||||
const BASE_URL = "https://www.atgbics.com";
|
const BASE_URL = "https://www.atgbics.com";
|
||||||
|
|
||||||
|
// SOCKS5 proxy pool — rotate across CT130/131/132 to avoid IP blocks.
// Read once at module load from the comma-separated PROXY_URLS env var:
// every entry is trimmed and blank entries are dropped, so an unset or
// empty variable yields an empty list.
const PROXY_URLS: string[] = [];
for (const rawEntry of (process.env["PROXY_URLS"] ?? "").split(",")) {
  const proxyUrl = rawEntry.trim();
  if (proxyUrl) PROXY_URLS.push(proxyUrl);
}
|
||||||
|
|
||||||
|
/**
 * Builds the crawlee proxy configuration for this scraper.
 *
 * @returns A `ProxyConfiguration` constructed with every URL in
 *   `PROXY_URLS`, or `undefined` when that list is empty so the caller can
 *   create the crawler without any proxy settings.
 */
function buildProxyConfiguration(): ProxyConfiguration | undefined {
  const hasProxies = PROXY_URLS.length > 0;
  return hasProxies ? new ProxyConfiguration({ proxyUrls: PROXY_URLS }) : undefined;
}
|
||||||
|
|
||||||
const CATEGORY_URLS = [
|
const CATEGORY_URLS = [
|
||||||
"/collections/sfp-transceivers/",
|
"/collections/sfp-transceivers/",
|
||||||
"/collections/sfp-plus-transceivers/",
|
"/collections/sfp-plus-transceivers/",
|
||||||
@ -150,12 +161,14 @@ export async function scrapeAtgbics(): Promise<void> {
|
|||||||
const products: AtgbicsProduct[] = [];
|
const products: AtgbicsProduct[] = [];
|
||||||
let pagesScraped = 0;
|
let pagesScraped = 0;
|
||||||
|
|
||||||
|
const proxyConfiguration = buildProxyConfiguration();
|
||||||
const crawler = new PlaywrightCrawler({
|
const crawler = new PlaywrightCrawler({
|
||||||
maxConcurrency: 1,
|
maxConcurrency: 1,
|
||||||
maxRequestsPerMinute: 20, // ~2s between requests at concurrency 1
|
maxRequestsPerMinute: 20, // ~2s between requests at concurrency 1
|
||||||
maxRequestsPerCrawl: MAX_PAGES,
|
maxRequestsPerCrawl: MAX_PAGES,
|
||||||
requestHandlerTimeoutSecs: 60,
|
requestHandlerTimeoutSecs: 60,
|
||||||
headless: true,
|
headless: true,
|
||||||
|
...(proxyConfiguration ? { proxyConfiguration } : {}),
|
||||||
launchContext: {
|
launchContext: {
|
||||||
launchOptions: {
|
launchOptions: {
|
||||||
args: ["--disable-blink-features=AutomationControlled", "--no-sandbox"],
|
args: ["--disable-blink-features=AutomationControlled", "--no-sandbox"],
|
||||||
|
|||||||
@ -6,7 +6,7 @@
|
|||||||
*
|
*
|
||||||
* Respects: robots.txt, rate limiting (2s between requests)
|
* Respects: robots.txt, rate limiting (2s between requests)
|
||||||
*/
|
*/
|
||||||
import { PlaywrightCrawler } from "crawlee";
|
import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";
|
||||||
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
|
import { ensureVendor, upsertPriceObservation, findOrCreateScrapedTransceiver, pool } from "../utils/db";
|
||||||
import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash";
|
import { contentHash, parsePrice, parseStockLevel, parseQuantity } from "../utils/hash";
|
||||||
import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";
|
import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";
|
||||||
@ -15,6 +15,17 @@ import { updateVerifiedSpecs, parseSpecTable } from "../utils/spec-updater";
|
|||||||
// EUR is the primary price; USD is derived (EUR → USD), never the reverse.
|
// EUR is the primary price; USD is derived (EUR → USD), never the reverse.
|
||||||
const BASE_URL = "https://de.fs.com";
|
const BASE_URL = "https://de.fs.com";
|
||||||
|
|
||||||
|
// SOCKS5 proxy pool — rotate across CT130/131/132 to avoid IP blocks.
// Parsed once at module load from the comma-separated PROXY_URLS env var;
// surrounding whitespace is stripped and empty entries are discarded.
const PROXY_URLS = (process.env["PROXY_URLS"] ?? "").split(",").flatMap((rawEntry) => {
  const proxyUrl = rawEntry.trim();
  return proxyUrl ? [proxyUrl] : [];
});
|
||||||
|
|
||||||
|
/**
 * Creates the crawlee `ProxyConfiguration` shared by the crawlers in this
 * scraper.
 *
 * @returns `undefined` when `PROXY_URLS` is empty (no proxy is configured),
 *   otherwise a `ProxyConfiguration` built from the full `PROXY_URLS` list.
 */
function buildProxyConfiguration(): ProxyConfiguration | undefined {
  if (PROXY_URLS.length > 0) {
    return new ProxyConfiguration({ proxyUrls: PROXY_URLS });
  }
  return undefined;
}
|
||||||
|
|
||||||
const CATEGORY_URLS = [
|
const CATEGORY_URLS = [
|
||||||
"/c/1g-sfp-81",
|
"/c/1g-sfp-81",
|
||||||
"/c/10g-sfp-63",
|
"/c/10g-sfp-63",
|
||||||
@ -99,11 +110,13 @@ export async function scrapeFs(): Promise<void> {
|
|||||||
const products: FsProduct[] = [];
|
const products: FsProduct[] = [];
|
||||||
let pagesScraped = 0;
|
let pagesScraped = 0;
|
||||||
|
|
||||||
|
const proxyConfiguration = buildProxyConfiguration();
|
||||||
const crawler = new PlaywrightCrawler({
|
const crawler = new PlaywrightCrawler({
|
||||||
maxConcurrency: 1,
|
maxConcurrency: 1,
|
||||||
maxRequestsPerMinute: 15,
|
maxRequestsPerMinute: 15,
|
||||||
requestHandlerTimeoutSecs: 60,
|
requestHandlerTimeoutSecs: 60,
|
||||||
headless: true,
|
headless: true,
|
||||||
|
...(proxyConfiguration ? { proxyConfiguration } : {}),
|
||||||
launchContext: {
|
launchContext: {
|
||||||
launchOptions: {
|
launchOptions: {
|
||||||
args: ["--disable-blink-features=AutomationControlled", "--lang=de-DE"],
|
args: ["--disable-blink-features=AutomationControlled", "--lang=de-DE"],
|
||||||
@ -310,6 +323,7 @@ export async function scrapeFs(): Promise<void> {
|
|||||||
maxRequestsPerMinute: 10,
|
maxRequestsPerMinute: 10,
|
||||||
requestHandlerTimeoutSecs: 45,
|
requestHandlerTimeoutSecs: 45,
|
||||||
headless: true,
|
headless: true,
|
||||||
|
...(proxyConfiguration ? { proxyConfiguration } : {}),
|
||||||
launchContext: {
|
launchContext: {
|
||||||
launchOptions: {
|
launchOptions: {
|
||||||
args: ["--disable-blink-features=AutomationControlled", "--lang=en-US"],
|
args: ["--disable-blink-features=AutomationControlled", "--lang=en-US"],
|
||||||
|
|||||||
@ -7,6 +7,7 @@
|
|||||||
*/
|
*/
|
||||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
||||||
import { contentHash } from "../utils/hash";
|
import { contentHash } from "../utils/hash";
|
||||||
|
import { SocksProxyAgent } from "socks-proxy-agent";
|
||||||
|
|
||||||
const BASE = "https://www.gbics.com";
|
const BASE = "https://www.gbics.com";
|
||||||
const HEADERS = {
|
const HEADERS = {
|
||||||
@ -15,6 +16,20 @@ const HEADERS = {
|
|||||||
"Accept-Language": "en-US,en;q=0.9",
|
"Accept-Language": "en-US,en;q=0.9",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// SOCKS5 proxy pool — rotate across CT130/131/132 to avoid IP blocks.
// Built once at module load from the comma-separated PROXY_URLS env var:
// entries are trimmed and blanks are skipped, so the list is empty when the
// variable is unset.
const PROXY_URLS = (process.env["PROXY_URLS"] ?? "")
  .split(",")
  .reduce<string[]>((urls, rawEntry) => {
    const proxyUrl = rawEntry.trim();
    if (proxyUrl) urls.push(proxyUrl);
    return urls;
  }, []);
|
||||||
|
|
||||||
|
// Number of agents handed out so far; the next proxy is chosen from it
// modulo the pool size.
let proxyIndex = 0;

/**
 * Returns an agent for the next proxy in round-robin order.
 *
 * Each call advances `proxyIndex` and constructs a fresh `SocksProxyAgent`
 * for the selected URL.
 *
 * @returns A new `SocksProxyAgent`, or `undefined` when `PROXY_URLS` is
 *   empty (in which case `proxyIndex` is left untouched).
 */
function getNextProxyAgent(): SocksProxyAgent | undefined {
  const poolSize = PROXY_URLS.length;
  if (poolSize === 0) return undefined;

  const slot = proxyIndex % poolSize;
  proxyIndex += 1;
  return new SocksProxyAgent(PROXY_URLS[slot]);
}
|
||||||
|
|
||||||
const CATEGORIES = [
|
const CATEGORIES = [
|
||||||
{ path: "/800g-osfp/", formFactor: "OSFP", speed: "800G", speedGbps: 800 },
|
{ path: "/800g-osfp/", formFactor: "OSFP", speed: "800G", speedGbps: 800 },
|
||||||
{ path: "/400g-qsfp112/", formFactor: "QSFP112", speed: "400G", speedGbps: 400 },
|
{ path: "/400g-qsfp112/", formFactor: "QSFP112", speed: "400G", speedGbps: 400 },
|
||||||
@ -194,7 +209,10 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Downloads `url` and returns the response body as text.
 *
 * Without a configured proxy the request goes through the built-in `fetch`,
 * exactly as before. With a proxy, the request is sent via `node:http` /
 * `node:https` instead: `SocksProxyAgent` is a Node `http.Agent`, whereas the
 * built-in `fetch` only accepts an undici `Dispatcher` in its `dispatcher`
 * option, so handing it the agent cannot route traffic through the proxy.
 *
 * @param url Absolute URL of the page to download.
 * @returns The response body decoded as UTF-8 text.
 * @throws Error `HTTP <status> for <url>` on a non-2xx final response, an
 *   error after more than 5 redirects on the proxied path, or the underlying
 *   network/abort error (30 s timeout for the whole call).
 */
async function fetchPage(url: string): Promise<string> {
  const agent = getNextProxyAgent();

  // Direct path: no proxy configured, keep using the built-in fetch.
  if (!agent) {
    const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(30000) });
    if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`);
    return resp.text();
  }

  // Proxied path. Imported lazily so the file's import block stays untouched.
  const http = await import("node:http");
  const https = await import("node:https");
  const signal = AbortSignal.timeout(30000); // one budget shared by all redirect hops
  const maxRedirects = 5;

  const requestOptions = {
    agent,
    signal,
    // Ask for an uncompressed body: unlike fetch, node:http(s) does not
    // transparently decode gzip/br responses.
    headers: { ...HEADERS, "Accept-Encoding": "identity" },
  };

  let target = new URL(url);
  for (let hop = 0; hop <= maxRedirects; hop++) {
    const current = target;
    const result = await new Promise<{ status: number; location: string | undefined; body: string }>(
      (resolve, reject) => {
        const onResponse = (res: import("node:http").IncomingMessage): void => {
          const chunks: Buffer[] = [];
          res.on("data", (chunk: Buffer) => chunks.push(chunk));
          res.on("error", reject);
          res.on("end", () =>
            resolve({
              status: res.statusCode ?? 0,
              location: res.headers.location,
              body: Buffer.concat(chunks).toString("utf8"),
            }),
          );
        };
        const req =
          current.protocol === "http:"
            ? http.get(current, requestOptions, onResponse)
            : https.get(current, requestOptions, onResponse);
        req.on("error", reject);
      },
    );

    // fetch follows redirects automatically; mirror that here.
    if (result.status >= 300 && result.status < 400 && result.location) {
      target = new URL(result.location, current);
      continue;
    }
    if (result.status < 200 || result.status >= 300) {
      throw new Error(`HTTP ${result.status} for ${url}`);
    }
    return result.body;
  }

  throw new Error(`Too many redirects for ${url}`);
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user