fix(scrapers): replace bot User-Agents with Chrome UA + disable dead domain
- 16 commercial scrapers: replace TIP-Bot/1.0 with a Chrome/120 UA (GBICS confirmed returning 0 bytes for the bot UA; the Chrome UA returns 200KB)
- gbics.ts: fix User-Agent (was returning empty HTML, now returns products)
- optictransceiver.ts: disable — domain repurposed as a plant shop (2026-04-06); "Alocasia Regal Shield" is not a transceiver.
This commit is contained in:
parent
80aa85961b
commit
2e852e0a2f
@ -19,7 +19,7 @@ const CATALOG_URLS = [
|
||||
];
|
||||
const MAX_PAGES = 15;
|
||||
const HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
};
|
||||
|
||||
@ -14,7 +14,7 @@ const BASE = "https://www.blueoptics.de";
|
||||
const CATALOG_PATH = "/transceivers/";
|
||||
const MAX_PAGES = 20;
|
||||
const HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml",
|
||||
"Accept-Language": "de-DE,de;q=0.9,en;q=0.8",
|
||||
};
|
||||
|
||||
@ -11,7 +11,7 @@ import { contentHash } from "../utils/hash";
|
||||
|
||||
const BASE = "https://www.championone.com";
|
||||
const HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml",
|
||||
};
|
||||
|
||||
|
||||
@ -53,7 +53,7 @@ interface Product {
|
||||
async function fetchCategory(cat: typeof CATEGORIES[0], vendorId: string, page = 1): Promise<Product[]> {
|
||||
const url = `${BASE}${cat.url}?page=${page}`;
|
||||
const resp = await fetch(url, {
|
||||
headers: { "User-Agent": "Mozilla/5.0 (compatible; TIPBot/1.0; +https://tip.context-x.org)" },
|
||||
headers: { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" },
|
||||
signal: AbortSignal.timeout(30_000),
|
||||
});
|
||||
if (!resp.ok) return [];
|
||||
|
||||
@ -14,7 +14,7 @@ const BASE = "https://shop.fiber24.net";
|
||||
const CATALOG_PATH = "/FO-TRANSCEIVER/de";
|
||||
const MAX_PAGES = 20;
|
||||
const HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml",
|
||||
"Accept-Language": "de-DE,de;q=0.9,en;q=0.8",
|
||||
};
|
||||
|
||||
@ -12,7 +12,7 @@ import { contentHash } from "../utils/hash";
|
||||
const BASE = "https://fluxlight.com";
|
||||
const CATALOG_PATH = "/transceivers/";
|
||||
const HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml",
|
||||
};
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@ const BASE = "https://gaotek.com";
|
||||
const CATALOG_PATH = "/category/fiber-optics/transceivers/";
|
||||
const MAX_PAGES = 20;
|
||||
const HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
};
|
||||
|
||||
@ -10,8 +10,9 @@ import { contentHash } from "../utils/hash";
|
||||
|
||||
const BASE = "https://www.gbics.com";
|
||||
const HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
||||
Accept: "text/html,application/xhtml+xml",
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
};
|
||||
|
||||
const CATEGORIES = [
|
||||
|
||||
@ -27,7 +27,7 @@ const API_URLS = [
|
||||
];
|
||||
|
||||
const HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml,application/json",
|
||||
"Accept-Language": "en-US,en;q=0.9,de;q=0.8",
|
||||
};
|
||||
|
||||
@ -38,7 +38,7 @@ export async function scrapeMultimodeInc(): Promise<void> {
|
||||
for (const cat of CATEGORIES) {
|
||||
try {
|
||||
const resp = await fetch(`${BASE}${cat.path}`, {
|
||||
headers: { "User-Agent": "Mozilla/5.0 (compatible; TIPBot/1.0)" },
|
||||
headers: { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" },
|
||||
signal: AbortSignal.timeout(25_000),
|
||||
});
|
||||
if (!resp.ok) continue;
|
||||
|
||||
@ -110,7 +110,7 @@ async function fetchTransceiverUrls(): Promise<Array<{ url: string; title: strin
|
||||
const apiUrl = `${BASE_URL}/wp-json/wp/v2/product?product_cat=${catId}&per_page=100&page=${page}&_fields=slug,link,title`;
|
||||
try {
|
||||
const resp = await fetch(apiUrl, {
|
||||
headers: { "User-Agent": "Mozilla/5.0 (compatible; TIP-Scraper/1.0)" },
|
||||
headers: { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" },
|
||||
signal: AbortSignal.timeout(15000),
|
||||
});
|
||||
|
||||
|
||||
@ -86,7 +86,9 @@ async function scrapeCategory(path: string, form_factor: string, vendorId: strin
|
||||
}
|
||||
|
||||
export async function scrapeOpticTransceiver(): Promise<void> {
|
||||
logger.info("OpticTransceiver.com scraper starting");
|
||||
// Domain repurposed as plant shop (2026-04-06) — skip entirely
|
||||
logger.warn("OpticTransceiver.com is no longer an optics vendor (domain repurposed). Scraper disabled.");
|
||||
return;
|
||||
const vendorId = await ensureVendor("OpticTransceiver", BASE);
|
||||
let total = 0;
|
||||
|
||||
|
||||
@ -35,7 +35,7 @@ async function fetchPage(catUrl: string, form_factor: string, vendorId: string,
|
||||
const sep = catUrl.includes("?") ? "&" : "?";
|
||||
const url = `${BASE}${catUrl}${page > 1 ? `${sep}p=${page}` : ""}`;
|
||||
const resp = await fetch(url, {
|
||||
headers: { "User-Agent": "Mozilla/5.0 (compatible; TIPBot/1.0)" },
|
||||
headers: { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" },
|
||||
signal: AbortSignal.timeout(30_000),
|
||||
});
|
||||
if (!resp.ok) return 0;
|
||||
|
||||
@ -10,7 +10,7 @@ import { contentHash } from "../utils/hash";
|
||||
|
||||
const BASE = "https://www.sfpcables.com";
|
||||
const HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml",
|
||||
};
|
||||
|
||||
|
||||
@ -21,7 +21,7 @@ const CATALOG_URLS = [
|
||||
];
|
||||
const MAX_PAGES = 10;
|
||||
const HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
};
|
||||
|
||||
@ -12,7 +12,7 @@ import { contentHash, parsePrice } from "../utils/hash";
|
||||
|
||||
const BASE = "https://www.10gtek.com";
|
||||
const HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml",
|
||||
};
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@ const BASE = "https://www.china-tscom.com";
|
||||
const CATALOG_PATH = "/products/fiber-optic-transceivers/";
|
||||
const MAX_PAGES = 15;
|
||||
const HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
};
|
||||
|
||||
@ -47,7 +47,7 @@ export async function scrapeWiitek(): Promise<void> {
|
||||
for (const cat of CATEGORIES) {
|
||||
try {
|
||||
const resp = await fetch(`${BASE}${cat.path}`, {
|
||||
headers: { "User-Agent": "Mozilla/5.0 (compatible; TIPBot/1.0)" },
|
||||
headers: { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" },
|
||||
signal: AbortSignal.timeout(20_000),
|
||||
});
|
||||
if (!resp.ok) continue;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user