transceiver-db/packages/scraper/src/utils/backfill-images.ts
Rene Fichtmueller edc9311d7b feat: add proxy network, image backfill, and scraper improvements
- Add TIP Proxy Network (packages/proxy-agent): SOCKS5 proxy agent
  for residential IP bypass of CloudFront WAF blocks
- Add /api/proxy/* routes: node registration, heartbeat, load balancing
- Add image extraction to Flexoptix catalog scraper (GraphQL small_image)
- Add image extraction to Optcore scraper (Playwright gallery img)
- Fix Fluxlight price scraping (BigCommerce HTML structure: data-product-price-without-tax)
- Add SmartOptics scraper (8 DWDM/coherent products, og:image extraction)
- Fix findOrCreateScrapedTransceiver to update image_url for existing records
- Add image backfill script (backfill-images.ts): 178 Flexoptix images added
- Fix DB connection pool: max 5, idleTimeoutMillis 10s (was unlimited, caused >100 connections)
- Add proxy.ts utility for scraper proxy rotation
2026-04-03 21:13:03 +02:00

541 lines
16 KiB
TypeScript

/**
* Image Backfill Script
*
* Fills `image_url` for transceivers that currently have none.
* Priority order:
* 1. Optcore (2,580 products) — WP REST API featured_media per product
* 2. Flexoptix (344 products) — Magento GraphQL image fields
* 3. GAO Tek (413 products) — product page og:image scrape
* 4. Other vendors — og:image from stored price_observations URLs
*
* Run on Erik:
* node packages/scraper/dist/utils/backfill-images.js
*
* Or locally with SSH port-forward:
* ssh -L 5433:127.0.0.1:5433 erik -N &
* node dist/utils/backfill-images.js
*/
import { pool } from "./db";
import { logger } from "./logger";
/**
 * Pauses an async flow for a fixed amount of time.
 *
 * @param ms - Delay in milliseconds, passed straight to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Stores a scraped image URL on a single transceiver row.
 *
 * Besides `image_url`, the statement stamps `image_scraped_at` with the
 * database's current time and sets `has_image` to TRUE.
 *
 * @param id - Value matched against `transceivers.id`.
 * @param imageUrl - URL written to `image_url`.
 */
async function updateImageUrl(id: string, imageUrl: string): Promise<void> {
  const statement = `UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = TRUE WHERE id = $2`;
  const params = [imageUrl, id];
  await pool.query(statement, params);
}
/**
 * Performs an HTTP request and parses the response body as JSON.
 *
 * A default `User-Agent` and `Accept: application/json` are always sent.
 * Headers supplied through `init.headers` are layered on top and win on
 * conflict, whichever `HeadersInit` form they use (plain object, array of
 * `[name, value]` pairs, or a `Headers` instance).
 *
 * The request is always bounded by a 20-second timeout; that timeout signal
 * replaces any `signal` passed in `init`.
 *
 * @param url - Absolute URL to request.
 * @param init - Optional fetch options (method, body, extra headers, ...).
 * @returns The parsed JSON body; callers must narrow or cast it.
 * @throws Error with message `HTTP <status> for <url>` when the response is
 *   not OK. Network, timeout and JSON parse failures propagate unchanged.
 */
async function fetchJson(url: string, init?: RequestInit): Promise<unknown> {
  const headers = new Headers({
    "User-Agent": "Mozilla/5.0 (compatible; TIP-ImageBackfill/1.0)",
    Accept: "application/json",
  });
  // An object spread only works for plain-object headers: a Headers instance
  // spreads to nothing and a tuple array spreads to numeric keys. Normalising
  // through the Headers constructor handles every HeadersInit shape.
  new Headers(init?.headers).forEach((value, name) => {
    headers.set(name, value);
  });

  const resp = await fetch(url, {
    ...init,
    headers,
    signal: AbortSignal.timeout(20000),
  });
  if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`);
  return resp.json();
}
/**
 * Downloads a page and returns its body as text.
 *
 * Sends the script's `User-Agent` with `Accept: text/html` and gives up after
 * 20 seconds.
 *
 * @param url - Absolute URL of the page to fetch.
 * @returns The response body as a string.
 * @throws Error with message `HTTP <status> for <url>` when the response is
 *   not OK. Network and timeout failures propagate unchanged.
 */
async function fetchHtml(url: string): Promise<string> {
  const requestHeaders = {
    "User-Agent": "Mozilla/5.0 (compatible; TIP-ImageBackfill/1.0)",
    Accept: "text/html",
  };
  const timeoutSignal = AbortSignal.timeout(20000);
  const response = await fetch(url, { headers: requestHeaders, signal: timeoutSignal });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
// =============================================================================
// Optcore
// Strategy:
// Step 1 — enumerate all products per category, collect slug -> featured_media id
// Step 2 — resolve unique media IDs individually via /wp-json/wp/v2/media/<id>
// Step 3 — update DB for matched slugs
//
// Note: _embed does NOT work for the "product" custom post type on Optcore.
// We must resolve media IDs separately.
// =============================================================================
// Origin of the Optcore site; the WP REST URLs used for Optcore are built from it.
const OPTCORE_BASE = "https://www.optcore.net";
// Category IDs sent one at a time as the `product_cat` filter when listing
// products through the WP REST API.
// NOTE(review): presumably these are the shop's transceiver categories —
// confirm against the site's taxonomy before adding or removing IDs.
const OPTCORE_CATEGORY_IDS = [
309, 173, 76, 79, 73, 311, 313, 312, 333, 1088,
59, 1102, 4097, 77, 4101, 4092, 6441,
];
/**
 * The two fields requested per item from the WP REST product listing
 * (`_fields=slug,featured_media`).
 */
interface OptcoreWpProductRaw {
// Product slug; lowercased and looked up against lowercased DB part numbers.
slug: string;
// Media attachment ID; a falsy value (0) is treated as "no featured image".
featured_media: number;
}
/**
 * Fields read from a WP REST media item (requested with
 * `_fields=id,source_url,media_details`).
 */
interface OptcoreMediaRaw {
id: number;
// Last-resort URL, used when neither sized rendition below is available.
source_url: string;
media_details?: {
sizes?: {
// Preferred rendition when present.
medium?: { source_url: string };
// Second choice after `medium`.
large?: { source_url: string };
};
};
}
/**
 * Looks up the image URL behind a WordPress media attachment on Optcore.
 *
 * The `medium` rendition is preferred, then `large`, then the item's own
 * `source_url`.
 *
 * @param mediaId - Media attachment ID; `0` short-circuits without a request.
 * @returns The first non-empty URL found, or `null` when the ID is `0`, the
 *   request or parsing fails, or the item carries no usable URL.
 */
async function fetchOptcoreMediaUrl(mediaId: number): Promise<string | null> {
  if (mediaId === 0) {
    return null;
  }
  const endpoint = `${OPTCORE_BASE}/wp-json/wp/v2/media/${mediaId}?_fields=id,source_url,media_details`;
  try {
    const media = (await fetchJson(endpoint)) as OptcoreMediaRaw;
    const sizes = media.media_details?.sizes;
    const candidates = [
      sizes?.medium?.source_url,
      sizes?.large?.source_url,
      media.source_url,
    ];
    const firstUsable = candidates.find((candidate) => Boolean(candidate));
    return firstUsable ? firstUsable : null;
  } catch {
    // Best effort: any failure simply means "no image for this media ID".
    return null;
  }
}
/**
 * Backfills `image_url` for Optcore transceivers that have none.
 *
 * Steps:
 *  1. Load Optcore rows lacking an image and index them by lowercased part number.
 *  2. Page through the WP REST product listing for every configured category,
 *     collecting slug -> featured media ID.
 *  3. Resolve media IDs to URLs — only for slugs that match a row still
 *     needing an image, so no request is spent on products that cannot be updated.
 *  4. Write the resolved URL to each matching row.
 *
 * A slug matches a row when it equals the row's lowercased part number.
 *
 * @returns Counters: `updated` rows written, `skipped` matched rows whose
 *   media URL could not be resolved, `notFound` enumerated slugs with no
 *   matching row.
 */
async function backfillOptcore(): Promise<{ updated: number; skipped: number; notFound: number }> {
  logger.info("=== Optcore image backfill starting ===");
  const rows = await pool.query(`
    SELECT t.id, t.part_number
    FROM transceivers t
    JOIN vendors v ON t.vendor_id = v.id
    WHERE v.name = 'Optcore' AND (t.image_url IS NULL OR t.image_url = '')
  `);

  // Map: part_number (lowercase) -> DB id
  const byPartNumber = new Map<string, string>();
  for (const row of rows.rows) {
    const partNumber = row.part_number as string | null;
    // A row without a part number cannot be matched to a slug; skip it
    // instead of throwing on `.toLowerCase()`.
    if (!partNumber) continue;
    byPartNumber.set(partNumber.toLowerCase(), row.id as string);
  }
  logger.info(`Optcore: ${byPartNumber.size} products need images`);
  if (byPartNumber.size === 0) {
    return { updated: 0, skipped: 0, notFound: 0 };
  }

  // Step 1: enumerate all products across all transceiver categories
  const slugToMediaId = new Map<string, number>();
  const seenSlugs = new Set<string>();
  for (const catId of OPTCORE_CATEGORY_IDS) {
    let page = 1;
    while (true) {
      const apiUrl = `${OPTCORE_BASE}/wp-json/wp/v2/product?product_cat=${catId}&per_page=100&page=${page}&_fields=slug,featured_media`;
      try {
        const resp = await fetch(apiUrl, {
          headers: { "User-Agent": "Mozilla/5.0 (compatible; TIP-ImageBackfill/1.0)" },
          signal: AbortSignal.timeout(20000),
        });
        if (!resp.ok) {
          // Surface the status so a rate limit or outage is not mistaken
          // for an empty category.
          logger.warn(`Optcore cat=${catId} page=${page} fetch failed`, {
            error: `HTTP ${resp.status}`,
          });
          break;
        }
        const parsedTotal = parseInt(resp.headers.get("X-WP-TotalPages") || "1", 10);
        // A non-numeric header would make `page >= totalPages` permanently
        // false; fall back to a single page instead.
        const totalPages = Number.isFinite(parsedTotal) ? parsedTotal : 1;
        const products = (await resp.json()) as OptcoreWpProductRaw[];
        for (const p of products) {
          // The same product can appear in several categories.
          if (seenSlugs.has(p.slug)) continue;
          seenSlugs.add(p.slug);
          if (p.featured_media) {
            slugToMediaId.set(p.slug.toLowerCase(), p.featured_media);
          }
        }
        if (page >= totalPages) break;
        page++;
      } catch (err) {
        logger.warn(`Optcore cat=${catId} page=${page} fetch failed`, {
          error: err instanceof Error ? err.message : String(err),
        });
        break;
      }
      await sleep(100);
    }
  }
  logger.info(`Optcore: enumerated ${seenSlugs.size} WP products, ${slugToMediaId.size} have featured media`);

  // Step 2: resolve unique media IDs — restricted to slugs that map to a row
  // needing an image; every other lookup would be discarded in step 3.
  const uniqueMediaIds = new Set<number>();
  for (const [slug, mediaId] of slugToMediaId) {
    if (byPartNumber.has(slug)) uniqueMediaIds.add(mediaId);
  }
  logger.info(`Optcore: resolving ${uniqueMediaIds.size} unique media IDs`);
  const mediaIdToUrl = new Map<number, string>();
  let resolved = 0;
  for (const mediaId of uniqueMediaIds) {
    const url = await fetchOptcoreMediaUrl(mediaId);
    if (url) mediaIdToUrl.set(mediaId, url);
    resolved++;
    if (resolved % 20 === 0) {
      logger.info(`Optcore media resolution: ${resolved}/${uniqueMediaIds.size}`);
    }
    await sleep(100);
  }

  // Step 3: match slugs to DB records and update
  let updated = 0;
  let skipped = 0;
  let notFound = 0;
  for (const [slug, mediaId] of slugToMediaId) {
    const dbId = byPartNumber.get(slug);
    if (!dbId) {
      notFound++;
      continue;
    }
    const imgUrl = mediaIdToUrl.get(mediaId);
    if (!imgUrl) {
      skipped++;
      continue;
    }
    await updateImageUrl(dbId, imgUrl);
    byPartNumber.delete(slug);
    updated++;
    if (updated % 50 === 0) {
      logger.info(`Optcore DB updates: ${updated} so far`);
    }
  }
  logger.info(`Optcore done: ${updated} updated, ${skipped} no-media-url, ${notFound} slug-not-in-db`);
  return { updated, skipped, notFound };
}
// =============================================================================
// Flexoptix
// Magento GraphQL: /graphql — query by SKU, returns small_image.url
// =============================================================================
// Origin of the Flexoptix shop.
const FLEXOPTIX_BASE = "https://www.flexoptix.net";
// GraphQL endpoint that product searches are POSTed to.
const FLEXOPTIX_GRAPHQL = `${FLEXOPTIX_BASE}/graphql`;
/**
 * One product item as selected by the Flexoptix GraphQL search query.
 * The three image fields are tried in the order small_image, image, thumbnail.
 */
interface FlexoptixGqlProduct {
name: string;
// Compared case-insensitively, after dropping everything from the first ":".
sku: string;
url_key: string;
small_image: { url: string } | null;
image: { url: string } | null;
thumbnail: { url: string } | null;
}
/**
 * Finds a product image URL for a Flexoptix SKU via the shop's GraphQL search.
 *
 * The SKU is reduced to its canonical form (everything from the first ":" is
 * dropped) and used as the search term. An item whose canonical SKU equals it
 * case-insensitively is preferred; failing that, a lone search result is
 * accepted as the match.
 *
 * @param sku - Part number as stored in the DB, optionally with a ":..." suffix.
 * @returns The first available of small_image / image / thumbnail URL for the
 *   chosen item, or `null` when nothing suitable is found.
 * @throws Whatever `fetchJson` throws (HTTP error status, network, timeout).
 */
async function fetchFlexoptixImage(sku: string): Promise<string | null> {
  // Strip ":Sx" coding suffix to get the canonical SKU.
  const stripCoding = (value: string): string => value.replace(/:.*$/, "").trim();
  const firstImageUrl = (product: FlexoptixGqlProduct): string | null =>
    product.small_image?.url || product.image?.url || product.thumbnail?.url || null;

  const canonicalSku = stripCoding(sku);
  const query = `{
products(search: ${JSON.stringify(canonicalSku)}, pageSize: 5) {
items {
name
sku
url_key
small_image { url }
image { url }
thumbnail { url }
}
}
}`;
  const payload = (await fetchJson(FLEXOPTIX_GRAPHQL, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ query }),
  })) as {
    data?: { products?: { items: FlexoptixGqlProduct[] } };
  };
  const items = payload.data?.products?.items ?? [];

  // First choice: the item whose canonical SKU is the one we searched for.
  const wanted = canonicalSku.toLowerCase();
  const exact = items.find((item) => stripCoding(item.sku).toLowerCase() === wanted);
  if (exact) {
    return firstImageUrl(exact);
  }

  // Otherwise trust the search only when it returned exactly one product.
  return items.length === 1 ? firstImageUrl(items[0]) : null;
}
/**
 * Backfills `image_url` for FLEXOPTIX transceivers that have none.
 *
 * Rows are processed one at a time in part-number order, with a 100 ms pause
 * after each. A failure on one row is logged and counted, then the loop
 * carries on.
 *
 * @returns Counters: `updated` rows written, `skipped` rows for which no
 *   image was found, `errors` rows whose lookup or update threw.
 */
async function backfillFlexoptix(): Promise<{ updated: number; skipped: number; errors: number }> {
  logger.info("=== Flexoptix image backfill starting ===");
  const pending = await pool.query(`
SELECT t.id, t.part_number
FROM transceivers t
JOIN vendors v ON t.vendor_id = v.id
WHERE v.name = 'FLEXOPTIX' AND (t.image_url IS NULL OR t.image_url = '')
ORDER BY t.part_number
`);
  logger.info(`Flexoptix: ${pending.rows.length} products need images`);

  const tally = { updated: 0, skipped: 0, errors: 0 };
  for (const record of pending.rows) {
    const sku = record.part_number as string;
    try {
      const imgUrl = await fetchFlexoptixImage(sku);
      if (!imgUrl) {
        tally.skipped += 1;
      } else {
        await updateImageUrl(record.id as string, imgUrl);
        tally.updated += 1;
        if (tally.updated % 25 === 0) {
          logger.info(`Flexoptix progress: ${tally.updated} updated so far`);
        }
      }
    } catch (failure) {
      logger.error(`Flexoptix error for ${sku}`, { error: (failure as Error).message });
      tally.errors += 1;
    }
    // Pace the requests.
    await sleep(100);
  }

  logger.info(`Flexoptix done: ${tally.updated} updated, ${tally.skipped} no-image, ${tally.errors} errors`);
  return tally;
}
// =============================================================================
// GAO Tek
// Fetch product page and extract og:image meta tag
// =============================================================================
// Origin of the GAO Tek site; used to derive `/product/<slug>/` page URLs
// for rows that have no stored product URL.
const GAOTEK_BASE = "https://gaotek.com";
/**
 * Pulls the Open Graph image URL out of an HTML document.
 *
 * Looks for a `<meta>` tag carrying `property="og:image"` and a `content`
 * attribute, in either attribute order and with single or double quotes.
 *
 * The captured value is raw attribute text, so ampersand character references
 * (`&amp;`, `&#38;` / `&#038;`, `&#x26;`) are decoded and surrounding
 * whitespace is trimmed before the URL is returned.
 *
 * @param html - HTML source of a product page.
 * @returns The image URL, or `null` when no tag is found, the value is empty,
 *   or the URL looks like a placeholder or logo (case-insensitive match).
 */
function extractOgImage(html: string): string | null {
  const match =
    html.match(/<meta[^>]+property=["']og:image["'][^>]+content=["']([^"']+)["']/i) ||
    html.match(/<meta[^>]+content=["']([^"']+)["'][^>]+property=["']og:image["']/i);
  if (!match) return null;

  // "&" in a query string is written as a character reference in attribute
  // values; storing it undecoded would corrupt the URL.
  const url = (match[1] ?? "").replace(/&(?:amp|#0*38|#x0*26);/gi, "&").trim();
  if (!url) return null;

  // Skip placeholder and logo images ("logo" also covers "mobilelogo").
  const lowered = url.toLowerCase();
  if (lowered.includes("placeholder") || lowered.includes("logo")) return null;

  return url;
}
/**
 * Fetches a product page and returns its og:image URL.
 *
 * @param productUrl - Absolute URL of the product page.
 * @returns The extracted image URL, or `null` when the page yields none.
 * @throws Whatever `fetchHtml` throws (HTTP error status, network, timeout).
 */
async function fetchGaoTekImage(productUrl: string): Promise<string | null> {
  return extractOgImage(await fetchHtml(productUrl));
}
/**
 * Backfills `image_url` for GAO Tek transceivers that have none.
 *
 * Each row gets one product page URL to try: the most recent gaotek URL
 * stored in `price_observations` when there is one, otherwise a URL derived
 * from the row's slug (minus a leading "scraped-"). Rows with a stored URL
 * are attempted first. Pages are fetched one at a time with a 100 ms pause.
 *
 * @returns Counters: `updated` rows written, `skipped` pages without a usable
 *   og:image, `errors` attempts that threw (HTTP 404/403 included).
 */
async function backfillGaoTek(): Promise<{ updated: number; skipped: number; errors: number }> {
  logger.info("=== GAO Tek image backfill starting ===");

  // Rows that already have a product URL on record (latest observation wins).
  const observed = await pool.query(`
SELECT DISTINCT ON (t.id)
t.id, t.part_number, t.slug, po.url as product_url
FROM transceivers t
JOIN vendors v ON t.vendor_id = v.id
LEFT JOIN price_observations po ON po.transceiver_id = t.id
WHERE v.name = 'GAO Tek'
AND (t.image_url IS NULL OR t.image_url = '')
AND po.url IS NOT NULL
AND po.url LIKE '%gaotek%'
ORDER BY t.id, po.time DESC
`);
  // Every GAO Tek row still lacking an image.
  const missing = await pool.query(`
SELECT t.id, t.part_number, t.slug
FROM transceivers t
JOIN vendors v ON t.vendor_id = v.id
WHERE v.name = 'GAO Tek'
AND (t.image_url IS NULL OR t.image_url = '')
`);

  type Candidate = { id: string; partNumber: string; productUrl: string };
  const fromObservations: Candidate[] = observed.rows.map((r) => ({
    id: r.id as string,
    partNumber: r.part_number as string,
    productUrl: r.product_url as string,
  }));
  const coveredIds = new Set(fromObservations.map((candidate) => candidate.id));
  const fromSlugs: Candidate[] = missing.rows
    .filter((r) => !coveredIds.has(r.id as string))
    .map((r) => {
      const rawSlug = (r.slug as string).replace(/^scraped-/, "");
      return {
        id: r.id as string,
        partNumber: r.part_number as string,
        productUrl: `${GAOTEK_BASE}/product/${rawSlug}/`,
      };
    });
  // Known URLs first, then the slug-derived guesses.
  const toProcess: Candidate[] = [...fromObservations, ...fromSlugs];
  logger.info(`GAO Tek: ${toProcess.length} products to attempt`);

  let updated = 0;
  let skipped = 0;
  let errors = 0;
  for (const { id, productUrl } of toProcess) {
    try {
      const imgUrl = await fetchGaoTekImage(productUrl);
      if (!imgUrl) {
        skipped++;
      } else {
        await updateImageUrl(id, imgUrl);
        updated++;
        if (updated % 25 === 0) {
          logger.info(`GAO Tek progress: ${updated} updated so far`);
        }
      }
    } catch (err) {
      const msg = (err as Error).message;
      // 404/403 responses are counted but not logged.
      const isQuietStatus = msg.includes("HTTP 404") || msg.includes("HTTP 403");
      if (!isQuietStatus) {
        logger.warn(`GAO Tek error for ${productUrl}`, { error: msg.slice(0, 80) });
      }
      errors++;
    }
    await sleep(100);
  }

  logger.info(`GAO Tek done: ${updated} updated, ${skipped} no-image, ${errors} errors/404s`);
  return { updated, skipped, errors };
}
// =============================================================================
// Other vendors — og:image from stored price_observations URLs
// =============================================================================
// Vendor names handled by the generic og:image pass. The list is bound as the
// array parameter of `v.name = ANY($1)`, so each entry must equal the
// `vendors.name` value exactly.
const OTHER_VENDOR_NAMES = [
"T&S Communication",
"Ascent Optics",
"ATGBICS",
"Skylane Optics",
"SmartOptics",
"ProLabs",
"FS.COM",
"GBICS",
"Fluxlight",
];
/**
 * Backfills `image_url` for the vendors in `OTHER_VENDOR_NAMES` by scraping
 * og:image from each product's most recently observed page URL.
 *
 * Only rows with no image and at least one http(s) URL in
 * `price_observations` are considered. Pages are fetched one at a time with a
 * 100 ms pause. A failure on one row never stops the run: it is counted, and
 * anything other than an HTTP 404/403 is logged as a warning so that database
 * errors and outages do not pass unnoticed.
 *
 * @returns `total` rows attempted and `updated` rows written.
 */
async function backfillOtherVendors(): Promise<{ total: number; updated: number }> {
  logger.info("=== Other vendors og:image backfill starting ===");
  const rows = await pool.query(`
    SELECT DISTINCT ON (t.id)
      t.id, t.part_number, v.name as vendor_name, po.url as product_url
    FROM transceivers t
    JOIN vendors v ON t.vendor_id = v.id
    JOIN price_observations po ON po.transceiver_id = t.id
    WHERE v.name = ANY($1)
      AND (t.image_url IS NULL OR t.image_url = '')
      AND po.url IS NOT NULL
      AND po.url ~ '^https?://'
    ORDER BY t.id, po.time DESC
  `, [OTHER_VENDOR_NAMES]);
  logger.info(`Other vendors: ${rows.rows.length} products with URLs to process`);

  let updated = 0;
  let errors = 0;
  for (const row of rows.rows) {
    const productUrl = row.product_url as string;
    try {
      const html = await fetchHtml(productUrl);
      const imgUrl = extractOgImage(html);
      if (imgUrl) {
        await updateImageUrl(row.id as string, imgUrl);
        updated++;
      }
    } catch (err) {
      errors++;
      const msg = err instanceof Error ? err.message : String(err);
      // Dead or blocked product pages are expected and stay quiet; everything
      // else (timeouts, DB update failures, ...) is worth a warning.
      if (!msg.includes("HTTP 404") && !msg.includes("HTTP 403")) {
        logger.warn(`Other vendors error for ${productUrl}`, { error: msg.slice(0, 80) });
      }
    }
    await sleep(100);
  }

  logger.info(`Other vendors done: ${updated} / ${rows.rows.length} updated`, { errors });
  return { total: rows.rows.length, updated };
}
// =============================================================================
// Main
// =============================================================================
/**
 * Runs the whole backfill: verifies the DB connection, then executes the
 * Optcore, Flexoptix, GAO Tek and other-vendor stages in that order.
 *
 * A stage that throws is logged and recorded as `{ error }` in the results;
 * the remaining stages still run. If the initial connection check fails the
 * process exits with code 1.
 */
async function main(): Promise<void> {
  logger.info("=== TIP Image Backfill Script ===");
  const dbHost = process.env.POSTGRES_HOST ?? "localhost";
  const dbPort = process.env.POSTGRES_PORT ?? "5433";
  logger.info(`DB: ${dbHost}:${dbPort}`);

  try {
    await pool.query("SELECT 1");
    logger.info("DB connection OK");
  } catch (err) {
    logger.error("DB connection failed", { error: (err as Error).message });
    process.exit(1);
  }

  const startTime = Date.now();
  const results: Record<string, unknown> = {};

  // key = property in `results`, label = vendor name used in the failure log.
  const stages: Array<{ key: string; label: string; run: () => Promise<unknown> }> = [
    { key: "optcore", label: "Optcore", run: backfillOptcore },
    { key: "flexoptix", label: "Flexoptix", run: backfillFlexoptix },
    { key: "gaotek", label: "GAO Tek", run: backfillGaoTek },
    { key: "others", label: "Other vendors", run: backfillOtherVendors },
  ];
  for (const stage of stages) {
    try {
      results[stage.key] = await stage.run();
    } catch (err) {
      const message = (err as Error).message;
      logger.error(`${stage.label} backfill failed`, { error: message });
      results[stage.key] = { error: message };
    }
  }

  const elapsedSec = ((Date.now() - startTime) / 1000).toFixed(1);
  logger.info("=== Backfill complete ===", { results, elapsedSec });
}
// Script entry point: run the backfill, then close the pool. Anything that
// escapes is logged, the pool is asked to close, and the process exits with 1.
const exitAfterFatalError = (err: unknown): void => {
  logger.error("Fatal error", { error: (err as Error).message });
  pool.end();
  process.exit(1);
};

main()
  .then(() => pool.end())
  .catch(exitAfterFatalError);