- Add TIP Proxy Network (packages/proxy-agent): SOCKS5 proxy agent for residential IP bypass of CloudFront WAF blocks
- Add /api/proxy/* routes: node registration, heartbeat, load balancing
- Add image extraction to Flexoptix catalog scraper (GraphQL small_image)
- Add image extraction to Optcore scraper (Playwright gallery img)
- Fix Fluxlight price scraping (BigCommerce HTML structure: data-product-price-without-tax)
- Add SmartOptics scraper (8 DWDM/coherent products, og:image extraction)
- Fix findOrCreateScrapedTransceiver to update image_url for existing records
- Add image backfill script (backfill-images.ts): 178 Flexoptix images added
- Fix DB connection pool: max 5, idleTimeoutMillis 10s (was unlimited, caused >100 connections)
- Add proxy.ts utility for scraper proxy rotation
541 lines
16 KiB
TypeScript
541 lines
16 KiB
TypeScript
/**
 * Image Backfill Script
 *
 * Fills `image_url` for transceivers that currently have none.
 * Priority order:
 *   1. Optcore (2,580 products) — WP REST API featured_media per product
 *   2. Flexoptix (344 products) — Magento GraphQL image fields
 *   3. GAO Tek (413 products) — product page og:image scrape
 *   4. Other vendors — og:image from stored price_observations URLs
 *
 * Run on Erik:
 *   node packages/scraper/dist/utils/backfill-images.js
 *
 * Or locally with SSH port-forward:
 *   ssh -L 5433:127.0.0.1:5433 erik -N &
 *   node dist/utils/backfill-images.js
 */
|
|
|
|
import { pool } from "./db";
|
|
import { logger } from "./logger";
|
|
|
|
/**
 * Pauses the calling async flow for the given number of milliseconds.
 *
 * @param ms Delay handed straight to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
|
|
|
/**
 * Stores a discovered image URL on a single transceiver row.
 *
 * Besides `image_url`, the statement stamps `image_scraped_at` with the
 * database's current time and flips `has_image` to TRUE.
 *
 * @param id Primary key of the transceiver row to update.
 * @param imageUrl Image URL to persist.
 */
async function updateImageUrl(id: string, imageUrl: string): Promise<void> {
  const statement = `UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = TRUE WHERE id = $2`;
  // Parameter order follows the placeholders: $1 = URL, $2 = row id.
  const values = [imageUrl, id];
  await pool.query(statement, values);
}
|
|
|
|
/**
 * Performs an HTTP request and returns the response body parsed as JSON.
 *
 * A default `User-Agent` and `Accept: application/json` are applied first;
 * any headers supplied in `init` override them. Every request is bounded by
 * a 20 s timeout, which replaces any `signal` passed in `init`.
 *
 * @param url Absolute URL to request.
 * @param init Optional fetch options (method, body, headers, ...).
 * @returns The parsed JSON body, untyped; callers must narrow or cast it.
 * @throws Error `HTTP <status> for <url>` when the response status is not 2xx,
 *   plus whatever `fetch` / `Response.json()` throw (network, timeout, bad JSON).
 */
async function fetchJson(url: string, init?: RequestInit): Promise<unknown> {
  const headers = new Headers({
    "User-Agent": "Mozilla/5.0 (compatible; TIP-ImageBackfill/1.0)",
    Accept: "application/json",
  });

  // `init.headers` may be a plain object, an array of [name, value] pairs, or a
  // Headers instance. Object-spreading only works for the first form (the other
  // two spread to nothing useful), so normalise through Headers before merging.
  new Headers(init?.headers).forEach((value, name) => {
    headers.set(name, value);
  });

  const resp = await fetch(url, {
    ...init,
    headers,
    signal: AbortSignal.timeout(20000),
  });
  if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`);
  return resp.json();
}
|
|
|
|
/**
 * Downloads a page and returns its body as text.
 *
 * The request identifies itself with the backfill User-Agent, asks for
 * `text/html`, and is aborted after 20 s.
 *
 * @param url Absolute URL of the page to fetch.
 * @returns The response body as a string.
 * @throws Error `HTTP <status> for <url>` when the response status is not 2xx.
 */
async function fetchHtml(url: string): Promise<string> {
  const requestOptions: RequestInit = {
    headers: {
      "User-Agent": "Mozilla/5.0 (compatible; TIP-ImageBackfill/1.0)",
      Accept: "text/html",
    },
    signal: AbortSignal.timeout(20000),
  };

  const response = await fetch(url, requestOptions);
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
|
|
|
|
// =============================================================================
// Optcore
// Strategy:
//   Step 1 — enumerate all products per category, collect slug -> featured_media id
//   Step 2 — resolve unique media IDs individually via /wp-json/wp/v2/media/<id>
//   Step 3 — update DB for matched slugs
//
// Note: _embed does NOT work for the "product" custom post type on Optcore.
// We must resolve media IDs separately.
// =============================================================================
|
|
|
|
/** Origin of the Optcore site; the WP REST endpoints used below hang off this. */
const OPTCORE_BASE = "https://www.optcore.net";

/** `product_cat` IDs that are walked when enumerating Optcore products. */
const OPTCORE_CATEGORY_IDS: number[] = [
  309,
  173,
  76,
  79,
  73,
  311,
  313,
  312,
  333,
  1088,
  59,
  1102,
  4097,
  77,
  4101,
  4092,
  6441,
];
|
|
|
|
/** The two fields requested per product from the Optcore product listing (`_fields=slug,featured_media`). */
interface OptcoreWpProductRaw {
  /** Media ID of the product's featured image; a falsy value is treated as "no image". */
  featured_media: number;
  /** Product slug; compared (lowercased) against DB part numbers. */
  slug: string;
}
|
|
|
|
/** One resized rendition of a media item. */
type OptcoreMediaRendition = { source_url: string };

/** Fields read from a `/wp-json/wp/v2/media/<id>` response. */
interface OptcoreMediaRaw {
  id: number;
  /** URL of the original upload; used when no resized rendition is listed. */
  source_url: string;
  media_details?: {
    /** Resized renditions; only `medium` and `large` are consulted. */
    sizes?: Partial<Record<"medium" | "large", OptcoreMediaRendition>>;
  };
}
|
|
|
|
/**
 * Resolves an Optcore media ID to an image URL.
 *
 * Preference order is the `medium` rendition, then `large`, then the media
 * item's own `source_url`. This is best-effort: any failure while fetching or
 * reading the response yields `null` rather than an exception.
 *
 * @param mediaId Media ID as reported in a product's `featured_media`.
 * @returns The chosen image URL, or `null` when the ID is 0, the lookup fails,
 *   or no non-empty URL is present.
 */
async function fetchOptcoreMediaUrl(mediaId: number): Promise<string | null> {
  if (mediaId === 0) {
    return null;
  }

  const endpoint = `${OPTCORE_BASE}/wp-json/wp/v2/media/${mediaId}?_fields=id,source_url,media_details`;
  try {
    const media = (await fetchJson(endpoint)) as OptcoreMediaRaw;
    const renditions = media.media_details?.sizes;
    const candidates = [
      renditions?.medium?.source_url,
      renditions?.large?.source_url,
      media.source_url,
    ];
    // First non-empty candidate wins.
    return candidates.find((candidate) => Boolean(candidate)) || null;
  } catch {
    return null;
  }
}
|
|
|
|
/**
 * Backfills `image_url` for Optcore transceivers that currently have none.
 *
 * Flow:
 *   1. Load image-less Optcore rows, keyed by lowercased part number.
 *   2. Walk every category in `OPTCORE_CATEGORY_IDS` through the product
 *      listing endpoint and record slug -> featured_media ID.
 *   3. Pair listed slugs with DB rows, then resolve media IDs to URLs ONLY for
 *      the paired ones (each lookup is a separate HTTP request, so resolving
 *      media for products that are not in the DB would be wasted traffic).
 *   4. Write the resolved URL to each paired row.
 *
 * @returns Counters:
 *   - `updated`:  rows that received an image URL
 *   - `skipped`:  paired rows whose media ID could not be resolved to a URL
 *   - `notFound`: listed slugs (with featured media) that match no image-less row
 */
async function backfillOptcore(): Promise<{ updated: number; skipped: number; notFound: number }> {
  logger.info("=== Optcore image backfill starting ===");

  const rows = await pool.query(`
    SELECT t.id, t.part_number
    FROM transceivers t
    JOIN vendors v ON t.vendor_id = v.id
    WHERE v.name = 'Optcore' AND (t.image_url IS NULL OR t.image_url = '')
  `);

  // Map: part_number (lowercase) -> DB id
  const byPartNumber = new Map<string, string>();
  for (const row of rows.rows) {
    byPartNumber.set((row.part_number as string).toLowerCase(), row.id as string);
  }
  logger.info(`Optcore: ${byPartNumber.size} products need images`);

  if (byPartNumber.size === 0) {
    return { updated: 0, skipped: 0, notFound: 0 };
  }

  // Step 1: enumerate all products across all transceiver categories
  const slugToMediaId = new Map<string, number>();
  const seenSlugs = new Set<string>();

  for (const catId of OPTCORE_CATEGORY_IDS) {
    let page = 1;
    while (true) {
      const apiUrl = `${OPTCORE_BASE}/wp-json/wp/v2/product?product_cat=${catId}&per_page=100&page=${page}&_fields=slug,featured_media`;
      try {
        const resp = await fetch(apiUrl, {
          headers: { "User-Agent": "Mozilla/5.0 (compatible; TIP-ImageBackfill/1.0)" },
          signal: AbortSignal.timeout(20000),
        });
        if (!resp.ok) {
          // Give up on this category, but leave a trace so partial enumeration is visible.
          logger.warn(`Optcore cat=${catId} page=${page} returned non-OK status`, { status: resp.status });
          break;
        }
        const totalPages = parseInt(resp.headers.get("X-WP-TotalPages") || "1", 10);
        const products = (await resp.json()) as OptcoreWpProductRaw[];

        for (const p of products) {
          // A product can be listed under several categories; keep the first sighting.
          // De-duplicate on the same lowercased key that is used for DB matching.
          const slug = p.slug.toLowerCase();
          if (seenSlugs.has(slug)) continue;
          seenSlugs.add(slug);
          if (p.featured_media) {
            slugToMediaId.set(slug, p.featured_media);
          }
        }

        if (page >= totalPages) break;
        page++;
      } catch (err) {
        logger.warn(`Optcore cat=${catId} page=${page} fetch failed`, { error: (err as Error).message });
        break;
      }
      // Small pause between listing pages.
      await sleep(100);
    }
  }

  logger.info(`Optcore: enumerated ${seenSlugs.size} WP products, ${slugToMediaId.size} have featured media`);

  // Step 2: pair listed slugs with DB rows before spending any requests on media lookups
  const matches: Array<{ dbId: string; mediaId: number }> = [];
  let notFound = 0;
  for (const [slug, mediaId] of slugToMediaId) {
    const dbId = byPartNumber.get(slug);
    if (!dbId) {
      notFound++;
      continue;
    }
    matches.push({ dbId, mediaId });
  }
  logger.info(`Optcore: ${matches.length} listed slugs match image-less DB rows`);

  // Step 3: resolve the unique media IDs that are actually needed
  const uniqueMediaIds = new Set(matches.map((m) => m.mediaId));
  logger.info(`Optcore: resolving ${uniqueMediaIds.size} unique media IDs`);

  const mediaIdToUrl = new Map<number, string>();
  let resolved = 0;
  for (const mediaId of uniqueMediaIds) {
    const url = await fetchOptcoreMediaUrl(mediaId);
    if (url) mediaIdToUrl.set(mediaId, url);
    resolved++;
    if (resolved % 20 === 0) {
      logger.info(`Optcore media resolution: ${resolved}/${uniqueMediaIds.size}`);
    }
    await sleep(100);
  }

  // Step 4: write resolved URLs to the paired rows
  let updated = 0;
  let skipped = 0;

  for (const { dbId, mediaId } of matches) {
    const imgUrl = mediaIdToUrl.get(mediaId);
    if (!imgUrl) {
      skipped++;
      continue;
    }

    await updateImageUrl(dbId, imgUrl);
    updated++;
    if (updated % 50 === 0) {
      logger.info(`Optcore DB updates: ${updated} so far`);
    }
  }

  logger.info(`Optcore done: ${updated} updated, ${skipped} no-media-url, ${notFound} slug-not-in-db`);
  return { updated, skipped, notFound };
}
|
|
|
|
// =============================================================================
// Flexoptix
// Magento GraphQL: /graphql — query by SKU, returns small_image.url
// =============================================================================
|
|
|
|
/** Origin of the Flexoptix shop. */
const FLEXOPTIX_BASE = "https://www.flexoptix.net";

/** GraphQL endpoint of the Flexoptix shop. */
const FLEXOPTIX_GRAPHQL = FLEXOPTIX_BASE + "/graphql";
|
|
|
|
/** An image field as returned by the product query: an object carrying `url`, or `null`. */
type FlexoptixImageField = { url: string } | null;

/** One product item as selected by the Flexoptix GraphQL product search. */
interface FlexoptixGqlProduct {
  name: string;
  sku: string;
  url_key: string;
  small_image: FlexoptixImageField;
  image: FlexoptixImageField;
  thumbnail: FlexoptixImageField;
}
|
|
|
|
/** Picks the first non-empty image URL of a product: small_image, then image, then thumbnail. */
function pickFlexoptixImageUrl(product: FlexoptixGqlProduct): string | null {
  return product.small_image?.url || product.image?.url || product.thumbnail?.url || null;
}

/**
 * Looks up a Flexoptix product image by SKU through the shop's GraphQL search.
 *
 * Everything from the first ":" onward is stripped from the SKU before
 * searching. Among the (at most 5) hits, the first whose stripped SKU equals
 * the canonical SKU case-insensitively wins; if none matches but the search
 * returned exactly one item, that item is used instead.
 *
 * @param sku Part number as stored in the DB, possibly with a ":…" suffix.
 * @returns The chosen image URL, or `null` when nothing suitable was found.
 * @throws Whatever `fetchJson` throws (HTTP error, timeout, invalid JSON).
 */
async function fetchFlexoptixImage(sku: string): Promise<string | null> {
  // Strip ":Sx" coding suffix to get canonical SKU for search
  const canonicalSku = sku.replace(/:.*$/, "").trim();

  const query = `{
    products(search: ${JSON.stringify(canonicalSku)}, pageSize: 5) {
      items {
        name
        sku
        url_key
        small_image { url }
        image { url }
        thumbnail { url }
      }
    }
  }`;

  const payload = (await fetchJson(FLEXOPTIX_GRAPHQL, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ query }),
  })) as {
    data?: { products?: { items: FlexoptixGqlProduct[] } };
  };

  const items = payload.data?.products?.items ?? [];
  const wanted = canonicalSku.toLowerCase();

  // Prefer exact SKU match (canonical, case-insensitive)
  const exact = items.find(
    (candidate) => candidate.sku.replace(/:.*$/, "").trim().toLowerCase() === wanted,
  );
  if (exact) {
    return pickFlexoptixImageUrl(exact);
  }

  // Fallback: a lone search hit is accepted even without an exact SKU match
  if (items.length === 1) {
    return pickFlexoptixImageUrl(items[0]);
  }

  return null;
}
|
|
|
|
/**
 * Backfills `image_url` for FLEXOPTIX transceivers that currently have none.
 *
 * Rows are processed one at a time in part-number order, with a 100 ms pause
 * after each. A failure on one row is logged and counted; it never aborts the run.
 *
 * @returns Counters: `updated` (image stored), `skipped` (lookup found no
 *   image), `errors` (lookup or DB write threw).
 */
async function backfillFlexoptix(): Promise<{ updated: number; skipped: number; errors: number }> {
  logger.info("=== Flexoptix image backfill starting ===");

  const { rows: pending } = await pool.query(`
    SELECT t.id, t.part_number
    FROM transceivers t
    JOIN vendors v ON t.vendor_id = v.id
    WHERE v.name = 'FLEXOPTIX' AND (t.image_url IS NULL OR t.image_url = '')
    ORDER BY t.part_number
  `);

  logger.info(`Flexoptix: ${pending.length} products need images`);

  const tally = { updated: 0, skipped: 0, errors: 0 };

  for (const record of pending) {
    const sku = record.part_number as string;
    try {
      const imgUrl = await fetchFlexoptixImage(sku);
      if (!imgUrl) {
        tally.skipped += 1;
      } else {
        await updateImageUrl(record.id as string, imgUrl);
        tally.updated += 1;
        // Progress heartbeat every 25 successful updates.
        if (tally.updated % 25 === 0) {
          logger.info(`Flexoptix progress: ${tally.updated} updated so far`);
        }
      }
    } catch (err) {
      logger.error(`Flexoptix error for ${sku}`, { error: (err as Error).message });
      tally.errors += 1;
    }
    await sleep(100);
  }

  logger.info(`Flexoptix done: ${tally.updated} updated, ${tally.skipped} no-image, ${tally.errors} errors`);
  return tally;
}
|
|
|
|
// =============================================================================
// GAO Tek
// Fetch product page and extract og:image meta tag
// =============================================================================
|
|
|
|
/** Origin of the GAO Tek site; product URLs are built on it when none is stored. */
const GAOTEK_BASE: string = "https://gaotek.com";
|
|
|
|
/** Named character references that can plausibly appear inside a URL attribute value. */
const OG_NAMED_ENTITIES: Record<string, string> = {
  amp: "&",
  quot: '"',
  apos: "'",
  lt: "<",
  gt: ">",
};

/**
 * Decodes numeric (`&#38;`, `&#x26;`) and the common named HTML character
 * references in one pass, so already-decoded output is never decoded again.
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(/&(#x[0-9a-f]+|#\d+|amp|quot|apos|lt|gt);/gi, (whole: string, ref: string) => {
    const key = ref.toLowerCase();
    if (key.startsWith("#")) {
      const codePoint = key.startsWith("#x") ? parseInt(key.slice(2), 16) : parseInt(key.slice(1), 10);
      // Out-of-range references are left untouched instead of throwing.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    }
    return OG_NAMED_ENTITIES[key] ?? whole;
  });
}

/**
 * Extracts the `og:image` URL from an HTML document.
 *
 * Both attribute orders are recognised (`property` before `content` and the
 * reverse). The attribute value is HTML-entity-decoded, because query strings
 * are commonly serialised as `?a=1&amp;b=2` / `&#038;` and would otherwise be
 * stored as a broken URL.
 *
 * @param html Raw page markup.
 * @returns The decoded image URL, or `null` when no tag is found, the value is
 *   empty, or the URL looks like a placeholder/logo (case-insensitive).
 */
function extractOgImage(html: string): string | null {
  const match =
    html.match(/<meta[^>]+property=["']og:image["'][^>]+content=["']([^"']+)["']/i) ||
    html.match(/<meta[^>]+content=["']([^"']+)["'][^>]+property=["']og:image["']/i);
  const raw = match?.[1];
  if (!raw) return null;

  const url = decodeHtmlEntities(raw).trim();
  if (!url) return null;

  // Skip placeholder and logo images ("logo" also covers "mobilelogo").
  const lowered = url.toLowerCase();
  if (lowered.includes("placeholder") || lowered.includes("logo")) return null;
  return url;
}
|
|
|
|
/**
 * Fetches a GAO Tek product page and pulls its `og:image` URL.
 *
 * @param productUrl Absolute URL of the product page.
 * @returns The image URL, or `null` when `extractOgImage` finds nothing usable.
 * @throws Whatever `fetchHtml` throws (non-2xx status, timeout, network error).
 */
async function fetchGaoTekImage(productUrl: string): Promise<string | null> {
  const markup = await fetchHtml(productUrl);
  const imageUrl = extractOgImage(markup);
  return imageUrl;
}
|
|
|
|
/**
 * Backfills `image_url` for GAO Tek transceivers that currently have none.
 *
 * Each row gets one product-page URL to try: the most recent gaotek URL stored
 * in `price_observations` when there is one, otherwise a URL built from the
 * row's slug (with a leading "scraped-" removed). Rows with a stored URL are
 * attempted first. The page's `og:image` is stored when found.
 *
 * @returns Counters: `updated` (image stored), `skipped` (page had no usable
 *   og:image), `errors` (fetch or DB write threw — includes 404/403 responses).
 */
async function backfillGaoTek(): Promise<{ updated: number; skipped: number; errors: number }> {
  logger.info("=== GAO Tek image backfill starting ===");

  type Target = { id: string; partNumber: string; productUrl: string };

  // Prefer records that have stored product URLs in price_observations
  const withUrlRows = await pool.query(`
    SELECT DISTINCT ON (t.id)
      t.id, t.part_number, t.slug, po.url as product_url
    FROM transceivers t
    JOIN vendors v ON t.vendor_id = v.id
    LEFT JOIN price_observations po ON po.transceiver_id = t.id
    WHERE v.name = 'GAO Tek'
      AND (t.image_url IS NULL OR t.image_url = '')
      AND po.url IS NOT NULL
      AND po.url LIKE '%gaotek%'
    ORDER BY t.id, po.time DESC
  `);

  // All records without images
  const allRows = await pool.query(`
    SELECT t.id, t.part_number, t.slug
    FROM transceivers t
    JOIN vendors v ON t.vendor_id = v.id
    WHERE v.name = 'GAO Tek'
      AND (t.image_url IS NULL OR t.image_url = '')
  `);

  const toProcess: Target[] = [];
  const idsWithStoredUrl = new Set<string>();

  // Known URLs first ...
  for (const row of withUrlRows.rows) {
    idsWithStoredUrl.add(row.id as string);
    toProcess.push({
      id: row.id as string,
      partNumber: row.part_number as string,
      productUrl: row.product_url as string,
    });
  }

  // ... then slug-derived URLs for everything not already covered.
  for (const row of allRows.rows) {
    if (!idsWithStoredUrl.has(row.id as string)) {
      const rawSlug = (row.slug as string).replace(/^scraped-/, "");
      toProcess.push({
        id: row.id as string,
        partNumber: row.part_number as string,
        productUrl: `${GAOTEK_BASE}/product/${rawSlug}/`,
      });
    }
  }

  logger.info(`GAO Tek: ${toProcess.length} products to attempt`);

  const tally = { updated: 0, skipped: 0, errors: 0 };

  for (const target of toProcess) {
    try {
      const imgUrl = await fetchGaoTekImage(target.productUrl);
      if (!imgUrl) {
        tally.skipped += 1;
      } else {
        await updateImageUrl(target.id, imgUrl);
        tally.updated += 1;
        if (tally.updated % 25 === 0) {
          logger.info(`GAO Tek progress: ${tally.updated} updated so far`);
        }
      }
    } catch (err) {
      const msg = (err as Error).message;
      // 404/403 are counted but not logged; anything else gets a warning.
      const isQuietStatus = msg.includes("HTTP 404") || msg.includes("HTTP 403");
      if (!isQuietStatus) {
        logger.warn(`GAO Tek error for ${target.productUrl}`, { error: msg.slice(0, 80) });
      }
      tally.errors += 1;
    }
    await sleep(100);
  }

  logger.info(`GAO Tek done: ${tally.updated} updated, ${tally.skipped} no-image, ${tally.errors} errors/404s`);
  return tally;
}
|
|
|
|
// =============================================================================
// Other vendors — og:image from stored price_observations URLs
// =============================================================================
|
|
|
|
/** Vendor names (compared against `vendors.name`) handled by the generic og:image pass. */
const OTHER_VENDOR_NAMES: string[] = [
  "T&S Communication", "Ascent Optics", "ATGBICS",
  "Skylane Optics", "SmartOptics", "ProLabs",
  "FS.COM", "GBICS", "Fluxlight",
];
|
|
|
|
/**
 * Backfills `image_url` for the vendors in `OTHER_VENDOR_NAMES` by scraping
 * `og:image` from each product's most recent stored `price_observations` URL.
 *
 * Processing is best-effort: a failure on one product never aborts the run.
 * Failures are counted, and anything other than an HTTP 404/403 (e.g. a
 * timeout or a failed DB write) is logged as a warning so it does not vanish
 * silently.
 *
 * @returns `total` — number of products that had a usable URL;
 *   `updated` — number of rows that received an image URL.
 */
async function backfillOtherVendors(): Promise<{ total: number; updated: number }> {
  logger.info("=== Other vendors og:image backfill starting ===");

  const rows = await pool.query(`
    SELECT DISTINCT ON (t.id)
      t.id, t.part_number, v.name as vendor_name, po.url as product_url
    FROM transceivers t
    JOIN vendors v ON t.vendor_id = v.id
    JOIN price_observations po ON po.transceiver_id = t.id
    WHERE v.name = ANY($1)
      AND (t.image_url IS NULL OR t.image_url = '')
      AND po.url IS NOT NULL
      AND po.url ~ '^https?://'
    ORDER BY t.id, po.time DESC
  `, [OTHER_VENDOR_NAMES]);

  logger.info(`Other vendors: ${rows.rows.length} products with URLs to process`);

  let updated = 0;
  let failed = 0;

  for (const row of rows.rows) {
    const productUrl = row.product_url as string;
    try {
      const html = await fetchHtml(productUrl);
      const imgUrl = extractOgImage(html);
      if (imgUrl) {
        await updateImageUrl(row.id as string, imgUrl);
        updated++;
      }
    } catch (err) {
      failed++;
      const msg = err instanceof Error ? err.message : String(err);
      // Dead or blocked pages (404/403) are expected and stay quiet; everything
      // else is surfaced, mirroring the GAO Tek pass.
      if (!msg.includes("HTTP 404") && !msg.includes("HTTP 403")) {
        logger.warn(`Other vendors error for ${productUrl}`, { error: msg.slice(0, 80) });
      }
    }
    await sleep(100);
  }

  logger.info(`Other vendors done: ${updated} / ${rows.rows.length} updated`, { failed });
  return { total: rows.rows.length, updated };
}
|
|
|
|
// =============================================================================
// Main
// =============================================================================
|
|
|
|
/**
 * Runs the whole backfill: checks DB connectivity, then executes the Optcore,
 * Flexoptix, GAO Tek and other-vendor passes in that order.
 *
 * A failing pass is logged and recorded as `{ error }` in the summary; later
 * passes still run. If the initial connectivity check fails, the process exits
 * with code 1.
 */
async function main(): Promise<void> {
  logger.info("=== TIP Image Backfill Script ===");
  const dbHost = process.env.POSTGRES_HOST ?? "localhost";
  const dbPort = process.env.POSTGRES_PORT ?? "5433";
  logger.info(`DB: ${dbHost}:${dbPort}`);

  try {
    await pool.query("SELECT 1");
    logger.info("DB connection OK");
  } catch (err) {
    logger.error("DB connection failed", { error: (err as Error).message });
    process.exit(1);
  }

  const startTime = Date.now();
  const results: Record<string, unknown> = {};

  // Passes in execution order: `key` is the summary field, `label` prefixes the failure log.
  const passes: Array<{ key: string; label: string; run: () => Promise<unknown> }> = [
    { key: "optcore", label: "Optcore", run: backfillOptcore },
    { key: "flexoptix", label: "Flexoptix", run: backfillFlexoptix },
    { key: "gaotek", label: "GAO Tek", run: backfillGaoTek },
    { key: "others", label: "Other vendors", run: backfillOtherVendors },
  ];

  for (const pass of passes) {
    try {
      results[pass.key] = await pass.run();
    } catch (err) {
      const message = (err as Error).message;
      logger.error(`${pass.label} backfill failed`, { error: message });
      results[pass.key] = { error: message };
    }
  }

  const elapsedSec = ((Date.now() - startTime) / 1000).toFixed(1);
  logger.info("=== Backfill complete ===", { results, elapsedSec });
}
|
|
|
|
// Script entry point: run the backfill, then close the DB pool. Any rejection
// (from the run itself or from closing the pool) is logged, the pool is asked
// to close, and the process exits with code 1.
void (async () => {
  try {
    await main();
    await pool.end();
  } catch (err) {
    logger.error("Fatal error", { error: (err as Error).message });
    pool.end();
    process.exit(1);
  }
})();
|