/**
 * Image Backfill Script
 *
 * Fills `image_url` for transceivers that currently have none.
 * Priority order:
 *   1. Optcore (2,580 products) — WP REST API featured_media per product
 *   2. Flexoptix (344 products) — Magento GraphQL image fields
 *   3. GAO Tek (413 products) — product page og:image scrape
 *   4. Other vendors — og:image from stored price_observations URLs
 *
 * Run on Erik:
 *   node packages/scraper/dist/utils/backfill-images.js
 *
 * Or locally with SSH port-forward:
 *   ssh -L 5433:127.0.0.1:5433 erik -N &
 *   node dist/utils/backfill-images.js
 */

import { pool } from "./db";
import { logger } from "./logger";

/** User-Agent header sent with every outbound HTTP request made by this script. */
const USER_AGENT = "Mozilla/5.0 (compatible; TIP-ImageBackfill/1.0)";

/** Per-request timeout in milliseconds, enforced through `AbortSignal.timeout`. */
const REQUEST_TIMEOUT_MS = 20000;

/** Pause in milliseconds inserted between consecutive remote requests. */
const REQUEST_DELAY_MS = 100;

/** Extracts a loggable message from a caught value without assuming it is an `Error`. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => setTimeout(resolve, ms));
}

/**
 * Stores an image URL on one transceiver row, stamping `image_scraped_at`
 * with the current time and setting `has_image` to TRUE.
 *
 * @param id - Primary key of the `transceivers` row to update.
 * @param imageUrl - Image URL to store in `image_url`.
 */
async function updateImageUrl(id: string, imageUrl: string): Promise<void> {
  await pool.query(
    `UPDATE transceivers
        SET image_url = $1,
            image_scraped_at = NOW(),
            has_image = TRUE
      WHERE id = $2`,
    [imageUrl, id]
  );
}

/**
 * Fetches `url` and returns the parsed JSON body.
 *
 * Default `User-Agent` / `Accept` headers are applied first; headers supplied
 * in `init` are spread over them. The timeout signal always replaces any
 * `signal` passed in `init`.
 *
 * NOTE(review): `init.headers` is spread as a plain object; a `Headers`
 * instance or tuple array would not merge correctly. The only caller in this
 * file passes a plain object.
 *
 * @param url - Absolute URL to request.
 * @param init - Optional extra `fetch` options (method, body, headers, ...).
 * @returns The parsed response body; callers must narrow it themselves.
 * @throws Error with message `HTTP <status> for <url>` on a non-2xx response.
 */
async function fetchJson(url: string, init?: RequestInit): Promise<unknown> {
  const resp = await fetch(url, {
    ...init,
    headers: {
      "User-Agent": USER_AGENT,
      Accept: "application/json",
      ...(init?.headers ?? {}),
    },
    signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
  });
  if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`);
  return resp.json();
}

/**
 * Fetches `url` and returns the response body as text.
 *
 * @param url - Absolute URL of the page to download.
 * @throws Error with message `HTTP <status> for <url>` on a non-2xx response.
 */
async function fetchHtml(url: string): Promise<string> {
  const resp = await fetch(url, {
    headers: {
      "User-Agent": USER_AGENT,
      Accept: "text/html",
    },
    signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
  });
  if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`);
  return resp.text();
}

// =============================================================================
// Optcore
// Strategy:
//   Step 1 — enumerate all products per category, collect slug -> featured_media id
//   Step 2 — resolve unique media IDs individually via /wp-json/wp/v2/media/{id}
//   Step 3 — update DB for matched slugs
//
// Note: _embed does NOT work for the "product" custom post type on Optcore.
// We must resolve media IDs separately.
// =============================================================================

const OPTCORE_BASE = "https://www.optcore.net";

const OPTCORE_CATEGORY_IDS = [
  309, 173, 76, 79, 73, 311, 313, 312, 333, 1088, 59, 1102, 4097, 77, 4101, 4092, 6441,
];

/** Subset of a product record requested from the Optcore product endpoint (`_fields=slug,featured_media`). */
interface OptcoreWpProductRaw {
  slug: string;
  featured_media: number;
}

/** Subset of a media record requested from the Optcore media endpoint. */
interface OptcoreMediaRaw {
  id: number;
  source_url: string;
  media_details?: {
    sizes?: {
      medium?: { source_url: string };
      large?: { source_url: string };
    };
  };
}

/**
 * Resolves a media ID to an image URL.
 *
 * Preference order: `medium` size, then `large` size, then the top-level
 * `source_url`.
 *
 * @param mediaId - Media ID; `0` is treated as "no media" and skips the request.
 * @returns The image URL, or `null` when the ID is 0, no URL is present, or
 *   the request fails (failures are deliberately swallowed: best effort).
 */
async function fetchOptcoreMediaUrl(mediaId: number): Promise<string | null> {
  if (mediaId === 0) return null;
  try {
    const data = (await fetchJson(
      `${OPTCORE_BASE}/wp-json/wp/v2/media/${mediaId}?_fields=id,source_url,media_details`
    )) as OptcoreMediaRaw;
    return (
      data.media_details?.sizes?.medium?.source_url ||
      data.media_details?.sizes?.large?.source_url ||
      data.source_url ||
      null
    );
  } catch {
    return null;
  }
}

/**
 * Backfills images for Optcore transceivers that have no `image_url`.
 *
 * Remote products are matched to DB rows by comparing the lower-cased remote
 * slug with the lower-cased DB `part_number`.
 *
 * @returns Counters: `updated` rows written, `skipped` matches whose media ID
 *   could not be resolved to a URL, `notFound` remote slugs with no DB row
 *   awaiting an image.
 */
async function backfillOptcore(): Promise<{ updated: number; skipped: number; notFound: number }> {
  logger.info("=== Optcore image backfill starting ===");

  const rows = await pool.query(`
    SELECT t.id, t.part_number
    FROM transceivers t
    JOIN vendors v ON t.vendor_id = v.id
    WHERE v.name = 'Optcore'
      AND (t.image_url IS NULL OR t.image_url = '')
  `);

  // Map: part_number (lowercase) -> DB id
  const byPartNumber = new Map<string, string>();
  for (const row of rows.rows) {
    const partNumber = row.part_number as string | null;
    // A row without a part number cannot be matched to a slug; skipping it
    // avoids a TypeError that would abort the whole vendor.
    if (!partNumber) continue;
    byPartNumber.set(partNumber.toLowerCase(), row.id as string);
  }

  logger.info(`Optcore: ${byPartNumber.size} products need images`);

  if (byPartNumber.size === 0) {
    return { updated: 0, skipped: 0, notFound: 0 };
  }

  // Step 1: enumerate all products across all transceiver categories
  const slugToMediaId = new Map<string, number>();
  const seenSlugs = new Set<string>();

  for (const catId of OPTCORE_CATEGORY_IDS) {
    let page = 1;
    while (true) {
      const apiUrl = `${OPTCORE_BASE}/wp-json/wp/v2/product?product_cat=${catId}&per_page=100&page=${page}&_fields=slug,featured_media`;
      try {
        // Raw fetch instead of fetchJson: the pagination header is needed too.
        const resp = await fetch(apiUrl, {
          headers: { "User-Agent": USER_AGENT },
          signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
        });
        if (!resp.ok) break;

        const totalPages = parseInt(resp.headers.get("X-WP-TotalPages") || "1", 10);
        const products: unknown = await resp.json();

        // A non-array or empty page ends this category. Without this check an
        // unparseable X-WP-TotalPages header (NaN) could loop indefinitely.
        if (!Array.isArray(products) || products.length === 0) break;

        for (const p of products as OptcoreWpProductRaw[]) {
          if (seenSlugs.has(p.slug)) continue;
          seenSlugs.add(p.slug);
          if (p.featured_media) {
            slugToMediaId.set(p.slug.toLowerCase(), p.featured_media);
          }
        }

        if (page >= totalPages) break;
        page++;
      } catch (err) {
        logger.warn(`Optcore cat=${catId} page=${page} fetch failed`, {
          error: errorMessage(err),
        });
        break;
      }
      await sleep(REQUEST_DELAY_MS);
    }
  }

  logger.info(
    `Optcore: enumerated ${seenSlugs.size} WP products, ${slugToMediaId.size} have featured media`
  );

  // Step 2: resolve unique media IDs
  const uniqueMediaIds = new Set<number>(slugToMediaId.values());
  logger.info(`Optcore: resolving ${uniqueMediaIds.size} unique media IDs`);

  const mediaIdToUrl = new Map<number, string>();
  let resolved = 0;
  for (const mediaId of uniqueMediaIds) {
    const url = await fetchOptcoreMediaUrl(mediaId);
    if (url) mediaIdToUrl.set(mediaId, url);
    resolved++;
    if (resolved % 20 === 0) {
      logger.info(`Optcore media resolution: ${resolved}/${uniqueMediaIds.size}`);
    }
    await sleep(REQUEST_DELAY_MS);
  }

  // Step 3: match slugs to DB records and update
  let updated = 0;
  let skipped = 0;
  let notFound = 0;

  for (const [slug, mediaId] of slugToMediaId) {
    const dbId = byPartNumber.get(slug);
    if (!dbId) {
      notFound++;
      continue;
    }

    const imgUrl = mediaIdToUrl.get(mediaId);
    if (!imgUrl) {
      skipped++;
      continue;
    }

    await updateImageUrl(dbId, imgUrl);
    byPartNumber.delete(slug);
    updated++;

    if (updated % 50 === 0) {
      logger.info(`Optcore DB updates: ${updated} so far`);
    }
  }

  logger.info(
    `Optcore done: ${updated} updated, ${skipped} no-media-url, ${notFound} slug-not-in-db`
  );
  return { updated, skipped, notFound };
}

// =============================================================================
// Flexoptix
// Magento GraphQL: /graphql — query by SKU, returns small_image.url
// =============================================================================

const FLEXOPTIX_BASE = "https://www.flexoptix.net";
const FLEXOPTIX_GRAPHQL = `${FLEXOPTIX_BASE}/graphql`;

/** Product fields selected by the Flexoptix GraphQL query below. */
interface FlexoptixGqlProduct {
  name: string;
  sku: string;
  url_key: string;
  small_image: { url: string } | null;
  image: { url: string } | null;
  thumbnail: { url: string } | null;
}

/** Picks the first available image URL: small_image, then image, then thumbnail. */
function pickFlexoptixImageUrl(item: FlexoptixGqlProduct): string | null {
  return item.small_image?.url || item.image?.url || item.thumbnail?.url || null;
}

/**
 * Looks up a product image on Flexoptix via the GraphQL product search.
 *
 * @param sku - Part number as stored in the DB; anything from the first `:`
 *   onward (the ":Sx" coding suffix) is removed before searching.
 * @returns The image URL of the exact (case-insensitive) SKU match, or of the
 *   sole search result when there is exactly one, otherwise `null`.
 * @throws Propagates request errors from `fetchJson`.
 */
async function fetchFlexoptixImage(sku: string): Promise<string | null> {
  // Strip ":Sx" coding suffix to get canonical SKU for search
  const canonicalSku = sku.replace(/:.*$/, "").trim();
  const wantedSku = canonicalSku.toLowerCase();

  const query = `{
    products(search: ${JSON.stringify(canonicalSku)}, pageSize: 5) {
      items {
        name
        sku
        url_key
        small_image { url }
        image { url }
        thumbnail { url }
      }
    }
  }`;

  const data = (await fetchJson(FLEXOPTIX_GRAPHQL, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ query }),
  })) as {
    data?: { products?: { items: FlexoptixGqlProduct[] } };
  };

  const items = data.data?.products?.items ?? [];

  // Prefer exact SKU match (canonical, case-insensitive)
  for (const item of items) {
    const itemSku = item.sku.replace(/:.*$/, "").trim().toLowerCase();
    if (itemSku === wantedSku) {
      return pickFlexoptixImageUrl(item);
    }
  }

  // Fallback: single result
  const [onlyItem] = items;
  if (items.length === 1 && onlyItem) {
    return pickFlexoptixImageUrl(onlyItem);
  }

  return null;
}

/**
 * Backfills images for FLEXOPTIX transceivers that have no `image_url`,
 * one GraphQL lookup per part number.
 *
 * @returns Counters: `updated` rows written, `skipped` products for which no
 *   image was found, `errors` lookups or updates that threw.
 */
async function backfillFlexoptix(): Promise<{ updated: number; skipped: number; errors: number }> {
  logger.info("=== Flexoptix image backfill starting ===");

  const rows = await pool.query(`
    SELECT t.id, t.part_number
    FROM transceivers t
    JOIN vendors v ON t.vendor_id = v.id
    WHERE v.name = 'FLEXOPTIX'
      AND (t.image_url IS NULL OR t.image_url = '')
    ORDER BY t.part_number
  `);

  logger.info(`Flexoptix: ${rows.rows.length} products need images`);

  let updated = 0;
  let skipped = 0;
  let errors = 0;

  for (const row of rows.rows) {
    const sku = row.part_number as string;
    try {
      const imgUrl = await fetchFlexoptixImage(sku);
      if (imgUrl) {
        await updateImageUrl(row.id as string, imgUrl);
        updated++;
        if (updated % 25 === 0) {
          logger.info(`Flexoptix progress: ${updated} updated so far`);
        }
      } else {
        skipped++;
      }
    } catch (err) {
      logger.error(`Flexoptix error for ${sku}`, { error: errorMessage(err) });
      errors++;
    }
    await sleep(REQUEST_DELAY_MS);
  }

  logger.info(`Flexoptix done: ${updated} updated, ${skipped} no-image, ${errors} errors`);
  return { updated, skipped, errors };
}

// =============================================================================
// GAO Tek
// Fetch product page and extract og:image meta tag
// =============================================================================

const GAOTEK_BASE = "https://gaotek.com";

/**
 * Extracts the `og:image` URL from an HTML document.
 *
 * Both attribute orders are handled (`property` before `content` and the
 * reverse). URLs containing "placeholder" or "logo" are rejected.
 *
 * @param html - Raw HTML of a product page.
 * @returns The image URL with `&amp;` decoded to `&`, or `null` when no
 *   usable og:image tag is present.
 */
function extractOgImage(html: string): string | null {
  const match =
    html.match(/<meta[^>]+property=["']og:image["'][^>]+content=["']([^"']+)["']/i) ||
    html.match(/<meta[^>]+content=["']([^"']+)["'][^>]+property=["']og:image["']/i);
  const rawUrl = match?.[1];
  if (!rawUrl) return null;

  // Skip placeholder and logo images ("logo" also covers "mobilelogo").
  if (rawUrl.includes("placeholder") || rawUrl.includes("logo")) return null;

  // Attribute values are HTML-escaped; undo the ampersand escape so that
  // query strings are stored as usable URLs.
  return rawUrl.replace(/&amp;/g, "&");
}

/**
 * Downloads a GAO Tek product page and returns its og:image URL.
 *
 * @param productUrl - Absolute URL of the product page.
 * @returns The og:image URL or `null` if none is usable.
 * @throws Propagates request errors from `fetchHtml`.
 */
async function fetchGaoTekImage(productUrl: string): Promise<string | null> {
  const html = await fetchHtml(productUrl);
  return extractOgImage(html);
}

/**
 * Backfills images for GAO Tek transceivers that have no `image_url`.
 *
 * Rows with a stored gaotek URL in `price_observations` are processed first
 * using that URL; the remaining rows use a URL derived from the row's slug
 * (with a leading "scraped-" removed).
 *
 * @returns Counters: `updated` rows written, `skipped` rows with no usable
 *   image (or no slug to derive a URL from), `errors` requests that threw
 *   (including HTTP 404/403, which are counted but not logged).
 */
async function backfillGaoTek(): Promise<{ updated: number; skipped: number; errors: number }> {
  logger.info("=== GAO Tek image backfill starting ===");

  // Prefer records that have stored product URLs in price_observations
  const withUrlRows = await pool.query(`
    SELECT DISTINCT ON (t.id) t.id, t.part_number, t.slug, po.url as product_url
    FROM transceivers t
    JOIN vendors v ON t.vendor_id = v.id
    LEFT JOIN price_observations po ON po.transceiver_id = t.id
    WHERE v.name = 'GAO Tek'
      AND (t.image_url IS NULL OR t.image_url = '')
      AND po.url IS NOT NULL
      AND po.url LIKE '%gaotek%'
    ORDER BY t.id, po.time DESC
  `);

  // All records without images
  const allRows = await pool.query(`
    SELECT t.id, t.part_number, t.slug
    FROM transceivers t
    JOIN vendors v ON t.vendor_id = v.id
    WHERE v.name = 'GAO Tek'
      AND (t.image_url IS NULL OR t.image_url = '')
  `);

  const withUrlIds = new Set<string>(withUrlRows.rows.map((r: { id: unknown }) => r.id as string));

  let updated = 0;
  let skipped = 0;
  let errors = 0;

  // Combine: known URLs first, then slug-derived URLs
  const toProcess: Array<{ id: string; partNumber: string; productUrl: string }> = [];

  for (const row of withUrlRows.rows) {
    toProcess.push({
      id: row.id as string,
      partNumber: row.part_number as string,
      productUrl: row.product_url as string,
    });
  }

  for (const row of allRows.rows) {
    if (withUrlIds.has(row.id as string)) continue;

    const slug = row.slug as string | null;
    if (!slug) {
      // No stored URL and nothing to derive one from. Skipping here keeps a
      // single bad row from aborting the whole vendor with a TypeError.
      logger.warn(`GAO Tek: no slug or stored URL for ${String(row.part_number)}; skipping`);
      skipped++;
      continue;
    }

    const rawSlug = slug.replace(/^scraped-/, "");
    toProcess.push({
      id: row.id as string,
      partNumber: row.part_number as string,
      productUrl: `${GAOTEK_BASE}/product/${rawSlug}/`,
    });
  }

  logger.info(`GAO Tek: ${toProcess.length} products to attempt`);

  for (const item of toProcess) {
    try {
      const imgUrl = await fetchGaoTekImage(item.productUrl);
      if (imgUrl) {
        await updateImageUrl(item.id, imgUrl);
        updated++;
        if (updated % 25 === 0) {
          logger.info(`GAO Tek progress: ${updated} updated so far`);
        }
      } else {
        skipped++;
      }
    } catch (err) {
      const msg = errorMessage(err);
      // 404/403 are counted but not logged.
      if (!msg.includes("HTTP 404") && !msg.includes("HTTP 403")) {
        logger.warn(`GAO Tek error for ${item.productUrl}`, { error: msg.slice(0, 80) });
      }
      errors++;
    }
    await sleep(REQUEST_DELAY_MS);
  }

  logger.info(`GAO Tek done: ${updated} updated, ${skipped} no-image, ${errors} errors/404s`);
  return { updated, skipped, errors };
}

// =============================================================================
// Other vendors — og:image from stored price_observations URLs
// =============================================================================

const OTHER_VENDOR_NAMES = [
  "T&S Communication",
  "Ascent Optics",
  "ATGBICS",
  "Skylane Optics",
  "SmartOptics",
  "ProLabs",
  "FS.COM",
  "GBICS",
  "Fluxlight",
];

/**
 * Backfills images for the vendors in `OTHER_VENDOR_NAMES` by scraping the
 * og:image of the most recent http(s) URL stored in `price_observations` for
 * each transceiver without an `image_url`.
 *
 * Per-product failures (404s, timeouts, ...) never abort the run; they are
 * counted and reported in the summary log entry.
 *
 * @returns `total` products attempted and `updated` rows written.
 */
async function backfillOtherVendors(): Promise<{ total: number; updated: number }> {
  logger.info("=== Other vendors og:image backfill starting ===");

  const rows = await pool.query(
    `
    SELECT DISTINCT ON (t.id) t.id, t.part_number, v.name as vendor_name, po.url as product_url
    FROM transceivers t
    JOIN vendors v ON t.vendor_id = v.id
    JOIN price_observations po ON po.transceiver_id = t.id
    WHERE v.name = ANY($1)
      AND (t.image_url IS NULL OR t.image_url = '')
      AND po.url IS NOT NULL
      AND po.url ~ '^https?://'
    ORDER BY t.id, po.time DESC
  `,
    [OTHER_VENDOR_NAMES]
  );

  const total = rows.rows.length;
  logger.info(`Other vendors: ${total} products with URLs to process`);

  let updated = 0;
  let failed = 0;

  for (const row of rows.rows) {
    const productUrl = row.product_url as string;
    try {
      const html = await fetchHtml(productUrl);
      const imgUrl = extractOgImage(html);
      if (imgUrl) {
        await updateImageUrl(row.id as string, imgUrl);
        updated++;
      }
    } catch {
      // Best effort: 404s, timeouts, etc. are expected here. Count them so the
      // summary shows how many products could not be fetched.
      failed++;
    }
    await sleep(REQUEST_DELAY_MS);
  }

  logger.info(`Other vendors done: ${updated} / ${total} updated`, { failed });
  return { total, updated };
}

// =============================================================================
// Main
// =============================================================================

/**
 * Runs every vendor backfill in priority order.
 *
 * A failure in one vendor is logged and recorded in the results; the
 * remaining vendors still run. The process exits with code 1 if the initial
 * DB connectivity check fails.
 */
async function main(): Promise<void> {
  logger.info("=== TIP Image Backfill Script ===");
  logger.info(
    `DB: ${process.env.POSTGRES_HOST ?? "localhost"}:${process.env.POSTGRES_PORT ?? "5433"}`
  );

  try {
    await pool.query("SELECT 1");
    logger.info("DB connection OK");
  } catch (err) {
    logger.error("DB connection failed", { error: errorMessage(err) });
    process.exit(1);
  }

  const startTime = Date.now();
  const results: Record<string, unknown> = {};

  try {
    results.optcore = await backfillOptcore();
  } catch (err) {
    logger.error("Optcore backfill failed", { error: errorMessage(err) });
    results.optcore = { error: errorMessage(err) };
  }

  try {
    results.flexoptix = await backfillFlexoptix();
  } catch (err) {
    logger.error("Flexoptix backfill failed", { error: errorMessage(err) });
    results.flexoptix = { error: errorMessage(err) };
  }

  try {
    results.gaotek = await backfillGaoTek();
  } catch (err) {
    logger.error("GAO Tek backfill failed", { error: errorMessage(err) });
    results.gaotek = { error: errorMessage(err) };
  }

  try {
    results.others = await backfillOtherVendors();
  } catch (err) {
    logger.error("Other vendors backfill failed", { error: errorMessage(err) });
    results.others = { error: errorMessage(err) };
  }

  const elapsedSec = ((Date.now() - startTime) / 1000).toFixed(1);
  logger.info("=== Backfill complete ===", { results, elapsedSec });
}

main()
  .then(() => pool.end())
  .catch((err: unknown) => {
    logger.error("Fatal error", { error: errorMessage(err) });
    pool.end();
    process.exit(1);
  });