From 102678731881143bb640d05e5a031f786b40bdff Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Fri, 3 Apr 2026 21:13:03 +0200 Subject: [PATCH] feat: add proxy network, image backfill, and scraper improvements - Add TIP Proxy Network (packages/proxy-agent): SOCKS5 proxy agent for residential IP bypass of CloudFront WAF blocks - Add /api/proxy/* routes: node registration, heartbeat, load balancing - Add image extraction to Flexoptix catalog scraper (GraphQL small_image) - Add image extraction to Optcore scraper (Playwright gallery img) - Fix Fluxlight price scraping (BigCommerce HTML structure: data-product-price-without-tax) - Add SmartOptics scraper (8 DWDM/coherent products, og:image extraction) - Fix findOrCreateScrapedTransceiver to update image_url for existing records - Add image backfill script (backfill-images.ts): 178 Flexoptix images added - Fix DB connection pool: max 5, idleTimeoutMillis 10s (was unlimited, caused >100 connections) - Add proxy.ts utility for scraper proxy rotation --- CHANGELOG_PENDING.md | 4 + packages/api/src/routes/proxy.ts | 306 ++++++++++ packages/proxy-agent/README.md | 50 ++ packages/proxy-agent/package.json | 18 + packages/proxy-agent/src/agent.ts | 174 ++++++ packages/proxy-agent/src/bandwidth.ts | 50 ++ packages/proxy-agent/src/config.ts | 76 +++ packages/proxy-agent/src/heartbeat.ts | 82 +++ packages/proxy-agent/src/index.ts | 167 ++++++ packages/proxy-agent/src/socks5.ts | 181 ++++++ packages/proxy-agent/tsconfig.json | 18 + .../scraper/src/scrapers/flexoptix-catalog.ts | 8 + packages/scraper/src/scrapers/fluxlight.ts | 9 +- packages/scraper/src/scrapers/optcore.ts | 11 +- packages/scraper/src/scrapers/smartoptics.ts | 294 ++++------ packages/scraper/src/utils/backfill-images.ts | 540 ++++++++++++++++++ packages/scraper/src/utils/db.ts | 21 +- packages/scraper/src/utils/proxy.ts | 77 +++ 18 files changed, 1909 insertions(+), 177 deletions(-) create mode 100644 packages/api/src/routes/proxy.ts create mode 100644 packages/proxy-agent/README.md create mode 100644 packages/proxy-agent/package.json create mode 100644 packages/proxy-agent/src/agent.ts create mode 100644 packages/proxy-agent/src/bandwidth.ts create mode 100644 packages/proxy-agent/src/config.ts create mode 100644 packages/proxy-agent/src/heartbeat.ts create mode 100644 packages/proxy-agent/src/index.ts create mode 100644 packages/proxy-agent/src/socks5.ts create mode 100644 packages/proxy-agent/tsconfig.json create mode 100644 packages/scraper/src/utils/backfill-images.ts create mode 100644 packages/scraper/src/utils/proxy.ts diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md index edfc30e..05e5b42 100644 --- a/CHANGELOG_PENDING.md +++ b/CHANGELOG_PENDING.md @@ -18,6 +18,10 @@ Types: FEAT · FIX · UI · DATA · AI · INFRA {"d":"2026-04-02","t":"FEAT","m":"Forecast Engine: weighted demand index (0-100) from 6 signal types — capex 0.30, ai_clusters 0.25, ebay_velocity 0.20, lead_times 0.15, github 0.06, standards 0.04 — 3/9/12/18 month horizons for 5 technologies"} {"d":"2026-04-02","t":"FEAT","m":"NAS sync: datasheet/manual download — PDFs from product_documents organized into switches/transceivers/whitepapers/other"} {"d":"2026-04-02","t":"INFRA","m":"Scheduler: 50 total pg-boss jobs — 8 new prediction/forecast jobs with cron schedules"} +{"d":"2026-04-03","t":"FEAT","m":"TIP Proxy Network: residential proxy pool — contributor nodes donate bandwidth, SOCKS5 server (Node.js net only), register/heartbeat/next/rotate/stats API, round-robin routing with uptime+latency scoring"} +{"d":"2026-04-03","t":"FEAT","m":"@tip/proxy-agent: standalone CLI package — tip-agent start/status/stop, configurable bandwidth cap, 30s heartbeat, graceful shutdown"} +{"d":"2026-04-03","t":"UI","m":"Dashboard Network tab: node stats, join-the-network card with token generator, install command box, country breakdown table"} +{"d":"2026-04-03","t":"INFRA","m":"Mac Studio home node: tip-agent running on 192.168.178.213:1081, PROXY_URL=socks5://192.168.178.213:1081 set in PM2 env for scraper+api, ProLabs WAF bypass now active"} {"d":"2026-04-01","t":"FEAT","m":"Product Intelligence Layer (migration 020): product_issues table (forum/community bugs), condition+marketplace on price_observations, features JSONB on switches+transceivers"} {"d":"2026-04-01","t":"FEAT","m":"eBay Enricher: scrapes eBay.de for switch/transceiver listings — extracts features, description, refurbished prices, images — nightly via pg-boss"} {"d":"2026-04-01","t":"FEAT","m":"Community Issues Scraper: extracts known bugs/incompatibilities from Reddit, ServeTheHome, Arista Community, Cisco Community, NetworkEngineering SE"} diff --git a/packages/api/src/routes/proxy.ts b/packages/api/src/routes/proxy.ts new file mode 100644 index 0000000..49a5d09 --- /dev/null +++ b/packages/api/src/routes/proxy.ts @@ -0,0 +1,306 @@ +/** + * Proxy Network API routes + * + * POST /api/proxy/register — register new node, returns token + * POST /api/proxy/heartbeat — update node status + stats + * GET /api/proxy/nodes — list all nodes (admin) + * GET /api/proxy/next — get best available proxy for scraper use + * POST /api/proxy/rotate — mark node as used, get next + * GET /api/proxy/stats — network-wide stats + * + * Note: register + heartbeat are public (no auth required). + * nodes + rotate require admin token via X-Admin-Token header. + */ +import { Router, Request, Response } from "express"; +import * as crypto from "crypto"; +import { pool } from "../db/client"; + +export const proxyRouter = Router(); + +const ADMIN_TOKEN = process.env.TIP_ADMIN_TOKEN ?? "tip-admin-2026"; + +function requireAdmin(req: Request, res: Response): boolean { + const token = + req.headers["x-admin-token"] ?? + req.query["admin_token"] ?? + ""; + if (token !== ADMIN_TOKEN) { + res.status(401).json({ error: "Admin token required" }); + return false; + } + return true; +} + +function generateToken(): string { + return crypto.randomBytes(32).toString("hex"); +} + +// Round-robin index tracker (in-memory, resets on restart) +let rrIndex = 0; + +// POST /api/proxy/register +proxyRouter.post("/register", async (req: Request, res: Response) => { + try { + const { token: existingToken, name, owner_email, port, version } = req.body as { + token?: string; + name?: string; + owner_email?: string; + port?: number; + version?: string; + }; + + const ip = + (req.headers["x-forwarded-for"] as string | undefined)?.split(",")[0]?.trim() ?? + req.socket.remoteAddress ?? + ""; + + // If token provided, upsert — otherwise create new + const token = existingToken ?? generateToken(); + + const result = await pool.query( + `INSERT INTO proxy_nodes (token, name, owner_email, ip, port, version, status, last_seen) + VALUES ($1, $2, $3, $4, $5, $6, 'online', NOW()) + ON CONFLICT (token) DO UPDATE SET + name = COALESCE(EXCLUDED.name, proxy_nodes.name), + ip = EXCLUDED.ip, + port = COALESCE(EXCLUDED.port, proxy_nodes.port), + version = COALESCE(EXCLUDED.version, proxy_nodes.version), + status = 'online', + last_seen = NOW() + RETURNING id, token, name, status`, + [token, name ?? null, owner_email ?? null, ip, port ?? 1080, version ?? null] + ); + + const node = result.rows[0]; + res.json({ + success: true, + token: node.token, + nodeId: node.id, + message: "Node registered. Keep this token safe!", + }); + } catch (err) { + console.error("[proxy] register error:", err); + res.status(500).json({ error: "Registration failed" }); + } +}); + +// POST /api/proxy/heartbeat +proxyRouter.post("/heartbeat", async (req: Request, res: Response) => { + try { + const { + token, + ip, + port, + bytesProxied, + requestsProxied, + latencyMs, + version, + } = req.body as { + token: string; + ip?: string; + port?: number; + bytesProxied?: number; + requestsProxied?: number; + latencyMs?: number; + version?: string; + }; + + if (!token) { + res.status(400).json({ error: "token required" }); + return; + } + + const remoteIp = + ip ?? + (req.headers["x-forwarded-for"] as string | undefined)?.split(",")[0]?.trim() ?? + req.socket.remoteAddress ?? + ""; + + const bytesGb = (bytesProxied ?? 0) / (1024 ** 3); + + const result = await pool.query( + `UPDATE proxy_nodes SET + status = 'online', + last_seen = NOW(), + ip = COALESCE($2, ip), + port = COALESCE($3, port), + bytes_proxied = COALESCE($4, bytes_proxied), + requests_proxied = COALESCE($5, requests_proxied), + latency_ms = COALESCE($6, latency_ms), + version = COALESCE($7, version), + bandwidth_used_gb = COALESCE($8, bandwidth_used_gb) + WHERE token = $1 + RETURNING id, status`, + [ + token, + remoteIp || null, + port ?? null, + bytesProxied ?? null, + requestsProxied ?? null, + latencyMs ?? null, + version ?? null, + bytesGb > 0 ? bytesGb : null, + ] + ); + + if (result.rowCount === 0) { + res.status(404).json({ error: "Node not found. Please register first." }); + return; + } + + res.json({ success: true, status: "online" }); + } catch (err) { + console.error("[proxy] heartbeat error:", err); + res.status(500).json({ error: "Heartbeat failed" }); + } +}); + +// GET /api/proxy/nodes (admin) +proxyRouter.get("/nodes", async (req: Request, res: Response) => { + if (!requireAdmin(req, res)) return; + try { + const result = await pool.query( + `SELECT id, token, name, country_code, city, ip, port, protocol, + status, last_seen, bytes_proxied, requests_proxied, + uptime_pct, bandwidth_limit_gb, bandwidth_used_gb, + latency_ms, version, registered_at + FROM proxy_nodes + ORDER BY status = 'online' DESC, last_seen DESC` + ); + res.json({ success: true, nodes: result.rows, total: result.rowCount }); + } catch (err) { + console.error("[proxy] nodes error:", err); + res.status(500).json({ error: "Failed to fetch nodes" }); + } +}); + +// GET /api/proxy/next — best available proxy for scraper use +proxyRouter.get("/next", async (_req: Request, res: Response) => { + try { + const result = await pool.query( + `SELECT id, ip, port, protocol, country_code, city, latency_ms + FROM proxy_nodes + WHERE status = 'online' + AND last_seen > NOW() - INTERVAL '2 minutes' + AND (bandwidth_used_gb < bandwidth_limit_gb OR bandwidth_limit_gb = 0) + ORDER BY uptime_pct DESC NULLS LAST, latency_ms ASC NULLS LAST + LIMIT 10` + ); + + if (result.rowCount === 0) { + res.status(503).json({ + error: "No proxies available", + fallback: process.env.PROXY_URL ?? null, + }); + return; + } + + const nodes = result.rows; + const node = nodes[rrIndex % nodes.length]; + rrIndex = (rrIndex + 1) % nodes.length; + + res.json({ + success: true, + host: node.ip, + port: node.port, + protocol: node.protocol ?? "socks5", + nodeId: node.id, + country: node.country_code, + city: node.city, + latencyMs: node.latency_ms, + }); + } catch (err) { + console.error("[proxy] next error:", err); + res.status(500).json({ error: "Failed to get next proxy" }); + } +}); + +// POST /api/proxy/rotate — mark node as used, get next one +proxyRouter.post("/rotate", async (req: Request, res: Response) => { + if (!requireAdmin(req, res)) return; + try { + const { nodeId } = req.body as { nodeId?: string }; + if (nodeId) { + // Optionally mark as temporarily degraded (just log — no ban) + await pool.query( + `UPDATE proxy_nodes SET requests_proxied = requests_proxied + 1 WHERE id = $1`, + [nodeId] + ); + } + // Return next proxy + const result = await pool.query( + `SELECT id, ip, port, protocol, country_code + FROM proxy_nodes + WHERE status = 'online' + AND last_seen > NOW() - INTERVAL '2 minutes' + AND id != COALESCE($1::uuid, '00000000-0000-0000-0000-000000000000') + ORDER BY requests_proxied ASC, latency_ms ASC NULLS LAST + LIMIT 1`, + [nodeId ?? null] + ); + + if (result.rowCount === 0) { + res.status(503).json({ error: "No other proxies available" }); + return; + } + + const node = result.rows[0]; + res.json({ + success: true, + host: node.ip, + port: node.port, + protocol: node.protocol ?? "socks5", + nodeId: node.id, + country: node.country_code, + }); + } catch (err) { + console.error("[proxy] rotate error:", err); + res.status(500).json({ error: "Failed to rotate proxy" }); + } +}); + +// GET /api/proxy/stats — public network stats +proxyRouter.get("/stats", async (_req: Request, res: Response) => { + try { + const result = await pool.query(` + SELECT + COUNT(*) AS total_nodes, + COUNT(*) FILTER (WHERE status = 'online') AS online_nodes, + COUNT(*) FILTER (WHERE status = 'offline') AS offline_nodes, + COUNT(DISTINCT country_code) AS countries, + COALESCE(SUM(bandwidth_used_gb), 0) AS total_gb_proxied, + COALESCE(SUM(requests_proxied), 0) AS total_requests, + COALESCE(AVG(latency_ms) FILTER (WHERE status = 'online'), 0) AS avg_latency_ms + FROM proxy_nodes + `); + + const row = result.rows[0] ?? {}; + + // Per-country breakdown + const countryResult = await pool.query(` + SELECT country_code, COUNT(*) AS nodes, + COUNT(*) FILTER (WHERE status = 'online') AS online + FROM proxy_nodes + WHERE country_code IS NOT NULL + GROUP BY country_code + ORDER BY online DESC, nodes DESC + `); + + res.json({ + success: true, + network: { + totalNodes: Number(row.total_nodes ?? 0), + onlineNodes: Number(row.online_nodes ?? 0), + offlineNodes: Number(row.offline_nodes ?? 0), + countries: Number(row.countries ?? 0), + totalGbProxied: parseFloat(Number(row.total_gb_proxied ?? 0).toFixed(2)), + totalRequests: Number(row.total_requests ?? 0), + avgLatencyMs: Math.round(Number(row.avg_latency_ms ?? 0)), + }, + countries: countryResult.rows, + }); + } catch (err) { + console.error("[proxy] stats error:", err); + res.status(500).json({ error: "Failed to get stats" }); + } +}); diff --git a/packages/proxy-agent/README.md b/packages/proxy-agent/README.md new file mode 100644 index 0000000..d9f2b7b --- /dev/null +++ b/packages/proxy-agent/README.md @@ -0,0 +1,50 @@ +# @tip/proxy-agent + +Contribute bandwidth to the TIP (Transceiver Intelligence Platform) proxy network +and get free API access in return. + +## How it works + +- You run the agent on a residential or home server machine +- The agent starts a SOCKS5 proxy server on your machine +- TIP scrapers route through your IP to bypass datacenter IP blocks (e.g. CloudFront WAF) +- You get: free TIP API access, contributor badge, early access to new features + +## Install & Start + +```bash +# Generate a token first on https://transceiver-db.context-x.org/dashboard (Network tab) +# Then start the agent: +npx @tip/proxy-agent start --token YOUR_TOKEN +``` + +## Options + +``` +tip-agent start --token TOKEN [--port 1080] [--max-gb 10] [--name "My Node"] +tip-agent status +tip-agent stop +``` + +| Option | Default | Description | +|------------|---------------------------------------------|--------------------------| +| `--token` | (required) | Your TIP node token | +| `--port` | 1080 | SOCKS5 listen port | +| `--max-gb` | 10 | Max bandwidth to donate | +| `--name` | hostname | Node display name | +| `--server` | https://transceiver-db.context-x.org | TIP API server | + +## Protocol + +- SOCKS5 (RFC 1928) +- No-auth method +- TCP CONNECT only +- 30s connection timeout +- Heartbeat every 30s to routing server + +## Privacy + +- Only outbound TCP CONNECT requests are proxied (no BIND, no UDP) +- Bandwidth is capped at your configured limit +- No logs of proxied content are stored +- Your IP is visible to the target website (residential proxy model) diff --git a/packages/proxy-agent/package.json b/packages/proxy-agent/package.json new file mode 100644 index 0000000..cd6820a --- /dev/null +++ b/packages/proxy-agent/package.json @@ -0,0 +1,18 @@ +{ + "name": "@tip/proxy-agent", + "version": "1.0.0", + "description": "TIP Network Proxy Agent — contribute bandwidth, get free API access", + "bin": { "tip-agent": "./dist/index.js" }, + "main": "./dist/index.js", + "scripts": { + "build": "tsc", + "start": "node dist/index.js" + }, + "dependencies": { + "dotenv": "^16.0.0" + }, + "devDependencies": { + "typescript": "^5.0.0", + "@types/node": "^20.0.0" + } +} diff --git a/packages/proxy-agent/src/agent.ts b/packages/proxy-agent/src/agent.ts new file mode 100644 index 0000000..d8d86d8 --- /dev/null +++ b/packages/proxy-agent/src/agent.ts @@ -0,0 +1,174 @@ +/** + * Core agent logic — registers with TIP routing server, + * starts SOCKS5 server, manages heartbeat and graceful shutdown. + */ +import * as https from "https"; +import * as http from "http"; +import * as os from "os"; +import { createSocks5Server } from "./socks5"; +import { BandwidthTracker } from "./bandwidth"; +import { startHeartbeatLoop } from "./heartbeat"; +import { saveConfig, savePid, clearPid, AgentConfig } from "./config"; + +const AGENT_VERSION = "1.0.0"; + +interface RegisterResponse { + token: string; + nodeId: string; + message?: string; +} + +function postJson(url: string, body: unknown): Promise<{ status: number; data: unknown }> { + return new Promise((resolve, reject) => { + const json = JSON.stringify(body); + const parsed = new URL(url); + const mod = parsed.protocol === "https:" ? https : http; + + const req = mod.request( + { + hostname: parsed.hostname, + port: parsed.port || (parsed.protocol === "https:" ? 443 : 80), + path: parsed.pathname + parsed.search, + method: "POST", + headers: { + "Content-Type": "application/json", + "Content-Length": Buffer.byteLength(json), + "User-Agent": `TIP-Agent/${AGENT_VERSION}`, + }, + timeout: 15_000, + }, + (res) => { + const chunks: Buffer[] = []; + res.on("data", (c: Buffer) => chunks.push(c)); + res.on("end", () => { + try { + const data = JSON.parse(Buffer.concat(chunks).toString()); + resolve({ status: res.statusCode ?? 0, data }); + } catch { + resolve({ status: res.statusCode ?? 0, data: {} }); + } + }); + } + ); + + req.on("timeout", () => { req.destroy(); reject(new Error("timeout")); }); + req.on("error", reject); + req.write(json); + req.end(); + }); +} + +async function registerNode(cfg: AgentConfig): Promise { + const url = `${cfg.serverUrl}/api/proxy/register`; + console.log(`[agent] Registering with ${url}...`); + + const { status, data } = await postJson(url, { + token: cfg.token, + name: cfg.name, + port: cfg.port, + version: AGENT_VERSION, + }); + + if (status >= 200 && status < 300) { + const resp = data as RegisterResponse; + console.log(`[agent] Registered successfully. Node: ${resp.nodeId ?? "unknown"}`); + return resp; + } + + throw new Error(`Registration failed: HTTP ${status}`); +} + +export async function startAgent(cfg: AgentConfig): Promise { + const bandwidth = new BandwidthTracker(cfg.maxGb); + + // Register with server + await registerNode(cfg).catch((err) => { + console.warn(`[agent] Registration warning: ${err.message} (continuing anyway)`); + }); + + // Start SOCKS5 server + const server = createSocks5Server({ + port: cfg.port, + bandwidth, + onNewConnection: () => { + if (bandwidth.isLimitReached()) { + console.warn("[agent] Bandwidth limit reached — rejecting new connections"); + return false; + } + return true; + }, + }); + + await new Promise((resolve, reject) => { + server.listen(cfg.port, () => { + console.log(`[agent] SOCKS5 proxy listening on port ${cfg.port}`); + resolve(); + }); + server.on("error", reject); + }); + + // Save PID + savePid(process.pid); + saveConfig({ ...cfg, pid: process.pid, startedAt: new Date().toISOString() }); + + // Start heartbeat + const heartbeatTimer = startHeartbeatLoop( + cfg.serverUrl, + () => { + const stats = bandwidth.getStats(); + return { + token: cfg.token, + port: cfg.port, + bytesProxied: stats.bytesProxied, + requestsProxied: stats.requestsProxied, + version: AGENT_VERSION, + }; + }, + (result) => { + if (!result.ok) { + console.warn(`[agent] Heartbeat failed: ${result.error ?? "unknown error"}`); + } + } + ); + + console.log(`[agent] TIP Agent running. Max bandwidth: ${cfg.maxGb} GB`); + console.log(`[agent] Press Ctrl+C to stop.\n`); + + // Print stats every minute + const statsTimer = setInterval(() => { + const stats = bandwidth.getStats(); + const usedGb = (stats.bytesProxied / (1024 ** 3)).toFixed(3); + console.log( + `[agent] Stats: ${stats.requestsProxied} requests, ${usedGb} GB used / ${cfg.maxGb} GB limit` + ); + }, 60_000); + + // Graceful shutdown + const shutdown = (signal: string) => { + console.log(`\n[agent] Received ${signal}, shutting down...`); + clearInterval(heartbeatTimer); + clearInterval(statsTimer); + clearPid(); + server.close(() => { + console.log("[agent] SOCKS5 server closed. Goodbye."); + process.exit(0); + }); + setTimeout(() => process.exit(1), 5000); + }; + + process.on("SIGINT", () => shutdown("SIGINT")); + process.on("SIGTERM", () => shutdown("SIGTERM")); +} + +export function getLocalIp(): string { + const interfaces = os.networkInterfaces(); + for (const iface of Object.values(interfaces)) { + if (!iface) continue; + for (const addr of iface) { + if (addr.family === "IPv4" && !addr.internal) { + return addr.address; + } + } + } + return "127.0.0.1"; +} diff --git a/packages/proxy-agent/src/bandwidth.ts b/packages/proxy-agent/src/bandwidth.ts new file mode 100644 index 0000000..d6235b8 --- /dev/null +++ b/packages/proxy-agent/src/bandwidth.ts @@ -0,0 +1,50 @@ +/** + * Bandwidth tracker — monitors usage and enforces limits + */ + +export interface BandwidthStats { + bytesProxied: number; + requestsProxied: number; + bytesLimitHard: number; + limitReached: boolean; +} + +export class BandwidthTracker { + private bytesProxied = 0; + private requestsProxied = 0; + private readonly bytesLimit: number; + + constructor(maxGb: number) { + this.bytesLimit = maxGb * 1024 * 1024 * 1024; + } + + addBytes(n: number): void { + this.bytesProxied += n; + } + + incrementRequests(): void { + this.requestsProxied++; + } + + isLimitReached(): boolean { + return this.bytesProxied >= this.bytesLimit; + } + + getStats(): BandwidthStats { + return { + bytesProxied: this.bytesProxied, + requestsProxied: this.requestsProxied, + bytesLimitHard: this.bytesLimit, + limitReached: this.isLimitReached(), + }; + } + + getUsedGb(): number { + return this.bytesProxied / (1024 * 1024 * 1024); + } + + reset(): void { + this.bytesProxied = 0; + this.requestsProxied = 0; + } +} diff --git a/packages/proxy-agent/src/config.ts b/packages/proxy-agent/src/config.ts new file mode 100644 index 0000000..f6706c1 --- /dev/null +++ b/packages/proxy-agent/src/config.ts @@ -0,0 +1,76 @@ +/** + * Config management — persists agent config to ~/.tip-agent/config.json + */ +import * as fs from "fs"; +import * as path from "path"; +import * as os from "os"; + +const CONFIG_DIR = path.join(os.homedir(), ".tip-agent"); +const CONFIG_FILE = path.join(CONFIG_DIR, "config.json"); + +export interface AgentConfig { + token: string; + port: number; + maxGb: number; + name: string; + serverUrl: string; + pid?: number; + startedAt?: string; +} + +const DEFAULTS: Partial = { + port: 1080, + maxGb: 10, + name: os.hostname(), + serverUrl: "https://transceiver-db.context-x.org", +}; + +export function loadConfig(): AgentConfig | null { + if (!fs.existsSync(CONFIG_FILE)) return null; + try { + const raw = fs.readFileSync(CONFIG_FILE, "utf8"); + return JSON.parse(raw) as AgentConfig; + } catch { + return null; + } +} + +export function saveConfig(cfg: Partial): AgentConfig { + if (!fs.existsSync(CONFIG_DIR)) { + fs.mkdirSync(CONFIG_DIR, { recursive: true }); + } + const existing = loadConfig() ?? {}; + const merged = { ...DEFAULTS, ...existing, ...cfg } as AgentConfig; + fs.writeFileSync(CONFIG_FILE, JSON.stringify(merged, null, 2), "utf8"); + return merged; +} + +export function clearConfig(): void { + if (fs.existsSync(CONFIG_FILE)) { + fs.unlinkSync(CONFIG_FILE); + } +} + +export function getPidFile(): string { + return path.join(CONFIG_DIR, "agent.pid"); +} + +export function savePid(pid: number): void { + if (!fs.existsSync(CONFIG_DIR)) { + fs.mkdirSync(CONFIG_DIR, { recursive: true }); + } + fs.writeFileSync(getPidFile(), String(pid), "utf8"); +} + +export function loadPid(): number | null { + const pidFile = getPidFile(); + if (!fs.existsSync(pidFile)) return null; + const raw = fs.readFileSync(pidFile, "utf8").trim(); + const pid = parseInt(raw, 10); + return isNaN(pid) ? null : pid; +} + +export function clearPid(): void { + const pidFile = getPidFile(); + if (fs.existsSync(pidFile)) fs.unlinkSync(pidFile); +} diff --git a/packages/proxy-agent/src/heartbeat.ts b/packages/proxy-agent/src/heartbeat.ts new file mode 100644 index 0000000..c214a77 --- /dev/null +++ b/packages/proxy-agent/src/heartbeat.ts @@ -0,0 +1,82 @@ +/** + * Heartbeat — sends periodic status updates to TIP routing server. + * POST /api/proxy/heartbeat every 30s. + */ +import * as https from "https"; +import * as http from "http"; + +export interface HeartbeatPayload { + token: string; + ip?: string; + port: number; + bytesProxied: number; + requestsProxied: number; + latencyMs?: number; + version: string; +} + +export interface HeartbeatResult { + ok: boolean; + latencyMs: number; + error?: string; +} + +function postJson(url: string, body: unknown): Promise<{ status: number; latencyMs: number }> { + return new Promise((resolve, reject) => { + const start = Date.now(); + const json = JSON.stringify(body); + const parsed = new URL(url); + const mod = parsed.protocol === "https:" ? https : http; + + const req = mod.request( + { + hostname: parsed.hostname, + port: parsed.port || (parsed.protocol === "https:" ? 443 : 80), + path: parsed.pathname + parsed.search, + method: "POST", + headers: { + "Content-Type": "application/json", + "Content-Length": Buffer.byteLength(json), + "User-Agent": "TIP-Agent/1.0.0", + }, + timeout: 10_000, + }, + (res) => { + res.resume(); + resolve({ status: res.statusCode ?? 0, latencyMs: Date.now() - start }); + } + ); + + req.on("timeout", () => { req.destroy(); reject(new Error("timeout")); }); + req.on("error", reject); + req.write(json); + req.end(); + }); +} + +export async function sendHeartbeat( + serverUrl: string, + payload: HeartbeatPayload +): Promise { + const url = `${serverUrl}/api/proxy/heartbeat`; + try { + const { status, latencyMs } = await postJson(url, payload); + return { ok: status >= 200 && status < 300, latencyMs }; + } catch (err) { + return { ok: false, latencyMs: 0, error: String(err) }; + } +} + +export function startHeartbeatLoop( + serverUrl: string, + getPayload: () => HeartbeatPayload, + onResult?: (r: HeartbeatResult) => void, + intervalMs = 30_000 +): NodeJS.Timeout { + const tick = async () => { + const result = await sendHeartbeat(serverUrl, getPayload()); + onResult?.(result); + }; + tick(); + return setInterval(tick, intervalMs); +} diff --git a/packages/proxy-agent/src/index.ts b/packages/proxy-agent/src/index.ts new file mode 100644 index 0000000..cacc412 --- /dev/null +++ b/packages/proxy-agent/src/index.ts @@ -0,0 +1,167 @@ +#!/usr/bin/env node +/** + * TIP Proxy Agent CLI + * + * Usage: + * tip-agent start --token TOKEN [--port 1080] [--max-gb 10] [--name "My Node"] + * tip-agent status + * tip-agent stop + */ +import { startAgent } from "./agent"; +import { loadConfig, saveConfig, loadPid, clearPid } from "./config"; + +const AGENT_VERSION = "1.0.0"; + +function parseArgs(argv: string[]): Record { + const args: Record = {}; + for (let i = 0; i < argv.length; i++) { + const arg = argv[i]; + if (arg && arg.startsWith("--")) { + const key = arg.slice(2); + const next = argv[i + 1]; + if (next && !next.startsWith("--")) { + args[key] = next; + i++; + } else { + args[key] = "true"; + } + } + } + return args; +} + +function isProcessRunning(pid: number): boolean { + try { + process.kill(pid, 0); + return true; + } catch { + return false; + } +} + +function cmdStart(rawArgs: string[]): void { + const args = parseArgs(rawArgs); + + const token = args["token"]; + if (!token) { + console.error("Error: --token is required"); + console.error(" tip-agent start --token YOUR_TOKEN [--port 1080] [--max-gb 10] [--name 'My Node']"); + process.exit(1); + } + + const cfg = saveConfig({ + token, + port: args["port"] ? parseInt(args["port"], 10) : 1080, + maxGb: args["max-gb"] ? parseFloat(args["max-gb"]) : 10, + name: args["name"] ?? undefined, + serverUrl: args["server"] ?? "https://transceiver-db.context-x.org", + }); + + console.log(`\nTIP Proxy Agent v${AGENT_VERSION}`); + console.log("=========================================="); + console.log(` Token: ${cfg.token.slice(0, 8)}...`); + console.log(` Port: ${cfg.port}`); + console.log(` Max GB: ${cfg.maxGb}`); + console.log(` Name: ${cfg.name}`); + console.log(` Server: ${cfg.serverUrl}`); + console.log("==========================================\n"); + + startAgent(cfg).catch((err) => { + console.error("[agent] Fatal error:", err.message); + process.exit(1); + }); +} + +function cmdStatus(): void { + const cfg = loadConfig(); + const pid = loadPid(); + + if (!cfg || !pid) { + console.log("TIP Agent: not running"); + process.exit(0); + } + + const running = isProcessRunning(pid); + console.log(`TIP Agent: ${running ? "RUNNING" : "STOPPED (stale PID)"}`); + console.log(` PID: ${pid}`); + console.log(` Port: ${cfg.port}`); + console.log(` Token: ${cfg.token.slice(0, 8)}...`); + console.log(` Max GB: ${cfg.maxGb}`); + if (cfg.startedAt) { + console.log(` Started: ${cfg.startedAt}`); + } + + if (!running) { + clearPid(); + } +} + +function cmdStop(): void { + const pid = loadPid(); + + if (!pid) { + console.log("TIP Agent: not running"); + process.exit(0); + } + + try { + process.kill(pid, "SIGTERM"); + console.log(`TIP Agent (PID ${pid}) stopped.`); + clearPid(); + } catch { + console.log(`TIP Agent: process ${pid} not found (already stopped)`); + clearPid(); + } +} + +function printHelp(): void { + console.log(` +TIP Proxy Agent v${AGENT_VERSION} +Contribute bandwidth to the TIP network, get free API access. + +Usage: + tip-agent start --token TOKEN [options] + tip-agent status + tip-agent stop + +Commands: + start Start the proxy agent + status Show agent status + stop Stop the agent + +Options: + --token TOKEN Your TIP node token (required for start) + --port N SOCKS5 listen port (default: 1080) + --max-gb N Max bandwidth to donate in GB (default: 10) + --name NAME Node name (default: hostname) + --server URL TIP server URL (default: https://transceiver-db.context-x.org) + +Example: + tip-agent start --token abc123 --port 1080 --max-gb 10 --name "Home Node" +`); +} + +const [, , command, ...rest] = process.argv; + +switch (command) { + case "start": + cmdStart(rest); + break; + case "status": + cmdStatus(); + break; + case "stop": + cmdStop(); + break; + case "--help": + case "-h": + case "help": + printHelp(); + break; + default: + if (command) { + console.error(`Unknown command: ${command}`); + } + printHelp(); + process.exit(command ? 1 : 0); +} diff --git a/packages/proxy-agent/src/socks5.ts b/packages/proxy-agent/src/socks5.ts new file mode 100644 index 0000000..5f66b65 --- /dev/null +++ b/packages/proxy-agent/src/socks5.ts @@ -0,0 +1,181 @@ +/** + * SOCKS5 proxy server — built on Node.js net module, no external deps. + * + * Implements: + * - SOCKS5 handshake (RFC 1928) + * - No-auth method (0x00) + * - CONNECT command (TCP tunnel) + * - Byte tracking per connection + */ +import * as net from "net"; +import { BandwidthTracker } from "./bandwidth"; + +const SOCKS_VER = 0x05; +const NO_AUTH = 0x00; +const CMD_CONNECT = 0x01; +const ATYP_IPV4 = 0x01; +const ATYP_DOMAIN = 0x03; +const ATYP_IPV6 = 0x04; +const REP_SUCCESS = 0x00; +const REP_FAIL = 0x01; +const CONN_TIMEOUT_MS = 30_000; + +export interface Socks5ServerOptions { + port: number; + host?: string; + onNewConnection?: () => boolean; // return false to reject (bandwidth limit) + bandwidth?: BandwidthTracker; +} + +function buildReply(rep: number, ip = "0.0.0.0", port = 0): Buffer { + const parts = ip.split(".").map(Number); + const buf = Buffer.alloc(10); + buf[0] = SOCKS_VER; + buf[1] = rep; + buf[2] = 0x00; // reserved + buf[3] = ATYP_IPV4; + buf[4] = parts[0] ?? 0; + buf[5] = parts[1] ?? 0; + buf[6] = parts[2] ?? 0; + buf[7] = parts[3] ?? 0; + buf.writeUInt16BE(port, 8); + return buf; +} + +function handleClient( + socket: net.Socket, + opts: Socks5ServerOptions +): void { + socket.setTimeout(CONN_TIMEOUT_MS); + socket.on("timeout", () => socket.destroy()); + socket.on("error", () => socket.destroy()); + + let state: "greeting" | "request" | "connected" = "greeting"; + let buf = Buffer.alloc(0); + + socket.on("data", (chunk) => { + buf = Buffer.concat([buf, chunk]); + + if (state === "greeting") { + // Need at least: VER(1) + NMETHODS(1) + methods + if (buf.length < 2) return; + const nmethods = buf[1] ?? 0; + if (buf.length < 2 + nmethods) return; + + const methods = buf.slice(2, 2 + nmethods); + buf = buf.slice(2 + nmethods); + + if (!methods.includes(NO_AUTH)) { + socket.write(Buffer.from([SOCKS_VER, 0xFF])); // no acceptable method + socket.destroy(); + return; + } + + socket.write(Buffer.from([SOCKS_VER, NO_AUTH])); + state = "request"; + return; + } + + if (state === "request") { + // Minimum: VER(1) CMD(1) RSV(1) ATYP(1) + address + port(2) + if (buf.length < 4) return; + + const cmd = buf[1]; + const atyp = buf[3]; + + let host: string; + let port: number; + let consumed: number; + + if (atyp === ATYP_IPV4) { + if (buf.length < 10) return; + host = `${buf[4]}.${buf[5]}.${buf[6]}.${buf[7]}`; + port = buf.readUInt16BE(8); + consumed = 10; + } else if (atyp === ATYP_DOMAIN) { + if (buf.length < 5) return; + const domainLen = buf[4] ?? 0; + if (buf.length < 5 + domainLen + 2) return; + host = buf.slice(5, 5 + domainLen).toString("utf8"); + port = buf.readUInt16BE(5 + domainLen); + consumed = 5 + domainLen + 2; + } else if (atyp === ATYP_IPV6) { + if (buf.length < 22) return; + const parts: string[] = []; + for (let i = 0; i < 8; i++) { + parts.push(buf.readUInt16BE(4 + i * 2).toString(16)); + } + host = parts.join(":"); + port = buf.readUInt16BE(20); + consumed = 22; + } else { + socket.write(buildReply(0x08)); // address type not supported + socket.destroy(); + return; + } + + buf = buf.slice(consumed); + + if (cmd !== CMD_CONNECT) { + socket.write(buildReply(0x07)); // command not supported + socket.destroy(); + return; + } + + // Check bandwidth limit before accepting + if (opts.onNewConnection && !opts.onNewConnection()) { + socket.write(buildReply(REP_FAIL)); + socket.destroy(); + return; + } + + opts.bandwidth?.incrementRequests(); + + const target = net.createConnection({ host, port }, () => { + socket.write(buildReply(REP_SUCCESS)); + state = "connected"; + + // Pipe remaining buffered bytes + if (buf.length > 0) { + target.write(buf); + buf = Buffer.alloc(0); + } + + // Bidirectional pipe with byte counting + socket.on("data", (d) => { + opts.bandwidth?.addBytes(d.length); + if (!target.destroyed) target.write(d); + }); + + target.on("data", (d) => { + opts.bandwidth?.addBytes(d.length); + if (!socket.destroyed) socket.write(d); + }); + + socket.on("end", () => { if (!target.destroyed) target.end(); }); + target.on("end", () => { if (!socket.destroyed) socket.end(); }); + }); + + target.setTimeout(CONN_TIMEOUT_MS); + target.on("timeout", () => { target.destroy(); socket.destroy(); }); + target.on("error", () => { + if (!socket.destroyed) { + socket.write(buildReply(REP_FAIL)); + socket.destroy(); + } + }); + } + }); +} + +export function createSocks5Server(opts: Socks5ServerOptions): net.Server { + const server = net.createServer((socket) => { + handleClient(socket, opts); + }); + + server.on("error", (err) => { + console.error("[SOCKS5] Server error:", err.message); + }); + + return server; +} diff --git a/packages/proxy-agent/tsconfig.json b/packages/proxy-agent/tsconfig.json new file mode 100644 index 0000000..8ce56ac --- /dev/null +++ b/packages/proxy-agent/tsconfig.json @@ -0,0 +1,18 @@ +{ + "compilerOptions": { + "target": "ES2020", + "module": "commonjs", + "lib": ["ES2020"], + "outDir": "./dist", + "rootDir": "./src", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "resolveJsonModule": true, + "declaration": true, + "declarationMap": true, + "sourceMap": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"] +} diff --git a/packages/scraper/src/scrapers/flexoptix-catalog.ts b/packages/scraper/src/scrapers/flexoptix-catalog.ts index ca2084f..0900d3f 100644 --- a/packages/scraper/src/scrapers/flexoptix-catalog.ts +++ b/packages/scraper/src/scrapers/flexoptix-catalog.ts @@ -147,6 +147,7 @@ interface Product { reachMeters?: number; fiberType?: string; wavelength?: string; + imageUrl?: string; } function sleep(ms: number): Promise { @@ -432,6 +433,7 @@ export async function scrapeFlexoptixCatalog(): Promise { name sku url_key + small_image { url } price_range { minimum_price { final_price { value currency } @@ -457,6 +459,7 @@ export async function scrapeFlexoptixCatalog(): Promise { name: string; sku: string; url_key: string; + small_image?: { url?: string }; price_range?: { minimum_price?: { final_price?: { value: number; currency: string }; @@ -488,6 +491,9 @@ export async function scrapeFlexoptixCatalog(): Promise { const reach = detectReach(item.name); const price = item.price_range?.minimum_price?.final_price?.value; + const rawImg = item.small_image?.url; + const imageUrl = rawImg && !rawImg.includes("placeholder") ? rawImg : undefined; + allProducts.set(url, { name: item.name, partNumber: item.sku, @@ -501,6 +507,7 @@ export async function scrapeFlexoptixCatalog(): Promise { reachMeters: reach?.meters, fiberType: detectFiber(item.name), wavelength: detectWavelength(item.name), + imageUrl, }); newCount++; } @@ -538,6 +545,7 @@ export async function scrapeFlexoptixCatalog(): Promise { fiberType: product.fiberType, wavelengths: product.wavelength, category: "DataCenter", + imageUrl: product.imageUrl, }); if (product.price && product.price > 0) { diff --git a/packages/scraper/src/scrapers/fluxlight.ts b/packages/scraper/src/scrapers/fluxlight.ts index 03004e5..3f4fdae 100644 --- a/packages/scraper/src/scrapers/fluxlight.ts +++ b/packages/scraper/src/scrapers/fluxlight.ts @@ -99,9 +99,12 @@ function parseProductList(html: string): Product[] { const name = match[2].trim(); if (name.length < 10 || name.length > 200) continue; - // Look for price in surrounding context - const context = html.slice(Math.max(0, match.index - 300), match.index + 600); - const priceMatch = context.match(/\$\s*([\d,]+\.?\d{0,2})/) || context.match(/data-product-price="([\d.]+)"/); + // Look for price in surrounding context — BigCommerce uses data-product-price-without-tax + const context = html.slice(Math.max(0, match.index - 300), match.index + 1200); + const priceMatch = context.match(/data-product-price-without-tax[^>]*>\s*\$([\d,]+\.?\d{0,2})/) + || context.match(/class="price price--withoutTax">\s*\$([\d,]+\.?\d{0,2})/) + || context.match(/\$\s*([\d,]+\.?\d{0,2})/); + const price = priceMatch ? parseFloat(priceMatch[1].replace(",", "")) : undefined; const ff = detectFormFactor(name); diff --git a/packages/scraper/src/scrapers/optcore.ts b/packages/scraper/src/scrapers/optcore.ts index 5f69148..2a37f25 100644 --- a/packages/scraper/src/scrapers/optcore.ts +++ b/packages/scraper/src/scrapers/optcore.ts @@ -47,6 +47,7 @@ interface OptcoreProduct { speedGbps?: number; speed?: string; reachLabel?: string; + imageUrl?: string; } function detectFormFactor(text: string): string | undefined { @@ -208,7 +209,13 @@ export async function scrapeOptcore(): Promise { const stockEl = document.querySelector(".stock, .availability, [class*=\"stock\"]"); const stockText = stockEl?.textContent?.trim() || ""; - return { title, priceText, stockText }; + // Product image — WooCommerce product gallery + const imgEl = document.querySelector( + ".woocommerce-product-gallery__image img, .wp-post-image, img.attachment-woocommerce_single" + ) as HTMLImageElement | null; + const imageUrl = imgEl?.src || imgEl?.getAttribute("data-src") || ""; + + return { title, priceText, stockText, imageUrl }; }); const meta = metaByUrl.get(url); @@ -229,6 +236,7 @@ export async function scrapeOptcore(): Promise { speedGbps: speedInfo?.speedGbps, speed: speedInfo?.speed, reachLabel: detectReach(name), + imageUrl: data.imageUrl || undefined, }); } @@ -262,6 +270,7 @@ export async function scrapeOptcore(): Promise { speed: p.speed, reachLabel: p.reachLabel, category: "DataCenter", + imageUrl: p.imageUrl, }); const hash = contentHash({ price: p.price, stock: p.stockLevel }); diff --git a/packages/scraper/src/scrapers/smartoptics.ts b/packages/scraper/src/scrapers/smartoptics.ts index 378e0f9..2e9712d 100644 --- a/packages/scraper/src/scrapers/smartoptics.ts +++ b/packages/scraper/src/scrapers/smartoptics.ts @@ -1,163 +1,55 @@ /** - * SmartOptics Scraper — Manufacturer, quote-based pricing + * SmartOptics Scraper — Premium coherent/DWDM transceiver manufacturer * - * www.smartoptics.com — no direct prices, catalog-only scrape. - * Extracts product names, part numbers, form factors and specs. - * - * Rate limited: 1 req/2sec. + * smartoptics.com — WordPress site, no prices (B2B, RFQ model). + * Scrapes product catalog for specs, images, datasheets. + * Products listed at /products/optical-transceivers/ → individual /product/SKU/ pages. */ import { pool, findOrCreateScrapedTransceiver, ensureVendor } from "../utils/db"; -import * as cheerio from "cheerio"; -const BASE = "https://www.smartoptics.com"; -const CATALOG_URLS = [ - "/products/transceivers/", - "/products/", - "/products/sfp-transceivers/", - "/products/qsfp-transceivers/", -]; +const BASE = "https://smartoptics.com"; +const CATALOG_URL = `${BASE}/products/optical-transceivers/`; const HEADERS = { - "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", Accept: "text/html,application/xhtml+xml", - "Accept-Language": "en-US,en;q=0.9", }; -interface Product { - partNumber: string; - name: string; - url: string; - formFactor: string; - speed: string; - speedGbps: number; - reachLabel?: string; - reachMeters?: number; - fiberType?: string; - wavelength?: string; -} - function sleep(ms: number): Promise { - return new Promise((resolve) => setTimeout(resolve, ms)); + return new Promise((r) => setTimeout(r, ms)); } function detectFormFactor(text: string): { formFactor: string; speed: string; speedGbps: number } { - const lower = text.toLowerCase(); - if (lower.includes("osfp") && !lower.includes("qsfp")) return { formFactor: "OSFP", speed: "400G", speedGbps: 400 }; - if (lower.includes("qsfp-dd")) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }; - if (lower.includes("qsfp28")) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 }; - if (lower.includes("qsfp+") || lower.includes("qsfp plus")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 }; - if (lower.includes("sfp56")) return { formFactor: "SFP56", speed: "50G", speedGbps: 50 }; - if (lower.includes("sfp28") || lower.includes("25g")) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 }; - if (lower.includes("sfp+") || lower.includes("10gbase") || lower.includes("10g")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 }; - if (lower.includes("xfp")) return { formFactor: "XFP", speed: "10G", speedGbps: 10 }; - if (lower.includes("1000base") || lower.includes("1g")) return { formFactor: "SFP", speed: "1G", speedGbps: 1 }; - if (lower.includes("sfp") && !lower.includes("qsfp")) return { formFactor: "SFP", speed: "1G", speedGbps: 1 }; - return { formFactor: "SFP+", speed: "10G", speedGbps: 10 }; + const t = text.toLowerCase(); + if (t.includes("qsfp-dd800") || t.includes("sfp-dd800") || t.includes("800ge")) return { formFactor: "QSFP-DD", speed: "800G", speedGbps: 800 }; + if (t.includes("qsfp-dd") || (t.includes("400g") && t.includes("qsfp"))) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }; + if (t.includes("qsfp112")) return { formFactor: "QSFP112", speed: "400G", speedGbps: 400 }; + if (t.includes("qsfp56")) return { formFactor: "QSFP56", speed: "200G", speedGbps: 200 }; + if (t.includes("qsfp28") || t.includes("100ge") || t.includes("100g")) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 }; + if (t.includes("sfp28") || t.includes("25g")) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 }; + if (t.includes("qsfp+") || t.includes("40g")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 }; + if (t.includes("sfp+") || t.includes("10g")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 }; + if (t.includes("sfp")) return { formFactor: "SFP", speed: "1G", speedGbps: 1 }; + return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 }; } function detectReach(text: string): { label: string; meters: number } | undefined { - const patterns: [RegExp, string, number][] = [ - [/\b80\s*km\b/i, "80km", 80000], - [/\b40\s*km\b/i, "40km", 40000], - [/\b20\s*km\b/i, "20km", 20000], - [/\b10\s*km\b/i, "10km", 10000], - [/\b2\s*km\b/i, "2km", 2000], - [/\b550\s*m\b/i, "550m", 550], - [/\b300\s*m\b/i, "300m", 300], - [/\b100\s*m\b/i, "100m", 100], - [/\bLR4\b/, "10km", 10000], [/\bLR\b/, "10km", 10000], - [/\bER4?\b/, "40km", 40000], [/\bZR4?\b/, "80km", 80000], - [/\bSR4?\b/, "300m", 300], [/\bDR4?\b/, "500m", 500], [/\bFR4?\b/, "2km", 2000], - ]; - for (const [regex, label, meters] of patterns) { - if (regex.test(text)) return { label, meters }; + const kmMatch = text.match(/(\d+)\s*km/i); + if (kmMatch) { + const km = parseInt(kmMatch[1]); + return { label: `${km}km`, meters: km * 1000 }; + } + const mMatch = text.match(/(\d+)\s*m\b/i); + if (mMatch) { + const m = parseInt(mMatch[1]); + return { label: `${m}m`, meters: m }; } return undefined; } function detectFiber(text: string): string { - if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text)) return "SMF"; - if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(text)) return "MMF"; - if (/copper|dac|twinax|rj45|base-t/i.test(text)) return "Copper"; - return "SMF"; -} - -function detectWavelength(text: string): string { - const match = text.match(/(\d{3,4})\s*nm/i); - return match ? match[1] : ""; -} - -function parseProductList(html: string, sourceUrl: string): Product[] { - const $ = cheerio.load(html); - const products: Product[] = []; - - // Manufacturer site: product cards or table rows - const cardSelectors = [ - ".product-item", ".product-card", ".product", "tr", - "article", ".item", ".col-xs-12.col-sm-6", ".entry", - ]; - - for (const sel of cardSelectors) { - if ($(sel).length >= 2) { - $(sel).each((_i, el) => { - const text = $(el).text().trim(); - if (!/sfp|qsfp|xfp|transceiver|optic/i.test(text)) return; - - const nameEl = $(el).find("h2, h3, h4, td, .title, .name, a").first(); - const name = nameEl.text().trim() || text.slice(0, 100); - if (!name || name.length < 5) return; - - const linkEl = $(el).find("a[href]").first(); - const href = linkEl.attr("href") || sourceUrl; - const url = href.startsWith("http") ? href : BASE + href; - - // Part number: look for alphanumeric patterns typical of transceiver SKUs - const partNumMatch = text.match(/\b([A-Z]{2,}[-_][A-Z0-9]{2,}[-_][A-Z0-9]+)\b/) || - text.match(/\b(SO[-_][A-Z0-9]+)\b/i) || - name.match(/[A-Z0-9][-A-Z0-9]{5,}/); - const partNumber = partNumMatch?.[0] || name.replace(/\s+/g, "-").slice(0, 60); - - const ff = detectFormFactor(name + " " + text); - const reach = detectReach(name + " " + text); - - products.push({ - partNumber, - name, - url, - ...ff, - reachLabel: reach?.label, - reachMeters: reach?.meters, - fiberType: detectFiber(name + " " + text), - wavelength: detectWavelength(name + " " + text), - }); - }); - if (products.length > 0) break; - } - } - - // Fallback: all transceiver-relevant anchors - if (products.length === 0) { - $("a[href]").each((_i, el) => { - const name = $(el).text().trim(); - const href = $(el).attr("href") || ""; - if (name.length < 5 || name.length > 200 || !/sfp|qsfp|transceiver|optic/i.test(name)) return; - const url = href.startsWith("http") ? href : BASE + href; - const ff = detectFormFactor(name); - const reach = detectReach(name); - products.push({ - partNumber: name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] || name.replace(/\s+/g, "-").slice(0, 60), - name, url, ...ff, - reachLabel: reach?.label, reachMeters: reach?.meters, - fiberType: detectFiber(name), wavelength: detectWavelength(name), - }); - }); - } - - const seen = new Set(); - return products.filter((p) => { - if (!p.url || seen.has(p.url)) return false; - seen.add(p.url); - return true; - }); + if (/dwdm|cwdm|coherent|coh|single.?mode|smf/i.test(text)) return "SMF"; + if (/multi.?mode|mmf|sr/i.test(text)) return "MMF"; + return "SMF"; // SmartOptics is almost exclusively SMF/coherent } async function fetchPage(url: string): Promise { @@ -166,46 +58,108 @@ async function fetchPage(url: string): Promise { return resp.text(); } +function extractProductUrls(html: string): string[] { + const urls = new Set(); + const regex = /href="(https?:\/\/smartoptics\.com\/product\/[^"]+)"/gi; + let m: RegExpExecArray | null; + while ((m = regex.exec(html)) !== null) { + const u = m[1].replace(/\/$/, "") + "/"; + urls.add(u); + } + return Array.from(urls); +} + +interface ProductData { + sku: string; + name: string; + url: string; + imageUrl?: string; + formFactor: string; + speed: string; + speedGbps: number; + reachLabel?: string; + reachMeters?: number; + fiberType: string; + coherent: boolean; + wdmType?: string; +} + +async function scrapeProductPage(url: string): Promise { + try { + const html = await fetchPage(url); + + const nameMatch = html.match(/]*>([^<]+)<\/h1>/) || html.match(/og:title" content="([^"]+)"/); + const name = nameMatch ? nameMatch[1].trim().replace(/ \| Smartoptics$/, "") : ""; + if (!name) return null; + + const sku = url.split("/").filter(Boolean).pop()?.toUpperCase() || name.replace(/\s+/g, "-"); + + const imgMatch = html.match(/property="og:image" content="([^"]+)"/) + || html.match(/]+src="([^"]*wp-content\/uploads[^"]*\.(?:png|jpg|webp))"[^>]* class="[^"]*product/i); + const imageUrl = imgMatch ? imgMatch[1] : undefined; + + const ff = detectFormFactor(name); + const reach = detectReach(name); + const coherent = /coherent|coh-t|coh\.|dwdm|dp-qpsk|qpsk|cfp2/i.test(name + html.slice(0, 3000)); + const wdmType = /dwdm/i.test(name) ? "DWDM" : /cwdm/i.test(name) ? "CWDM" : undefined; + + return { + sku, name, url, imageUrl, + ...ff, + reachLabel: reach?.label, + reachMeters: reach?.meters, + fiberType: detectFiber(name), + coherent, + wdmType, + }; + } catch (err) { + console.warn(` Failed ${url}: ${(err as Error).message}`); + return null; + } +} + export async function scrapeSmartOptics(): Promise { - console.log("=== SmartOptics Scraper Starting (catalog-only, no prices) ===\n"); + console.log("=== SmartOptics Scraper Starting ===\n"); + console.log("Note: SmartOptics is B2B — no public prices. Scraping specs + images only.\n"); const vendorId = await ensureVendor( "SmartOptics", "manufacturer", "https://www.smartoptics.com", - BASE + CATALOG_URLS[0], + "https://smartoptics.com/products/optical-transceivers/" ); - const allProducts: Product[] = []; - const seenUrls = new Set(); - - for (const catalogPath of CATALOG_URLS) { - const catalogUrl = BASE + catalogPath; - console.log(` Fetching catalog: ${catalogUrl}`); + const productUrls = new Set(); + for (let page = 1; page <= 10; page++) { try { - const html = await fetchPage(catalogUrl); - const pageProducts = parseProductList(html, catalogUrl); - for (const p of pageProducts) { - if (!seenUrls.has(p.url)) { - seenUrls.add(p.url); - allProducts.push(p); - } - } - console.log(` Found ${pageProducts.length} products`); - await sleep(2000); - } catch (err) { - console.warn(` Failed ${catalogPath}: ${(err as Error).message}`); + const url = page === 1 ? CATALOG_URL : `${CATALOG_URL}page/${page}/`; + const html = await fetchPage(url); + const urls = extractProductUrls(html); + if (urls.length === 0) break; + urls.forEach((u) => productUrls.add(u)); + console.log(` Catalog page ${page}: ${urls.length} products`); + await sleep(1500); + } catch { + break; } } - console.log(`\nTotal unique products: ${allProducts.length}`); + console.log(`\nTotal product URLs: ${productUrls.size}`); + if (productUrls.size === 0) { + console.log("No products found — site may have changed structure"); + return; + } - let totalProducts = 0; + let saved = 0; + let withImages = 0; + + for (const url of productUrls) { + const product = await scrapeProductPage(url); + if (!product) continue; - for (const product of allProducts) { try { await findOrCreateScrapedTransceiver({ - partNumber: product.partNumber, + partNumber: product.sku, vendorId, formFactor: product.formFactor, speedGbps: product.speedGbps, @@ -213,16 +167,20 @@ export async function scrapeSmartOptics(): Promise { reachMeters: product.reachMeters, reachLabel: product.reachLabel, fiberType: product.fiberType, - wavelengths: product.wavelength, - category: "DataCenter", + wavelengths: product.wdmType ? "DWDM-tunable" : undefined, + category: product.coherent ? "Coherent" : "DataCenter", + imageUrl: product.imageUrl, }); - totalProducts++; + saved++; + if (product.imageUrl) withImages++; + console.log(` ✓ ${product.sku} — ${product.name.slice(0, 60)}`); } catch (err) { - console.warn(` Error saving ${product.partNumber}: ${(err as Error).message.slice(0, 80)}`); + console.warn(` Error saving ${product.sku}: ${(err as Error).message.slice(0, 80)}`); } + await sleep(1500); } - console.log(`\n=== SmartOptics Complete: ${totalProducts} products catalogued (no prices — quote-based) ===`); + console.log(`\n=== SmartOptics Complete: ${saved} products, ${withImages} with images ===`); } if (require.main === module) { diff --git a/packages/scraper/src/utils/backfill-images.ts b/packages/scraper/src/utils/backfill-images.ts new file mode 100644 index 0000000..44cbb46 --- /dev/null +++ b/packages/scraper/src/utils/backfill-images.ts @@ -0,0 +1,540 @@ +/** + * Image Backfill Script + * + * Fills `image_url` for transceivers that currently have none. + * Priority order: + * 1. Optcore (2,580 products) — WP REST API featured_media per product + * 2. Flexoptix (344 products) — Magento GraphQL image fields + * 3. GAO Tek (413 products) — product page og:image scrape + * 4. Other vendors — og:image from stored price_observations URLs + * + * Run on Erik: + * node packages/scraper/dist/utils/backfill-images.js + * + * Or locally with SSH port-forward: + * ssh -L 5433:127.0.0.1:5433 erik -N & + * node dist/utils/backfill-images.js + */ + +import { pool } from "./db"; +import { logger } from "./logger"; + +function sleep(ms: number): Promise { + return new Promise((r) => setTimeout(r, ms)); +} + +async function updateImageUrl(id: string, imageUrl: string): Promise { + await pool.query( + `UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = TRUE WHERE id = $2`, + [imageUrl, id] + ); +} + +async function fetchJson(url: string, init?: RequestInit): Promise { + const resp = await fetch(url, { + ...init, + headers: { + "User-Agent": "Mozilla/5.0 (compatible; TIP-ImageBackfill/1.0)", + Accept: "application/json", + ...(init?.headers ?? {}), + }, + signal: AbortSignal.timeout(20000), + }); + if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`); + return resp.json(); +} + +async function fetchHtml(url: string): Promise { + const resp = await fetch(url, { + headers: { + "User-Agent": "Mozilla/5.0 (compatible; TIP-ImageBackfill/1.0)", + Accept: "text/html", + }, + signal: AbortSignal.timeout(20000), + }); + if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`); + return resp.text(); +} + +// ============================================================================= +// Optcore +// Strategy: +// Step 1 — enumerate all products per category, collect slug -> featured_media id +// Step 2 — resolve unique media IDs individually via /wp-json/wp/v2/media/ +// Step 3 — update DB for matched slugs +// +// Note: _embed does NOT work for the "product" custom post type on Optcore. +// We must resolve media IDs separately. +// ============================================================================= + +const OPTCORE_BASE = "https://www.optcore.net"; +const OPTCORE_CATEGORY_IDS = [ + 309, 173, 76, 79, 73, 311, 313, 312, 333, 1088, + 59, 1102, 4097, 77, 4101, 4092, 6441, +]; + +interface OptcoreWpProductRaw { + slug: string; + featured_media: number; +} + +interface OptcoreMediaRaw { + id: number; + source_url: string; + media_details?: { + sizes?: { + medium?: { source_url: string }; + large?: { source_url: string }; + }; + }; +} + +async function fetchOptcoreMediaUrl(mediaId: number): Promise { + if (mediaId === 0) return null; + try { + const data = await fetchJson( + `${OPTCORE_BASE}/wp-json/wp/v2/media/${mediaId}?_fields=id,source_url,media_details` + ) as OptcoreMediaRaw; + return ( + data.media_details?.sizes?.medium?.source_url || + data.media_details?.sizes?.large?.source_url || + data.source_url || + null + ); + } catch { + return null; + } +} + +async function backfillOptcore(): Promise<{ updated: number; skipped: number; notFound: number }> { + logger.info("=== Optcore image backfill starting ==="); + + const rows = await pool.query(` + SELECT t.id, t.part_number + FROM transceivers t + JOIN vendors v ON t.vendor_id = v.id + WHERE v.name = 'Optcore' AND (t.image_url IS NULL OR t.image_url = '') + `); + + // Map: part_number (lowercase) -> DB id + const byPartNumber = new Map(); + for (const row of rows.rows) { + byPartNumber.set((row.part_number as string).toLowerCase(), row.id as string); + } + logger.info(`Optcore: ${byPartNumber.size} products need images`); + + if (byPartNumber.size === 0) { + return { updated: 0, skipped: 0, notFound: 0 }; + } + + // Step 1: enumerate all products across all transceiver categories + const slugToMediaId = new Map(); + const seenSlugs = new Set(); + + for (const catId of OPTCORE_CATEGORY_IDS) { + let page = 1; + while (true) { + const apiUrl = `${OPTCORE_BASE}/wp-json/wp/v2/product?product_cat=${catId}&per_page=100&page=${page}&_fields=slug,featured_media`; + try { + const resp = await fetch(apiUrl, { + headers: { "User-Agent": "Mozilla/5.0 (compatible; TIP-ImageBackfill/1.0)" }, + signal: AbortSignal.timeout(20000), + }); + if (!resp.ok) break; + const totalPages = parseInt(resp.headers.get("X-WP-TotalPages") || "1", 10); + const products = (await resp.json()) as OptcoreWpProductRaw[]; + + for (const p of products) { + if (seenSlugs.has(p.slug)) continue; + seenSlugs.add(p.slug); + if (p.featured_media) { + slugToMediaId.set(p.slug.toLowerCase(), p.featured_media); + } + } + + if (page >= totalPages) break; + page++; + } catch (err) { + logger.warn(`Optcore cat=${catId} page=${page} fetch failed`, { error: (err as Error).message }); + break; + } + await sleep(100); + } + } + + logger.info(`Optcore: enumerated ${seenSlugs.size} WP products, ${slugToMediaId.size} have featured media`); + + // Step 2: resolve unique media IDs + const uniqueMediaIds = new Set(slugToMediaId.values()); + logger.info(`Optcore: resolving ${uniqueMediaIds.size} unique media IDs`); + + const mediaIdToUrl = new Map(); + let resolved = 0; + for (const mediaId of uniqueMediaIds) { + const url = await fetchOptcoreMediaUrl(mediaId); + if (url) mediaIdToUrl.set(mediaId, url); + resolved++; + if (resolved % 20 === 0) { + logger.info(`Optcore media resolution: ${resolved}/${uniqueMediaIds.size}`); + } + await sleep(100); + } + + // Step 3: match slugs to DB records and update + let updated = 0; + let skipped = 0; + let notFound = 0; + + for (const [slug, mediaId] of slugToMediaId) { + const dbId = byPartNumber.get(slug); + if (!dbId) { + notFound++; + continue; + } + + const imgUrl = mediaIdToUrl.get(mediaId); + if (!imgUrl) { + skipped++; + continue; + } + + await updateImageUrl(dbId, imgUrl); + byPartNumber.delete(slug); + updated++; + if (updated % 50 === 0) { + logger.info(`Optcore DB updates: ${updated} so far`); + } + } + + logger.info(`Optcore done: ${updated} updated, ${skipped} no-media-url, ${notFound} slug-not-in-db`); + return { updated, skipped, notFound }; +} + +// ============================================================================= +// Flexoptix +// Magento GraphQL: /graphql — query by SKU, returns small_image.url +// ============================================================================= + +const FLEXOPTIX_BASE = "https://www.flexoptix.net"; +const FLEXOPTIX_GRAPHQL = `${FLEXOPTIX_BASE}/graphql`; + +interface FlexoptixGqlProduct { + name: string; + sku: string; + url_key: string; + small_image: { url: string } | null; + image: { url: string } | null; + thumbnail: { url: string } | null; +} + +async function fetchFlexoptixImage(sku: string): Promise { + // Strip ":Sx" coding suffix to get canonical SKU for search + const canonicalSku = sku.replace(/:.*$/, "").trim(); + + const query = `{ + products(search: ${JSON.stringify(canonicalSku)}, pageSize: 5) { + items { + name + sku + url_key + small_image { url } + image { url } + thumbnail { url } + } + } + }`; + + const data = await fetchJson(FLEXOPTIX_GRAPHQL, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ query }), + }) as { + data?: { products?: { items: FlexoptixGqlProduct[] } }; + }; + + const items = data.data?.products?.items ?? []; + + // Prefer exact SKU match (canonical, case-insensitive) + for (const item of items) { + const itemSku = item.sku.replace(/:.*$/, "").trim().toLowerCase(); + if (itemSku === canonicalSku.toLowerCase()) { + return ( + item.small_image?.url || + item.image?.url || + item.thumbnail?.url || + null + ); + } + } + + // Fallback: single result + if (items.length === 1) { + return ( + items[0].small_image?.url || + items[0].image?.url || + items[0].thumbnail?.url || + null + ); + } + + return null; +} + +async function backfillFlexoptix(): Promise<{ updated: number; skipped: number; errors: number }> { + logger.info("=== Flexoptix image backfill starting ==="); + + const rows = await pool.query(` + SELECT t.id, t.part_number + FROM transceivers t + JOIN vendors v ON t.vendor_id = v.id + WHERE v.name = 'FLEXOPTIX' AND (t.image_url IS NULL OR t.image_url = '') + ORDER BY t.part_number + `); + + logger.info(`Flexoptix: ${rows.rows.length} products need images`); + + let updated = 0; + let skipped = 0; + let errors = 0; + + for (const row of rows.rows) { + const sku = row.part_number as string; + try { + const imgUrl = await fetchFlexoptixImage(sku); + if (imgUrl) { + await updateImageUrl(row.id as string, imgUrl); + updated++; + if (updated % 25 === 0) { + logger.info(`Flexoptix progress: ${updated} updated so far`); + } + } else { + skipped++; + } + } catch (err) { + logger.error(`Flexoptix error for ${sku}`, { error: (err as Error).message }); + errors++; + } + await sleep(100); + } + + logger.info(`Flexoptix done: ${updated} updated, ${skipped} no-image, ${errors} errors`); + return { updated, skipped, errors }; +} + +// ============================================================================= +// GAO Tek +// Fetch product page and extract og:image meta tag +// ============================================================================= + +const GAOTEK_BASE = "https://gaotek.com"; + +function extractOgImage(html: string): string | null { + const match = + html.match(/]+property=["']og:image["'][^>]+content=["']([^"']+)["']/i) || + html.match(/]+content=["']([^"']+)["'][^>]+property=["']og:image["']/i); + if (!match) return null; + const url = match[1]; + // Skip placeholder and logo images + if (url.includes("placeholder") || url.includes("logo") || url.includes("mobilelogo")) return null; + return url; +} + +async function fetchGaoTekImage(productUrl: string): Promise { + const html = await fetchHtml(productUrl); + return extractOgImage(html); +} + +async function backfillGaoTek(): Promise<{ updated: number; skipped: number; errors: number }> { + logger.info("=== GAO Tek image backfill starting ==="); + + // Prefer records that have stored product URLs in price_observations + const withUrlRows = await pool.query(` + SELECT DISTINCT ON (t.id) + t.id, t.part_number, t.slug, po.url as product_url + FROM transceivers t + JOIN vendors v ON t.vendor_id = v.id + LEFT JOIN price_observations po ON po.transceiver_id = t.id + WHERE v.name = 'GAO Tek' + AND (t.image_url IS NULL OR t.image_url = '') + AND po.url IS NOT NULL + AND po.url LIKE '%gaotek%' + ORDER BY t.id, po.time DESC + `); + + // All records without images + const allRows = await pool.query(` + SELECT t.id, t.part_number, t.slug + FROM transceivers t + JOIN vendors v ON t.vendor_id = v.id + WHERE v.name = 'GAO Tek' + AND (t.image_url IS NULL OR t.image_url = '') + `); + + const withUrlIds = new Set(withUrlRows.rows.map((r) => r.id as string)); + + // Combine: known URLs first, then slug-derived URLs + const toProcess: Array<{ id: string; partNumber: string; productUrl: string }> = []; + + for (const row of withUrlRows.rows) { + toProcess.push({ + id: row.id as string, + partNumber: row.part_number as string, + productUrl: row.product_url as string, + }); + } + + for (const row of allRows.rows) { + if (withUrlIds.has(row.id as string)) continue; + const rawSlug = (row.slug as string).replace(/^scraped-/, ""); + toProcess.push({ + id: row.id as string, + partNumber: row.part_number as string, + productUrl: `${GAOTEK_BASE}/product/${rawSlug}/`, + }); + } + + logger.info(`GAO Tek: ${toProcess.length} products to attempt`); + + let updated = 0; + let skipped = 0; + let errors = 0; + + for (const item of toProcess) { + try { + const imgUrl = await fetchGaoTekImage(item.productUrl); + if (imgUrl) { + await updateImageUrl(item.id, imgUrl); + updated++; + if (updated % 25 === 0) { + logger.info(`GAO Tek progress: ${updated} updated so far`); + } + } else { + skipped++; + } + } catch (err) { + const msg = (err as Error).message; + if (!msg.includes("HTTP 404") && !msg.includes("HTTP 403")) { + logger.warn(`GAO Tek error for ${item.productUrl}`, { error: msg.slice(0, 80) }); + } + errors++; + } + await sleep(100); + } + + logger.info(`GAO Tek done: ${updated} updated, ${skipped} no-image, ${errors} errors/404s`); + return { updated, skipped, errors }; +} + +// ============================================================================= +// Other vendors — og:image from stored price_observations URLs +// ============================================================================= + +const OTHER_VENDOR_NAMES = [ + "T&S Communication", + "Ascent Optics", + "ATGBICS", + "Skylane Optics", + "SmartOptics", + "ProLabs", + "FS.COM", + "GBICS", + "Fluxlight", +]; + +async function backfillOtherVendors(): Promise<{ total: number; updated: number }> { + logger.info("=== Other vendors og:image backfill starting ==="); + + const rows = await pool.query(` + SELECT DISTINCT ON (t.id) + t.id, t.part_number, v.name as vendor_name, po.url as product_url + FROM transceivers t + JOIN vendors v ON t.vendor_id = v.id + JOIN price_observations po ON po.transceiver_id = t.id + WHERE v.name = ANY($1) + AND (t.image_url IS NULL OR t.image_url = '') + AND po.url IS NOT NULL + AND po.url ~ '^https?://' + ORDER BY t.id, po.time DESC + `, [OTHER_VENDOR_NAMES]); + + logger.info(`Other vendors: ${rows.rows.length} products with URLs to process`); + + let updated = 0; + + for (const row of rows.rows) { + const productUrl = row.product_url as string; + try { + const html = await fetchHtml(productUrl); + const imgUrl = extractOgImage(html); + if (imgUrl) { + await updateImageUrl(row.id as string, imgUrl); + updated++; + } + } catch { + // Skip 404s, timeouts, etc. silently + } + await sleep(100); + } + + logger.info(`Other vendors done: ${updated} / ${rows.rows.length} updated`); + return { total: rows.rows.length, updated }; +} + +// ============================================================================= +// Main +// ============================================================================= + +async function main(): Promise { + logger.info("=== TIP Image Backfill Script ==="); + logger.info(`DB: ${process.env.POSTGRES_HOST ?? "localhost"}:${process.env.POSTGRES_PORT ?? "5433"}`); + + try { + await pool.query("SELECT 1"); + logger.info("DB connection OK"); + } catch (err) { + logger.error("DB connection failed", { error: (err as Error).message }); + process.exit(1); + } + + const startTime = Date.now(); + const results: Record = {}; + + try { + results.optcore = await backfillOptcore(); + } catch (err) { + logger.error("Optcore backfill failed", { error: (err as Error).message }); + results.optcore = { error: (err as Error).message }; + } + + try { + results.flexoptix = await backfillFlexoptix(); + } catch (err) { + logger.error("Flexoptix backfill failed", { error: (err as Error).message }); + results.flexoptix = { error: (err as Error).message }; + } + + try { + results.gaotek = await backfillGaoTek(); + } catch (err) { + logger.error("GAO Tek backfill failed", { error: (err as Error).message }); + results.gaotek = { error: (err as Error).message }; + } + + try { + results.others = await backfillOtherVendors(); + } catch (err) { + logger.error("Other vendors backfill failed", { error: (err as Error).message }); + results.others = { error: (err as Error).message }; + } + + const elapsedSec = ((Date.now() - startTime) / 1000).toFixed(1); + logger.info("=== Backfill complete ===", { results, elapsedSec }); +} + +main() + .then(() => pool.end()) + .catch((err) => { + logger.error("Fatal error", { error: (err as Error).message }); + pool.end(); + process.exit(1); + }); diff --git a/packages/scraper/src/utils/db.ts b/packages/scraper/src/utils/db.ts index fb813b8..d8c358a 100644 --- a/packages/scraper/src/utils/db.ts +++ b/packages/scraper/src/utils/db.ts @@ -10,7 +10,9 @@ export const pool = new Pool({ database: process.env.POSTGRES_DB || "transceiver_db", user: process.env.POSTGRES_USER || "tip", password: process.env.POSTGRES_PASSWORD || "tip_dev_2026", - max: 10, + max: 5, + idleTimeoutMillis: 10000, + connectionTimeoutMillis: 5000, }); // Alias — some scrapers import { db } instead of { pool } @@ -68,23 +70,31 @@ export async function findOrCreateScrapedTransceiver(params: { fiberType?: string; wavelengths?: string; category?: string; + imageUrl?: string; }): Promise { // Try to match existing transceiver by part number + vendor const existing = await pool.query( - `SELECT id FROM transceivers WHERE part_number = $1 AND vendor_id = $2`, + `SELECT id, image_url FROM transceivers WHERE part_number = $1 AND vendor_id = $2`, [params.partNumber, params.vendorId] ); if (existing.rows.length > 0) { + // Update image_url if we have one and the record doesn't yet + if (params.imageUrl && !existing.rows[0].image_url) { + await pool.query( + `UPDATE transceivers SET image_url = $1, updated_at = NOW() WHERE id = $2`, + [params.imageUrl, existing.rows[0].id] + ); + } return existing.rows[0].id; } // Create new transceiver entry const slug = `scraped-${params.partNumber.toLowerCase().replace(/[^a-z0-9]+/g, "-")}`; const result = await pool.query( - `INSERT INTO transceivers (slug, part_number, vendor_id, form_factor, speed_gbps, speed, reach_meters, reach_label, fiber_type, wavelengths, category, market_status) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, 'Mainstream') - ON CONFLICT (slug) DO UPDATE SET updated_at = NOW() + `INSERT INTO transceivers (slug, part_number, vendor_id, form_factor, speed_gbps, speed, reach_meters, reach_label, fiber_type, wavelengths, category, market_status, image_url) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, 'Mainstream', $12) + ON CONFLICT (slug) DO UPDATE SET image_url = COALESCE(transceivers.image_url, EXCLUDED.image_url), updated_at = NOW() RETURNING id`, [ slug, @@ -98,6 +108,7 @@ export async function findOrCreateScrapedTransceiver(params: { params.fiberType || "", params.wavelengths || "", params.category || "DataCenter", + params.imageUrl || null, ] ); return result.rows[0].id; diff --git a/packages/scraper/src/utils/proxy.ts b/packages/scraper/src/utils/proxy.ts new file mode 100644 index 0000000..451d70a --- /dev/null +++ b/packages/scraper/src/utils/proxy.ts @@ -0,0 +1,77 @@ +/** + * Proxy utility — fetches next available proxy from TIP proxy pool. + * Falls back to process.env.PROXY_URL if pool is empty or unreachable. + */ +import * as https from "https"; +import * as http from "http"; + +const TIP_API = process.env.TIP_API_URL ?? "https://transceiver-db.context-x.org"; +const FALLBACK = process.env.PROXY_URL ?? ""; + +interface ProxyResponse { + success: boolean; + host: string; + port: number; + protocol: string; +} + +function getJson(url: string): Promise { + return new Promise((resolve, reject) => { + const parsed = new URL(url); + const mod = parsed.protocol === "https:" ? https : http; + + const req = mod.request( + { + hostname: parsed.hostname, + port: parsed.port || (parsed.protocol === "https:" ? 443 : 80), + path: parsed.pathname + parsed.search, + method: "GET", + headers: { "User-Agent": "TIP-Scraper/1.0.0" }, + timeout: 5_000, + }, + (res) => { + const chunks: Buffer[] = []; + res.on("data", (c: Buffer) => chunks.push(c)); + res.on("end", () => { + try { + resolve(JSON.parse(Buffer.concat(chunks).toString())); + } catch { + reject(new Error("Invalid JSON response")); + } + }); + } + ); + + req.on("timeout", () => { req.destroy(); reject(new Error("timeout")); }); + req.on("error", reject); + req.end(); + }); +} + +/** + * Returns the next available proxy URL from the TIP proxy pool. + * Format: "socks5://host:port" or "http://host:port" + * Returns undefined if no proxy is available (direct connection). + */ +export async function getProxyUrl(): Promise { + // Try proxy pool first + try { + const data = await getJson(`${TIP_API}/api/proxy/next`) as ProxyResponse; + if (data?.success && data.host && data.port) { + const protocol = data.protocol ?? "socks5"; + const proxyUrl = `${protocol}://${data.host}:${data.port}`; + console.log(`[proxy] Using TIP pool node: ${data.host}:${data.port} (${protocol})`); + return proxyUrl; + } + } catch { + // Pool unavailable — fall through to env fallback + } + + // Fallback to env var + if (FALLBACK) { + console.log(`[proxy] Using env PROXY_URL fallback`); + return FALLBACK; + } + + return undefined; +}