#!/usr/bin/env python3 """V4: Flexoptix enrichment - runs directly on Erik with psql + curl.""" import subprocess import re import sys import time import os DB_CMD = "PGPASSWORD=***REDACTED*** psql -h localhost -p 5433 -U tip -d transceiver_db" SQL_OUT = "/tmp/enrichment-v4.sql" LOG = "/tmp/enrich-v4.log" def log(msg): with open(LOG, "a") as f: f.write(msg + "\n") print(msg, file=sys.stderr) def curl_get(url): r = subprocess.run( ["curl", "-s", "-L", "--max-time", "15", "-H", "User-Agent: Mozilla/5.0 TIP-Bot/1.0", url], capture_output=True, text=True ) return r.stdout def esc(v): return str(v).replace("'", "''") log(f"{time.strftime('%Y-%m-%d %H:%M:%S')}: V4 start") # Read product list (already generated by v3) products = [] with open("/tmp/fo-list.txt") as f: for line in f: line = line.strip() if not line: continue parts = line.split("|") if len(parts) >= 3: products.append({ "id": parts[0], "url": parts[1], "part_number": parts[2], "standard_name": parts[3] if len(parts) > 3 else parts[2], }) log(f"Products: {len(products)}") sql_lines = [f"-- V4 enrichment {time.strftime('%Y-%m-%d %H:%M')}", f"-- Products: {len(products)}", ""] ok = 0 img_count = 0 MAPPING = [ ("POWER CONSUMPTION", "power_consumption_w", lambda v: re.search(r"[\d.]+", v).group() if re.search(r"[\d.]+", v) else None), ("CONNECTOR / POLISH", "connector", None), ("CONNECTOR", "connector", None), ("MODULATION", "modulation", None), ("WAVELENGTH TX (TYPICAL)", "wavelengths", None), ("WAVELENGTH", "wavelengths", None), ("DISTANCE", "reach_label", None), ("TEMPERATURE RANGE", "temp_range", lambda v: "COM" if any(x in v for x in ["ommercial", "0°C to 70"]) else ("IND" if any(x in v for x in ["ndustrial", "-40"]) else ("EXT" if "xtended" in v else "COM"))), ("OPERATING TEMPERATURE", "temp_range", lambda v: "COM" if any(x in v for x in ["ommercial", "0°C to 70"]) else ("IND" if any(x in v for x in ["ndustrial", "-40"]) else ("EXT" if "xtended" in v else "COM"))), ("LANE COUNT", "lanes", lambda v: re.search(r"\d+", v).group() if re.search(r"\d+", v) else None), ("BANDWIDTH PER LANE", "lane_rate", None), ("BANDWIDTH", "lane_rate", None), ("INBUILT FEC", "fec_type", lambda v: v if v.lower() not in ("no", "none") else None), ("POWERBUDGET (DB)", "optical_budget_db", lambda v: re.search(r"[\d.]+", v).group() if re.search(r"[\d.]+", v) else None), ("TRANSMIT MIN/MAX PER LANE", "tx_power_min_dbm", lambda v: re.search(r"-?[\d.]+", v).group() if re.search(r"-?[\d.]+", v) else None), ("RECEIVER MIN/MAX PER LANE", "rx_sensitivity_dbm", lambda v: re.search(r"-?[\d.]+", v).group() if re.search(r"-?[\d.]+", v) else None), ("INTERFACE", "fiber_type", None), ("COMPLIANCE CODE", "ieee_reference", None), ("DIGITAL DIAGNOSTIC MONITORING (DDM)", "dom_support", lambda v: "true" if "yes" in v.lower() else "false"), ] for i, p in enumerate(products): name = p["standard_name"] or p["part_number"] html = curl_get(p["url"]) if len(html) < 500: log(f"[{i+1}/{len(products)}] {name} SKIP (empty)") continue # Extract image img = None for m in re.finditer(r'https://[^"\s]+/cache/[^"\s]+_A_[^"\s]+\.jpg', html): img = m.group(0) break # Extract specs specs = {} for m in re.finditer(r'