#!/usr/bin/env python3
"""V4: Flexoptix enrichment - runs directly on Erik with psql + curl.

Pipeline:
  1. Read the product list from PRODUCT_LIST (pipe-separated lines:
     id|url|part_number[|standard_name]).
  2. Fetch every product page with curl and extract an image URL and
     label/value spec pairs from the HTML.
  3. Map the spec labels onto ``transceivers`` columns (see MAPPING) and
     write one UPDATE statement per product to SQL_OUT.
  4. Apply SQL_OUT with psql, restart the ``tip-api`` pm2 process and log a
     few column fill counts.
"""

import subprocess
import re
import sys
import time
import os

DB_CMD = "PGPASSWORD=***REDACTED*** psql -h localhost -p 5433 -U tip -d transceiver_db"
SQL_OUT = "/tmp/enrichment-v4.sql"
LOG = "/tmp/enrich-v4.log"

PRODUCT_LIST = "/tmp/fo-list.txt"
PG_PASSWORD = "***REDACTED***"
PSQL_BASE = ["psql", "-h", "localhost", "-p", "5433", "-U", "tip", "-d", "transceiver_db"]

# Compiled once; all of these are used for every product page.
IMAGE_RE = re.compile(r'https://[^"\s]+/cache/[^"\s]+_A_[^"\s]+\.jpg')
# NOTE(review): this pattern looks garbled (the HTML tag tokens appear to have
# been stripped from it). As written the second group is lazy at the end of
# the pattern and therefore always matches the empty string, so no spec is
# ever extracted. It is kept byte-identical because the intended markup
# cannot be recovered from this file - TODO restore the original pattern.
SPEC_RE = re.compile(r']*>(.*?)\s*]*>(.*?)', re.S | re.I)
TAG_RE = re.compile(r'<[^>]+>')
# A number must contain at least one digit, so a stray "." (as in "max. 3 W")
# is never returned as a value.
UNSIGNED_RE = re.compile(r"\d+(?:\.\d+)?|\.\d+")
SIGNED_RE = re.compile(r"-?(?:\d+(?:\.\d+)?|\.\d+)")
INT_RE = re.compile(r"\d+")


def log(msg):
    """Append *msg* to the LOG file and echo it on stderr."""
    with open(LOG, "a", encoding="utf-8") as f:
        f.write(msg + "\n")
    print(msg, file=sys.stderr)


def curl_get(url):
    """Fetch *url* with curl and return the response body as text.

    Returns whatever curl wrote to stdout (an empty string when curl
    produced no output). Undecodable bytes are replaced rather than raising,
    so one oddly encoded page cannot abort the whole run.
    """
    r = subprocess.run(
        ["curl", "-s", "-L", "--max-time", "15",
         "-H", "User-Agent: Mozilla/5.0 TIP-Bot/1.0", url],
        capture_output=True, encoding="utf-8", errors="replace",
    )
    return r.stdout


def esc(v):
    """Return str(v) with single quotes doubled for use in a SQL literal."""
    return str(v).replace("'", "''")


def first_number(v, signed=False):
    """Return the first decimal number found in *v* as a string, or None."""
    m = (SIGNED_RE if signed else UNSIGNED_RE).search(v)
    return m.group() if m else None


def unsigned_number(v):
    """Return the first unsigned number in *v*, or None."""
    return first_number(v)


def signed_number(v):
    """Return the first (optionally negative) number in *v*, or None."""
    return first_number(v, signed=True)


def first_int(v):
    """Return the first run of digits in *v*, or None."""
    m = INT_RE.search(v)
    return m.group() if m else None


def classify_temp_range(v):
    """Classify a temperature description as "COM", "IND" or "EXT".

    The checks run in this order and the first hit wins; anything
    unrecognised falls back to "COM".
    """
    # NOTE(review): "0°C to 70" is also a substring of "-40°C to 70", so such
    # a value is classified as COM before the "-40" check runs - confirm that
    # this is intended.
    if any(x in v for x in ["ommercial", "0°C to 70"]):
        return "COM"
    if any(x in v for x in ["ndustrial", "-40"]):
        return "IND"
    if "xtended" in v:
        return "EXT"
    return "COM"


def fec_or_none(v):
    """Return *v* unless it says there is no FEC ("no"/"none")."""
    return v if v.lower() not in ("no", "none") else None


def ddm_flag(v):
    """Return the SQL boolean literal "true" if *v* contains "yes", else "false"."""
    return "true" if "yes" in v.lower() else "false"


# (spec label, target column, optional converter). Order matters: for each
# column the first label that is present and yields a value wins.
MAPPING = [
    ("POWER CONSUMPTION", "power_consumption_w", unsigned_number),
    ("CONNECTOR / POLISH", "connector", None),
    ("CONNECTOR", "connector", None),
    ("MODULATION", "modulation", None),
    ("WAVELENGTH TX (TYPICAL)", "wavelengths", None),
    ("WAVELENGTH", "wavelengths", None),
    ("DISTANCE", "reach_label", None),
    ("TEMPERATURE RANGE", "temp_range", classify_temp_range),
    ("OPERATING TEMPERATURE", "temp_range", classify_temp_range),
    ("LANE COUNT", "lanes", first_int),
    ("BANDWIDTH PER LANE", "lane_rate", None),
    ("BANDWIDTH", "lane_rate", None),
    ("INBUILT FEC", "fec_type", fec_or_none),
    ("POWERBUDGET (DB)", "optical_budget_db", unsigned_number),
    ("TRANSMIT MIN/MAX PER LANE", "tx_power_min_dbm", signed_number),
    ("RECEIVER MIN/MAX PER LANE", "rx_sensitivity_dbm", signed_number),
    ("INTERFACE", "fiber_type", None),
    ("COMPLIANCE CODE", "ieee_reference", None),
    ("DIGITAL DIAGNOSTIC MONITORING (DDM)", "dom_support", ddm_flag),
]


def load_products(path):
    """Read the pipe-separated product list at *path*.

    Lines with fewer than three fields are ignored. The fourth field
    (standard name) is optional and defaults to the part number.
    """
    products = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("|")
            if len(parts) < 3:
                continue
            products.append({
                "id": parts[0],
                "url": parts[1],
                "part_number": parts[2],
                "standard_name": parts[3] if len(parts) > 3 else parts[2],
            })
    return products


def extract_image(html):
    """Return the first product image URL matched by IMAGE_RE, or None."""
    m = IMAGE_RE.search(html)
    return m.group(0) if m else None


def extract_specs(html):
    """Return {UPPERCASED LABEL: value} for every spec pair found in *html*.

    Tags inside label and value are stripped; pairs with an empty label or a
    placeholder value ("n/a", "-", empty) are dropped. A repeated label keeps
    its last value.
    """
    specs = {}
    for m in SPEC_RE.finditer(html):
        label = TAG_RE.sub('', m.group(1)).strip().upper()
        value = TAG_RE.sub('', m.group(2)).strip()
        if label and value and value.lower() not in ('n/a', '-', ''):
            specs[label] = value
    return specs


def map_specs(specs):
    """Translate scraped *specs* into a {column: value} dict.

    Labels are matched via MAPPING (first match per column wins). Specs that
    were not mapped and are shorter than 200 characters are collected into a
    "notes" column, truncated to 1000 characters.
    """
    cols = {}
    mapped_labels = set()
    for label, col, fn in MAPPING:
        if label not in specs or col in cols:
            continue
        try:
            val = fn(specs[label]) if fn else specs[label]
        except Exception as exc:
            # Best effort: a converter failure must not abort the run, but it
            # should be visible in the log instead of vanishing silently.
            log(f"  convert failed for {label!r}: {exc}")
            continue
        if val is not None:
            cols[col] = val
            mapped_labels.add(label)

    extra = [f"{k}: {v}" for k, v in specs.items()
             if k not in mapped_labels and len(v) < 200]
    if extra:
        cols["notes"] = "; ".join(extra)[:1000]
    return cols


def build_update_sql(product_id, cols):
    """Return the UPDATE statement that writes *cols* to row *product_id*.

    Every value, including the id, is emitted as an escaped SQL string
    literal; only dom_support is emitted bare because it holds a boolean
    literal.
    """
    sets = []
    for col, val in cols.items():
        if col == "dom_support":
            sets.append(f"{col} = {val}")
        else:
            sets.append(f"{col} = '{esc(val)}'")
    return f"UPDATE transceivers SET {', '.join(sets)} WHERE id = '{esc(product_id)}';"


def main():
    """Run the full scrape -> SQL file -> psql apply -> report pipeline."""
    log(f"{time.strftime('%Y-%m-%d %H:%M:%S')}: V4 start")

    # Read product list (already generated by v3)
    products = load_products(PRODUCT_LIST)
    total = len(products)
    log(f"Products: {total}")

    sql_lines = [f"-- V4 enrichment {time.strftime('%Y-%m-%d %H:%M')}",
                 f"-- Products: {total}", ""]
    ok = 0
    img_count = 0

    for i, p in enumerate(products):
        name = p["standard_name"] or p["part_number"]
        html = curl_get(p["url"])
        # Very short bodies are treated as failed/empty fetches.
        if len(html) < 500:
            log(f"[{i+1}/{total}] {name} SKIP (empty)")
            continue

        img = extract_image(html)
        specs = extract_specs(html)
        if not specs and not img:
            log(f"[{i+1}/{total}] {name} SKIP (no data)")
            continue

        cols = map_specs(specs)
        if img:
            cols["image_url"] = img
            img_count += 1
        if not cols:
            continue

        sql_lines.append(f"-- {name}")
        sql_lines.append(build_update_sql(p["id"], cols))
        sql_lines.append("")
        ok += 1

        if (i + 1) % 20 == 0:
            log(f"[{i+1}/{total}] {ok} enriched, {img_count} images so far")
        # Small pause between page fetches.
        time.sleep(0.3)

    sql_lines.append(f"-- Total: {ok}/{total} enriched, {img_count} images")
    with open(SQL_OUT, "w", encoding="utf-8") as f:
        f.write("\n".join(sql_lines))

    log(f"Generated: {ok}/{total} enriched, {img_count} images")
    log(f"SQL at: {SQL_OUT}")

    # Apply. The password is handed to psql through an explicit environment
    # instead of mutating this process's os.environ.
    log("Applying SQL...")
    psql_env = {**os.environ, "PGPASSWORD": PG_PASSWORD}
    r = subprocess.run(PSQL_BASE + ["-f", SQL_OUT],
                       capture_output=True, text=True, env=psql_env)
    if r.returncode != 0:
        # Connection/auth failures do not contain an upper-case "ERROR" line,
        # so report the exit status explicitly.
        log(f"psql exited with status {r.returncode}: {r.stderr.strip()[:500]}")
    errors = [line for line in r.stderr.split("\n") if "ERROR" in line]
    if errors:
        log(f"Errors: {len(errors)}")
        for e in errors[:5]:
            log(f"  {e}")

    # Restart API
    subprocess.run(["bash", "-c", "cd /opt/tip && pm2 restart tip-api"],
                   capture_output=True)

    # Final counts
    for query in [
        "SELECT 'img=' || count(*) FROM transceivers WHERE image_url IS NOT NULL",
        "SELECT 'conn=' || count(*) FROM transceivers WHERE connector IS NOT NULL",
        "SELECT 'notes=' || count(*) FROM transceivers WHERE notes IS NOT NULL AND notes != ''",
        "SELECT 'mod=' || count(*) FROM transceivers WHERE modulation IS NOT NULL",
        "SELECT 'power=' || count(*) FROM transceivers WHERE power_consumption_w IS NOT NULL",
        "SELECT 'lane=' || count(*) FROM transceivers WHERE lane_rate IS NOT NULL",
    ]:
        r = subprocess.run(PSQL_BASE + ["-t", "-A", "-c", query],
                           capture_output=True, text=True, env=psql_env)
        log(r.stdout.strip())

    log(f"{time.strftime('%Y-%m-%d %H:%M:%S')}: V4 DONE")


if __name__ == "__main__":
    main()