transceiver-db/scripts/enrich-v4.py

183 lines
6.6 KiB
Python

#!/usr/bin/env python3
"""V4: Flexoptix enrichment - runs directly on Erik with psql + curl."""
import subprocess
import re
import sys
import time
import os
# Shell-style psql command line with the password inline.
# NOTE(review): not referenced anywhere in the visible part of this script
# (the apply step builds its own psql argument list) - confirm before removing.
# NOTE(review): this embeds a credential in source; consider reading it from
# the environment or a ~/.pgpass file instead.
DB_CMD = "PGPASSWORD=***REDACTED*** psql -h localhost -p 5433 -U tip -d transceiver_db"
# File the generated UPDATE statements are written to and later passed to psql -f.
SQL_OUT = "/tmp/enrichment-v4.sql"
# Log file that log() appends to.
LOG = "/tmp/enrich-v4.log"
def log(msg):
    """Record a progress message.

    The message is appended, newline-terminated, to the file named by LOG
    and then echoed to standard error.
    """
    with open(LOG, "a") as log_file:
        entry = msg + "\n"
        log_file.write(entry)
    print(msg, file=sys.stderr)
def curl_get(url):
    """Fetch *url* with curl and return the response body as text.

    curl runs silently (-s), follows redirects (-L), gives up after 15
    seconds and sends the TIP-Bot User-Agent header.  curl's exit status is
    not inspected: whatever it wrote to stdout (typically nothing on
    failure) is returned.

    The output is decoded explicitly as UTF-8 with undecodable bytes
    replaced, so a single badly encoded page, or a non-UTF-8 process
    locale, cannot raise UnicodeDecodeError and abort the whole run.
    """
    cmd = [
        "curl", "-s", "-L", "--max-time", "15",
        "-H", "User-Agent: Mozilla/5.0 TIP-Bot/1.0",
        url,
    ]
    # Passing encoding= puts the pipes in text mode, exactly like text=True,
    # but without depending on the locale's preferred encoding.
    result = subprocess.run(
        cmd, capture_output=True, encoding="utf-8", errors="replace"
    )
    return result.stdout
def esc(v):
    """Return str(v) with every single quote doubled.

    The result is meant to be embedded inside a single-quoted SQL string
    literal.
    """
    text = str(v)
    return "''".join(text.split("'"))
log(f"{time.strftime('%Y-%m-%d %H:%M:%S')}: V4 start")

# Load the product list (produced earlier by the v3 script).  Each usable
# line is pipe-delimited: id|url|part_number[|standard_name].  Lines with
# fewer than three fields, which includes blank lines, are ignored.
products = []
with open("/tmp/fo-list.txt") as listing:
    for raw in listing:
        fields = raw.strip().split("|")
        if len(fields) < 3:
            continue
        row_id, url, part_number = fields[:3]
        products.append({
            "id": row_id,
            "url": url,
            "part_number": part_number,
            # Fall back to the part number when no fourth field is present.
            "standard_name": fields[3] if len(fields) > 3 else part_number,
        })
log(f"Products: {len(products)}")

# Header of the generated SQL script, followed by the run counters
# (products enriched, images found).
sql_lines = [
    f"-- V4 enrichment {time.strftime('%Y-%m-%d %H:%M')}",
    f"-- Products: {len(products)}",
    "",
]
ok = img_count = 0
# A decimal number: digits with an optional fractional part, or a bare
# fraction such as ".5".  Unlike the character class [\d.]+ this never
# matches a lone "." (as found in "approx. 3.5 W" or "n.a."), which would
# otherwise be written into a numeric column.
_UNSIGNED_NUMBER_RE = re.compile(r"(?:\d+(?:\.\d+)?|\.\d+)")
_SIGNED_NUMBER_RE = re.compile(r"-?(?:\d+(?:\.\d+)?|\.\d+)")
_INTEGER_RE = re.compile(r"\d+")


def _first_match(pattern, text):
    """Return the first match of *pattern* in *text*, or None if absent."""
    found = pattern.search(text)
    return found.group() if found else None


def _unsigned_number(v):
    """Return the first unsigned decimal number in *v* as a string, or None."""
    return _first_match(_UNSIGNED_NUMBER_RE, v)


def _signed_number(v):
    """Return the first decimal number in *v*, keeping a leading minus, or None."""
    return _first_match(_SIGNED_NUMBER_RE, v)


def _first_integer(v):
    """Return the first run of digits in *v* as a string, or None."""
    return _first_match(_INTEGER_RE, v)


def _temp_class(v):
    """Classify a temperature-range description as "COM", "IND" or "EXT".

    Checks run in this order and the first hit wins; anything unrecognised
    is treated as "COM".
    """
    # NOTE(review): "0°C to 70" is also a substring of ranges such as
    # "-10°C to 70", which are therefore classed COM - confirm this is intended.
    if "ommercial" in v or "0°C to 70" in v:
        return "COM"
    if "ndustrial" in v or "-40" in v:
        return "IND"
    if "xtended" in v:
        return "EXT"
    return "COM"


def _fec_type(v):
    """Return *v* unchanged unless it says "no"/"none" (any case), then None."""
    return None if v.lower() in ("no", "none") else v


def _dom_flag(v):
    """Return the string "true" if *v* contains "yes" (any case), else "false"."""
    return "true" if "yes" in v.lower() else "false"


# Ordered (spec label, column name, converter) triples.
#   * label     - upper-cased spec label to look up.
#   * column    - column name the value is stored under.
#   * converter - callable taking the raw value and returning the value to
#                 store, or None to store nothing; a converter of None means
#                 the raw value is stored verbatim.
# Several labels target the same column; the more specific label is listed
# first because the consumer keeps the first match per column.
MAPPING = [
    ("POWER CONSUMPTION", "power_consumption_w", _unsigned_number),
    ("CONNECTOR / POLISH", "connector", None),
    ("CONNECTOR", "connector", None),
    ("MODULATION", "modulation", None),
    ("WAVELENGTH TX (TYPICAL)", "wavelengths", None),
    ("WAVELENGTH", "wavelengths", None),
    ("DISTANCE", "reach_label", None),
    ("TEMPERATURE RANGE", "temp_range", _temp_class),
    ("OPERATING TEMPERATURE", "temp_range", _temp_class),
    ("LANE COUNT", "lanes", _first_integer),
    ("BANDWIDTH PER LANE", "lane_rate", None),
    ("BANDWIDTH", "lane_rate", None),
    ("INBUILT FEC", "fec_type", _fec_type),
    ("POWERBUDGET (DB)", "optical_budget_db", _unsigned_number),
    ("TRANSMIT MIN/MAX PER LANE", "tx_power_min_dbm", _signed_number),
    ("RECEIVER MIN/MAX PER LANE", "rx_sensitivity_dbm", _signed_number),
    ("INTERFACE", "fiber_type", None),
    ("COMPLIANCE CODE", "ieee_reference", None),
    ("DIGITAL DIAGNOSTIC MONITORING (DDM)", "dom_support", _dom_flag),
]
# Patterns are compiled once rather than re-evaluated for every product page.
_IMAGE_RE = re.compile(r'https://[^"\s]+/cache/[^"\s]+_A_[^"\s]+\.jpg')
_SPEC_ROW_RE = re.compile(r'<th[^>]*>(.*?)</th>\s*<td[^>]*>(.*?)</td>', re.S | re.I)
_TAG_RE = re.compile(r'<[^>]+>')


def _extract_specs(page):
    """Return {UPPER-CASED LABEL: value} for each <th>/<td> pair in *page*.

    Markup inside the cells is stripped.  Pairs with an empty label, an
    empty value, or a placeholder value ("n/a", "-") are dropped.  When a
    label occurs more than once the last occurrence wins.
    """
    specs = {}
    for row in _SPEC_ROW_RE.finditer(page):
        label = _TAG_RE.sub('', row.group(1)).strip().upper()
        value = _TAG_RE.sub('', row.group(2)).strip()
        if label and value and value.lower() not in ('n/a', '-', ''):
            specs[label] = value
    return specs


def _map_columns(specs, context):
    """Translate scraped *specs* into a {column: value} dict via MAPPING.

    The first MAPPING entry that yields a value for a column wins.  Specs
    that were not consumed, and whose value is shorter than 200 characters,
    are collected into a "notes" column capped at 1000 characters.
    *context* only prefixes the log line written when a converter raises.
    """
    cols = {}
    mapped_labels = set()
    for label, col, fn in MAPPING:
        if label not in specs or col in cols:
            continue
        try:
            val = fn(specs[label]) if fn else specs[label]
        except Exception as exc:
            # Stay best-effort (one bad value must not stop the run), but
            # leave a trace instead of dropping the failure silently.
            log(f"{context} {label}: converter failed: {exc!r}")
            continue
        if val is not None:
            cols[col] = val
            mapped_labels.add(label)
    # Unmapped -> notes
    extra = [
        f"{k}: {v}"
        for k, v in specs.items()
        if k not in mapped_labels and len(v) < 200
    ]
    if extra:
        cols["notes"] = "; ".join(extra)[:1000]
    return cols


def _update_statement(row_id, cols):
    """Build the UPDATE statement that writes *cols* to the row *row_id*."""
    assignments = []
    for col, val in cols.items():
        if col == "dom_support":
            # Emitted unquoted, as a bare SQL boolean literal.
            assignments.append(f"{col} = {val}")
        else:
            assignments.append(f"{col} = '{esc(val)}'")
    # The id comes from a text file, so it is escaped like any other literal.
    return (
        f"UPDATE transceivers SET {', '.join(assignments)} "
        f"WHERE id = '{esc(row_id)}';"
    )


for i, p in enumerate(products):
    name = p["standard_name"] or p["part_number"]
    tag = f"[{i+1}/{len(products)}]"
    page = curl_get(p["url"])
    # Throttle after every request, including ones that end up skipped, so
    # a run of failing pages cannot turn into back-to-back requests.
    time.sleep(0.3)
    if len(page) < 500:
        log(f"{tag} {name} SKIP (empty)")
        continue
    # Extract image: the first matching URL on the page, if any.
    found = _IMAGE_RE.search(page)
    img = found.group(0) if found else None
    # Extract specs
    specs = _extract_specs(page)
    if not specs and not img:
        log(f"{tag} {name} SKIP (no data)")
        continue
    cols = _map_columns(specs, f"{tag} {name}")
    if img:
        cols["image_url"] = img
        img_count += 1
    if not cols:
        continue
    # Build SQL
    sql_lines.append(f"-- {name}")
    sql_lines.append(_update_statement(p["id"], cols))
    sql_lines.append("")
    ok += 1
    if (i + 1) % 20 == 0:
        log(f"[{i+1}/{len(products)}] {ok} enriched, {img_count} images so far")
# Write the generated statements to SQL_OUT.
sql_lines.append(f"-- Total: {ok}/{len(products)} enriched, {img_count} images")
with open(SQL_OUT, "w") as f:
    f.write("\n".join(sql_lines))
log(f"Generated: {ok}/{len(products)} enriched, {img_count} images")
log(f"SQL at: {SQL_OUT}")

# Apply
log("Applying SQL...")
# NOTE(review): hard-coded credential; consider taking it from the
# environment or ~/.pgpass.  It is set once here and inherited by every
# psql child process started below.
os.environ["PGPASSWORD"] = "***REDACTED***"
# Connection arguments shared by all psql invocations.
PSQL = ["psql", "-h", "localhost", "-p", "5433", "-U", "tip", "-d", "transceiver_db"]
r = subprocess.run(PSQL + ["-f", SQL_OUT], capture_output=True, text=True)
# Per-statement failures are reported on stderr as "ERROR: ..." lines.
errors = [line for line in r.stderr.split("\n") if "ERROR" in line]
if errors:
    log(f"Errors: {len(errors)}")
    for e in errors[:5]:
        log(f" {e}")
# A non-zero exit (e.g. psql could not connect) produces no "ERROR" line,
# so it is reported separately rather than passing as a clean run.
if r.returncode != 0:
    log(f"psql exited with status {r.returncode}: {r.stderr.strip()[:500]}")

# Restart API
restart = subprocess.run(
    ["bash", "-c", "cd /opt/tip && pm2 restart tip-api"], capture_output=True
)
if restart.returncode != 0:
    log(f"pm2 restart failed with status {restart.returncode}")

# Final counts: one line per column, e.g. "img=<n>".
for query in [
    "SELECT 'img=' || count(*) FROM transceivers WHERE image_url IS NOT NULL",
    "SELECT 'conn=' || count(*) FROM transceivers WHERE connector IS NOT NULL",
    "SELECT 'notes=' || count(*) FROM transceivers WHERE notes IS NOT NULL AND notes != ''",
    "SELECT 'mod=' || count(*) FROM transceivers WHERE modulation IS NOT NULL",
    "SELECT 'power=' || count(*) FROM transceivers WHERE power_consumption_w IS NOT NULL",
    "SELECT 'lane=' || count(*) FROM transceivers WHERE lane_rate IS NOT NULL",
]:
    # -t -A: tuples only, unaligned, so stdout is just the value.
    r = subprocess.run(PSQL + ["-t", "-A", "-c", query], capture_output=True, text=True)
    if r.returncode != 0:
        log(f"count query failed with status {r.returncode}: {r.stderr.strip()[:200]}")
    else:
        log(r.stdout.strip())
log(f"{time.strftime('%Y-%m-%d %H:%M:%S')}: V4 DONE")