#!/usr/bin/env python3
"""V4: Flexoptix enrichment - runs directly on Erik with psql + curl."""

import html
import os
import re
import subprocess
import sys
import time

# Shell-style psql invocation with the database password inlined.
# NOTE(review): nothing visible in this script reads DB_CMD - confirm before
# removing it.
DB_CMD = (
    "PGPASSWORD=tip_prod_2026 psql"
    " -h localhost -p 5433 -U tip -d transceiver_db"
)

# File the generated UPDATE statements are written to before being applied.
SQL_OUT = "/tmp/enrichment-v4.sql"

# File that log() appends its messages to.
LOG = "/tmp/enrich-v4.log"


def log(msg):
    """Append ``msg`` to the log file and echo it to stderr.

    Args:
        msg: Text of the log entry, without a trailing newline.
    """
    # Messages carry scraped product names, which may contain non-ASCII
    # characters. Pinning the encoding keeps the write from failing when the
    # script is started under a non-UTF-8 locale.
    with open(LOG, "a", encoding="utf-8") as f:
        f.write(msg + "\n")
    print(msg, file=sys.stderr)


def curl_get(url):
    """Fetch ``url`` with the ``curl`` binary and return what it wrote to stdout.

    curl is run silently (``-s``), follows redirects (``-L``), is limited to
    15 seconds (``--max-time 15``) and sends a fixed User-Agent header.

    Args:
        url: Address of the page to download.

    Returns:
        The captured standard output as text. curl's exit status is not
        inspected, so a failed request yields whatever curl printed
        (callers treat very short results as "empty").
    """
    r = subprocess.run(
        ["curl", "-s", "-L", "--max-time", "15",
         "-H", "User-Agent: Mozilla/5.0 TIP-Bot/1.0", url],
        capture_output=True,
        # Decode explicitly instead of relying on the locale, and replace
        # undecodable bytes: a single odd byte in one page must not raise
        # UnicodeDecodeError and abort the whole enrichment run.
        encoding="utf-8",
        errors="replace",
    )
    return r.stdout


def esc(v):
    """Return ``str(v)`` with every single quote doubled.

    The result is meant to be placed between single quotes in the generated
    SQL statements.
    """
    text = str(v)
    # Splitting on the quote and re-joining with two quotes doubles each one.
    return "''".join(text.split("'"))


log(f"{time.strftime('%Y-%m-%d %H:%M:%S')}: V4 start")

# Load the product list (already generated by v3).
# Each usable row is pipe-delimited: id | url | part_number [| standard_name].
products = []
with open("/tmp/fo-list.txt") as f:
    for row in f:
        fields = row.strip().split("|")
        # Blank rows split into a single empty field, so this one check drops
        # both blank rows and rows that lack one of the three required fields.
        if len(fields) < 3:
            continue
        products.append({
            "id": fields[0],
            "url": fields[1],
            "part_number": fields[2],
            # Without a fourth field the part number doubles as the name.
            "standard_name": fields[3] if len(fields) > 3 else fields[2],
        })

log(f"Products: {len(products)}")

# Lines of the SQL script being generated, starting with a comment header
# (timestamp, number of products) followed by an empty line.
sql_lines = [
    f"-- V4 enrichment {time.strftime('%Y-%m-%d %H:%M')}",
    f"-- Products: {len(products)}",
    "",
]

# Running totals: products that produced an UPDATE, and products with an image.
ok = img_count = 0

# Number patterns require at least one digit, so a stray "." (as in
# "max. 3.5 W" or "n.a.") is never captured as the value.
_UNSIGNED_NUMBER = re.compile(r"\d+(?:\.\d+)?|\.\d+")
_SIGNED_NUMBER = re.compile(r"-?(?:\d+(?:\.\d+)?|\.\d+)")
_INTEGER = re.compile(r"\d+")
# "0°C to 70" only counts as the commercial range when the 0 is not the tail
# of another number: "-40°C to 70" also contains that substring.
_COMMERCIAL_RANGE = re.compile(r"(?<![\d.-])0°C to 70")


def _first_match(pattern):
    """Build a converter returning the first ``pattern`` match in a value, or None."""
    def convert(value):
        found = pattern.search(value)
        return found.group() if found else None
    return convert


def _temp_class(value):
    """Classify a temperature-range text as "COM", "IND" or "EXT" (default "COM")."""
    if "ommercial" in value or _COMMERCIAL_RANGE.search(value):
        return "COM"
    if "ndustrial" in value or "-40" in value:
        return "IND"
    if "xtended" in value:
        return "EXT"
    return "COM"


def _fec_type(value):
    """Return the FEC text unchanged, or None when it only says "no"/"none"."""
    return None if value.lower() in ("no", "none") else value


def _dom_flag(value):
    """Return the SQL boolean literal "true" when the text contains "yes", else "false"."""
    return "true" if "yes" in value.lower() else "false"


# (scraped label, target column, converter) triples.
#   * The label is compared against the upper-cased spec label.
#   * A converter of None stores the scraped text as-is; a converter that
#     returns None leaves the column unset.
#   * Order matters: the consumer keeps the first entry that yields a value
#     for a column, so specific labels come before their generic fallbacks.
MAPPING = [
    ("POWER CONSUMPTION", "power_consumption_w", _first_match(_UNSIGNED_NUMBER)),
    ("CONNECTOR / POLISH", "connector", None),
    ("CONNECTOR", "connector", None),
    ("MODULATION", "modulation", None),
    ("WAVELENGTH TX (TYPICAL)", "wavelengths", None),
    ("WAVELENGTH", "wavelengths", None),
    ("DISTANCE", "reach_label", None),
    ("TEMPERATURE RANGE", "temp_range", _temp_class),
    ("OPERATING TEMPERATURE", "temp_range", _temp_class),
    ("LANE COUNT", "lanes", _first_match(_INTEGER)),
    ("BANDWIDTH PER LANE", "lane_rate", None),
    ("BANDWIDTH", "lane_rate", None),
    ("INBUILT FEC", "fec_type", _fec_type),
    ("POWERBUDGET (DB)", "optical_budget_db", _first_match(_UNSIGNED_NUMBER)),
    # For the two "MIN/MAX" rows the first number in the text is the one kept.
    ("TRANSMIT MIN/MAX PER LANE", "tx_power_min_dbm", _first_match(_SIGNED_NUMBER)),
    ("RECEIVER MIN/MAX PER LANE", "rx_sensitivity_dbm", _first_match(_SIGNED_NUMBER)),
    ("INTERFACE", "fiber_type", None),
    ("COMPLIANCE CODE", "ieee_reference", None),
    ("DIGITAL DIAGNOSTIC MONITORING (DDM)", "dom_support", _dom_flag),
]

# Patterns are compiled once instead of on every product page.
# Image: an https URL under /cache/ whose name contains "_A_" and ends in .jpg.
_IMAGE_RE = re.compile(r'https://[^"\s]+/cache/[^"\s]+_A_[^"\s]+\.jpg')
# Spec row: a <th> label cell directly followed by its <td> value cell.
_SPEC_ROW_RE = re.compile(r'<th[^>]*>(.*?)</th>\s*<td[^>]*>(.*?)</td>', re.S | re.I)
_TAG_RE = re.compile(r'<[^>]+>')


def _cell_text(fragment):
    """Return the text of a table cell: tags removed, HTML entities decoded."""
    # Tags are stripped first so that a decoded "&lt;" is never taken for markup.
    return html.unescape(_TAG_RE.sub('', fragment)).strip()


def _extract_specs(page):
    """Return ``{UPPER-CASED LABEL: value}`` for the spec rows found in ``page``.

    Rows with an empty label, or whose value is empty, "n/a" or "-", are
    dropped. When a label occurs more than once the last occurrence is kept.
    """
    specs = {}
    for row in _SPEC_ROW_RE.finditer(page):
        label = _cell_text(row.group(1)).upper()
        value = _cell_text(row.group(2))
        if label and value and value.lower() not in ('n/a', '-'):
            specs[label] = value
    return specs


def _map_specs(specs, name):
    """Translate scraped specs into ``{column: value}`` using MAPPING.

    The first MAPPING entry that yields a value for a column wins. Specs that
    were not mapped and are shorter than 200 characters are collected into a
    ``notes`` column, truncated to 1000 characters.

    Args:
        specs: Mapping of upper-cased label to scraped value.
        name: Product name, used only in log messages.
    """
    cols = {}
    mapped_labels = set()
    for label, col, fn in MAPPING:
        if label not in specs or col in cols:
            continue
        try:
            val = fn(specs[label]) if fn else specs[label]
        except Exception as exc:
            # Best effort: one unparsable value must not abort the run, but it
            # is reported instead of being dropped silently.
            log(f"  {name}: cannot convert {label}={specs[label]!r}: {exc}")
            continue
        if val is not None:
            cols[col] = val
            mapped_labels.add(label)

    extra = [f"{k}: {v}" for k, v in specs.items()
             if k not in mapped_labels and len(v) < 200]
    if extra:
        cols["notes"] = "; ".join(extra)[:1000]
    return cols


def _scrape_columns(p, position):
    """Download one product page and return the columns to update.

    Args:
        p: Product dict with "url", "part_number" and "standard_name".
        position: "[n/total]" prefix for log messages.

    Returns:
        ``{column: value}``; empty when the product is skipped.
    """
    name = p["standard_name"] or p["part_number"]
    page = curl_get(p["url"])
    # Responses under 500 characters are treated as "nothing usable came back".
    if len(page) < 500:
        log(f"{position} {name} SKIP (empty)")
        return {}

    image = _IMAGE_RE.search(page)
    specs = _extract_specs(page)
    if not specs and not image:
        log(f"{position} {name} SKIP (no data)")
        return {}

    cols = _map_specs(specs, name)
    if image:
        cols["image_url"] = image.group(0)
    return cols


def _update_sql(product_id, cols):
    """Build the UPDATE statement that stores ``cols`` for ``product_id``."""
    sets = [
        # dom_support holds the literal true/false and is written unquoted;
        # every other value is emitted as an escaped string literal.
        f"{col} = {val}" if col == "dom_support" else f"{col} = '{esc(val)}'"
        for col, val in cols.items()
    ]
    # The id comes from the list file, so it is escaped like any other value.
    return (f"UPDATE transceivers SET {', '.join(sets)} "
            f"WHERE id = '{esc(product_id)}';")


for i, p in enumerate(products):
    cols = _scrape_columns(p, f"[{i+1}/{len(products)}]")
    if cols:
        if "image_url" in cols:
            img_count += 1
        name = p["standard_name"] or p["part_number"]
        sql_lines.append(f"-- {name}")
        sql_lines.append(_update_sql(p["id"], cols))
        sql_lines.append("")
        ok += 1

    # Progress report and request throttling apply to every product, including
    # skipped ones, so a run of failing pages is still rate-limited.
    if (i + 1) % 20 == 0:
        log(f"[{i+1}/{len(products)}] {ok} enriched, {img_count} images so far")
    time.sleep(0.3)

# Close the script with a summary comment, then write it out in one go.
sql_lines.append(f"-- Total: {ok}/{len(products)} enriched, {img_count} images")

# The statements contain scraped text that may include non-ASCII characters
# (e.g. "°C"); pin the encoding so the write does not depend on the locale.
with open(SQL_OUT, "w", encoding="utf-8") as f:
    f.write("\n".join(sql_lines))

log(f"Generated: {ok}/{len(products)} enriched, {img_count} images")
log(f"SQL at: {SQL_OUT}")

# Apply the generated script with psql.
log("Applying SQL...")
# NOTE(review): hard-coded credential; exported process-wide so that psql
# (and every later child process) picks it up from the environment.
os.environ["PGPASSWORD"] = "tip_prod_2026"
r = subprocess.run(
    ["psql", "-h", "localhost", "-p", "5433", "-U", "tip", "-d", "transceiver_db", "-f", SQL_OUT],
    capture_output=True, text=True
)

# A failure of psql itself (e.g. it could not connect) shows up as a non-zero
# exit status and does not necessarily print a line containing "ERROR", so
# the status is checked in addition to scanning stderr.
if r.returncode != 0:
    log(f"psql exited with status {r.returncode}: {r.stderr.strip()[:500]}")

# Failed statements are reported on stderr as lines containing "ERROR";
# the count and the first five are logged.
errors = [line for line in r.stderr.split("\n") if "ERROR" in line]
if errors:
    log(f"Errors: {len(errors)}")
    for e in errors[:5]:
        log(f"  {e}")

# Restart API so it serves the freshly written data.
restart = subprocess.run(
    ["bash", "-c", "cd /opt/tip && pm2 restart tip-api"],
    capture_output=True,
)
# The outcome used to be discarded; a failed restart is now reported. Output
# is captured as bytes and decoded leniently because it is only used for logs.
if restart.returncode != 0:
    detail = restart.stderr.decode("utf-8", "replace").strip()[:500]
    log(f"API restart failed (exit {restart.returncode}): {detail}")

# Final counts: for each enriched column, log "<tag>=<number of rows>" for the
# rows that now satisfy the given condition.
count_checks = (
    ("img", "image_url IS NOT NULL"),
    ("conn", "connector IS NOT NULL"),
    ("notes", "notes IS NOT NULL AND notes != ''"),
    ("mod", "modulation IS NOT NULL"),
    ("power", "power_consumption_w IS NOT NULL"),
    ("lane", "lane_rate IS NOT NULL"),
)
# Environment for psql: the current one plus the database password.
psql_env = dict(os.environ, PGPASSWORD="tip_prod_2026")

for tag, condition in count_checks:
    query = f"SELECT '{tag}=' || count(*) FROM transceivers WHERE {condition}"
    # -t -A: tuples only, unaligned, so stdout is just the "tag=N" value.
    r = subprocess.run(
        ["psql", "-h", "localhost", "-p", "5433", "-U", "tip",
         "-d", "transceiver_db", "-t", "-A", "-c", query],
        capture_output=True,
        text=True,
        env=psql_env,
    )
    log(r.stdout.strip())

log(f"{time.strftime('%Y-%m-%d %H:%M:%S')}: V4 DONE")