# transceiver-db/scripts/scrape-flexoptix-enrichment.py
#
# 215 lines
# 7.5 KiB
# Python

#!/usr/bin/env python3
"""
Scrape Flexoptix product pages to enrich transceiver data.
Extracts: product image, all spec fields from the spec table.
Outputs SQL UPDATE statements to apply to the DB.
Uses curl subprocess to avoid Python 3.14 SSL issues.
"""
import subprocess
import time
import json
import sys
import re
import os
from bs4 import BeautifulSoup
# Base URL of the transceiver-db HTTP API queried by get_flexoptix_transceivers().
API = "https://transceiver-db.context-x.org"
# Absolute path of the directory that contains this script.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Generated SQL is written to the "sql" directory that sits next to the
# script's own directory (i.e. <parent of SCRIPT_DIR>/sql/).
OUTPUT_SQL = os.path.join(os.path.dirname(SCRIPT_DIR), "sql", "011-flexoptix-enrichment.sql")
# Pause, in seconds, after each processed product page in main().
DELAY = 0.2
# Substring an <img> URL must contain to be chosen as the preferred product
# image in scrape_product_page().
# NOTE(review): presumably the shop's image-cache directory hash -- confirm it
# is still current before re-running the scraper.
CACHE_HASH = "bd7a52a6ab629d9c2973634d6ae35193"
def curl_get(url, max_time=10):
    """Fetch *url* by shelling out to ``curl`` and return the response body.

    Args:
        url: Address to request; redirects are followed (``-L``).
        max_time: Overall time limit in seconds, passed to ``--max-time``.

    Returns:
        The text curl wrote to stdout. curl's exit status is not inspected,
        so a failed transfer simply yields whatever (possibly empty) output
        was produced.
    """
    command = [
        "curl",
        "-s",
        "-L",
        "--max-time",
        str(max_time),
        "-H",
        "User-Agent: Mozilla/5.0 TIP-Bot/1.0",
        url,
    ]
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.stdout
def get_flexoptix_transceivers(max_consecutive_failures=5):
    """Page through the transceiver API and collect the FLEXOPTIX entries.

    The listing endpoint is read 25 records at a time. Each page is tried up
    to three times with a growing pause (5 s, 10 s, 15 s); a page that still
    cannot be parsed is skipped.

    Args:
        max_consecutive_failures: Give up after this many pages in a row
            could not be fetched. Without such a cap an unreachable API would
            make the loop skip pages forever.

    Returns:
        A list of the record dicts whose ``vendor_name`` is ``"FLEXOPTIX"``
        and that carry a non-empty ``product_page_url``. The list may be
        incomplete if pages were skipped or the listing was aborted.
    """
    page_size = 25
    all_tx = []
    offset = 0
    failed_pages = 0
    while True:
        data = None
        for attempt in range(3):
            raw = curl_get(f"{API}/api/transceivers?limit={page_size}&offset={offset}", max_time=60)
            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                parsed = None
            # A body that parses but is not a JSON object (e.g. a bare list
            # or null) cannot be read with .get() below, so treat it like a
            # parse failure and retry.
            if isinstance(parsed, dict):
                data = parsed
                break
            wait = 5 * (attempt + 1)
            print(f" JSON error at offset {offset}, attempt {attempt+1}/3, wait {wait}s", file=sys.stderr)
            time.sleep(wait)

        if data is None:
            failed_pages += 1
            print(f" SKIP offset {offset} after 3 attempts", file=sys.stderr)
            if failed_pages >= max_consecutive_failures:
                # The API is evidently down; stop instead of looping forever.
                print(f" ABORT listing after {failed_pages} consecutive failed pages", file=sys.stderr)
                break
            offset += page_size
            continue
        failed_pages = 0

        items = data.get("data") or []
        if not items:
            break
        for t in items:
            if t.get("vendor_name") == "FLEXOPTIX" and t.get("product_page_url"):
                all_tx.append(t)
        offset += page_size
        time.sleep(0.5)
        # A short page means the end of the listing was reached.
        if len(items) < page_size:
            break
    return all_tx
def scrape_product_page(url):
    """Download a product page and extract its image URL and spec-table rows.

    Args:
        url: Product page address, fetched through ``curl_get``.

    Returns:
        ``{"specs": {LABEL: value, ...}, "image_url": str | None}``, or
        ``None`` when the page is missing, shorter than 1000 characters, or
        any exception occurs while fetching/parsing (the error is printed to
        stderr).
    """

    def first_image(soup, accepts):
        # First <img> whose src (or data-src when src is empty) is accepted.
        for tag in soup.find_all("img"):
            candidate = tag.get("src", "") or tag.get("data-src", "")
            if accepts(candidate):
                return candidate
        return None

    try:
        page = curl_get(url)
        if not page or len(page) < 1000:
            return None
        soup = BeautifulSoup(page, "lxml")

        # Preferred: a cached "_A_" JPEG; otherwise any "_A_" catalog image.
        image_url = first_image(
            soup,
            lambda s: CACHE_HASH in s and "_A_" in s and s.endswith(".jpg"),
        ) or first_image(
            soup,
            lambda s: "/media/catalog/product/" in s and "_A_" in s,
        )

        # Every table row with at least two cells is read as "label | value".
        # Labels are upper-cased; a later row with the same label overwrites
        # an earlier one.
        specs = {}
        for table in soup.find_all("table"):
            for row in table.find_all("tr"):
                cells = row.find_all(["th", "td"])
                if len(cells) < 2:
                    continue
                label = cells[0].get_text(strip=True).upper()
                value = cells[1].get_text(strip=True)
                if not label or not value or value.lower() in ("n/a", "-", ""):
                    continue
                specs[label] = value
        return {"specs": specs, "image_url": image_url}
    except Exception as e:
        print(f" Error: {e}", file=sys.stderr)
        return None
def escape_sql(val):
    """Render *val* as a SQL literal for the generated script.

    ``None`` becomes the bare keyword ``NULL``. Anything else is converted
    with ``str()`` and wrapped in single quotes, with every single quote and
    every backslash doubled.

    NOTE(review): doubling backslashes is only correct for SQL dialects that
    treat ``\\`` as an escape character inside string literals -- confirm
    against the target database.
    """
    if val is None:
        return "NULL"
    doubled = str.maketrans({"'": "''", "\\": "\\\\"})
    return f"'{str(val).translate(doubled)}'"
# Number patterns require at least one digit, so a stray "." (as in
# "max. 3.5 W") is never mistaken for a value.
_UNSIGNED_NUMBER = re.compile(r"\d+(?:\.\d+)?|\.\d+")
_SIGNED_NUMBER = re.compile(r"-?(?:\d+(?:\.\d+)?|\.\d+)")
_INTEGER = re.compile(r"\d+")


def _first_match(pattern, text):
    """Return the first substring of *text* matching *pattern*, else None."""
    found = pattern.search(text)
    return found.group(0) if found else None


def _unsigned_number(text):
    """Extract the first unsigned decimal number from *text*."""
    return _first_match(_UNSIGNED_NUMBER, text)


def _signed_number(text):
    """Extract the first (optionally negative) decimal number from *text*."""
    return _first_match(_SIGNED_NUMBER, text)


def _integer(text):
    """Extract the first run of digits from *text*."""
    return _first_match(_INTEGER, text)


def _verbatim(text):
    """Use the scraped value unchanged."""
    return text


def _fec_or_none(text):
    """Keep the FEC description unless the page says there is none."""
    return text if text.lower() not in ("no", "none") else None


def _yes_to_sql_bool(text):
    """Translate a yes/no style value into a bare SQL boolean literal."""
    return "true" if "yes" in text.lower() else "false"


# (spec label, target column, transform), in priority order: when several
# labels feed the same column, the entry listed first wins and later ones
# act only as fallbacks.
_SPEC_MAP = (
    ("POWER CONSUMPTION", "power_consumption_w", _unsigned_number),
    ("CONNECTOR / POLISH", "connector", _verbatim),
    ("CONNECTOR", "connector", _verbatim),
    ("MODULATION", "modulation", _verbatim),
    ("WAVELENGTH TX (TYPICAL)", "wavelengths", _verbatim),
    ("WAVELENGTH", "wavelengths", _verbatim),
    ("DISTANCE", "reach_label", _verbatim),
    ("TEMPERATURE RANGE", "temp_range", _verbatim),
    ("OPERATING TEMPERATURE", "temp_range", _verbatim),
    ("LANE COUNT", "lanes", _integer),
    ("BANDWIDTH PER LANE", "lane_rate", _verbatim),
    ("BANDWIDTH", "lane_rate", _verbatim),  # fallback
    ("INBUILT FEC", "fec_type", _fec_or_none),
    ("POWERBUDGET (DB)", "optical_budget_db", _unsigned_number),
    ("TRANSMIT MIN/MAX PER LANE", "tx_power_min_dbm", _signed_number),
    ("RECEIVER MIN/MAX PER LANE", "rx_sensitivity_dbm", _signed_number),
    ("INTERFACE", "fiber_type", _verbatim),
    ("COMPLIANCE CODE", "ieee_reference", _verbatim),
    ("DIGITAL DIAGNOSTIC MONITORING (DDM)", "dom_support", _yes_to_sql_bool),
)
_MAPPED_LABELS = frozenset(label for label, _, _ in _SPEC_MAP)


def map_spec_to_columns(specs):
    """Translate scraped spec-table entries into column updates.

    Args:
        specs: Mapping of upper-cased spec label to its text value, as
            produced by ``scrape_product_page``.

    Returns:
        A dict of column name to value (all strings). Labels listed in the
        spec map fill their column; when two labels target the same column
        the one listed earlier in the map takes precedence regardless of the
        order on the page. A transform that yields ``None`` leaves the column
        unset so a fallback label can still supply it. All other labels whose
        value is shorter than 200 characters are joined as ``LABEL: value``
        pairs into ``notes``, truncated to 1000 characters.
    """
    updates = {}
    for label, column, transform in _SPEC_MAP:
        if column in updates or label not in specs:
            continue
        try:
            mapped = transform(specs[label])
        except (AttributeError, ValueError):
            # Best effort: a value the transform cannot handle is skipped
            # rather than aborting the whole product.
            continue
        if mapped is not None:
            updates[column] = mapped

    # Build rich notes from ALL unmapped specs.
    extra = [
        f"{label}: {value}"
        for label, value in specs.items()
        if label not in _MAPPED_LABELS and len(value) < 200
    ]
    if extra:
        updates["notes"] = "; ".join(extra)[:1000]
    return updates
def main():
    """Scrape every Flexoptix product page and write the enrichment SQL file.

    Fetches the Flexoptix records from the API, scrapes each product page,
    and emits one ``UPDATE transceivers`` statement per product for which an
    image or at least one mapped spec was found. The statements are wrapped
    in ``BEGIN;`` / ``COMMIT;`` and written to ``OUTPUT_SQL``. Progress is
    printed to stdout.
    """
    print("Fetching Flexoptix transceivers from API...")
    transceivers = get_flexoptix_transceivers()
    total = len(transceivers)
    print(f"Found {total} Flexoptix transceivers")

    sql = [
        "-- 011: Flexoptix product enrichment",
        f"-- Generated: {time.strftime('%Y-%m-%d %H:%M')}",
        f"-- Products: {total}",
        "", "BEGIN;", "",
    ]
    enriched = images = specs_count = 0

    for i, tx in enumerate(transceivers, start=1):
        name = tx.get("standard_name") or tx.get("part_number") or tx.get("slug")
        print(f"[{i}/{total}] {name}")

        result = scrape_product_page(tx["product_page_url"])
        if not result:
            print(" SKIP")
            continue

        sets = []
        image_url = result.get("image_url")
        if image_url:
            sets.append(f"image_url = {escape_sql(image_url)}")
            images += 1

        specs = result.get("specs")
        if specs:
            cols = map_spec_to_columns(specs)
            for col, val in cols.items():
                if col == "dom_support":
                    # Already a bare SQL boolean literal (true/false).
                    sets.append(f"{col} = {val}")
                else:
                    sets.append(f"{col} = {escape_sql(val)}")
            if cols:
                specs_count += 1
            print(f" -> {len(cols)} specs, img={'YES' if image_url else 'NO'}")

        if sets:
            # A line break inside the name would end the "--" comment and
            # turn the remainder into executable SQL, so flatten it.
            comment = str(name).replace("\r", " ").replace("\n", " ")
            sql.append(f"-- {comment}")
            # The id goes through the same escaping as every other value.
            sql.append(f"UPDATE transceivers SET {', '.join(sets)} WHERE id = {escape_sql(tx['id'])};")
            sql.append("")
            enriched += 1
        time.sleep(DELAY)

    sql.append("COMMIT;")
    sql.append(f"-- Summary: {enriched}/{total} enriched, {images} images, {specs_count} with specs")

    os.makedirs(os.path.dirname(OUTPUT_SQL), exist_ok=True)
    # Scraped values may contain non-ASCII text; pin the encoding instead of
    # relying on the platform's locale default.
    with open(OUTPUT_SQL, "w", encoding="utf-8") as f:
        f.write("\n".join(sql))

    print(f"\nDone! {enriched} enriched ({images} images, {specs_count} specs)")
    print(f"SQL: {OUTPUT_SQL}")


if __name__ == "__main__":
    main()