#!/usr/bin/env python3
"""
Scrape Flexoptix product pages to enrich transceiver data.

Extracts: product image, all spec fields from the spec table.
Outputs SQL UPDATE statements to apply to the DB.

Uses curl subprocess to avoid Python 3.14 SSL issues.
"""

import subprocess
import time
import json
import sys
import re
import os

try:
    from bs4 import BeautifulSoup
except ImportError:
    # Deferred failure: main() and scrape_product_page() report the missing
    # dependency, while the pure helpers (escape_sql, map_spec_to_columns)
    # stay importable without it.
    BeautifulSoup = None

API = "https://transceiver-db.context-x.org"
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
OUTPUT_SQL = os.path.join(os.path.dirname(SCRIPT_DIR), "sql", "011-flexoptix-enrichment.sql")
DELAY = 0.2
CACHE_HASH = "bd7a52a6ab629d9c2973634d6ae35193"

# Pagination / retry settings for the transceiver API.
PAGE_SIZE = 25
API_ATTEMPTS = 3
# Stop paginating after this many pages in a row could not be fetched;
# without a cap an unreachable API would make the pagination loop endless.
MAX_FAILED_PAGES = 3

# A plain decimal number ("3", "3.5", ".5"). A lone "." must never match,
# otherwise values such as "max. 3.5 W" would yield "." for a numeric column.
_UNSIGNED_NUMBER = re.compile(r"[0-9]+(?:\.[0-9]+)?|\.[0-9]+")
_SIGNED_NUMBER = re.compile(r"-?(?:[0-9]+(?:\.[0-9]+)?|\.[0-9]+)")
_INTEGER = re.compile(r"[0-9]+")


def curl_get(url, max_time=10):
    """Fetch *url* with curl and return the response body as text.

    Args:
        url: Absolute http:// or https:// URL.
        max_time: Value passed to curl's ``--max-time`` (seconds).

    Returns:
        The body curl wrote to stdout. An empty string is returned when the
        URL is not HTTP(S); curl failures are not raised either, so callers
        must treat an empty/short body as a failed fetch.
    """
    # URLs partly come from remote API data. Anything that is not plain
    # HTTP(S) is refused so a value can neither be parsed by curl as an
    # option ("-o ...") nor address local files ("file://...").
    target = url.strip() if isinstance(url, str) else ""
    if not target.lower().startswith(("http://", "https://")):
        print(f" Refusing non-HTTP(S) URL: {url!r}", file=sys.stderr)
        return ""

    completed = subprocess.run(
        [
            "curl", "-s", "-L",
            "--max-time", str(max_time),
            "-H", "User-Agent: Mozilla/5.0 TIP-Bot/1.0",
            target,
        ],
        capture_output=True,
        text=True,
        # Decode explicitly: the locale default may not be UTF-8, and a
        # single undecodable byte must not abort the whole run.
        encoding="utf-8",
        errors="replace",
    )
    return completed.stdout


def _fetch_api_page(offset):
    """Return the decoded JSON object for one API page, or None on failure.

    Retries up to API_ATTEMPTS times with a linearly growing pause.
    """
    for attempt in range(API_ATTEMPTS):
        raw = curl_get(f"{API}/api/transceivers?limit={PAGE_SIZE}&offset={offset}", max_time=60)
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            data = None
        # Only a JSON object can carry the "data" list read by the caller.
        if isinstance(data, dict):
            return data
        wait = 5 * (attempt + 1)
        print(
            f" JSON error at offset {offset}, attempt {attempt+1}/{API_ATTEMPTS}, wait {wait}s",
            file=sys.stderr,
        )
        time.sleep(wait)
    return None


def get_flexoptix_transceivers():
    """Page through the transceiver API and collect the Flexoptix entries.

    Returns:
        A list of the API's transceiver dicts whose ``vendor_name`` is
        "FLEXOPTIX" and which have a non-empty ``product_page_url``. The
        list may be partial when pages could not be fetched.
    """
    all_tx = []
    offset = 0
    failed_pages = 0
    while True:
        data = _fetch_api_page(offset)
        if data is None:
            print(f" SKIP offset {offset} after {API_ATTEMPTS} attempts", file=sys.stderr)
            failed_pages += 1
            if failed_pages >= MAX_FAILED_PAGES:
                print(
                    f" ABORT: {failed_pages} consecutive pages failed, API unreachable?",
                    file=sys.stderr,
                )
                break
            offset += PAGE_SIZE
            continue
        failed_pages = 0

        items = data.get("data", [])
        if not items:
            break
        all_tx.extend(
            t for t in items
            if isinstance(t, dict)
            and t.get("vendor_name") == "FLEXOPTIX"
            and t.get("product_page_url")
        )
        # A short page is the last one; no need to pause before stopping.
        if len(items) < PAGE_SIZE:
            break
        offset += PAGE_SIZE
        time.sleep(0.5)
    return all_tx


def _find_product_image(soup):
    """Return the URL of the main product image, or None if none matches."""
    sources = [
        img.get("src", "") or img.get("data-src", "")
        for img in soup.find_all("img")
    ]
    # First choice: a .jpg whose URL contains CACHE_HASH.
    for src in sources:
        if CACHE_HASH in src and "_A_" in src and src.endswith(".jpg"):
            return src
    # Otherwise: any "_A_" image below the product media path.
    for src in sources:
        if "/media/catalog/product/" in src and "_A_" in src:
            return src
    return None


def _extract_specs(soup):
    """Collect label/value pairs from every two-or-more-cell table row.

    Labels are upper-cased; rows whose value is empty, "n/a" or "-" are
    ignored. A label that occurs again later overwrites the earlier value.
    """
    specs = {}
    for table in soup.find_all("table"):
        for row in table.find_all("tr"):
            cells = row.find_all(["th", "td"])
            if len(cells) < 2:
                continue
            label = cells[0].get_text(strip=True).upper()
            value = cells[1].get_text(strip=True)
            if label and value and value.lower() not in ("n/a", "-", ""):
                specs[label] = value
    return specs


def scrape_product_page(url):
    """Download a product page and extract its image URL and spec table.

    Args:
        url: Product page URL.

    Returns:
        ``{"specs": {LABEL: value, ...}, "image_url": str | None}``, or None
        when the page could not be fetched (body shorter than 1000
        characters) or parsing failed; failures are reported on stderr.

    Raises:
        RuntimeError: If beautifulsoup4 is not installed.
    """
    if BeautifulSoup is None:
        raise RuntimeError("beautifulsoup4 is required to scrape product pages")
    try:
        html = curl_get(url)
        if not html or len(html) < 1000:
            return None
        soup = BeautifulSoup(html, "lxml")
        return {
            "specs": _extract_specs(soup),
            "image_url": _find_product_image(soup),
        }
    except Exception as e:
        # Best-effort scraping: one broken page must not stop the whole run.
        print(f" Error: {e}", file=sys.stderr)
        return None


def escape_sql(val):
    """Render *val* as a single-quoted SQL string literal (None -> NULL).

    Single quotes and backslashes in the value are doubled.
    """
    if val is None:
        return "NULL"
    # NOTE(review): doubling backslashes assumes the target database treats
    # "\" as an escape character inside string literals - confirm against
    # the DB that will run the generated file.
    return "'" + str(val).replace("'", "''").replace("\\", "\\\\") + "'"


def _sql_comment(text):
    """Collapse all whitespace so *text* cannot break out of a "--" comment."""
    return " ".join(str(text).split())


def _first_match(pattern):
    """Build a transform returning the first match of *pattern*, or None."""
    def transform(value):
        found = pattern.search(value)
        return found.group(0) if found else None
    return transform


def _identity(value):
    """Use the scraped text unchanged."""
    return value


def _fec_type(value):
    """Map "no"/"none" (any case) to None, keep every other value."""
    return None if value.lower() in ("no", "none") else value


def _dom_support(value):
    """Return the SQL boolean literal for a yes/no DDM value."""
    return "true" if "yes" in value.lower() else "false"


# Spec label -> (column, transform). Insertion order is the priority order:
# when several labels feed the same column, the earliest entry that yields
# a value wins and later ones act only as fallbacks.
_SPEC_MAP = {
    "POWER CONSUMPTION": ("power_consumption_w", _first_match(_UNSIGNED_NUMBER)),
    "CONNECTOR / POLISH": ("connector", _identity),
    "CONNECTOR": ("connector", _identity),
    "MODULATION": ("modulation", _identity),
    "WAVELENGTH TX (TYPICAL)": ("wavelengths", _identity),
    "WAVELENGTH": ("wavelengths", _identity),
    "DISTANCE": ("reach_label", _identity),
    "TEMPERATURE RANGE": ("temp_range", _identity),
    "OPERATING TEMPERATURE": ("temp_range", _identity),
    "LANE COUNT": ("lanes", _first_match(_INTEGER)),
    "BANDWIDTH PER LANE": ("lane_rate", _identity),
    "BANDWIDTH": ("lane_rate", _identity),  # fallback
    "INBUILT FEC": ("fec_type", _fec_type),
    "POWERBUDGET (DB)": ("optical_budget_db", _first_match(_UNSIGNED_NUMBER)),
    "TRANSMIT MIN/MAX PER LANE": ("tx_power_min_dbm", _first_match(_SIGNED_NUMBER)),
    "RECEIVER MIN/MAX PER LANE": ("rx_sensitivity_dbm", _first_match(_SIGNED_NUMBER)),
    "INTERFACE": ("fiber_type", _identity),
    "COMPLIANCE CODE": ("ieee_reference", _identity),
    "DIGITAL DIAGNOSTIC MONITORING (DDM)": ("dom_support", _dom_support),
}


def map_spec_to_columns(specs):
    """Translate scraped spec labels into transceiver column updates.

    Args:
        specs: Mapping of upper-cased spec label to its text value.

    Returns:
        Dict of column name to string value. Labels without a column mapping
        are concatenated as "LABEL: value" pairs into a "notes" entry
        (values of 200+ characters are left out; the result is cut to 1000
        characters). Columns whose transform yields nothing are omitted.
    """
    updates = {}

    # Walk the map, not the page, so that priority does not depend on the
    # order in which the rows happen to appear on the product page.
    for label, (column, transform) in _SPEC_MAP.items():
        raw = specs.get(label)
        if raw is None or column in updates:
            continue
        mapped = transform(raw)
        if mapped is not None:
            updates[column] = mapped

    # Build rich notes from ALL unmapped specs
    extra = [
        f"{label}: {value}"
        for label, value in specs.items()
        if label not in _SPEC_MAP and len(value) < 200
    ]
    if extra:
        updates["notes"] = "; ".join(extra)[:1000]

    return updates


def main():
    """Scrape every Flexoptix product page and write the enrichment SQL file.

    Progress goes to stdout, problems to stderr. The statements are wrapped
    in a single BEGIN/COMMIT and written to OUTPUT_SQL. Nothing is written
    when no product was found, so a failed API run cannot overwrite an
    earlier result with an empty file.
    """
    if BeautifulSoup is None:
        sys.exit("Missing dependency: install beautifulsoup4 and lxml first")

    print("Fetching Flexoptix transceivers from API...")
    transceivers = get_flexoptix_transceivers()
    total = len(transceivers)
    print(f"Found {total} Flexoptix transceivers")
    if not transceivers:
        print("Nothing to enrich, no SQL written", file=sys.stderr)
        return

    sql = [
        "-- 011: Flexoptix product enrichment",
        f"-- Generated: {time.strftime('%Y-%m-%d %H:%M')}",
        f"-- Products: {total}",
        "",
        "BEGIN;",
        "",
    ]

    enriched = images = specs_count = 0
    for index, tx in enumerate(transceivers, start=1):
        name = tx.get("standard_name") or tx.get("part_number") or tx.get("slug")
        print(f"[{index}/{total}] {name}")

        result = scrape_product_page(tx["product_page_url"])
        if not result:
            print(" SKIP")
            continue

        sets = []
        image_url = result.get("image_url")
        if image_url:
            sets.append(f"image_url = {escape_sql(image_url)}")
            images += 1

        # Always bound, so the progress line below never sees a stale or
        # undefined value when a page has no spec table.
        cols = map_spec_to_columns(result["specs"]) if result.get("specs") else {}
        for col, val in cols.items():
            if col == "dom_support":
                # Already a bare SQL boolean literal ("true"/"false").
                sets.append(f"{col} = {val}")
            else:
                sets.append(f"{col} = {escape_sql(val)}")
        if cols:
            specs_count += 1
        print(f" -> {len(cols)} specs, img={'YES' if image_url else 'NO'}")

        if sets:
            # Name and id come from the API: keep the comment on one line
            # and quote the id like every other value.
            sql.append(f"-- {_sql_comment(name)}")
            sql.append(
                f"UPDATE transceivers SET {', '.join(sets)} WHERE id = {escape_sql(tx['id'])};"
            )
            sql.append("")
            enriched += 1

        time.sleep(DELAY)

    sql.append("COMMIT;")
    sql.append(f"-- Summary: {enriched}/{total} enriched, {images} images, {specs_count} with specs")

    os.makedirs(os.path.dirname(OUTPUT_SQL), exist_ok=True)
    # Spec values contain non-ASCII text; never rely on the locale encoding.
    with open(OUTPUT_SQL, "w", encoding="utf-8") as f:
        f.write("\n".join(sql) + "\n")

    print(f"\nDone! {enriched} enriched ({images} images, {specs_count} specs)")
    print(f"SQL: {OUTPUT_SQL}")


if __name__ == "__main__":
    main()