215 lines
7.5 KiB
Python
215 lines
7.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Scrape Flexoptix product pages to enrich transceiver data.
|
|
Extracts: product image, all spec fields from the spec table.
|
|
Outputs SQL UPDATE statements to apply to the DB.
|
|
Uses curl subprocess to avoid Python 3.14 SSL issues.
|
|
"""
|
|
|
|
import subprocess
|
|
import time
|
|
import json
|
|
import sys
|
|
import re
|
|
import os
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Base URL of the transceiver database API that lists the products to enrich.
API = "https://transceiver-db.context-x.org"

# Absolute directory of this script; the SQL file is written to the "sql"
# directory that sits next to the script's parent directory entry.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
OUTPUT_SQL = os.path.join(
    os.path.dirname(SCRIPT_DIR),
    "sql",
    "011-flexoptix-enrichment.sql",
)

# Pause, in seconds, applied between consecutive product-page scrapes.
DELAY = 0.2

# Substring that the preferred product image URL must contain.
# NOTE(review): presumably the shop's media-cache directory hash -- confirm.
CACHE_HASH = "bd7a52a6ab629d9c2973634d6ae35193"
|
|
|
|
|
|
def curl_get(url, max_time=10):
    """Fetch *url* with the ``curl`` command-line tool and return the body text.

    curl is run silently (``-s``), follows redirects (``-L``), and sends a
    fixed ``User-Agent`` header.

    Args:
        url: Address to request.
        max_time: Value passed to curl's ``--max-time`` option (seconds).

    Returns:
        Whatever curl wrote to stdout, decoded as UTF-8. curl's exit status is
        not checked, so a failed transfer typically yields an empty string.
    """
    command = [
        "curl", "-s", "-L",
        "--max-time", str(max_time),
        "-H", "User-Agent: Mozilla/5.0 TIP-Bot/1.0",
        url,
    ]
    completed = subprocess.run(
        command,
        capture_output=True,
        # Decode explicitly instead of relying on the locale: a strict,
        # locale-dependent decode raises UnicodeDecodeError on a single
        # invalid byte and garbles UTF-8 pages on non-UTF-8 systems.
        encoding="utf-8",
        errors="replace",
        check=False,
    )
    return completed.stdout
|
|
|
|
|
|
def get_flexoptix_transceivers(page_size=25, max_failed_pages=5):
    """Page through the transceiver API and collect the Flexoptix entries.

    Each page is requested up to three times. A page whose response never
    parses to a JSON object is skipped. Paging stops on an empty page, on a
    page shorter than ``page_size``, or after ``max_failed_pages`` consecutive
    skipped pages. Without that last limit an unreachable API would make the
    loop advance the offset forever.

    Args:
        page_size: Number of records requested per API call.
        max_failed_pages: Consecutive unreadable pages tolerated before the
            listing is abandoned.

    Returns:
        List of the transceiver dicts whose ``vendor_name`` is ``"FLEXOPTIX"``
        and that carry a non-empty ``product_page_url``. The list may be
        partial if pages were skipped or the listing was abandoned.
    """
    attempts_per_page = 3
    all_tx = []
    offset = 0
    failed_pages = 0  # consecutive pages that could not be read

    while True:
        url = f"{API}/api/transceivers?limit={page_size}&offset={offset}"
        data = None
        for attempt in range(1, attempts_per_page + 1):
            raw = curl_get(url, max_time=60)
            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                parsed = None
            # Anything but a JSON object (e.g. an error page, null, a bare
            # list) is treated as a failed attempt rather than crashing on
            # the .get() below.
            if isinstance(parsed, dict):
                data = parsed
                break
            if attempt < attempts_per_page:
                wait = 5 * attempt
                print(f" JSON error at offset {offset}, attempt {attempt}/{attempts_per_page}, wait {wait}s", file=sys.stderr)
                time.sleep(wait)
            else:
                # No point in sleeping when no further attempt follows.
                print(f" JSON error at offset {offset}, attempt {attempt}/{attempts_per_page}", file=sys.stderr)

        if data is None:
            failed_pages += 1
            print(f" SKIP offset {offset} after {attempts_per_page} attempts", file=sys.stderr)
            if failed_pages >= max_failed_pages:
                print(f" ABORT listing: {failed_pages} consecutive pages failed", file=sys.stderr)
                break
            offset += page_size
            continue
        failed_pages = 0

        items = data.get("data", [])
        if not items:
            break

        all_tx.extend(
            t for t in items
            if t.get("vendor_name") == "FLEXOPTIX" and t.get("product_page_url")
        )

        # A short page is the last one; stop before the inter-page pause.
        if len(items) < page_size:
            break
        offset += page_size
        time.sleep(0.5)

    return all_tx
|
|
|
|
|
|
def scrape_product_page(url):
    """Download a product page and pull out its image URL and spec table rows.

    Args:
        url: Product page address, fetched through ``curl_get``.

    Returns:
        ``{"specs": {LABEL: value, ...}, "image_url": str or None}`` on
        success. ``None`` when the response is empty or shorter than 1000
        characters, or when any exception occurs (the error is printed to
        stderr).
    """
    try:
        page = curl_get(url)
        if not page or len(page) < 1000:
            return None

        soup = BeautifulSoup(page, "lxml")

        # Source of every <img>, taking "src" first and "data-src" otherwise.
        sources = [
            tag.get("src", "") or tag.get("data-src", "")
            for tag in soup.find_all("img")
        ]

        # Preferred image: cached "_A_" JPEG; otherwise any catalog "_A_" image.
        image_url = next(
            (
                s for s in sources
                if CACHE_HASH in s and "_A_" in s and s.endswith(".jpg")
            ),
            None,
        )
        if not image_url:
            image_url = next(
                (
                    s for s in sources
                    if "/media/catalog/product/" in s and "_A_" in s
                ),
                None,
            )

        # Every table row with at least two cells is read as "label, value";
        # a label seen again later overwrites the earlier value.
        specs = {}
        rows = (
            tr
            for table in soup.find_all("table")
            for tr in table.find_all("tr")
        )
        for tr in rows:
            cells = tr.find_all(["th", "td"])
            if len(cells) < 2:
                continue
            label = cells[0].get_text(strip=True).upper()
            value = cells[1].get_text(strip=True)
            if not label or not value:
                continue
            if value.lower() in ("n/a", "-", ""):
                continue
            specs[label] = value

        return {"specs": specs, "image_url": image_url}
    except Exception as e:
        print(f" Error: {e}", file=sys.stderr)
        return None
|
|
|
|
|
|
def escape_sql(val):
    """Render *val* as a single-quoted SQL string literal.

    ``None`` becomes the bare keyword ``NULL``. Any other value is converted
    with ``str()``, and every single quote and every backslash in it is
    doubled before the result is wrapped in single quotes.

    NOTE(review): doubling backslashes suits dialects that treat backslash as
    an escape character inside string literals -- confirm against the target
    database's settings.
    """
    if val is None:
        return "NULL"
    # Single pass over the text; each listed character maps to its doubled form.
    doubled = str(val).translate({ord("'"): "''", ord("\\"): "\\\\"})
    return f"'{doubled}'"
|
|
|
|
|
|
# Decimal numbers with at least one digit, so that a lone "." (as in
# "approx. 3.5 W" or "n.a.") is never captured as a value.
_UNSIGNED_NUMBER = re.compile(r"\d+(?:\.\d+)?|\.\d+")
_SIGNED_NUMBER = re.compile(r"-?(?:\d+(?:\.\d+)?|\.\d+)")
_INTEGER = re.compile(r"\d+")


def _first_match(pattern):
    """Build a transform returning the first *pattern* match in a value, or None."""
    def transform(value):
        found = pattern.search(value)
        return found.group(0) if found else None
    return transform


def _verbatim(value):
    """Pass the scraped value through unchanged."""
    return value


def _fec_type(value):
    """Return the FEC description, or None when the page says there is none."""
    return None if value.lower() in ("no", "none") else value


def _dom_support(value):
    """Return the SQL boolean keyword matching a yes/no DDM value."""
    return "true" if "yes" in value.lower() else "false"


# (page label, DB column, transform), in priority order: when several labels
# feed the same column, the one listed first wins if it yields a value.
_SPEC_MAP = (
    ("POWER CONSUMPTION", "power_consumption_w", _first_match(_UNSIGNED_NUMBER)),
    ("CONNECTOR / POLISH", "connector", _verbatim),
    ("CONNECTOR", "connector", _verbatim),
    ("MODULATION", "modulation", _verbatim),
    ("WAVELENGTH TX (TYPICAL)", "wavelengths", _verbatim),
    ("WAVELENGTH", "wavelengths", _verbatim),
    ("DISTANCE", "reach_label", _verbatim),
    ("TEMPERATURE RANGE", "temp_range", _verbatim),
    ("OPERATING TEMPERATURE", "temp_range", _verbatim),
    ("LANE COUNT", "lanes", _first_match(_INTEGER)),
    ("BANDWIDTH PER LANE", "lane_rate", _verbatim),
    ("BANDWIDTH", "lane_rate", _verbatim),  # fallback
    ("INBUILT FEC", "fec_type", _fec_type),
    ("POWERBUDGET (DB)", "optical_budget_db", _first_match(_UNSIGNED_NUMBER)),
    ("TRANSMIT MIN/MAX PER LANE", "tx_power_min_dbm", _first_match(_SIGNED_NUMBER)),
    ("RECEIVER MIN/MAX PER LANE", "rx_sensitivity_dbm", _first_match(_SIGNED_NUMBER)),
    ("INTERFACE", "fiber_type", _verbatim),
    ("COMPLIANCE CODE", "ieee_reference", _verbatim),
    ("DIGITAL DIAGNOSTIC MONITORING (DDM)", "dom_support", _dom_support),
)
_MAPPED_LABELS = frozenset(label for label, _, _ in _SPEC_MAP)


def map_spec_to_columns(specs):
    """Translate scraped spec labels into database column updates.

    Args:
        specs: Mapping of upper-cased spec label to its text value.

    Returns:
        Dict of column name to value (all values are strings). Known labels
        are mapped through ``_SPEC_MAP``; a transform that yields ``None``
        leaves the column unset so a lower-priority label can still fill it.
        Labels that are not in the map and whose value is shorter than 200
        characters are joined into a ``notes`` entry capped at 1000
        characters.
    """
    updates = {}
    # Walk the map, not the page, so priority does not depend on the order in
    # which the labels happened to appear in the HTML.
    for label, column, transform in _SPEC_MAP:
        if column in updates or label not in specs:
            continue
        mapped = transform(specs[label])
        if mapped is not None:
            updates[column] = mapped

    # Build rich notes from all unmapped specs.
    extra = [
        f"{label}: {value}"
        for label, value in specs.items()
        if label not in _MAPPED_LABELS and len(value) < 200
    ]
    if extra:
        updates["notes"] = "; ".join(extra)[:1000]

    return updates
|
|
|
|
|
|
def main():
    """Scrape every Flexoptix product page and write the enrichment SQL file.

    Fetches the product list, scrapes each product page, and emits one
    ``UPDATE transceivers`` statement per product that yielded an image or
    at least one mapped column. The statements are wrapped in
    ``BEGIN``/``COMMIT`` and written to ``OUTPUT_SQL``, creating the target
    directory if needed. Progress is printed to stdout.
    """
    print("Fetching Flexoptix transceivers from API...")
    transceivers = get_flexoptix_transceivers()
    total = len(transceivers)
    print(f"Found {total} Flexoptix transceivers")

    sql = [
        "-- 011: Flexoptix product enrichment",
        f"-- Generated: {time.strftime('%Y-%m-%d %H:%M')}",
        f"-- Products: {total}",
        "", "BEGIN;", "",
    ]

    enriched = images = specs_count = 0

    for position, tx in enumerate(transceivers, start=1):
        url = tx["product_page_url"]
        txid = tx["id"]
        name = tx.get("standard_name") or tx.get("part_number") or tx.get("slug")
        print(f"[{position}/{total}] {name}")

        result = scrape_product_page(url)
        if not result:
            print(" SKIP")
            # Keep the politeness delay for failed pages too, so a run of
            # failures does not hit the site back-to-back.
            time.sleep(DELAY)
            continue

        sets = []

        image_url = result.get("image_url")
        if image_url:
            sets.append(f"image_url = {escape_sql(image_url)}")
            images += 1

        if result.get("specs"):
            cols = map_spec_to_columns(result["specs"])
            for col, val in cols.items():
                if col == "dom_support":
                    # Value is already the bare SQL keyword true/false.
                    sets.append(f"{col} = {val}")
                else:
                    sets.append(f"{col} = {escape_sql(val)}")
            if cols:
                specs_count += 1
            print(f" -> {len(cols)} specs, img={'YES' if image_url else 'NO'}")

        if sets:
            # Collapse whitespace so a name containing a line break cannot
            # end the "--" comment and inject statements into the file.
            comment = " ".join(str(name).split())
            sql.append(f"-- {comment}")
            # The id comes from the API response; escape it like any other
            # literal instead of interpolating it raw.
            sql.append(f"UPDATE transceivers SET {', '.join(sets)} WHERE id = {escape_sql(txid)};")
            sql.append("")
            enriched += 1

        time.sleep(DELAY)

    sql.append("COMMIT;")
    sql.append(f"-- Summary: {enriched}/{total} enriched, {images} images, {specs_count} with specs")

    os.makedirs(os.path.dirname(OUTPUT_SQL), exist_ok=True)
    # Scraped values may contain non-ASCII text; write UTF-8 regardless of
    # the platform's default encoding.
    with open(OUTPUT_SQL, "w", encoding="utf-8") as f:
        f.write("\n".join(sql) + "\n")

    print(f"\nDone! {enriched} enriched ({images} images, {specs_count} specs)")
    print(f"SQL: {OUTPUT_SQL}")
|
|
|
|
|
|
# Run the enrichment only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|