#!/bin/bash
# V2: Flexoptix enrichment with deduplication
# Run ON Erik
#
# Flow:
#   1. Query the database for FLEXOPTIX transceivers that have a product URL.
#   2. Fetch each product page and let an embedded Python program turn the
#      spec table / product image into one UPDATE statement per product.
#   3. Collect the statements in $SQL, apply the file with psql, and log a
#      few column fill counts.
#
# Environment:
#   PGPASSWORD  database password (optional; see db() for the fallback).
#
# Outputs:
#   $LOG       progress log (truncated at start)
#   $SQL       generated enrichment SQL
#   $PRODUCTS  pipe-separated product list (id|url|part_number|standard_name)

# No `-e`: a failed fetch or parse must skip one product, not abort the run.
set -uo pipefail

LOG="/tmp/enrich-v2.log"
SQL="/tmp/011-flexoptix-enrichment-v2.sql"
PRODUCTS="/tmp/fo-products.txt"

#######################################
# Run psql against the transceiver database.
# A function (instead of `eval $DB ...`) hands every argument to psql exactly
# as written; under eval the `|` field separator and the `||` in the count
# queries were re-parsed by the shell as pipe / OR operators.
# NOTE(review): the literal fallback password is kept only so existing
# invocations keep working; prefer exporting PGPASSWORD or using ~/.pgpass.
# Arguments: any psql options
#######################################
db() {
  PGPASSWORD="${PGPASSWORD:-tip_prod_2026}" \
    psql -h localhost -p 5433 -U tip -d transceiver_db "$@"
}

# Python extractor. Kept in a quoted heredoc so the shell never expands
# anything inside it; the record id and display name arrive as argv and the
# page HTML on stdin. `read -d ''` returns non-zero at EOF, hence `|| true`.
IFS= read -r -d '' PY_EXTRACT <<'PY' || true
import re
import sys

tid, name = sys.argv[1], sys.argv[2]
# Decode leniently so a page with stray non-UTF-8 bytes is still parsed.
html = sys.stdin.buffer.read().decode('utf-8', 'replace')

# Product image: try the /cache/ URL first, then /media/catalog/product/.
img = None
for pattern in (
    r'https://[^"\s]+/cache/[^"\s]+_A_[^"\s]+\.jpg',
    r'https://[^"\s]+/media/catalog/product/[^"\s]+_A_[^"\s]+\.jpg',
):
    found = re.search(pattern, html)
    if found:
        img = found.group(0)
        break

# Spec table rows: label cell followed by value cell.
# NOTE(review): the tag names (<th>/<td>) were reconstructed from a pattern
# whose markup had been stripped -- confirm against a live product page.
ROW_RE = r'<th[^>]*>(.*?)</th>\s*<td[^>]*>(.*?)</td>'
specs = {}
for row in re.finditer(ROW_RE, html, re.S | re.I):
    label = re.sub(r'<[^>]+>', '', row.group(1)).strip().upper()
    value = re.sub(r'<[^>]+>', '', row.group(2)).strip()
    if label and value and value.lower() not in ('n/a', '-', ''):
        specs[label] = value

if not specs and not img:
    sys.exit(0)


def first_match(pattern):
    """Return a transform yielding the first regex match in v, or None."""
    def transform(v):
        hit = re.search(pattern, v)
        return hit.group() if hit else None
    return transform


def keep(v):
    """Use the spec value unchanged."""
    return v


def temp_class(v):
    """Map a temperature description to COM / IND / EXT.

    Industrial and extended are tested before falling back to commercial:
    a plain '0°C' substring test also matches '-40°C', which would label
    industrial parts as commercial.
    """
    if 'ndustrial' in v or '-40' in v:
        return 'IND'
    if 'xtended' in v:
        return 'EXT'
    return 'COM'


unsigned_number = first_match(r'[\d.]+')
signed_number = first_match(r'-?[\d.]+')

# (spec label, target column, transform); first match wins per column.
MAPPING = [
    ('POWER CONSUMPTION', 'power_consumption_w', unsigned_number),
    ('CONNECTOR / POLISH', 'connector', keep),
    ('CONNECTOR', 'connector', keep),
    ('MODULATION', 'modulation', keep),
    ('WAVELENGTH TX (TYPICAL)', 'wavelengths', keep),
    ('WAVELENGTH', 'wavelengths', keep),
    ('DISTANCE', 'reach_label', keep),
    ('TEMPERATURE RANGE', 'temp_range', temp_class),
    ('OPERATING TEMPERATURE', 'temp_range', temp_class),
    ('LANE COUNT', 'lanes', first_match(r'\d+')),
    ('BANDWIDTH PER LANE', 'lane_rate', keep),
    ('BANDWIDTH', 'lane_rate', keep),
    ('INBUILT FEC', 'fec_type',
     lambda v: v if v.lower() not in ('no', 'none') else None),
    ('POWERBUDGET (DB)', 'optical_budget_db', unsigned_number),
    ('TRANSMIT MIN/MAX PER LANE', 'tx_power_min_dbm', signed_number),
    ('RECEIVER MIN/MAX PER LANE', 'rx_sensitivity_dbm', signed_number),
    ('INTERFACE', 'fiber_type', keep),
    ('COMPLIANCE CODE', 'ieee_reference', keep),
    ('DIGITAL DIAGNOSTIC MONITORING (DDM)', 'dom_support',
     lambda v: 'true' if 'yes' in v.lower() else 'false'),
]

cols = {}
mapped_labels = set()
for label, col, transform in MAPPING:
    if label not in specs or col in cols:
        continue
    try:
        val = transform(specs[label])
    except Exception:
        # Best effort: one unparsable value must not drop the whole product.
        continue
    if val is not None:
        cols[col] = val
        mapped_labels.add(label)

# Specs that did not fill a column are preserved in notes.
extra = ['{}: {}'.format(k, v) for k, v in specs.items()
         if k not in mapped_labels and len(v) < 200]
if extra:
    cols['notes'] = '; '.join(extra)[:1000]
if img:
    cols['image_url'] = img
if not cols:
    sys.exit(0)


def esc(v):
    """Escape a value for a single-quoted SQL literal (quotes, backslashes)."""
    return v.replace("'", "''").replace('\\', '\\\\')


sets = []
for col, val in cols.items():
    if col == 'dom_support':
        # Boolean column: emitted as a bare true/false keyword.
        sets.append('{} = {}'.format(col, val))
    else:
        sets.append("{} = '{}'".format(col, esc(str(val))))

# Keep the name on one line so it cannot escape the SQL comment.
print('-- ' + name.replace('\r', ' ').replace('\n', ' '))
print("UPDATE transceivers SET {} WHERE id = '{}';".format(
    ', '.join(sets), esc(tid)))
PY

echo "$(date): Starting V2 enrichment" > "$LOG"

# Get products
if ! db -t -A -F'|' -c \
  "SELECT t.id, t.product_page_url, t.part_number, t.standard_name
   FROM transceivers t JOIN vendors v ON t.vendor_id = v.id
   WHERE v.name = 'FLEXOPTIX' AND t.product_page_url IS NOT NULL
   ORDER BY t.part_number" \
  > "$PRODUCTS" 2>> "$LOG"; then
  echo "Product query failed, aborting" >> "$LOG"
  exit 1
fi

TOTAL=$(wc -l < "$PRODUCTS" | tr -d ' ')
echo "Found $TOTAL products" >> "$LOG"

# Header
cat > "$SQL" << EOF
-- Flexoptix enrichment V2 (deduplicated)
-- Generated: $(date '+%Y-%m-%d %H:%M')
-- Products: $TOTAL

EOF

COUNT=0
OK=0
while IFS='|' read -r ID URL PN SN; do
  [ -z "$URL" ] && continue
  COUNT=$((COUNT + 1))
  NAME="${SN:-$PN}"

  HTML=$(curl -s -L --max-time 15 \
    -H "User-Agent: Mozilla/5.0 TIP-Bot/1.0" "$URL" 2>/dev/null)
  # Very short responses are treated as failed fetches.
  if [ "${#HTML}" -lt 500 ]; then
    echo "[$COUNT] $NAME SKIP" >> "$LOG"
    continue
  fi

  # Extract specs and generate clean SQL (no duplicate columns). Python
  # errors go to the log instead of being discarded.
  UPDATE=$(python3 -c "$PY_EXTRACT" "$ID" "$NAME" <<< "$HTML" 2>> "$LOG")

  if [ -n "$UPDATE" ]; then
    echo "$UPDATE" >> "$SQL"
    echo "" >> "$SQL"
    OK=$((OK + 1))
    echo "[$COUNT] $NAME -> OK" >> "$LOG"
  else
    echo "[$COUNT] $NAME -> no data" >> "$LOG"
  fi

  # Pause between page fetches.
  sleep 0.3
done < "$PRODUCTS"

echo "-- Summary: $OK/$TOTAL enriched" >> "$SQL"

echo "" >> "$LOG"
echo "Enrichment SQL generated: $OK/$TOTAL" >> "$LOG"
echo "Applying SQL..." >> "$LOG"
db -f "$SQL" >> "$LOG" 2>&1

echo "" >> "$LOG"
echo "=== FINAL COUNTS ===" >> "$LOG"
db -t -A -c "SELECT 'images: ' || count(*) FROM transceivers WHERE image_url IS NOT NULL" >> "$LOG"
db -t -A -c "SELECT 'connector: ' || count(*) FROM transceivers WHERE connector IS NOT NULL" >> "$LOG"
db -t -A -c "SELECT 'notes: ' || count(*) FROM transceivers WHERE notes IS NOT NULL AND notes != ''" >> "$LOG"
db -t -A -c "SELECT 'modulation: ' || count(*) FROM transceivers WHERE modulation IS NOT NULL" >> "$LOG"
db -t -A -c "SELECT 'power_w: ' || count(*) FROM transceivers WHERE power_consumption_w IS NOT NULL" >> "$LOG"
echo "$(date): V2 DONE" >> "$LOG"