#!/bin/bash
# V2: Flexoptix enrichment with deduplication
# Run ON Erik
#
# Setup stage: resets the run log, exports every FLEXOPTIX transceiver that has
# a product page URL to /tmp/fo-products.txt (one
# "id|product_page_url|part_number|standard_name" line per product), and starts
# the generated SQL file with a header.
LOG="/tmp/enrich-v2.log"
SQL="/tmp/011-flexoptix-enrichment-v2.sql"
# psql invocation kept as one string: environment assignment, command, options.
# NOTE(review): the database password is hardcoded; consider ~/.pgpass or a
# caller-supplied environment variable instead.
DB="PGPASSWORD=tip_prod_2026 psql -h localhost -p 5433 -U tip -d transceiver_db"
# Split $DB into words once so it can be executed through env(1). Running it
# through eval re-parses the joined arguments: the quoted '|' separator turns
# into a shell pipe and the SQL text is split into separate words.
read -r -a DB_CMD <<< "$DB"
echo "$(date): Starting V2 enrichment" > "$LOG"

# Get products
PRODUCT_QUERY="SELECT t.id, t.product_page_url, t.part_number, t.standard_name FROM transceivers t JOIN vendors v ON t.vendor_id = v.id WHERE v.name = 'FLEXOPTIX' AND t.product_page_url IS NOT NULL ORDER BY t.part_number"
if ! env "${DB_CMD[@]}" -t -A -F'|' -c "$PRODUCT_QUERY" \
  > /tmp/fo-products.txt 2>> "$LOG"; then
  # Without a product list there is nothing to enrich; stop instead of
  # generating and applying an empty SQL file.
  echo "$(date): product query failed, aborting" >> "$LOG"
  exit 1
fi
TOTAL=$(wc -l < /tmp/fo-products.txt | tr -d ' ')
echo "Found $TOTAL products" >> "$LOG"

# Header
cat > "$SQL" << EOF
-- Flexoptix enrichment V2 (deduplicated)
-- Generated: $(date '+%Y-%m-%d %H:%M')
-- Products: $TOTAL
EOF

# Loop counters: products attempted / products that produced an UPDATE.
COUNT=0
OK=0
# Python extractor used for every product page.
#   argv[1]  transceiver id (used in the WHERE clause)
#   argv[2]  display name   (emitted as a SQL comment above the UPDATE)
#   stdin    HTML of the product page
#   stdout   "-- <name>" followed by one UPDATE statement, or nothing when no
#            usable data was found
# The id and name are passed as arguments rather than pasted into the Python
# source, so quotes or backslashes in a product name cannot break the program.
IFS= read -r -d '' EXTRACT_PY <<'PYEOF' || true
import re
import sys

tid, name = sys.argv[1], sys.argv[2]
html = sys.stdin.read()

# Product image: first URL matching the cache path, else the catalog path.
IMAGE_PATTERNS = (
    r'https://[^"\s]+/cache/[^"\s]+_A_[^"\s]+\.jpg',
    r'https://[^"\s]+/media/catalog/product/[^"\s]+_A_[^"\s]+\.jpg',
)
img = None
for pattern in IMAGE_PATTERNS:
    found = re.search(pattern, html)
    if found:
        img = found.group(0)
        break

# Spec table: a label cell immediately followed by a value cell.
# NOTE(review): assumes the label cell is <th> or <td> and the value cell is
# <td>; confirm against the live product-page markup.
CELL_PAIR = re.compile(
    r'<t[hd]\b[^>]*>(.*?)</t[hd]>\s*<td\b[^>]*>(.*?)</td>', re.S | re.I)
TAG = re.compile(r'<[^>]+>')
specs = {}
for pair in CELL_PAIR.finditer(html):
    label = TAG.sub('', pair.group(1)).strip().upper()
    value = TAG.sub('', pair.group(2)).strip()
    if label and value and value.lower() not in ('n/a', '-', ''):
        specs[label] = value

if not specs and not img:
    sys.exit(0)


def first_match(pattern, text):
    """Return the first substring of text matching pattern, or None."""
    found = re.search(pattern, text)
    return found.group() if found else None


def unsigned_number(text):
    return first_match(r'[\d.]+', text)


def signed_number(text):
    return first_match(r'-?[\d.]+', text)


def integer(text):
    return first_match(r'\d+', text)


def keep(text):
    return text


def temp_class(text):
    """Classify a temperature description as IND, EXT or COM (the default).

    Industrial is tested first: a commercial test on '0°C' would also match
    the tail of '-40°C' and misclassify industrial parts.
    """
    if 'ndustrial' in text or '-40' in text:
        return 'IND'
    if 'xtended' in text:
        return 'EXT'
    return 'COM'


# (spec label, target column, converter); the first entry that yields a value
# for a column wins.
MAPPING = [
    ('POWER CONSUMPTION', 'power_consumption_w', unsigned_number),
    ('CONNECTOR / POLISH', 'connector', keep),
    ('CONNECTOR', 'connector', keep),
    ('MODULATION', 'modulation', keep),
    ('WAVELENGTH TX (TYPICAL)', 'wavelengths', keep),
    ('WAVELENGTH', 'wavelengths', keep),
    ('DISTANCE', 'reach_label', keep),
    ('TEMPERATURE RANGE', 'temp_range', temp_class),
    ('OPERATING TEMPERATURE', 'temp_range', temp_class),
    ('LANE COUNT', 'lanes', integer),
    ('BANDWIDTH PER LANE', 'lane_rate', keep),
    ('BANDWIDTH', 'lane_rate', keep),
    ('INBUILT FEC', 'fec_type',
     lambda v: v if v.lower() not in ('no', 'none') else None),
    ('POWERBUDGET (DB)', 'optical_budget_db', unsigned_number),
    ('TRANSMIT MIN/MAX PER LANE', 'tx_power_min_dbm', signed_number),
    ('RECEIVER MIN/MAX PER LANE', 'rx_sensitivity_dbm', signed_number),
    ('INTERFACE', 'fiber_type', keep),
    ('COMPLIANCE CODE', 'ieee_reference', keep),
    ('DIGITAL DIAGNOSTIC MONITORING (DDM)', 'dom_support',
     lambda v: 'true' if 'yes' in v.lower() else 'false'),
]

cols = {}
mapped_labels = set()  # labels whose value was stored in a column
for label, col, transform in MAPPING:
    if label not in specs or col in cols:
        continue
    try:
        val = transform(specs[label])
    except Exception as exc:
        # Best effort: report the bad value and leave the column untouched.
        print('warn: cannot convert %s=%r: %s' % (label, specs[label], exc),
              file=sys.stderr)
        continue
    if val is not None:
        cols[col] = val
        mapped_labels.add(label)

# Specs that did not land in a column are collected into notes.
extra = ['%s: %s' % (k, v) for k, v in specs.items()
         if k not in mapped_labels and len(v) < 200]
if extra:
    cols['notes'] = '; '.join(extra)[:1000]
if img:
    cols['image_url'] = img
if not cols:
    sys.exit(0)


def sql_literal(value):
    """Quote value as a PostgreSQL string constant.

    Single quotes are doubled. A value containing a backslash is written as
    an escape string (E'...') with doubled backslashes, which reads back as
    the original text whatever standard_conforming_strings is set to.
    """
    text = str(value).replace("'", "''")
    if '\\' in text:
        return "E'" + text.replace('\\', '\\\\') + "'"
    return "'" + text + "'"


sets = []
for col, val in cols.items():
    if col == 'dom_support':
        sets.append('%s = %s' % (col, val))  # boolean keyword, not a string
    else:
        sets.append('%s = %s' % (col, sql_literal(val)))

print('-- ' + name)
print('UPDATE transceivers SET ' + ', '.join(sets)
      + ' WHERE id = ' + sql_literal(tid) + ';')
PYEOF

# Fetch each product page listed in /tmp/fo-products.txt, run the extractor on
# it and append the resulting UPDATE (if any) to the generated SQL file.
while IFS='|' read -r ID URL PN SN; do
  [[ -z "$URL" ]] && continue
  COUNT=$((COUNT + 1))
  NAME="${SN:-$PN}"
  HTML=$(curl -s -L --max-time 15 -H "User-Agent: Mozilla/5.0 TIP-Bot/1.0" "$URL" 2>/dev/null)
  # Responses shorter than 500 characters are skipped.
  if (( ${#HTML} < 500 )); then
    echo "[$COUNT] $NAME SKIP" >> "$LOG"
    continue
  fi
  # Use python3 to extract specs AND generate clean SQL (no duplicates).
  # Extractor diagnostics go to the log instead of being discarded.
  UPDATE=$(python3 -c "$EXTRACT_PY" "$ID" "$NAME" <<< "$HTML" 2>> "$LOG")
  if [[ -n "$UPDATE" ]]; then
    printf '%s\n\n' "$UPDATE" >> "$SQL"
    OK=$((OK + 1))
    echo "[$COUNT] $NAME -> OK" >> "$LOG"
  else
    echo "[$COUNT] $NAME -> no data" >> "$LOG"
  fi
  # Short pause between requests.
  sleep 0.3
done < /tmp/fo-products.txt
# Final stage: close the generated SQL file with a summary comment, apply it
# with psql, then log how many rows have each enriched column populated.
echo "-- Summary: $OK/$TOTAL enriched" >> "$SQL"
echo "" >> "$LOG"
echo "Enrichment SQL generated: $OK/$TOTAL" >> "$LOG"
echo "Applying SQL..." >> "$LOG"
# $DB holds "VAR=value psql <options>" as one string. Split it into words and
# run it through env(1) so every extra argument reaches psql verbatim. Running
# it through eval re-parses the SQL text as shell code, where the "||" and
# "count(*)" in the count queries are shell syntax, not SQL.
read -r -a db_argv <<< "$DB"
env "${db_argv[@]}" -f "$SQL" >> "$LOG" 2>&1
echo "" >> "$LOG"
echo "=== FINAL COUNTS ===" >> "$LOG"
count_queries=(
  "SELECT 'images: ' || count(*) FROM transceivers WHERE image_url IS NOT NULL"
  "SELECT 'connector: ' || count(*) FROM transceivers WHERE connector IS NOT NULL"
  "SELECT 'notes: ' || count(*) FROM transceivers WHERE notes IS NOT NULL AND notes != ''"
  "SELECT 'modulation: ' || count(*) FROM transceivers WHERE modulation IS NOT NULL"
  "SELECT 'power_w: ' || count(*) FROM transceivers WHERE power_consumption_w IS NOT NULL"
)
for count_query in "${count_queries[@]}"; do
  # stderr is logged too, so a failing count query is visible in the log.
  env "${db_argv[@]}" -t -A -c "$count_query" >> "$LOG" 2>&1
done
echo "$(date): V2 DONE" >> "$LOG"