159 lines
5.6 KiB
Bash
159 lines
5.6 KiB
Bash
#!/bin/bash
# V2: Flexoptix enrichment with deduplication
# Run ON Erik
#
# Stage 1 of the script: set up paths, dump the FLEXOPTIX products that have a
# product page URL into /tmp/fo-products.txt (one "id|url|part_number|standard_name"
# row per line), and start the generated SQL file with a header.

# NOTE(review): fixed, predictable file names in /tmp. Later stages of this
# script read these exact paths, so they are left unchanged here; consider
# mktemp if the whole script is reworked.
LOG="/tmp/enrich-v2.log"
SQL="/tmp/011-flexoptix-enrichment-v2.sql"

# DB is "VAR=value command args..." in a single string. It is run below as
# `env $DB ...`: env accepts the leading assignment and then executes psql, so
# plain word splitting is sufficient and no eval is needed.
# NOTE(review): the password is hard-coded; prefer ~/.pgpass or an
# externally supplied PGPASSWORD.
DB="PGPASSWORD=tip_prod_2026 psql -h localhost -p 5433 -U tip -d transceiver_db"

echo "$(date): Starting V2 enrichment" > "$LOG"

# Get products
# The previous `eval $DB ... -F'|' -c "SELECT ..."` re-parsed the joined words
# as shell input: `-F|` turned into a pipe and the query was split into
# separate shell words, so the SELECT never reached psql intact.
# shellcheck disable=SC2086  # word splitting of $DB is intentional
if ! env $DB -t -A -F'|' -c \
  "SELECT t.id, t.product_page_url, t.part_number, t.standard_name FROM transceivers t JOIN vendors v ON t.vendor_id = v.id WHERE v.name = 'FLEXOPTIX' AND t.product_page_url IS NOT NULL ORDER BY t.part_number" \
  > /tmp/fo-products.txt 2>> "$LOG"; then
  # Without a product list there is nothing to enrich; stop instead of
  # generating and applying an empty SQL file.
  echo "$(date): product query failed, aborting" >> "$LOG"
  exit 1
fi

# One product per line; tr strips the padding some wc implementations add.
TOTAL=$(wc -l < /tmp/fo-products.txt | tr -d ' ')
echo "Found $TOTAL products" >> "$LOG"

# Header
# Unquoted delimiter on purpose: $(date) and $TOTAL are expanded.
cat > "$SQL" << EOF
-- Flexoptix enrichment V2 (deduplicated)
-- Generated: $(date '+%Y-%m-%d %H:%M')
-- Products: $TOTAL

EOF

# Stage 2: fetch every product page and turn its spec table into one UPDATE
# statement appended to "$SQL". COUNT = pages attempted, OK = UPDATEs written.
COUNT=0
OK=0

# Python extractor, stored once (it is loop-invariant). The heredoc delimiter
# is quoted so the shell never interpolates into the program text; the row id
# and display name are passed as argv instead of being pasted into the source.
# `read -d ''` returns non-zero when it reaches end of input, hence `|| true`.
IFS= read -r -d '' EXTRACT_PY <<'PY' || true
import html
import re
import sys

row_id, name = sys.argv[1], sys.argv[2]
page = sys.stdin.read()

# Product image: first URL matching either known path layout.
img = None
for pattern in (
    r'https://[^"\s]+/cache/[^"\s]+_A_[^"\s]+\.jpg',
    r'https://[^"\s]+/media/catalog/product/[^"\s]+_A_[^"\s]+\.jpg',
):
    found = re.search(pattern, page)
    if found:
        img = found.group(0)
        break


def cell_text(fragment):
    """Strip tags first, then decode entities, then trim whitespace."""
    return html.unescape(re.sub(r'<[^>]+>', '', fragment)).strip()


# Spec table: <th>label</th><td>value</td>; a later duplicate label overwrites.
specs = {}
for row in re.finditer(r'<th[^>]*>(.*?)</th>\s*<td[^>]*>(.*?)</td>', page, re.S | re.I):
    label = cell_text(row.group(1)).upper()
    value = cell_text(row.group(2))
    if label and value and value.lower() not in ('n/a', '-'):
        specs[label] = value

if not specs and not img:
    sys.exit(0)

# A real decimal number. The old character class [\d.]+ also matched a lone
# "." (e.g. in "approx. 3.5 W") and produced an invalid numeric value.
NUMBER = r'(?:\d+(?:\.\d+)?|\.\d+)'


def unsigned_number(text):
    found = re.search(NUMBER, text)
    return found.group(0) if found else None


def signed_number(text):
    found = re.search('-?' + NUMBER, text)
    return found.group(0) if found else None


def first_int(text):
    found = re.search(r'\d+', text)
    return found.group(0) if found else None


def temp_class(text):
    # The old test "'0°C' in text" also matched "-40°C", so industrial
    # ranges were stored as COM. Explicit wording wins; COM stays the default.
    if 'ommercial' in text:
        return 'COM'
    if 'ndustrial' in text or '-40' in text:
        return 'IND'
    if 'xtended' in text:
        return 'EXT'
    return 'COM'


def keep(text):
    return text


def fec(text):
    return text if text.lower() not in ('no', 'none') else None


def dom(text):
    return 'true' if 'yes' in text.lower() else 'false'


# (spec label, target column, transform); first match wins per column.
MAPPING = [
    ('POWER CONSUMPTION', 'power_consumption_w', unsigned_number),
    ('CONNECTOR / POLISH', 'connector', keep),
    ('CONNECTOR', 'connector', keep),
    ('MODULATION', 'modulation', keep),
    ('WAVELENGTH TX (TYPICAL)', 'wavelengths', keep),
    ('WAVELENGTH', 'wavelengths', keep),
    ('DISTANCE', 'reach_label', keep),
    ('TEMPERATURE RANGE', 'temp_range', temp_class),
    ('OPERATING TEMPERATURE', 'temp_range', temp_class),
    ('LANE COUNT', 'lanes', first_int),
    ('BANDWIDTH PER LANE', 'lane_rate', keep),
    ('BANDWIDTH', 'lane_rate', keep),
    ('INBUILT FEC', 'fec_type', fec),
    ('POWERBUDGET (DB)', 'optical_budget_db', unsigned_number),
    ('TRANSMIT MIN/MAX PER LANE', 'tx_power_min_dbm', signed_number),
    ('RECEIVER MIN/MAX PER LANE', 'rx_sensitivity_dbm', signed_number),
    ('INTERFACE', 'fiber_type', keep),
    ('COMPLIANCE CODE', 'ieee_reference', keep),
    ('DIGITAL DIAGNOSTIC MONITORING (DDM)', 'dom_support', dom),
]

cols = {}
mapped_labels = set()
for label, col, transform in MAPPING:
    if label in specs and col not in cols:
        val = transform(specs[label])
        if val is not None:
            cols[col] = val
            mapped_labels.add(label)

# Unmapped specs -> notes (capped at 1000 characters).
extra = [
    '{}: {}'.format(k, v)
    for k, v in specs.items()
    if k not in mapped_labels and len(v) < 200
]
if extra:
    cols['notes'] = '; '.join(extra)[:1000]

if img:
    cols['image_url'] = img

if not cols:
    sys.exit(0)


def sql_literal(value):
    """Quote value as a PostgreSQL string constant.

    Single quotes are doubled. A value containing a backslash is emitted as
    an E'...' escape string with backslashes doubled, which reads back
    unchanged whatever standard_conforming_strings is set to.
    """
    text = str(value).replace("'", "''")
    if '\\' in text:
        return "E'" + text.replace('\\', '\\\\') + "'"
    return "'" + text + "'"


sets = []
for col, val in cols.items():
    if col == 'dom_support':
        # Boolean keyword, written unquoted.
        sets.append('{} = {}'.format(col, val))
    else:
        sets.append('{} = {}'.format(col, sql_literal(val)))

# The name goes into a "--" comment, so it must stay on a single line.
print('-- ' + name.replace('\r', ' ').replace('\n', ' '))
print('UPDATE transceivers SET {} WHERE id = {};'.format(', '.join(sets), sql_literal(row_id)))
PY

# extract_update_sql ID NAME
#   stdin:  HTML of one product page
#   stdout: a "-- NAME" comment line plus one UPDATE statement, or nothing
#           when the page yields neither specs nor an image.
extract_update_sql() {
  python3 -c "$EXTRACT_PY" "$1" "$2"
}

while IFS='|' read -r ID URL PN SN; do
  [ -z "$URL" ] && continue
  COUNT=$((COUNT + 1))
  # Prefer the standard name for log/comment lines, else the part number.
  NAME="${SN:-$PN}"

  HTML=$(curl -s -L --max-time 15 -H "User-Agent: Mozilla/5.0 TIP-Bot/1.0" "$URL" 2>/dev/null)
  # Responses under 500 characters are skipped without parsing.
  if [ "${#HTML}" -lt 500 ]; then
    echo "[$COUNT] $NAME SKIP" >> "$LOG"
    continue
  fi

  # Use python3 to extract specs AND generate clean SQL (no duplicates).
  # Extractor errors go to the log instead of being discarded.
  UPDATE=$(extract_update_sql "$ID" "$NAME" <<< "$HTML" 2>> "$LOG")

  if [ -n "$UPDATE" ]; then
    echo "$UPDATE" >> "$SQL"
    echo "" >> "$SQL"
    OK=$((OK + 1))
    echo "[$COUNT] $NAME -> OK" >> "$LOG"
  else
    echo "[$COUNT] $NAME -> no data" >> "$LOG"
  fi

  # Pause between requests.
  sleep 0.3
done < /tmp/fo-products.txt

# Stage 3: close the generated SQL file, apply it, and log per-column counts.
echo "-- Summary: $OK/$TOTAL enriched" >> "$SQL"

{
  echo ""
  echo "Enrichment SQL generated: $OK/$TOTAL"
  echo "Applying SQL..."
} >> "$LOG"

# $DB is "PGPASSWORD=... psql ..." in one string. `env` accepts the leading
# assignment and then executes psql, so plain word splitting is enough.
# The previous `eval $DB ...` re-parsed every argument as shell input.
# shellcheck disable=SC2086  # word splitting of $DB is intentional
env $DB -f "$SQL" >> "$LOG" 2>&1 \
  || echo "WARNING: psql exited with status $? while applying $SQL" >> "$LOG"

{
  echo ""
  echo "=== FINAL COUNTS ==="
} >> "$LOG"

# report_count LABEL PREDICATE
#   Appends "LABEL: <number of transceivers rows matching PREDICATE>" to $LOG.
#   Under eval the `||` and `count(*)` in these queries were parsed as shell
#   operators, so the counts were never produced.
report_count() {
  # shellcheck disable=SC2086  # word splitting of $DB is intentional
  env $DB -t -A -c "SELECT '$1: ' || count(*) FROM transceivers WHERE $2" >> "$LOG"
}

report_count images "image_url IS NOT NULL"
report_count connector "connector IS NOT NULL"
report_count notes "notes IS NOT NULL AND notes != ''"
report_count modulation "modulation IS NOT NULL"
report_count power_w "power_consumption_w IS NOT NULL"

echo "$(date): V2 DONE" >> "$LOG"