transceiver-db/sql/018-cleanup-garbage-data.sql
Rene Fichtmueller 7d2562af9c fix: detect+warn garbage product names, add DB cleanup migration 018
- isGarbageName(): detects scraped-slugs, 'All Optical Transceivers', 'Compatible NNGbps...',
  generic form-factor descriptions with no real SKU
- Panel title priority: real standard_name → part_number → description → constructed from specs
- Details warning shown when details_verified = false (amber banner)
- sql/018: marks garbage entries as data_confidence='garbage' for future DELETE
2026-04-01 21:26:13 +02:00

35 lines
1.9 KiB
SQL

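-- Migration 018: Quarantine garbage scraper data (rows are only marked; the hard DELETE is deferred, see the note at the end of this file)
-- Targets products whose standard_name is a category-page name instead of a real product designation.
-- These came from scraping listing pages (GBICS and similar vendors) rather than product pages.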
-- ─────────────────────────────────────────────────────────────────────────────
-- Step 1: Preview how many rows each garbage pattern matches before anything
-- is marked. Read-only. The name patterns are the same ones the Step 2 UPDATE
-- uses, so these counts describe the rows that statement targets.
-- Pattern counts can overlap (one row may match several patterns);
-- matches_any_pattern counts each matching row once.
-- New columns are appended after "total" so the original column positions
-- and aliases are unchanged.
SELECT
    COUNT(*) FILTER (WHERE standard_name ILIKE 'scraped-%') AS scraped_slugs,
    COUNT(*) FILTER (WHERE standard_name ILIKE '%all optical transceivers%') AS all_optical_garbage,
    COUNT(*) FILTER (WHERE standard_name ILIKE 'compatible %gbps%') AS compatible_gbps_garbage,
    COUNT(*) FILTER (
        WHERE standard_name ILIKE '%gigabit ethernet%' AND LENGTH(standard_name) > 25
    ) AS gigabit_eth_garbage,
    -- Informational only: this condition is not part of the Step 2 marking rule.
    COUNT(*) FILTER (WHERE part_number IS NULL OR part_number = slug) AS no_real_part_number,
    COUNT(*) AS total,
    COUNT(*) FILTER (
        WHERE standard_name ILIKE 'osfp %g%' AND LENGTH(standard_name) < 20
    ) AS osfp_short_name_garbage,
    COUNT(*) FILTER (WHERE standard_name ILIKE 'qsfp%400g gigabit%') AS qsfp_400g_gigabit_garbage,
    COUNT(*) FILTER (
        WHERE standard_name ILIKE 'sfp%gigabit%' AND LENGTH(standard_name) > 20
    ) AS sfp_gigabit_garbage,
    COUNT(*) FILTER (
        WHERE standard_name ILIKE 'scraped-%'
            OR standard_name ILIKE '%all optical transceivers%'
            OR standard_name ILIKE 'compatible %gbps%'
            OR (standard_name ILIKE '%gigabit ethernet%' AND LENGTH(standard_name) > 25)
            OR (standard_name ILIKE 'osfp %g%' AND LENGTH(standard_name) < 20)
            OR standard_name ILIKE 'qsfp%400g gigabit%'
            OR (standard_name ILIKE 'sfp%gigabit%' AND LENGTH(standard_name) > 20)
    ) AS matches_any_pattern
FROM transceivers;
-- Step 2: Flag garbage entries by setting data_confidence = 'garbage'.
-- Non-destructive: rows are only marked here, nothing is deleted.
-- ILIKE matches case-insensitively; a NULL standard_name never matches.
UPDATE transceivers
SET
    data_confidence = 'garbage',
    updated_at = NOW()
WHERE
    -- Skip rows that already carry the marker so re-running this migration
    -- does not rewrite them or bump their updated_at. IS DISTINCT FROM
    -- (rather than <>) keeps rows whose data_confidence is NULL eligible.
    data_confidence IS DISTINCT FROM 'garbage'
    AND (
        standard_name ILIKE 'scraped-%'
        OR standard_name ILIKE '%all optical transceivers%'
        OR standard_name ILIKE 'compatible %gbps%'
        -- The length bounds below restrict each pattern to names longer
        -- (or, for OSFP, shorter) than the given character count.
        OR (standard_name ILIKE '%gigabit ethernet%' AND LENGTH(standard_name) > 25)
        OR (standard_name ILIKE 'osfp %g%' AND LENGTH(standard_name) < 20)
        OR standard_name ILIKE 'qsfp%400g gigabit%'
        OR (standard_name ILIKE 'sfp%gigabit%' AND LENGTH(standard_name) > 20)
    );
-- Step 3: Summarise the table by data_confidence value, largest group first,
-- so the size of the 'garbage' bucket after the update is visible.
SELECT
    data_confidence,
    COUNT(*)
FROM transceivers
GROUP BY data_confidence
ORDER BY COUNT(*) DESC;
-- NOTE: Actual DELETE deferred until scraper improvements are in place.
-- Run this to hard-delete garbage when ready:
-- DELETE FROM transceivers WHERE data_confidence = 'garbage';