-- Migration 018: Remove / quarantine garbage scraper data
-- Products with category-page names instead of real product designations.
-- These came from GBICS and similar vendors scraping listing pages, not product pages.
-- ─────────────────────────────────────────────────────────────────────────────
--
-- Dialect: PostgreSQL (relies on ILIKE, aggregate FILTER, IS DISTINCT FROM).
-- The script is non-destructive: it only sets data_confidence = 'garbage'
-- (and bumps updated_at) on matching rows; nothing is deleted here.
-- Safe to re-run: rows already marked 'garbage' are skipped by Step 2.
-- NOTE(review): there is no explicit BEGIN/COMMIT in this file — presumably
-- the migration runner wraps each file in a transaction; confirm before
-- running it by hand.

-- Step 1: Count what we're dealing with before deleting.
-- The first five counters are broad diagnostics and are NOT the exact Step 2
-- predicate (e.g. '%all optical%' here vs '%all optical transceivers%' below).
-- pending_garbage_mark mirrors Step 2 exactly and is the number of rows the
-- UPDATE will touch on this run.
SELECT
    COUNT(*) FILTER (WHERE standard_name ILIKE 'scraped-%')          AS scraped_slugs,
    COUNT(*) FILTER (WHERE standard_name ILIKE '%all optical%')      AS all_optical_garbage,
    COUNT(*) FILTER (WHERE standard_name ILIKE '%compatible%gbps%')  AS compatible_gbps_garbage,
    COUNT(*) FILTER (
        WHERE standard_name ILIKE '%gigabit ethernet%'
          AND LENGTH(standard_name) > 25
    )                                                                AS gigabit_eth_garbage,
    COUNT(*) FILTER (WHERE part_number IS NULL OR part_number = slug) AS no_real_part_number,
    COUNT(*)                                                         AS total,
    -- Keep this predicate in sync with the WHERE clause of Step 2.
    COUNT(*) FILTER (
        WHERE data_confidence IS DISTINCT FROM 'garbage'
          AND (
                 standard_name ILIKE 'scraped-%'
              OR standard_name ILIKE '%all optical transceivers%'
              OR standard_name ILIKE 'compatible %gbps%'
              OR (standard_name ILIKE '%gigabit ethernet%' AND LENGTH(standard_name) > 25)
              OR (standard_name ILIKE 'osfp %g%' AND LENGTH(standard_name) < 20)
              OR (standard_name ILIKE 'qsfp%400g gigabit%')
              OR (standard_name ILIKE 'sfp%gigabit%' AND LENGTH(standard_name) > 20)
          )
    )                                                                AS pending_garbage_mark
FROM transceivers;

-- Step 2: Mark garbage entries as 'garbage' data_confidence (non-destructive first).
-- Rows whose standard_name is NULL never match: every ILIKE yields NULL, and
-- WHERE treats NULL as false.
UPDATE transceivers
SET
    data_confidence = 'garbage',
    updated_at      = NOW()
WHERE
    -- Idempotency guard: do not rewrite (or re-timestamp) rows that are already
    -- marked. IS DISTINCT FROM is used instead of <> so rows with a NULL
    -- data_confidence are still eligible.
    data_confidence IS DISTINCT FROM 'garbage'
    AND (
           standard_name ILIKE 'scraped-%'
        OR standard_name ILIKE '%all optical transceivers%'
        OR standard_name ILIKE 'compatible %gbps%'
        -- The length thresholds below are heuristics carried over unchanged
        -- from the original migration.
        OR (standard_name ILIKE '%gigabit ethernet%' AND LENGTH(standard_name) > 25)
        OR (standard_name ILIKE 'osfp %g%' AND LENGTH(standard_name) < 20)
        OR (standard_name ILIKE 'qsfp%400g gigabit%')
        OR (standard_name ILIKE 'sfp%gigabit%' AND LENGTH(standard_name) > 20)
    );

-- Step 3: Report what was marked.
-- data_confidence is a tie-breaker so the report order is deterministic when
-- two confidence levels have the same row count.
SELECT
    data_confidence,
    COUNT(*)
FROM transceivers
GROUP BY data_confidence
ORDER BY COUNT(*) DESC, data_confidence;

-- NOTE: Actual DELETE deferred until scraper improvements are in place.
-- Run this to hard-delete garbage when ready:
--   DELETE FROM transceivers WHERE data_confidence = 'garbage';