- isGarbageName(): detects scraped slugs, 'All Optical Transceivers', 'Compatible NNGbps...', and generic form-factor descriptions with no real SKU
- Panel title priority: real standard_name → part_number → description → title constructed from specs
- Details warning (amber banner) is shown when details_verified = false
- sql/018: marks garbage entries with data_confidence = 'garbage' for a future DELETE
35 lines
1.9 KiB
SQL
35 lines
1.9 KiB
SQL
-- Migration 018: Quarantine garbage scraper data (hard removal is deferred)
-- Targets products whose names are category-page titles instead of real product designations.
-- These rows came from scraping the listing pages, rather than the product pages, of GBICS and similar vendors.
-- ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
-- Step 1: Count what we're dealing with before marking anything.
-- Read-only preview. Every name pattern below mirrors one predicate of the
-- Step 2 UPDATE, so these counts describe exactly the rows that will be marked.
-- Note: a row can match more than one pattern, so the per-pattern counts may
-- sum to more than to_be_marked.
SELECT
    COUNT(*) FILTER (WHERE standard_name ILIKE 'scraped-%') AS scraped_slugs,
    COUNT(*) FILTER (WHERE standard_name ILIKE '%all optical transceivers%') AS all_optical_garbage,
    COUNT(*) FILTER (WHERE standard_name ILIKE 'compatible %gbps%') AS compatible_gbps_garbage,
    COUNT(*) FILTER (
        WHERE standard_name ILIKE '%gigabit ethernet%' AND LENGTH(standard_name) > 25
    ) AS gigabit_eth_garbage,
    -- Informational only: Step 2 does not mark rows on this condition.
    COUNT(*) FILTER (WHERE part_number IS NULL OR part_number = slug) AS no_real_part_number,
    COUNT(*) AS total,
    -- Remaining Step 2 predicates, appended after the original columns so the
    -- existing column names and positions are unchanged.
    COUNT(*) FILTER (
        WHERE standard_name ILIKE 'osfp %g%' AND LENGTH(standard_name) < 20
    ) AS osfp_short_garbage,
    COUNT(*) FILTER (WHERE standard_name ILIKE 'qsfp%400g gigabit%') AS qsfp_400g_gigabit_garbage,
    COUNT(*) FILTER (
        WHERE standard_name ILIKE 'sfp%gigabit%' AND LENGTH(standard_name) > 20
    ) AS sfp_gigabit_garbage,
    -- Distinct rows matching any Step 2 predicate (each row counted once).
    COUNT(*) FILTER (
        WHERE standard_name ILIKE 'scraped-%'
            OR standard_name ILIKE '%all optical transceivers%'
            OR standard_name ILIKE 'compatible %gbps%'
            OR (standard_name ILIKE '%gigabit ethernet%' AND LENGTH(standard_name) > 25)
            OR (standard_name ILIKE 'osfp %g%' AND LENGTH(standard_name) < 20)
            OR standard_name ILIKE 'qsfp%400g gigabit%'
            OR (standard_name ILIKE 'sfp%gigabit%' AND LENGTH(standard_name) > 20)
    ) AS to_be_marked
FROM transceivers;
|
|
|
|
-- Step 2: Mark garbage entries with data_confidence = 'garbage' (non-destructive first).
-- Rows are only flagged here; nothing is deleted by this statement.
UPDATE transceivers
SET
    data_confidence = 'garbage',
    updated_at = NOW()
WHERE
    -- Skip rows that are already marked so re-running the migration does not
    -- rewrite them or bump their updated_at. IS DISTINCT FROM (rather than <>)
    -- keeps rows whose data_confidence is NULL eligible for marking.
    data_confidence IS DISTINCT FROM 'garbage'
    AND (
        -- Slug placeholders left behind by the scraper.
        standard_name ILIKE 'scraped-%'
        -- Category / listing-page titles captured as product names.
        OR standard_name ILIKE '%all optical transceivers%'
        OR standard_name ILIKE 'compatible %gbps%'
        -- Generic descriptions; the length bounds are heuristics carried over
        -- unchanged from the original predicates.
        OR (standard_name ILIKE '%gigabit ethernet%' AND LENGTH(standard_name) > 25)
        OR (standard_name ILIKE 'osfp %g%' AND LENGTH(standard_name) < 20)
        OR standard_name ILIKE 'qsfp%400g gigabit%'
        OR (standard_name ILIKE 'sfp%gigabit%' AND LENGTH(standard_name) > 20)
    );
|
|
|
|
-- Step 3: Report what was marked.
-- One row per data_confidence value, most frequent value first.
SELECT
    data_confidence,
    COUNT(*)
FROM transceivers
GROUP BY data_confidence
ORDER BY COUNT(*) DESC;
|
|
|
|
-- NOTE: Actual DELETE deferred until scraper improvements are in place.
|
|
-- Run this to hard-delete garbage when ready:
|
|
-- DELETE FROM transceivers WHERE data_confidence = 'garbage';
|