From 7d2562af9ce9e8cb096cb27bd6f4e6d1b2127f2c Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Wed, 1 Apr 2026 21:26:13 +0200 Subject: [PATCH] fix: detect+warn garbage product names, add DB cleanup migration 018 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - isGarbageName(): detects scraped-slugs, 'All Optical Transceivers', 'Compatible NNGbps...', generic form-factor descriptions with no real SKU - Panel title priority: real standard_name → part_number → description → constructed from specs - Details warning shown when details_verified = false (amber banner) - sql/018: marks garbage entries as data_confidence='garbage' for future DELETE --- packages/dashboard/index.html | 27 +++++++++++++++++++------ sql/018-cleanup-garbage-data.sql | 34 ++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 6 deletions(-) create mode 100644 sql/018-cleanup-garbage-data.sql diff --git a/packages/dashboard/index.html b/packages/dashboard/index.html index 2c96b2c..9e5b183 100644 --- a/packages/dashboard/index.html +++ b/packages/dashboard/index.html @@ -1833,15 +1833,30 @@ async function openTxDetail(id) { } h += ''; - // Title below image — show proper manufacturer designation, never auto-generated slugs - // A real name has mixed case or dots/digits; a slug looks like "scraped-o-czz8hg-z-a" - var isSlugLike = function(s) { return !s || s.startsWith('scraped-') || /^[a-z0-9-]+$/.test(s); }; - var titleName = (!isSlugLike(t.standard_name) ? t.standard_name : null) - || (!isSlugLike(t.part_number) ? 
t.part_number : null) - || t.description + // Title below image — proper manufacturer designation only, never garbage/auto-generated names + var isGarbageName = function(s) { + if (!s) return true; + if (s.startsWith('scraped-')) return true; // auto-generated slug + if (/^[a-z0-9-]+$/.test(s)) return true; // pure slug: only lowercase+digits+dash + if (/^all optical/i.test(s)) return true; // GBICS category page garbage + if (/^compatible \d+/i.test(s)) return true; // "Compatible 800GBASE-..." category + if (/^osfp \d+g/i.test(s)) return true; // generic form-factor description + if (/^qsfp.{0,5}\d+g/i.test(s)) return true; // "QSFP 400G Gigabit Ethernet" etc. + if (s.toLowerCase().includes('gigabit ethernet') && s.length > 25) return true; + if (s.toLowerCase().startsWith('sfp') && /^sfp\s*\d+g\s*\w+/i.test(s)) return true; + return false; + }; + var titleName = (!isGarbageName(t.standard_name) ? t.standard_name : null) + || (!isGarbageName(t.part_number) ? t.part_number : null) + || (t.description && !isGarbageName(t.description) ? t.description : null) || txDescName(t) || t.slug; h += '
' + esc(titleName) + '
'; + + // Show data quality warning when product is not details-verified + if (!t.details_verified) { + h += '
⚠ Produktdaten nicht aus offizieller Quelle verifiziert
'; + } h += '
'; if (t.vendor_name) h += '' + esc(t.vendor_name) + ' '; if (t.category) h += '' + esc(t.category) + ' '; diff --git a/sql/018-cleanup-garbage-data.sql b/sql/018-cleanup-garbage-data.sql new file mode 100644 index 0000000..d5e32eb --- /dev/null +++ b/sql/018-cleanup-garbage-data.sql @@ -0,0 +1,34 @@ +-- Migration 018: Quarantine garbage scraper data (marks rows only; nothing is deleted here) +-- Products with category-page names instead of real product designations +-- These came from scraping listing pages of GBICS and similar vendors, not product pages +-- ───────────────────────────────────────────────────────────────────────────── + +-- Step 1: Count candidate rows before marking (these patterns only approximate the Step 2 filter) +SELECT + COUNT(*) FILTER (WHERE standard_name ILIKE 'scraped-%') AS scraped_slugs, + COUNT(*) FILTER (WHERE standard_name ILIKE '%all optical%') AS all_optical_garbage, + COUNT(*) FILTER (WHERE standard_name ILIKE '%compatible%gbps%') AS compatible_gbps_garbage, + COUNT(*) FILTER (WHERE standard_name ILIKE '%gigabit ethernet%' AND LENGTH(standard_name) > 25) AS gigabit_eth_garbage, + COUNT(*) FILTER (WHERE part_number IS NULL OR part_number = slug) AS no_real_part_number, + COUNT(*) AS total +FROM transceivers; + +-- Step 2: Mark garbage entries as 'garbage' data_confidence (non-destructive first) +UPDATE transceivers SET + data_confidence = 'garbage', + updated_at = NOW() +WHERE + standard_name ILIKE 'scraped-%' + OR standard_name ILIKE '%all optical transceivers%' + OR standard_name ILIKE 'compatible %gbps%' + OR (standard_name ILIKE '%gigabit ethernet%' AND LENGTH(standard_name) > 25) + OR (standard_name ILIKE 'osfp %g%' AND LENGTH(standard_name) < 20) + OR (standard_name ILIKE 'qsfp%400g gigabit%') + OR (standard_name ILIKE 'sfp%gigabit%' AND LENGTH(standard_name) > 20); + +-- Step 3: Report what was marked +SELECT data_confidence, COUNT(*) FROM transceivers GROUP BY data_confidence ORDER BY COUNT(*) DESC; + +-- NOTE: Actual DELETE deferred until scraper improvements are in place. 
+-- Run this to hard-delete garbage when ready: +-- DELETE FROM transceivers WHERE data_confidence = 'garbage';