fix: detect+warn garbage product names, add DB cleanup migration 018

- isGarbageName(): detects scraped-slugs, 'All Optical Transceivers', 'Compatible NNGbps...',
  generic form-factor descriptions with no real SKU
- Panel title priority: real standard_name → part_number → description → constructed from specs
- Details warning (amber banner) shown whenever details_verified is not truthy, i.e. false or missing
- sql/018: marks garbage entries as data_confidence='garbage' for future DELETE
This commit is contained in:
Rene Fichtmueller 2026-04-01 21:26:13 +02:00
parent 4d4ffcf5a5
commit 7d2562af9c
2 changed files with 55 additions and 6 deletions

View File

@ -1833,15 +1833,30 @@ async function openTxDetail(id) {
}
h += '</div>';
// Title below image — show proper manufacturer designation, never auto-generated slugs
// Heuristic for auto-generated identifiers: a value is slug-like when it is
// empty/missing, carries the scraper's "scraped-" prefix, or consists solely of
// lowercase letters, digits and dashes (e.g. "scraped-o-czz8hg-z-a").
// Anything with uppercase letters, dots, spaces or other characters is kept.
var isSlugLike = function(value) {
  if (!value) {
    return true;
  }
  if (value.startsWith('scraped-')) {
    return true;
  }
  return /^[a-z0-9-]+$/.test(value);
};
var titleName = (!isSlugLike(t.standard_name) ? t.standard_name : null)
|| (!isSlugLike(t.part_number) ? t.part_number : null)
|| t.description
// Title below image — proper manufacturer designation only, never garbage/auto-generated names
// Decide whether a candidate product name is scraper garbage (auto-generated
// slug or category-page heading) rather than a real product designation.
//
// @param {*} s  Candidate name (standard_name, part_number, description, ...).
// @returns {boolean}  true when the value must NOT be used as a display title.
//
// Rules are checked against the trimmed value, so stray leading/trailing
// whitespace from scraping cannot hide a garbage prefix.
var isGarbageName = function(s) {
  // Missing or non-string values (null, undefined, numbers) are unusable as a
  // title; guarding here also prevents a TypeError from the string methods below.
  if (typeof s !== 'string') return true;
  var name = s.trim();
  if (name === '') return true;

  if (name.startsWith('scraped-')) return true;          // auto-generated slug
  if (/^[a-z0-9-]+$/.test(name)) return true;            // pure slug: only lowercase+digits+dash
  if (/^all optical/i.test(name)) return true;           // category page heading
  if (/^compatible \d+/i.test(name)) return true;        // "Compatible 800GBASE-..." category
  if (/^osfp \d+g/i.test(name)) return true;             // generic form-factor description
  // Generic "QSFP 400G ..." / "QSFP28 100G ..." / "QSFP-DD 400G ..." descriptions.
  // Whitespace is required before the speed so that hyphenated SKUs such as
  // "QSFP-100G-SR4-S" are NOT rejected.
  if (/^qsfp\S{0,5}\s+\d+g/i.test(name)) return true;
  // Long marketing-style descriptions mentioning "gigabit ethernet".
  if (name.toLowerCase().includes('gigabit ethernet') && name.length > 25) return true;
  // Generic "SFP 10G SR"-style description; the anchored, case-insensitive
  // regex already implies the name starts with "sfp".
  if (/^sfp\s*\d+g\s*\w+/i.test(name)) return true;
  return false;
};
var titleName = (!isGarbageName(t.standard_name) ? t.standard_name : null)
|| (!isGarbageName(t.part_number) ? t.part_number : null)
|| (t.description && !isGarbageName(t.description) ? t.description : null)
|| txDescName(t)
|| t.slug;
h += '<div class="panel-title">' + esc(titleName) + '</div>';
// Show data quality warning when product is not details-verified
if (!t.details_verified) {
h += '<div style="font-size:0.72rem;color:#c1440e;background:rgba(193,68,14,0.07);border:1px solid rgba(193,68,14,0.2);border-radius:5px;padding:0.35rem 0.6rem;margin:0.4rem 0">⚠ Produktdaten nicht aus offizieller Quelle verifiziert</div>';
}
h += '<div class="panel-sub">';
if (t.vendor_name) h += '<span class="b b-blue" title="Hersteller / Marke dieses Produkts">' + esc(t.vendor_name) + '</span> ';
if (t.category) h += '<span class="b b-neutral" title="Einsatzbereich: ' + esc(t.category) + '">' + esc(t.category) + '</span> ';

View File

@ -0,0 +1,34 @@
-- Migration 018: Quarantine garbage scraper data (non-destructive)
-- Products with category-page names instead of real product designations.
-- These came from GBICS and similar vendors scraping listing pages, not product pages.
-- ─────────────────────────────────────────────────────────────────────────────

-- Step 1: Count what we're dealing with before marking anything.
-- Every *_garbage / scraped_slugs filter mirrors the corresponding predicate of
-- the UPDATE in Step 2, so this preview reflects what will actually be marked.
-- (A row can match several rules, so the per-rule counts may overlap.)
SELECT
    COUNT(*) FILTER (WHERE standard_name ILIKE 'scraped-%') AS scraped_slugs,
    COUNT(*) FILTER (WHERE standard_name ILIKE '%all optical transceivers%') AS all_optical_garbage,
    COUNT(*) FILTER (WHERE standard_name ILIKE 'compatible %gbps%') AS compatible_gbps_garbage,
    COUNT(*) FILTER (WHERE standard_name ILIKE '%gigabit ethernet%' AND LENGTH(standard_name) > 25) AS gigabit_eth_garbage,
    -- Informational only: rows counted here are NOT marked by Step 2.
    COUNT(*) FILTER (WHERE part_number IS NULL OR part_number = slug) AS no_real_part_number,
    COUNT(*) AS total,
    -- Appended after the original columns so existing column positions are unchanged.
    COUNT(*) FILTER (WHERE standard_name ILIKE 'osfp %g%' AND LENGTH(standard_name) < 20) AS osfp_generic_garbage,
    COUNT(*) FILTER (WHERE standard_name ILIKE 'qsfp%400g gigabit%') AS qsfp_generic_garbage,
    COUNT(*) FILTER (WHERE standard_name ILIKE 'sfp%gigabit%' AND LENGTH(standard_name) > 20) AS sfp_generic_garbage
FROM transceivers;

-- Step 2: Mark garbage entries as 'garbage' data_confidence (non-destructive first).
-- Rows already marked are skipped so that re-running the migration does not
-- bump updated_at again; IS DISTINCT FROM also treats NULL data_confidence as
-- "not yet marked".
UPDATE transceivers SET
    data_confidence = 'garbage',
    updated_at = NOW()
WHERE
    data_confidence IS DISTINCT FROM 'garbage'
    AND (
        standard_name ILIKE 'scraped-%'
        OR standard_name ILIKE '%all optical transceivers%'
        OR standard_name ILIKE 'compatible %gbps%'
        OR (standard_name ILIKE '%gigabit ethernet%' AND LENGTH(standard_name) > 25)
        OR (standard_name ILIKE 'osfp %g%' AND LENGTH(standard_name) < 20)
        OR (standard_name ILIKE 'qsfp%400g gigabit%')
        OR (standard_name ILIKE 'sfp%gigabit%' AND LENGTH(standard_name) > 20)
    );

-- Step 3: Report what was marked
SELECT data_confidence, COUNT(*) FROM transceivers GROUP BY data_confidence ORDER BY COUNT(*) DESC;

-- NOTE: Actual DELETE deferred until scraper improvements are in place.
-- Run this to hard-delete garbage when ready:
-- DELETE FROM transceivers WHERE data_confidence = 'garbage';