fix: detect+warn garbage product names, add DB cleanup migration 018
- isGarbageName(): detects 'scraped-' slugs, 'All Optical Transceivers', 'Compatible NNGbps...', and generic form-factor descriptions with no real SKU
- Panel title priority: real standard_name → part_number → description → constructed from specs → slug
- Details warning (amber banner) shown when details_verified is false
- sql/018: marks garbage entries as data_confidence='garbage' for a future DELETE
This commit is contained in:
parent
7b14ac4bbe
commit
480decd307
@ -1833,15 +1833,30 @@ async function openTxDetail(id) {
|
|||||||
}
|
}
|
||||||
h += '</div>';
|
h += '</div>';
|
||||||
|
|
||||||
// Title below image — show proper manufacturer designation, never auto-generated slugs
|
// Title below image — proper manufacturer designation only, never garbage/auto-generated names
|
||||||
// A real name has mixed case or dots/digits; a slug looks like "scraped-o-czz8hg-z-a"
|
var isGarbageName = function(s) {
|
||||||
var isSlugLike = function(s) { return !s || s.startsWith('scraped-') || /^[a-z0-9-]+$/.test(s); };
|
if (!s) return true;
|
||||||
var titleName = (!isSlugLike(t.standard_name) ? t.standard_name : null)
|
if (s.startsWith('scraped-')) return true; // auto-generated slug
|
||||||
|| (!isSlugLike(t.part_number) ? t.part_number : null)
|
if (/^[a-z0-9-]+$/.test(s)) return true; // pure slug: only lowercase+digits+dash
|
||||||
|| t.description
|
if (/^all optical/i.test(s)) return true; // GBICS category page garbage
|
||||||
|
if (/^compatible \d+/i.test(s)) return true; // "Compatible 800GBASE-..." category
|
||||||
|
if (/^osfp \d+g/i.test(s)) return true; // generic form-factor description
|
||||||
|
if (/^qsfp.{0,5}\d+g/i.test(s)) return true; // "QSFP 400G Gigabit Ethernet" etc.
|
||||||
|
if (s.toLowerCase().includes('gigabit ethernet') && s.length > 25) return true;
|
||||||
|
if (s.toLowerCase().startsWith('sfp') && /^sfp\s*\d+g\s*\w+/i.test(s)) return true;
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
var titleName = (!isGarbageName(t.standard_name) ? t.standard_name : null)
|
||||||
|
|| (!isGarbageName(t.part_number) ? t.part_number : null)
|
||||||
|
|| (t.description && !isGarbageName(t.description) ? t.description : null)
|
||||||
|| txDescName(t)
|
|| txDescName(t)
|
||||||
|| t.slug;
|
|| t.slug;
|
||||||
h += '<div class="panel-title">' + esc(titleName) + '</div>';
|
h += '<div class="panel-title">' + esc(titleName) + '</div>';
|
||||||
|
|
||||||
|
// Show data quality warning when product is not details-verified
|
||||||
|
if (!t.details_verified) {
|
||||||
|
h += '<div style="font-size:0.72rem;color:#c1440e;background:rgba(193,68,14,0.07);border:1px solid rgba(193,68,14,0.2);border-radius:5px;padding:0.35rem 0.6rem;margin:0.4rem 0">⚠ Produktdaten nicht aus offizieller Quelle verifiziert</div>';
|
||||||
|
}
|
||||||
h += '<div class="panel-sub">';
|
h += '<div class="panel-sub">';
|
||||||
if (t.vendor_name) h += '<span class="b b-blue" title="Hersteller / Marke dieses Produkts">' + esc(t.vendor_name) + '</span> ';
|
if (t.vendor_name) h += '<span class="b b-blue" title="Hersteller / Marke dieses Produkts">' + esc(t.vendor_name) + '</span> ';
|
||||||
if (t.category) h += '<span class="b b-neutral" title="Einsatzbereich: ' + esc(t.category) + '">' + esc(t.category) + '</span> ';
|
if (t.category) h += '<span class="b b-neutral" title="Einsatzbereich: ' + esc(t.category) + '">' + esc(t.category) + '</span> ';
|
||||||
|
|||||||
34
sql/018-cleanup-garbage-data.sql
Normal file
34
sql/018-cleanup-garbage-data.sql
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
-- Migration 018: Quarantine garbage scraper data (rows are marked, not deleted)
|
||||||
|
-- Products with category-page names instead of real product designations
|
||||||
|
-- These came from scraping listing pages (GBICS and similar vendors) instead of product pages
|
||||||
|
-- ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
-- Step 1: Count candidate rows before marking (patterns here do not exactly match the Step 2 filters)
|
||||||
|
SELECT
|
||||||
|
COUNT(*) FILTER (WHERE standard_name ILIKE 'scraped-%') AS scraped_slugs,
|
||||||
|
COUNT(*) FILTER (WHERE standard_name ILIKE '%all optical%') AS all_optical_garbage,
|
||||||
|
COUNT(*) FILTER (WHERE standard_name ILIKE '%compatible%gbps%') AS compatible_gbps_garbage,
|
||||||
|
COUNT(*) FILTER (WHERE standard_name ILIKE '%gigabit ethernet%' AND LENGTH(standard_name) > 25) AS gigabit_eth_garbage,
|
||||||
|
COUNT(*) FILTER (WHERE part_number IS NULL OR part_number = slug) AS no_real_part_number,
|
||||||
|
COUNT(*) AS total
|
||||||
|
FROM transceivers;
|
||||||
|
|
||||||
|
-- Step 2: Mark garbage entries as 'garbage' data_confidence (non-destructive first)
|
||||||
|
UPDATE transceivers SET
|
||||||
|
data_confidence = 'garbage',
|
||||||
|
updated_at = NOW()
|
||||||
|
WHERE
|
||||||
|
standard_name ILIKE 'scraped-%'
|
||||||
|
OR standard_name ILIKE '%all optical transceivers%'
|
||||||
|
OR standard_name ILIKE 'compatible %gbps%'
|
||||||
|
OR (standard_name ILIKE '%gigabit ethernet%' AND LENGTH(standard_name) > 25)
|
||||||
|
OR (standard_name ILIKE 'osfp %g%' AND LENGTH(standard_name) < 20)
|
||||||
|
OR (standard_name ILIKE 'qsfp%400g gigabit%')
|
||||||
|
OR (standard_name ILIKE 'sfp%gigabit%' AND LENGTH(standard_name) > 20);
|
||||||
|
|
||||||
|
-- Step 3: Report what was marked
|
||||||
|
SELECT data_confidence, COUNT(*) FROM transceivers GROUP BY data_confidence ORDER BY COUNT(*) DESC;
|
||||||
|
|
||||||
|
-- NOTE: Actual DELETE deferred until scraper improvements are in place.
|
||||||
|
-- Run this to hard-delete garbage when ready:
|
||||||
|
-- DELETE FROM transceivers WHERE data_confidence = 'garbage';
|
||||||
Loading…
x
Reference in New Issue
Block a user