feat: add targeted product page asset verifier
This commit is contained in:
parent
0d4bcb6924
commit
adb2661fac
@ -18,6 +18,7 @@
|
|||||||
"scrape:sfpcables": "tsx src/scrapers/sfpcables.ts",
|
"scrape:sfpcables": "tsx src/scrapers/sfpcables.ts",
|
||||||
"verify:catalog:details": "tsx src/utils/verify-catalog-details.ts",
|
"verify:catalog:details": "tsx src/utils/verify-catalog-details.ts",
|
||||||
"verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts",
|
"verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts",
|
||||||
|
"verify:product-page-assets": "tsx src/utils/verify-product-page-assets.ts",
|
||||||
"verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts",
|
"verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts",
|
||||||
"verify:fs:sku-aliases": "tsx src/utils/quarantine-fs-sku-aliases.ts",
|
"verify:fs:sku-aliases": "tsx src/utils/quarantine-fs-sku-aliases.ts",
|
||||||
"verify:no-valid-competitor": "tsx src/utils/resolve-no-valid-competitor.ts",
|
"verify:no-valid-competitor": "tsx src/utils/resolve-no-valid-competitor.ts",
|
||||||
|
|||||||
@ -44,7 +44,9 @@ async function quarantine(): Promise<void> {
|
|||||||
v.name = 'GBICS'
|
v.name = 'GBICS'
|
||||||
AND (
|
AND (
|
||||||
t.part_number ~* '(Cables?|Patch Leads?|Choose By|Cart with|Vendor|Speed|Network Adapters|Media Converters|All NVIDIA|ConnectX|Cisco Systems|Cisco Meraki|Cisco Viptela|Alcatel-Lucent|Brocade|WatchGuard|HP Procurve|AOC|DAC)'
|
t.part_number ~* '(Cables?|Patch Leads?|Choose By|Cart with|Vendor|Speed|Network Adapters|Media Converters|All NVIDIA|ConnectX|Cisco Systems|Cisco Meraki|Cisco Viptela|Alcatel-Lucent|Brocade|WatchGuard|HP Procurve|AOC|DAC)'
|
||||||
|
OR t.part_number ~* '(^[0-9]+G (QSFP|OSFP|SFP)|I-Temp Tranceivers|Gigabit Ethernet$)'
|
||||||
OR COALESCE(t.product_page_url, '') ~* '(active-optical-cables|direct-attach|patch-cables|patch-leads|choose-by|custom-transceivers|network-adapters|media-converters|cart\\.php|nvidia-2|connectx|cisco|watchguard|hp-procurve|aoc-dac|mtp-mpo-cables)'
|
OR COALESCE(t.product_page_url, '') ~* '(active-optical-cables|direct-attach|patch-cables|patch-leads|choose-by|custom-transceivers|network-adapters|media-converters|cart\\.php|nvidia-2|connectx|cisco|watchguard|hp-procurve|aoc-dac|mtp-mpo-cables)'
|
||||||
|
OR COALESCE(t.product_page_url, '') ~* '(100g-qsfp28/?$|100g-qsfp28-i-temp|10gb-industrial-temp|1g-sfp-i-temp|200g-qsfp56/?$|25g-sfp28-i-temp|400g-qsfp-dd/?$|400g-qsfp112/?$|40g-qsfp-i-temp|i-temp-tranceivers|osfp-[0-9]+g-gigabit|qsfp.*gigabit|400g-qsfp112-2)'
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
OR (
|
OR (
|
||||||
@ -86,7 +88,9 @@ async function quarantine(): Promise<void> {
|
|||||||
v.name = 'GBICS'
|
v.name = 'GBICS'
|
||||||
AND (
|
AND (
|
||||||
t.part_number ~* '(Cables?|Patch Leads?|Choose By|Cart with|Vendor|Speed|Network Adapters|Media Converters|All NVIDIA|ConnectX|Cisco Systems|Cisco Meraki|Cisco Viptela|Alcatel-Lucent|Brocade|WatchGuard|HP Procurve|AOC|DAC)'
|
t.part_number ~* '(Cables?|Patch Leads?|Choose By|Cart with|Vendor|Speed|Network Adapters|Media Converters|All NVIDIA|ConnectX|Cisco Systems|Cisco Meraki|Cisco Viptela|Alcatel-Lucent|Brocade|WatchGuard|HP Procurve|AOC|DAC)'
|
||||||
|
OR t.part_number ~* '(^[0-9]+G (QSFP|OSFP|SFP)|I-Temp Tranceivers|Gigabit Ethernet$)'
|
||||||
OR COALESCE(t.product_page_url, '') ~* '(active-optical-cables|direct-attach|patch-cables|patch-leads|choose-by|custom-transceivers|network-adapters|media-converters|cart\\.php|nvidia-2|connectx|cisco|watchguard|hp-procurve|aoc-dac|mtp-mpo-cables)'
|
OR COALESCE(t.product_page_url, '') ~* '(active-optical-cables|direct-attach|patch-cables|patch-leads|choose-by|custom-transceivers|network-adapters|media-converters|cart\\.php|nvidia-2|connectx|cisco|watchguard|hp-procurve|aoc-dac|mtp-mpo-cables)'
|
||||||
|
OR COALESCE(t.product_page_url, '') ~* '(100g-qsfp28/?$|100g-qsfp28-i-temp|10gb-industrial-temp|1g-sfp-i-temp|200g-qsfp56/?$|25g-sfp28-i-temp|400g-qsfp-dd/?$|400g-qsfp112/?$|40g-qsfp-i-temp|i-temp-tranceivers|osfp-[0-9]+g-gigabit|qsfp.*gigabit|400g-qsfp112-2)'
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
OR (
|
OR (
|
||||||
|
|||||||
368
packages/scraper/src/utils/verify-product-page-assets.ts
Normal file
368
packages/scraper/src/utils/verify-product-page-assets.ts
Normal file
@ -0,0 +1,368 @@
|
|||||||
|
import { pool, markDetailsVerified, markImageVerified } from "./db";
|
||||||
|
|
||||||
|
/**
 * Category labels that are filtered out of the candidate query: rows whose
 * `category` equals any of these values are never fetched or updated.
 */
const EXCLUDED_CATEGORIES: string[] = [
  "NonTransceiver",
  "Accessory",
  "Adapter / Converter",
  "Switch / Media Converter",
  "Switch / Network Infrastructure",
  "NIC / Adapter",
  "Mux / Passive Optical",
  "Product Family",
  "Loopback / Test Module",
];
|
||||||
|
|
||||||
|
/** Request headers sent with every product-page fetch (browser-like UA and Accept values). */
const HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.9",
};
|
||||||
|
|
||||||
|
/**
 * One row returned by the candidate query in `main`. Field names match the
 * quoted column aliases used in that SELECT.
 */
type Candidate = {
  /** `transceivers.id`; used as the key for every update. */
  id: string;
  /** `vendors.name` of the joined vendor row. */
  vendorName: string;
  /** `transceivers.part_number`. */
  partNumber: string;
  /** `transceivers.product_page_url`; the query only returns http(s) URLs. */
  productUrl: string;
  /** `transceivers.form_factor`, or null when not yet known. */
  formFactor: string | null;
  /** `transceivers.speed_gbps`, or null when not yet known. */
  speedGbps: number | null;
  /** `transceivers.reach_label`, or null when not yet known. */
  reachLabel: string | null;
  /** `transceivers.fiber_type`, or null when not yet known. */
  fiberType: string | null;
  /** `transceivers.image_verified`. */
  imageVerified: boolean | null;
  /** `transceivers.details_verified`. */
  detailsVerified: boolean | null;
};
|
||||||
|
|
||||||
|
/**
 * Product attributes inferred from text. Every field is optional: a field is
 * present only when the corresponding pattern matched.
 */
type InferredDetails = {
  /** Form-factor label such as "QSFP28" or "SFP+". */
  formFactor?: string;
  /** Human-readable speed label such as "100G" or "32G FC". */
  speed?: string;
  /** Numeric speed paired with `speed`. */
  speedGbps?: number;
  /** Reach label such as "10km" or "300m". */
  reachLabel?: string;
  /** Reach in meters paired with `reachLabel`. */
  reachMeters?: number;
  /** "Copper", "SMF" or "MMF". */
  fiberType?: string;
  /** Digits of the first `<n>nm` wavelength found in the text. */
  wavelengths?: string;
};
|
||||||
|
|
||||||
|
/**
 * Returns a promise that resolves after roughly `ms` milliseconds.
 *
 * @param ms Delay passed to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||||
|
|
||||||
|
/**
 * Decodes the handful of HTML character references that commonly appear in
 * attribute values and titles: `&quot;`, `&#39;` / `&#x27;` / `&apos;`,
 * `&lt;`, `&gt;` and `&amp;`.
 *
 * `&amp;` is decoded last on purpose: decoding it first would turn an escaped
 * reference such as `&amp;lt;` into `&lt;` and then into `<`, i.e. decode the
 * input twice.
 *
 * @param value Raw text taken from HTML markup.
 * @returns The text with the supported entities replaced by their characters.
 */
function decodeEntities(value: string): string {
  return value
    .replace(/&quot;/g, '"')
    .replace(/&#0*39;|&#x0*27;|&apos;/gi, "'")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&amp;/g, "&");
}
|
||||||
|
|
||||||
|
/**
 * Resolves an image/link reference against the page URL.
 *
 * @param src Reference as found in the markup; entity-decoded before resolving.
 * @param baseUrl URL of the page the reference came from.
 * @returns The absolute URL string, or "" when the reference cannot be parsed.
 */
function absolutizeUrl(src: string, baseUrl: string): string {
  let resolved = "";
  try {
    const decodedSrc = decodeEntities(src);
    resolved = new URL(decodedSrc, baseUrl).toString();
  } catch {
    // Unparseable reference: fall through with the empty-string result.
  }
  return resolved;
}
|
||||||
|
|
||||||
|
/**
 * Reduces an HTML fragment to plain text: script and style elements are
 * dropped together with their contents, remaining tags are removed, and runs
 * of whitespace are collapsed to single spaces.
 *
 * @param html HTML fragment.
 * @returns Trimmed, whitespace-normalized text.
 */
function stripHtml(html: string): string {
  const withoutScripts = html.replace(/<script[\s\S]*?<\/script>/gi, " ");
  const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/gi, " ");
  const withoutTags = withoutStyles.replace(/<[^>]+>/g, " ");
  const collapsed = withoutTags.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
||||||
|
|
||||||
|
/**
 * Extracts a page title, preferring the `og:title` meta tag (either attribute
 * order) and falling back to the `<title>` element.
 *
 * The `content` value is matched up to the SAME quote character that opened
 * it, so an apostrophe inside a double-quoted value (`content="Vendor's 10G"`)
 * no longer truncates the title. An empty `content` still falls through to
 * `<title>`.
 *
 * @param html Full page markup.
 * @returns The entity-decoded title, or "" when none is found.
 */
function extractTitle(html: string): string {
  // Group 1 is the opening quote, group 2 the attribute value.
  const og =
    html.match(/<meta[^>]+property=["']og:title["'][^>]+content=(["'])((?:(?!\1)[\s\S])+)\1[^>]*>/i) ||
    html.match(/<meta[^>]+content=(["'])((?:(?!\1)[\s\S])+)\1[^>]+property=["']og:title["'][^>]*>/i);
  if (og) return decodeEntities(og[2]);

  const title = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  return title ? decodeEntities(stripHtml(title[1])) : "";
}
|
||||||
|
|
||||||
|
/**
 * Picks a product image URL from page markup, trying in order:
 *   1. the first matching `og:image` / `twitter:image` meta tag,
 *   2. an `<img>` whose id is `image`,
 *   3. the first `<img>` whose URL passes `isLikelyProductImage` and whose
 *      URL or alt text mentions a transceiver-related keyword.
 *
 * For steps 1 and 2 only the first matching tag is considered; if its URL is
 * rejected the search moves on to the next step.
 *
 * @param html Full page markup.
 * @param baseUrl Page URL used to resolve relative references.
 * @returns An absolute image URL, or undefined when nothing qualifies.
 */
function extractImageUrl(html: string, baseUrl: string): string | undefined {
  // Returns the match for the first pattern that hits, mirroring an `a || b || ...` chain.
  const firstMatch = (patterns: RegExp[]): RegExpMatchArray | null => {
    for (const pattern of patterns) {
      const hit = html.match(pattern);
      if (hit) return hit;
    }
    return null;
  };

  const metaHit = firstMatch([
    /<meta[^>]+property=["']og:image["'][^>]+content=["']([^"']+)["'][^>]*>/i,
    /<meta[^>]+content=["']([^"']+)["'][^>]+property=["']og:image["'][^>]*>/i,
    /<meta[^>]+name=["']twitter:image["'][^>]+content=["']([^"']+)["'][^>]*>/i,
    /<meta[^>]+content=["']([^"']+)["'][^>]+name=["']twitter:image["'][^>]*>/i,
  ]);
  if (metaHit) {
    const metaUrl = absolutizeUrl(metaHit[1], baseUrl);
    if (isLikelyProductImage(metaUrl)) return metaUrl;
  }

  const idHit = firstMatch([
    /<img[^>]+id=["']image["'][^>]+src=["']([^"']+)["'][^>]*>/i,
    /<img[^>]+src=["']([^"']+)["'][^>]+id=["']image["'][^>]*>/i,
  ]);
  if (idHit) {
    const idUrl = absolutizeUrl(idHit[1], baseUrl);
    if (isLikelyProductImage(idUrl)) return idUrl;
  }

  for (const imgHit of html.matchAll(/<img\b[^>]*\bsrc=["']([^"']+)["'][^>]*>/gi)) {
    const imgTag = imgHit[0];
    const imgUrl = absolutizeUrl(imgHit[1], baseUrl);
    const altText = (imgTag.match(/\balt=["']([^"']*)["']/i)?.[1] || "").toLowerCase();
    const searchable = `${imgUrl} ${altText}`.toLowerCase();
    const mentionsTransceiver = /transceiver|optical|sfp|qsfp|osfp|xfp|module|base-/i.test(searchable);
    if (isLikelyProductImage(imgUrl) && mentionsTransceiver) return imgUrl;
  }

  return undefined;
}
|
||||||
|
|
||||||
|
/**
 * Heuristic filter for product photos. A URL is accepted only when it
 *   - is a non-empty http(s) URL,
 *   - ends in .jpg/.jpeg/.png/.webp (optionally followed by a query string),
 *   - contains none of the site-chrome keywords (logo, icon, banner, ...), and
 *   - contains at least one product-related keyword or path.
 *
 * All checks run against the lower-cased URL.
 *
 * @param url Absolute URL to test.
 * @returns true when the URL looks like a product image.
 */
function isLikelyProductImage(url: string): boolean {
  const candidate = url.toLowerCase();

  const isHttpUrl = candidate !== "" && /^https?:\/\//.test(candidate);
  if (!isHttpUrl) return false;

  const hasImageExtension = /\.(jpg|jpeg|png|webp)(\?|$)/.test(candidate);
  if (!hasImageExtension) return false;

  const looksLikeSiteChrome =
    /(logo|country|icon|sprite|banner|menu|payment|placeholder|no-image|company|featured-|solution|fofc|mdc|mmc)/.test(candidate);
  if (looksLikeSiteChrome) return false;

  return /(catalog\/product|\/uploads\/image\/|product|transceiver|sfp|qsfp|osfp|xfp)/.test(candidate);
}
|
||||||
|
|
||||||
|
/**
 * Infers transceiver attributes from free text (the caller passes part number,
 * product URL and page title joined by spaces). Matching is case-insensitive.
 *
 * For form factor and speed, the pattern lists are ordered and the first hit
 * wins, so more specific entries must stay ahead of more general ones
 * (e.g. QSFP28 before QSFP, 100G before 10G).
 *
 * Speed disambiguation around "25":
 *   - `1.25G`, `1-25g` (URL-slug spelling) and `125g` are read as 1G;
 *   - `25G` is accepted after any character except a digit or ".", and is
 *     rejected when it is the tail of `1-25g`; a hyphen-prefixed `-25G`
 *     (as in `SFP28-25G-SR`) therefore counts as 25G;
 *   - `2.5G` requires an explicit "." or "-" separator, so a bare `25G` can
 *     never be reported as 2.5G.
 *
 * Fiber-type suffix matching accepts the same optional lane digits as the
 * reach rules (LR4, ER4, FR4, DR8, SR4, ...), so a text that yields a reach
 * from such a suffix also yields a fiber type.
 *
 * @param text Text to scan.
 * @returns Only the attributes that could be inferred; unmatched fields are absent.
 */
function inferDetails(text: string): InferredDetails {
  const details: InferredDetails = {};

  const formFactors: Array<[RegExp, string]> = [
    [/\bqsfp-dd\b/i, "QSFP-DD"],
    [/\bqsfp112\b/i, "QSFP112"],
    [/\bosfp112\b/i, "OSFP112"],
    [/\bosfp\b/i, "OSFP"],
    [/\bqsfp56\b/i, "QSFP56"],
    [/\bqsfp28\b/i, "QSFP28"],
    [/\bqsfp\+|\bqsfp\b/i, "QSFP+"],
    [/\bsfp56\b/i, "SFP56"],
    [/\bsfp28\b/i, "SFP28"],
    [/\bsfp\+|\bsfp plus\b/i, "SFP+"],
    [/\bxfp\b/i, "XFP"],
    [/\bsfp\b/i, "SFP"],
  ];
  for (const [regex, formFactor] of formFactors) {
    if (regex.test(text)) {
      details.formFactor = formFactor;
      break;
    }
  }

  const speedPatterns: Array<[RegExp, string, number]> = [
    [/\b800\s*g(?:base)?\b/i, "800G", 800],
    [/\b400\s*g(?:base)?\b/i, "400G", 400],
    [/\b200\s*g(?:base)?\b/i, "200G", 200],
    [/\b128\s*gfc\b/i, "128G FC", 128],
    [/\b100\s*g(?:base)?\b/i, "100G", 100],
    [/\b50\s*g(?:base)?\b/i, "50G", 50],
    [/\b40\s*g(?:base)?\b/i, "40G", 40],
    [/\b32\s*gfc\b/i, "32G FC", 32],
    [/\b16\s*gfc\b/i, "16G FC", 16],
    [/\b10\s*g(?:base)?\b/i, "10G", 10],
    // Must precede the 25G entry so that 1.25G / 1-25g is claimed as 1G.
    [/\b1[.-]?25\s*g(?:base)?\b|\b1000base\b|\b1\s*g(?:base)?\b/i, "1G", 1],
    // Not after a digit or "." (1.25G, 125G) and not the tail of a "1-25g" slug.
    [/(?<![\d.])(?<!\b1-)25\s*g(?:base)?\b/i, "25G", 25],
    // Separator is mandatory; a bare "25g" is never 2.5G.
    [/\b2[.-]5\s*g(?:base)?\b/i, "2.5G", 2.5],
  ];
  for (const [regex, speed, speedGbps] of speedPatterns) {
    if (regex.test(text)) {
      details.speed = speed;
      details.speedGbps = speedGbps;
      break;
    }
  }

  // An explicit distance wins over a reach implied by the optic-type suffix.
  const kmMatch = text.match(/\b(160|120|100|80|60|40|30|20|10|3|2|1(?:\.4)?)\s*km\b/i);
  const mMatch = text.match(/\b(550|500|400|300|150|100|80|30)\s*m(?:eters?)?\b/i);
  if (kmMatch) {
    const km = parseFloat(kmMatch[1]);
    details.reachLabel = `${kmMatch[1]}km`;
    details.reachMeters = Math.round(km * 1000);
  } else if (mMatch) {
    const meters = parseInt(mMatch[1], 10);
    details.reachLabel = `${meters}m`;
    details.reachMeters = meters;
  } else if (/\bfr4\b/i.test(text)) {
    details.reachLabel = "2km";
    details.reachMeters = 2000;
  } else if (/\bdr\d*\b/i.test(text)) {
    details.reachLabel = "500m";
    details.reachMeters = 500;
  } else if (/\blr4?\b/i.test(text)) {
    details.reachLabel = "10km";
    details.reachMeters = 10000;
  } else if (/\ber4?\b/i.test(text)) {
    details.reachLabel = "40km";
    details.reachMeters = 40000;
  } else if (/\bzr4?\b/i.test(text)) {
    details.reachLabel = "80km";
    details.reachMeters = 80000;
  } else if (/\bsr\d*|\bsw4\b/i.test(text)) {
    details.reachLabel = "300m";
    details.reachMeters = 300;
  }

  // Copper is checked first, then single-mode, then multi-mode.
  if (/base-?t|rj-?45|cat ?6|cat ?5|copper/i.test(text)) {
    details.fiberType = "Copper";
  } else if (/single.?mode|\bsmf\b|\blr\d*\b|\ber\d*\b|\bzr\d*\b|\bfr\d*\b|\bdr\d*\b|cwdm|dwdm|bidi|psm/i.test(text)) {
    details.fiberType = "SMF";
  } else if (/multi.?mode|\bmmf\b|\bsr\d*\b|\bsw\d*\b/i.test(text)) {
    details.fiberType = "MMF";
  }

  const wavelength = text.match(/\b(\d{3,4})\s*nm\b/i);
  if (wavelength) details.wavelengths = wavelength[1];

  return details;
}
|
||||||
|
|
||||||
|
/**
 * Decides which inferred attributes should be written for a row.
 *
 * Rules:
 *   - form factor: taken when the row has none, or when the row says "SFP+"
 *     and the inferred value starts with "QSFP";
 *   - speed: taken when the row has none (null/0) or the inferred speed is higher;
 *   - reach and fiber type: only fill gaps, never overwrite;
 *   - wavelengths: always passed through when inferred. `Candidate` carries no
 *     wavelength column to compare against; the UPDATE in `applyDetailUpdates`
 *     wraps the value in `COALESCE(NULLIF(wavelengths, ''), ...)`, so a stored
 *     non-empty value is kept.
 *
 * @param existing Current row values.
 * @param inferred Attributes inferred from the product page.
 * @returns The subset of `inferred` that should be applied (possibly empty).
 */
function mergeDetails(existing: Candidate, inferred: InferredDetails): InferredDetails {
  const updates: InferredDetails = {};

  const upgradesSfpPlusToQsfp =
    existing.formFactor === "SFP+" && Boolean(inferred.formFactor?.startsWith("QSFP"));
  if (inferred.formFactor && (!existing.formFactor || upgradesSfpPlusToQsfp)) {
    updates.formFactor = inferred.formFactor;
  }

  if (inferred.speedGbps && (!existing.speedGbps || inferred.speedGbps > existing.speedGbps)) {
    updates.speed = inferred.speed;
    updates.speedGbps = inferred.speedGbps;
  }

  if (inferred.reachLabel && !existing.reachLabel) {
    updates.reachLabel = inferred.reachLabel;
    updates.reachMeters = inferred.reachMeters;
  }

  if (inferred.fiberType && !existing.fiberType) {
    updates.fiberType = inferred.fiberType;
  }

  // Previously dropped here, which left the wavelength inference and the
  // wavelengths branch of the UPDATE builder unreachable.
  if (inferred.wavelengths) {
    updates.wavelengths = inferred.wavelengths;
  }

  return updates;
}
|
||||||
|
|
||||||
|
/**
 * Downloads a page body as text using the shared request headers.
 *
 * @param url Page to fetch.
 * @returns The response body.
 * @throws Error with message `HTTP <status>` for non-2xx responses; the
 *   request is aborted after 30 seconds via `AbortSignal.timeout`.
 */
async function fetchHtml(url: string): Promise<string> {
  const timeoutSignal = AbortSignal.timeout(30000);
  const response = await fetch(url, { headers: HEADERS, signal: timeoutSignal });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status}`);
}
|
||||||
|
|
||||||
|
/**
 * Writes the given attribute updates to one `transceivers` row.
 *
 * Only truthy fields are written. `wavelengths` is written through
 * `COALESCE(NULLIF(wavelengths, ''), $n)` so a stored non-empty value is kept.
 * When at least one column is set, `updated_at` is bumped to `NOW()`; when
 * nothing is set, no query is issued.
 *
 * @param id Row id (bound as `$1`).
 * @param updates Attributes to apply.
 */
async function applyDetailUpdates(id: string, updates: InferredDetails): Promise<void> {
  const params: unknown[] = [id];
  const assignments: string[] = [];

  // Appends a bind value and returns its positional placeholder ($2, $3, ...).
  const bind = (value: unknown): string => {
    params.push(value);
    return `$${params.length}`;
  };

  // Column order determines placeholder numbering; keep it stable.
  const plainColumns: Array<[string, unknown]> = [
    ["form_factor", updates.formFactor],
    ["speed_gbps", updates.speedGbps],
    ["speed", updates.speed],
    ["reach_meters", updates.reachMeters],
    ["reach_label", updates.reachLabel],
    ["fiber_type", updates.fiberType],
  ];
  for (const [column, value] of plainColumns) {
    if (!value) continue;
    assignments.push(`${column} = ${bind(value)}`);
  }

  if (updates.wavelengths) {
    assignments.push(`wavelengths = COALESCE(NULLIF(wavelengths, ''), ${bind(updates.wavelengths)})`);
  }

  if (assignments.length === 0) return;

  assignments.push("updated_at = NOW()");
  await pool.query(`UPDATE transceivers SET ${assignments.join(", ")} WHERE id = $1`, params);
}
|
||||||
|
|
||||||
|
/**
 * Runs one verifier batch.
 *
 * Environment:
 *   - `PRODUCT_ASSET_VENDOR`: comma-separated vendor names; defaults to
 *     "T&S Communication", "10Gtek", "GBICS".
 *   - `PRODUCT_ASSET_LIMIT`: maximum rows per run; defaults to 120. A value
 *     that is not a non-negative integer falls back to 120 instead of being
 *     sent to the database as NaN or a negative LIMIT.
 *   - `PRODUCT_ASSET_APPLY`: writes happen only when this is exactly "1";
 *     otherwise each row's findings are just logged (dry run).
 *
 * Rows are processed one at a time with a 750 ms pause after every row,
 * including rows that failed, so an erroring site is not hit back-to-back.
 * A failure on one row is logged and counted; it does not stop the batch.
 */
async function main(): Promise<void> {
  const vendorFilter = (process.env["PRODUCT_ASSET_VENDOR"] || "").trim();
  const requestedLimit = Number.parseInt(process.env["PRODUCT_ASSET_LIMIT"] || "120", 10);
  const limit = Number.isInteger(requestedLimit) && requestedLimit >= 0 ? requestedLimit : 120;
  const apply = process.env["PRODUCT_ASSET_APPLY"] === "1";

  const vendorNames = vendorFilter
    ? vendorFilter.split(",").map((v) => v.trim()).filter(Boolean)
    : ["T&S Communication", "10Gtek", "GBICS"];

  // Candidates: rows of the selected vendors with an http(s) product URL,
  // outside the excluded categories, that still miss an image/detail
  // verification flag, a speed, or a fiber type.
  const candidates = await pool.query<Candidate>(
    `SELECT t.id,
            v.name AS "vendorName",
            t.part_number AS "partNumber",
            t.product_page_url AS "productUrl",
            t.form_factor AS "formFactor",
            t.speed_gbps AS "speedGbps",
            t.reach_label AS "reachLabel",
            t.fiber_type AS "fiberType",
            t.image_verified AS "imageVerified",
            t.details_verified AS "detailsVerified"
     FROM transceivers t
     JOIN vendors v ON v.id = t.vendor_id
     WHERE v.name = ANY($1)
       AND COALESCE(t.category, '') <> ALL($2)
       AND COALESCE(t.product_page_url, '') ~* '^https?://'
       AND (
         COALESCE(t.image_verified, false) = false
         OR COALESCE(t.details_verified, false) = false
         OR t.speed_gbps IS NULL
         OR t.speed_gbps = 0
         OR COALESCE(t.fiber_type, '') = ''
       )
     ORDER BY v.name, t.part_number
     LIMIT $3`,
    [vendorNames, EXCLUDED_CATEGORIES, limit],
  );

  let images = 0;
  let details = 0;
  let skipped = 0;
  let errors = 0;

  console.log("=== Product page asset verifier ===", { vendorNames, limit, apply, count: candidates.rows.length });

  for (const row of candidates.rows) {
    try {
      const html = await fetchHtml(row.productUrl);
      const title = extractTitle(html);
      // Only part number, URL and title feed inference; the page body is not used.
      const text = `${row.partNumber} ${row.productUrl} ${title}`;
      const imageUrl = extractImageUrl(html, row.productUrl);
      const inferred = inferDetails(text);
      const updates = mergeDetails(row, inferred);

      if (!apply) {
        console.log("dry-run product asset", {
          vendor: row.vendorName,
          partNumber: row.partNumber,
          imageUrl,
          updates,
        });
      } else {
        await applyDetailUpdates(row.id, updates);
        if (imageUrl && !row.imageVerified) {
          const changed = await markImageVerified(row.id, imageUrl);
          if (changed) images++;
        }
        // NOTE(review): this runs for every successfully fetched row, even
        // when `updates` is empty — confirm that a successful fetch alone is
        // meant to count as detail verification.
        const marked = await markDetailsVerified({ transceiverId: row.id, sourceUrl: row.productUrl });
        if (marked) details++;
      }

      if (!imageUrl && Object.keys(updates).length === 0) skipped++;
    } catch (err) {
      errors++;
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(`asset verifier error for ${row.vendorName} ${row.partNumber}: ${reason}`);
    } finally {
      // Pace requests on both the success and the failure path.
      await sleep(750);
    }
  }

  console.log("Product page asset verifier complete", { images, details, skipped, errors, apply });
}
|
||||||
|
|
||||||
|
// Run the batch only when this file is executed directly, not when imported.
if (require.main === module) {
  // Logs the failure, starts closing the pool (not awaited) and exits non-zero.
  const failFast = (err: unknown): never => {
    console.error("Fatal:", err);
    void pool.end();
    process.exit(1);
  };

  main()
    .then(() => pool.end())
    .catch(failFast);
}
|
||||||
@ -37,6 +37,37 @@ Updated: 2026-05-09 22:15 UTC
|
|||||||
- reconcile must preserve explicit competitor research states
|
- reconcile must preserve explicit competitor research states
|
||||||
- GBICS/T&S artifact quarantine rules
|
- GBICS/T&S artifact quarantine rules
|
||||||
|
|
||||||
|
- TIP product-page asset verifier on 2026-05-09:
|
||||||
|
- added `verify:product-page-assets`
|
||||||
|
- deterministic scope:
|
||||||
|
- only existing `product_page_url` rows
|
||||||
|
- vendor-limited batches via `PRODUCT_ASSET_VENDOR`
|
||||||
|
- dry-run by default, apply only with `PRODUCT_ASSET_APPLY=1`
|
||||||
|
- extracts images from source-backed product image tags/meta only
|
||||||
|
- infers details only from part number, product URL, and title to avoid navigation pollution
|
||||||
|
- remote build passed on Erik
|
||||||
|
- live verifier results:
|
||||||
|
- GBICS extra quarantine: `17` additional category/family artifacts
|
||||||
|
- T&S Communication asset apply: `36` images, `36` details closed after a second DR8 reach pass
|
||||||
|
- 10Gtek/SFPcables asset apply: `5` images, `10` details improved on rows with existing product URLs
|
||||||
|
- current vendor gaps:
|
||||||
|
- GBICS: `71` active rows, `0` missing price, `0` missing image, `0` missing details
|
||||||
|
- T&S Communication: `36` active rows, `36` missing price, `0` missing image, `0` missing details
|
||||||
|
- 10Gtek: `175` active rows, `126` missing price, `126` missing image, `20` missing details
|
||||||
|
- interpretation:
|
||||||
|
- T&S is now product-data complete but public-price blocked; pages expose no real public price (`price: 0.00` / quote-only behavior)
|
||||||
|
- 10Gtek remaining gaps are mostly rows without reliable product URLs/price sources and need alias/source discovery rather than blind image guessing
|
||||||
|
- live health after this pass:
|
||||||
|
- active products: `17195`
|
||||||
|
- price verified: `11414`
|
||||||
|
- image verified: `12057`
|
||||||
|
- details verified: `16713`
|
||||||
|
- fully verified: `10459`
|
||||||
|
- competitor status: `matched=10775`, `no_valid_match=74`, `ambiguous=556`, `needs_research=5790`
|
||||||
|
- TIPLLM training pool updated with:
|
||||||
|
- product-page asset verifier dry-run/apply pattern
|
||||||
|
- T&S quote-only public-price rule
|
||||||
|
|
||||||
- MAGATAMA multi-LLM training lane expansion on 2026-05-09:
|
- MAGATAMA multi-LLM training lane expansion on 2026-05-09:
|
||||||
- added first-class training lanes for:
|
- added first-class training lanes for:
|
||||||
- `pulso_llm`
|
- `pulso_llm`
|
||||||
|
|||||||
@ -136,3 +136,49 @@ Verification after a fresh reconcile test:
|
|||||||
- `ambiguous=556`
|
- `ambiguous=556`
|
||||||
- `needs_research=5807`
|
- `needs_research=5807`
|
||||||
- fully product-verified rows still in competitor `needs_research`: `0`
|
- fully product-verified rows still in competitor `needs_research`: `0`
|
||||||
|
|
||||||
|
## Follow-Up: Product Page Asset Verifier
|
||||||
|
|
||||||
|
Added `packages/scraper/src/utils/verify-product-page-assets.ts` and script `verify:product-page-assets`.
|
||||||
|
|
||||||
|
Safety design:
|
||||||
|
|
||||||
|
- vendor-limited with `PRODUCT_ASSET_VENDOR`
|
||||||
|
- batch-limited with `PRODUCT_ASSET_LIMIT`
|
||||||
|
- dry-run by default
|
||||||
|
- apply only with `PRODUCT_ASSET_APPLY=1`
|
||||||
|
- uses existing `product_page_url` rows only
|
||||||
|
- writes through existing `markImageVerified` / `markDetailsVerified` helpers so the evidence ledger remains canonical
|
||||||
|
- infers details only from part number, product URL, and title; full body text was intentionally excluded after a dry-run showed T&S navigation links polluting speed inference
|
||||||
|
|
||||||
|
Live runs:
|
||||||
|
|
||||||
|
- extra GBICS quarantine:
|
||||||
|
- `17` category/family artifacts removed from the active base
|
||||||
|
- GBICS is now product-data complete in the active base: `71` active rows, `0/0/0` price/image/details gaps
|
||||||
|
- T&S Communication:
|
||||||
|
- product-page asset verifier applied
|
||||||
|
- `36` images verified
|
||||||
|
- `36` details verified after adding DR8/SR8 reach inference
|
||||||
|
- current active gaps: `36` active rows, `36` missing price, `0` missing image, `0` missing details
|
||||||
|
- interpretation: T&S is public-price blocked; pages expose no deterministic price, only quote/no-price or `price: 0.00`
|
||||||
|
- 10Gtek / SFPcables:
|
||||||
|
- dry-run caught `1.25G` / `2.5G` vs `25G` regex risk; regex was hardened before apply
|
||||||
|
- `5` images verified
|
||||||
|
- `10` details verified/improved
|
||||||
|
- current active gaps: `175` active rows, `126` missing price, `126` missing image, `20` missing details
|
||||||
|
- interpretation: remaining 10Gtek rows need URL/source alias discovery, not blind verification
|
||||||
|
|
||||||
|
Live health after this pass:
|
||||||
|
|
||||||
|
- health: healthy
|
||||||
|
- active products: `17195`
|
||||||
|
- price verified: `11414`
|
||||||
|
- image verified: `12057`
|
||||||
|
- details verified: `16713`
|
||||||
|
- fully verified: `10459`
|
||||||
|
- competitor status:
|
||||||
|
- `matched=10775`
|
||||||
|
- `no_valid_match=74`
|
||||||
|
- `ambiguous=556`
|
||||||
|
- `needs_research=5790`
|
||||||
|
|||||||
@ -47,3 +47,5 @@
|
|||||||
{"id":"tip-20260509-quote-only-oem-pricing-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Never invent prices for OEM or quote-only sources."},{"role":"user","content":"Large OEM vendors such as Cisco and Juniper have many rows without public prices. Should TIP mark those prices verified?"},{"role":"assistant","content":"No. OEM catalog rows can be detail-verified when public documentation provides technical specs, but price_verified requires a real public price observation or a deliberately modeled non-public-price state. Do not scrape estimates, reseller guesses, or competitor prices into OEM price_verified. If the source is quote-only or no public price exists, record that as a separate state and keep price_verified false until the data model supports it."}]}
|
{"id":"tip-20260509-quote-only-oem-pricing-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Never invent prices for OEM or quote-only sources."},{"role":"user","content":"Large OEM vendors such as Cisco and Juniper have many rows without public prices. Should TIP mark those prices verified?"},{"role":"assistant","content":"No. OEM catalog rows can be detail-verified when public documentation provides technical specs, but price_verified requires a real public price observation or a deliberately modeled non-public-price state. Do not scrape estimates, reseller guesses, or competitor prices into OEM price_verified. If the source is quote-only or no public price exists, record that as a separate state and keep price_verified false until the data model supports it."}]}
|
||||||
{"id":"tip-20260509-reconcile-preserve-explicit-competitor-states-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Preserve evidence-backed verification states during maintenance jobs."},{"role":"user","content":"A maintenance reconcile resets competitor_verified rows with no recent non-Flexoptix price. What states must it preserve?"},{"role":"assistant","content":"Preserve explicit research outcomes. `competitor_status=no_valid_match` and `competitor_status=ambiguous` are deliberate evidence-backed states and must not be reset to `needs_research` by a generic price freshness reconcile. Reconcile may reset stale `matched` rows when their required evidence disappears, but it must keep no-valid-match and ambiguous rows intact, then verify that fully product-verified rows left in `needs_research` stay at zero."}]}
|
{"id":"tip-20260509-reconcile-preserve-explicit-competitor-states-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Preserve evidence-backed verification states during maintenance jobs."},{"role":"user","content":"A maintenance reconcile resets competitor_verified rows with no recent non-Flexoptix price. What states must it preserve?"},{"role":"assistant","content":"Preserve explicit research outcomes. `competitor_status=no_valid_match` and `competitor_status=ambiguous` are deliberate evidence-backed states and must not be reset to `needs_research` by a generic price freshness reconcile. Reconcile may reset stale `matched` rows when their required evidence disappears, but it must keep no-valid-match and ambiguous rows intact, then verify that fully product-verified rows left in `needs_research` stay at zero."}]}
|
||||||
{"id":"tip-20260509-gbics-ts-artifact-quarantine-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Keep active transceiver counts honest by quarantining vendor navigation, category, PDF, cable, and infrastructure artifacts."},{"role":"user","content":"What did the GBICS and T&S cleanup teach the crawler robots?"},{"role":"assistant","content":"GBICS rows such as category labels, patch leads, active optical cables, direct-attach cables, Choose By pages, cart pages, vendor pages, NVIDIA/ConnectX/Cisco/WatchGuard/HP pages, network adapters, media converters, AOC/DAC and MTP/MPO cable pages are not active transceiver SKUs. T&S rows with non-/products URLs, PDFs, generic fiber-optic-transceivers family URLs, Product-Brochure names, or broad 100G/200G/400G-Transceiver family labels are artifacts. Quarantine these outside the active verification base, clear verification flags, and write `artifact_quarantine` evidence before recalculating health."}]}
|
{"id":"tip-20260509-gbics-ts-artifact-quarantine-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Keep active transceiver counts honest by quarantining vendor navigation, category, PDF, cable, and infrastructure artifacts."},{"role":"user","content":"What did the GBICS and T&S cleanup teach the crawler robots?"},{"role":"assistant","content":"GBICS rows such as category labels, patch leads, active optical cables, direct-attach cables, Choose By pages, cart pages, vendor pages, NVIDIA/ConnectX/Cisco/WatchGuard/HP pages, network adapters, media converters, AOC/DAC and MTP/MPO cable pages are not active transceiver SKUs. T&S rows with non-/products URLs, PDFs, generic fiber-optic-transceivers family URLs, Product-Brochure names, or broad 100G/200G/400G-Transceiver family labels are artifacts. Quarantine these outside the active verification base, clear verification flags, and write `artifact_quarantine` evidence before recalculating health."}]}
|
||||||
|
{"id":"tip-20260509-product-page-asset-verifier-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Design deterministic product-page verifiers that improve evidence without broad crawling or invented values."},{"role":"user","content":"How should TIP safely backfill images and details from existing vendor product URLs?"},{"role":"assistant","content":"Use a targeted product-page asset verifier. Select active rows with an existing product_page_url and missing image/details; fetch one page per row with low concurrency; extract only likely product images from og:image, twitter:image, Magento product image IDs, or filtered product image URLs; infer details only from part number, product URL, and page title, not from full body navigation. Always dry-run first to catch navigation pollution such as T&S 800G menu links or 1.25G being misread as 25G. Apply through `markImageVerified` and `markDetailsVerified` so evidence is recorded."}]}
|
||||||
|
{"id":"tip-20260509-quote-only-tscom-pricing-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Public-price verification requires a real public price."},{"role":"user","content":"T&S Communication product pages have valid images and details but expose no real public prices, only no price or price 0.00. What should TIP do?"},{"role":"assistant","content":"Do not mark price_verified. Treat T&S as image/detail-verifiable but public-price-missing until a real price observation exists. `price: 0.00`, quote/request flows, missing amount fields, or SEO filenames containing `price` are not price evidence. Keep image_verified/details_verified true when source-backed, keep price_verified false, and document the vendor as quote-only/public-price-unavailable rather than fabricating a market price."}]}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user