feat: add vendor detail verifiers
This commit is contained in:
parent
7f4e7f03ad
commit
ec40a96ae0
@ -10,6 +10,8 @@
|
||||
"scrape:fs": "tsx src/scrapers/fs-com.ts",
|
||||
"scrape:fs:db-detail": "FS_DB_DETAIL_ONLY=1 TIP_FORCE_REVALIDATE=1 tsx src/scrapers/fs-com.ts",
|
||||
"scrape:fs:url-discovery": "FS_URL_DISCOVERY_ONLY=1 TIP_FORCE_REVALIDATE=1 tsx src/scrapers/fs-com.ts",
|
||||
"scrape:atgbics:details": "tsx src/scrapers/atgbics-detail-pages.ts",
|
||||
"scrape:vendors:details": "tsx src/scrapers/shopfiber24-fibermall-detail-pages.ts",
|
||||
"scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
|
||||
"scrape:optcore": "tsx src/scrapers/optcore.ts",
|
||||
"scrape:news": "tsx src/scrapers/news.ts",
|
||||
|
||||
287
packages/scraper/src/scrapers/atgbics-detail-pages.ts
Normal file
287
packages/scraper/src/scrapers/atgbics-detail-pages.ts
Normal file
@ -0,0 +1,287 @@
|
||||
/**
|
||||
* ATGBICS Detail Page Verifier
|
||||
*
|
||||
* Lightweight Shopify product.js pass for ATGBICS rows that already have
|
||||
* price + image + product URL but still lack detail verification.
|
||||
*
|
||||
* No browser, no Playwright. Fetches one JSON endpoint per product page.
|
||||
*/
|
||||
import { pool } from "../utils/db";
|
||||
|
||||
interface TargetRow {
|
||||
id: string;
|
||||
part_number: string;
|
||||
product_page_url: string;
|
||||
}
|
||||
|
||||
interface ShopifyVariant {
|
||||
sku?: string;
|
||||
price?: number;
|
||||
}
|
||||
|
||||
interface ShopifyProduct {
|
||||
title?: string;
|
||||
description?: string;
|
||||
tags?: string[];
|
||||
type?: string;
|
||||
vendor?: string;
|
||||
variants?: ShopifyVariant[];
|
||||
featured_image?: string;
|
||||
}
|
||||
|
||||
interface ExtractedDetails {
|
||||
formFactor?: string;
|
||||
speed?: string;
|
||||
speedGbps?: number;
|
||||
reachLabel?: string;
|
||||
reachMeters?: number;
|
||||
fiberType?: string;
|
||||
wavelengths?: string;
|
||||
connector?: string;
|
||||
category?: string;
|
||||
standardName?: string;
|
||||
}
|
||||
|
||||
const HEADERS = {
|
||||
"User-Agent": "TIP-ATGBICS-DetailVerifier/1.0 (+https://transceiver-db.context-x.org)",
|
||||
Accept: "application/json,text/plain,*/*",
|
||||
};
|
||||
|
||||
function sleep(ms: number): Promise<void> {
|
||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
function stripHtml(input: string): string {
|
||||
return input.replace(/<[^>]+>/g, " ").replace(/ |&|®|®/gi, " ").replace(/\s+/g, " ").trim();
|
||||
}
|
||||
|
||||
function productJsonUrl(productUrl: string): string {
|
||||
const clean = productUrl.split("?")[0].replace(/\/$/, "");
|
||||
return clean.endsWith(".js") ? clean : `${clean}.js`;
|
||||
}
|
||||
|
||||
function firstTagValue(tags: string[], prefix: string): string | undefined {
|
||||
const tag = tags.find((item) => item.toLowerCase().startsWith(prefix.toLowerCase()));
|
||||
return tag ? tag.slice(prefix.length).trim() : undefined;
|
||||
}
|
||||
|
||||
function parseDistance(value: string): { label: string; meters: number } | undefined {
|
||||
const match = value.match(/(\d+(?:[.,]\d+)?)\s*(km|m)\b/i);
|
||||
if (!match) return undefined;
|
||||
const amount = parseFloat(match[1].replace(",", "."));
|
||||
if (!Number.isFinite(amount) || amount <= 0) return undefined;
|
||||
const unit = match[2].toLowerCase();
|
||||
const label = `${String(amount).replace(/\.0$/, "")}${unit}`;
|
||||
const meters = unit === "km" ? Math.round(amount * 1000) : Math.round(amount);
|
||||
return { label, meters };
|
||||
}
|
||||
|
||||
function parseFormFactor(text: string, tags: string[]): string | undefined {
|
||||
const productCategory = firstTagValue(tags, "Product Category_")?.toUpperCase();
|
||||
if (productCategory === "QSFPP") return "QSFP+";
|
||||
if (productCategory === "SFPP") return "SFP+";
|
||||
if (productCategory === "QSFPDD") return "QSFP-DD";
|
||||
if (productCategory) return productCategory.replace("QSFPDD", "QSFP-DD");
|
||||
|
||||
if (/qsfp-?dd/i.test(text)) return "QSFP-DD";
|
||||
if (/\bosfp\b/i.test(text)) return "OSFP";
|
||||
if (/qsfp28/i.test(text)) return "QSFP28";
|
||||
if (/qsfp56/i.test(text)) return "QSFP56";
|
||||
if (/qsfp\+|qsfpp/i.test(text)) return "QSFP+";
|
||||
if (/sfp28/i.test(text)) return "SFP28";
|
||||
if (/sfp\+|sfpp/i.test(text)) return "SFP+";
|
||||
if (/\bsfp\b/i.test(text)) return "SFP";
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function parseSpeed(text: string, tags: string[]): { speed?: string; speedGbps?: number } {
|
||||
const rate = firstTagValue(tags, "Max Data Rate_") || text;
|
||||
const match = rate.match(/(\d+(?:\.\d+)?)\s*(?:g|gbps)\b/i);
|
||||
if (!match) return {};
|
||||
const speedGbps = Math.round(parseFloat(match[1]));
|
||||
if (!Number.isFinite(speedGbps) || speedGbps <= 0) return {};
|
||||
return { speed: `${speedGbps}G`, speedGbps };
|
||||
}
|
||||
|
||||
function parseFiber(text: string, tags: string[]): string | undefined {
|
||||
const cableType = firstTagValue(tags, "Cable Type_");
|
||||
if (cableType) {
|
||||
if (/mmf|multi/i.test(cableType)) return "MMF";
|
||||
if (/smf|single/i.test(cableType)) return "SMF";
|
||||
if (/copper|dac|twinax/i.test(cableType)) return "Copper";
|
||||
}
|
||||
|
||||
if (/loopback/i.test(text)) return "N/A";
|
||||
if (/copper|dac|twinax|base-t|rj45/i.test(text)) return "Copper";
|
||||
if (/mmf|multi[- ]?mode/i.test(text)) return "MMF";
|
||||
if (/smf|single[- ]?mode/i.test(text)) return "SMF";
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function parseWavelength(text: string, tags: string[], fiberType?: string): string | undefined {
|
||||
if (fiberType === "Copper" || fiberType === "N/A") return "N/A";
|
||||
const tag = firstTagValue(tags, "Wavelength_");
|
||||
const fromTag = tag?.match(/(\d{3,4})\s*nm/i);
|
||||
if (fromTag) return fromTag[1];
|
||||
const fromText = text.match(/(\d{3,4})\s*nm/i);
|
||||
if (fromText) return fromText[1];
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function parseConnector(tags: string[]): string | undefined {
|
||||
return firstTagValue(tags, "Interface_")?.replace(/\//g, "/");
|
||||
}
|
||||
|
||||
function extractDetails(product: ShopifyProduct): ExtractedDetails | null {
|
||||
const tags = product.tags ?? [];
|
||||
const title = stripHtml(product.title ?? "");
|
||||
const description = stripHtml(product.description ?? "");
|
||||
const text = `${title} ${description} ${tags.join(" ")}`;
|
||||
const isLoopback = /loopback/i.test(text);
|
||||
const distanceTag = firstTagValue(tags, "Max Distance_");
|
||||
const distance = (distanceTag ? parseDistance(distanceTag) : undefined) ?? parseDistance(text);
|
||||
const fiberType = parseFiber(text, tags);
|
||||
const wavelengths = parseWavelength(text, tags, fiberType);
|
||||
const speed = parseSpeed(text, tags);
|
||||
|
||||
if (isLoopback) {
|
||||
return {
|
||||
formFactor: parseFormFactor(text, tags),
|
||||
...speed,
|
||||
reachLabel: "N/A",
|
||||
reachMeters: 0,
|
||||
fiberType: "N/A",
|
||||
wavelengths: "N/A",
|
||||
connector: parseConnector(tags),
|
||||
category: "Loopback / Test Module",
|
||||
standardName: title || undefined,
|
||||
};
|
||||
}
|
||||
|
||||
if (!distance || !fiberType) return null;
|
||||
|
||||
return {
|
||||
formFactor: parseFormFactor(text, tags),
|
||||
...speed,
|
||||
reachLabel: distance.label,
|
||||
reachMeters: distance.meters,
|
||||
fiberType,
|
||||
wavelengths,
|
||||
connector: parseConnector(tags),
|
||||
category: fiberType === "Copper" ? "Copper" : "Compatible",
|
||||
standardName: title || undefined,
|
||||
};
|
||||
}
|
||||
|
||||
async function fetchProduct(url: string): Promise<ShopifyProduct | null> {
|
||||
const resp = await fetch(productJsonUrl(url), { headers: HEADERS, signal: AbortSignal.timeout(20000) });
|
||||
if (!resp.ok) return null;
|
||||
return (await resp.json()) as ShopifyProduct;
|
||||
}
|
||||
|
||||
async function main(): Promise<void> {
|
||||
const limit = Number.parseInt(process.env.ATGBICS_DETAIL_LIMIT || "150", 10);
|
||||
const result = await pool.query<TargetRow>(`
|
||||
SELECT t.id, t.part_number, t.product_page_url
|
||||
FROM transceivers t
|
||||
JOIN vendors v ON v.id = t.vendor_id
|
||||
WHERE v.name = 'ATGBICS'
|
||||
AND t.price_verified = true
|
||||
AND t.image_verified = true
|
||||
AND COALESCE(t.product_page_url, '') != ''
|
||||
AND COALESCE(t.details_verified, false) = false
|
||||
ORDER BY t.updated_at ASC, t.part_number
|
||||
LIMIT $1
|
||||
`, [limit]);
|
||||
|
||||
let fetched = 0;
|
||||
let updated = 0;
|
||||
let skipped = 0;
|
||||
|
||||
for (const row of result.rows) {
|
||||
const product = await fetchProduct(row.product_page_url);
|
||||
fetched++;
|
||||
if (!product) {
|
||||
skipped++;
|
||||
await sleep(250);
|
||||
continue;
|
||||
}
|
||||
|
||||
const details = extractDetails(product);
|
||||
if (!details) {
|
||||
skipped++;
|
||||
await sleep(250);
|
||||
continue;
|
||||
}
|
||||
|
||||
const update = await pool.query(`
|
||||
UPDATE transceivers
|
||||
SET form_factor = COALESCE(NULLIF($2::text, ''), form_factor),
|
||||
speed = COALESCE(NULLIF($3::text, ''), speed),
|
||||
speed_gbps = COALESCE($4::numeric, speed_gbps),
|
||||
reach_label = $5,
|
||||
reach_meters = $6,
|
||||
fiber_type = $7,
|
||||
wavelengths = COALESCE(NULLIF($8::text, ''), wavelengths),
|
||||
connector = COALESCE(NULLIF($9::text, ''), connector),
|
||||
category = COALESCE(NULLIF($10::text, ''), category),
|
||||
standard_name = COALESCE(NULLIF(standard_name, ''), NULLIF($11::text, '')),
|
||||
details_verified = true,
|
||||
details_verified_at = COALESCE(details_verified_at, NOW()),
|
||||
details_source_url = COALESCE(NULLIF(details_source_url, ''), product_page_url),
|
||||
data_confidence = CASE
|
||||
WHEN COALESCE(data_confidence, 'unknown') IN ('unknown', 'enriched_estimated')
|
||||
THEN 'scraped_unverified'
|
||||
ELSE data_confidence
|
||||
END,
|
||||
notes = CONCAT_WS(' | ', NULLIF(notes, ''), 'ATGBICS product.js detail verifier 2026-05-09'),
|
||||
updated_at = NOW()
|
||||
WHERE id = $1
|
||||
AND COALESCE(details_verified, false) = false
|
||||
RETURNING id
|
||||
`, [
|
||||
row.id,
|
||||
details.formFactor || null,
|
||||
details.speed || null,
|
||||
details.speedGbps || null,
|
||||
details.reachLabel,
|
||||
details.reachMeters,
|
||||
details.fiberType,
|
||||
details.wavelengths || null,
|
||||
details.connector || null,
|
||||
details.category || null,
|
||||
details.standardName || null,
|
||||
]);
|
||||
|
||||
if ((update.rowCount ?? 0) > 0) updated++;
|
||||
else skipped++;
|
||||
|
||||
if (fetched % 25 === 0) {
|
||||
console.log(`[ATGBICS details] fetched=${fetched} updated=${updated} skipped=${skipped}`);
|
||||
}
|
||||
await sleep(250);
|
||||
}
|
||||
|
||||
const promoted = await pool.query(`
|
||||
UPDATE transceivers
|
||||
SET fully_verified = true,
|
||||
fully_verified_at = COALESCE(fully_verified_at, NOW())
|
||||
WHERE price_verified = true
|
||||
AND image_verified = true
|
||||
AND details_verified = true
|
||||
AND competitor_verified = true
|
||||
AND COALESCE(fully_verified, false) = false
|
||||
RETURNING id
|
||||
`);
|
||||
|
||||
console.log(`[ATGBICS details] done fetched=${fetched} updated=${updated} skipped=${skipped} promoted=${promoted.rowCount ?? 0}`);
|
||||
}
|
||||
|
||||
if (require.main === module) {
|
||||
main()
|
||||
.then(() => pool.end())
|
||||
.catch((err) => {
|
||||
console.error("Fatal:", err);
|
||||
pool.end().finally(() => process.exit(1));
|
||||
});
|
||||
}
|
||||
@ -0,0 +1,401 @@
|
||||
/**
|
||||
* ShopFiber24 + FiberMall Detail Page Verifier
|
||||
*
|
||||
* Lightweight targeted pass for rows that already have price, image,
|
||||
* competitor evidence and product URLs, but still lack detail verification.
|
||||
*
|
||||
* No browser, no Playwright. Fetches static product pages and only promotes
|
||||
* rows when the source page gives deterministic product details or clearly
|
||||
* identifies the row as a product family/accessory/converter.
|
||||
*/
|
||||
import * as cheerio from "cheerio";
|
||||
import { pool } from "../utils/db";
|
||||
|
||||
interface TargetRow {
|
||||
id: string;
|
||||
vendor_name: string;
|
||||
part_number: string;
|
||||
form_factor: string | null;
|
||||
speed: string | null;
|
||||
speed_gbps: string | null;
|
||||
product_page_url: string;
|
||||
}
|
||||
|
||||
interface ExtractedDetails {
|
||||
formFactor?: string;
|
||||
speed?: string;
|
||||
speedGbps?: number;
|
||||
reachLabel: string;
|
||||
reachMeters: number;
|
||||
fiberType: string;
|
||||
wavelengths?: string;
|
||||
connector?: string;
|
||||
category: string;
|
||||
standardName: string;
|
||||
sourcePartNumber?: string;
|
||||
note: string;
|
||||
}
|
||||
|
||||
const HEADERS = {
|
||||
"User-Agent": "TIP-DetailVerifier/1.0 (+https://transceiver-db.context-x.org)",
|
||||
Accept: "text/html,application/xhtml+xml,application/json;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.9,de;q=0.8",
|
||||
};
|
||||
|
||||
function sleep(ms: number): Promise<void> {
|
||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
function decodeHtml(input: string): string {
|
||||
return input
|
||||
.replace(/&/g, "&")
|
||||
.replace(/ /g, " ")
|
||||
.replace(/"/g, "\"")
|
||||
.replace(/'|'/g, "'")
|
||||
.replace(///g, "/")
|
||||
.replace(/&#(\d+);/g, (_m, code) => String.fromCharCode(Number(code)))
|
||||
.replace(/\s+/g, " ")
|
||||
.trim();
|
||||
}
|
||||
|
||||
function stripHtml(input: string): string {
|
||||
return decodeHtml(input.replace(/<[^>]+>/g, " "));
|
||||
}
|
||||
|
||||
function meta($: cheerio.CheerioAPI, selector: string): string {
|
||||
return decodeHtml($(selector).first().attr("content") || "");
|
||||
}
|
||||
|
||||
function parseJsonLdProducts(html: string): any[] {
|
||||
const products: any[] = [];
|
||||
for (const match of html.matchAll(/<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi)) {
|
||||
try {
|
||||
const parsed = JSON.parse(match[1].trim());
|
||||
const items = Array.isArray(parsed) ? parsed : [parsed];
|
||||
for (const item of items) {
|
||||
if (String(item?.["@type"] || "").toLowerCase() === "product") products.push(item);
|
||||
}
|
||||
} catch {
|
||||
// Ignore malformed analytics JSON-LD blocks.
|
||||
}
|
||||
}
|
||||
return products;
|
||||
}
|
||||
|
||||
function parseFormFactor(text: string): string | undefined {
|
||||
if (/qsfp-?dd800|qsfpdd800/i.test(text)) return "QSFP-DD";
|
||||
if (/qsfp-?dd|qsfpdd/i.test(text)) return "QSFP-DD";
|
||||
if (/\bosfp\b/i.test(text)) return "OSFP";
|
||||
if (/qsfp28/i.test(text)) return "QSFP28";
|
||||
if (/qsfp56/i.test(text)) return "QSFP56";
|
||||
if (/qsfp\+|qsfpp/i.test(text)) return "QSFP+";
|
||||
if (/sfp28/i.test(text)) return "SFP28";
|
||||
if (/sfp56/i.test(text)) return "SFP56";
|
||||
if (/sfp\+|sfpp|xfp/i.test(text)) return "SFP+";
|
||||
if (/\bsfp\b/i.test(text)) return "SFP";
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function parseSpeed(text: string): { speed?: string; speedGbps?: number } {
|
||||
const fc = text.match(/\b(1|2|4|8|16|32|64|128)GFC\b/i);
|
||||
if (fc) {
|
||||
const speedGbps = Number(fc[1]);
|
||||
return { speed: `${speedGbps}G`, speedGbps };
|
||||
}
|
||||
|
||||
const match = text.match(/\b(\d+(?:\.\d+)?)\s*(?:g|gbps|gigabit)\b/i);
|
||||
if (!match) return {};
|
||||
const speedGbps = Math.round(parseFloat(match[1]));
|
||||
if (!Number.isFinite(speedGbps) || speedGbps <= 0) return {};
|
||||
return { speed: `${speedGbps}G`, speedGbps };
|
||||
}
|
||||
|
||||
function parseDistance(text: string): { label: string; meters: number; variable: boolean } | undefined {
|
||||
const variable = text.match(/\b(\d+(?:[.,]\d+)?)\s*(?:-|–|to|bis)\s*(\d+(?:[.,]\d+)?)\s*(km|m)\b/i);
|
||||
if (variable) return { label: "Variant", meters: 0, variable: true };
|
||||
|
||||
const match = text.match(/\b(\d+(?:[.,]\d+)?)\s*(km|m)\b/i);
|
||||
if (!match) return undefined;
|
||||
const amount = parseFloat(match[1].replace(",", "."));
|
||||
if (!Number.isFinite(amount) || amount <= 0) return undefined;
|
||||
const unit = match[2].toLowerCase();
|
||||
return {
|
||||
label: `${String(amount).replace(/\.0$/, "")}${unit}`,
|
||||
meters: unit === "km" ? Math.round(amount * 1000) : Math.round(amount),
|
||||
variable: false,
|
||||
};
|
||||
}
|
||||
|
||||
function parseFiber(text: string): string | undefined {
|
||||
if (/aoc|active optical/i.test(text)) return "MMF";
|
||||
if (/copper|kupfer|dac|direct attach|twinax|rj45|base-t|\bcu\b/i.test(text)) return "Copper";
|
||||
if (/mmf|multi[- ]?mode|multimode|sr4?|csr4?|esr4?/i.test(text)) return "MMF";
|
||||
if (/smf|single[- ]?mode|singlemode|lr4?|fr4?|er4?|zr4?|bidi|cwdm|dwdm|psm/i.test(text)) return "SMF";
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function parseWavelength(text: string, fiberType?: string): string | undefined {
|
||||
if (fiberType === "Copper") return "N/A";
|
||||
const all = [...text.matchAll(/\b(\d{3,4}(?:\.\d+)?)\s*nm\b/gi)].map((m) => m[1]);
|
||||
if (all.length > 0) return [...new Set(all)].join("/");
|
||||
if (/cwdm4/i.test(text)) return "CWDM4";
|
||||
if (/dwdm/i.test(text)) return "DWDM";
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function parseConnector(text: string): string | undefined {
|
||||
if (/mpo-?16|mtp\/mpo-?16/i.test(text)) return "MTP/MPO-16";
|
||||
if (/mpo|mtp/i.test(text)) return "MTP/MPO";
|
||||
if (/duplex lc|lc\/upc|lc\b/i.test(text)) return "LC";
|
||||
if (/rj45/i.test(text)) return "RJ45";
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function sourcePartFromText(text: string): string | undefined {
|
||||
const skip = /^(QSFP|QSFP28|QSFP56|QSFP-DD|OSFP|SFP|SFP28|SFP56|XFP|CWDM|DWDM|PAM4|BASE|DOM|FEC|LC|SMF|MMF)$/i;
|
||||
for (const match of text.matchAll(/\b[A-Z0-9]{2,}(?:[-_][A-Z0-9]+){1,}[A-Z0-9]\b/g)) {
|
||||
const value = match[0].replace(/_/g, "-");
|
||||
if (!skip.test(value) && !/^\d+G/.test(value)) return value;
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function isFamilyOrAccessory(text: string): { category: string; fiberType: string; note: string } | undefined {
|
||||
if (/media converter|medienkonverter|ethernet switch|\bpoe\b|industrial switch|\bfosw-|foco-|focs-|fomd-/i.test(text)) {
|
||||
return { category: "Switch / Media Converter", fiberType: /sfp/i.test(text) ? "N/A" : "Copper", note: "classified non-transceiver infrastructure product" };
|
||||
}
|
||||
if (/mux|demux|optic-?mux|cwdm-df/i.test(text)) {
|
||||
return { category: "Mux / Passive Optical", fiberType: "SMF", note: "classified passive optical family" };
|
||||
}
|
||||
if (/converter|adapter|\bcvr-/i.test(text)) {
|
||||
return { category: "Adapter / Converter", fiberType: "N/A", note: "classified adapter/converter product" };
|
||||
}
|
||||
if (/\b(aoc|dac|direct attach|active optical cable|kabel)\b/i.test(text) && !parseDistance(text)) {
|
||||
return { category: /dac|direct attach|kupfer/i.test(text) ? "DAC Cable Family" : "AOC Cable Family", fiberType: /dac|direct attach|kupfer/i.test(text) ? "Copper" : "MMF", note: "classified variable cable family without fixed reach" };
|
||||
}
|
||||
if (/transceiver[- ]?(kupfer|multimode|singlemode)|singlemode: transceiver|multimode .*module/i.test(text)) {
|
||||
return { category: "Product Family", fiberType: parseFiber(text) || "N/A", note: "classified generic transceiver family page" };
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function detailsFromText(text: string, fallback: TargetRow): ExtractedDetails | null {
|
||||
const normalized = decodeHtml(text);
|
||||
const speed = parseSpeed(normalized);
|
||||
const family = isFamilyOrAccessory(normalized);
|
||||
if (family) {
|
||||
return {
|
||||
formFactor: parseFormFactor(normalized) || fallback.form_factor || undefined,
|
||||
speed: speed.speed || fallback.speed || undefined,
|
||||
speedGbps: speed.speedGbps || (fallback.speed_gbps ? Number(fallback.speed_gbps) : undefined),
|
||||
reachLabel: "Variant",
|
||||
reachMeters: 0,
|
||||
fiberType: family.fiberType,
|
||||
wavelengths: family.fiberType === "Copper" || family.fiberType === "N/A" ? "N/A" : parseWavelength(normalized, family.fiberType),
|
||||
connector: parseConnector(normalized),
|
||||
category: family.category,
|
||||
standardName: normalized.slice(0, 240),
|
||||
sourcePartNumber: sourcePartFromText(normalized),
|
||||
note: family.note,
|
||||
};
|
||||
}
|
||||
|
||||
const distance = parseDistance(normalized);
|
||||
const fiberType = parseFiber(normalized);
|
||||
if (!distance && /dwdm dco|coherent|100g zr/i.test(normalized)) {
|
||||
return {
|
||||
formFactor: parseFormFactor(normalized) || fallback.form_factor || undefined,
|
||||
speed: speed.speed || fallback.speed || undefined,
|
||||
speedGbps: speed.speedGbps || (fallback.speed_gbps ? Number(fallback.speed_gbps) : undefined),
|
||||
reachLabel: "Line-system",
|
||||
reachMeters: 0,
|
||||
fiberType: "SMF",
|
||||
wavelengths: parseWavelength(normalized, "SMF") || "DWDM",
|
||||
connector: parseConnector(normalized),
|
||||
category: "Coherent DWDM",
|
||||
standardName: normalized.slice(0, 240),
|
||||
sourcePartNumber: sourcePartFromText(normalized),
|
||||
note: "classified coherent DWDM DCO with line-system-dependent reach",
|
||||
};
|
||||
}
|
||||
if (!distance && /base-?t|10g kupfer|rj45/i.test(normalized)) {
|
||||
return {
|
||||
formFactor: parseFormFactor(normalized) || fallback.form_factor || undefined,
|
||||
speed: speed.speed || fallback.speed || undefined,
|
||||
speedGbps: speed.speedGbps || (fallback.speed_gbps ? Number(fallback.speed_gbps) : undefined),
|
||||
reachLabel: "30m",
|
||||
reachMeters: 30,
|
||||
fiberType: "Copper",
|
||||
wavelengths: "N/A",
|
||||
connector: "RJ45",
|
||||
category: "Copper",
|
||||
standardName: normalized.slice(0, 240),
|
||||
sourcePartNumber: sourcePartFromText(normalized),
|
||||
note: "classified 10GBASE-T copper SFP+ standard reach",
|
||||
};
|
||||
}
|
||||
if (!distance || !fiberType) return null;
|
||||
|
||||
return {
|
||||
formFactor: parseFormFactor(normalized) || fallback.form_factor || undefined,
|
||||
speed: speed.speed || fallback.speed || undefined,
|
||||
speedGbps: speed.speedGbps || (fallback.speed_gbps ? Number(fallback.speed_gbps) : undefined),
|
||||
reachLabel: distance.label,
|
||||
reachMeters: distance.meters,
|
||||
fiberType,
|
||||
wavelengths: parseWavelength(normalized, fiberType),
|
||||
connector: parseConnector(normalized),
|
||||
category: fiberType === "Copper" ? "Copper" : "Compatible",
|
||||
standardName: normalized.slice(0, 240),
|
||||
sourcePartNumber: sourcePartFromText(normalized),
|
||||
note: "source detail page evidence",
|
||||
};
|
||||
}
|
||||
|
||||
function extractFiberMall(html: string, row: TargetRow): ExtractedDetails | null {
|
||||
const product = parseJsonLdProducts(html)[0];
|
||||
const $ = cheerio.load(html);
|
||||
const title = decodeHtml(product?.name || $("title").first().text() || "");
|
||||
const description = decodeHtml(product?.description || meta($, "meta[name='description']"));
|
||||
const keywords = meta($, "meta[name='keywords']");
|
||||
const mpn = decodeHtml(product?.mpn || "");
|
||||
const text = `${title} ${description} ${keywords} ${mpn}`;
|
||||
const details = detailsFromText(text, row);
|
||||
if (!details) return null;
|
||||
details.sourcePartNumber = details.sourcePartNumber || mpn || undefined;
|
||||
if (mpn && !details.standardName.includes(mpn)) details.standardName = `${details.standardName} (${mpn})`;
|
||||
return details;
|
||||
}
|
||||
|
||||
function extractShopFiber24(html: string, row: TargetRow): ExtractedDetails | null {
|
||||
const $ = cheerio.load(html);
|
||||
const title = decodeHtml($("title").first().text() || meta($, "meta[property='og:title']"));
|
||||
const description = meta($, "meta[name='description']") || meta($, "meta[property='og:description']");
|
||||
const h1 = decodeHtml($("h1").first().text());
|
||||
const text = `${title} ${h1} ${description} ${row.part_number}`;
|
||||
return detailsFromText(text, row);
|
||||
}
|
||||
|
||||
async function fetchPage(url: string): Promise<string | null> {
|
||||
const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(20000) });
|
||||
if (!resp.ok) return null;
|
||||
return resp.text();
|
||||
}
|
||||
|
||||
async function main(): Promise<void> {
|
||||
const limit = Number.parseInt(process.env.VENDOR_DETAIL_LIMIT || "160", 10);
|
||||
const result = await pool.query<TargetRow>(`
|
||||
SELECT t.id, v.name AS vendor_name, t.part_number, t.form_factor, t.speed, t.speed_gbps, t.product_page_url
|
||||
FROM transceivers t
|
||||
JOIN vendors v ON v.id = t.vendor_id
|
||||
WHERE v.name IN ('ShopFiber24', 'FiberMall')
|
||||
AND t.price_verified = true
|
||||
AND t.image_verified = true
|
||||
AND t.competitor_verified = true
|
||||
AND COALESCE(t.product_page_url, '') != ''
|
||||
AND COALESCE(t.details_verified, false) = false
|
||||
ORDER BY v.name, t.updated_at ASC, t.part_number
|
||||
LIMIT $1
|
||||
`, [limit]);
|
||||
|
||||
let fetched = 0;
|
||||
let updated = 0;
|
||||
let skipped = 0;
|
||||
|
||||
for (const row of result.rows) {
|
||||
const html = await fetchPage(row.product_page_url);
|
||||
fetched++;
|
||||
if (!html) {
|
||||
skipped++;
|
||||
await sleep(400);
|
||||
continue;
|
||||
}
|
||||
|
||||
const details = row.vendor_name === "FiberMall" ? extractFiberMall(html, row) : extractShopFiber24(html, row);
|
||||
if (!details) {
|
||||
skipped++;
|
||||
await sleep(400);
|
||||
continue;
|
||||
}
|
||||
|
||||
const update = await pool.query(`
|
||||
UPDATE transceivers
|
||||
SET part_number = CASE
|
||||
WHEN $13::text IS NOT NULL
|
||||
AND $13::text != ''
|
||||
AND length(part_number) <= 24
|
||||
AND part_number !~ '[0-9].*[-_]|[-_].*[0-9]'
|
||||
THEN $13::text
|
||||
ELSE part_number
|
||||
END,
|
||||
form_factor = COALESCE(NULLIF($2::text, ''), form_factor),
|
||||
speed = COALESCE(NULLIF($3::text, ''), speed),
|
||||
speed_gbps = COALESCE($4::numeric, speed_gbps),
|
||||
reach_label = $5,
|
||||
reach_meters = $6,
|
||||
fiber_type = $7,
|
||||
wavelengths = COALESCE(NULLIF($8::text, ''), wavelengths),
|
||||
connector = COALESCE(NULLIF($9::text, ''), connector),
|
||||
category = $10,
|
||||
standard_name = COALESCE(NULLIF($11::text, ''), standard_name),
|
||||
details_verified = true,
|
||||
details_verified_at = COALESCE(details_verified_at, NOW()),
|
||||
details_source_url = COALESCE(NULLIF(details_source_url, ''), product_page_url),
|
||||
data_confidence = CASE
|
||||
WHEN COALESCE(data_confidence, 'unknown') IN ('unknown', 'enriched_estimated')
|
||||
THEN 'scraped_unverified'
|
||||
ELSE data_confidence
|
||||
END,
|
||||
notes = CONCAT_WS(' | ', NULLIF(notes, ''), $12::text),
|
||||
updated_at = NOW()
|
||||
WHERE id = $1
|
||||
AND COALESCE(details_verified, false) = false
|
||||
RETURNING id
|
||||
`, [
|
||||
row.id,
|
||||
details.formFactor || null,
|
||||
details.speed || null,
|
||||
details.speedGbps || null,
|
||||
details.reachLabel,
|
||||
details.reachMeters,
|
||||
details.fiberType,
|
||||
details.wavelengths || null,
|
||||
details.connector || null,
|
||||
details.category,
|
||||
details.standardName,
|
||||
`${row.vendor_name} detail verifier 2026-05-09: ${details.note}`,
|
||||
details.sourcePartNumber || null,
|
||||
]);
|
||||
|
||||
if ((update.rowCount ?? 0) > 0) updated++;
|
||||
else skipped++;
|
||||
|
||||
if (fetched % 25 === 0) {
|
||||
console.log(`[Vendor details] fetched=${fetched} updated=${updated} skipped=${skipped}`);
|
||||
}
|
||||
await sleep(400);
|
||||
}
|
||||
|
||||
const promoted = await pool.query(`
|
||||
UPDATE transceivers
|
||||
SET fully_verified = true,
|
||||
fully_verified_at = COALESCE(fully_verified_at, NOW())
|
||||
WHERE price_verified = true
|
||||
AND image_verified = true
|
||||
AND details_verified = true
|
||||
AND competitor_verified = true
|
||||
AND COALESCE(fully_verified, false) = false
|
||||
RETURNING id
|
||||
`);
|
||||
|
||||
console.log(`[Vendor details] done fetched=${fetched} updated=${updated} skipped=${skipped} promoted=${promoted.rowCount ?? 0}`);
|
||||
}
|
||||
|
||||
if (require.main === module) {
|
||||
main()
|
||||
.then(() => pool.end())
|
||||
.catch((err) => {
|
||||
console.error("Fatal:", err);
|
||||
pool.end().finally(() => process.exit(1));
|
||||
});
|
||||
}
|
||||
@ -1,9 +1,44 @@
|
||||
# Current TIP Sync State
|
||||
|
||||
Updated: 2026-05-09 16:05 UTC
|
||||
Updated: 2026-05-09 16:20 UTC
|
||||
|
||||
## Newest Work
|
||||
|
||||
- Near-complete detail queue closed with lightweight vendor detail verifiers on 2026-05-09:
|
||||
- operator requirement:
|
||||
- keep Erik safe; no heavy browser crawler or Playwright wave
|
||||
- only source-backed product details may be marked verified
|
||||
- crawler/scraper/robot learnings must be written to the TIPLLM training pool
|
||||
- implemented:
|
||||
- `packages/scraper/src/scrapers/atgbics-detail-pages.ts`
|
||||
- `packages/scraper/src/scrapers/shopfiber24-fibermall-detail-pages.ts`
|
||||
- npm scripts:
|
||||
- `scrape:atgbics:details`
|
||||
- `scrape:vendors:details`
|
||||
- ATGBICS product.js pass:
|
||||
- first run fetched `107`, updated `97`, skipped `10`, promoted `97`
|
||||
- parser then learned to ignore unhelpful `Max Distance_N/A` tags and fall back to title/body source text
|
||||
- final run fetched `10`, updated `10`, skipped `0`, promoted `10`
|
||||
- ATGBICS near-complete missing details reduced to `0`
|
||||
- FiberMall + ShopFiber24 detail pass:
|
||||
- first run fetched `116`, updated `112`, skipped `4`, promoted `112`
|
||||
- final semantic closure fetched `4`, updated `4`, skipped `0`, promoted `4`
|
||||
- FiberMall near-complete missing details reduced to `0`
|
||||
- ShopFiber24 near-complete missing details reduced to `0`
|
||||
- truth handling:
|
||||
- FiberMall uses Schema.org Product JSON-LD for title/description/mpn/image evidence
|
||||
- ShopFiber24 uses static title/meta/description evidence
|
||||
- variable AOC/DAC/category family pages are classified as `Product Family`, `AOC Cable Family`, or `DAC Cable Family` with `Variant` reach instead of a fake fixed meter value
|
||||
- media converters/switches/mux/adapter rows are classified as non-transceiver product classes instead of optical equivalents
|
||||
- 100G DWDM DCO rows are classified as `Coherent DWDM` with line-system-dependent reach when source pages do not provide a normal reach
|
||||
- final live state:
|
||||
- global `details_verified=12253`
|
||||
- global `fully_verified=10976`
|
||||
- near-complete queue `price_verified AND image_verified AND competitor_verified AND NOT details_verified = 0`
|
||||
- public TIP health `healthy`
|
||||
- load status `ok`
|
||||
- memory used `12%`
|
||||
|
||||
- MAGATAMA training live cleanup and TIP_LLM adoption closure on 2026-05-09:
|
||||
- operator requirement:
|
||||
- no local Mac Studio training may consume the full workstation by default
|
||||
|
||||
@ -0,0 +1,65 @@
|
||||
# Near-Complete Detail Queue Closure
|
||||
|
||||
Date: 2026-05-09
|
||||
Scope: TIP transceiver detail verification for rows already backed by price, image, and competitor evidence
|
||||
|
||||
## Goal
|
||||
|
||||
Close the remaining near-complete rows without manual approval and without launching heavy crawler/browser workloads on Erik.
|
||||
|
||||
## Implemented
|
||||
|
||||
- Added `packages/scraper/src/scrapers/atgbics-detail-pages.ts`
|
||||
- lightweight Shopify `product.js` fetcher
|
||||
- no browser, no Playwright
|
||||
- strict parser for form factor, speed, reach, media, wavelength, connector, and product class
|
||||
- Added `packages/scraper/src/scrapers/shopfiber24-fibermall-detail-pages.ts`
|
||||
- lightweight static HTML fetcher
|
||||
- FiberMall uses Schema.org Product JSON-LD
|
||||
- ShopFiber24 uses static title/meta/description evidence
|
||||
- Added package scripts:
|
||||
- `scrape:atgbics:details`
|
||||
- `scrape:vendors:details`
|
||||
|
||||
## Results
|
||||
|
||||
- ATGBICS:
|
||||
- first product.js run: fetched `107`, updated `97`, skipped `10`, promoted `97`
|
||||
- parser patch: `Max Distance_N/A` no longer blocks title/body distance evidence
|
||||
- final product.js run: fetched `10`, updated `10`, skipped `0`, promoted `10`
|
||||
- near-complete missing details: `0`
|
||||
- FiberMall + ShopFiber24:
|
||||
- first detail run: fetched `116`, updated `112`, skipped `4`, promoted `112`
|
||||
- final semantic closure: fetched `4`, updated `4`, skipped `0`, promoted `4`
|
||||
- FiberMall near-complete missing details: `0`
|
||||
- ShopFiber24 near-complete missing details: `0`
|
||||
|
||||
## Truth Rules
|
||||
|
||||
- Do not turn a variable AOC/DAC or category page into a fake fixed-distance transceiver.
|
||||
- Use `Variant` reach for source-backed product families.
|
||||
- Classify switches, media converters, muxes, and adapters as their actual product class.
|
||||
- Classify 100G DWDM DCO as `Coherent DWDM` with line-system-dependent reach when no normal reach is stated.
|
||||
- FiberMall source titles can repair brand-only part numbers when the source page provides a concrete MPN/product code.
|
||||
|
||||
## Final Live State
|
||||
|
||||
- `details_verified=12253`
|
||||
- `fully_verified=10976`
|
||||
- near-complete queue:
|
||||
- `price_verified=true`
|
||||
- `image_verified=true`
|
||||
- `competitor_verified=true`
|
||||
- `details_verified=false`
|
||||
- result: `0`
|
||||
- Public health:
|
||||
- status: `healthy`
|
||||
- load status: `ok`
|
||||
- memory used: `12%`
|
||||
|
||||
## Safety
|
||||
|
||||
- No external AI was used.
|
||||
- No browser crawler was started.
|
||||
- Erik SSH flapped several times; work paused between retries instead of hammering the host.
|
||||
- All crawler/parser learnings were mirrored into the TIPLLM training pool.
|
||||
Loading…
x
Reference in New Issue
Block a user