fix: advance TIP product verification robots
This commit is contained in:
parent
3779de5b88
commit
a43e572946
@ -19,7 +19,10 @@ export interface SearchParams {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export async function searchTransceivers(params: SearchParams) {
|
export async function searchTransceivers(params: SearchParams) {
|
||||||
const conditions: string[] = [];
|
const conditions: string[] = [
|
||||||
|
`COALESCE(t.data_confidence, 'unknown') != 'garbage'`,
|
||||||
|
`COALESCE(t.product_page_url, '') NOT LIKE '%/category/%'`,
|
||||||
|
];
|
||||||
const values: any[] = [];
|
const values: any[] = [];
|
||||||
let idx = 1;
|
let idx = 1;
|
||||||
|
|
||||||
|
|||||||
@ -22,6 +22,8 @@ healthRouter.get("/", async (_req: Request, res: Response) => {
|
|||||||
COUNT(*) FILTER (WHERE fully_verified) AS fully_verified,
|
COUNT(*) FILTER (WHERE fully_verified) AS fully_verified,
|
||||||
COUNT(*) AS total
|
COUNT(*) AS total
|
||||||
FROM transceivers
|
FROM transceivers
|
||||||
|
WHERE COALESCE(data_confidence, 'unknown') != 'garbage'
|
||||||
|
AND COALESCE(product_page_url, '') NOT LIKE '%/category/%'
|
||||||
`).catch(() => ({ rows: [{}] }));
|
`).catch(() => ({ rows: [{}] }));
|
||||||
const v = verStats.rows[0] || {};
|
const v = verStats.rows[0] || {};
|
||||||
|
|
||||||
|
|||||||
@ -12,6 +12,7 @@
|
|||||||
"scrape:fs:url-discovery": "FS_URL_DISCOVERY_ONLY=1 TIP_FORCE_REVALIDATE=1 tsx src/scrapers/fs-com.ts",
|
"scrape:fs:url-discovery": "FS_URL_DISCOVERY_ONLY=1 TIP_FORCE_REVALIDATE=1 tsx src/scrapers/fs-com.ts",
|
||||||
"scrape:atgbics:details": "tsx src/scrapers/atgbics-detail-pages.ts",
|
"scrape:atgbics:details": "tsx src/scrapers/atgbics-detail-pages.ts",
|
||||||
"scrape:vendors:details": "tsx src/scrapers/shopfiber24-fibermall-detail-pages.ts",
|
"scrape:vendors:details": "tsx src/scrapers/shopfiber24-fibermall-detail-pages.ts",
|
||||||
|
"verify:catalog:details": "tsx src/utils/verify-catalog-details.ts",
|
||||||
"scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
|
"scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
|
||||||
"scrape:optcore": "tsx src/scrapers/optcore.ts",
|
"scrape:optcore": "tsx src/scrapers/optcore.ts",
|
||||||
"scrape:news": "tsx src/scrapers/news.ts",
|
"scrape:news": "tsx src/scrapers/news.ts",
|
||||||
|
|||||||
@ -2621,6 +2621,7 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
|||||||
updated_at = NOW()
|
updated_at = NOW()
|
||||||
WHERE product_page_url IS NOT NULL
|
WHERE product_page_url IS NOT NULL
|
||||||
AND product_page_url != ''
|
AND product_page_url != ''
|
||||||
|
AND product_page_url NOT LIKE '%/category/%'
|
||||||
AND form_factor IS NOT NULL
|
AND form_factor IS NOT NULL
|
||||||
AND speed_gbps IS NOT NULL
|
AND speed_gbps IS NOT NULL
|
||||||
AND part_number IS NOT NULL
|
AND part_number IS NOT NULL
|
||||||
|
|||||||
@ -6,7 +6,14 @@
|
|||||||
*
|
*
|
||||||
* Rate limited: 1 req/2sec.
|
* Rate limited: 1 req/2sec.
|
||||||
*/
|
*/
|
||||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
import {
|
||||||
|
pool,
|
||||||
|
findOrCreateScrapedTransceiver,
|
||||||
|
ensureVendor,
|
||||||
|
markDetailsVerified,
|
||||||
|
markImageVerified,
|
||||||
|
upsertPriceObservation,
|
||||||
|
} from "../utils/db";
|
||||||
import { contentHash } from "../utils/hash";
|
import { contentHash } from "../utils/hash";
|
||||||
import * as cheerio from "cheerio";
|
import * as cheerio from "cheerio";
|
||||||
|
|
||||||
@ -55,14 +62,23 @@ function detectFormFactor(text: string): { formFactor: string; speed: string; sp
|
|||||||
|
|
||||||
function detectReach(text: string): { label: string; meters: number } | undefined {
|
function detectReach(text: string): { label: string; meters: number } | undefined {
|
||||||
const patterns: [RegExp, string, number][] = [
|
const patterns: [RegExp, string, number][] = [
|
||||||
|
[/\b160\s*km\b/i, "160km", 160000],
|
||||||
|
[/\b140\s*km\b/i, "140km", 140000],
|
||||||
|
[/\b120\s*km\b/i, "120km", 120000],
|
||||||
[/\b80\s*km\b/i, "80km", 80000],
|
[/\b80\s*km\b/i, "80km", 80000],
|
||||||
|
[/\b50\s*km\b/i, "50km", 50000],
|
||||||
[/\b40\s*km\b/i, "40km", 40000],
|
[/\b40\s*km\b/i, "40km", 40000],
|
||||||
|
[/\b30\s*km\b/i, "30km", 30000],
|
||||||
[/\b20\s*km\b/i, "20km", 20000],
|
[/\b20\s*km\b/i, "20km", 20000],
|
||||||
|
[/\b15\s*km\b/i, "15km", 15000],
|
||||||
[/\b10\s*km\b/i, "10km", 10000],
|
[/\b10\s*km\b/i, "10km", 10000],
|
||||||
[/\b2\s*km\b/i, "2km", 2000],
|
[/\b2\s*km\b/i, "2km", 2000],
|
||||||
|
[/\b1\s*km\b/i, "1km", 1000],
|
||||||
[/\b550\s*m\b/i, "550m", 550],
|
[/\b550\s*m\b/i, "550m", 550],
|
||||||
|
[/\b500\s*m\b/i, "500m", 500],
|
||||||
[/\b300\s*m\b/i, "300m", 300],
|
[/\b300\s*m\b/i, "300m", 300],
|
||||||
[/\b100\s*m\b/i, "100m", 100],
|
[/\b100\s*m\b/i, "100m", 100],
|
||||||
|
[/\b82\s*m\b/i, "82m", 82],
|
||||||
[/\bLR4\b/, "10km", 10000], [/\bLR\b/, "10km", 10000],
|
[/\bLR4\b/, "10km", 10000], [/\bLR\b/, "10km", 10000],
|
||||||
[/\bER4?\b/, "40km", 40000], [/\bZR4?\b/, "80km", 80000],
|
[/\bER4?\b/, "40km", 40000], [/\bZR4?\b/, "80km", 80000],
|
||||||
[/\bSR4?\b/, "300m", 300], [/\bDR4?\b/, "500m", 500], [/\bFR4?\b/, "2km", 2000],
|
[/\bSR4?\b/, "300m", 300], [/\bDR4?\b/, "500m", 500], [/\bFR4?\b/, "2km", 2000],
|
||||||
@ -70,6 +86,24 @@ function detectReach(text: string): { label: string; meters: number } | undefine
|
|||||||
for (const [regex, label, meters] of patterns) {
|
for (const [regex, label, meters] of patterns) {
|
||||||
if (regex.test(text)) return { label, meters };
|
if (regex.test(text)) return { label, meters };
|
||||||
}
|
}
|
||||||
|
const km = text.match(/\b(\d+(?:\.\d+)?)\s*km\b/i);
|
||||||
|
if (km) {
|
||||||
|
const value = Number(km[1]);
|
||||||
|
if (value > 0 && value <= 200) return { label: `${Number.isInteger(value) ? value : value.toString()}km`, meters: Math.round(value * 1000) };
|
||||||
|
}
|
||||||
|
const meters = text.match(/\b(\d+(?:\.\d+)?)\s*m\b/i);
|
||||||
|
if (meters) {
|
||||||
|
const value = Number(meters[1]);
|
||||||
|
if (value > 0 && value <= 10000) return { label: `${Number.isInteger(value) ? value : value.toString()}m`, meters: Math.round(value) };
|
||||||
|
}
|
||||||
|
const miles = text.match(/\b(\d+(?:\.\d+)?)\s*(?:mi|miles?)\b/i);
|
||||||
|
if (miles) {
|
||||||
|
const value = Number(miles[1]);
|
||||||
|
if (value > 0 && value <= 125) {
|
||||||
|
const kmRounded = Math.round(value * 1.609344);
|
||||||
|
return { label: `${kmRounded}km`, meters: kmRounded * 1000 };
|
||||||
|
}
|
||||||
|
}
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -85,19 +119,29 @@ function detectWavelength(text: string): string {
|
|||||||
return match ? match[1] : "";
|
return match ? match[1] : "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Normalizes raw SKU text scraped from a product card into a part number.
 *
 * - Removes every whitespace character (SKUs are rendered with stray spaces
 *   and line breaks inside the card markup).
 * - Drops a leading `SKU:` label, in case the selected element's text includes
 *   the label as well as the value.
 *   NOTE(review): whether `.wd-sku` text carries the label is not visible here;
 *   the strip is a no-op when it does not.
 * - Rewrites the first `GAOTek-` occurrence (any letter case) to the canonical
 *   `GAOTek-` spelling.
 *
 * @param text Raw text content of the SKU element (may be empty).
 * @returns The cleaned SKU, or an empty string when nothing usable remains.
 */
function cleanSku(text: string): string {
  return text
    .replace(/\s+/g, "")
    .replace(/^SKU:/i, "")
    .replace(/GAOTek-/i, "GAOTek-");
}
|
||||||
|
|
||||||
|
/**
 * Tells whether `url` is a GAO Tek product detail page.
 *
 * Accepts only `https://gaotek.com/product/<slug>` with an optional trailing
 * slash (case-insensitive). Nested paths, query strings and fragments are
 * rejected, as is any other section of the site.
 *
 * @param url Absolute URL to test.
 * @returns `true` when the URL has the product-page shape.
 */
function isProductUrl(url: string): boolean {
  const productPagePattern = /^https:\/\/gaotek\.com\/product\/[^/?#]+\/?$/i;
  return productPagePattern.test(url);
}
|
||||||
|
|
||||||
function parseProductList(html: string): Product[] {
|
function parseProductList(html: string): Product[] {
|
||||||
const $ = cheerio.load(html);
|
const $ = cheerio.load(html);
|
||||||
const products: Product[] = [];
|
const products: Product[] = [];
|
||||||
|
|
||||||
// WooCommerce product grid
|
// WooCommerce/Woodmart product grid. GAO Tek currently uses Woodmart
|
||||||
$("li.product, .product-item, .woocommerce-loop-product, article.product").each((_i, el) => {
|
// `.wd-product.product-grid-item` cards rather than classic `li.product`.
|
||||||
const titleEl = $(el).find(".woocommerce-loop-product__title, h2, h3, .product-title, .product-name").first();
|
$(".wd-product.product-grid-item, div.product-grid-item, li.product, .product-item, .woocommerce-loop-product, article.product").each((_i, el) => {
|
||||||
|
const titleEl = $(el).find(".wd-entities-title a, .woocommerce-loop-product__title, h2, h3, .product-title, .product-name").first();
|
||||||
const name = titleEl.text().trim();
|
const name = titleEl.text().trim();
|
||||||
if (!name || name.length < 5) return;
|
if (!name || name.length < 5) return;
|
||||||
|
|
||||||
const linkEl = $(el).find("a[href]").first();
|
const linkEl = $(el).find("a.wd-product-img-link[href], .wd-entities-title a[href], a[href]").first();
|
||||||
const href = linkEl.attr("href") || "";
|
const href = linkEl.attr("href") || "";
|
||||||
const url = href.startsWith("http") ? href : BASE + href;
|
const url = href.startsWith("http") ? href : BASE + href;
|
||||||
|
if (!isProductUrl(url)) return;
|
||||||
|
|
||||||
// WooCommerce price
|
// WooCommerce price
|
||||||
const priceText = $(el).find(".price, .woocommerce-Price-amount, .amount").text();
|
const priceText = $(el).find(".price, .woocommerce-Price-amount, .amount").text();
|
||||||
@ -109,8 +153,8 @@ function parseProductList(html: string): Product[] {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// GAO Tek uses SKU for part numbers
|
// GAO Tek uses SKU for part numbers
|
||||||
const skuEl = $(el).find(".sku, [data-sku]");
|
const skuEl = $(el).find(".wd-sku, .sku, [data-sku]");
|
||||||
const partNumber = skuEl.text().trim() ||
|
const partNumber = cleanSku(skuEl.text()) ||
|
||||||
url.split("/").filter(Boolean).pop()?.replace(/-/g, " ").trim() ||
|
url.split("/").filter(Boolean).pop()?.replace(/-/g, " ").trim() ||
|
||||||
name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] ||
|
name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] ||
|
||||||
name.replace(/\s+/g, "-").slice(0, 60);
|
name.replace(/\s+/g, "-").slice(0, 60);
|
||||||
@ -141,15 +185,16 @@ function parseProductList(html: string): Product[] {
|
|||||||
|
|
||||||
// Fallback for non-WooCommerce layout
|
// Fallback for non-WooCommerce layout
|
||||||
if (products.length === 0) {
|
if (products.length === 0) {
|
||||||
$("a[href]").each((_i, el) => {
|
$("a[href*='/product/']").each((_i, el) => {
|
||||||
const name = $(el).text().trim();
|
const name = $(el).text().trim();
|
||||||
const href = $(el).attr("href") || "";
|
const href = $(el).attr("href") || "";
|
||||||
|
const url = href.startsWith("http") ? href : BASE + href;
|
||||||
if (
|
if (
|
||||||
name.length < 8 || name.length > 200 ||
|
name.length < 8 || name.length > 200 ||
|
||||||
|
!isProductUrl(url) ||
|
||||||
!/sfp|qsfp|xfp|transceiver|optic/i.test(name)
|
!/sfp|qsfp|xfp|transceiver|optic/i.test(name)
|
||||||
) return;
|
) return;
|
||||||
|
|
||||||
const url = href.startsWith("http") ? href : BASE + href;
|
|
||||||
const context = $(el).parent().parent().text();
|
const context = $(el).parent().parent().text();
|
||||||
const priceMatch = context.match(/\$\s*([\d,]+\.?\d{0,2})/);
|
const priceMatch = context.match(/\$\s*([\d,]+\.?\d{0,2})/);
|
||||||
let price: number | undefined;
|
let price: number | undefined;
|
||||||
@ -167,7 +212,9 @@ function parseProductList(html: string): Product[] {
|
|||||||
? (rawImg.startsWith("http") ? rawImg : BASE + rawImg)
|
? (rawImg.startsWith("http") ? rawImg : BASE + rawImg)
|
||||||
: undefined;
|
: undefined;
|
||||||
products.push({
|
products.push({
|
||||||
partNumber: name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] || name.replace(/\s+/g, "-").slice(0, 60),
|
partNumber: cleanSku(context.match(/SKU:\s*([A-Z0-9][A-Z0-9\-\s]{4,})/i)?.[1] || "") ||
|
||||||
|
name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] ||
|
||||||
|
name.replace(/\s+/g, "-").slice(0, 60),
|
||||||
name, url, price, ...ff,
|
name, url, price, ...ff,
|
||||||
reachLabel: reach?.label, reachMeters: reach?.meters,
|
reachLabel: reach?.label, reachMeters: reach?.meters,
|
||||||
fiberType: detectFiber(name), wavelength: detectWavelength(name),
|
fiberType: detectFiber(name), wavelength: detectWavelength(name),
|
||||||
@ -178,7 +225,7 @@ function parseProductList(html: string): Product[] {
|
|||||||
|
|
||||||
const seen = new Set<string>();
|
const seen = new Set<string>();
|
||||||
return products.filter((p) => {
|
return products.filter((p) => {
|
||||||
if (!p.url || seen.has(p.url)) return false;
|
if (!p.url || !isProductUrl(p.url) || seen.has(p.url)) return false;
|
||||||
seen.add(p.url);
|
seen.add(p.url);
|
||||||
return true;
|
return true;
|
||||||
});
|
});
|
||||||
@ -190,6 +237,114 @@ async function fetchPage(url: string): Promise<string> {
|
|||||||
return resp.text();
|
return resp.text();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Persists one scraped GAO Tek product and returns its transceiver id.
 *
 * The row is looked up by `(vendor_id, product_page_url)`:
 * - No match: creation is delegated to `findOrCreateScrapedTransceiver`.
 * - Match: the row is refreshed in place. Spec columns are only filled when
 *   they are currently empty (or zero for the numeric ones). The part number
 *   is replaced only when the stored value looks like a placeholder (empty,
 *   contains spaces, longer than 45 characters, a generic
 *   "... transceiver(s)" title, or lacks the `GAOTek-` prefix that the new
 *   value has). `data_confidence` is raised to `scraped_unverified` only from
 *   NULL / `unknown` / `enriched_estimated`.
 *   Afterwards the image (when present) and details are marked verified.
 *
 * NOTE(review): details are marked verified for every matched row, regardless
 * of how many spec fields the card yielded — confirm `markDetailsVerified`
 * applies its own completeness check.
 *
 * @param vendorId Id of the GAO Tek vendor row.
 * @param product  Product parsed from a listing page.
 * @returns Id of the updated or newly created transceiver row.
 */
async function saveGaoTekProduct(vendorId: string, product: Product): Promise<string> {
  const lookup = await pool.query<{ id: string; part_number: string }>(
    `SELECT id, part_number
     FROM transceivers
     WHERE vendor_id = $1
       AND product_page_url = $2
     LIMIT 1`,
    [vendorId, product.url]
  );

  // Unknown URL: let the shared helper create (or match) the row.
  if (lookup.rows.length === 0) {
    return findOrCreateScrapedTransceiver({
      partNumber: product.partNumber,
      vendorId,
      productUrl: product.url,
      formFactor: product.formFactor,
      speedGbps: product.speedGbps,
      speed: product.speed,
      reachMeters: product.reachMeters,
      reachLabel: product.reachLabel,
      fiberType: product.fiberType,
      wavelengths: product.wavelength,
      category: "DataCenter",
      imageUrl: product.imageUrl,
    });
  }

  const match = lookup.rows[0];

  // Positional parameters $1..$11 for the refresh statement below.
  const refreshParams = [
    match.id,
    product.url,
    product.partNumber,
    product.formFactor || null,
    product.speedGbps || null,
    product.speed || null,
    product.reachMeters || null,
    product.reachLabel || null,
    product.fiberType || null,
    product.wavelength || null,
    "DataCenter",
  ];

  await pool.query(
    `UPDATE transceivers
     SET part_number = CASE
           WHEN $3::text != ''
             AND (part_number IS NULL
               OR part_number = ''
               OR ($3::text ~* '^GAOTek-' AND part_number !~* '^GAOTek-')
               OR part_number LIKE '% %'
               OR length(part_number) > 45
               OR part_number ~* '^(fiber|optical|gbic|sfp|qsfp|cfp).*transceivers?$')
           THEN $3::text
           ELSE part_number
         END,
         product_page_url = COALESCE(NULLIF(product_page_url, ''), NULLIF($2, '')),
         form_factor = COALESCE(NULLIF(form_factor, ''), $4),
         speed_gbps = CASE WHEN speed_gbps IS NULL OR speed_gbps = 0 THEN COALESCE($5, speed_gbps) ELSE speed_gbps END,
         speed = COALESCE(NULLIF(speed, ''), $6),
         reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($7, reach_meters) ELSE reach_meters END,
         reach_label = COALESCE(NULLIF(reach_label, ''), $8),
         fiber_type = COALESCE(NULLIF(fiber_type, ''), $9),
         wavelengths = COALESCE(NULLIF(wavelengths, ''), $10),
         category = COALESCE(NULLIF(category, ''), $11),
         data_confidence = CASE
           WHEN data_confidence IS NULL OR data_confidence IN ('unknown', 'enriched_estimated')
           THEN 'scraped_unverified'
           ELSE data_confidence
         END,
         updated_at = NOW()
     WHERE id = $1`,
    refreshParams
  );

  if (product.imageUrl) {
    await markImageVerified(match.id, product.imageUrl);
  }
  await markDetailsVerified({ transceiverId: match.id, sourceUrl: product.url });

  return match.id;
}
|
||||||
|
|
||||||
|
/**
 * Resets verification state on GAO Tek rows that do not point at a real
 * product page: rows with no product URL, or whose URL is a GAO Tek category
 * listing.
 *
 * For matching rows `data_confidence` is set back to `unknown` and all four
 * `*_verified` flags and their timestamps are cleared. Rows already in that
 * reset state are skipped, so the returned count covers only rows that were
 * actually changed.
 *
 * The category match tolerates `http`/`https`, an optional `www.` host prefix
 * and any letter case. Absolute hrefs are stored verbatim by the scraper, so
 * older artifacts are not guaranteed to use the exact
 * `https://gaotek.com/category/` spelling.
 *
 * @param vendorId Id of the GAO Tek vendor row; only its transceivers are touched.
 * @returns Number of rows that were reset.
 */
async function quarantineGaoTekCategoryArtifacts(vendorId: string): Promise<number> {
  const result = await pool.query(
    `UPDATE transceivers
     SET data_confidence = 'unknown',
         price_verified = false,
         image_verified = false,
         details_verified = false,
         fully_verified = false,
         price_verified_at = NULL,
         image_verified_at = NULL,
         details_verified_at = NULL,
         fully_verified_at = NULL,
         updated_at = NOW()
     WHERE vendor_id = $1
       AND (
         product_page_url IS NULL
         OR product_page_url = ''
         OR product_page_url ~* '^https?://(www[.])?gaotek[.]com/category/'
       )
       AND (
         COALESCE(data_confidence, 'unknown') != 'unknown'
         OR price_verified = true
         OR image_verified = true
         OR details_verified = true
         OR fully_verified = true
       )`,
    [vendorId]
  );

  return result.rowCount ?? 0;
}
|
||||||
|
|
||||||
export async function scrapeGaoTek(): Promise<void> {
|
export async function scrapeGaoTek(): Promise<void> {
|
||||||
console.log("=== GAO Tek Scraper Starting ===\n");
|
console.log("=== GAO Tek Scraper Starting ===\n");
|
||||||
|
|
||||||
@ -233,25 +388,15 @@ export async function scrapeGaoTek(): Promise<void> {
|
|||||||
|
|
||||||
console.log(`\nTotal unique products: ${allProducts.length}`);
|
console.log(`\nTotal unique products: ${allProducts.length}`);
|
||||||
|
|
||||||
|
const quarantined = await quarantineGaoTekCategoryArtifacts(vendorId);
|
||||||
|
if (quarantined > 0) console.log(`Quarantined ${quarantined} GAO Tek category/non-product artifacts`);
|
||||||
|
|
||||||
let totalProducts = 0;
|
let totalProducts = 0;
|
||||||
let priceUpdates = 0;
|
let priceUpdates = 0;
|
||||||
|
|
||||||
for (const product of allProducts) {
|
for (const product of allProducts) {
|
||||||
try {
|
try {
|
||||||
const txId = await findOrCreateScrapedTransceiver({
|
const txId = await saveGaoTekProduct(vendorId, product);
|
||||||
partNumber: product.partNumber,
|
|
||||||
vendorId,
|
|
||||||
productUrl: product.url,
|
|
||||||
formFactor: product.formFactor,
|
|
||||||
speedGbps: product.speedGbps,
|
|
||||||
speed: product.speed,
|
|
||||||
reachMeters: product.reachMeters,
|
|
||||||
reachLabel: product.reachLabel,
|
|
||||||
fiberType: product.fiberType,
|
|
||||||
wavelengths: product.wavelength,
|
|
||||||
category: "DataCenter",
|
|
||||||
imageUrl: product.imageUrl,
|
|
||||||
});
|
|
||||||
|
|
||||||
if (product.price && product.price > 0) {
|
if (product.price && product.price > 0) {
|
||||||
const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
|
const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
|
||||||
|
|||||||
@ -284,7 +284,7 @@ async function backfillFlexoptix(): Promise<{ updated: number; skipped: number;
|
|||||||
SELECT t.id, t.part_number
|
SELECT t.id, t.part_number
|
||||||
FROM transceivers t
|
FROM transceivers t
|
||||||
JOIN vendors v ON t.vendor_id = v.id
|
JOIN vendors v ON t.vendor_id = v.id
|
||||||
WHERE v.name = 'FLEXOPTIX' AND (t.image_url IS NULL OR t.image_url = '')
|
WHERE UPPER(v.name) = 'FLEXOPTIX' AND (t.image_url IS NULL OR t.image_url = '')
|
||||||
ORDER BY t.part_number
|
ORDER BY t.part_number
|
||||||
`);
|
`);
|
||||||
|
|
||||||
@ -427,6 +427,11 @@ async function backfillGaoTek(): Promise<{ updated: number; skipped: number; err
|
|||||||
// =============================================================================
|
// =============================================================================
|
||||||
|
|
||||||
const OTHER_VENDOR_NAMES = [
|
const OTHER_VENDOR_NAMES = [
|
||||||
|
"Cisco Systems",
|
||||||
|
"Juniper Networks",
|
||||||
|
"Arista Networks",
|
||||||
|
"10Gtek",
|
||||||
|
"QSFPTEK",
|
||||||
"T&S Communication",
|
"T&S Communication",
|
||||||
"Ascent Optics",
|
"Ascent Optics",
|
||||||
"ATGBICS",
|
"ATGBICS",
|
||||||
@ -436,6 +441,9 @@ const OTHER_VENDOR_NAMES = [
|
|||||||
"FS.COM",
|
"FS.COM",
|
||||||
"GBICS",
|
"GBICS",
|
||||||
"Fluxlight",
|
"Fluxlight",
|
||||||
|
"SFPcables",
|
||||||
|
"II-VI / Coherent",
|
||||||
|
"NADDOD",
|
||||||
];
|
];
|
||||||
|
|
||||||
async function backfillOtherVendors(): Promise<{ total: number; updated: number }> {
|
async function backfillOtherVendors(): Promise<{ total: number; updated: number }> {
|
||||||
|
|||||||
88
packages/scraper/src/utils/verify-catalog-details.ts
Normal file
88
packages/scraper/src/utils/verify-catalog-details.ts
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
/**
|
||||||
|
* Catalog Details Verifier
|
||||||
|
*
|
||||||
|
* Promotes catalog-derived rows (OEM vendors are processed first) to details_verified when the row already
|
||||||
|
* has complete normalized technical specs and the vendor has a public docs,
|
||||||
|
* datasheet-library, or website source recorded in the vendors table.
|
||||||
|
*
|
||||||
|
* This deliberately does not verify price/image/competitor signals.
|
||||||
|
*/
|
||||||
|
import { pool, checkAndSetFullyVerified } from "./db";
|
||||||
|
import { logger } from "./logger";
|
||||||
|
|
||||||
|
// Maximum number of rows promoted per run when CATALOG_DETAILS_LIMIT is not set.
const DEFAULT_LIMIT = 5000;
|
||||||
|
|
||||||
|
/** Row returned by the promotion query: the id of a transceiver that was updated. */
interface Candidate {
  id: string;
}
|
||||||
|
|
||||||
|
/**
 * Marks up to `limit` transceivers as details-verified in a single statement,
 * then re-evaluates the fully-verified flag for each promoted row.
 *
 * A row is eligible when it is not yet details-verified, is not flagged
 * `garbage`, does not point at a category URL, has a non-empty part number,
 * form factor, reach label and fiber type, a positive `speed_gbps`, and its
 * vendor has at least one of datasheet library URL, docs portal URL or
 * website recorded. That vendor URL becomes `details_source_url` when the row
 * has none. Eligible rows are taken in priority order: vendors of type `oem`
 * first, then the `network_switching` / `optics_vendor` categories, then the
 * rest, ordered by vendor name and part number within each tier.
 *
 * Price and image flags are not touched by the statement itself.
 *
 * @param limit Maximum number of rows to promote in this run.
 */
async function verifyCatalogDetails(limit: number): Promise<void> {
  logger.info("=== Catalog Details Verifier ===", { limit });

  const promoteSql = `
    WITH candidate AS (
      SELECT
        t.id,
        COALESCE(NULLIF(v.datasheet_library_url, ''), NULLIF(v.docs_portal_url, ''), NULLIF(v.website, '')) AS source_url
      FROM transceivers t
      JOIN vendors v ON v.id = t.vendor_id
      WHERE COALESCE(t.details_verified, false) = false
        AND COALESCE(t.data_confidence, 'unknown') != 'garbage'
        AND COALESCE(t.product_page_url, '') NOT LIKE '%/category/%'
        AND COALESCE(NULLIF(v.datasheet_library_url, ''), NULLIF(v.docs_portal_url, ''), NULLIF(v.website, '')) IS NOT NULL
        AND t.part_number IS NOT NULL
        AND t.part_number != ''
        AND t.form_factor IS NOT NULL
        AND t.form_factor != ''
        AND t.speed_gbps IS NOT NULL
        AND t.speed_gbps > 0
        AND t.reach_label IS NOT NULL
        AND t.reach_label != ''
        AND t.fiber_type IS NOT NULL
        AND t.fiber_type != ''
      ORDER BY
        CASE
          WHEN v.type = 'oem' THEN 0
          WHEN v.vendor_category IN ('network_switching', 'optics_vendor') THEN 1
          ELSE 2
        END,
        v.name,
        t.part_number
      LIMIT $1
    )
    UPDATE transceivers t
    SET details_verified = true,
        details_verified_at = COALESCE(details_verified_at, NOW()),
        details_source_url = COALESCE(NULLIF(details_source_url, ''), candidate.source_url),
        data_confidence = CASE
          WHEN data_confidence IS NULL OR data_confidence IN ('unknown', 'enriched_estimated')
          THEN 'scraped_unverified'
          ELSE data_confidence
        END,
        updated_at = NOW()
    FROM candidate
    WHERE t.id = candidate.id
    RETURNING t.id
  `;

  const promoted = await pool.query<Candidate>(promoteSql, [limit]);

  // Checked one row at a time, in the order the rows were returned.
  let fullyVerifiedEarned = 0;
  for (const { id } of promoted.rows) {
    const earned = await checkAndSetFullyVerified(id);
    if (earned) fullyVerifiedEarned += 1;
  }

  logger.info("Catalog details verifier complete", {
    details_verified: promoted.rowCount ?? 0,
    fully_verified_earned: fullyVerifiedEarned,
  });
}
|
||||||
|
|
||||||
|
// CLI entry point: runs the verifier once when this file is executed directly.
if (require.main === module) {
  // A missing or non-numeric CATALOG_DETAILS_LIMIT falls back to the default
  // instead of producing NaN, which would be sent to the database as LIMIT NaN.
  // Numeric values below 1 are clamped to 1, as before.
  const parsedLimit = Number.parseInt(process.env.CATALOG_DETAILS_LIMIT ?? "", 10);
  const limit = Number.isNaN(parsedLimit) ? DEFAULT_LIMIT : Math.max(1, parsedLimit);

  void (async () => {
    let failed = false;

    try {
      await verifyCatalogDetails(limit);
    } catch (err: unknown) {
      failed = true;
      logger.error("Catalog details verifier failed", {
        error: err instanceof Error ? err.message : String(err),
      });
    }

    // Close the pool exactly once, and wait for it before exiting so that
    // connections are released cleanly on both the success and failure paths.
    try {
      await pool.end();
    } catch (err: unknown) {
      failed = true;
      logger.error("Catalog details verifier failed to close the database pool", {
        error: err instanceof Error ? err.message : String(err),
      });
    }

    if (failed) process.exit(1);
  })();
}
|
||||||
@ -1,9 +1,65 @@
|
|||||||
# Current TIP Sync State
|
# Current TIP Sync State
|
||||||
|
|
||||||
Updated: 2026-05-09 18:07 UTC
|
Updated: 2026-05-09 18:16 UTC
|
||||||
|
|
||||||
## Newest Work
|
## Newest Work
|
||||||
|
|
||||||
|
- TIP global verification continuation on 2026-05-09:
|
||||||
|
- operator requirement:
|
||||||
|
- continue until all possible product data is searched, found, verified, and source-backed
|
||||||
|
- no external AI; use TIP deterministic scrapers/robots only
|
||||||
|
- keep Erik safe; do not launch a heavy crawler wave
|
||||||
|
- write crawler/scraper/robot learnings into the TIPLLM training pool
|
||||||
|
- deployed fixes:
|
||||||
|
- repaired GAO Tek scraper for the live Woodmart product grid:
|
||||||
|
- current selector is `.wd-product.product-grid-item`
|
||||||
|
- product title selector includes `.wd-entities-title a`
|
||||||
|
- SKU selector includes `.wd-sku`
|
||||||
|
- fallback now only accepts real `https://gaotek.com/product/...` URLs
|
||||||
|
- category URLs are excluded from active verification/search counters
|
||||||
|
- expanded GAO reach parsing:
|
||||||
|
- 1/2/10/15/20/30/40/50/80/120/140/160 km
|
||||||
|
- 82/100/300/500/550 m
|
||||||
|
- mile values converted to rounded km labels
|
||||||
|
- added `packages/scraper/src/utils/verify-catalog-details.ts`
|
||||||
|
- promotes details only for complete normalized catalog specs with a vendor website/docs/datasheet source URL
|
||||||
|
- does not mark price/image/competitor verified
|
||||||
|
- hardened scheduler reconcile so category URLs are not promoted as details source
|
||||||
|
- fixed Flexoptix image backfill vendor-name case bug (`Flexoptix` vs `FLEXOPTIX`)
|
||||||
|
- expanded other-vendor image backfill list for Cisco, Juniper, Arista, 10Gtek, QSFPTEK, SFPcables, Coherent, NADDOD
|
||||||
|
- crawler/robot runs:
|
||||||
|
- GAO Tek scraper:
|
||||||
|
- fetched 20 pages
|
||||||
|
- extracted 480 real product cards
|
||||||
|
- found 0 public prices
|
||||||
|
- reset 6 category/non-product artifacts
|
||||||
|
- pi-fetch priority wave:
|
||||||
|
- GAO Tek, Juniper OEM/MX/QFX, Cisco Nexus/Catalyst/ASR, Ascent, Eoptolink, Flexoptix, Flexoptix supported vendors, Arista OEM
|
||||||
|
- all jobs completed
|
||||||
|
- reconcile completed
|
||||||
|
- equivalence matcher completed
|
||||||
|
- catalog-details verifier promoted 4,340 details
|
||||||
|
- image backfill:
|
||||||
|
- first expanded run updated 48 images
|
||||||
|
- Flexoptix case fix then updated 12 additional images
|
||||||
|
- live public TIP health after this pass:
|
||||||
|
- status `healthy`
|
||||||
|
- load status `ok`
|
||||||
|
- memory used `13%`
|
||||||
|
- active total `17,714`
|
||||||
|
- `price_verified=11,582`
|
||||||
|
- `image_verified=12,194`
|
||||||
|
- `details_verified=16,684`
|
||||||
|
- `fully_verified=11,052`
|
||||||
|
- hard truth:
|
||||||
|
- GAO Tek appears quote-only/no public price in the crawled catalog, so prices remain unverified rather than fabricated
|
||||||
|
- many OEM rows now have verified details but still lack public prices/images/competitor evidence
|
||||||
|
- Flexoptix still has 110 image-missing SKUs after GraphQL returned no usable image for those SKUs
|
||||||
|
- top remaining blockers are mostly public price/image/competitor availability, not detail parsing
|
||||||
|
- TIPLLM training pool:
|
||||||
|
- appended `robot-experiences/2026-05-09.jsonl`
|
||||||
|
- validated JSONL locally
|
||||||
|
|
||||||
- MAGATAMA FO_BlogLLM RunPod training and adoption closure on 2026-05-09:
|
- MAGATAMA FO_BlogLLM RunPod training and adoption closure on 2026-05-09:
|
||||||
- operator requirement:
|
- operator requirement:
|
||||||
- training success must only count after artifact exists, local import works, smoke tests pass, Ollama alias/version switches, remote MAGATAMA registry is updated, and the live UI reports no active stale job
|
- training success must only count after artifact exists, local import works, smoke tests pass, Ollama alias/version switches, remote MAGATAMA registry is updated, and the live UI reports no active stale job
|
||||||
|
|||||||
@ -0,0 +1,57 @@
|
|||||||
|
# TIP Global Verification Continuation — 2026-05-09
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
- Continue TIP verification with deterministic scrapers/robots only.
|
||||||
|
- Keep Erik safe; no heavy Playwright/proxmox-heavy wave.
|
||||||
|
- Write learnings into the TIPLLM training pool.
|
||||||
|
|
||||||
|
## Implemented
|
||||||
|
|
||||||
|
- Repaired GAO Tek scraper for the current Woodmart product-card layout.
|
||||||
|
- Excluded category URLs from active product verification/search counters.
|
||||||
|
- Added a catalog-details verifier for complete source-backed OEM/catalog specs.
|
||||||
|
- Fixed Flexoptix image backfill case sensitivity.
|
||||||
|
- Expanded `og:image` backfill vendor coverage.
|
||||||
|
- Hardened scheduler reconcile so category URLs are not promoted as details source.
|
||||||
|
|
||||||
|
## Live Runs
|
||||||
|
|
||||||
|
- GAO Tek:
|
||||||
|
- 20 pages fetched.
|
||||||
|
- 480 real product cards extracted.
|
||||||
|
- 0 public prices found.
|
||||||
|
- 6 category/non-product artifacts reset.
|
||||||
|
- Priority pi-fetch wave:
|
||||||
|
- GAO Tek, Juniper OEM/MX/QFX, Cisco Nexus/Catalyst/ASR, Ascent, Eoptolink, Flexoptix, Flexoptix supported vendors, Arista OEM.
|
||||||
|
- All jobs completed.
|
||||||
|
- Reconcile completed.
|
||||||
|
- Equivalence matcher completed.
|
||||||
|
- Catalog-details verifier:
|
||||||
|
- 4,340 details verified.
|
||||||
|
- Image backfill:
|
||||||
|
- 48 images from expanded vendor list.
|
||||||
|
- 12 additional Flexoptix images after case-insensitive vendor fix.
|
||||||
|
|
||||||
|
## Final Observed State
|
||||||
|
|
||||||
|
- Public health: healthy.
|
||||||
|
- Load: ok.
|
||||||
|
- Memory: 13%.
|
||||||
|
- Active total: 17,714.
|
||||||
|
- Price verified: 11,582.
|
||||||
|
- Image verified: 12,194.
|
||||||
|
- Details verified: 16,684.
|
||||||
|
- Fully verified: 11,052.
|
||||||
|
|
||||||
|
## Remaining Truth
|
||||||
|
|
||||||
|
- GAO Tek is quote-only/no public price in the crawled catalog; prices were not fabricated.
|
||||||
|
- Many OEM rows now have verified details but still need public images/prices/competitor evidence.
|
||||||
|
- Flexoptix still has 110 image-missing SKUs after GraphQL returned no image.
|
||||||
|
- Top remaining blockers are dominated by price/image/competitor availability.
|
||||||
|
|
||||||
|
## Training Pool
|
||||||
|
|
||||||
|
- Appended one JSONL event to `/tmp/tip-training-data/robot-experiences/2026-05-09.jsonl`.
|
||||||
|
- JSONL validated successfully.
|
||||||
Loading…
x
Reference in New Issue
Block a user