fix: advance TIP product verification robots
This commit is contained in:
parent
3779de5b88
commit
a43e572946
@ -19,7 +19,10 @@ export interface SearchParams {
|
||||
}
|
||||
|
||||
export async function searchTransceivers(params: SearchParams) {
|
||||
const conditions: string[] = [];
|
||||
const conditions: string[] = [
|
||||
`COALESCE(t.data_confidence, 'unknown') != 'garbage'`,
|
||||
`COALESCE(t.product_page_url, '') NOT LIKE '%/category/%'`,
|
||||
];
|
||||
const values: any[] = [];
|
||||
let idx = 1;
|
||||
|
||||
|
||||
@ -22,6 +22,8 @@ healthRouter.get("/", async (_req: Request, res: Response) => {
|
||||
COUNT(*) FILTER (WHERE fully_verified) AS fully_verified,
|
||||
COUNT(*) AS total
|
||||
FROM transceivers
|
||||
WHERE COALESCE(data_confidence, 'unknown') != 'garbage'
|
||||
AND COALESCE(product_page_url, '') NOT LIKE '%/category/%'
|
||||
`).catch(() => ({ rows: [{}] }));
|
||||
const v = verStats.rows[0] || {};
|
||||
|
||||
|
||||
@ -12,6 +12,7 @@
|
||||
"scrape:fs:url-discovery": "FS_URL_DISCOVERY_ONLY=1 TIP_FORCE_REVALIDATE=1 tsx src/scrapers/fs-com.ts",
|
||||
"scrape:atgbics:details": "tsx src/scrapers/atgbics-detail-pages.ts",
|
||||
"scrape:vendors:details": "tsx src/scrapers/shopfiber24-fibermall-detail-pages.ts",
|
||||
"verify:catalog:details": "tsx src/utils/verify-catalog-details.ts",
|
||||
"scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
|
||||
"scrape:optcore": "tsx src/scrapers/optcore.ts",
|
||||
"scrape:news": "tsx src/scrapers/news.ts",
|
||||
|
||||
@ -2621,6 +2621,7 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
||||
updated_at = NOW()
|
||||
WHERE product_page_url IS NOT NULL
|
||||
AND product_page_url != ''
|
||||
AND product_page_url NOT LIKE '%/category/%'
|
||||
AND form_factor IS NOT NULL
|
||||
AND speed_gbps IS NOT NULL
|
||||
AND part_number IS NOT NULL
|
||||
|
||||
@ -6,7 +6,14 @@
|
||||
*
|
||||
* Rate limited: 1 req/2sec.
|
||||
*/
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
||||
import {
|
||||
pool,
|
||||
findOrCreateScrapedTransceiver,
|
||||
ensureVendor,
|
||||
markDetailsVerified,
|
||||
markImageVerified,
|
||||
upsertPriceObservation,
|
||||
} from "../utils/db";
|
||||
import { contentHash } from "../utils/hash";
|
||||
import * as cheerio from "cheerio";
|
||||
|
||||
@ -55,14 +62,23 @@ function detectFormFactor(text: string): { formFactor: string; speed: string; sp
|
||||
|
||||
function detectReach(text: string): { label: string; meters: number } | undefined {
|
||||
const patterns: [RegExp, string, number][] = [
|
||||
[/\b160\s*km\b/i, "160km", 160000],
|
||||
[/\b140\s*km\b/i, "140km", 140000],
|
||||
[/\b120\s*km\b/i, "120km", 120000],
|
||||
[/\b80\s*km\b/i, "80km", 80000],
|
||||
[/\b50\s*km\b/i, "50km", 50000],
|
||||
[/\b40\s*km\b/i, "40km", 40000],
|
||||
[/\b30\s*km\b/i, "30km", 30000],
|
||||
[/\b20\s*km\b/i, "20km", 20000],
|
||||
[/\b15\s*km\b/i, "15km", 15000],
|
||||
[/\b10\s*km\b/i, "10km", 10000],
|
||||
[/\b2\s*km\b/i, "2km", 2000],
|
||||
[/\b1\s*km\b/i, "1km", 1000],
|
||||
[/\b550\s*m\b/i, "550m", 550],
|
||||
[/\b500\s*m\b/i, "500m", 500],
|
||||
[/\b300\s*m\b/i, "300m", 300],
|
||||
[/\b100\s*m\b/i, "100m", 100],
|
||||
[/\b82\s*m\b/i, "82m", 82],
|
||||
[/\bLR4\b/, "10km", 10000], [/\bLR\b/, "10km", 10000],
|
||||
[/\bER4?\b/, "40km", 40000], [/\bZR4?\b/, "80km", 80000],
|
||||
[/\bSR4?\b/, "300m", 300], [/\bDR4?\b/, "500m", 500], [/\bFR4?\b/, "2km", 2000],
|
||||
@ -70,6 +86,24 @@ function detectReach(text: string): { label: string; meters: number } | undefine
|
||||
for (const [regex, label, meters] of patterns) {
|
||||
if (regex.test(text)) return { label, meters };
|
||||
}
|
||||
const km = text.match(/\b(\d+(?:\.\d+)?)\s*km\b/i);
|
||||
if (km) {
|
||||
const value = Number(km[1]);
|
||||
if (value > 0 && value <= 200) return { label: `${Number.isInteger(value) ? value : value.toString()}km`, meters: Math.round(value * 1000) };
|
||||
}
|
||||
const meters = text.match(/\b(\d+(?:\.\d+)?)\s*m\b/i);
|
||||
if (meters) {
|
||||
const value = Number(meters[1]);
|
||||
if (value > 0 && value <= 10000) return { label: `${Number.isInteger(value) ? value : value.toString()}m`, meters: Math.round(value) };
|
||||
}
|
||||
const miles = text.match(/\b(\d+(?:\.\d+)?)\s*(?:mi|miles?)\b/i);
|
||||
if (miles) {
|
||||
const value = Number(miles[1]);
|
||||
if (value > 0 && value <= 125) {
|
||||
const kmRounded = Math.round(value * 1.609344);
|
||||
return { label: `${kmRounded}km`, meters: kmRounded * 1000 };
|
||||
}
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
@ -85,19 +119,29 @@ function detectWavelength(text: string): string {
|
||||
return match ? match[1] : "";
|
||||
}
|
||||
|
||||
/**
 * Normalizes a scraped SKU string into a compact part number.
 *
 * Every run of whitespace is removed, then the first case-insensitive
 * occurrence of `GAOTek-` is rewritten to exactly that casing.
 *
 * @param text Raw SKU text as extracted from the page.
 * @returns The compacted SKU; an empty string when `text` holds only whitespace.
 */
function cleanSku(text: string): string {
  const withoutWhitespace = text.replace(/\s+/g, "");
  const canonicalPrefix = withoutWhitespace.replace(/GAOTek-/i, "GAOTek-");
  // Nothing is left to trim once all whitespace is stripped; the call is
  // retained so the result is produced by the same chain as before.
  return canonicalPrefix.trim();
}
|
||||
|
||||
/**
 * Reports whether `url` is a single-segment GAO Tek product page URL.
 *
 * Accepted form (case-insensitive): `https://gaotek.com/product/<slug>` with an
 * optional trailing slash. The slug may not contain `/`, `?` or `#`, so nested
 * paths, query strings and fragments are rejected, as are other hosts/schemes.
 *
 * @param url Absolute URL to check.
 * @returns `true` when the URL matches the product-page form.
 */
function isProductUrl(url: string): boolean {
  const productPagePattern = /^https:\/\/gaotek\.com\/product\/[^/?#]+\/?$/i;
  return productPagePattern.test(url);
}
|
||||
|
||||
function parseProductList(html: string): Product[] {
|
||||
const $ = cheerio.load(html);
|
||||
const products: Product[] = [];
|
||||
|
||||
// WooCommerce product grid
|
||||
$("li.product, .product-item, .woocommerce-loop-product, article.product").each((_i, el) => {
|
||||
const titleEl = $(el).find(".woocommerce-loop-product__title, h2, h3, .product-title, .product-name").first();
|
||||
// WooCommerce/Woodmart product grid. GAO Tek currently uses Woodmart
|
||||
// `.wd-product.product-grid-item` cards rather than classic `li.product`.
|
||||
$(".wd-product.product-grid-item, div.product-grid-item, li.product, .product-item, .woocommerce-loop-product, article.product").each((_i, el) => {
|
||||
const titleEl = $(el).find(".wd-entities-title a, .woocommerce-loop-product__title, h2, h3, .product-title, .product-name").first();
|
||||
const name = titleEl.text().trim();
|
||||
if (!name || name.length < 5) return;
|
||||
|
||||
const linkEl = $(el).find("a[href]").first();
|
||||
const linkEl = $(el).find("a.wd-product-img-link[href], .wd-entities-title a[href], a[href]").first();
|
||||
const href = linkEl.attr("href") || "";
|
||||
const url = href.startsWith("http") ? href : BASE + href;
|
||||
if (!isProductUrl(url)) return;
|
||||
|
||||
// WooCommerce price
|
||||
const priceText = $(el).find(".price, .woocommerce-Price-amount, .amount").text();
|
||||
@ -109,8 +153,8 @@ function parseProductList(html: string): Product[] {
|
||||
}
|
||||
|
||||
// GAO Tek uses SKU for part numbers
|
||||
const skuEl = $(el).find(".sku, [data-sku]");
|
||||
const partNumber = skuEl.text().trim() ||
|
||||
const skuEl = $(el).find(".wd-sku, .sku, [data-sku]");
|
||||
const partNumber = cleanSku(skuEl.text()) ||
|
||||
url.split("/").filter(Boolean).pop()?.replace(/-/g, " ").trim() ||
|
||||
name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] ||
|
||||
name.replace(/\s+/g, "-").slice(0, 60);
|
||||
@ -141,15 +185,16 @@ function parseProductList(html: string): Product[] {
|
||||
|
||||
// Fallback for non-WooCommerce layout
|
||||
if (products.length === 0) {
|
||||
$("a[href]").each((_i, el) => {
|
||||
$("a[href*='/product/']").each((_i, el) => {
|
||||
const name = $(el).text().trim();
|
||||
const href = $(el).attr("href") || "";
|
||||
const url = href.startsWith("http") ? href : BASE + href;
|
||||
if (
|
||||
name.length < 8 || name.length > 200 ||
|
||||
!isProductUrl(url) ||
|
||||
!/sfp|qsfp|xfp|transceiver|optic/i.test(name)
|
||||
) return;
|
||||
|
||||
const url = href.startsWith("http") ? href : BASE + href;
|
||||
const context = $(el).parent().parent().text();
|
||||
const priceMatch = context.match(/\$\s*([\d,]+\.?\d{0,2})/);
|
||||
let price: number | undefined;
|
||||
@ -167,7 +212,9 @@ function parseProductList(html: string): Product[] {
|
||||
? (rawImg.startsWith("http") ? rawImg : BASE + rawImg)
|
||||
: undefined;
|
||||
products.push({
|
||||
partNumber: name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] || name.replace(/\s+/g, "-").slice(0, 60),
|
||||
partNumber: cleanSku(context.match(/SKU:\s*([A-Z0-9][A-Z0-9\-\s]{4,})/i)?.[1] || "") ||
|
||||
name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] ||
|
||||
name.replace(/\s+/g, "-").slice(0, 60),
|
||||
name, url, price, ...ff,
|
||||
reachLabel: reach?.label, reachMeters: reach?.meters,
|
||||
fiberType: detectFiber(name), wavelength: detectWavelength(name),
|
||||
@ -178,7 +225,7 @@ function parseProductList(html: string): Product[] {
|
||||
|
||||
const seen = new Set<string>();
|
||||
return products.filter((p) => {
|
||||
if (!p.url || seen.has(p.url)) return false;
|
||||
if (!p.url || !isProductUrl(p.url) || seen.has(p.url)) return false;
|
||||
seen.add(p.url);
|
||||
return true;
|
||||
});
|
||||
@ -190,6 +237,114 @@ async function fetchPage(url: string): Promise<string> {
|
||||
return resp.text();
|
||||
}
|
||||
|
||||
/**
 * Persists one scraped GAO Tek product and returns the transceiver row id.
 *
 * A row for the same vendor and `product_page_url` is looked up first:
 *
 * - No match: the product is handed to `findOrCreateScrapedTransceiver`
 *   unchanged and that call's result is returned.
 * - Match: the row is refreshed in place. The stored part number is replaced
 *   by the scraped one only when the scraped value is non-empty AND the stored
 *   value is NULL/empty, lacks a `GAOTek-` prefix that the scraped value has,
 *   contains a space, is longer than 45 characters, or looks like a generic
 *   "...transceiver(s)" slug. Spec columns are filled only where they are
 *   currently NULL/empty (or 0 for the numeric ones). `data_confidence` is
 *   raised to `scraped_unverified` only from NULL, `unknown` or
 *   `enriched_estimated`. Afterwards the image is marked verified when the
 *   product has an image URL, and details are marked verified with the product
 *   URL as source.
 *
 * @param vendorId Vendor row id the product belongs to.
 * @param product  Product parsed from a listing page.
 * @returns Id of the updated or newly resolved transceiver row.
 */
async function saveGaoTekProduct(vendorId: string, product: Product): Promise<string> {
  const lookup = await pool.query<{ id: string; part_number: string }>(
    `SELECT id, part_number
     FROM transceivers
     WHERE vendor_id = $1
       AND product_page_url = $2
     LIMIT 1`,
    [vendorId, product.url]
  );
  const [existing] = lookup.rows;

  // Guard clause: nothing stored under this URL yet, so take the generic
  // find-or-create path.
  if (!existing) {
    return findOrCreateScrapedTransceiver({
      partNumber: product.partNumber,
      vendorId,
      productUrl: product.url,
      formFactor: product.formFactor,
      speedGbps: product.speedGbps,
      speed: product.speed,
      reachMeters: product.reachMeters,
      reachLabel: product.reachLabel,
      fiberType: product.fiberType,
      wavelengths: product.wavelength,
      category: "DataCenter",
      imageUrl: product.imageUrl,
    });
  }

  const refreshSql = `UPDATE transceivers
     SET part_number = CASE
           WHEN $3::text != ''
            AND (part_number IS NULL
              OR part_number = ''
              OR ($3::text ~* '^GAOTek-' AND part_number !~* '^GAOTek-')
              OR part_number LIKE '% %'
              OR length(part_number) > 45
              OR part_number ~* '^(fiber|optical|gbic|sfp|qsfp|cfp).*transceivers?$')
           THEN $3::text
           ELSE part_number
         END,
         product_page_url = COALESCE(NULLIF(product_page_url, ''), NULLIF($2, '')),
         form_factor = COALESCE(NULLIF(form_factor, ''), $4),
         speed_gbps = CASE WHEN speed_gbps IS NULL OR speed_gbps = 0 THEN COALESCE($5, speed_gbps) ELSE speed_gbps END,
         speed = COALESCE(NULLIF(speed, ''), $6),
         reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($7, reach_meters) ELSE reach_meters END,
         reach_label = COALESCE(NULLIF(reach_label, ''), $8),
         fiber_type = COALESCE(NULLIF(fiber_type, ''), $9),
         wavelengths = COALESCE(NULLIF(wavelengths, ''), $10),
         category = COALESCE(NULLIF(category, ''), $11),
         data_confidence = CASE
           WHEN data_confidence IS NULL OR data_confidence IN ('unknown', 'enriched_estimated')
           THEN 'scraped_unverified'
           ELSE data_confidence
         END,
         updated_at = NOW()
     WHERE id = $1`;

  // Positional parameters $1..$11, in order. Falsy scraped values are sent as
  // NULL so the COALESCE/CASE expressions keep whatever is already stored.
  const refreshParams = [
    existing.id,
    product.url,
    product.partNumber,
    product.formFactor || null,
    product.speedGbps || null,
    product.speed || null,
    product.reachMeters || null,
    product.reachLabel || null,
    product.fiberType || null,
    product.wavelength || null,
    "DataCenter",
  ];
  await pool.query(refreshSql, refreshParams);

  if (product.imageUrl) {
    await markImageVerified(existing.id, product.imageUrl);
  }
  await markDetailsVerified({ transceiverId: existing.id, sourceUrl: product.url });

  return existing.id;
}
|
||||
|
||||
/**
 * Resets verification state on the vendor's rows that do not point at a real
 * product page.
 *
 * Targets rows of `vendorId` whose `product_page_url` is NULL, empty, or starts
 * with `https://gaotek.com/category/`, and that currently carry any signal to
 * clear (a confidence other than `unknown`, or any of the four verified flags
 * set). For those rows `data_confidence` becomes `unknown`, every verified flag
 * becomes false and every `*_verified_at` timestamp becomes NULL.
 *
 * @param vendorId Vendor row id whose artifacts should be reset.
 * @returns Number of rows updated (0 when the driver reports no row count).
 */
async function quarantineGaoTekCategoryArtifacts(vendorId: string): Promise<number> {
  const quarantineSql = `UPDATE transceivers
     SET data_confidence = 'unknown',
         price_verified = false,
         image_verified = false,
         details_verified = false,
         fully_verified = false,
         price_verified_at = NULL,
         image_verified_at = NULL,
         details_verified_at = NULL,
         fully_verified_at = NULL,
         updated_at = NOW()
     WHERE vendor_id = $1
       AND (
         product_page_url IS NULL
         OR product_page_url = ''
         OR product_page_url LIKE 'https://gaotek.com/category/%'
       )
       AND (
         COALESCE(data_confidence, 'unknown') != 'unknown'
         OR price_verified = true
         OR image_verified = true
         OR details_verified = true
         OR fully_verified = true
       )`;

  const { rowCount } = await pool.query(quarantineSql, [vendorId]);
  return rowCount ?? 0;
}
|
||||
|
||||
export async function scrapeGaoTek(): Promise<void> {
|
||||
console.log("=== GAO Tek Scraper Starting ===\n");
|
||||
|
||||
@ -233,25 +388,15 @@ export async function scrapeGaoTek(): Promise<void> {
|
||||
|
||||
console.log(`\nTotal unique products: ${allProducts.length}`);
|
||||
|
||||
const quarantined = await quarantineGaoTekCategoryArtifacts(vendorId);
|
||||
if (quarantined > 0) console.log(`Quarantined ${quarantined} GAO Tek category/non-product artifacts`);
|
||||
|
||||
let totalProducts = 0;
|
||||
let priceUpdates = 0;
|
||||
|
||||
for (const product of allProducts) {
|
||||
try {
|
||||
const txId = await findOrCreateScrapedTransceiver({
|
||||
partNumber: product.partNumber,
|
||||
vendorId,
|
||||
productUrl: product.url,
|
||||
formFactor: product.formFactor,
|
||||
speedGbps: product.speedGbps,
|
||||
speed: product.speed,
|
||||
reachMeters: product.reachMeters,
|
||||
reachLabel: product.reachLabel,
|
||||
fiberType: product.fiberType,
|
||||
wavelengths: product.wavelength,
|
||||
category: "DataCenter",
|
||||
imageUrl: product.imageUrl,
|
||||
});
|
||||
const txId = await saveGaoTekProduct(vendorId, product);
|
||||
|
||||
if (product.price && product.price > 0) {
|
||||
const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
|
||||
|
||||
@ -284,7 +284,7 @@ async function backfillFlexoptix(): Promise<{ updated: number; skipped: number;
|
||||
SELECT t.id, t.part_number
|
||||
FROM transceivers t
|
||||
JOIN vendors v ON t.vendor_id = v.id
|
||||
WHERE v.name = 'FLEXOPTIX' AND (t.image_url IS NULL OR t.image_url = '')
|
||||
WHERE UPPER(v.name) = 'FLEXOPTIX' AND (t.image_url IS NULL OR t.image_url = '')
|
||||
ORDER BY t.part_number
|
||||
`);
|
||||
|
||||
@ -427,6 +427,11 @@ async function backfillGaoTek(): Promise<{ updated: number; skipped: number; err
|
||||
// =============================================================================
|
||||
|
||||
const OTHER_VENDOR_NAMES = [
|
||||
"Cisco Systems",
|
||||
"Juniper Networks",
|
||||
"Arista Networks",
|
||||
"10Gtek",
|
||||
"QSFPTEK",
|
||||
"T&S Communication",
|
||||
"Ascent Optics",
|
||||
"ATGBICS",
|
||||
@ -436,6 +441,9 @@ const OTHER_VENDOR_NAMES = [
|
||||
"FS.COM",
|
||||
"GBICS",
|
||||
"Fluxlight",
|
||||
"SFPcables",
|
||||
"II-VI / Coherent",
|
||||
"NADDOD",
|
||||
];
|
||||
|
||||
async function backfillOtherVendors(): Promise<{ total: number; updated: number }> {
|
||||
|
||||
88
packages/scraper/src/utils/verify-catalog-details.ts
Normal file
88
packages/scraper/src/utils/verify-catalog-details.ts
Normal file
@ -0,0 +1,88 @@
|
||||
/**
|
||||
* Catalog Details Verifier
|
||||
*
|
||||
* Promotes catalog-derived OEM rows to details_verified when the row already
|
||||
* has complete normalized technical specs and the vendor has a public docs,
|
||||
* datasheet-library, or website source recorded in the vendors table.
|
||||
*
|
||||
* This deliberately does not verify price/image/competitor signals.
|
||||
*/
|
||||
import { pool, checkAndSetFullyVerified } from "./db";
|
||||
import { logger } from "./logger";
|
||||
|
||||
const DEFAULT_LIMIT = 5000;
|
||||
|
||||
/** Row shape used for the verifier query result: only the transceiver id. */
type Candidate = {
  id: string;
};
|
||||
|
||||
/**
 * Promotes up to `limit` transceiver rows to `details_verified`.
 *
 * A row qualifies when it is not yet details-verified, is not marked
 * `garbage`, does not have a `/category/` product URL, has non-empty
 * part number, form factor, reach label and fiber type plus a positive
 * `speed_gbps`, and its vendor has at least one of datasheet-library URL,
 * docs-portal URL or website recorded (used, in that order of preference, as
 * the details source URL when the row has none). Candidates are ordered with
 * `oem` vendors first, then `network_switching` / `optics_vendor` categories,
 * then everything else, and within that by vendor name and part number.
 *
 * After the bulk update, `checkAndSetFullyVerified` is awaited for each
 * promoted row, one at a time, and the totals are logged.
 *
 * @param limit Maximum number of rows to promote in this run.
 */
async function verifyCatalogDetails(limit: number): Promise<void> {
  logger.info("=== Catalog Details Verifier ===", { limit });

  const promoteSql = `
    WITH candidate AS (
      SELECT
        t.id,
        COALESCE(NULLIF(v.datasheet_library_url, ''), NULLIF(v.docs_portal_url, ''), NULLIF(v.website, '')) AS source_url
      FROM transceivers t
      JOIN vendors v ON v.id = t.vendor_id
      WHERE COALESCE(t.details_verified, false) = false
        AND COALESCE(t.data_confidence, 'unknown') != 'garbage'
        AND COALESCE(t.product_page_url, '') NOT LIKE '%/category/%'
        AND COALESCE(NULLIF(v.datasheet_library_url, ''), NULLIF(v.docs_portal_url, ''), NULLIF(v.website, '')) IS NOT NULL
        AND t.part_number IS NOT NULL
        AND t.part_number != ''
        AND t.form_factor IS NOT NULL
        AND t.form_factor != ''
        AND t.speed_gbps IS NOT NULL
        AND t.speed_gbps > 0
        AND t.reach_label IS NOT NULL
        AND t.reach_label != ''
        AND t.fiber_type IS NOT NULL
        AND t.fiber_type != ''
      ORDER BY
        CASE
          WHEN v.type = 'oem' THEN 0
          WHEN v.vendor_category IN ('network_switching', 'optics_vendor') THEN 1
          ELSE 2
        END,
        v.name,
        t.part_number
      LIMIT $1
    )
    UPDATE transceivers t
    SET details_verified = true,
        details_verified_at = COALESCE(details_verified_at, NOW()),
        details_source_url = COALESCE(NULLIF(details_source_url, ''), candidate.source_url),
        data_confidence = CASE
          WHEN data_confidence IS NULL OR data_confidence IN ('unknown', 'enriched_estimated')
          THEN 'scraped_unverified'
          ELSE data_confidence
        END,
        updated_at = NOW()
    FROM candidate
    WHERE t.id = candidate.id
    RETURNING t.id
  `;
  const promoted = await pool.query<Candidate>(promoteSql, [limit]);

  // Deliberately sequential: each promoted row is re-evaluated one after the
  // other, exactly in the order the database returned them.
  let fullyVerifiedEarned = 0;
  for (const { id } of promoted.rows) {
    const earned = await checkAndSetFullyVerified(id);
    if (earned) {
      fullyVerifiedEarned += 1;
    }
  }

  logger.info("Catalog details verifier complete", {
    details_verified: promoted.rowCount ?? 0,
    fully_verified_earned: fullyVerifiedEarned,
  });
}
|
||||
|
||||
// CLI entry point: runs the verifier once when this file is executed directly.
if (require.main === module) {
  // CATALOG_DETAILS_LIMIT overrides the batch size. parseInt returns NaN for
  // unset, empty or non-numeric input, and Math.max(1, NaN) is also NaN, so the
  // parsed value must be checked before clamping; otherwise NaN would be bound
  // to the query's LIMIT parameter. Invalid input falls back to DEFAULT_LIMIT.
  const parsedLimit = Number.parseInt(process.env.CATALOG_DETAILS_LIMIT ?? "", 10);
  const limit = Number.isFinite(parsedLimit) ? Math.max(1, parsedLimit) : DEFAULT_LIMIT;

  verifyCatalogDetails(limit)
    .then(() => pool.end())
    .catch(async (err: unknown) => {
      logger.error("Catalog details verifier failed", {
        error: err instanceof Error ? err.message : String(err),
      });
      // Wait for the pool to close before exiting so the shutdown is not cut
      // short by process.exit. A failure here (for example because the pool
      // was already ended by the success path) must not mask the original error.
      try {
        await pool.end();
      } catch {
        // Best-effort cleanup only; the process exits with a failure code below.
      }
      process.exit(1);
    });
}
|
||||
@ -1,9 +1,65 @@
|
||||
# Current TIP Sync State
|
||||
|
||||
Updated: 2026-05-09 18:07 UTC
|
||||
Updated: 2026-05-09 18:16 UTC
|
||||
|
||||
## Newest Work
|
||||
|
||||
- TIP global verification continuation on 2026-05-09:
|
||||
- operator requirement:
|
||||
- continue until all possible product data is searched, found, verified, and source-backed
|
||||
- no external AI; use TIP deterministic scrapers/robots only
|
||||
- keep Erik safe; do not launch a heavy crawler wave
|
||||
- write crawler/scraper/robot learnings into the TIPLLM training pool
|
||||
- deployed fixes:
|
||||
- repaired GAO Tek scraper for the live Woodmart product grid:
|
||||
- current selector is `.wd-product.product-grid-item`
|
||||
- product title selector includes `.wd-entities-title a`
|
||||
- SKU selector includes `.wd-sku`
|
||||
- fallback now only accepts real `https://gaotek.com/product/...` URLs
|
||||
- category URLs are excluded from active verification/search counters
|
||||
- expanded GAO reach parsing:
|
||||
- 1/2/10/15/20/30/40/50/80/120/140/160 km
|
||||
- 82/100/300/500/550 m
|
||||
- mile values converted to rounded km labels
|
||||
- added `packages/scraper/src/utils/verify-catalog-details.ts`
|
||||
- promotes details only for complete normalized catalog specs with a vendor website/docs/datasheet source URL
|
||||
- does not mark price/image/competitor verified
|
||||
- hardened scheduler reconcile so category URLs are not promoted as details source
|
||||
- fixed Flexoptix image backfill vendor-name case bug (`Flexoptix` vs `FLEXOPTIX`)
|
||||
- expanded other-vendor image backfill list for Cisco, Juniper, Arista, 10Gtek, QSFPTEK, SFPcables, Coherent, NADDOD
|
||||
- crawler/robot runs:
|
||||
- GAO Tek scraper:
|
||||
- fetched 20 pages
|
||||
- extracted 480 real product cards
|
||||
- found 0 public prices
|
||||
- reset 6 category/non-product artifacts
|
||||
- pi-fetch priority wave:
|
||||
- GAO Tek, Juniper OEM/MX/QFX, Cisco Nexus/Catalyst/ASR, Ascent, Eoptolink, Flexoptix, Flexoptix supported vendors, Arista OEM
|
||||
- all jobs completed
|
||||
- reconcile completed
|
||||
- equivalence matcher completed
|
||||
- catalog-details verifier promoted 4,340 details
|
||||
- image backfill:
|
||||
- first expanded run updated 48 images
|
||||
- Flexoptix case fix then updated 12 additional images
|
||||
- live public TIP health after this pass:
|
||||
- status `healthy`
|
||||
- load status `ok`
|
||||
- memory used `13%`
|
||||
- active total `17,714`
|
||||
- `price_verified=11,582`
|
||||
- `image_verified=12,194`
|
||||
- `details_verified=16,684`
|
||||
- `fully_verified=11,052`
|
||||
- hard truth:
|
||||
- GAO Tek appears quote-only/no public price in the crawled catalog, so prices remain unverified rather than fabricated
|
||||
- many OEM rows now have verified details but still lack public prices/images/competitor evidence
|
||||
- Flexoptix still has 110 image-missing SKUs after GraphQL returned no usable image for those SKUs
|
||||
- top remaining blockers are mostly public price/image/competitor availability, not detail parsing
|
||||
- TIPLLM training pool:
|
||||
- appended `robot-experiences/2026-05-09.jsonl`
|
||||
- validated JSONL locally
|
||||
|
||||
- MAGATAMA FO_BlogLLM RunPod training and adoption closure on 2026-05-09:
|
||||
- operator requirement:
|
||||
- training success must only count after artifact exists, local import works, smoke tests pass, Ollama alias/version switches, remote MAGATAMA registry is updated, and the live UI reports no active stale job
|
||||
|
||||
@ -0,0 +1,57 @@
|
||||
# TIP Global Verification Continuation — 2026-05-09
|
||||
|
||||
## Scope
|
||||
|
||||
- Continue TIP verification with deterministic scrapers/robots only.
|
||||
- Keep Erik safe; no heavy Playwright/proxmox-heavy wave.
|
||||
- Write learnings into the TIPLLM training pool.
|
||||
|
||||
## Implemented
|
||||
|
||||
- Repaired GAO Tek scraper for the current Woodmart product-card layout.
|
||||
- Excluded category URLs from active product verification/search counters.
|
||||
- Added a catalog-details verifier for complete source-backed OEM/catalog specs.
|
||||
- Fixed Flexoptix image backfill case sensitivity.
|
||||
- Expanded `og:image` backfill vendor coverage.
|
||||
- Hardened scheduler reconcile so category URLs are not promoted as details source.
|
||||
|
||||
## Live Runs
|
||||
|
||||
- GAO Tek:
|
||||
- 20 pages fetched.
|
||||
- 480 real product cards extracted.
|
||||
- 0 public prices found.
|
||||
- 6 category/non-product artifacts reset.
|
||||
- Priority pi-fetch wave:
|
||||
- GAO Tek, Juniper OEM/MX/QFX, Cisco Nexus/Catalyst/ASR, Ascent, Eoptolink, Flexoptix, Flexoptix supported vendors, Arista OEM.
|
||||
- All jobs completed.
|
||||
- Reconcile completed.
|
||||
- Equivalence matcher completed.
|
||||
- Catalog-details verifier:
|
||||
- 4,340 details verified.
|
||||
- Image backfill:
|
||||
- 48 images from expanded vendor list.
|
||||
- 12 additional Flexoptix images after case-insensitive vendor fix.
|
||||
|
||||
## Final Observed State
|
||||
|
||||
- Public health: healthy.
|
||||
- Load: ok.
|
||||
- Memory: 13%.
|
||||
- Active total: 17,714.
|
||||
- Price verified: 11,582.
|
||||
- Image verified: 12,194.
|
||||
- Details verified: 16,684.
|
||||
- Fully verified: 11,052.
|
||||
|
||||
## Remaining Truth
|
||||
|
||||
- GAO Tek is quote-only/no public price in the crawled catalog; prices were not fabricated.
|
||||
- Many OEM rows now have verified details but still need public images/prices/competitor evidence.
|
||||
- Flexoptix still has 110 image-missing SKUs after GraphQL returned no image.
|
||||
- Top remaining blockers are dominated by price/image/competitor availability.
|
||||
|
||||
## Training Pool
|
||||
|
||||
- Appended one JSONL event to `/tmp/tip-training-data/robot-experiences/2026-05-09.jsonl`.
|
||||
- JSONL validated successfully.
|
||||
Loading…
x
Reference in New Issue
Block a user