fix: advance TIP product verification robots

This commit is contained in:
Rene Fichtmueller 2026-05-09 20:19:19 +02:00
parent 3779de5b88
commit a43e572946
9 changed files with 389 additions and 28 deletions

View File

@ -19,7 +19,10 @@ export interface SearchParams {
} }
export async function searchTransceivers(params: SearchParams) { export async function searchTransceivers(params: SearchParams) {
const conditions: string[] = []; const conditions: string[] = [
`COALESCE(t.data_confidence, 'unknown') != 'garbage'`,
`COALESCE(t.product_page_url, '') NOT LIKE '%/category/%'`,
];
const values: any[] = []; const values: any[] = [];
let idx = 1; let idx = 1;

View File

@ -22,6 +22,8 @@ healthRouter.get("/", async (_req: Request, res: Response) => {
COUNT(*) FILTER (WHERE fully_verified) AS fully_verified, COUNT(*) FILTER (WHERE fully_verified) AS fully_verified,
COUNT(*) AS total COUNT(*) AS total
FROM transceivers FROM transceivers
WHERE COALESCE(data_confidence, 'unknown') != 'garbage'
AND COALESCE(product_page_url, '') NOT LIKE '%/category/%'
`).catch(() => ({ rows: [{}] })); `).catch(() => ({ rows: [{}] }));
const v = verStats.rows[0] || {}; const v = verStats.rows[0] || {};

View File

@ -12,6 +12,7 @@
"scrape:fs:url-discovery": "FS_URL_DISCOVERY_ONLY=1 TIP_FORCE_REVALIDATE=1 tsx src/scrapers/fs-com.ts", "scrape:fs:url-discovery": "FS_URL_DISCOVERY_ONLY=1 TIP_FORCE_REVALIDATE=1 tsx src/scrapers/fs-com.ts",
"scrape:atgbics:details": "tsx src/scrapers/atgbics-detail-pages.ts", "scrape:atgbics:details": "tsx src/scrapers/atgbics-detail-pages.ts",
"scrape:vendors:details": "tsx src/scrapers/shopfiber24-fibermall-detail-pages.ts", "scrape:vendors:details": "tsx src/scrapers/shopfiber24-fibermall-detail-pages.ts",
"verify:catalog:details": "tsx src/utils/verify-catalog-details.ts",
"scrape:cisco": "tsx src/scrapers/cisco-tmg.ts", "scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
"scrape:optcore": "tsx src/scrapers/optcore.ts", "scrape:optcore": "tsx src/scrapers/optcore.ts",
"scrape:news": "tsx src/scrapers/news.ts", "scrape:news": "tsx src/scrapers/news.ts",

View File

@ -2621,6 +2621,7 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
updated_at = NOW() updated_at = NOW()
WHERE product_page_url IS NOT NULL WHERE product_page_url IS NOT NULL
AND product_page_url != '' AND product_page_url != ''
AND product_page_url NOT LIKE '%/category/%'
AND form_factor IS NOT NULL AND form_factor IS NOT NULL
AND speed_gbps IS NOT NULL AND speed_gbps IS NOT NULL
AND part_number IS NOT NULL AND part_number IS NOT NULL

View File

@ -6,7 +6,14 @@
* *
* Rate limited: 1 req/2sec. * Rate limited: 1 req/2sec.
*/ */
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db"; import {
pool,
findOrCreateScrapedTransceiver,
ensureVendor,
markDetailsVerified,
markImageVerified,
upsertPriceObservation,
} from "../utils/db";
import { contentHash } from "../utils/hash"; import { contentHash } from "../utils/hash";
import * as cheerio from "cheerio"; import * as cheerio from "cheerio";
@ -55,14 +62,23 @@ function detectFormFactor(text: string): { formFactor: string; speed: string; sp
function detectReach(text: string): { label: string; meters: number } | undefined { function detectReach(text: string): { label: string; meters: number } | undefined {
const patterns: [RegExp, string, number][] = [ const patterns: [RegExp, string, number][] = [
[/\b160\s*km\b/i, "160km", 160000],
[/\b140\s*km\b/i, "140km", 140000],
[/\b120\s*km\b/i, "120km", 120000],
[/\b80\s*km\b/i, "80km", 80000], [/\b80\s*km\b/i, "80km", 80000],
[/\b50\s*km\b/i, "50km", 50000],
[/\b40\s*km\b/i, "40km", 40000], [/\b40\s*km\b/i, "40km", 40000],
[/\b30\s*km\b/i, "30km", 30000],
[/\b20\s*km\b/i, "20km", 20000], [/\b20\s*km\b/i, "20km", 20000],
[/\b15\s*km\b/i, "15km", 15000],
[/\b10\s*km\b/i, "10km", 10000], [/\b10\s*km\b/i, "10km", 10000],
[/\b2\s*km\b/i, "2km", 2000], [/\b2\s*km\b/i, "2km", 2000],
[/\b1\s*km\b/i, "1km", 1000],
[/\b550\s*m\b/i, "550m", 550], [/\b550\s*m\b/i, "550m", 550],
[/\b500\s*m\b/i, "500m", 500],
[/\b300\s*m\b/i, "300m", 300], [/\b300\s*m\b/i, "300m", 300],
[/\b100\s*m\b/i, "100m", 100], [/\b100\s*m\b/i, "100m", 100],
[/\b82\s*m\b/i, "82m", 82],
[/\bLR4\b/, "10km", 10000], [/\bLR\b/, "10km", 10000], [/\bLR4\b/, "10km", 10000], [/\bLR\b/, "10km", 10000],
[/\bER4?\b/, "40km", 40000], [/\bZR4?\b/, "80km", 80000], [/\bER4?\b/, "40km", 40000], [/\bZR4?\b/, "80km", 80000],
[/\bSR4?\b/, "300m", 300], [/\bDR4?\b/, "500m", 500], [/\bFR4?\b/, "2km", 2000], [/\bSR4?\b/, "300m", 300], [/\bDR4?\b/, "500m", 500], [/\bFR4?\b/, "2km", 2000],
@ -70,6 +86,24 @@ function detectReach(text: string): { label: string; meters: number } | undefine
for (const [regex, label, meters] of patterns) { for (const [regex, label, meters] of patterns) {
if (regex.test(text)) return { label, meters }; if (regex.test(text)) return { label, meters };
} }
const km = text.match(/\b(\d+(?:\.\d+)?)\s*km\b/i);
if (km) {
const value = Number(km[1]);
if (value > 0 && value <= 200) return { label: `${Number.isInteger(value) ? value : value.toString()}km`, meters: Math.round(value * 1000) };
}
const meters = text.match(/\b(\d+(?:\.\d+)?)\s*m\b/i);
if (meters) {
const value = Number(meters[1]);
if (value > 0 && value <= 10000) return { label: `${Number.isInteger(value) ? value : value.toString()}m`, meters: Math.round(value) };
}
const miles = text.match(/\b(\d+(?:\.\d+)?)\s*(?:mi|miles?)\b/i);
if (miles) {
const value = Number(miles[1]);
if (value > 0 && value <= 125) {
const kmRounded = Math.round(value * 1.609344);
return { label: `${kmRounded}km`, meters: kmRounded * 1000 };
}
}
return undefined; return undefined;
} }
@ -85,19 +119,29 @@ function detectWavelength(text: string): string {
return match ? match[1] : ""; return match ? match[1] : "";
} }
function cleanSku(text: string): string {
return text.replace(/\s+/g, "").replace(/GAOTek-/i, "GAOTek-").trim();
}
function isProductUrl(url: string): boolean {
return /^https:\/\/gaotek\.com\/product\/[^/?#]+\/?$/i.test(url);
}
function parseProductList(html: string): Product[] { function parseProductList(html: string): Product[] {
const $ = cheerio.load(html); const $ = cheerio.load(html);
const products: Product[] = []; const products: Product[] = [];
// WooCommerce product grid // WooCommerce/Woodmart product grid. GAO Tek currently uses Woodmart
$("li.product, .product-item, .woocommerce-loop-product, article.product").each((_i, el) => { // `.wd-product.product-grid-item` cards rather than classic `li.product`.
const titleEl = $(el).find(".woocommerce-loop-product__title, h2, h3, .product-title, .product-name").first(); $(".wd-product.product-grid-item, div.product-grid-item, li.product, .product-item, .woocommerce-loop-product, article.product").each((_i, el) => {
const titleEl = $(el).find(".wd-entities-title a, .woocommerce-loop-product__title, h2, h3, .product-title, .product-name").first();
const name = titleEl.text().trim(); const name = titleEl.text().trim();
if (!name || name.length < 5) return; if (!name || name.length < 5) return;
const linkEl = $(el).find("a[href]").first(); const linkEl = $(el).find("a.wd-product-img-link[href], .wd-entities-title a[href], a[href]").first();
const href = linkEl.attr("href") || ""; const href = linkEl.attr("href") || "";
const url = href.startsWith("http") ? href : BASE + href; const url = href.startsWith("http") ? href : BASE + href;
if (!isProductUrl(url)) return;
// WooCommerce price // WooCommerce price
const priceText = $(el).find(".price, .woocommerce-Price-amount, .amount").text(); const priceText = $(el).find(".price, .woocommerce-Price-amount, .amount").text();
@ -109,8 +153,8 @@ function parseProductList(html: string): Product[] {
} }
// GAO Tek uses SKU for part numbers // GAO Tek uses SKU for part numbers
const skuEl = $(el).find(".sku, [data-sku]"); const skuEl = $(el).find(".wd-sku, .sku, [data-sku]");
const partNumber = skuEl.text().trim() || const partNumber = cleanSku(skuEl.text()) ||
url.split("/").filter(Boolean).pop()?.replace(/-/g, " ").trim() || url.split("/").filter(Boolean).pop()?.replace(/-/g, " ").trim() ||
name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] || name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] ||
name.replace(/\s+/g, "-").slice(0, 60); name.replace(/\s+/g, "-").slice(0, 60);
@ -141,15 +185,16 @@ function parseProductList(html: string): Product[] {
// Fallback for non-WooCommerce layout // Fallback for non-WooCommerce layout
if (products.length === 0) { if (products.length === 0) {
$("a[href]").each((_i, el) => { $("a[href*='/product/']").each((_i, el) => {
const name = $(el).text().trim(); const name = $(el).text().trim();
const href = $(el).attr("href") || ""; const href = $(el).attr("href") || "";
const url = href.startsWith("http") ? href : BASE + href;
if ( if (
name.length < 8 || name.length > 200 || name.length < 8 || name.length > 200 ||
!isProductUrl(url) ||
!/sfp|qsfp|xfp|transceiver|optic/i.test(name) !/sfp|qsfp|xfp|transceiver|optic/i.test(name)
) return; ) return;
const url = href.startsWith("http") ? href : BASE + href;
const context = $(el).parent().parent().text(); const context = $(el).parent().parent().text();
const priceMatch = context.match(/\$\s*([\d,]+\.?\d{0,2})/); const priceMatch = context.match(/\$\s*([\d,]+\.?\d{0,2})/);
let price: number | undefined; let price: number | undefined;
@ -167,7 +212,9 @@ function parseProductList(html: string): Product[] {
? (rawImg.startsWith("http") ? rawImg : BASE + rawImg) ? (rawImg.startsWith("http") ? rawImg : BASE + rawImg)
: undefined; : undefined;
products.push({ products.push({
partNumber: name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] || name.replace(/\s+/g, "-").slice(0, 60), partNumber: cleanSku(context.match(/SKU:\s*([A-Z0-9][A-Z0-9\-\s]{4,})/i)?.[1] || "") ||
name.match(/[A-Z0-9][-A-Z0-9]{5,}/)?.[0] ||
name.replace(/\s+/g, "-").slice(0, 60),
name, url, price, ...ff, name, url, price, ...ff,
reachLabel: reach?.label, reachMeters: reach?.meters, reachLabel: reach?.label, reachMeters: reach?.meters,
fiberType: detectFiber(name), wavelength: detectWavelength(name), fiberType: detectFiber(name), wavelength: detectWavelength(name),
@ -178,7 +225,7 @@ function parseProductList(html: string): Product[] {
const seen = new Set<string>(); const seen = new Set<string>();
return products.filter((p) => { return products.filter((p) => {
if (!p.url || seen.has(p.url)) return false; if (!p.url || !isProductUrl(p.url) || seen.has(p.url)) return false;
seen.add(p.url); seen.add(p.url);
return true; return true;
}); });
@ -190,6 +237,114 @@ async function fetchPage(url: string): Promise<string> {
return resp.text(); return resp.text();
} }
async function saveGaoTekProduct(vendorId: string, product: Product): Promise<string> {
const existingByUrl = await pool.query<{ id: string; part_number: string }>(
`SELECT id, part_number
FROM transceivers
WHERE vendor_id = $1
AND product_page_url = $2
LIMIT 1`,
[vendorId, product.url]
);
if (existingByUrl.rows.length > 0) {
const existing = existingByUrl.rows[0];
await pool.query(
`UPDATE transceivers
SET part_number = CASE
WHEN $3::text != ''
AND (part_number IS NULL
OR part_number = ''
OR ($3::text ~* '^GAOTek-' AND part_number !~* '^GAOTek-')
OR part_number LIKE '% %'
OR length(part_number) > 45
OR part_number ~* '^(fiber|optical|gbic|sfp|qsfp|cfp).*transceivers?$')
THEN $3::text
ELSE part_number
END,
product_page_url = COALESCE(NULLIF(product_page_url, ''), NULLIF($2, '')),
form_factor = COALESCE(NULLIF(form_factor, ''), $4),
speed_gbps = CASE WHEN speed_gbps IS NULL OR speed_gbps = 0 THEN COALESCE($5, speed_gbps) ELSE speed_gbps END,
speed = COALESCE(NULLIF(speed, ''), $6),
reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN COALESCE($7, reach_meters) ELSE reach_meters END,
reach_label = COALESCE(NULLIF(reach_label, ''), $8),
fiber_type = COALESCE(NULLIF(fiber_type, ''), $9),
wavelengths = COALESCE(NULLIF(wavelengths, ''), $10),
category = COALESCE(NULLIF(category, ''), $11),
data_confidence = CASE
WHEN data_confidence IS NULL OR data_confidence IN ('unknown', 'enriched_estimated')
THEN 'scraped_unverified'
ELSE data_confidence
END,
updated_at = NOW()
WHERE id = $1`,
[
existing.id,
product.url,
product.partNumber,
product.formFactor || null,
product.speedGbps || null,
product.speed || null,
product.reachMeters || null,
product.reachLabel || null,
product.fiberType || null,
product.wavelength || null,
"DataCenter",
]
);
if (product.imageUrl) await markImageVerified(existing.id, product.imageUrl);
await markDetailsVerified({ transceiverId: existing.id, sourceUrl: product.url });
return existing.id;
}
return findOrCreateScrapedTransceiver({
partNumber: product.partNumber,
vendorId,
productUrl: product.url,
formFactor: product.formFactor,
speedGbps: product.speedGbps,
speed: product.speed,
reachMeters: product.reachMeters,
reachLabel: product.reachLabel,
fiberType: product.fiberType,
wavelengths: product.wavelength,
category: "DataCenter",
imageUrl: product.imageUrl,
});
}
async function quarantineGaoTekCategoryArtifacts(vendorId: string): Promise<number> {
const result = await pool.query(
`UPDATE transceivers
SET data_confidence = 'unknown',
price_verified = false,
image_verified = false,
details_verified = false,
fully_verified = false,
price_verified_at = NULL,
image_verified_at = NULL,
details_verified_at = NULL,
fully_verified_at = NULL,
updated_at = NOW()
WHERE vendor_id = $1
AND (
product_page_url IS NULL
OR product_page_url = ''
OR product_page_url LIKE 'https://gaotek.com/category/%'
)
AND (
COALESCE(data_confidence, 'unknown') != 'unknown'
OR price_verified = true
OR image_verified = true
OR details_verified = true
OR fully_verified = true
)`,
[vendorId]
);
return result.rowCount ?? 0;
}
export async function scrapeGaoTek(): Promise<void> { export async function scrapeGaoTek(): Promise<void> {
console.log("=== GAO Tek Scraper Starting ===\n"); console.log("=== GAO Tek Scraper Starting ===\n");
@ -233,25 +388,15 @@ export async function scrapeGaoTek(): Promise<void> {
console.log(`\nTotal unique products: ${allProducts.length}`); console.log(`\nTotal unique products: ${allProducts.length}`);
const quarantined = await quarantineGaoTekCategoryArtifacts(vendorId);
if (quarantined > 0) console.log(`Quarantined ${quarantined} GAO Tek category/non-product artifacts`);
let totalProducts = 0; let totalProducts = 0;
let priceUpdates = 0; let priceUpdates = 0;
for (const product of allProducts) { for (const product of allProducts) {
try { try {
const txId = await findOrCreateScrapedTransceiver({ const txId = await saveGaoTekProduct(vendorId, product);
partNumber: product.partNumber,
vendorId,
productUrl: product.url,
formFactor: product.formFactor,
speedGbps: product.speedGbps,
speed: product.speed,
reachMeters: product.reachMeters,
reachLabel: product.reachLabel,
fiberType: product.fiberType,
wavelengths: product.wavelength,
category: "DataCenter",
imageUrl: product.imageUrl,
});
if (product.price && product.price > 0) { if (product.price && product.price > 0) {
const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber })); const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));

View File

@ -284,7 +284,7 @@ async function backfillFlexoptix(): Promise<{ updated: number; skipped: number;
SELECT t.id, t.part_number SELECT t.id, t.part_number
FROM transceivers t FROM transceivers t
JOIN vendors v ON t.vendor_id = v.id JOIN vendors v ON t.vendor_id = v.id
WHERE v.name = 'FLEXOPTIX' AND (t.image_url IS NULL OR t.image_url = '') WHERE UPPER(v.name) = 'FLEXOPTIX' AND (t.image_url IS NULL OR t.image_url = '')
ORDER BY t.part_number ORDER BY t.part_number
`); `);
@ -427,6 +427,11 @@ async function backfillGaoTek(): Promise<{ updated: number; skipped: number; err
// ============================================================================= // =============================================================================
const OTHER_VENDOR_NAMES = [ const OTHER_VENDOR_NAMES = [
"Cisco Systems",
"Juniper Networks",
"Arista Networks",
"10Gtek",
"QSFPTEK",
"T&S Communication", "T&S Communication",
"Ascent Optics", "Ascent Optics",
"ATGBICS", "ATGBICS",
@ -436,6 +441,9 @@ const OTHER_VENDOR_NAMES = [
"FS.COM", "FS.COM",
"GBICS", "GBICS",
"Fluxlight", "Fluxlight",
"SFPcables",
"II-VI / Coherent",
"NADDOD",
]; ];
async function backfillOtherVendors(): Promise<{ total: number; updated: number }> { async function backfillOtherVendors(): Promise<{ total: number; updated: number }> {

View File

@ -0,0 +1,88 @@
/**
* Catalog Details Verifier
*
* Promotes catalog-derived OEM rows to details_verified when the row already
* has complete normalized technical specs and the vendor has a public docs,
* datasheet-library, or website source recorded in the vendors table.
*
* This deliberately does not verify price/image/competitor signals.
*/
import { pool, checkAndSetFullyVerified } from "./db";
import { logger } from "./logger";
const DEFAULT_LIMIT = 5000;
interface Candidate {
id: string;
}
async function verifyCatalogDetails(limit: number): Promise<void> {
logger.info("=== Catalog Details Verifier ===", { limit });
const candidates = await pool.query<Candidate>(`
WITH candidate AS (
SELECT
t.id,
COALESCE(NULLIF(v.datasheet_library_url, ''), NULLIF(v.docs_portal_url, ''), NULLIF(v.website, '')) AS source_url
FROM transceivers t
JOIN vendors v ON v.id = t.vendor_id
WHERE COALESCE(t.details_verified, false) = false
AND COALESCE(t.data_confidence, 'unknown') != 'garbage'
AND COALESCE(t.product_page_url, '') NOT LIKE '%/category/%'
AND COALESCE(NULLIF(v.datasheet_library_url, ''), NULLIF(v.docs_portal_url, ''), NULLIF(v.website, '')) IS NOT NULL
AND t.part_number IS NOT NULL
AND t.part_number != ''
AND t.form_factor IS NOT NULL
AND t.form_factor != ''
AND t.speed_gbps IS NOT NULL
AND t.speed_gbps > 0
AND t.reach_label IS NOT NULL
AND t.reach_label != ''
AND t.fiber_type IS NOT NULL
AND t.fiber_type != ''
ORDER BY
CASE
WHEN v.type = 'oem' THEN 0
WHEN v.vendor_category IN ('network_switching', 'optics_vendor') THEN 1
ELSE 2
END,
v.name,
t.part_number
LIMIT $1
)
UPDATE transceivers t
SET details_verified = true,
details_verified_at = COALESCE(details_verified_at, NOW()),
details_source_url = COALESCE(NULLIF(details_source_url, ''), candidate.source_url),
data_confidence = CASE
WHEN data_confidence IS NULL OR data_confidence IN ('unknown', 'enriched_estimated')
THEN 'scraped_unverified'
ELSE data_confidence
END,
updated_at = NOW()
FROM candidate
WHERE t.id = candidate.id
RETURNING t.id
`, [limit]);
let fullyVerifiedEarned = 0;
for (const row of candidates.rows) {
if (await checkAndSetFullyVerified(row.id)) fullyVerifiedEarned++;
}
logger.info("Catalog details verifier complete", {
details_verified: candidates.rowCount ?? 0,
fully_verified_earned: fullyVerifiedEarned,
});
}
if (require.main === module) {
const limit = Math.max(1, parseInt(process.env.CATALOG_DETAILS_LIMIT || String(DEFAULT_LIMIT), 10));
verifyCatalogDetails(limit)
.then(() => pool.end())
.catch((err) => {
logger.error("Catalog details verifier failed", { error: (err as Error).message });
pool.end();
process.exit(1);
});
}

View File

@ -1,9 +1,65 @@
# Current TIP Sync State # Current TIP Sync State
Updated: 2026-05-09 18:07 UTC Updated: 2026-05-09 18:16 UTC
## Newest Work ## Newest Work
- TIP global verification continuation on 2026-05-09:
- operator requirement:
- continue until all possible product data is searched, found, verified, and source-backed
- no external AI; use TIP deterministic scrapers/robots only
- keep Erik safe; do not launch a heavy crawler wave
- write crawler/scraper/robot learnings into the TIPLLM training pool
- deployed fixes:
- repaired GAO Tek scraper for the live Woodmart product grid:
- current selector is `.wd-product.product-grid-item`
- product title selector includes `.wd-entities-title a`
- SKU selector includes `.wd-sku`
- fallback now only accepts real `https://gaotek.com/product/...` URLs
- category URLs are excluded from active verification/search counters
- expanded GAO reach parsing:
- 1/2/10/15/20/30/40/50/80/120/140/160 km
- 82/100/300/500/550 m
- mile values converted to rounded km labels
- added `packages/scraper/src/utils/verify-catalog-details.ts`
- promotes details only for complete normalized catalog specs with a vendor website/docs/datasheet source URL
- does not mark price/image/competitor verified
- hardened scheduler reconcile so category URLs are not promoted as details source
- fixed Flexoptix image backfill vendor-name case bug (`Flexoptix` vs `FLEXOPTIX`)
- expanded other-vendor image backfill list for Cisco, Juniper, Arista, 10Gtek, QSFPTEK, SFPcables, Coherent, NADDOD
- crawler/robot runs:
- GAO Tek scraper:
- fetched 20 pages
- extracted 480 real product cards
- found 0 public prices
- reset 6 category/non-product artifacts
- pi-fetch priority wave:
- GAO Tek, Juniper OEM/MX/QFX, Cisco Nexus/Catalyst/ASR, Ascent, Eoptolink, Flexoptix, Flexoptix supported vendors, Arista OEM
- all jobs completed
- reconcile completed
- equivalence matcher completed
- catalog-details verifier promoted 4,340 details
- image backfill:
- first expanded run updated 48 images
- Flexoptix case fix then updated 12 additional images
- live public TIP health after this pass:
- status `healthy`
- load status `ok`
- memory used `13%`
- active total `17,714`
- `price_verified=11,582`
- `image_verified=12,194`
- `details_verified=16,684`
- `fully_verified=11,052`
- hard truth:
- GAO Tek appears quote-only/no public price in the crawled catalog, so prices remain unverified rather than fabricated
- many OEM rows now have verified details but still lack public prices/images/competitor evidence
- Flexoptix still has 110 image-missing SKUs after GraphQL returned no usable image for those SKUs
- top remaining blockers are mostly public price/image/competitor availability, not detail parsing
- TIPLLM training pool:
- appended `robot-experiences/2026-05-09.jsonl`
- validated JSONL locally
- MAGATAMA FO_BlogLLM RunPod training and adoption closure on 2026-05-09: - MAGATAMA FO_BlogLLM RunPod training and adoption closure on 2026-05-09:
- operator requirement: - operator requirement:
- training success must only count after artifact exists, local import works, smoke tests pass, Ollama alias/version switches, remote MAGATAMA registry is updated, and the live UI reports no active stale job - training success must only count after artifact exists, local import works, smoke tests pass, Ollama alias/version switches, remote MAGATAMA registry is updated, and the live UI reports no active stale job

View File

@ -0,0 +1,57 @@
# TIP Global Verification Continuation — 2026-05-09
## Scope
- Continue TIP verification with deterministic scrapers/robots only.
- Keep Erik safe; no heavy Playwright/proxmox-heavy wave.
- Write learnings into the TIPLLM training pool.
## Implemented
- Repaired GAO Tek scraper for the current Woodmart product-card layout.
- Excluded category URLs from active product verification/search counters.
- Added a catalog-details verifier for complete source-backed OEM/catalog specs.
- Fixed Flexoptix image backfill case sensitivity.
- Expanded `og:image` backfill vendor coverage.
- Hardened scheduler reconcile so category URLs are not promoted as details source.
## Live Runs
- GAO Tek:
- 20 pages fetched.
- 480 real product cards extracted.
- 0 public prices found.
- 6 category/non-product artifacts reset.
- Priority pi-fetch wave:
- GAO Tek, Juniper OEM/MX/QFX, Cisco Nexus/Catalyst/ASR, Ascent, Eoptolink, Flexoptix, Flexoptix supported vendors, Arista OEM.
- All jobs completed.
- Reconcile completed.
- Equivalence matcher completed.
- Catalog-details verifier:
- 4,340 details verified.
- Image backfill:
- 48 images from expanded vendor list.
- 12 additional Flexoptix images after case-insensitive vendor fix.
## Final Observed State
- Public health: healthy.
- Load: ok.
- Memory: 13%.
- Active total: 17,714.
- Price verified: 11,582.
- Image verified: 12,194.
- Details verified: 16,684.
- Fully verified: 11,052.
## Remaining Truth
- GAO Tek is quote-only/no public price in the crawled catalog; prices were not fabricated.
- Many OEM rows now have verified details but still need public images/prices/competitor evidence.
- Flexoptix still has 110 image-missing SKUs after GraphQL returned no image.
- Top remaining blockers are dominated by price/image/competitor availability.
## Training Pool
- Appended one JSONL event to `/tmp/tip-training-data/robot-experiences/2026-05-09.jsonl`.
- JSONL validated successfully.