diff --git a/packages/api/src/routes/health.ts b/packages/api/src/routes/health.ts index b349d24..240d94c 100644 --- a/packages/api/src/routes/health.ts +++ b/packages/api/src/routes/health.ts @@ -17,6 +17,10 @@ healthRouter.get("/", async (_req: Request, res: Response) => { const verStats = await pool.query(` SELECT COUNT(*) FILTER (WHERE price_verified) AS price_verified, + COUNT(*) FILTER (WHERE price_status = 'public_price') AS price_public_price, + COUNT(*) FILTER (WHERE price_status = 'no_public_price') AS price_no_public_price, + COUNT(*) FILTER (WHERE price_status = 'ambiguous') AS price_ambiguous, + COUNT(*) FILTER (WHERE price_status IN ('unknown', 'needs_research')) AS price_needs_research, COUNT(*) FILTER (WHERE image_verified) AS image_verified, COUNT(*) FILTER (WHERE details_verified) AS details_verified, COUNT(*) FILTER (WHERE fully_verified) AS fully_verified, @@ -101,6 +105,12 @@ healthRouter.get("/", async (_req: Request, res: Response) => { }, verification: { price_verified: Number(v.price_verified || 0), + price_status: { + public_price: Number(v.price_public_price || 0), + no_public_price: Number(v.price_no_public_price || 0), + ambiguous: Number(v.price_ambiguous || 0), + needs_research: Number(v.price_needs_research || 0), + }, image_verified: Number(v.image_verified || 0), details_verified: Number(v.details_verified || 0), fully_verified: Number(v.fully_verified || 0), diff --git a/packages/scraper/package.json b/packages/scraper/package.json index 28adc05..504cd38 100644 --- a/packages/scraper/package.json +++ b/packages/scraper/package.json @@ -19,8 +19,10 @@ "verify:catalog:details": "tsx src/utils/verify-catalog-details.ts", "verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts", "verify:product-page-assets": "tsx src/utils/verify-product-page-assets.ts", + "verify:part-number-details": "tsx src/utils/verify-part-number-details.ts", "verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts", 
"verify:fs:sku-aliases": "tsx src/utils/quarantine-fs-sku-aliases.ts", + "verify:price-availability": "tsx src/utils/resolve-price-availability.ts", "verify:no-valid-competitor": "tsx src/utils/resolve-no-valid-competitor.ts", "verify:open-competitor-status": "tsx src/utils/resolve-open-competitor-status.ts", "scrape:cisco": "tsx src/scrapers/cisco-tmg.ts", diff --git a/packages/scraper/src/utils/db.ts b/packages/scraper/src/utils/db.ts index f5b16ef..e6eaea4 100644 --- a/packages/scraper/src/utils/db.ts +++ b/packages/scraper/src/utils/db.ts @@ -21,7 +21,7 @@ export const db = pool; export async function recordVerificationEvidence(params: { transceiverId: string; - verificationType: "price" | "image" | "details" | "competitor_match" | "competitor_no_match" | "competitor_ambiguous" | "artifact_quarantine"; + verificationType: "price" | "price_unavailable" | "image" | "details" | "competitor_match" | "competitor_no_match" | "competitor_ambiguous" | "artifact_quarantine"; sourceUrl?: string; sourceVendorId?: string; evidenceValue?: Record; diff --git a/packages/scraper/src/utils/resolve-price-availability.ts b/packages/scraper/src/utils/resolve-price-availability.ts new file mode 100644 index 0000000..6f0cfa9 --- /dev/null +++ b/packages/scraper/src/utils/resolve-price-availability.ts @@ -0,0 +1,194 @@ +import { pool, recordVerificationEvidence } from "./db"; + +const EXCLUDED_CATEGORIES = [ + "NonTransceiver", + "Accessory", + "Adapter / Converter", + "Switch / Media Converter", + "Switch / Network Infrastructure", + "NIC / Adapter", + "Mux / Passive Optical", + "Product Family", + "Loopback / Test Module", +]; + +const NO_PUBLIC_PRICE_VENDOR_POLICIES: Record = { + "ADTRAN": "oem_catalog_quote_only", + "Alcatel-Lucent Enterprise": "oem_catalog_quote_only", + "Allied Telesis": "oem_catalog_quote_only", + "Arista Networks": "oem_catalog_quote_only", + "Ascent Optics": "manufacturer_catalog_no_public_checkout_price", + "Black Box": 
"enterprise_catalog_no_public_checkout_price", + "Brocade": "oem_catalog_quote_only", + "Calix": "oem_catalog_quote_only", + "Cambium Networks": "oem_catalog_quote_only", + "Check Point": "oem_catalog_quote_only", + "Ciena": "oem_catalog_quote_only", + "Cisco Systems": "oem_catalog_quote_only", + "Coherent Corp": "manufacturer_catalog_no_public_checkout_price", + "Dell Technologies": "oem_catalog_quote_only", + "D-LINK": "oem_catalog_quote_only", + "DZS": "oem_catalog_quote_only", + "Eoptolink": "manufacturer_catalog_no_public_checkout_price", + "Ericsson": "oem_catalog_quote_only", + "EXFO": "test_equipment_vendor_quote_only", + "Extreme Networks": "oem_catalog_quote_only", + "F5 Networks": "oem_catalog_quote_only", + "Fortinet": "oem_catalog_quote_only", + "GAO Tek": "quote_request_only_no_public_checkout_price", + "HPE Aruba Networking": "oem_catalog_quote_only", + "Huawei": "oem_catalog_quote_only", + "Infinera": "oem_catalog_quote_only", + "Intel": "oem_catalog_quote_only", + "II-VI / Coherent": "manufacturer_catalog_no_public_checkout_price", + "Juniper Networks": "oem_catalog_quote_only", + "Keysight Technologies": "test_equipment_vendor_quote_only", + "LANCOM Systems": "oem_catalog_quote_only", + "Lumentum": "manufacturer_catalog_no_public_checkout_price", + "Marvell": "component_vendor_no_public_checkout_price", + "MikroTik": "oem_catalog_quote_only", + "Netgear": "oem_catalog_quote_only", + "NVIDIA Mellanox": "oem_catalog_quote_only", + "NVIDIA Networking": "oem_catalog_quote_only", + "Nokia": "oem_catalog_quote_only", + "Palo Alto Networks": "oem_catalog_quote_only", + "Ribbon Communications": "oem_catalog_quote_only", + "Ruckus Networks (CommScope)": "oem_catalog_quote_only", + "SmartOptics": "manufacturer_catalog_no_public_checkout_price", + "Solarflare": "oem_catalog_quote_only", + "Spirent Communications": "test_equipment_vendor_quote_only", + "Stordis": "oem_catalog_quote_only", + "T&S Communication": 
"quote_request_only_or_price_zero_no_public_checkout_price", + "Turbolink": "manufacturer_catalog_no_public_checkout_price", + "Ubiquiti Networks": "oem_catalog_quote_only", + "Viavi Solutions": "test_equipment_vendor_quote_only", + "Zyxel": "oem_catalog_quote_only", + "ZTE": "oem_catalog_quote_only", +}; + +const NO_PUBLIC_PRICE_VENDOR_PATTERNS: Array<[RegExp, string]> = [ + [/\b(cisco|juniper|arista|nokia|huawei|hpe|hewlett|dell|extreme|brocade|foundry|ruckus|commscope|commScope|arris|aruba|viptela|pure storage|hitachi vantara|ibm storage)\b/i, "oem_catalog_quote_only"], + [/\b(check point|palo alto|fortinet|sonicwall|sophos|watchguard|barracuda|hillstone)\b/i, "security_vendor_catalog_quote_only"], + [/\b(hirschmann|siemens|ruggedcom|antaira|etherwan|red lion|korenix|perle|transition networks|rad data|moxa|omron|abb|beckhoff|belden|ge grid|phoenix contact)\b/i, "industrial_networking_catalog_no_public_checkout_price"], + [/\b(packetlight|harmonic|casa systems|fujitsu|fiberhome|nec|h3c|dlink|d-link|tp-link|netgear|zyxel|ubiquiti|lancom|mikrotik|senao|engenius|telco systems|tejas|teleste|packetfront|westermo|supermicro|coriant|sycamore|dragonwave|radiflow|datang|vecima|cradlepoint|ruijie|utstarcom|edgewater|grass valley|pica8|centec|ceragon|clearfield|cumulus networks|samsung networks|telrad|siklu|drivenets|pluribus|viasat|haivision|mavenir|audiocodes)\b/i, "oem_catalog_quote_only"], + [/\b(anritsu|keysight|viavi|exfo|ixia|spirent|tektronix|rohde|netscout|teledyne lecroy)\b/i, "test_equipment_vendor_quote_only"], + [/\b(intel|marvell|qlogic|chelsio|solarflare|emulex|lanner|kontron|advantech|source photonics|innolight|o-net|emcore|applied optoelectronics|cimc|sumitomo|lumentum|coherent|reflex photonics|ofs|corning|hisense broadband|oplink|accelink|neophotonics|skylane optics)\b/i, "component_vendor_no_public_checkout_price"], + [/\b(adva|ekinops|infinera|ciena|ribbon|nortel|3com|avaya|comnet|comtrend|evertz|ip infusion|stordis|calix|dzs|adtran|allied telesis|eci 
telecom|zte access|netapp|isol?an|curtiss-wright|l3harris|sierra wireless|black box|ericsson radio)\b/i, "oem_catalog_quote_only"], + [/\b(google cloud|microsoft azure|amazon web services|meta)\b/i, "hyperscaler_no_public_checkout_price"], + [/\b(rockwell automation|schneider electric|schweitzer engineering)\b/i, "industrial_networking_catalog_no_public_checkout_price"], + [/\b(sonic|vmware|versa networks)\b/i, "software_platform_no_public_transceiver_price"], +]; + +const PUBLIC_PRICE_VENDOR_PATTERNS = [ + /\b(10gtek|sfpcables|shopfiber24|gbics|fs\.com|flexoptix|naddod|qsfptek|atgbics|fiber24)\b/i, +]; + +function getNoPublicPriceReason(vendorName: string): string | undefined { + if (NO_PUBLIC_PRICE_VENDOR_POLICIES[vendorName]) return NO_PUBLIC_PRICE_VENDOR_POLICIES[vendorName]; + if (PUBLIC_PRICE_VENDOR_PATTERNS.some((regex) => regex.test(vendorName))) return undefined; + for (const [regex, reason] of NO_PUBLIC_PRICE_VENDOR_PATTERNS) { + if (regex.test(vendorName)) return reason; + } + return undefined; +} + +type Candidate = { + id: string; + vendorName: string; + partNumber: string; + productUrl: string | null; + priceStatus: string | null; +}; + +async function main(): Promise { + const vendorFilter = (process.env["PRICE_AVAILABILITY_VENDOR"] || "").trim(); + const limit = parseInt(process.env["PRICE_AVAILABILITY_LIMIT"] || "1000", 10); + const apply = process.env["PRICE_AVAILABILITY_APPLY"] === "1"; + const vendorNames = vendorFilter + ? vendorFilter.split(",").map((v) => v.trim()).filter(Boolean) + : []; + + const vendorWhere = vendorNames.length > 0 ? 
"AND v.name = ANY($3)" : ""; + const params: unknown[] = [EXCLUDED_CATEGORIES, limit]; + if (vendorNames.length > 0) params.push(vendorNames); + + const result = await pool.query( + `SELECT t.id, + v.name AS "vendorName", + t.part_number AS "partNumber", + t.product_page_url AS "productUrl", + COALESCE(t.price_status, 'unknown') AS "priceStatus" + FROM transceivers t + JOIN vendors v ON v.id = t.vendor_id + WHERE COALESCE(t.category, '') <> ALL($1) + AND COALESCE(t.data_confidence, 'unknown') != 'garbage' + AND COALESCE(t.price_verified, false) = false + AND COALESCE(t.price_status, 'needs_research') IN ('unknown', 'needs_research', 'ambiguous') + ${vendorWhere} + ORDER BY v.name, t.part_number + LIMIT $2`, + params, + ); + + let noPublicPrice = 0; + let skipped = 0; + + console.log("=== Price availability resolver ===", { vendorNames, limit, apply, count: result.rows.length }); + + for (const row of result.rows) { + const reason = getNoPublicPriceReason(row.vendorName); + if (!reason) { + skipped++; + continue; + } + + if (!apply) { + console.log("dry-run price availability", { + vendor: row.vendorName, + partNumber: row.partNumber, + outcome: "no_public_price", + reason, + }); + noPublicPrice++; + continue; + } + + await pool.query( + `UPDATE transceivers + SET price_status = 'no_public_price', + price_status_updated_at = NOW(), + price_unavailable_verified_at = COALESCE(price_unavailable_verified_at, NOW()), + price_unavailable_reason = $2, + updated_at = NOW() + WHERE id = $1`, + [row.id, reason], + ); + + await recordVerificationEvidence({ + transceiverId: row.id, + verificationType: "price_unavailable", + sourceUrl: row.productUrl || undefined, + evidenceValue: { + outcome: "no_public_price", + reason, + vendor: row.vendorName, + partNumber: row.partNumber, + }, + robotName: "verify:price-availability", + confidence: 0.9, + }); + noPublicPrice++; + } + + console.log("Price availability resolver complete", { noPublicPrice, skipped, apply }); +} + +if 
/**
 * Product categories that the verifier query filters out
 * (`COALESCE(t.category, '') <> ALL($1)`).
 */
const EXCLUDED_CATEGORIES = [
  "NonTransceiver",
  "Accessory",
  "Adapter / Converter",
  "Switch / Media Converter",
  "Switch / Network Infrastructure",
  "NIC / Adapter",
  "Mux / Passive Optical",
  "Product Family",
  "Loopback / Test Module",
];

/** Row shape selected by the verifier query (column aliases are camelCase). */
type Candidate = {
  id: string;
  vendorName: string;
  partNumber: string;
  productUrl: string | null;
  formFactor: string | null;
  speedGbps: string | number | null;
  reachLabel: string | null;
  fiberType: string | null;
};

/**
 * Ordered speed rules: [pattern, speed label, speed in Gbps, rule name].
 * The first matching rule wins, so higher-rate and breakout tokens
 * (8X100G, 4X25G, ...) are listed before the lower rates they contain.
 */
const SPEED_RULES: ReadonlyArray<readonly [RegExp, string, number, string]> = [
  [/\b(800G|800GE|800GBE)\b|JCO800|8X100G/, "800G", 800, "800G token"],
  [/\b(400G|400GE|400GBE|400ZR)\b|JCO400|4X100G/, "400G", 400, "400G token"],
  [/\b(200G|200GE|200GBE)\b|2X100G/, "200G", 200, "200G token"],
  [/\b(128GFC)\b/, "128G FC", 128, "128GFC token"],
  [/\b(100G|100GE|100GBE|CGE)\b|100GBASE|4X25G|2X50G|10X10G/, "100G", 100, "100G token"],
  [/\b(50G|50GE|50GBE)\b/, "50G", 50, "50G token"],
  [/\b(40G|40GE|40GBE)\b|40GBASE/, "40G", 40, "40G token"],
  [/\b(32GFC)\b/, "32G FC", 32, "32GFC token"],
  // Must precede the 25G rule: "\b25G\b" also matches the tail of "1.25G",
  // which would label gigabit optics as 25G. Only the dotted form is matched.
  [/\b1\.25G(?:E|BE|BPS)?\b/, "1G", 1, "1.25G token"],
  [/\b(25G|25GE|25GBE)\b|25GBASE/, "25G", 25, "25G token"],
  [/\b(16GFC)\b/, "16G FC", 16, "16GFC token"],
  [/\b(10G|10GE|10GBE)\b|10GBASE|SFP10G|XFP-10GE/, "10G", 10, "10G token"],
  [/\b(2[.-]?5G|2[.-]?5GE|2[.-]?5GBE)\b/, "2.5G", 2.5, "2.5G token"],
  [/\b(1G|1GE|1000BASE)\b|CTP-SFP-1GE|SFP-GE|GLC-(SX|LX|LH|ZX|EX|BX|TE|T)\b/, "1G", 1, "1G token"],
];

/**
 * Infer a transceiver's line rate from tokens in its part number.
 *
 * The part number is upper-cased and tested twice per rule: once as-is and
 * once with every run of non-alphanumeric characters collapsed to "-".
 *
 * @param partNumber Vendor part number, e.g. "QSFP-100G-SR4".
 * @returns The speed label, numeric Gbps and the name of the rule that
 *          matched, or `null` when no rule matches.
 */
function inferSpeed(partNumber: string): { speed: string; speedGbps: number; rule: string } | null {
  const pn = partNumber.toUpperCase();
  const compact = pn.replace(/[^A-Z0-9]+/g, "-");

  for (const [regex, speed, speedGbps, rule] of SPEED_RULES) {
    if (regex.test(compact) || regex.test(pn)) {
      return { speed, speedGbps, rule };
    }
  }
  return null;
}
/**
 * Part-number details verifier.
 *
 * Selects transceivers that already have part number, form factor, reach and
 * fiber type, and that either lack a speed or are not details-verified. For
 * each row whose part number yields a speed via `inferSpeed` it fills in a
 * missing speed, marks details verified, and records `details` evidence.
 *
 * Rows whose stored non-zero speed disagrees with the inferred speed are
 * counted as conflicts and left untouched, so the evidence never states a
 * speed that differs from the stored one.
 *
 * Environment:
 *   PART_DETAILS_VENDOR  comma-separated exact vendor names (optional)
 *   PART_DETAILS_LIMIT   maximum rows to examine (default 1000)
 *   PART_DETAILS_APPLY   "1" to write; anything else is a dry run
 *
 * @throws Error when PART_DETAILS_LIMIT is not a non-negative integer.
 */
async function main(): Promise<void> {
  const vendorFilter = (process.env["PART_DETAILS_VENDOR"] || "").trim();
  const limit = Number.parseInt(process.env["PART_DETAILS_LIMIT"] || "1000", 10);
  // Fail before touching the database: NaN or a negative value would otherwise
  // surface as an opaque Postgres error on the LIMIT clause.
  if (!Number.isFinite(limit) || limit < 0) {
    throw new Error("PART_DETAILS_LIMIT must be a non-negative integer");
  }
  const apply = process.env["PART_DETAILS_APPLY"] === "1";
  const vendorNames = vendorFilter.split(",").map((v) => v.trim()).filter(Boolean);

  // $3 is only bound when a vendor filter is present.
  const vendorWhere = vendorNames.length > 0 ? "AND v.name = ANY($3)" : "";
  const params: unknown[] = [EXCLUDED_CATEGORIES, limit];
  if (vendorNames.length > 0) params.push(vendorNames);

  const result = await pool.query(
    `SELECT t.id,
            v.name AS "vendorName",
            t.part_number AS "partNumber",
            t.product_page_url AS "productUrl",
            t.form_factor AS "formFactor",
            t.speed_gbps AS "speedGbps",
            t.reach_label AS "reachLabel",
            t.fiber_type AS "fiberType"
     FROM transceivers t
     JOIN vendors v ON v.id = t.vendor_id
     WHERE COALESCE(t.category, '') <> ALL($1)
       AND COALESCE(t.data_confidence, 'unknown') != 'garbage'
       AND COALESCE(t.part_number, '') != ''
       AND COALESCE(t.form_factor, '') != ''
       AND COALESCE(t.reach_label, '') != ''
       AND COALESCE(t.fiber_type, '') != ''
       AND (t.speed_gbps IS NULL OR t.speed_gbps = 0 OR COALESCE(t.details_verified, false) = false)
       ${vendorWhere}
     ORDER BY v.name, t.part_number
     LIMIT $2`,
    params,
  );

  let candidates = 0;
  let updated = 0;
  let details = 0;
  let skipped = 0;
  let conflicts = 0;

  console.log("=== Part-number details verifier ===", { vendorNames, limit, apply, count: result.rows.length });

  for (const row of result.rows) {
    const inferred = inferSpeed(row.partNumber);
    if (!inferred) {
      skipped++;
      continue;
    }

    // speed_gbps may arrive as a string or a number (see Candidate); normalise it.
    const currentSpeed = Number(row.speedGbps || 0);
    const hasStoredSpeed = Number.isFinite(currentSpeed) && currentSpeed > 0;

    if (hasStoredSpeed && currentSpeed !== inferred.speedGbps) {
      // The stored speed contradicts the part-number inference: do not mark
      // details verified on the strength of a rule that disagrees with the row.
      conflicts++;
      console.warn("part details speed conflict", {
        vendor: row.vendorName,
        partNumber: row.partNumber,
        currentSpeed,
        inferred,
      });
      continue;
    }

    candidates++;
    const shouldUpdateSpeed = !hasStoredSpeed;

    if (!apply) {
      console.log("dry-run part details", {
        vendor: row.vendorName,
        partNumber: row.partNumber,
        currentSpeed,
        inferred,
        canMarkDetails: Boolean(row.formFactor && row.reachLabel && row.fiberType),
      });
      continue;
    }

    if (shouldUpdateSpeed) {
      await pool.query(
        `UPDATE transceivers
         SET speed = $2,
             speed_gbps = $3,
             updated_at = NOW()
         WHERE id = $1`,
        [row.id, inferred.speed, inferred.speedGbps],
      );
      updated++;
    }

    const marked = await markDetailsVerified({
      transceiverId: row.id,
      sourceUrl: row.productUrl || undefined,
    });
    if (marked) details++;

    await recordVerificationEvidence({
      transceiverId: row.id,
      verificationType: "details",
      sourceUrl: row.productUrl || undefined,
      evidenceValue: {
        rule: "part_number_speed_inference",
        partNumber: row.partNumber,
        speed: inferred.speed,
        speedGbps: inferred.speedGbps,
        inferenceRule: inferred.rule,
      },
      robotName: "verify:part-number-details",
      confidence: 0.95,
    });
  }

  console.log("Part-number details verifier complete", { candidates, updated, details, skipped, conflicts, apply });
}

if (require.main === module) {
  // Close the pool on both success and failure, and let the process exit on
  // its own so pool shutdown is not cut short by process.exit().
  void main()
    .catch((err: unknown) => {
      console.error("Fatal:", err);
      process.exitCode = 1;
    })
    .finally(() => pool.end());
}
25], - [/\b2[.-]?5\s*g(?:base)?\b/i, "2.5G", 2.5], + [/\b2[\s.-]?5\s*g(?:base)?\b/i, "2.5G", 2.5], ]; for (const [regex, speed, speedGbps] of speedPatterns) { if (regex.test(text)) { diff --git a/sql/105-price-status-and-unavailable-evidence.sql b/sql/105-price-status-and-unavailable-evidence.sql new file mode 100644 index 0000000..4510d94 --- /dev/null +++ b/sql/105-price-status-and-unavailable-evidence.sql @@ -0,0 +1,73 @@ +-- Migration 105: Explicit price availability state +-- +-- Public price verification remains strict: price_verified requires a real +-- source-backed price observation. Quote-only/OEM/no-public-price cases are +-- represented separately so TIP can close research loops without inventing +-- prices. + +ALTER TABLE transceivers + ADD COLUMN IF NOT EXISTS price_status VARCHAR(32) NOT NULL DEFAULT 'unknown', + ADD COLUMN IF NOT EXISTS price_status_updated_at TIMESTAMPTZ, + ADD COLUMN IF NOT EXISTS price_unavailable_verified_at TIMESTAMPTZ, + ADD COLUMN IF NOT EXISTS price_unavailable_reason TEXT; + +DO $$ +BEGIN + IF EXISTS ( + SELECT 1 FROM pg_constraint + WHERE conname = 'transceivers_price_status_check' + ) THEN + ALTER TABLE transceivers DROP CONSTRAINT transceivers_price_status_check; + END IF; + + ALTER TABLE transceivers + ADD CONSTRAINT transceivers_price_status_check + CHECK (price_status IN ( + 'unknown', + 'public_price', + 'no_public_price', + 'needs_research', + 'ambiguous' + )); +END $$; + +UPDATE transceivers +SET price_status = CASE + WHEN price_verified = true THEN 'public_price' + WHEN price_status = 'unknown' THEN 'needs_research' + ELSE price_status + END, + price_status_updated_at = COALESCE(price_status_updated_at, NOW()) +WHERE price_status IS NULL + OR price_status = 'unknown' + OR price_verified = true; + +CREATE INDEX IF NOT EXISTS idx_transceivers_price_status + ON transceivers (price_status); + +CREATE INDEX IF NOT EXISTS idx_transceivers_no_public_price + ON transceivers (price_unavailable_verified_at) + WHERE 
price_status = 'no_public_price'; + +ALTER TABLE transceiver_verification_evidence + DROP CONSTRAINT IF EXISTS transceiver_verification_evidence_verification_type_check; + +ALTER TABLE transceiver_verification_evidence + ADD CONSTRAINT transceiver_verification_evidence_verification_type_check + CHECK ( + verification_type::text = ANY ( + ARRAY[ + 'price', + 'price_unavailable', + 'image', + 'details', + 'competitor_match', + 'competitor_no_match', + 'competitor_ambiguous', + 'artifact_quarantine' + ] + ) + ); + +COMMENT ON COLUMN transceivers.price_status IS + 'Resolution state for price evidence: public_price, no_public_price, needs_research, ambiguous, unknown. price_verified remains true only for real public price observations.'; diff --git a/sync/CURRENT.md b/sync/CURRENT.md index c34e992..dcd4ddf 100644 --- a/sync/CURRENT.md +++ b/sync/CURRENT.md @@ -1,9 +1,50 @@ # Current TIP Sync State -Updated: 2026-05-09 23:09 UTC +Updated: 2026-05-09 23:15 UTC ## Newest Work +- TIP continuation on 2026-05-10 local / 2026-05-09 UTC: + - added `verify:part-number-details` + - deterministic part-number speed inference for rows where form factor/reach/fiber already exist but `speed_gbps=0` + - dry-run caught Cisco `GLC-FE-*` as Fast Ethernet trap; rule hardened before apply + - live apply: + - Juniper Networks: `375` speed updates/details verified + - Cisco Systems: `176` speed updates/details verified + - evidence count `verify:part-number-details`: `551` + - health detail count moved to `16913` + - added migration `sql/105-price-status-and-unavailable-evidence.sql` + - new `transceivers.price_status` + - new `price_unavailable` evidence type + - strict rule remains: `price_verified` only means a real public price observation exists + - added `verify:price-availability` + - resolves quote-only/OEM/manufacturer/test-equipment/hyperscaler vendors to `price_status=no_public_price` + - writes `price_unavailable` evidence, does not fabricate price rows + - preserves real 
retail/source-discovery cases as `needs_research` + - Health API now exposes price status buckets + - live price-status result: + - `public_price=11414` + - `no_public_price=5595` + - `needs_research=186` + - `ambiguous=0` + - remaining price research is now limited to real retail/source-discovery vendors: + - `10Gtek=126` + - `SFPcables=31` + - `ShopFiber24=24` + - `ATGBICS=3` + - `Vcelink=2` + - SFPcables search tests for 10Gtek part numbers did not return reliable direct hits; remaining 10Gtek work is source/alias discovery, not no-public-price classification + - live health: + - active products: `17195` + - price verified: `11414` + - image verified: `12104` + - details verified: `16913` + - fully verified: `10505` + - competitor status: `matched=10775`, `no_valid_match=74`, `ambiguous=556`, `needs_research=5790` + - TIPLLM training pool updated with: + - part-number details verifier rules + - price_status/no-public-price model + - MAGATAMA all-lane RunPod training block started on 2026-05-09: - user requested all trainable LLM lanes via RunPod - lanes in scope: diff --git a/sync/history/2026-05-10-tip-price-status-and-oem-detail-resolution.md b/sync/history/2026-05-10-tip-price-status-and-oem-detail-resolution.md new file mode 100644 index 0000000..482846f --- /dev/null +++ b/sync/history/2026-05-10-tip-price-status-and-oem-detail-resolution.md @@ -0,0 +1,108 @@ +# 2026-05-10 — TIP Price Status + OEM Detail Resolution + +## Part-Number Details Verifier + +Added `packages/scraper/src/utils/verify-part-number-details.ts`. 
+ +Purpose: + +- close detail gaps where form factor, reach and fiber are already present but `speed_gbps=0` +- use conservative part-number inference only +- write canonical `details` evidence with robot `verify:part-number-details` + +Safety catch: + +- Cisco `GLC-FE-*` appeared in dry-run +- these are Fast Ethernet, so the generic `GLC-* => 1G` rule was rejected +- final rule only infers 1G for safe patterns such as `GLC-SX`, `GLC-LX`, `GLC-LH`, `GLC-ZX`, `GLC-EX`, `GLC-BX`, `GLC-T`, `GLC-TE` + +Live apply: + +- Juniper Networks: + - `375` candidates updated + - `375` details verified +- Cisco Systems: + - `176` candidates updated + - `176` details verified +- evidence: + - `details | verify:part-number-details | 551` + +Result: + +- `details_verified` increased to `16913` +- Juniper detail gaps dropped from `173` to `43` +- Cisco detail gaps dropped from `146` to `80` + +## Price Status Model + +Added migration `sql/105-price-status-and-unavailable-evidence.sql`. + +New fields: + +- `transceivers.price_status` +- `transceivers.price_status_updated_at` +- `transceivers.price_unavailable_verified_at` +- `transceivers.price_unavailable_reason` + +Allowed states: + +- `public_price` +- `no_public_price` +- `needs_research` +- `ambiguous` +- `unknown` + +Important semantic rule: + +- `price_verified=true` still requires a real public price observation +- quote-only/OEM/no-checkout cases use `price_status=no_public_price` +- no price rows are fabricated + +Added `price_unavailable` evidence type. + +Added `packages/scraper/src/utils/resolve-price-availability.ts`. 
+ +Live apply: + +- resolved quote-only/OEM/manufacturer/test-equipment/hyperscaler vendors to `no_public_price` +- wrote `5361` `price_unavailable` evidence rows +- preserved real retail/source-discovery vendors as `needs_research` + +Health API now exposes: + +- `price_status.public_price` +- `price_status.no_public_price` +- `price_status.ambiguous` +- `price_status.needs_research` + +Live health after this pass: + +- active products: `17195` +- price verified: `11414` +- price status: + - `public_price=11414` + - `no_public_price=5595` + - `needs_research=186` + - `ambiguous=0` +- image verified: `12104` +- details verified: `16913` +- fully verified: `10505` +- competitor status: + - `matched=10775` + - `no_valid_match=74` + - `ambiguous=556` + - `needs_research=5790` + +Remaining price research: + +- `10Gtek=126` +- `SFPcables=31` +- `ShopFiber24=24` +- `ATGBICS=3` +- `Vcelink=2` + +Interpretation: + +- remaining price work is now real retail/source discovery +- SFPcables search tests for 10Gtek part numbers did not return reliable direct hits +- 10Gtek requires a source/alias discovery robot rather than no-public-price classification diff --git a/training-data/tip-llm-capabilities-v1.jsonl b/training-data/tip-llm-capabilities-v1.jsonl index 195c2d2..31bea1d 100644 --- a/training-data/tip-llm-capabilities-v1.jsonl +++ b/training-data/tip-llm-capabilities-v1.jsonl @@ -49,3 +49,5 @@ {"id":"tip-20260509-gbics-ts-artifact-quarantine-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. 
Keep active transceiver counts honest by quarantining vendor navigation, category, PDF, cable, and infrastructure artifacts."},{"role":"user","content":"What did the GBICS and T&S cleanup teach the crawler robots?"},{"role":"assistant","content":"GBICS rows such as category labels, patch leads, active optical cables, direct-attach cables, Choose By pages, cart pages, vendor pages, NVIDIA/ConnectX/Cisco/WatchGuard/HP pages, network adapters, media converters, AOC/DAC and MTP/MPO cable pages are not active transceiver SKUs. T&S rows with non-/products URLs, PDFs, generic fiber-optic-transceivers family URLs, Product-Brochure names, or broad 100G/200G/400G-Transceiver family labels are artifacts. Quarantine these outside the active verification base, clear verification flags, and write `artifact_quarantine` evidence before recalculating health."}]} {"id":"tip-20260509-product-page-asset-verifier-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Design deterministic product-page verifiers that improve evidence without broad crawling or invented values."},{"role":"user","content":"How should TIP safely backfill images and details from existing vendor product URLs?"},{"role":"assistant","content":"Use a targeted product-page asset verifier. Select active rows with an existing product_page_url and missing image/details; fetch one page per row with low concurrency; extract only likely product images from og:image, twitter:image, Magento product image IDs, or filtered product image URLs; infer details only from part number, product URL, and page title, not from full body navigation. Always dry-run first to catch navigation pollution such as T&S 800G menu links or 1.25G being misread as 25G. 
Apply through `markImageVerified` and `markDetailsVerified` so evidence is recorded."}]} {"id":"tip-20260509-quote-only-tscom-pricing-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Public-price verification requires a real public price."},{"role":"user","content":"T&S Communication product pages have valid images and details but expose no real public prices, only no price or price 0.00. What should TIP do?"},{"role":"assistant","content":"Do not mark price_verified. Treat T&S as image/detail-verifiable but public-price-missing until a real price observation exists. `price: 0.00`, quote/request flows, missing amount fields, or SEO filenames containing `price` are not price evidence. Keep image_verified/details_verified true when source-backed, keep price_verified false, and document the vendor as quote-only/public-price-unavailable rather than fabricating a market price."}]} +{"id":"tip-20260510-part-number-details-verifier-001","source":"codex-tip-verification-2026-05-10","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Use deterministic part-number inference only for conservative, auditable detail repair."},{"role":"user","content":"How should TIP close OEM detail gaps when form factor, reach and fiber are known but speed_gbps is zero?"},{"role":"assistant","content":"Use a part-number details verifier. Infer speed only from unambiguous tokens such as 800G, 400G, 100GBASE, 40GBASE, SFP10G, XFP-10GE, 4X25G, 2X50G, 10X10G, CTP-SFP-1GE, and known FC tokens. Dry-run first and exclude traps such as Cisco GLC-FE Fast Ethernet; do not treat all GLC-* as 1G. Update speed/speed_gbps, then mark details through the canonical evidence path with robot_name `verify:part-number-details`."}]} +{"id":"tip-20260510-price-status-model-001","source":"codex-tip-verification-2026-05-10","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. 
Separate real public prices from resolved no-public-price states."},{"role":"user","content":"How should TIP handle OEM, component, hyperscaler, test-equipment, and quote-only vendors with no public checkout price?"},{"role":"assistant","content":"Do not set `price_verified` unless there is a real public price observation. Add or use `price_status`: `public_price` for true price observations, `no_public_price` for source/policy-backed quote-only or OEM catalog cases, `needs_research` for real retail/source discovery work, and `ambiguous` for uncertain cases. Write `price_unavailable` evidence with the vendor, part number, reason, and source URL if available. Keep retail vendors such as 10Gtek, SFPcables, ShopFiber24, ATGBICS, Vcelink, FS.com and Flexoptix out of no-public-price policies because they should have real prices."}]}