feat: close open competitor research states
This commit is contained in:
parent
fb9db56617
commit
635a102932
@ -13,11 +13,15 @@
|
|||||||
"scrape:atgbics:details": "tsx src/scrapers/atgbics-detail-pages.ts",
|
"scrape:atgbics:details": "tsx src/scrapers/atgbics-detail-pages.ts",
|
||||||
"scrape:vendors:details": "tsx src/scrapers/shopfiber24-fibermall-detail-pages.ts",
|
"scrape:vendors:details": "tsx src/scrapers/shopfiber24-fibermall-detail-pages.ts",
|
||||||
"scrape:gaotek:details": "tsx src/scrapers/gaotek-detail-pages.ts",
|
"scrape:gaotek:details": "tsx src/scrapers/gaotek-detail-pages.ts",
|
||||||
|
"scrape:gbics": "tsx src/scrapers/gbics.ts",
|
||||||
|
"scrape:tscom": "tsx src/scrapers/tscom.ts",
|
||||||
|
"scrape:sfpcables": "tsx src/scrapers/sfpcables.ts",
|
||||||
"verify:catalog:details": "tsx src/utils/verify-catalog-details.ts",
|
"verify:catalog:details": "tsx src/utils/verify-catalog-details.ts",
|
||||||
"verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts",
|
"verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts",
|
||||||
"verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts",
|
"verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts",
|
||||||
"verify:fs:sku-aliases": "tsx src/utils/quarantine-fs-sku-aliases.ts",
|
"verify:fs:sku-aliases": "tsx src/utils/quarantine-fs-sku-aliases.ts",
|
||||||
"verify:no-valid-competitor": "tsx src/utils/resolve-no-valid-competitor.ts",
|
"verify:no-valid-competitor": "tsx src/utils/resolve-no-valid-competitor.ts",
|
||||||
|
"verify:open-competitor-status": "tsx src/utils/resolve-open-competitor-status.ts",
|
||||||
"scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
|
"scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
|
||||||
"scrape:optcore": "tsx src/scrapers/optcore.ts",
|
"scrape:optcore": "tsx src/scrapers/optcore.ts",
|
||||||
"scrape:news": "tsx src/scrapers/news.ts",
|
"scrape:news": "tsx src/scrapers/news.ts",
|
||||||
|
|||||||
@ -2882,7 +2882,7 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
|||||||
competitor_status_updated_at = NOW()
|
competitor_status_updated_at = NOW()
|
||||||
WHERE id = $1
|
WHERE id = $1
|
||||||
AND competitor_verified = false
|
AND competitor_verified = false
|
||||||
AND COALESCE(competitor_status, 'unknown') NOT IN ('no_valid_match')
|
AND COALESCE(competitor_status, 'unknown') NOT IN ('no_valid_match', 'ambiguous')
|
||||||
`, [fx.id]);
|
`, [fx.id]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -255,6 +255,7 @@ export async function scrapeGbics(): Promise<void> {
|
|||||||
try {
|
try {
|
||||||
const txId = await findOrCreateScrapedTransceiver({
|
const txId = await findOrCreateScrapedTransceiver({
|
||||||
partNumber: product.partNumber, vendorId,
|
partNumber: product.partNumber, vendorId,
|
||||||
|
productUrl: product.url,
|
||||||
formFactor: product.formFactor, speedGbps: product.speedGbps,
|
formFactor: product.formFactor, speedGbps: product.speedGbps,
|
||||||
speed: product.speed, reachMeters: product.reachMeters,
|
speed: product.speed, reachMeters: product.reachMeters,
|
||||||
reachLabel: product.reachLabel, fiberType: product.fiberType,
|
reachLabel: product.reachLabel, fiberType: product.fiberType,
|
||||||
|
|||||||
@ -221,6 +221,7 @@ export async function scrapeTsCom(): Promise<void> {
|
|||||||
const txId = await findOrCreateScrapedTransceiver({
|
const txId = await findOrCreateScrapedTransceiver({
|
||||||
partNumber: product.partNumber,
|
partNumber: product.partNumber,
|
||||||
vendorId,
|
vendorId,
|
||||||
|
productUrl: product.url,
|
||||||
formFactor: product.formFactor,
|
formFactor: product.formFactor,
|
||||||
speedGbps: product.speedGbps,
|
speedGbps: product.speedGbps,
|
||||||
speed: product.speed,
|
speed: product.speed,
|
||||||
|
|||||||
@ -21,7 +21,7 @@ export const db = pool;
|
|||||||
|
|
||||||
export async function recordVerificationEvidence(params: {
|
export async function recordVerificationEvidence(params: {
|
||||||
transceiverId: string;
|
transceiverId: string;
|
||||||
verificationType: "price" | "image" | "details" | "competitor_match" | "competitor_no_match" | "artifact_quarantine";
|
verificationType: "price" | "image" | "details" | "competitor_match" | "competitor_no_match" | "competitor_ambiguous" | "artifact_quarantine";
|
||||||
sourceUrl?: string;
|
sourceUrl?: string;
|
||||||
sourceVendorId?: string;
|
sourceVendorId?: string;
|
||||||
evidenceValue?: Record<string, unknown>;
|
evidenceValue?: Record<string, unknown>;
|
||||||
|
|||||||
247
packages/scraper/src/utils/resolve-open-competitor-status.ts
Normal file
247
packages/scraper/src/utils/resolve-open-competitor-status.ts
Normal file
@ -0,0 +1,247 @@
|
|||||||
|
/**
|
||||||
|
* Open competitor status resolver.
|
||||||
|
*
|
||||||
|
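* Closes fully populated products (price, image and details verified, with form factor, speed and fiber type set) whose competitor status is still unset, unknown or needs_research: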
|
||||||
|
* - no strict candidates and no active equivalence hints => no_valid_match
|
||||||
|
* - one or more strict candidates or active equivalence hints => ambiguous
|
||||||
|
*
|
||||||
|
* This does not invent matches. It turns endless queues into explicit,
|
||||||
|
* auditable research states.
|
||||||
|
*
|
||||||
|
* Default is dry-run. Set OPEN_COMPETITOR_APPLY=1 to write status/evidence.
|
||||||
|
*/
|
||||||
|
import { pool, checkAndSetFullyVerified, recordVerificationEvidence } from "./db";
|
||||||
|
import { logger } from "./logger";
|
||||||
|
|
||||||
|
/**
 * Category labels that are left out of the resolver's `active` product set.
 * The list is bound as a text[] parameter and compared against
 * `transceivers.category` in the candidate query.
 */
const EXCLUDED_CATEGORIES: string[] = [
  // Not a transceiver at all
  "NonTransceiver",
  "Accessory",
  "Adapter / Converter",
  // Network hardware
  "Switch / Media Converter",
  "Switch / Network Infrastructure",
  "NIC / Adapter",
  // Passive, family-level and test entries
  "Mux / Passive Optical",
  "Product Family",
  "Loopback / Test Module",
];
|
||||||
|
|
||||||
|
/**
 * Shape of one row produced by the resolver's candidate query.
 *
 * Kept as a `type` alias (not an `interface`) so it stays usable as the row
 * type argument of `pool.query<...>()`.
 */
type OpenCompetitorRow = {
  // Identity of the product being resolved.
  id: string;
  vendor_name: string;
  part_number: string;

  // Technical attributes echoed into the audit reason text.
  // NOTE(review): speed_gbps / reach_meters are typed as strings, presumably
  // because the driver returns numeric columns as text — confirm.
  form_factor: string;
  speed_gbps: string;
  fiber_type: string | null;
  reach_meters: string | null;

  // COUNT(*) results; the consumer converts them with Number().
  strict_candidate_count: string;
  equivalence_hint_count: string;
};
|
||||||
|
|
||||||
|
/**
 * Tells whether the resolver may write to the database.
 *
 * @returns `true` only when the OPEN_COMPETITOR_APPLY environment variable is
 *   exactly the string "1"; any other value (or no value) means dry-run.
 */
function applyMode(): boolean {
  const applyFlag = process.env.OPEN_COMPETITOR_APPLY;
  if (applyFlag === "1") {
    return true;
  }
  return false;
}
|
||||||
|
|
||||||
|
/** Research state the resolver assigns to a product that is still open. */
type OpenCompetitorOutcome = "no_valid_match" | "ambiguous";

/**
 * Turns the raw OPEN_COMPETITOR_LIMIT value into a usable row limit.
 * Missing or empty input yields 1000, unparsable input also yields 1000
 * (instead of NaN reaching the SQL LIMIT), and values below 1 are raised to 1.
 */
function parseOpenCompetitorLimit(raw: string | undefined): number {
  const parsed = Number.parseInt(raw || "1000", 10);
  return Number.isNaN(parsed) ? 1000 : Math.max(1, parsed);
}

/** A row is `no_valid_match` only when both counters are zero; otherwise `ambiguous`. */
function classifyOpenCompetitorRow(strictCandidates: number, equivalenceHints: number): OpenCompetitorOutcome {
  return strictCandidates === 0 && equivalenceHints === 0 ? "no_valid_match" : "ambiguous";
}

/**
 * Resolves products whose competitor research is still open.
 *
 * Selects products outside EXCLUDED_CATEGORIES that have price, image and
 * details verified, a non-empty form factor, a positive speed and a non-empty
 * fiber type, are not competitor-verified, and whose competitor status is
 * NULL, 'unknown' or 'needs_research'. For each one it counts:
 *   - strict candidates: verified products of another vendor with the same
 *     form factor, speed and fiber type, a compatible reach and a compatible
 *     wavelength;
 *   - equivalence hints: transceiver_equivalences rows for the product with
 *     status pending/approved/auto_approved and confidence >= 0.50.
 *
 * Zero of both => `no_valid_match` (also sets competitor_verified and
 * re-checks fully_verified). Anything else => `ambiguous`.
 *
 * Environment:
 *   - OPEN_COMPETITOR_LIMIT  max rows to process (default 1000, minimum 1)
 *   - OPEN_COMPETITOR_VENDOR optional case-insensitive vendor-name substring
 *   - OPEN_COMPETITOR_APPLY  "1" to write; otherwise dry-run (log only)
 *
 * @returns resolves once all selected rows have been logged or processed.
 * @throws whatever the database helpers throw; nothing is caught here.
 */
async function resolveOpenCompetitorStatus(): Promise<void> {
  const limit = parseOpenCompetitorLimit(process.env.OPEN_COMPETITOR_LIMIT);
  const vendorFilter = process.env.OPEN_COMPETITOR_VENDOR || "";
  const apply = applyMode();

  logger.info("=== Open competitor status resolver ===", { limit, vendorFilter, apply });

  // Wavelengths are compared via the first 3-4 digit run in the text.
  // substring() returns NULL when there is no such run, so a value that is
  // ONLY digits (e.g. "1310") is still parsed instead of counting as unknown.
  const result = await pool.query<OpenCompetitorRow>(`
    WITH active AS (
      SELECT t.*, v.name AS vendor_name
      FROM transceivers t
      JOIN vendors v ON v.id = t.vendor_id
      WHERE COALESCE(t.category, '') <> ALL($1::text[])
    ),
    ready AS (
      SELECT a.*
      FROM active a
      WHERE a.price_verified = true
        AND a.image_verified = true
        AND a.details_verified = true
        AND COALESCE(a.competitor_verified, false) = false
        AND COALESCE(a.competitor_status, 'needs_research') IN ('unknown', 'needs_research')
        AND ($2::text = '' OR a.vendor_name ILIKE ('%' || $2::text || '%'))
        AND a.form_factor IS NOT NULL
        AND a.form_factor != ''
        AND a.speed_gbps IS NOT NULL
        AND a.speed_gbps > 0
        AND a.fiber_type IS NOT NULL
        AND a.fiber_type != ''
    ),
    scored AS (
      SELECT
        r.id,
        r.part_number,
        r.vendor_name,
        r.form_factor,
        r.speed_gbps,
        r.reach_meters,
        r.fiber_type,
        (
          SELECT COUNT(*)
          FROM active c
          WHERE c.id != r.id
            AND c.vendor_id != r.vendor_id
            AND c.price_verified = true
            AND c.image_verified = true
            AND c.details_verified = true
            AND c.form_factor = r.form_factor
            AND c.speed_gbps = r.speed_gbps
            AND UPPER(COALESCE(c.fiber_type, '')) = UPPER(COALESCE(r.fiber_type, ''))
            AND (
              (c.reach_meters IS NOT NULL AND c.reach_meters > 0
                AND r.reach_meters IS NOT NULL AND r.reach_meters > 0
                AND ABS(c.reach_meters - r.reach_meters) <= GREATEST(25, r.reach_meters * 0.05))
              OR
              (COALESCE(c.reach_meters, 0) = 0 AND COALESCE(r.reach_meters, 0) = 0)
            )
            AND (
              substring(COALESCE(c.wavelengths, '') FROM '\\d{3,4}') IS NULL
              OR substring(COALESCE(r.wavelengths, '') FROM '\\d{3,4}') IS NULL
              OR ABS(
                substring(COALESCE(c.wavelengths, '') FROM '\\d{3,4}')::int
                - substring(COALESCE(r.wavelengths, '') FROM '\\d{3,4}')::int
              ) <= 15
            )
        ) AS strict_candidate_count,
        (
          SELECT COUNT(*)
          FROM transceiver_equivalences eq
          WHERE eq.flexoptix_id = r.id
            AND eq.status IN ('pending', 'approved', 'auto_approved')
            AND eq.confidence >= 0.50
        ) AS equivalence_hint_count
      FROM ready r
    )
    SELECT *
    FROM scored
    ORDER BY vendor_name, part_number
    LIMIT $3
  `, [EXCLUDED_CATEGORIES, vendorFilter, limit]);

  logger.info("Open competitor status candidates", { count: result.rowCount ?? 0 });

  if (!apply) {
    // Only the first 60 rows are listed individually; the totals below cover
    // every selected row so the outcome split is visible before applying.
    const projected: Record<OpenCompetitorOutcome, number> = { no_valid_match: 0, ambiguous: 0 };
    result.rows.forEach((row, index) => {
      const strictCandidates = Number(row.strict_candidate_count);
      const equivalenceHints = Number(row.equivalence_hint_count);
      const outcome = classifyOpenCompetitorRow(strictCandidates, equivalenceHints);
      projected[outcome]++;
      if (index < 60) {
        logger.info("dry-run open status", {
          vendor: row.vendor_name,
          partNumber: row.part_number,
          outcome,
          strictCandidates,
          equivalenceHints,
        });
      }
    });
    logger.info("dry-run open status totals", projected);
    return;
  }

  let noValidMatch = 0;
  let ambiguous = 0;
  let fullyVerifiedEarned = 0;

  // NOTE(review): the status UPDATE and the evidence insert are separate
  // statements, not one transaction; a failure between them leaves a status
  // change without its evidence row.
  for (const row of result.rows) {
    const strictCandidates = Number(row.strict_candidate_count);
    const equivalenceHints = Number(row.equivalence_hint_count);
    const outcome = classifyOpenCompetitorRow(strictCandidates, equivalenceHints);

    if (outcome === "no_valid_match") {
      const reason = [
        "Open competitor status resolver found no strict source-backed competitor candidate",
        `same form_factor=${row.form_factor}`,
        `speed_gbps=${row.speed_gbps}`,
        `fiber_type=${row.fiber_type}`,
        `reach=${row.reach_meters ?? "unknown"}`,
        "and no active equivalence hint above confidence 0.50",
      ].join("; ");

      // Same status guard as the ambiguous branch: a row whose status was
      // changed by another process after the SELECT is left alone.
      const update = await pool.query(`
        UPDATE transceivers
        SET competitor_verified = true,
            competitor_verified_at = NOW(),
            competitor_status = 'no_valid_match',
            competitor_status_updated_at = NOW(),
            no_match_verified_at = NOW(),
            no_match_reason = $2,
            updated_at = NOW()
        WHERE id = $1
          AND COALESCE(competitor_verified, false) = false
          AND COALESCE(competitor_status, 'needs_research') IN ('unknown', 'needs_research')
        RETURNING id
      `, [row.id, reason]);

      // Nothing updated => the row no longer qualifies; write no evidence.
      if ((update.rowCount ?? 0) === 0) continue;

      await recordVerificationEvidence({
        transceiverId: row.id,
        verificationType: "competitor_no_match",
        evidenceValue: {
          reason,
          vendor: row.vendor_name,
          partNumber: row.part_number,
          strictCandidateCount: strictCandidates,
          equivalenceHintCount: equivalenceHints,
        },
        robotName: "verify:open-competitor-status",
        confidence: 1,
      });

      if (await checkAndSetFullyVerified(row.id)) fullyVerifiedEarned++;
      noValidMatch++;
      continue;
    }

    const reason = [
      "Open competitor status resolver found plausible candidates but no safe deterministic 1:1 match",
      `strict_candidate_count=${strictCandidates}`,
      `equivalence_hint_count=${equivalenceHints}`,
    ].join("; ");

    // Ambiguous rows keep competitor_verified = false.
    const update = await pool.query(`
      UPDATE transceivers
      SET competitor_status = 'ambiguous',
          competitor_status_updated_at = NOW(),
          updated_at = NOW()
      WHERE id = $1
        AND COALESCE(competitor_verified, false) = false
        AND COALESCE(competitor_status, 'needs_research') IN ('unknown', 'needs_research')
      RETURNING id
    `, [row.id]);

    if ((update.rowCount ?? 0) === 0) continue;

    await recordVerificationEvidence({
      transceiverId: row.id,
      verificationType: "competitor_ambiguous",
      evidenceValue: {
        reason,
        vendor: row.vendor_name,
        partNumber: row.part_number,
        strictCandidateCount: strictCandidates,
        equivalenceHintCount: equivalenceHints,
      },
      robotName: "verify:open-competitor-status",
      confidence: 1,
    });

    ambiguous++;
  }

  logger.info("Open competitor status resolver complete", {
    no_valid_match: noValidMatch,
    ambiguous,
    fully_verified_earned: fullyVerifiedEarned,
  });
}
|
||||||
|
|
||||||
|
// CLI entry point: runs the resolver only when this file is executed directly.
// The pool is always closed and awaited before the process exits, so the
// shutdown is not cut short by process.exit(). Failures of the resolver and of
// the pool shutdown are logged separately; either one gives exit code 1.
if (require.main === module) {
  void (async () => {
    let failed = false;

    try {
      await resolveOpenCompetitorStatus();
    } catch (err: unknown) {
      failed = true;
      logger.error("Open competitor status resolver failed", {
        // Thrown values are not guaranteed to be Error instances.
        error: err instanceof Error ? err.message : String(err),
      });
    }

    try {
      await pool.end();
    } catch (err: unknown) {
      failed = true;
      logger.error("Open competitor status resolver could not close the database pool", {
        error: err instanceof Error ? err.message : String(err),
      });
    }

    if (failed) process.exit(1);
  })();
}
|
||||||
@ -7,13 +7,14 @@
|
|||||||
*
|
*
|
||||||
* This deliberately does not verify price/image/competitor signals.
|
* This deliberately does not verify price/image/competitor signals.
|
||||||
*/
|
*/
|
||||||
import { pool, checkAndSetFullyVerified } from "./db";
|
import { pool, checkAndSetFullyVerified, recordVerificationEvidence } from "./db";
|
||||||
import { logger } from "./logger";
|
import { logger } from "./logger";
|
||||||
|
|
||||||
const DEFAULT_LIMIT = 5000;
|
const DEFAULT_LIMIT = 5000;
|
||||||
|
|
||||||
interface Candidate {
|
interface Candidate {
|
||||||
id: string;
|
id: string;
|
||||||
|
details_source_url: string | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function verifyCatalogDetails(limit: number): Promise<void> {
|
async function verifyCatalogDetails(limit: number): Promise<void> {
|
||||||
@ -62,11 +63,22 @@ async function verifyCatalogDetails(limit: number): Promise<void> {
|
|||||||
updated_at = NOW()
|
updated_at = NOW()
|
||||||
FROM candidate
|
FROM candidate
|
||||||
WHERE t.id = candidate.id
|
WHERE t.id = candidate.id
|
||||||
RETURNING t.id
|
RETURNING t.id, t.details_source_url
|
||||||
`, [limit]);
|
`, [limit]);
|
||||||
|
|
||||||
let fullyVerifiedEarned = 0;
|
let fullyVerifiedEarned = 0;
|
||||||
for (const row of candidates.rows) {
|
for (const row of candidates.rows) {
|
||||||
|
await recordVerificationEvidence({
|
||||||
|
transceiverId: row.id,
|
||||||
|
verificationType: "details",
|
||||||
|
sourceUrl: row.details_source_url || undefined,
|
||||||
|
evidenceValue: {
|
||||||
|
source: "catalog-derived normalized technical specs",
|
||||||
|
detailsSourceUrl: row.details_source_url,
|
||||||
|
},
|
||||||
|
robotName: "verify:catalog:details",
|
||||||
|
confidence: 1,
|
||||||
|
});
|
||||||
if (await checkAndSetFullyVerified(row.id)) fullyVerifiedEarned++;
|
if (await checkAndSetFullyVerified(row.id)) fullyVerifiedEarned++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
24
sql/104-verification-evidence-ambiguous.sql
Normal file
24
sql/104-verification-evidence-ambiguous.sql
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
-- Migration 104: add competitor_ambiguous evidence type
--
-- Ambiguous competitor status is a deliberate research outcome. It means TIP
-- found one or more plausible candidates, but cannot prove a safe 1:1 match.
-- Store this as evidence instead of leaving products in an endless queue.
--
-- The old check is dropped and the widened check is added in ONE ALTER TABLE
-- statement. If adding the new constraint fails (for example because existing
-- rows violate it), the drop is rolled back with it and the table is never
-- left without a verification_type check, even when this file is run outside
-- an explicit transaction.

ALTER TABLE transceiver_verification_evidence
  DROP CONSTRAINT IF EXISTS transceiver_verification_evidence_verification_type_check,
  ADD CONSTRAINT transceiver_verification_evidence_verification_type_check
    CHECK (
      verification_type::text = ANY (
        ARRAY[
          'price'::varchar,
          'image'::varchar,
          'details'::varchar,
          'competitor_match'::varchar,
          'competitor_no_match'::varchar,
          'competitor_ambiguous'::varchar,
          'artifact_quarantine'::varchar
        ]::text[]
      )
    );
|
||||||
@ -1,9 +1,75 @@
|
|||||||
# Current TIP Sync State
|
# Current TIP Sync State
|
||||||
|
|
||||||
Updated: 2026-05-09 21:33 UTC
|
Updated: 2026-05-09 22:01 UTC
|
||||||
|
|
||||||
## Newest Work
|
## Newest Work
|
||||||
|
|
||||||
|
- TIP open competitor status closure on 2026-05-09:
|
||||||
|
- added migration `sql/104-verification-evidence-ambiguous.sql`
|
||||||
|
- extends `transceiver_verification_evidence.verification_type` with `competitor_ambiguous`
|
||||||
|
- added `packages/scraper/src/utils/resolve-open-competitor-status.ts`
|
||||||
|
- script: `pnpm -C packages/scraper run verify:open-competitor-status`
|
||||||
|
- default is dry-run
|
||||||
|
- apply requires `OPEN_COMPETITOR_APPLY=1`
|
||||||
|
- live Erik run:
|
||||||
|
- dry-run found `365` fully populated products still stuck in `needs_research`
|
||||||
|
- apply result:
|
||||||
|
- `364` set to `ambiguous`
|
||||||
|
- `1` set to `no_valid_match`
|
||||||
|
- `1` additional product earned `fully_verified`
|
||||||
|
- evidence:
|
||||||
|
- `364` `competitor_ambiguous` records from `verify:open-competitor-status`
|
||||||
|
- `1` `competitor_no_match` record from `verify:open-competitor-status`
|
||||||
|
- current fully populated competitor queue:
|
||||||
|
- products with price+image+details and `competitor_status='needs_research'`: `0`
|
||||||
|
- scheduler guard:
|
||||||
|
- updated `maintenance:find-equivalences` so future matcher runs do not reset deliberate `ambiguous` rows back to `needs_research`
|
||||||
|
- rebuilt and restarted `tip-scraper-daemon` after confirming no active pg-boss jobs
|
||||||
|
- live health after closure:
|
||||||
|
- active products: `17305`
|
||||||
|
- price verified: `11414`
|
||||||
|
- image verified: `12016`
|
||||||
|
- details verified: `16705`
|
||||||
|
- fully verified: `10449`
|
||||||
|
- competitor status:
|
||||||
|
- `matched=10775`
|
||||||
|
- `no_valid_match=74`
|
||||||
|
- `ambiguous=556`
|
||||||
|
- `needs_research=5900`
|
||||||
|
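- none of the remaining `needs_research` rows is a fully populated, competitor-ready product; each still has a product-data gap (price, image, details, or core specs) that must be closed before competitor research can start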
|
||||||
|
|
||||||
|
- TIP product-data gap probing on 2026-05-09:
|
||||||
|
- hardened `verify:catalog:details` to write `details` evidence
|
||||||
|
- run result: `113` catalog-derived rows updated, `0` additional fully verified
|
||||||
|
- active Health did not move, indicating those updates were outside the active dashboard base or not counted by the current active filters
|
||||||
|
- GAO Tek detail verifier:
|
||||||
|
- checked remaining `64` GAO product URLs
|
||||||
|
- result: `0` updated, `64` skipped, `0` errors
|
||||||
|
- interpretation: remaining GAO rows lack deterministic public detail evidence; no fake details added
|
||||||
|
- GBICS:
|
||||||
|
- added package script `scrape:gbics`
|
||||||
|
- patched scraper to pass product URLs into `findOrCreateScrapedTransceiver`
|
||||||
|
- live run found `758` products, `0` prices
|
||||||
|
- active GBICS gap remains `64 price / 64 image / 64 details` of `135`
|
||||||
|
- interpretation: GBICS has product discovery, but active old rows and scraped product identifiers do not line up cleanly; needs alias/dedupe hardening plus price selector repair
|
||||||
|
- T&S Communication:
|
||||||
|
- added package script `scrape:tscom`
|
||||||
|
- patched scraper to pass product URLs into `findOrCreateScrapedTransceiver`
|
||||||
|
- live run found `109` unique products, `0` prices
|
||||||
|
- active T&S gap remains `82 price / 82 image / 49 details` of `82`
|
||||||
|
- interpretation: product discovery works, but price selector and existing-row matching need hardening
|
||||||
|
- 10Gtek / SFPcables:
|
||||||
|
- live run found `110` products and wrote `6` prices
|
||||||
|
- active 10Gtek gap remains `126 price / 131 image / 25 details` of `175`
|
||||||
|
- interpretation: parser works partially, but many active 10Gtek rows are unmatched aliases or lack source pages
|
||||||
|
- current largest active product-data gaps:
|
||||||
|
- `Juniper Networks`: `283 price / 394 image / 173 details` of `534`
|
||||||
|
- `Cisco Systems`: `151 price / 351 image / 146 details` of `351`
|
||||||
|
- `GAO Tek`: `456 price / 23 image / 87 details` of `458`
|
||||||
|
- `GBICS`: `64 price / 64 image / 64 details` of `135`
|
||||||
|
- `T&S Communication`: `82 price / 82 image / 49 details` of `82`
|
||||||
|
- `10Gtek`: `126 price / 131 image / 25 details` of `175`
|
||||||
|
|
||||||
- TIP FS.com SKU alias cleanup on 2026-05-09:
|
- TIP FS.com SKU alias cleanup on 2026-05-09:
|
||||||
- added `packages/scraper/src/utils/quarantine-fs-sku-aliases.ts`
|
- added `packages/scraper/src/utils/quarantine-fs-sku-aliases.ts`
|
||||||
- script: `pnpm -C packages/scraper run verify:fs:sku-aliases`
|
- script: `pnpm -C packages/scraper run verify:fs:sku-aliases`
|
||||||
|
|||||||
@ -0,0 +1,93 @@
|
|||||||
|
# 2026-05-09 — TIP Open Competitor Closure + Product Gap Probing
|
||||||
|
|
||||||
|
## Competitor Status Closure
|
||||||
|
|
||||||
|
Added `competitor_ambiguous` as an evidence type via `sql/104-verification-evidence-ambiguous.sql`.
|
||||||
|
|
||||||
|
Added `packages/scraper/src/utils/resolve-open-competitor-status.ts`.
|
||||||
|
|
||||||
|
Runbook:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pnpm -C packages/scraper run verify:open-competitor-status
|
||||||
|
OPEN_COMPETITOR_APPLY=1 pnpm -C packages/scraper run verify:open-competitor-status
|
||||||
|
```
|
||||||
|
|
||||||
|
Live result:
|
||||||
|
|
||||||
|
- dry-run found `365` fully populated products still in `needs_research`
|
||||||
|
- apply wrote:
|
||||||
|
- `364` `ambiguous`
|
||||||
|
- `1` `no_valid_match`
|
||||||
|
- `1` newly `fully_verified`
|
||||||
|
- evidence ledger:
|
||||||
|
- `364` `competitor_ambiguous`
|
||||||
|
- `1` `competitor_no_match`
|
||||||
|
- fully populated products still in competitor `needs_research`: `0`
|
||||||
|
|
||||||
|
Scheduler guard:
|
||||||
|
|
||||||
|
- `maintenance:find-equivalences` now preserves deliberate `ambiguous` states instead of resetting them to `needs_research`
|
||||||
|
- built on Erik successfully
|
||||||
|
- restarted `tip-scraper-daemon` only after the pg-boss active queue was empty
|
||||||
|
|
||||||
|
Live health after closure:
|
||||||
|
|
||||||
|
- active products: `17305`
|
||||||
|
- price verified: `11414`
|
||||||
|
- image verified: `12016`
|
||||||
|
- details verified: `16705`
|
||||||
|
- fully verified: `10449`
|
||||||
|
- competitor status:
|
||||||
|
- `matched=10775`
|
||||||
|
- `no_valid_match=74`
|
||||||
|
- `ambiguous=556`
|
||||||
|
- `needs_research=5900`
|
||||||
|
|
||||||
|
Interpretation:
|
||||||
|
|
||||||
|
- `needs_research` no longer contains fully populated products ready for competitor-only resolution.
|
||||||
|
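- Every remaining `needs_research` row still has a product-data gap (price, image, details, or core specs); that gap must be closed before competitor research can start.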
|
||||||
|
|
||||||
|
## Product Gap Probing
|
||||||
|
|
||||||
|
### Catalog Details
|
||||||
|
|
||||||
|
- hardened `verify:catalog:details` to write detail evidence
|
||||||
|
- live result: `113` catalog-derived rows updated, `0` fully verified earned
|
||||||
|
- active Health did not move, so those rows are not currently improving the active dashboard base
|
||||||
|
|
||||||
|
### GAO Tek
|
||||||
|
|
||||||
|
- ran `scrape:gaotek:details`
|
||||||
|
- checked remaining `64` rows
|
||||||
|
- result: `0` updated, `64` skipped, `0` errors
|
||||||
|
- interpretation: remaining GAO pages lack deterministic public detail evidence
|
||||||
|
|
||||||
|
### GBICS
|
||||||
|
|
||||||
|
- added `scrape:gbics`
|
||||||
|
- patched `gbics.ts` to pass product URLs into DB upsert/detail verification
|
||||||
|
- live run found `758` products and `0` prices
|
||||||
|
- active gap remains `64 price / 64 image / 64 details` of `135`
|
||||||
|
- interpretation: discovery works, but active rows and scraped identifiers need alias/dedupe hardening; price selector also needs repair
|
||||||
|
|
||||||
|
### T&S Communication
|
||||||
|
|
||||||
|
- added `scrape:tscom`
|
||||||
|
- patched `tscom.ts` to pass product URLs into DB upsert/detail verification
|
||||||
|
- live run found `109` unique products and `0` prices
|
||||||
|
- active gap remains `82 price / 82 image / 49 details` of `82`
|
||||||
|
- interpretation: discovery works, but row matching and price selectors need hardening
|
||||||
|
|
||||||
|
### 10Gtek / SFPcables
|
||||||
|
|
||||||
|
- live run found `110` products and wrote `6` prices
|
||||||
|
- active 10Gtek gap remains `126 price / 131 image / 25 details` of `175`
|
||||||
|
- interpretation: parser works partially; remaining rows need alias/source-page repair
|
||||||
|
|
||||||
|
## Next Real Work
|
||||||
|
|
||||||
|
- Build alias/dedupe resolvers for GBICS, T&S, and 10Gtek similar to FS.com numeric SKU alias cleanup.
|
||||||
|
- Repair GBICS/T&S price selectors against current page HTML.
|
||||||
|
- For OEM vendors such as Cisco/Juniper, add explicit non-public-price states instead of pretending OEM list prices exist.
|
||||||
@ -42,3 +42,6 @@
|
|||||||
{"id":"tip-20260509-no-valid-competitor-resolver-002","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Operate TIP crawlers safely on Erik and make every robot run reversible until evidence is checked."},{"role":"user","content":"What is the safe runbook for the no-valid-competitor resolver?"},{"role":"assistant","content":"Run dry-run first with a narrow vendor scope, for example `NO_VALID_MATCH_VENDOR=Flexoptix pnpm -C packages/scraper run verify:no-valid-competitor`. Inspect candidates for normalization mistakes, especially speed, reach, form factor, fiber, wavelength, cable/AOC/breakout artifacts, and category pages. Only apply with `NO_VALID_MATCH_APPLY=1` after the dry-run looks clean. Do not restart scraper daemons while pg-boss jobs are active. After apply, check `/api/health`, evidence counts, and protected review stats, then document the result in `sync/`."}]}
|
{"id":"tip-20260509-no-valid-competitor-resolver-002","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Operate TIP crawlers safely on Erik and make every robot run reversible until evidence is checked."},{"role":"user","content":"What is the safe runbook for the no-valid-competitor resolver?"},{"role":"assistant","content":"Run dry-run first with a narrow vendor scope, for example `NO_VALID_MATCH_VENDOR=Flexoptix pnpm -C packages/scraper run verify:no-valid-competitor`. Inspect candidates for normalization mistakes, especially speed, reach, form factor, fiber, wavelength, cable/AOC/breakout artifacts, and category pages. Only apply with `NO_VALID_MATCH_APPLY=1` after the dry-run looks clean. Do not restart scraper daemons while pg-boss jobs are active. After apply, check `/api/health`, evidence counts, and protected review stats, then document the result in `sync/`."}]}
|
||||||
{"id":"tip-20260509-ambiguous-equivalence-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Ambiguity is a valid safety outcome, not a failure."},{"role":"user","content":"What should TIP do when the matcher finds possible competitor candidates but cannot prove a safe 1:1 equivalence?"},{"role":"assistant","content":"Set `competitor_status=ambiguous` and keep the product out of fully_verified. Do not choose the cheapest or visually similar candidate. Ambiguous means there is enough evidence to reject `no_valid_match`, but not enough evidence to approve `matched`. Typical causes include breakout vs non-breakout, AOC/DAC vs optical module, reach mismatch, MMF vs SMF mismatch, wavelength mismatch, connector/channel mismatch, or too many candidate products in the same broad class. Ambiguous rows need targeted vendor research or stricter extractor normalization."}]}
|
{"id":"tip-20260509-ambiguous-equivalence-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Ambiguity is a valid safety outcome, not a failure."},{"role":"user","content":"What should TIP do when the matcher finds possible competitor candidates but cannot prove a safe 1:1 equivalence?"},{"role":"assistant","content":"Set `competitor_status=ambiguous` and keep the product out of fully_verified. Do not choose the cheapest or visually similar candidate. Ambiguous means there is enough evidence to reject `no_valid_match`, but not enough evidence to approve `matched`. Typical causes include breakout vs non-breakout, AOC/DAC vs optical module, reach mismatch, MMF vs SMF mismatch, wavelength mismatch, connector/channel mismatch, or too many candidate products in the same broad class. Ambiguous rows need targeted vendor research or stricter extractor normalization."}]}
|
||||||
{"id":"tip-20260509-fs-sku-alias-quarantine-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Distinguish vendor marketplace SKUs from real optical product part numbers."},{"role":"user","content":"How should TIP handle FS.com rows like FS-380881 when the same product URL also has OSFP-DR8-1.6T-FL?"},{"role":"assistant","content":"Treat the numeric `FS-#####` row as a marketplace SKU alias, not as an independent transceiver, when the same normalized FS.com product URL already has a canonical non-numeric product P/N row with price, image, and details verified. Quarantine the numeric alias from the active transceiver base, clear verification flags, and write `artifact_quarantine` evidence that points to the canonical row. Keep the real product P/N active. For the 1.6T FS.com case, preserve both `OSFP-DR8-1.6T-FL` (500m) and `OSFP-2FR4-1.6T-FL` (2km), and do not classify `FS-380881` or `FS-380883` as separate products or no-valid-match products."}]}
|
{"id":"tip-20260509-fs-sku-alias-quarantine-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Distinguish vendor marketplace SKUs from real optical product part numbers."},{"role":"user","content":"How should TIP handle FS.com rows like FS-380881 when the same product URL also has OSFP-DR8-1.6T-FL?"},{"role":"assistant","content":"Treat the numeric `FS-#####` row as a marketplace SKU alias, not as an independent transceiver, when the same normalized FS.com product URL already has a canonical non-numeric product P/N row with price, image, and details verified. Quarantine the numeric alias from the active transceiver base, clear verification flags, and write `artifact_quarantine` evidence that points to the canonical row. Keep the real product P/N active. For the 1.6T FS.com case, preserve both `OSFP-DR8-1.6T-FL` (500m) and `OSFP-2FR4-1.6T-FL` (2km), and do not classify `FS-380881` or `FS-380883` as separate products or no-valid-match products."}]}
|
||||||
|
{"id":"tip-20260509-open-competitor-status-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Close research queues with explicit evidence-backed states, not fake green checks."},{"role":"user","content":"What should TIP do with products that have price, image, and details verified but still sit in competitor needs_research?"},{"role":"assistant","content":"Run an open competitor status resolver. If no strict source-backed candidate and no active equivalence hint exists, set `competitor_status=no_valid_match`, set competitor_verified true, write `competitor_no_match` evidence, and recalculate fully_verified. If candidates or equivalence hints exist but no safe deterministic 1:1 match can be proven, set `competitor_status=ambiguous` and write `competitor_ambiguous` evidence. Do not leave fully populated rows in endless `needs_research`, and do not auto-match when the candidate set is broad or technically risky."}]}
|
||||||
|
{"id":"tip-20260509-store-scraper-gap-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Treat scraper runs that find products but do not improve active verification as diagnostic evidence."},{"role":"user","content":"GBICS and T&S scrapers found many products but active TIP verification counts did not improve. What does that mean?"},{"role":"assistant","content":"It means the crawler is not simply dead; the remaining problem is likely row identity alignment, alias/dedupe, or price/image selector drift. For GBICS, discovery found 758 products and zero prices, while active gaps stayed at 64 price/image/details rows. For T&S, discovery found 109 unique products and zero prices, while active gaps stayed at 82 price/image and 49 details rows. Next steps are to repair price selectors and build alias/dedupe resolvers that map scraped product names/URLs to existing active rows before claiming verification progress."}]}
|
||||||
|
{"id":"tip-20260509-quote-only-oem-pricing-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Never invent prices for OEM or quote-only sources."},{"role":"user","content":"Large OEM vendors such as Cisco and Juniper have many rows without public prices. Should TIP mark those prices verified?"},{"role":"assistant","content":"No. OEM catalog rows can be detail-verified when public documentation provides technical specs, but price_verified requires a real public price observation or a deliberately modeled non-public-price state. Do not scrape estimates, reseller guesses, or competitor prices into OEM price_verified. If the source is quote-only or no public price exists, record that as a separate state and keep price_verified false until the data model supports it."}]}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user