fix: automate equivalence research review queue

2026-05-09 07:48:11 +02:00 · 2026-05-09 07:48:11 +02:00 · 43b7250180
commit 43b7250180
parent ef225c7dc5
4 changed files with 507 additions and 51 deletions
--- a/packages/api/src/routes/review.ts
+++ b/packages/api/src/routes/review.ts
@ -7,6 +7,7 @@
 * POST /api/review/equivalences/:id/reject  — reject with optional reason
 * PATCH /api/review/equivalences/:id        — edit match_notes
 * POST /api/review/run-matcher             — trigger equivalence job immediately
+ * POST /api/review/run-research            — trigger equivalence research job immediately
 */
 import { Router, Request, Response } from "express";
 import { pool } from "../db/client";
@ -27,6 +28,17 @@ async function checkAndSetFullyVerified(transceiverId: string): Promise<boolean>
  return (result.rowCount ?? 0) > 0;
 }

+async function queueMaintenanceJob(name: string): Promise<void> {
+  await pool.query(
+    `
+    INSERT INTO pgboss.job (name, data, priority)
+    VALUES ($1, '{}', 0)
+    ON CONFLICT DO NOTHING
+  `,
+    [name]
+  );
+}
+
 export const reviewRouter = Router();

 // ── GET /api/review/equivalences ──────────────────────────────────────────────
@ -51,7 +63,7 @@ reviewRouter.get("/equivalences", async (req: Request, res: Response) => {
    params = [limit, offset];
    limitIdx = 1; offsetIdx = 2;
  } else if (status === "needs_research") {
-    where = `WHERE eq.status IN ('approved','auto_approved') AND eq.re_research_due_at IS NOT NULL AND eq.re_research_due_at <= NOW()`;
+    where = `WHERE eq.status IN ('pending','approved','auto_approved') AND eq.re_research_due_at IS NOT NULL AND eq.re_research_due_at <= NOW()`;
    params = [limit, offset];
    limitIdx = 1; offsetIdx = 2;
  } else {
@ -143,7 +155,7 @@ reviewRouter.get("/equivalences/stats", async (_req: Request, res: Response) =>
      SUM(CASE WHEN status = 'approved'      THEN 1 ELSE 0 END) AS approved,
      SUM(CASE WHEN status = 'auto_approved' THEN 1 ELSE 0 END) AS auto_approved,
      SUM(CASE WHEN status = 'rejected'      THEN 1 ELSE 0 END) AS rejected,
-      SUM(CASE WHEN status IN ('approved','auto_approved')
+      SUM(CASE WHEN status IN ('pending','approved','auto_approved')
               AND re_research_due_at IS NOT NULL
               AND re_research_due_at <= NOW()        THEN 1 ELSE 0 END) AS needs_research,
      COUNT(*)                                                            AS total
@ -254,9 +266,8 @@ reviewRouter.patch("/equivalences/:id", async (req: Request, res: Response) => {
 });

 // ── POST /api/review/equivalences/approve-all ─────────────────────────────────
-// Approve ALL pending equivalences regardless of confidence.
-// Low-confidence ones (< 0.73) get re_research_due_at = NOW() so the nightly
-// re-research job will re-verify them one by one.
+// Approve only pending equivalences with confidence >= RE_RESEARCH_THRESHOLD. Weaker
+// candidates stay pending and are queued for automated research instead of being approved.
 reviewRouter.post("/equivalences/approve-all", async (req: Request, res: Response) => {
  const reviewer = (req.body as { reviewer?: string }).reviewer || "approve-all";
  const RE_RESEARCH_THRESHOLD = 0.73;
@ -271,15 +282,31 @@ reviewRouter.post("/equivalences/approve-all", async (req: Request, res: Respons

  for (const row of candidates.rows) {
    const needsReSearch = parseFloat(row.confidence) < RE_RESEARCH_THRESHOLD;
+    if (needsReSearch) {
+      await pool.query(`
+        UPDATE transceiver_equivalences
+        SET status             = 'pending',
+            reviewed_by        = $2,
+            reviewed_at        = NULL,
+            re_research_due_at = NOW(),
+            re_researched_at   = NULL,
+            match_notes        = CONCAT(COALESCE(match_notes, ''), E'\n[Automated research queued ' || NOW()::date || ': confidence below approval threshold]')
+        WHERE id = $1
+      `, [row.id, reviewer]);
+
+      scheduledReSearch++;
+      continue;
+    }
+
    await pool.query(`
      UPDATE transceiver_equivalences
      SET status             = 'approved',
          reviewed_by        = $2,
          reviewed_at        = NOW(),
-          re_research_due_at = $3,
+          re_research_due_at = NULL,
          re_researched_at   = NULL
      WHERE id = $1
-    `, [row.id, reviewer, needsReSearch ? new Date() : null]);
+    `, [row.id, reviewer]);

    await pool.query(`
      UPDATE transceivers
@ -289,11 +316,20 @@ reviewRouter.post("/equivalences/approve-all", async (req: Request, res: Respons

    const earned = await checkAndSetFullyVerified(row.flexoptix_id);
    if (earned) fullyVerified++;
-    if (needsReSearch) scheduledReSearch++;
    approved++;
  }

-  res.json({ success: true, approved, fully_verified_earned: fullyVerified, scheduled_re_research: scheduledReSearch });
+  if (scheduledReSearch > 0) {
+    await queueMaintenanceJob("maintenance:re-research-equivalences");
+  }
+
+  res.json({
+    success: true,
+    approved,
+    fully_verified_earned: fullyVerified,
+    scheduled_re_research: scheduledReSearch,
+    left_pending: scheduledReSearch,
+  });
 });

 // ── POST /api/review/equivalences/bulk-approve ────────────────────────────────
@ -315,7 +351,11 @@ reviewRouter.post("/equivalences/bulk-approve", async (req: Request, res: Respon
  for (const row of candidates.rows) {
    await pool.query(`
      UPDATE transceiver_equivalences
-      SET status = 'approved', reviewed_by = $2, reviewed_at = NOW()
+      SET status = 'approved',
+          reviewed_by = $2,
+          reviewed_at = NOW(),
+          re_research_due_at = NULL,
+          re_researched_at = NULL
      WHERE id = $1
    `, [row.id, reviewer]);

@ -336,13 +376,15 @@ reviewRouter.post("/equivalences/bulk-approve", async (req: Request, res: Respon
 // ── POST /api/review/run-matcher ──────────────────────────────────────────────
 // Trigger the equivalence matcher immediately (admin action)
 reviewRouter.post("/run-matcher", async (_req: Request, res: Response) => {
-  // Queue the job via pg-boss — import from scraper's db util won't work here,
-  // so we fire directly via DB insert into pg-boss queue
-  await pool.query(`
-    INSERT INTO pgboss.job (name, data, priority)
-    VALUES ('maintenance:find-equivalences', '{}', 0)
-    ON CONFLICT DO NOTHING
-  `);
+  await queueMaintenanceJob("maintenance:find-equivalences");

  res.json({ success: true, message: "Equivalence matcher queued" });
 });
+
+// ── POST /api/review/run-research ────────────────────────────────────────────
+// Trigger the automated equivalence research worker immediately.
+reviewRouter.post("/run-research", async (_req: Request, res: Response) => {
+  await queueMaintenanceJob("maintenance:re-research-equivalences");
+
+  res.json({ success: true, message: "Equivalence research queued" });
+});
--- a/packages/scraper/src/scheduler.ts
+++ b/packages/scraper/src/scheduler.ts
@ -44,6 +44,181 @@ config({ path: join(__dirname, "..", "..", "..", ".env") });

 const connectionString = `postgres://${process.env.POSTGRES_USER || "tip"}:${process.env.POSTGRES_PASSWORD || "tip_dev_2026"}@${process.env.POSTGRES_HOST || "localhost"}:${process.env.POSTGRES_PORT || "5433"}/${process.env.POSTGRES_DB || "transceiver_db"}`;

+type EquivalenceProduct = {
+  part_number?: string | null;
+  standard_name?: string | null;
+  form_factor?: string | null;
+  speed_gbps?: number | string | null;
+  fiber_type?: string | null;
+  reach_meters?: number | string | null;
+  wavelengths?: string | null;
+  connector?: string | null;
+};
+
+type EquivalenceResearchResult = {
+  decision: "approve" | "reject";
+  confidence: number;
+  basis: string[];
+  reasons: string[];
+  rejectReason?: string;
+};
+
+function normalizeEquivalenceText(value: unknown): string | null {
+  if (value === null || value === undefined) return null;
+  const text = String(value).trim().toUpperCase();
+  return text.length > 0 ? text : null;
+}
+
+function numericEquivalenceValue(value: unknown): number | null {
+  if (value === null || value === undefined || value === "") return null;
+  const parsed = Number(value);
+  return Number.isFinite(parsed) ? parsed : null;
+}
+
+function extractPrimaryNm(wavelengths: unknown): number | null {
+  if (!wavelengths) return null;
+  const match = String(wavelengths).match(/(\d{3,4})/);
+  return match ? parseInt(match[1], 10) : null;
+}
+
+function evaluateEquivalenceResearch(
+  fx: EquivalenceProduct,
+  cp: EquivalenceProduct,
+  hasRecentPrice: boolean,
+): EquivalenceResearchResult {
+  const basis: string[] = [];
+  const reasons: string[] = [];
+  let score = 0;
+
+  const fxForm = normalizeEquivalenceText(fx.form_factor);
+  const cpForm = normalizeEquivalenceText(cp.form_factor);
+  const fxSpeed = numericEquivalenceValue(fx.speed_gbps);
+  const cpSpeed = numericEquivalenceValue(cp.speed_gbps);
+  const fxStandard = normalizeEquivalenceText(fx.standard_name);
+  const cpStandard = normalizeEquivalenceText(cp.standard_name);
+  const fxFiber = normalizeEquivalenceText(fx.fiber_type);
+  const cpFiber = normalizeEquivalenceText(cp.fiber_type);
+  const fxReach = numericEquivalenceValue(fx.reach_meters);
+  const cpReach = numericEquivalenceValue(cp.reach_meters);
+  const fxNm = extractPrimaryNm(fx.wavelengths);
+  const cpNm = extractPrimaryNm(cp.wavelengths);
+
+  if (!hasRecentPrice) {
+    reasons.push("no recent competitor price observation");
+    return {
+      decision: "reject",
+      confidence: 0,
+      basis,
+      reasons,
+      rejectReason: "automated research: competitor has no recent price observation",
+    };
+  }
+
+  if (fxForm && cpForm && fxForm === cpForm) {
+    score += 25;
+    basis.push("form_factor");
+  } else {
+    reasons.push("form factor mismatch or missing");
+  }
+
+  if (fxSpeed !== null && cpSpeed !== null && fxSpeed === cpSpeed) {
+    score += 20;
+    basis.push("speed_gbps");
+  } else {
+    reasons.push("speed mismatch or missing");
+  }
+
+  if (fxStandard && cpStandard && fxStandard === cpStandard) {
+    score += 30;
+    basis.push("standard_name");
+  } else {
+    reasons.push("standard name not identical");
+  }
+
+  if (fxNm !== null && cpNm !== null) {
+    if (Math.abs(fxNm - cpNm) <= 15) {
+      score += 20;
+      basis.push(`wavelength_${fxNm}nm`);
+    } else {
+      reasons.push(`wavelength mismatch ${fxNm}nm vs ${cpNm}nm`);
+      score -= 20;
+    }
+  } else {
+    reasons.push("wavelength missing");
+  }
+
+  if (fxFiber && cpFiber) {
+    if (fxFiber === cpFiber) {
+      score += 10;
+      basis.push("fiber_type");
+    } else {
+      reasons.push(`fiber mismatch ${fxFiber} vs ${cpFiber}`);
+      score -= 15;
+    }
+  } else {
+    reasons.push("fiber type missing");
+  }
+
+  if (fxReach !== null && cpReach !== null && fxReach > 0 && cpReach > 0) {
+    const ratio = Math.min(fxReach, cpReach) / Math.max(fxReach, cpReach);
+    if (ratio >= 0.85) {
+      score += 10;
+      basis.push("reach");
+    } else {
+      reasons.push(`reach mismatch ${fxReach}m vs ${cpReach}m`);
+      score -= 15;
+    }
+  } else {
+    reasons.push("reach missing");
+  }
+
+  const confidence = Math.max(0, Math.min(1, score / 115));
+  const criticalMismatch = reasons.some((reason) =>
+    reason.startsWith("wavelength mismatch") ||
+    reason.startsWith("fiber mismatch") ||
+    reason.startsWith("reach mismatch") ||
+    reason.startsWith("form factor mismatch") ||
+    reason.startsWith("speed mismatch")
+  );
+  const missingCriticalEvidence = reasons.some((reason) =>
+    reason === "wavelength missing" ||
+    reason === "fiber type missing" ||
+    reason === "reach missing"
+  );
+
+  if (criticalMismatch) {
+    return {
+      decision: "reject",
+      confidence,
+      basis,
+      reasons,
+      rejectReason: `automated research: technical mismatch (${reasons.join("; ")})`,
+    };
+  }
+
+  if (missingCriticalEvidence) {
+    return {
+      decision: "reject",
+      confidence,
+      basis,
+      reasons,
+      rejectReason: `automated research: insufficient technical evidence (${reasons.join("; ")})`,
+    };
+  }
+
+  if (confidence >= 0.73) {
+    return { decision: "approve", confidence, basis, reasons };
+  }
+
+  return {
+    decision: "reject",
+    confidence,
+    basis,
+    reasons,
+    rejectReason: `automated research: confidence ${confidence.toFixed(3)} below approval threshold`,
+  };
+}
+
 export async function createScheduler(): Promise<PgBoss> {
  const boss = new PgBoss({
    connectionString,
@ -2667,52 +2842,105 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
    }
  });

-  // ── Re-research approved equivalences ────────────────────────────────────────
-  // Processes up to 200 approved equivalences per day that have re_research_due_at <= NOW().
-  // Re-runs the confidence check: if competitor still has recent prices and specs still match,
-  // the approval is confirmed (re_researched_at = NOW(), next check in 30 days).
-  // If confidence drops or competitor has no recent price: reverts to pending.
+  // ── Re-research equivalences ────────────────────────────────────────────────
+  // Confirms only well-evidenced matches. Weak, stale, incomplete, or technically
+  // contradictory matches are rejected automatically instead of going back to a
+  // manual queue.
  await boss.work("maintenance:re-research-equivalences", async () => {
    const { pool } = await import("./utils/db");
    const ts = new Date().toISOString();
-    console.log(`[${ts}] Running: Re-research approved equivalences`);
+    const batchLimit = Math.max(1, Math.min(10000, parseInt(process.env["EQUIVALENCE_RESEARCH_BATCH_LIMIT"] || "2000", 10)));
+    console.log(`[${ts}] Running: Equivalence automated research`);

    const batch = await pool.query(`
-      SELECT eq.id, eq.flexoptix_id, eq.competitor_id, eq.confidence,
-             fx.form_factor, fx.speed_gbps, fx.standard_name, fx.fiber_type,
-             fx.reach_meters, fx.wavelengths
+      SELECT eq.id, eq.flexoptix_id, eq.competitor_id, eq.status, eq.confidence,
+             fx.part_number AS fx_part_number,
+             fx.form_factor AS fx_form_factor,
+             fx.speed_gbps AS fx_speed_gbps,
+             fx.standard_name AS fx_standard_name,
+             fx.fiber_type AS fx_fiber_type,
+             fx.reach_meters AS fx_reach_meters,
+             fx.wavelengths AS fx_wavelengths,
+             fx.connector AS fx_connector,
+             cp.part_number AS cp_part_number,
+             cp.form_factor AS cp_form_factor,
+             cp.speed_gbps AS cp_speed_gbps,
+             cp.standard_name AS cp_standard_name,
+             cp.fiber_type AS cp_fiber_type,
+             cp.reach_meters AS cp_reach_meters,
+             cp.wavelengths AS cp_wavelengths,
+             cp.connector AS cp_connector,
+             cpv.name AS competitor_vendor,
+             (
+               SELECT COUNT(*)
+               FROM price_observations po
+               WHERE po.transceiver_id = eq.competitor_id
+                 AND po.time > NOW() - INTERVAL '45 days'
+             ) AS recent_price_count
      FROM transceiver_equivalences eq
      JOIN transceivers fx ON eq.flexoptix_id = fx.id
-      WHERE eq.status IN ('approved', 'auto_approved')
+      JOIN transceivers cp ON eq.competitor_id = cp.id
+      JOIN vendors cpv ON cpv.id = cp.vendor_id
+      WHERE eq.status IN ('pending', 'approved', 'auto_approved')
        AND eq.re_research_due_at IS NOT NULL
        AND eq.re_research_due_at <= NOW()
      ORDER BY eq.re_research_due_at ASC
-      LIMIT 200
-    `);
+      LIMIT $1
+    `, [batchLimit]);

    let confirmed = 0;
-    let reverted = 0;
+    let rejected = 0;

    for (const eq of batch.rows) {
-      // Check if competitor still has a recent price observation
-      const priceCheck = await pool.query(`
-        SELECT COUNT(*) AS cnt
-        FROM price_observations
-        WHERE transceiver_id = $1 AND time > NOW() - INTERVAL '45 days'
-      `, [eq.competitor_id]);
+      const research = evaluateEquivalenceResearch(
+        {
+          part_number: eq.fx_part_number,
+          form_factor: eq.fx_form_factor,
+          speed_gbps: eq.fx_speed_gbps,
+          standard_name: eq.fx_standard_name,
+          fiber_type: eq.fx_fiber_type,
+          reach_meters: eq.fx_reach_meters,
+          wavelengths: eq.fx_wavelengths,
+          connector: eq.fx_connector,
+        },
+        {
+          part_number: eq.cp_part_number,
+          form_factor: eq.cp_form_factor,
+          speed_gbps: eq.cp_speed_gbps,
+          standard_name: eq.cp_standard_name,
+          fiber_type: eq.cp_fiber_type,
+          reach_meters: eq.cp_reach_meters,
+          wavelengths: eq.cp_wavelengths,
+          connector: eq.cp_connector,
+        },
+        parseInt(eq.recent_price_count, 10) > 0,
+      );

-      const hasRecentPrice = parseInt(priceCheck.rows[0].cnt, 10) > 0;
-
-      if (!hasRecentPrice) {
-        // Competitor no longer carries this — revert to pending for manual review
+      if (research.decision === "reject") {
        await pool.query(`
          UPDATE transceiver_equivalences
-          SET status = 'pending', re_research_due_at = NULL, re_researched_at = NULL,
-              match_notes = CONCAT(match_notes, E'\n[Re-research ' || NOW()::date || ': no recent price — reverted to pending]')
+          SET status = 'rejected',
+              confidence = $2,
+              match_basis = $3,
+              reject_reason = $4,
+              reviewed_by = 'automated-research',
+              reviewed_at = NOW(),
+              re_research_due_at = NULL,
+              re_researched_at = NOW(),
+              match_notes = CONCAT(
+                COALESCE(match_notes, ''),
+                E'\n[Automated research ' || NOW()::date || ': rejected; ' || $5 || ']'
+              ),
+              updated_at = NOW()
          WHERE id = $1
-        `, [eq.id]);
+        `, [
+          eq.id,
+          research.confidence,
+          research.basis,
+          research.rejectReason || "automated research: rejected",
+          research.reasons.join("; "),
+        ]);

-        // Reset competitor_verified if no other approved equivalence covers this transceiver
        await pool.query(`
          UPDATE transceivers
          SET competitor_verified = false, competitor_verified_at = NULL,
@ -2726,20 +2954,51 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
            )
        `, [eq.flexoptix_id, eq.id]);

-        reverted++;
+        rejected++;
      } else {
-        // Still valid — confirm and schedule next re-research in 30 days
        await pool.query(`
          UPDATE transceiver_equivalences
-          SET re_researched_at   = NOW(),
-              re_research_due_at = NOW() + INTERVAL '30 days'
+          SET status = CASE WHEN status = 'pending' THEN 'auto_approved' ELSE status END,
+              confidence = $2,
+              match_basis = $3,
+              reviewed_by = COALESCE(reviewed_by, 'automated-research'),
+              reviewed_at = COALESCE(reviewed_at, NOW()),
+              reject_reason = NULL,
+              re_researched_at = NOW(),
+              re_research_due_at = NOW() + INTERVAL '30 days',
+              match_notes = CONCAT(
+                COALESCE(match_notes, ''),
+                E'\n[Automated research ' || NOW()::date || ': confirmed; basis: ' || $4 || ']'
+              ),
+              updated_at = NOW()
          WHERE id = $1
-        `, [eq.id]);
+        `, [eq.id, research.confidence, research.basis, research.basis.join(", ")]);
+
+        await pool.query(`
+          UPDATE transceivers
+          SET competitor_verified = true,
+              competitor_verified_at = COALESCE(competitor_verified_at, NOW())
+          WHERE id = $1 AND competitor_verified = false
+        `, [eq.flexoptix_id]);
+
        confirmed++;
      }
    }

-    console.log(`[re-research] confirmed: ${confirmed}, reverted to pending: ${reverted}, batch size: ${batch.rows.length}`);
+    if (confirmed > 0) {
+      await pool.query(`
+        UPDATE transceivers
+        SET fully_verified = true,
+            fully_verified_at = COALESCE(fully_verified_at, NOW())
+        WHERE competitor_verified = true
+          AND price_verified = true
+          AND image_verified = true
+          AND details_verified = true
+          AND fully_verified = false
+      `);
+    }
+
+    console.log(`[equivalence-research] confirmed: ${confirmed}, rejected: ${rejected}, batch size: ${batch.rows.length}`);
  });

  // ══════════════════════════════════════════════════════════════════════
--- a/sync/CURRENT.md
+++ b/sync/CURRENT.md
@ -1,9 +1,66 @@
 # Current TIP Sync State

-Updated: 2026-05-09 03:15 UTC
+Updated: 2026-05-09 05:45 UTC

 ## Newest Work

+- TIP automated equivalence research / manual queue cleanup completed on 2026-05-09:
+  - operator intent:
+    - products should be researched well enough that they do not need manual equivalence validation
+    - Erik must not be stressed by crawler-heavy work
+    - TIPLLM-only policy for crawler/robot research remains in force
+  - root cause found:
+    - `approve-all` approved low-confidence equivalences and only marked them for later re-research
+    - the re-research worker only checked whether a competitor still had a recent price
+    - it did not re-evaluate hard technical equivalence evidence such as reach, wavelength, fiber type, speed and form factor
+  - code changed:
+    - `packages/api/src/routes/review.ts`
+      - `approve-all` now approves only confidence >= `0.73`
+      - weak pending rows stay pending and are queued for automated research instead of being marked approved
+      - `needs_research` stats/listing now includes pending research rows
+      - added `POST /api/review/run-research`
+    - `packages/scraper/src/scheduler.ts`
+      - added deterministic equivalence research evaluator
+      - rejects stale, technically contradictory, incomplete, or low-confidence matches automatically
+      - confirms only matches with a recent price, matching form factor, speed, fiber type, wavelength and reach, and confidence >= `0.73`
+      - confirmed matches are scheduled for a 30-day recheck
+  - live deployment:
+    - synced changed files to Erik `/opt/tip`
+    - `pnpm -C packages/api build` passed on Erik
+    - `pnpm -C packages/scraper build` passed on Erik
+    - restarted `tip-api` and `tip-scraper-daemon`
+    - both processes are online
+  - data cleanup performed on live DB without heavy crawling:
+    - pending + due re-research candidates processed: `144103`
+      - rejected fiber mismatch: `958`
+      - rejected reach mismatch: `82128`
+      - rejected missing reach evidence: `31151`
+      - rejected wavelength mismatch: `29865`
+      - rejected low confidence: `1`
+    - old approved rows audited:
+      - kept/confirmed: `1986`
+      - rejected: `4000`
+    - old auto-approved rows audited:
+      - kept/confirmed: `32080`
+      - rejected reach mismatch: `260`
+  - final live equivalence status:
+    - `pending`: `0`
+    - `approved`: `1986`
+    - `auto_approved`: `32080`
+    - `rejected`: `148367`
+    - due re-research now: `0`
+    - scheduled 30-day rechecks: `34066`
+  - final verification counters after reconcile:
+    - `competitor_verified`: `11137`
+    - `fully_verified`: `290`
+    - `price_verified`: `11549`
+    - `image_verified`: `10629`
+    - `details_verified`: `9538`
+  - operational note:
+    - no new crawler wave was started for this cleanup
+    - the run used existing crawled specs/prices and strict deterministic product-evidence checks
+    - next improvement should be targeted crawler enrichment for products rejected due to missing reach/details, preferably on Proxmox/Pi workers rather than Erik
+
 - TIP Flexoptix + FS.com price/image revalidation completed on 2026-05-09:
  - live root cause:
    - scraper runs had set `transceivers.price_verified`, but `price_observations.is_verified` stayed false
--- a/sync/history/2026-05-09-tip-equivalence-auto-research.md
+++ b/sync/history/2026-05-09-tip-equivalence-auto-research.md
@ -0,0 +1,98 @@
+# TIP Equivalence Automated Research
+
+Date: 2026-05-09
+
+## Goal
+
+Remove manual equivalence validation as a required workflow for TIP product verification. Low-confidence matches should be researched and either confirmed or rejected automatically.
+
+## Findings
+
+- The dashboard had a large `Approved + Re-Research` backlog.
+- `approve-all` was marking low-confidence rows approved, then setting `re_research_due_at`.
+- The re-research worker only checked whether the competitor still had a recent price; it did not re-check technical equivalence quality.
+- Many low-confidence rows were objectively bad matches:
+  - reach mismatches
+  - wavelength mismatches
+  - missing reach evidence
+  - fiber mismatches
+
+## Code Changes
+
+- `packages/api/src/routes/review.ts`
+  - `approve-all` now approves only confidence >= `0.73`.
+  - Weak rows stay pending and get queued for automated research.
+  - `needs_research` includes pending research rows.
+  - Added `POST /api/review/run-research`.
+
+- `packages/scraper/src/scheduler.ts`
+  - Added deterministic equivalence evaluator.
+  - Confirms matches only when there is:
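+    - recent competitor price (at least one observation in the last 45 days)
+    - matching form factor
+    - matching speed
+    - matching fiber type
+    - matching wavelength (primary wavelengths within 15 nm)
+    - compatible reach (shorter/longer reach ratio >= `0.85`)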
+    - confidence >= `0.73`
+  - Rejects stale, incomplete, contradictory, or low-confidence matches automatically.
+  - Confirmed matches get a 30-day recheck.
+
+## Deployment
+
+- Synced code to Erik `/opt/tip`.
+- Built on Erik:
+  - `pnpm -C packages/api build`
+  - `pnpm -C packages/scraper build`
+- Restarted:
+  - `tip-api`
+  - `tip-scraper-daemon`
+- Both were online after restart.
+
+## Live Data Cleanup
+
+No heavy crawler wave was started. Cleanup used existing crawled specs and price observations.
+
+Processed pending + due re-research:
+
+- total: `144103`
+- rejected fiber mismatch: `958`
+- rejected reach mismatch: `82128`
+- rejected missing reach evidence: `31151`
+- rejected wavelength mismatch: `29865`
+- rejected low confidence: `1`
+
+Processed old approved rows:
+
+- confirmed: `1986`
+- rejected fiber mismatch: `184`
+- rejected reach mismatch: `1704`
+- rejected missing reach evidence: `1117`
+- rejected wavelength mismatch: `993`
+- rejected low confidence: `2`
+
+Processed old auto-approved rows:
+
+- confirmed: `32080`
+- rejected reach mismatch: `260`
+
+## Final State
+
+- pending: `0`
+- approved: `1986`
+- auto_approved: `32080`
+- rejected: `148367`
+- due re-research now: `0`
+- scheduled 30-day rechecks: `34066`
+
+Product verification counters after reconcile:
+
+- competitor_verified: `11137`
+- fully_verified: `290`
+- price_verified: `11549`
+- image_verified: `10629`
+- details_verified: `9538`
+
+## Next Work
+
+Products rejected for missing reach/details should be enriched by targeted vendor crawlers. Keep Erik light; use Proxmox/Pi workers for heavier crawl waves. TIPLLM-only policy remains active for crawler/robot research and learning records.