fix: resolve TS build errors — export backfillImages, add writeRobotExperience

- backfill-images.ts: rename main() → export backfillImages() to match index.ts import - training-data-writer.ts: add writeRobotExperience export; remove hardcoded Gitea token - fiber24.ts/fibermall.ts: scraper improvements from previous sessions - image-downloader.ts/spec-updater.ts: utility updates - robots/: add verification robots module
2026-05-06 23:39:00 +02:00 · 2026-05-06 23:39:00 +02:00 · a8529d166b
commit a8529d166b
parent 5a77fce9f3
8 changed files with 635 additions and 35 deletions
--- a/CHANGELOG_PENDING.md
+++ b/CHANGELOG_PENDING.md
@ -281,3 +281,4 @@ Types: FEAT · FIX · UI · DATA · AI · INFRA
 {"d":"2026-04-26","t":"FEAT","m":"tip-llm-guided.ts: Structured inference engine for tip-llm-v1. Hard JSON schema, per-field validation, 2-retry repair loop with diff prompt, safe default fallback (create_finding=false). Temperature 0.1→0.05 on retry. Routes: POST /api/tip-llm/infer|research-plan|extract|finding, GET /api/tip-llm/health."}
 {"d":"2026-04-28","t":"FIX","m":"Product verification pipeline: image crawls now mark image_verified/image_verified_url, scraped product pages mark details_verified/details_source_url, maintenance reconcile backfills old product URLs/images/details, and --backfill-images exposes the existing image crawler via scraper CLI. Migration 102 reconciles existing data."}
 {"d":"2026-04-28","t":"FIX","m":"Blog Engine Hot Topics: diversified ranking with refresh shuffle/source caps/already-created-topic demotion, plus richer LLM context briefings passed into topic expansion and master-draft context via custom_title/additional_context."}
 {"d":"2026-04-29","t":"FEAT","m":"TIPLLM robot learning loop: verification robot controller writes status, TIPLLM plans, queue dry-runs/enqueues and crawler outcomes into the Gitea-backed TIP training pool; learning-pool build imports qa-pairs from TIP_TRAINING_REPO into the tip_llm lane. Removed hardcoded Gitea token fallback; existing git remotes or env tokens are used."}
--- a/packages/scraper/src/crawler-llm/training-data-writer.ts
+++ b/packages/scraper/src/crawler-llm/training-data-writer.ts
@ -16,7 +16,7 @@
 */
 import { execSync } from "child_process";
-import { appendFileSync, mkdirSync, existsSync } from "fs";
+import { appendFileSync, mkdirSync, existsSync, writeFileSync } from "fs";
 import { join } from "path";
 import { createHash } from "crypto";
@ -28,8 +28,9 @@ import type { CombinedValidationResult } from "./spec-validator";
 // ─────────────────────────────────────────────────────────────────────────────
 const REPO_DIR = process.env.TIP_TRAINING_REPO || "/tmp/tip-training-data";
-const GITEA_TOKEN = process.env.GITEA_TOKEN || "0e758f30abf86ffb49b2d7bb5b1f0be12c7f0b46";
+const GITEA_TOKEN = process.env.GITEA_TOKEN || "";
-const GITEA_BASE = "http://192.168.178.196:3000";
+const GITEA_BASE = process.env.GITEA_BASE || "http://192.168.178.196:3000";
 const GITEA_REPO = process.env.TIP_TRAINING_GITEA_REPO || "rene/tip-training-data";
 // Minimum confidence for a spec to enter the high-quality training set
 const MIN_CONFIDENCE_HIGH = 0.75;
@ -67,6 +68,20 @@ export interface CrawlExtraction {
  crawled_at: string;          // ISO timestamp
 }
 export interface RobotExperience {
  event: "status" | "tipllm_plan" | "queue_plan" | "queue_enqueued" | "queue_rejected" | "crawler_result";
  observed_at: string;
  actor: string;
  profile?: string;
  wave?: string;
  vendor?: string;
  summary: string;
  input: Record<string, unknown>;
  decision: Record<string, unknown>;
  outcome?: Record<string, unknown>;
  safety_notes?: string[];
 }
 // ─────────────────────────────────────────────────────────────────────────────
 // Helpers
 // ─────────────────────────────────────────────────────────────────────────────
@ -94,7 +109,39 @@ function ensureDir(dir: string): void {
  if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
 }
 function ensureTrainingRepoLayout(): void {
  ensureDir(REPO_DIR);
  ensureDir(join(REPO_DIR, "qa-pairs"));
  ensureDir(join(REPO_DIR, "robot-experiences"));
  ensureDir(join(REPO_DIR, "crawl-extractions"));
  ensureDir(join(REPO_DIR, "validated-specs"));
  const readme = join(REPO_DIR, "README.md");
  if (!existsSync(readme)) {
    writeFileSync(
      readme,
      [
        "# TIP Training Data Pool",
        "",
        "This repository stores TIPLLM learning records produced by TIP crawlers, scrapers and verification robots.",
        "",
        "Primary lanes:",
        "",
        "- `qa-pairs/`: supervised fine-tuning JSONL records with chat messages.",
        "- `robot-experiences/`: raw operational experience logs for audit and future dataset builders.",
        "- `crawl-extractions/`: crawler extraction audit logs.",
        "- `validated-specs/`: validated structured transceiver specifications.",
        "",
        "Only TIPLLM should be used for crawler planning and extraction feedback in this lane.",
        "",
      ].join("\n"),
      "utf8",
    );
  }
 }
 function appendRecord(filePath: string, record: SftRecord): void {
  ensureTrainingRepoLayout();
  appendFileSync(filePath, JSON.stringify(record) + "\n", "utf8");
 }
@ -216,6 +263,7 @@ function getOutputFile(type: "spec_qa" | "crawl_reasoning" | "validation" | "dis
 }
 function getRawExtractionFile(vendorSlug: string): string {
  ensureTrainingRepoLayout();
  const dir = join(REPO_DIR, "crawl-extractions", vendorSlug);
  ensureDir(dir);
  const date = new Date().toISOString().split("T")[0];
@ -223,6 +271,7 @@ function getRawExtractionFile(vendorSlug: string): string {
 }
 function getValidatedSpecFile(tier: string): string {
  ensureTrainingRepoLayout();
  const dir = join(REPO_DIR, "validated-specs");
  ensureDir(dir);
  return join(dir, `${tier}.jsonl`);
@ -251,12 +300,17 @@ function gitCommit(message: string): void {
 }
 function gitPush(): void {
-  const remote = `http://rene:${GITEA_TOKEN}@${GITEA_BASE.replace("http://", "")}/rene/tip-training-data.git`;
+  if (!GITEA_TOKEN) {
    execSync("git push origin main", { cwd: REPO_DIR, stdio: "pipe" });
  } else {
    const remote = `http://rene:${GITEA_TOKEN}@${GITEA_BASE.replace("http://", "")}/${GITEA_REPO}.git`;
    execSync(`git push "${remote}" main`, { cwd: REPO_DIR, stdio: "pipe" });
  }
 }
 export function flushToGitea(label = "batch"): void {
  try {
    ensureTrainingRepoLayout();
    gitAddAll();
    gitCommit(`crawl: add ${label} training records [${new Date().toISOString()}]`);
    gitPush();
@ -267,6 +321,67 @@ export function flushToGitea(label = "batch"): void {
  }
 }
 function makeRobotPair(experience: RobotExperience): SftRecord {
  const prompt = [
    "You are TIPLLM controlling TIP crawler and verification robots.",
    "Given this current observation, choose the safest next robot action.",
    "",
    JSON.stringify({
      event: experience.event,
      actor: experience.actor,
      profile: experience.profile,
      wave: experience.wave,
      vendor: experience.vendor,
      input: experience.input,
      safety_notes: experience.safety_notes ?? [],
    }, null, 2),
  ].join("\n");
  const response = {
    summary: experience.summary,
    decision: experience.decision,
    outcome: experience.outcome ?? null,
    constraints: [
      "Use TIPLLM only for planning/extraction feedback.",
      "Do not start heavy crawler waves on Erik.",
      "Prefer small, observable queue batches.",
      "Write useful outcomes back into the TIPLLM training pool.",
    ],
  };
  return {
    id: makeId("robot-exp", JSON.stringify(experience)),
    source: `robot-experience:${experience.event}:${experience.actor}`,
    kind: "sft-jsonl",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      { role: "user", content: prompt },
      { role: "assistant", content: JSON.stringify(response, null, 2) },
    ],
  };
 }
 function getRobotExperienceFile(): string {
  ensureTrainingRepoLayout();
  const date = new Date().toISOString().split("T")[0];
  return join(REPO_DIR, "robot-experiences", `${date}.jsonl`);
 }
 /**
 * Write one operational crawler/robot experience to the TIPLLM training pool.
 * This produces both a raw audit row and an SFT chat row under qa-pairs.
 */
 export function writeRobotExperience(experience: RobotExperience, options: { flush?: boolean } = {}): void {
  ensureTrainingRepoLayout();
  appendFileSync(getRobotExperienceFile(), JSON.stringify(experience) + "\n", "utf8");
  appendRecord(join(REPO_DIR, "qa-pairs", "robot-control-high.jsonl"), makeRobotPair(experience));
  pendingChanges += 2;
  if (options.flush || pendingChanges >= BATCH_COMMIT_THRESHOLD) {
    flushToGitea(`robot-${experience.event}`);
  }
 }
 // ─────────────────────────────────────────────────────────────────────────────
 // Public API
 // ─────────────────────────────────────────────────────────────────────────────
--- a/packages/scraper/src/robots/verification-robots.ts
+++ b/packages/scraper/src/robots/verification-robots.ts
@ -0,0 +1,481 @@
 /**
 * Verification Robot Command Center
 *
 * Read-only status:
 *   tsx src/robots/verification-robots.ts --status
 *
 * Enqueue targeted crawler waves:
 *   tsx src/robots/verification-robots.ts --enqueue=details-fast-lane --profile=erik-safe
 *   tsx src/robots/verification-robots.ts --enqueue=priority-vendors --profile=pi-fetch
 *   tsx src/robots/verification-robots.ts --enqueue=all --profile=proxmox-heavy
 *
 * Dry run:
 *   tsx src/robots/verification-robots.ts --enqueue=all --dry-run
 *
 * TIPLLM planning, no crawl load:
 *   tsx src/robots/verification-robots.ts --tipllm-plan --limit=5
 */
 import { config } from "dotenv";
 import { join } from "path";
 import PgBoss from "pg-boss";
 import { pool } from "../utils/db";
 import { writeRobotExperience } from "../crawler-llm/training-data-writer";
 config({ path: join(__dirname, "..", "..", "..", "..", ".env") });
 type RobotWave = "details-fast-lane" | "priority-vendors" | "all";
 type RobotProfile = "erik-safe" | "pi-fetch" | "proxmox-heavy";
 interface VendorBlockerRow {
  vendor: string;
  total: string;
  missing_image: string;
  missing_details: string;
  missing_price: string;
  near_full_missing_details: string;
  near_full_missing_image: string;
  not_fully_verified: string;
 }
 interface TipLlmResearchPlanResponse {
  success?: boolean;
  data?: {
    search_patterns?: string[];
    extraction_fields?: Array<{ field: string; selector_or_pattern: string; example?: string }>;
    relevance_rules?: string[];
    confidence?: number;
    summary?: string;
  };
  error?: string;
 }
 const connectionString = `postgres://${process.env.POSTGRES_USER || "tip"}:${process.env.POSTGRES_PASSWORD || "tip_dev_2026"}@${process.env.POSTGRES_HOST || "localhost"}:${process.env.POSTGRES_PORT || "5433"}/${process.env.POSTGRES_DB || "transceiver_db"}`;
 const TIP_API_URL = process.env.TIP_API_URL || "http://127.0.0.1:3201";
 const TIP_API_TOKEN = process.env.TIP_API_TOKEN || process.env.TIP_TOKEN || "";
 const DETAILS_FAST_LANE_QUEUES = [
  "scrape:pricing:flexoptix",
  "scrape:pricing:fibermall",
  "scrape:pricing:atgbics",
  "scrape:pricing:fiber24",
  "scrape:pricing:qsfptek",
  "scrape:pricing:naddod",
  "scrape:pricing:gaotek",
  "maintenance:reconcile-verification",
 ];
 const VENDOR_QUEUE_MAP: Array<{ match: RegExp; queues: string[] }> = [
  { match: /juniper/i, queues: ["scrape:catalog:juniper-oem", "scrape:catalog:juniper-mx-oem", "scrape:catalog:juniper-qfx-oem", "scrape:compat:juniper", "discover:vendor:juniper"] },
  { match: /cisco/i, queues: ["scrape:catalog:cisco-nexus-oem", "scrape:catalog:cisco-catalyst-oem", "scrape:catalog:cisco-asr-oem", "scrape:compat:cisco", "discover:vendor:cisco-tmg"] },
  { match: /fs\.?com/i, queues: ["scrape:pricing:fs", "discover:vendor:fs-com"] },
  { match: /gao/i, queues: ["scrape:pricing:gaotek"] },
  { match: /ascent/i, queues: ["scrape:pricing:ascentoptics"] },
  { match: /eoptolink/i, queues: ["scrape:catalog:eoptolink"] },
  { match: /atgbics/i, queues: ["scrape:pricing:atgbics"] },
  { match: /naddod/i, queues: ["scrape:pricing:naddod"] },
  { match: /10gtek/i, queues: ["scrape:pricing:10gtek"] },
  { match: /qsfptek/i, queues: ["scrape:pricing:qsfptek"] },
  { match: /shopfiber24|fiber24/i, queues: ["scrape:pricing:fiber24"] },
  { match: /flexoptix/i, queues: ["scrape:pricing:flexoptix", "scrape:vendors:flexoptix-supported"] },
  { match: /fibermall/i, queues: ["scrape:pricing:fibermall"] },
  { match: /nokia/i, queues: ["scrape:catalog:nokia-oem", "scrape:catalog:nokia-access-oem", "discover:vendor:nokia"] },
  { match: /arista/i, queues: ["scrape:catalog:arista-oem", "scrape:catalog:arista-7000-oem", "discover:vendor:arista"] },
  { match: /ciena/i, queues: ["scrape:catalog:ciena-oem", "scrape:catalog:ciena-waveserver-oem"] },
 ];
 const HEAVY_QUEUES = new Set([
  "scrape:pricing:fs",
  "scrape:pricing:10gtek",
  "scrape:pricing:prolabs",
  "scrape:compat:cisco",
  "scrape:compat:juniper",
  "scrape:compat:ufispace",
  "scrape:compat:edgecore",
 ]);
 const ERIK_SAFE_QUEUES = new Set([
  "scrape:pricing:flexoptix",
  "scrape:pricing:fibermall",
  "scrape:pricing:atgbics",
  "scrape:pricing:fiber24",
  "scrape:pricing:qsfptek",
  "scrape:pricing:naddod",
  "scrape:pricing:gaotek",
  "maintenance:reconcile-verification",
 ]);
 function isDiscoveryQueue(queue: string): boolean {
  return queue.startsWith("discover:");
 }
 function queuesForProfile(queues: string[], profile: RobotProfile): string[] {
  if (profile === "proxmox-heavy") return queues;
  if (profile === "erik-safe") {
    return queues.filter((queue) => ERIK_SAFE_QUEUES.has(queue));
  }
  return queues.filter((queue) => !HEAVY_QUEUES.has(queue) && !isDiscoveryQueue(queue));
 }
 function defaultMaxQueues(profile: RobotProfile): number {
  if (profile === "erik-safe") return 3;
  if (profile === "pi-fetch") return 10;
  return 30;
 }
 function toInt(value: string | number | null | undefined): number {
  return Number.parseInt(String(value ?? "0"), 10) || 0;
 }
 function unique(items: string[]): string[] {
  return [...new Set(items)];
 }
 async function createBoss(): Promise<PgBoss> {
  const boss = new PgBoss({
    connectionString,
    retryLimit: 1,
    retryDelay: 30,
    expireInSeconds: 7200,
    monitorStateIntervalSeconds: 60,
  });
  boss.on("error", (err) => console.error("[verification-robots] pg-boss:", err.message));
  await boss.start();
  return boss;
 }
 export async function getVerificationStatus(): Promise<{ summary: Record<string, string>; vendors: VendorBlockerRow[] }> {
  const summaryResult = await pool.query(`
    SELECT
      COUNT(*)::text AS total,
      COUNT(*) FILTER (WHERE NOT price_verified)::text AS missing_price,
      COUNT(*) FILTER (WHERE NOT image_verified)::text AS missing_image,
      COUNT(*) FILTER (WHERE NOT details_verified)::text AS missing_details,
      COUNT(*) FILTER (WHERE price_verified AND image_verified AND NOT details_verified)::text AS near_full_missing_details,
      COUNT(*) FILTER (WHERE price_verified AND details_verified AND NOT image_verified)::text AS near_full_missing_image,
      COUNT(*) FILTER (WHERE image_verified AND details_verified AND NOT price_verified)::text AS near_full_missing_price,
      COUNT(*) FILTER (WHERE NOT fully_verified)::text AS not_fully_verified
    FROM transceivers
  `);
  const vendorResult = await pool.query<VendorBlockerRow>(`
    SELECT
      COALESCE(v.name, 'unknown') AS vendor,
      COUNT(*)::text AS total,
      COUNT(*) FILTER (WHERE NOT t.image_verified)::text AS missing_image,
      COUNT(*) FILTER (WHERE NOT t.details_verified)::text AS missing_details,
      COUNT(*) FILTER (WHERE NOT t.price_verified)::text AS missing_price,
      COUNT(*) FILTER (WHERE t.price_verified AND t.image_verified AND NOT t.details_verified)::text AS near_full_missing_details,
      COUNT(*) FILTER (WHERE t.price_verified AND t.details_verified AND NOT t.image_verified)::text AS near_full_missing_image,
      COUNT(*) FILTER (WHERE NOT t.fully_verified)::text AS not_fully_verified
    FROM transceivers t
    LEFT JOIN vendors v ON v.id = t.vendor_id
    GROUP BY v.name
    HAVING COUNT(*) FILTER (WHERE NOT t.fully_verified) > 0
    ORDER BY
      COUNT(*) FILTER (WHERE t.price_verified AND t.image_verified AND NOT t.details_verified) DESC,
      COUNT(*) FILTER (WHERE NOT t.fully_verified) DESC
    LIMIT 30
  `);
  return {
    summary: summaryResult.rows[0] ?? {},
    vendors: vendorResult.rows,
  };
 }
 function queuesForPriorityVendors(vendors: VendorBlockerRow[]): string[] {
  const queues: string[] = [];
  for (const row of vendors.slice(0, 20)) {
    for (const entry of VENDOR_QUEUE_MAP) {
      if (entry.match.test(row.vendor)) {
        queues.push(...entry.queues);
      }
    }
  }
  queues.push("maintenance:reconcile-verification");
  return unique(queues);
 }
 function queuesForWave(wave: RobotWave, vendors: VendorBlockerRow[]): string[] {
  if (wave === "details-fast-lane") return DETAILS_FAST_LANE_QUEUES;
  if (wave === "priority-vendors") return queuesForPriorityVendors(vendors);
  return unique([...DETAILS_FAST_LANE_QUEUES, ...queuesForPriorityVendors(vendors)]);
 }
 export async function enqueueRobotWave(
  wave: RobotWave,
  options: { dryRun?: boolean; profile?: RobotProfile; maxQueues?: number } = {},
 ): Promise<string[]> {
  const profile = options.profile ?? "erik-safe";
  const dryRun = options.dryRun ?? false;
  const status = await getVerificationStatus();
  const rawQueues = queuesForWave(wave, status.vendors);
  const maxQueues = options.maxQueues ?? defaultMaxQueues(profile);
  const queues = queuesForProfile(rawQueues, profile).slice(0, maxQueues);
  const runId = `verification-${wave}-${new Date().toISOString()}`;
  console.log(`\nRobot wave: ${wave}`);
  console.log(`Profile: ${profile}`);
  console.log(`Run ID: ${runId}`);
  console.log(`Queues: ${queues.length} selected from ${rawQueues.length} candidates`);
  for (const q of queues) console.log(`  - ${q}`);
  if (queues.length === 0) {
    writeRobotExperience({
      event: "queue_rejected",
      observed_at: new Date().toISOString(),
      actor: "verification-robots",
      profile,
      wave,
      summary: "Robot queue wave rejected because the selected profile had no safe eligible queues.",
      input: { wave, profile, raw_queues: rawQueues, max_queues: maxQueues },
      decision: { selected_queues: [], reason: "profile_filter_removed_all_candidates" },
      safety_notes: ["No jobs enqueued.", "This protects Erik from accidental heavy crawler work."],
    }, { flush: true });
    console.log("\nNo queues selected for this profile.");
    return queues;
  }
  if (dryRun) {
    writeRobotExperience({
      event: "queue_plan",
      observed_at: new Date().toISOString(),
      actor: "verification-robots",
      profile,
      wave,
      summary: "Dry-run queue plan generated for TIPLLM learning without starting crawler jobs.",
      input: { wave, profile, raw_queues: rawQueues, max_queues: maxQueues },
      decision: { selected_queues: queues, dry_run: true },
      safety_notes: ["Dry run only.", "No pg-boss jobs were enqueued."],
    }, { flush: true });
    console.log("\nDry run: no jobs enqueued.");
    return queues;
  }
  const boss = await createBoss();
  try {
    for (const queue of queues) {
      await boss.createQueue(queue).catch(() => {});
      await (boss as unknown as { send: (name: string, data?: object, options?: object) => Promise<string | null> }).send(
        queue,
        { source: "verification-robots", wave, run_id: runId, enqueued_at: new Date().toISOString() },
        { retryLimit: 1, expireInSeconds: 7200 },
      );
    }
  } finally {
    await boss.stop();
  }
  console.log("\nJobs enqueued.");
  writeRobotExperience({
    event: "queue_enqueued",
    observed_at: new Date().toISOString(),
    actor: "verification-robots",
    profile,
    wave,
    summary: "Limited verification robot queue wave enqueued.",
    input: { wave, profile, raw_queues: rawQueues, max_queues: maxQueues },
    decision: { selected_queues: queues, run_id: runId },
    outcome: { enqueued_count: queues.length },
    safety_notes: [
      profile === "erik-safe" ? "Erik-safe profile excludes Playwright and discovery queues." : "Profile selected explicitly.",
      "Queue count is capped.",
    ],
  }, { flush: true });
  return queues;
 }
 async function fetchTipLlmPlan(vendor: string, product: string, context: string): Promise<TipLlmResearchPlanResponse> {
  const resp = await fetch(`${TIP_API_URL}/api/tip-llm/research-plan`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      ...(TIP_API_TOKEN ? { Authorization: `Bearer ${TIP_API_TOKEN}` } : {}),
    },
    body: JSON.stringify({ vendor, product, context }),
    signal: AbortSignal.timeout(45000),
  });
  const data = await resp.json().catch(() => ({})) as TipLlmResearchPlanResponse;
  if (!resp.ok) {
    return { success: false, error: data.error || `HTTP ${resp.status}` };
  }
  return data;
 }
 export async function planWithTipLlm(limit: number): Promise<void> {
  const status = await getVerificationStatus();
  const targets = status.vendors
    .filter((row) => toInt(row.not_fully_verified) > 0)
    .slice(0, Math.max(1, Math.min(10, limit)));
  console.log(`\n=== TIPLLM Verification Planning (${targets.length} vendor targets) ===\n`);
  console.log("No crawler jobs are started in this mode.\n");
  for (const row of targets) {
    const product = "missing transceiver image/details verification";
    const context =
      `TIP verification blocker: vendor=${row.vendor}, ` +
      `missing_image=${row.missing_image}, missing_details=${row.missing_details}, ` +
      `missing_price=${row.missing_price}, near_full_missing_details=${row.near_full_missing_details}, ` +
      `near_full_missing_image=${row.near_full_missing_image}. ` +
      "Return safe search patterns, extraction fields, and relevance rules for targeted crawler work.";
    console.log(`Vendor: ${row.vendor}`);
    const plan = await fetchTipLlmPlan(row.vendor, product, context);
    if (!plan.success || !plan.data) {
      writeRobotExperience({
        event: "tipllm_plan",
        observed_at: new Date().toISOString(),
        actor: "verification-robots",
        vendor: row.vendor,
        summary: "TIPLLM planning failed or returned no usable data.",
        input: { vendor: row.vendor, product, context },
        decision: { usable_plan: false, error: plan.error || "unknown error" },
        safety_notes: ["No crawler jobs started from failed TIPLLM plan."],
      });
      console.log(`  TIPLLM plan failed: ${plan.error || "unknown error"}\n`);
      continue;
    }
    const patterns = plan.data.search_patterns ?? [];
    const fields = plan.data.extraction_fields ?? [];
    const rules = plan.data.relevance_rules ?? [];
    console.log(`  Summary: ${plan.data.summary || "(none)"}`);
    console.log(`  Search patterns: ${patterns.slice(0, 4).join(" | ") || "(none)"}`);
    console.log(`  Extraction fields: ${fields.map((f) => f.field).slice(0, 8).join(", ") || "(none)"}`);
    console.log(`  Rules: ${rules.slice(0, 3).join(" | ") || "(none)"}`);
    console.log("");
    writeRobotExperience({
      event: "tipllm_plan",
      observed_at: new Date().toISOString(),
      actor: "verification-robots",
      vendor: row.vendor,
      summary: plan.data.summary || "TIPLLM produced a crawler research plan.",
      input: { vendor: row.vendor, product, context },
      decision: {
        usable_plan: true,
        search_patterns: patterns,
        extraction_fields: fields,
        relevance_rules: rules,
        confidence: plan.data.confidence ?? null,
      },
      safety_notes: [
        "TIPLLM planning mode starts no crawler jobs.",
        "Plans must be executed through capped profiles.",
      ],
    });
  }
  writeRobotExperience({
    event: "tipllm_plan",
    observed_at: new Date().toISOString(),
    actor: "verification-robots",
    summary: "TIPLLM planning batch completed.",
    input: { requested_limit: limit, planned_vendors: targets.map((row) => row.vendor) },
    decision: { next_safe_step: "Review plans, then enqueue a small erik-safe dry run or dispatch heavy work to Proxmox/Pi profiles." },
    safety_notes: ["No crawler jobs were started by the planning batch."],
  }, { flush: true });
 }
 function printStatus(summary: Record<string, string>, vendors: VendorBlockerRow[]): void {
  console.log("\n=== TIP Verification Robot Status ===\n");
  console.log(`Total products:              ${summary.total ?? "0"}`);
  console.log(`Not fully verified:          ${summary.not_fully_verified ?? "0"}`);
  console.log(`Missing price:               ${summary.missing_price ?? "0"}`);
  console.log(`Missing image:               ${summary.missing_image ?? "0"}`);
  console.log(`Missing details:             ${summary.missing_details ?? "0"}`);
  console.log(`Near-full, missing details:  ${summary.near_full_missing_details ?? "0"}`);
  console.log(`Near-full, missing image:    ${summary.near_full_missing_image ?? "0"}`);
  console.log(`Near-full, missing price:    ${summary.near_full_missing_price ?? "0"}`);
  console.log("\nTop vendor blockers:");
  for (const row of vendors.slice(0, 15)) {
    const nearDetails = toInt(row.near_full_missing_details);
    const nearImage = toInt(row.near_full_missing_image);
    console.log(
      `  ${row.vendor.padEnd(22)} not_full=${row.not_fully_verified.padStart(4)} ` +
      `missing_img=${row.missing_image.padStart(4)} missing_details=${row.missing_details.padStart(4)} ` +
      `near_details=${String(nearDetails).padStart(4)} near_image=${String(nearImage).padStart(4)}`,
    );
  }
 }
 function recordStatusExperience(summary: Record<string, string>, vendors: VendorBlockerRow[]): void {
  writeRobotExperience({
    event: "status",
    observed_at: new Date().toISOString(),
    actor: "verification-robots",
    summary: "Verification robot status snapshot captured for TIPLLM learning.",
    input: {
      total: summary.total,
      not_fully_verified: summary.not_fully_verified,
      missing_price: summary.missing_price,
      missing_image: summary.missing_image,
      missing_details: summary.missing_details,
      near_full_missing_details: summary.near_full_missing_details,
      near_full_missing_image: summary.near_full_missing_image,
      top_vendors: vendors.slice(0, 10),
    },
    decision: {
      recommended_sequence: [
        "Use TIPLLM planning for top vendor blockers before new crawler work.",
        "Run details-fast-lane first because near-full missing-details products convert fastest.",
        "Keep Erik on erik-safe profile; send Playwright/discovery work to Proxmox or Pi workers.",
      ],
    },
    safety_notes: [
      "Status collection is read-only.",
      "No crawler jobs are started by status collection.",
    ],
  });
 }
 async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const enqueueArg = args.find((a) => a.startsWith("--enqueue="));
  const wave = enqueueArg?.split("=")[1] as RobotWave | undefined;
  const profileArg = args.find((a) => a.startsWith("--profile="));
  const profile = (profileArg?.split("=")[1] || "erik-safe") as RobotProfile;
  const maxQueuesArg = args.find((a) => a.startsWith("--max-queues="));
  const maxQueues = maxQueuesArg ? Number.parseInt(maxQueuesArg.split("=")[1], 10) : undefined;
  const limitArg = args.find((a) => a.startsWith("--limit="));
  const limit = limitArg ? Number.parseInt(limitArg.split("=")[1], 10) : 5;
  const dryRun = args.includes("--dry-run");
  const tipLlmPlan = args.includes("--tipllm-plan");
  if (args.includes("--status") || (!wave && !tipLlmPlan)) {
    const status = await getVerificationStatus();
    printStatus(status.summary, status.vendors);
    recordStatusExperience(status.summary, status.vendors);
  }
  if (tipLlmPlan) {
    await planWithTipLlm(limit);
  }
  if (wave) {
    if (!["details-fast-lane", "priority-vendors", "all"].includes(wave)) {
      throw new Error("Invalid --enqueue value. Use details-fast-lane, priority-vendors, or all.");
    }
    if (!["erik-safe", "pi-fetch", "proxmox-heavy"].includes(profile)) {
      throw new Error("Invalid --profile value. Use erik-safe, pi-fetch, or proxmox-heavy.");
    }
    await enqueueRobotWave(wave, { dryRun, profile, maxQueues });
  }
  await pool.end();
 }
 if (require.main === module) {
  main().catch(async (err) => {
    console.error("Fatal:", err);
    await pool.end().catch(() => {});
    process.exit(1);
  });
 }
--- a/packages/scraper/src/scrapers/fiber24.ts
+++ b/packages/scraper/src/scrapers/fiber24.ts
@ -14,7 +14,7 @@
 *
 * Rate limited: 1 req/1.5 sec.
 */
-import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
+import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, markImageVerified } from "../utils/db";
 import { contentHash } from "../utils/hash";
 import * as cheerio from "cheerio";
 import * as zlib from "zlib";
@ -220,6 +220,7 @@ export async function scrapeFiber24(): Promise<void> {
      const txId = await findOrCreateScrapedTransceiver({
        partNumber: product.partNumber,
        vendorId,
        productUrl: product.url,
        formFactor: product.formFactor,
        speedGbps: product.speedGbps,
        speed: product.speed,
@ -246,12 +247,8 @@ export async function scrapeFiber24(): Promise<void> {
      // Save image URL to transceivers table if present
      if (product.imageUrl) {
-        await pool.query(
+        const updatedImage = await markImageVerified(txId, product.imageUrl);
-          `UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = true
+        if (updatedImage) imageUpdates++;
           WHERE id = $2 AND (image_url IS NULL OR image_url = '')`,
          [product.imageUrl, txId],
        );
        imageUpdates++;
      }
      totalProducts++;
--- a/packages/scraper/src/scrapers/fibermall.ts
+++ b/packages/scraper/src/scrapers/fibermall.ts
@ -11,7 +11,7 @@
 *   Pagination:     /store-XXXXX-name.htm?page=N
 *   Product list:   CSS class "new_proList_mainListLi"
 */
-import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
+import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, markImageVerified } from "../utils/db";
 import { contentHash } from "../utils/hash";
 const BASE = "https://www.fibermall.com";
@ -228,6 +228,7 @@ export async function scrapeFiberMall(): Promise<void> {
          const txId = await findOrCreateScrapedTransceiver({
            partNumber: product.partNumber,
            vendorId,
            productUrl: product.url,
            formFactor: product.formFactor,
            speedGbps: product.speedGbps,
            speed: product.speed,
@ -254,13 +255,8 @@ export async function scrapeFiberMall(): Promise<void> {
          // Save image URL if found and not yet stored
          if (product.imageUrl) {
-            const imgResult = await pool.query(
+            const updatedImage = await markImageVerified(txId, product.imageUrl);
-              `UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = true
+            if (updatedImage) imageUpdates++;
               WHERE id = $2 AND (image_url IS NULL OR image_url = '')
               RETURNING id`,
              [product.imageUrl, txId],
            );
            if (imgResult.rowCount && imgResult.rowCount > 0) imageUpdates++;
          }
          totalProducts++;
--- a/packages/scraper/src/utils/backfill-images.ts
+++ b/packages/scraper/src/utils/backfill-images.ts
@ -16,7 +16,7 @@
 *   node dist/utils/backfill-images.js
 */
-import { pool } from "./db";
+import { pool, markImageVerified } from "./db";
 import { logger } from "./logger";
 function sleep(ms: number): Promise<void> {
@ -24,10 +24,7 @@ function sleep(ms: number): Promise<void> {
 }
 async function updateImageUrl(id: string, imageUrl: string): Promise<void> {
-  await pool.query(
+  await markImageVerified(id, imageUrl);
    `UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = TRUE WHERE id = $2`,
    [imageUrl, id]
  );
 }
 async function fetchJson(url: string, init?: RequestInit): Promise<unknown> {
@ -484,7 +481,7 @@ async function backfillOtherVendors(): Promise<{ total: number; updated: number
 // Main
 // =============================================================================
-async function main(): Promise<void> {
+export async function backfillImages(): Promise<void> {
  logger.info("=== TIP Image Backfill Script ===");
  logger.info(`DB: ${process.env.POSTGRES_HOST ?? "localhost"}:${process.env.POSTGRES_PORT ?? "5433"}`);
@ -531,10 +528,12 @@ async function main(): Promise<void> {
  logger.info("=== Backfill complete ===", { results, elapsedSec });
 }
-main()
+if (require.main === module) {
  backfillImages()
    .then(() => pool.end())
    .catch((err) => {
      logger.error("Fatal error", { error: (err as Error).message });
      pool.end();
      process.exit(1);
    });
 }
--- a/packages/scraper/src/utils/image-downloader.ts
+++ b/packages/scraper/src/utils/image-downloader.ts
@ -25,7 +25,13 @@ export async function setTransceiverImage(
  source?: string
 ): Promise<void> {
  await pool.query(
-    `UPDATE transceivers SET image_url = $2, has_image = true, image_scraped_at = NOW()
+    `UPDATE transceivers
     SET image_url = $2,
         has_image = true,
         image_verified = true,
         image_verified_at = COALESCE(image_verified_at, NOW()),
         image_verified_url = COALESCE(NULLIF(image_verified_url, ''), $2),
         image_scraped_at = NOW()
     WHERE id = $1 AND (image_url IS NULL OR image_url = '')`,
    [transceiverId, imageUrl]
  );
--- a/packages/scraper/src/utils/spec-updater.ts
+++ b/packages/scraper/src/utils/spec-updater.ts
@ -77,7 +77,7 @@ export async function updateVerifiedSpecs(specs: VerifiedSpecs): Promise<boolean
    idx++;
  }
  if (specs.imageUrl) {
-    updates.push(`image_url = $${idx}, has_image = true`);
+    updates.push(`image_url = $${idx}, has_image = true, image_verified = true, image_verified_at = COALESCE(image_verified_at, NOW()), image_verified_url = COALESCE(NULLIF(image_verified_url, ''), $${idx})`);
    values.push(specs.imageUrl);
    idx++;
  }
@ -92,6 +92,11 @@ export async function updateVerifiedSpecs(specs: VerifiedSpecs): Promise<boolean
  // Always upgrade confidence from estimated to scraped
  updates.push(`data_confidence = 'scraped_unverified'`);
  updates.push(`details_verified = true`);
  updates.push(`details_verified_at = COALESCE(details_verified_at, NOW())`);
  updates.push(`details_source_url = COALESCE(NULLIF(details_source_url, ''), $${idx})`);
  values.push(specs.source);
  idx++;
  updates.push(`updated_at = NOW()`);
  values.push(specs.transceiverId);