diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md index fae3c6c..77f052e 100644 --- a/CHANGELOG_PENDING.md +++ b/CHANGELOG_PENDING.md @@ -281,3 +281,4 @@ Types: FEAT · FIX · UI · DATA · AI · INFRA {"d":"2026-04-26","t":"FEAT","m":"tip-llm-guided.ts: Structured inference engine for tip-llm-v1. Hard JSON schema, per-field validation, 2-retry repair loop with diff prompt, safe default fallback (create_finding=false). Temperature 0.1→0.05 on retry. Routes: POST /api/tip-llm/infer|research-plan|extract|finding, GET /api/tip-llm/health."} {"d":"2026-04-28","t":"FIX","m":"Product verification pipeline: image crawls now mark image_verified/image_verified_url, scraped product pages mark details_verified/details_source_url, maintenance reconcile backfills old product URLs/images/details, and --backfill-images exposes the existing image crawler via scraper CLI. Migration 102 reconciles existing data."} {"d":"2026-04-28","t":"FIX","m":"Blog Engine Hot Topics: diversified ranking with refresh shuffle/source caps/already-created-topic demotion, plus richer LLM context briefings passed into topic expansion and master-draft context via custom_title/additional_context."} +{"d":"2026-04-29","t":"FEAT","m":"TIPLLM robot learning loop: verification robot controller writes status, TIPLLM plans, queue dry-runs/enqueues and crawler outcomes into the Gitea-backed TIP training pool; learning-pool build imports qa-pairs from TIP_TRAINING_REPO into the tip_llm lane. Removed hardcoded Gitea token fallback; existing git remotes or env tokens are used."} diff --git a/packages/scraper/src/crawler-llm/training-data-writer.ts b/packages/scraper/src/crawler-llm/training-data-writer.ts index 2ebabf6..6d4ddc9 100644 --- a/packages/scraper/src/crawler-llm/training-data-writer.ts +++ b/packages/scraper/src/crawler-llm/training-data-writer.ts @@ -16,7 +16,7 @@ */ import { execSync } from "child_process"; -import { appendFileSync, mkdirSync, existsSync } from "fs"; +import { appendFileSync, mkdirSync, existsSync, writeFileSync } from "fs"; import { join } from "path"; import { createHash } from "crypto"; @@ -28,8 +28,9 @@ import type { CombinedValidationResult } from "./spec-validator"; // ───────────────────────────────────────────────────────────────────────────── const REPO_DIR = process.env.TIP_TRAINING_REPO || "/tmp/tip-training-data"; -const GITEA_TOKEN = process.env.GITEA_TOKEN || "0e758f30abf86ffb49b2d7bb5b1f0be12c7f0b46"; -const GITEA_BASE = "http://192.168.178.196:3000"; +const GITEA_TOKEN = process.env.GITEA_TOKEN || ""; +const GITEA_BASE = process.env.GITEA_BASE || "http://192.168.178.196:3000"; +const GITEA_REPO = process.env.TIP_TRAINING_GITEA_REPO || "rene/tip-training-data"; // Minimum confidence for a spec to enter the high-quality training set const MIN_CONFIDENCE_HIGH = 0.75; @@ -67,6 +68,20 @@ export interface CrawlExtraction { crawled_at: string; // ISO timestamp } +export interface RobotExperience { + event: "status" | "tipllm_plan" | "queue_plan" | "queue_enqueued" | "queue_rejected" | "crawler_result"; + observed_at: string; + actor: string; + profile?: string; + wave?: string; + vendor?: string; + summary: string; + input: Record; + decision: Record; + outcome?: Record; + safety_notes?: string[]; +} + // ───────────────────────────────────────────────────────────────────────────── // Helpers // ───────────────────────────────────────────────────────────────────────────── @@ -94,7 +109,39 @@ function ensureDir(dir: string): void { if (!existsSync(dir)) mkdirSync(dir, { recursive: true }); } +function ensureTrainingRepoLayout(): void { + ensureDir(REPO_DIR); + ensureDir(join(REPO_DIR, "qa-pairs")); + ensureDir(join(REPO_DIR, "robot-experiences")); + ensureDir(join(REPO_DIR, "crawl-extractions")); + ensureDir(join(REPO_DIR, "validated-specs")); + + const readme = join(REPO_DIR, "README.md"); + if (!existsSync(readme)) { + writeFileSync( + readme, + [ + "# TIP Training Data Pool", + "", + "This repository stores TIPLLM learning records produced by TIP crawlers, scrapers and verification robots.", + "", + "Primary lanes:", + "", + "- `qa-pairs/`: supervised fine-tuning JSONL records with chat messages.", + "- `robot-experiences/`: raw operational experience logs for audit and future dataset builders.", + "- `crawl-extractions/`: crawler extraction audit logs.", + "- `validated-specs/`: validated structured transceiver specifications.", + "", + "Only TIPLLM should be used for crawler planning and extraction feedback in this lane.", + "", + ].join("\n"), + "utf8", + ); + } +} + function appendRecord(filePath: string, record: SftRecord): void { + ensureTrainingRepoLayout(); appendFileSync(filePath, JSON.stringify(record) + "\n", "utf8"); } @@ -216,6 +263,7 @@ function getOutputFile(type: "spec_qa" | "crawl_reasoning" | "validation" | "dis } function getRawExtractionFile(vendorSlug: string): string { + ensureTrainingRepoLayout(); const dir = join(REPO_DIR, "crawl-extractions", vendorSlug); ensureDir(dir); const date = new Date().toISOString().split("T")[0]; @@ -223,6 +271,7 @@ function getRawExtractionFile(vendorSlug: string): string { } function getValidatedSpecFile(tier: string): string { + ensureTrainingRepoLayout(); const dir = join(REPO_DIR, "validated-specs"); ensureDir(dir); return join(dir, `${tier}.jsonl`); @@ -251,12 +300,17 @@ function gitCommit(message: string): void { } function gitPush(): void { - const remote = `http://rene:${GITEA_TOKEN}@${GITEA_BASE.replace("http://", "")}/rene/tip-training-data.git`; - execSync(`git push "${remote}" main`, { cwd: REPO_DIR, stdio: "pipe" }); + if (!GITEA_TOKEN) { + execSync("git push origin main", { cwd: REPO_DIR, stdio: "pipe" }); + } else { + const remote = `http://rene:${GITEA_TOKEN}@${GITEA_BASE.replace("http://", "")}/${GITEA_REPO}.git`; + execSync(`git push "${remote}" main`, { cwd: REPO_DIR, stdio: "pipe" }); + } } export function flushToGitea(label = "batch"): void { try { + ensureTrainingRepoLayout(); gitAddAll(); gitCommit(`crawl: add ${label} training records [${new Date().toISOString()}]`); gitPush(); @@ -267,6 +321,67 @@ export function flushToGitea(label = "batch"): void { } } +function makeRobotPair(experience: RobotExperience): SftRecord { + const prompt = [ + "You are TIPLLM controlling TIP crawler and verification robots.", + "Given this current observation, choose the safest next robot action.", + "", + JSON.stringify({ + event: experience.event, + actor: experience.actor, + profile: experience.profile, + wave: experience.wave, + vendor: experience.vendor, + input: experience.input, + safety_notes: experience.safety_notes ?? [], + }, null, 2), + ].join("\n"); + + const response = { + summary: experience.summary, + decision: experience.decision, + outcome: experience.outcome ?? null, + constraints: [ + "Use TIPLLM only for planning/extraction feedback.", + "Do not start heavy crawler waves on Erik.", + "Prefer small, observable queue batches.", + "Write useful outcomes back into the TIPLLM training pool.", + ], + }; + + return { + id: makeId("robot-exp", JSON.stringify(experience)), + source: `robot-experience:${experience.event}:${experience.actor}`, + kind: "sft-jsonl", + messages: [ + { role: "system", content: SYSTEM_PROMPT }, + { role: "user", content: prompt }, + { role: "assistant", content: JSON.stringify(response, null, 2) }, + ], + }; +} + +function getRobotExperienceFile(): string { + ensureTrainingRepoLayout(); + const date = new Date().toISOString().split("T")[0]; + return join(REPO_DIR, "robot-experiences", `${date}.jsonl`); +} + +/** + * Write one operational crawler/robot experience to the TIPLLM training pool. + * This produces both a raw audit row and an SFT chat row under qa-pairs. + */ +export function writeRobotExperience(experience: RobotExperience, options: { flush?: boolean } = {}): void { + ensureTrainingRepoLayout(); + appendFileSync(getRobotExperienceFile(), JSON.stringify(experience) + "\n", "utf8"); + appendRecord(join(REPO_DIR, "qa-pairs", "robot-control-high.jsonl"), makeRobotPair(experience)); + pendingChanges += 2; + + if (options.flush || pendingChanges >= BATCH_COMMIT_THRESHOLD) { + flushToGitea(`robot-${experience.event}`); + } +} + // ───────────────────────────────────────────────────────────────────────────── // Public API // ───────────────────────────────────────────────────────────────────────────── diff --git a/packages/scraper/src/robots/verification-robots.ts b/packages/scraper/src/robots/verification-robots.ts new file mode 100644 index 0000000..0eda3d4 --- /dev/null +++ b/packages/scraper/src/robots/verification-robots.ts @@ -0,0 +1,481 @@ +/** + * Verification Robot Command Center + * + * Read-only status: + * tsx src/robots/verification-robots.ts --status + * + * Enqueue targeted crawler waves: + * tsx src/robots/verification-robots.ts --enqueue=details-fast-lane --profile=erik-safe + * tsx src/robots/verification-robots.ts --enqueue=priority-vendors --profile=pi-fetch + * tsx src/robots/verification-robots.ts --enqueue=all --profile=proxmox-heavy + * + * Dry run: + * tsx src/robots/verification-robots.ts --enqueue=all --dry-run + * + * TIPLLM planning, no crawl load: + * tsx src/robots/verification-robots.ts --tipllm-plan --limit=5 + */ +import { config } from "dotenv"; +import { join } from "path"; +import PgBoss from "pg-boss"; +import { pool } from "../utils/db"; +import { writeRobotExperience } from "../crawler-llm/training-data-writer"; + +config({ path: join(__dirname, "..", "..", "..", "..", ".env") }); + +type RobotWave = "details-fast-lane" | "priority-vendors" | "all"; +type RobotProfile = "erik-safe" | "pi-fetch" | "proxmox-heavy"; + +interface VendorBlockerRow { + vendor: string; + total: string; + missing_image: string; + missing_details: string; + missing_price: string; + near_full_missing_details: string; + near_full_missing_image: string; + not_fully_verified: string; +} + +interface TipLlmResearchPlanResponse { + success?: boolean; + data?: { + search_patterns?: string[]; + extraction_fields?: Array<{ field: string; selector_or_pattern: string; example?: string }>; + relevance_rules?: string[]; + confidence?: number; + summary?: string; + }; + error?: string; +} + +const connectionString = `postgres://${process.env.POSTGRES_USER || "tip"}:${process.env.POSTGRES_PASSWORD || "tip_dev_2026"}@${process.env.POSTGRES_HOST || "localhost"}:${process.env.POSTGRES_PORT || "5433"}/${process.env.POSTGRES_DB || "transceiver_db"}`; +const TIP_API_URL = process.env.TIP_API_URL || "http://127.0.0.1:3201"; +const TIP_API_TOKEN = process.env.TIP_API_TOKEN || process.env.TIP_TOKEN || ""; + +const DETAILS_FAST_LANE_QUEUES = [ + "scrape:pricing:flexoptix", + "scrape:pricing:fibermall", + "scrape:pricing:atgbics", + "scrape:pricing:fiber24", + "scrape:pricing:qsfptek", + "scrape:pricing:naddod", + "scrape:pricing:gaotek", + "maintenance:reconcile-verification", +]; + +const VENDOR_QUEUE_MAP: Array<{ match: RegExp; queues: string[] }> = [ + { match: /juniper/i, queues: ["scrape:catalog:juniper-oem", "scrape:catalog:juniper-mx-oem", "scrape:catalog:juniper-qfx-oem", "scrape:compat:juniper", "discover:vendor:juniper"] }, + { match: /cisco/i, queues: ["scrape:catalog:cisco-nexus-oem", "scrape:catalog:cisco-catalyst-oem", "scrape:catalog:cisco-asr-oem", "scrape:compat:cisco", "discover:vendor:cisco-tmg"] }, + { match: /fs\.?com/i, queues: ["scrape:pricing:fs", "discover:vendor:fs-com"] }, + { match: /gao/i, queues: ["scrape:pricing:gaotek"] }, + { match: /ascent/i, queues: ["scrape:pricing:ascentoptics"] }, + { match: /eoptolink/i, queues: ["scrape:catalog:eoptolink"] }, + { match: /atgbics/i, queues: ["scrape:pricing:atgbics"] }, + { match: /naddod/i, queues: ["scrape:pricing:naddod"] }, + { match: /10gtek/i, queues: ["scrape:pricing:10gtek"] }, + { match: /qsfptek/i, queues: ["scrape:pricing:qsfptek"] }, + { match: /shopfiber24|fiber24/i, queues: ["scrape:pricing:fiber24"] }, + { match: /flexoptix/i, queues: ["scrape:pricing:flexoptix", "scrape:vendors:flexoptix-supported"] }, + { match: /fibermall/i, queues: ["scrape:pricing:fibermall"] }, + { match: /nokia/i, queues: ["scrape:catalog:nokia-oem", "scrape:catalog:nokia-access-oem", "discover:vendor:nokia"] }, + { match: /arista/i, queues: ["scrape:catalog:arista-oem", "scrape:catalog:arista-7000-oem", "discover:vendor:arista"] }, + { match: /ciena/i, queues: ["scrape:catalog:ciena-oem", "scrape:catalog:ciena-waveserver-oem"] }, +]; + +const HEAVY_QUEUES = new Set([ + "scrape:pricing:fs", + "scrape:pricing:10gtek", + "scrape:pricing:prolabs", + "scrape:compat:cisco", + "scrape:compat:juniper", + "scrape:compat:ufispace", + "scrape:compat:edgecore", +]); + +const ERIK_SAFE_QUEUES = new Set([ + "scrape:pricing:flexoptix", + "scrape:pricing:fibermall", + "scrape:pricing:atgbics", + "scrape:pricing:fiber24", + "scrape:pricing:qsfptek", + "scrape:pricing:naddod", + "scrape:pricing:gaotek", + "maintenance:reconcile-verification", +]); + +function isDiscoveryQueue(queue: string): boolean { + return queue.startsWith("discover:"); +} + +function queuesForProfile(queues: string[], profile: RobotProfile): string[] { + if (profile === "proxmox-heavy") return queues; + + if (profile === "erik-safe") { + return queues.filter((queue) => ERIK_SAFE_QUEUES.has(queue)); + } + + return queues.filter((queue) => !HEAVY_QUEUES.has(queue) && !isDiscoveryQueue(queue)); +} + +function defaultMaxQueues(profile: RobotProfile): number { + if (profile === "erik-safe") return 3; + if (profile === "pi-fetch") return 10; + return 30; +} + +function toInt(value: string | number | null | undefined): number { + return Number.parseInt(String(value ?? "0"), 10) || 0; +} + +function unique(items: string[]): string[] { + return [...new Set(items)]; +} + +async function createBoss(): Promise { + const boss = new PgBoss({ + connectionString, + retryLimit: 1, + retryDelay: 30, + expireInSeconds: 7200, + monitorStateIntervalSeconds: 60, + }); + boss.on("error", (err) => console.error("[verification-robots] pg-boss:", err.message)); + await boss.start(); + return boss; +} + +export async function getVerificationStatus(): Promise<{ summary: Record; vendors: VendorBlockerRow[] }> { + const summaryResult = await pool.query(` + SELECT + COUNT(*)::text AS total, + COUNT(*) FILTER (WHERE NOT price_verified)::text AS missing_price, + COUNT(*) FILTER (WHERE NOT image_verified)::text AS missing_image, + COUNT(*) FILTER (WHERE NOT details_verified)::text AS missing_details, + COUNT(*) FILTER (WHERE price_verified AND image_verified AND NOT details_verified)::text AS near_full_missing_details, + COUNT(*) FILTER (WHERE price_verified AND details_verified AND NOT image_verified)::text AS near_full_missing_image, + COUNT(*) FILTER (WHERE image_verified AND details_verified AND NOT price_verified)::text AS near_full_missing_price, + COUNT(*) FILTER (WHERE NOT fully_verified)::text AS not_fully_verified + FROM transceivers + `); + + const vendorResult = await pool.query(` + SELECT + COALESCE(v.name, 'unknown') AS vendor, + COUNT(*)::text AS total, + COUNT(*) FILTER (WHERE NOT t.image_verified)::text AS missing_image, + COUNT(*) FILTER (WHERE NOT t.details_verified)::text AS missing_details, + COUNT(*) FILTER (WHERE NOT t.price_verified)::text AS missing_price, + COUNT(*) FILTER (WHERE t.price_verified AND t.image_verified AND NOT t.details_verified)::text AS near_full_missing_details, + COUNT(*) FILTER (WHERE t.price_verified AND t.details_verified AND NOT t.image_verified)::text AS near_full_missing_image, + COUNT(*) FILTER (WHERE NOT t.fully_verified)::text AS not_fully_verified + FROM transceivers t + LEFT JOIN vendors v ON v.id = t.vendor_id + GROUP BY v.name + HAVING COUNT(*) FILTER (WHERE NOT t.fully_verified) > 0 + ORDER BY + COUNT(*) FILTER (WHERE t.price_verified AND t.image_verified AND NOT t.details_verified) DESC, + COUNT(*) FILTER (WHERE NOT t.fully_verified) DESC + LIMIT 30 + `); + + return { + summary: summaryResult.rows[0] ?? {}, + vendors: vendorResult.rows, + }; +} + +function queuesForPriorityVendors(vendors: VendorBlockerRow[]): string[] { + const queues: string[] = []; + + for (const row of vendors.slice(0, 20)) { + for (const entry of VENDOR_QUEUE_MAP) { + if (entry.match.test(row.vendor)) { + queues.push(...entry.queues); + } + } + } + + queues.push("maintenance:reconcile-verification"); + return unique(queues); +} + +function queuesForWave(wave: RobotWave, vendors: VendorBlockerRow[]): string[] { + if (wave === "details-fast-lane") return DETAILS_FAST_LANE_QUEUES; + if (wave === "priority-vendors") return queuesForPriorityVendors(vendors); + return unique([...DETAILS_FAST_LANE_QUEUES, ...queuesForPriorityVendors(vendors)]); +} + +export async function enqueueRobotWave( + wave: RobotWave, + options: { dryRun?: boolean; profile?: RobotProfile; maxQueues?: number } = {}, +): Promise { + const profile = options.profile ?? "erik-safe"; + const dryRun = options.dryRun ?? false; + const status = await getVerificationStatus(); + const rawQueues = queuesForWave(wave, status.vendors); + const maxQueues = options.maxQueues ?? defaultMaxQueues(profile); + const queues = queuesForProfile(rawQueues, profile).slice(0, maxQueues); + const runId = `verification-${wave}-${new Date().toISOString()}`; + + console.log(`\nRobot wave: ${wave}`); + console.log(`Profile: ${profile}`); + console.log(`Run ID: ${runId}`); + console.log(`Queues: ${queues.length} selected from ${rawQueues.length} candidates`); + for (const q of queues) console.log(` - ${q}`); + + if (queues.length === 0) { + writeRobotExperience({ + event: "queue_rejected", + observed_at: new Date().toISOString(), + actor: "verification-robots", + profile, + wave, + summary: "Robot queue wave rejected because the selected profile had no safe eligible queues.", + input: { wave, profile, raw_queues: rawQueues, max_queues: maxQueues }, + decision: { selected_queues: [], reason: "profile_filter_removed_all_candidates" }, + safety_notes: ["No jobs enqueued.", "This protects Erik from accidental heavy crawler work."], + }, { flush: true }); + console.log("\nNo queues selected for this profile."); + return queues; + } + + if (dryRun) { + writeRobotExperience({ + event: "queue_plan", + observed_at: new Date().toISOString(), + actor: "verification-robots", + profile, + wave, + summary: "Dry-run queue plan generated for TIPLLM learning without starting crawler jobs.", + input: { wave, profile, raw_queues: rawQueues, max_queues: maxQueues }, + decision: { selected_queues: queues, dry_run: true }, + safety_notes: ["Dry run only.", "No pg-boss jobs were enqueued."], + }, { flush: true }); + console.log("\nDry run: no jobs enqueued."); + return queues; + } + + const boss = await createBoss(); + try { + for (const queue of queues) { + await boss.createQueue(queue).catch(() => {}); + await (boss as unknown as { send: (name: string, data?: object, options?: object) => Promise }).send( + queue, + { source: "verification-robots", wave, run_id: runId, enqueued_at: new Date().toISOString() }, + { retryLimit: 1, expireInSeconds: 7200 }, + ); + } + } finally { + await boss.stop(); + } + + console.log("\nJobs enqueued."); + writeRobotExperience({ + event: "queue_enqueued", + observed_at: new Date().toISOString(), + actor: "verification-robots", + profile, + wave, + summary: "Limited verification robot queue wave enqueued.", + input: { wave, profile, raw_queues: rawQueues, max_queues: maxQueues }, + decision: { selected_queues: queues, run_id: runId }, + outcome: { enqueued_count: queues.length }, + safety_notes: [ + profile === "erik-safe" ? "Erik-safe profile excludes Playwright and discovery queues." : "Profile selected explicitly.", + "Queue count is capped.", + ], + }, { flush: true }); + return queues; +} + +async function fetchTipLlmPlan(vendor: string, product: string, context: string): Promise { + const resp = await fetch(`${TIP_API_URL}/api/tip-llm/research-plan`, { + method: "POST", + headers: { + "Content-Type": "application/json", + ...(TIP_API_TOKEN ? { Authorization: `Bearer ${TIP_API_TOKEN}` } : {}), + }, + body: JSON.stringify({ vendor, product, context }), + signal: AbortSignal.timeout(45000), + }); + + const data = await resp.json().catch(() => ({})) as TipLlmResearchPlanResponse; + if (!resp.ok) { + return { success: false, error: data.error || `HTTP ${resp.status}` }; + } + return data; +} + +export async function planWithTipLlm(limit: number): Promise { + const status = await getVerificationStatus(); + const targets = status.vendors + .filter((row) => toInt(row.not_fully_verified) > 0) + .slice(0, Math.max(1, Math.min(10, limit))); + + console.log(`\n=== TIPLLM Verification Planning (${targets.length} vendor targets) ===\n`); + console.log("No crawler jobs are started in this mode.\n"); + + for (const row of targets) { + const product = "missing transceiver image/details verification"; + const context = + `TIP verification blocker: vendor=${row.vendor}, ` + + `missing_image=${row.missing_image}, missing_details=${row.missing_details}, ` + + `missing_price=${row.missing_price}, near_full_missing_details=${row.near_full_missing_details}, ` + + `near_full_missing_image=${row.near_full_missing_image}. ` + + "Return safe search patterns, extraction fields, and relevance rules for targeted crawler work."; + + console.log(`Vendor: ${row.vendor}`); + const plan = await fetchTipLlmPlan(row.vendor, product, context); + if (!plan.success || !plan.data) { + writeRobotExperience({ + event: "tipllm_plan", + observed_at: new Date().toISOString(), + actor: "verification-robots", + vendor: row.vendor, + summary: "TIPLLM planning failed or returned no usable data.", + input: { vendor: row.vendor, product, context }, + decision: { usable_plan: false, error: plan.error || "unknown error" }, + safety_notes: ["No crawler jobs started from failed TIPLLM plan."], + }); + console.log(` TIPLLM plan failed: ${plan.error || "unknown error"}\n`); + continue; + } + + const patterns = plan.data.search_patterns ?? []; + const fields = plan.data.extraction_fields ?? []; + const rules = plan.data.relevance_rules ?? []; + console.log(` Summary: ${plan.data.summary || "(none)"}`); + console.log(` Search patterns: ${patterns.slice(0, 4).join(" | ") || "(none)"}`); + console.log(` Extraction fields: ${fields.map((f) => f.field).slice(0, 8).join(", ") || "(none)"}`); + console.log(` Rules: ${rules.slice(0, 3).join(" | ") || "(none)"}`); + console.log(""); + + writeRobotExperience({ + event: "tipllm_plan", + observed_at: new Date().toISOString(), + actor: "verification-robots", + vendor: row.vendor, + summary: plan.data.summary || "TIPLLM produced a crawler research plan.", + input: { vendor: row.vendor, product, context }, + decision: { + usable_plan: true, + search_patterns: patterns, + extraction_fields: fields, + relevance_rules: rules, + confidence: plan.data.confidence ?? null, + }, + safety_notes: [ + "TIPLLM planning mode starts no crawler jobs.", + "Plans must be executed through capped profiles.", + ], + }); + } + + writeRobotExperience({ + event: "tipllm_plan", + observed_at: new Date().toISOString(), + actor: "verification-robots", + summary: "TIPLLM planning batch completed.", + input: { requested_limit: limit, planned_vendors: targets.map((row) => row.vendor) }, + decision: { next_safe_step: "Review plans, then enqueue a small erik-safe dry run or dispatch heavy work to Proxmox/Pi profiles." }, + safety_notes: ["No crawler jobs were started by the planning batch."], + }, { flush: true }); +} + +function printStatus(summary: Record, vendors: VendorBlockerRow[]): void { + console.log("\n=== TIP Verification Robot Status ===\n"); + console.log(`Total products: ${summary.total ?? "0"}`); + console.log(`Not fully verified: ${summary.not_fully_verified ?? "0"}`); + console.log(`Missing price: ${summary.missing_price ?? "0"}`); + console.log(`Missing image: ${summary.missing_image ?? "0"}`); + console.log(`Missing details: ${summary.missing_details ?? "0"}`); + console.log(`Near-full, missing details: ${summary.near_full_missing_details ?? "0"}`); + console.log(`Near-full, missing image: ${summary.near_full_missing_image ?? "0"}`); + console.log(`Near-full, missing price: ${summary.near_full_missing_price ?? "0"}`); + + console.log("\nTop vendor blockers:"); + for (const row of vendors.slice(0, 15)) { + const nearDetails = toInt(row.near_full_missing_details); + const nearImage = toInt(row.near_full_missing_image); + console.log( + ` ${row.vendor.padEnd(22)} not_full=${row.not_fully_verified.padStart(4)} ` + + `missing_img=${row.missing_image.padStart(4)} missing_details=${row.missing_details.padStart(4)} ` + + `near_details=${String(nearDetails).padStart(4)} near_image=${String(nearImage).padStart(4)}`, + ); + } +} + +function recordStatusExperience(summary: Record, vendors: VendorBlockerRow[]): void { + writeRobotExperience({ + event: "status", + observed_at: new Date().toISOString(), + actor: "verification-robots", + summary: "Verification robot status snapshot captured for TIPLLM learning.", + input: { + total: summary.total, + not_fully_verified: summary.not_fully_verified, + missing_price: summary.missing_price, + missing_image: summary.missing_image, + missing_details: summary.missing_details, + near_full_missing_details: summary.near_full_missing_details, + near_full_missing_image: summary.near_full_missing_image, + top_vendors: vendors.slice(0, 10), + }, + decision: { + recommended_sequence: [ + "Use TIPLLM planning for top vendor blockers before new crawler work.", + "Run details-fast-lane first because near-full missing-details products convert fastest.", + "Keep Erik on erik-safe profile; send Playwright/discovery work to Proxmox or Pi workers.", + ], + }, + safety_notes: [ + "Status collection is read-only.", + "No crawler jobs are started by status collection.", + ], + }); +} + +async function main(): Promise { + const args = process.argv.slice(2); + const enqueueArg = args.find((a) => a.startsWith("--enqueue=")); + const wave = enqueueArg?.split("=")[1] as RobotWave | undefined; + const profileArg = args.find((a) => a.startsWith("--profile=")); + const profile = (profileArg?.split("=")[1] || "erik-safe") as RobotProfile; + const maxQueuesArg = args.find((a) => a.startsWith("--max-queues=")); + const maxQueues = maxQueuesArg ? Number.parseInt(maxQueuesArg.split("=")[1], 10) : undefined; + const limitArg = args.find((a) => a.startsWith("--limit=")); + const limit = limitArg ? Number.parseInt(limitArg.split("=")[1], 10) : 5; + const dryRun = args.includes("--dry-run"); + const tipLlmPlan = args.includes("--tipllm-plan"); + + if (args.includes("--status") || (!wave && !tipLlmPlan)) { + const status = await getVerificationStatus(); + printStatus(status.summary, status.vendors); + recordStatusExperience(status.summary, status.vendors); + } + + if (tipLlmPlan) { + await planWithTipLlm(limit); + } + + if (wave) { + if (!["details-fast-lane", "priority-vendors", "all"].includes(wave)) { + throw new Error("Invalid --enqueue value. Use details-fast-lane, priority-vendors, or all."); + } + if (!["erik-safe", "pi-fetch", "proxmox-heavy"].includes(profile)) { + throw new Error("Invalid --profile value. Use erik-safe, pi-fetch, or proxmox-heavy."); + } + await enqueueRobotWave(wave, { dryRun, profile, maxQueues }); + } + + await pool.end(); +} + +if (require.main === module) { + main().catch(async (err) => { + console.error("Fatal:", err); + await pool.end().catch(() => {}); + process.exit(1); + }); +} diff --git a/packages/scraper/src/scrapers/fiber24.ts b/packages/scraper/src/scrapers/fiber24.ts index 27da8e3..35b9d9d 100644 --- a/packages/scraper/src/scrapers/fiber24.ts +++ b/packages/scraper/src/scrapers/fiber24.ts @@ -14,7 +14,7 @@ * * Rate limited: 1 req/1.5 sec. */ -import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db"; +import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, markImageVerified } from "../utils/db"; import { contentHash } from "../utils/hash"; import * as cheerio from "cheerio"; import * as zlib from "zlib"; @@ -220,6 +220,7 @@ export async function scrapeFiber24(): Promise { const txId = await findOrCreateScrapedTransceiver({ partNumber: product.partNumber, vendorId, + productUrl: product.url, formFactor: product.formFactor, speedGbps: product.speedGbps, speed: product.speed, @@ -246,12 +247,8 @@ export async function scrapeFiber24(): Promise { // Save image URL to transceivers table if present if (product.imageUrl) { - await pool.query( - `UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = true - WHERE id = $2 AND (image_url IS NULL OR image_url = '')`, - [product.imageUrl, txId], - ); - imageUpdates++; + const updatedImage = await markImageVerified(txId, product.imageUrl); + if (updatedImage) imageUpdates++; } totalProducts++; diff --git a/packages/scraper/src/scrapers/fibermall.ts b/packages/scraper/src/scrapers/fibermall.ts index 79e2d62..d768b42 100644 --- a/packages/scraper/src/scrapers/fibermall.ts +++ b/packages/scraper/src/scrapers/fibermall.ts @@ -11,7 +11,7 @@ * Pagination: /store-XXXXX-name.htm?page=N * Product list: CSS class "new_proList_mainListLi" */ -import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db"; +import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, markImageVerified } from "../utils/db"; import { contentHash } from "../utils/hash"; const BASE = "https://www.fibermall.com"; @@ -228,6 +228,7 @@ export async function scrapeFiberMall(): Promise { const txId = await findOrCreateScrapedTransceiver({ partNumber: product.partNumber, vendorId, + productUrl: product.url, formFactor: product.formFactor, speedGbps: product.speedGbps, speed: product.speed, @@ -254,13 +255,8 @@ export async function scrapeFiberMall(): Promise { // Save image URL if found and not yet stored if (product.imageUrl) { - const imgResult = await pool.query( - `UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = true - WHERE id = $2 AND (image_url IS NULL OR image_url = '') - RETURNING id`, - [product.imageUrl, txId], - ); - if (imgResult.rowCount && imgResult.rowCount > 0) imageUpdates++; + const updatedImage = await markImageVerified(txId, product.imageUrl); + if (updatedImage) imageUpdates++; } totalProducts++; diff --git a/packages/scraper/src/utils/backfill-images.ts b/packages/scraper/src/utils/backfill-images.ts index 44cbb46..9694d81 100644 --- a/packages/scraper/src/utils/backfill-images.ts +++ b/packages/scraper/src/utils/backfill-images.ts @@ -16,7 +16,7 @@ * node dist/utils/backfill-images.js */ -import { pool } from "./db"; +import { pool, markImageVerified } from "./db"; import { logger } from "./logger"; function sleep(ms: number): Promise { @@ -24,10 +24,7 @@ function sleep(ms: number): Promise { } async function updateImageUrl(id: string, imageUrl: string): Promise { - await pool.query( - `UPDATE transceivers SET image_url = $1, image_scraped_at = NOW(), has_image = TRUE WHERE id = $2`, - [imageUrl, id] - ); + await markImageVerified(id, imageUrl); } async function fetchJson(url: string, init?: RequestInit): Promise { @@ -484,7 +481,7 @@ async function backfillOtherVendors(): Promise<{ total: number; updated: number // Main // ============================================================================= -async function main(): Promise { +export async function backfillImages(): Promise { logger.info("=== TIP Image Backfill Script ==="); logger.info(`DB: ${process.env.POSTGRES_HOST ?? "localhost"}:${process.env.POSTGRES_PORT ?? "5433"}`); @@ -531,10 +528,12 @@ async function main(): Promise { logger.info("=== Backfill complete ===", { results, elapsedSec }); } -main() - .then(() => pool.end()) - .catch((err) => { - logger.error("Fatal error", { error: (err as Error).message }); - pool.end(); - process.exit(1); - }); +if (require.main === module) { + backfillImages() + .then(() => pool.end()) + .catch((err) => { + logger.error("Fatal error", { error: (err as Error).message }); + pool.end(); + process.exit(1); + }); +} diff --git a/packages/scraper/src/utils/image-downloader.ts b/packages/scraper/src/utils/image-downloader.ts index 2db3f49..8cbba67 100644 --- a/packages/scraper/src/utils/image-downloader.ts +++ b/packages/scraper/src/utils/image-downloader.ts @@ -25,7 +25,13 @@ export async function setTransceiverImage( source?: string ): Promise { await pool.query( - `UPDATE transceivers SET image_url = $2, has_image = true, image_scraped_at = NOW() + `UPDATE transceivers + SET image_url = $2, + has_image = true, + image_verified = true, + image_verified_at = COALESCE(image_verified_at, NOW()), + image_verified_url = COALESCE(NULLIF(image_verified_url, ''), $2), + image_scraped_at = NOW() WHERE id = $1 AND (image_url IS NULL OR image_url = '')`, [transceiverId, imageUrl] ); diff --git a/packages/scraper/src/utils/spec-updater.ts b/packages/scraper/src/utils/spec-updater.ts index c8aa714..c4a226d 100644 --- a/packages/scraper/src/utils/spec-updater.ts +++ b/packages/scraper/src/utils/spec-updater.ts @@ -77,7 +77,7 @@ export async function updateVerifiedSpecs(specs: VerifiedSpecs): Promise