/**
 * Crawler LLM — Core extraction engine.
 *
 * Uses Ollama (local LLM) to extract structured product data from HTML.
 * Two-stage pipeline:
 *   1. Page type detection (product vs. category) — cheap, fast
 *   2. Structured data extraction with schema enforcement
 *
 * Vendor-specific profiles guide the LLM without hard-coding selectors.
 */

import { pool } from "../utils/db";
import type { StockExtractionResult, MarketIntelExtractionResult } from "./stock-schema";
import { VENDOR_PROFILES } from "./stock-schema";
import { validateStockExtraction } from "./validator";

const OLLAMA_HOST = process.env.OLLAMA_HOST || "http://192.168.178.169:11434";
const OLLAMA_MODEL = process.env.CRAWLER_LLM_MODEL || "qwen2.5:14b";
const MAX_HTML_CHARS = 12_000; // truncate to keep prompt manageable

// ─────────────────────────────────────────────────────────────────────────────
// Shared helpers
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Coerces a value taken from parsed LLM JSON to a finite number.
 *
 * Only numbers and non-blank numeric strings are accepted; everything else
 * (null, undefined, "", "N/A", booleans, objects, NaN, ±Infinity) yields null,
 * so that NaN can never leak into an extraction result or a DB insert.
 *
 * @param value - Raw field value from `JSON.parse` output.
 * @returns The finite numeric value, or null when none can be derived.
 */
function toFiniteNumberOrNull(value: unknown): number | null {
  if (typeof value === "string") {
    if (value.trim() === "") return null; // Number("") would be 0, not "missing"
  } else if (typeof value !== "number") {
    return null;
  }
  const n = Number(value);
  return Number.isFinite(n) ? n : null;
}

/**
 * Builds a "nothing extracted" result: `is_product_page` is false, every data
 * field is null/empty and `stock_level` is "unknown".
 *
 * @param confidence - Confidence value to report on the result.
 * @param sourceEvidence - Text stored in `source_evidence`.
 * @returns A fully populated extraction object without product data.
 */
function emptyExtraction(confidence: number, sourceEvidence: string): StockExtractionResult {
  return {
    is_product_page: false,
    confidence,
    source_evidence: sourceEvidence,
    price: null,
    currency: null,
    price_breaks: [],
    stock_level: "unknown",
    stock_quantity: null,
    incoming_quantity: null,
    incoming_eta: null,
    lead_time_days: null,
    moq: null,
    part_number: null,
    standard_name: null,
    form_factor: null,
    speed_gbps: null,
  };
}

// ─────────────────────────────────────────────────────────────────────────────
// Ollama API call
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Sends a single non-streaming generation request to `${OLLAMA_HOST}/api/generate`
 * using `OLLAMA_MODEL`, requesting JSON-formatted output.
 *
 * @param prompt - Full prompt text.
 * @returns The `response` field of the Ollama reply (expected to be a JSON string).
 * @throws Error when the HTTP response status is not OK; the message carries
 *         the status code and response body.
 */
async function ollamaGenerate(prompt: string): Promise<string> {
  const res = await fetch(`${OLLAMA_HOST}/api/generate`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: OLLAMA_MODEL,
      prompt,
      stream: false,
      format: "json",
      options: { temperature: 0.1, num_predict: 1024 },
    }),
  });
  if (!res.ok) throw new Error(`Ollama error: ${res.status} ${await res.text()}`);
  const data = (await res.json()) as { response: string };
  return data.response;
}

// ─────────────────────────────────────────────────────────────────────────────
// Stage 1: Page type detection (fast, binary)
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Asks the LLM whether the page is a single product page or a category/listing
 * page. Only the first 3000 characters of `html` are sent.
 *
 * @param html - Raw page HTML.
 * @param url - Page URL, included in the prompt.
 * @param vendorSlug - Optional key into `VENDOR_PROFILES`; when a profile is
 *        found, its product/category page signals are added as hints.
 * @returns The classification. If the LLM output cannot be parsed as JSON the
 *          result is `is_product_page: false` with confidence 0.
 */
async function detectPageType(
  html: string,
  url: string,
  vendorSlug?: string
): Promise<{
  is_product_page: boolean;
  confidence: number;
  evidence: string;
}> {
  const profile = vendorSlug ? VENDOR_PROFILES[vendorSlug] : null;
  const hints = profile
    ? `\nVendor hints — Product page signals: ${profile.product_page_signals.join(", ")}. Category page signals: ${profile.category_page_signals.join(", ")}.`
    : "";

  const prompt = `You are a web scraper assistant. Determine if this HTML is a single product page or a category/listing page.

URL: ${url}${hints}

HTML (truncated):
${html.substring(0, 3000)}

Respond with JSON only:
{
  "is_product_page": true or false,
  "confidence": 0.0 to 1.0,
  "evidence": "brief quote from the HTML that supports your decision"
}`;

  const raw = await ollamaGenerate(prompt);
  try {
    const parsed = JSON.parse(raw);
    return {
      is_product_page: Boolean(parsed.is_product_page),
      confidence: toFiniteNumberOrNull(parsed.confidence) ?? 0,
      evidence: String(parsed.evidence || ""),
    };
  } catch {
    // Unparseable model output is treated as "not a product page".
    return { is_product_page: false, confidence: 0, evidence: "JSON parse failed" };
  }
}

// ─────────────────────────────────────────────────────────────────────────────
// Stage 2: Full product extraction
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Asks the LLM to extract structured price/stock/identity data from a product
 * page. Only the first `MAX_HTML_CHARS` characters of `html` are sent.
 *
 * Numeric fields that the model returns as non-numeric text are normalised to
 * null rather than NaN.
 *
 * @param html - Raw page HTML.
 * @param url - Page URL, included in the prompt.
 * @param vendorSlug - Optional key into `VENDOR_PROFILES`; when a profile is
 *        found, vendor name, currency and price/stock hints are added.
 * @returns The normalised extraction. If the LLM output cannot be parsed as
 *          JSON, an empty extraction with `is_product_page: false` and
 *          confidence 0 is returned.
 */
async function extractProductData(
  html: string,
  url: string,
  vendorSlug?: string
): Promise<StockExtractionResult> {
  const profile = vendorSlug ? VENDOR_PROFILES[vendorSlug] : null;
  const hints = profile
    ? `
Vendor: ${profile.name} (${profile.currency})
Price hint: ${profile.price_hint || "find the main selling price"}
Stock hint: ${profile.stock_hint || "find availability status"}`
    : "";

  const prompt = `You are a product data extractor for optical transceiver products. Extract structured data from this product page HTML.

URL: ${url}${hints}

HTML (truncated to ${MAX_HTML_CHARS} chars):
${html.substring(0, MAX_HTML_CHARS)}

Extract and respond with JSON only — use null for any field you cannot find with confidence:
{
  "is_product_page": true,
  "confidence": 0.0 to 1.0,
  "source_evidence": "brief quote from HTML supporting your extraction",
  "price": number or null,
  "currency": "USD" or "EUR" or "GBP" or "CNY" or null,
  "price_breaks": [{"qty": number, "price": number}] or [],
  "stock_level": "in_stock" or "out_of_stock" or "limited" or "unknown",
  "stock_quantity": number or null,
  "incoming_quantity": number or null,
  "incoming_eta": "YYYY-MM-DD" or null,
  "lead_time_days": number or null,
  "moq": number or null,
  "part_number": "exact part number string" or null,
  "standard_name": "manufacturer's exact product name as written on the page" or null,
  "form_factor": "SFP+" or "QSFP28" or "QSFP-DD" etc or null,
  "speed_gbps": number or null
}

Rules:
- standard_name MUST be the manufacturer's exact product designation, not a generic description
- If you see "All Optical Transceivers" or similar category text as the name, set standard_name to null
- price_breaks only if there is a visible quantity/price table
- incoming_quantity: look for text like "X units incoming", "X im Zulauf", "Expected: X"
- Set confidence < 0.5 if you are guessing`;

  const raw = await ollamaGenerate(prompt);
  try {
    const parsed = JSON.parse(raw);
    return {
      // A missing flag defaults to true: this stage is only run on pages
      // already believed to be product pages.
      is_product_page: Boolean(parsed.is_product_page ?? true),
      confidence: toFiniteNumberOrNull(parsed.confidence) ?? 0,
      source_evidence: String(parsed.source_evidence || ""),
      price: toFiniteNumberOrNull(parsed.price),
      currency: parsed.currency || null,
      price_breaks: Array.isArray(parsed.price_breaks) ? parsed.price_breaks : [],
      // Anything outside the three known levels collapses to "unknown".
      stock_level: ["in_stock", "out_of_stock", "limited"].includes(parsed.stock_level)
        ? parsed.stock_level
        : "unknown",
      stock_quantity: toFiniteNumberOrNull(parsed.stock_quantity),
      incoming_quantity: toFiniteNumberOrNull(parsed.incoming_quantity),
      incoming_eta: parsed.incoming_eta || null,
      lead_time_days: toFiniteNumberOrNull(parsed.lead_time_days),
      moq: toFiniteNumberOrNull(parsed.moq),
      part_number: parsed.part_number || null,
      standard_name: parsed.standard_name || null,
      form_factor: parsed.form_factor || null,
      speed_gbps: toFiniteNumberOrNull(parsed.speed_gbps),
    };
  } catch {
    return emptyExtraction(0, "JSON parse failed");
  }
}

// ─────────────────────────────────────────────────────────────────────────────
// Market intelligence extraction
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Asks the LLM to analyse free text for market intelligence relevant to
 * transceiver procurement. Only the first 8000 characters of `text` are sent.
 *
 * @param text - Source text to analyse.
 * @param url - URL of the source, included in the prompt.
 * @param sourceName - Human-readable source name, included in the prompt.
 * @returns The normalised analysis. Missing fields fall back to defaults
 *          (`intel_type: "supply_chain"`, `buy_signal_implication: "none"`,
 *          `impact_horizon_months: 6`); the title is capped at 200 characters.
 *          If the LLM output cannot be parsed as JSON, a non-relevant result
 *          with confidence 0 is returned.
 */
export async function extractMarketIntel(
  text: string,
  url: string,
  sourceName: string
): Promise<MarketIntelExtractionResult> {
  const prompt = `You are an optical transceiver market analyst. Analyze this text for market intelligence relevant to transceiver procurement.

Source: ${sourceName}
URL: ${url}

Text:
${text.substring(0, 8000)}

Respond with JSON only:
{
  "is_relevant": true or false (false if nothing relevant to transceiver markets),
  "confidence": 0.0 to 1.0,
  "source_evidence": "brief quote supporting your analysis",
  "intel_type": one of: "capex_cycle", "trade_show", "standard_ratified", "standard_draft", "distributor_lead_time", "supply_chain", "tender",
  "title": "concise title (max 100 chars)",
  "summary": "2-3 sentence summary of the key insight",
  "technologies": ["400G", "QSFP-DD", etc — transceiver technologies mentioned],
  "buy_signal_implication": one of: "buy_now", "wait", "hold", "monitor", "none",
  "impact_horizon_months": estimated months until this affects the market (number),
  "published_at": "YYYY-MM-DD" or null
}

Guidelines:
- buy_now: shortage, EOL, CapEx surge → order before prices rise
- wait: new standard coming → current products will drop in price
- hold: stable market, no urgency
- monitor: interesting but unclear impact
- impact_horizon_months: 0-3 for immediate, 3-12 for medium, 12+ for long-term`;

  const raw = await ollamaGenerate(prompt);
  try {
    const p = JSON.parse(raw);
    return {
      is_relevant: Boolean(p.is_relevant),
      confidence: toFiniteNumberOrNull(p.confidence) ?? 0,
      source_evidence: String(p.source_evidence || ""),
      intel_type: p.intel_type || "supply_chain",
      title: String(p.title || "").substring(0, 200),
      summary: String(p.summary || ""),
      technologies: Array.isArray(p.technologies) ? p.technologies : [],
      buy_signal_implication: p.buy_signal_implication || "none",
      // 0 is a valid answer ("immediate"), so only a missing/non-numeric
      // value falls back to the 6-month default.
      impact_horizon_months: toFiniteNumberOrNull(p.impact_horizon_months) ?? 6,
      published_at: p.published_at || null,
    };
  } catch {
    return {
      is_relevant: false,
      confidence: 0,
      source_evidence: "parse error",
      intel_type: "supply_chain",
      title: "",
      summary: "",
      technologies: [],
      buy_signal_implication: "none",
      impact_horizon_months: 0,
      published_at: null,
    };
  }
}

// ─────────────────────────────────────────────────────────────────────────────
// Public API — Main scrape function
// ─────────────────────────────────────────────────────────────────────────────

/** Result of a scrape: the extraction plus the outcome of rule-based validation. */
export interface CrawlerLLMResult {
  extraction: StockExtractionResult;
  validation_passed: boolean;
  validation_errors: string[];
  validation_warnings: string[];
}

/**
 * Runs the scrape pipeline on one page: optional page-type detection, full
 * extraction, then `validateStockExtraction`.
 *
 * @param html - Raw page HTML.
 * @param url - Page URL.
 * @param options - `vendorSlug` selects vendor hints; `speedGbps` is passed to
 *        the validator; `skipPageDetection` bypasses stage 1. `vendorId` and
 *        `transceiverIds` are accepted but not read by this function.
 * @returns The extraction with validation outcome. When stage 1 classifies the
 *          page as not a product page, an empty extraction is returned with
 *          `validation_errors: ["Not a product page"]`.
 */
export async function scrapeWithLLM(
  html: string,
  url: string,
  options: {
    vendorSlug?: string;
    vendorId?: string;
    transceiverIds?: string[]; // candidate matches (pre-filtered by form_factor/speed)
    speedGbps?: number;
    skipPageDetection?: boolean; // set true if URL is known product page
  } = {}
): Promise<CrawlerLLMResult> {
  const { vendorSlug, speedGbps, skipPageDetection } = options;

  // Stage 1: Page type detection (skip if caller already knows it's a product page)
  if (!skipPageDetection) {
    const pageType = await detectPageType(html, url, vendorSlug);
    if (!pageType.is_product_page) {
      return {
        extraction: emptyExtraction(pageType.confidence, pageType.evidence),
        validation_passed: false,
        validation_errors: ["Not a product page"],
        validation_warnings: [],
      };
    }
  }

  // Stage 2: Full extraction
  const extraction = await extractProductData(html, url, vendorSlug);

  // Stage 3: Rule-based validation
  const validation = validateStockExtraction(extraction, speedGbps);

  return {
    extraction,
    validation_passed: validation.passed,
    validation_errors: validation.errors,
    validation_warnings: validation.warnings,
  };
}

// ─────────────────────────────────────────────────────────────────────────────
// Persist to DB — saves stock snapshot and logs the scrape
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Writes a scrape result to the database.
 *
 * A row is always inserted into `crawler_llm_log`. Rows in `stock_snapshots`
 * (one per entry of `transceiverIds`) are inserted only when validation passed
 * and the page was classified as a product page.
 *
 * @param result - Output of `scrapeWithLLM`.
 * @param url - Source URL stored with the log row and each snapshot.
 * @param vendorId - Vendor identifier stored with the log row and each snapshot.
 * @param transceiverIds - Transceivers to attach the snapshot to.
 * @returns Resolves once all inserts have completed; rejects if a query fails.
 */
export async function persistStockSnapshot(
  result: CrawlerLLMResult,
  url: string,
  vendorId: string,
  transceiverIds: string[]
): Promise<void> {
  const { extraction, validation_passed } = result;

  // Always log (for audit/debug)
  await pool.query(
    `INSERT INTO crawler_llm_log (url, vendor_id, is_product_page, extracted_data, confidence, validation_passed, failure_reason, model_used) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`,
    [
      url,
      vendorId,
      extraction.is_product_page,
      JSON.stringify(extraction),
      extraction.confidence,
      validation_passed,
      validation_passed ? null : result.validation_errors.join("; "),
      OLLAMA_MODEL,
    ]
  );

  if (!validation_passed || !extraction.is_product_page) return;

  // Save stock snapshot for each matched transceiver
  for (const transceiverIdStr of transceiverIds) {
    await pool.query(
      `INSERT INTO stock_snapshots (transceiver_id, vendor_id, stock_level, stock_quantity, incoming_quantity, incoming_eta, lead_time_days, moq, price_breaks, source_url, crawler_confidence) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)`,
      [
        transceiverIdStr,
        vendorId,
        extraction.stock_level,
        extraction.stock_quantity,
        extraction.incoming_quantity,
        extraction.incoming_eta,
        extraction.lead_time_days,
        extraction.moq,
        // An empty price-break list is stored as NULL rather than "[]".
        extraction.price_breaks.length > 0 ? JSON.stringify(extraction.price_breaks) : null,
        url,
        extraction.confidence,
      ]
    );
  }
}