- Migration 019: stock_snapshots, abc_classification, reorder_signals, product_lifecycle_events, market_intelligence, crawler_llm_log tables
- Seeded 7 market intel events (OFC 2026, AWS/Azure CapEx, Coherent lead times, EU TED tenders, ECOC 2026, IEEE 802.3df)
- Seeded 4 lifecycle events (Cisco SFP-10G-LR EOL, Juniper EOL, 400ZR ratified, 800G MSA draft)
- Crawler LLM: core.ts (Ollama-based extractor), stock-schema.ts (typed schemas + vendor profiles for Flexoptix/FS.com/10Gtek/ATGBICS/ProLabs/Farnell/Mouser), validator.ts (rule-based sanity checks + cross-validation)
- market-intelligence.ts scraper: OFC/ECOC, LightReading, IEEE 802.3, EU TED, Farnell/Mouser lead times, FierceTelecom — weekly via pg-boss
- computeAbcClassification(): dynamic A/B/C classification from price obs + compat count + vendor breadth
- computeReorderSignals(): buy_now/wait/hold/monitor with reasons + signal strength
- API: GET /api/procurement/overview|signals|signals/:id|abc|market-intel|stock-trends/:id|lifecycle
- Dashboard: Procurement Intel tab with Reorder Signals, ABC table, Market Intel cards, Lifecycle Events
350 lines
14 KiB
TypeScript
350 lines
14 KiB
TypeScript
/**
 * Crawler LLM — Core extraction engine.
 *
 * Uses Ollama (local LLM) to extract structured product data from HTML.
 * Two-stage pipeline:
 *   1. Page type detection (product vs. category) — cheap, fast
 *   2. Structured data extraction with schema enforcement
 *
 * Vendor-specific profiles guide the LLM without hard-coding selectors.
 */
|
|
|
|
import { pool } from "../utils/db";
|
|
import type { StockExtractionResult, MarketIntelExtractionResult } from "./stock-schema";
|
|
import { VENDOR_PROFILES } from "./stock-schema";
|
|
import { validateStockExtraction } from "./validator";
|
|
|
|
/** Base URL of the Ollama server; the OLLAMA_HOST environment variable overrides the built-in default. */
const OLLAMA_HOST = process.env.OLLAMA_HOST || "http://192.168.178.169:11434";

/** Model name sent with every generate request; the CRAWLER_LLM_MODEL environment variable overrides it. */
const OLLAMA_MODEL = process.env.CRAWLER_LLM_MODEL || "qwen2.5:14b";

/** Maximum number of HTML characters embedded in the product-extraction prompt, to keep the prompt manageable. */
const MAX_HTML_CHARS = 12_000;
|
|
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
// Ollama API call
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Sends one non-streaming request to the Ollama `/api/generate` endpoint and
 * returns the text the model produced.
 *
 * The request asks for JSON-formatted output (`format: "json"`) and passes
 * `temperature: 0.1` and `num_predict: 1024` as generation options.
 *
 * @param prompt - Complete prompt text sent to the model.
 * @returns The `response` field of the reply. When the reply has no string
 *   `response`, an empty string is returned so the declared `Promise<string>`
 *   always holds; `JSON.parse("")` throws, which callers that parse the result
 *   inside a try/catch treat as a parse failure.
 * @throws Error when the HTTP status is not OK; the message contains the status
 *   code and the response body text.
 */
async function ollamaGenerate(prompt: string): Promise<string> {
  const res = await fetch(`${OLLAMA_HOST}/api/generate`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: OLLAMA_MODEL,
      prompt,
      stream: false,
      format: "json",
      options: { temperature: 0.1, num_predict: 1024 },
    }),
  });
  if (!res.ok) throw new Error(`Ollama error: ${res.status} ${await res.text()}`);

  // The body is external input: narrow it instead of asserting its shape.
  const data: unknown = await res.json();
  const response =
    typeof data === "object" && data !== null
      ? (data as { response?: unknown }).response
      : undefined;
  return typeof response === "string" ? response : "";
}
|
|
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
// Stage 1: Page type detection (fast, binary)
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Stage 1 of the pipeline: asks the LLM whether the HTML is a single product
 * page or a category/listing page.
 *
 * Only the first 3000 characters of `html` are placed in the prompt. When
 * `vendorSlug` resolves to an entry in `VENDOR_PROFILES`, that profile's product
 * and category page signals are appended to the prompt as hints.
 *
 * @param html - Raw page HTML.
 * @param url - URL the HTML was fetched from (included in the prompt).
 * @param vendorSlug - Optional key into `VENDOR_PROFILES`.
 * @returns The model's verdict. `confidence` is limited to the range 0–1. If the
 *   model output cannot be parsed, the result is
 *   `{ is_product_page: false, confidence: 0, evidence: "JSON parse failed" }`.
 * @throws Whatever `ollamaGenerate` throws (the call is outside the try block).
 */
async function detectPageType(html: string, url: string, vendorSlug?: string): Promise<{
  is_product_page: boolean;
  confidence: number;
  evidence: string;
}> {
  const profile = vendorSlug ? VENDOR_PROFILES[vendorSlug] : null;
  const hints = profile
    ? `\nVendor hints — Product page signals: ${profile.product_page_signals.join(", ")}. Category page signals: ${profile.category_page_signals.join(", ")}.`
    : "";

  const prompt = `You are a web scraper assistant. Determine if this HTML is a single product page or a category/listing page.

URL: ${url}${hints}

HTML (truncated):
${html.substring(0, 3000)}

Respond with JSON only:
{
  "is_product_page": true or false,
  "confidence": 0.0 to 1.0,
  "evidence": "brief quote from the HTML that supports your decision"
}`;

  const raw = await ollamaGenerate(prompt);
  try {
    const parsed = JSON.parse(raw);

    // Models sometimes emit booleans as strings; Boolean("false") would be true,
    // so strings are compared textually and everything else keeps plain coercion.
    const flag = parsed.is_product_page;
    const isProductPage =
      typeof flag === "string" ? flag.trim().toLowerCase() === "true" : Boolean(flag);

    // Non-numeric confidence becomes 0; out-of-range values are pulled into [0, 1].
    const confidence = Number(parsed.confidence);

    return {
      is_product_page: isProductPage,
      confidence: Number.isFinite(confidence) ? Math.min(1, Math.max(0, confidence)) : 0,
      evidence: String(parsed.evidence || ""),
    };
  } catch {
    // Covers malformed JSON as well as a literal `null` reply.
    return { is_product_page: false, confidence: 0, evidence: "JSON parse failed" };
  }
}
|
|
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
// Stage 2: Full product extraction
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Stage 2 of the pipeline: asks the LLM to extract price, stock and product
 * identity fields from a product page and normalizes the reply.
 *
 * At most `MAX_HTML_CHARS` characters of `html` are placed in the prompt. When
 * `vendorSlug` resolves to an entry in `VENDOR_PROFILES`, the vendor name,
 * currency and price/stock hints are added to the prompt.
 *
 * Normalization applied to the model output:
 * - numeric fields accept finite numbers or numeric strings; anything else
 *   (including `NaN` and empty strings) becomes `null`;
 * - `price_breaks` keeps only entries whose `qty` and `price` are both numeric;
 * - `stock_level` outside in_stock / out_of_stock / limited becomes "unknown";
 * - `is_product_page` defaults to `true` when absent, and string values are
 *   compared textually so that `"false"` is not treated as true;
 * - `confidence` is limited to the range 0–1.
 *
 * @param html - Raw page HTML.
 * @param url - URL the HTML was fetched from (included in the prompt).
 * @param vendorSlug - Optional key into `VENDOR_PROFILES`.
 * @returns The normalized extraction. If the reply is not a JSON object, an
 *   all-null result with `is_product_page: false`, `confidence: 0` and
 *   `source_evidence: "JSON parse failed"` is returned.
 * @throws Whatever `ollamaGenerate` throws (the call is outside the try block).
 */
async function extractProductData(
  html: string,
  url: string,
  vendorSlug?: string
): Promise<StockExtractionResult> {
  const profile = vendorSlug ? VENDOR_PROFILES[vendorSlug] : null;
  const hints = profile ? `
Vendor: ${profile.name} (${profile.currency})
Price hint: ${profile.price_hint || "find the main selling price"}
Stock hint: ${profile.stock_hint || "find availability status"}` : "";

  const prompt = `You are a product data extractor for optical transceiver products. Extract structured data from this product page HTML.

URL: ${url}${hints}

HTML (truncated to ${MAX_HTML_CHARS} chars):
${html.substring(0, MAX_HTML_CHARS)}

Extract and respond with JSON only — use null for any field you cannot find with confidence:
{
  "is_product_page": true,
  "confidence": 0.0 to 1.0,
  "source_evidence": "brief quote from HTML supporting your extraction",

  "price": number or null,
  "currency": "USD" or "EUR" or "GBP" or "CNY" or null,
  "price_breaks": [{"qty": number, "price": number}] or [],

  "stock_level": "in_stock" or "out_of_stock" or "limited" or "unknown",
  "stock_quantity": number or null,
  "incoming_quantity": number or null,
  "incoming_eta": "YYYY-MM-DD" or null,
  "lead_time_days": number or null,
  "moq": number or null,

  "part_number": "exact part number string" or null,
  "standard_name": "manufacturer's exact product name as written on the page" or null,
  "form_factor": "SFP+" or "QSFP28" or "QSFP-DD" etc or null,
  "speed_gbps": number or null
}

Rules:
- standard_name MUST be the manufacturer's exact product designation, not a generic description
- If you see "All Optical Transceivers" or similar category text as the name, set standard_name to null
- price_breaks only if there is a visible quantity/price table
- incoming_quantity: look for text like "X units incoming", "X im Zulauf", "Expected: X"
- Set confidence < 0.5 if you are guessing`;

  // Finite numbers and numeric strings pass; everything else (NaN, "", objects) is null.
  const toNumberOrNull = (value: unknown): number | null => {
    if (typeof value === "number") return Number.isFinite(value) ? value : null;
    if (typeof value === "string" && value.trim() !== "") {
      const converted = Number(value);
      return Number.isFinite(converted) ? converted : null;
    }
    return null;
  };

  const raw = await ollamaGenerate(prompt);
  try {
    const parsed = JSON.parse(raw);
    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
      // A bare array/number/string is not an extraction; route it to the fallback.
      throw new SyntaxError("LLM reply is not a JSON object");
    }

    // Boolean("false") is true, so string flags are compared textually.
    const pageFlag = parsed.is_product_page ?? true;
    const isProductPage =
      typeof pageFlag === "string" ? pageFlag.trim().toLowerCase() === "true" : Boolean(pageFlag);

    const confidence = toNumberOrNull(parsed.confidence) ?? 0;

    // Keep only well-formed quantity/price pairs.
    const priceBreaks: { qty: number; price: number }[] = [];
    if (Array.isArray(parsed.price_breaks)) {
      for (const entry of parsed.price_breaks) {
        const qty = toNumberOrNull(entry?.qty);
        const price = toNumberOrNull(entry?.price);
        if (qty !== null && price !== null) priceBreaks.push({ qty, price });
      }
    }

    return {
      is_product_page: isProductPage,
      confidence: Math.min(1, Math.max(0, confidence)),
      source_evidence: String(parsed.source_evidence || ""),
      price: toNumberOrNull(parsed.price),
      currency: parsed.currency || null,
      price_breaks: priceBreaks,
      stock_level: ["in_stock", "out_of_stock", "limited"].includes(parsed.stock_level)
        ? parsed.stock_level
        : "unknown",
      stock_quantity: toNumberOrNull(parsed.stock_quantity),
      incoming_quantity: toNumberOrNull(parsed.incoming_quantity),
      incoming_eta: parsed.incoming_eta || null,
      lead_time_days: toNumberOrNull(parsed.lead_time_days),
      moq: toNumberOrNull(parsed.moq),
      part_number: parsed.part_number || null,
      standard_name: parsed.standard_name || null,
      form_factor: parsed.form_factor || null,
      speed_gbps: toNumberOrNull(parsed.speed_gbps),
    };
  } catch {
    return {
      is_product_page: false,
      confidence: 0,
      source_evidence: "JSON parse failed",
      price: null, currency: null, price_breaks: [],
      stock_level: "unknown",
      stock_quantity: null, incoming_quantity: null, incoming_eta: null,
      lead_time_days: null, moq: null,
      part_number: null, standard_name: null, form_factor: null, speed_gbps: null,
    };
  }
}
|
|
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
// Market intelligence extraction
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Asks the LLM to classify a piece of text as transceiver market intelligence
 * and to summarize it.
 *
 * Only the first 8000 characters of `text` are placed in the prompt.
 *
 * Normalization applied to the model output:
 * - `is_relevant` given as a string is compared textually, so `"false"` is not
 *   treated as true;
 * - `confidence` is limited to the range 0–1 (non-numeric becomes 0);
 * - `impact_horizon_months` keeps any finite number, including `0`; only a
 *   missing or non-numeric value falls back to 6;
 * - `title` is cut to 200 characters;
 * - missing `intel_type` defaults to "supply_chain" and missing
 *   `buy_signal_implication` defaults to "none".
 *
 * @param text - Article or page text to analyze.
 * @param url - Source URL (included in the prompt).
 * @param sourceName - Human-readable source name (included in the prompt).
 * @returns The normalized analysis. If the reply cannot be parsed, a
 *   non-relevant result with `source_evidence: "parse error"` is returned.
 * @throws Whatever `ollamaGenerate` throws (the call is outside the try block).
 */
export async function extractMarketIntel(
  text: string,
  url: string,
  sourceName: string
): Promise<MarketIntelExtractionResult> {
  const prompt = `You are an optical transceiver market analyst. Analyze this text for market intelligence relevant to transceiver procurement.

Source: ${sourceName}
URL: ${url}

Text:
${text.substring(0, 8000)}

Respond with JSON only:
{
  "is_relevant": true or false (false if nothing relevant to transceiver markets),
  "confidence": 0.0 to 1.0,
  "source_evidence": "brief quote supporting your analysis",

  "intel_type": one of: "capex_cycle", "trade_show", "standard_ratified", "standard_draft", "distributor_lead_time", "supply_chain", "tender",
  "title": "concise title (max 100 chars)",
  "summary": "2-3 sentence summary of the key insight",
  "technologies": ["400G", "QSFP-DD", etc — transceiver technologies mentioned],
  "buy_signal_implication": one of: "buy_now", "wait", "hold", "monitor", "none",
  "impact_horizon_months": estimated months until this affects the market (number),
  "published_at": "YYYY-MM-DD" or null
}

Guidelines:
- buy_now: shortage, EOL, CapEx surge → order before prices rise
- wait: new standard coming → current products will drop in price
- hold: stable market, no urgency
- monitor: interesting but unclear impact
- impact_horizon_months: 0-3 for immediate, 3-12 for medium, 12+ for long-term`;

  // Finite numbers and numeric strings pass; null, "", NaN and objects do not.
  const toFiniteNumber = (value: unknown): number | null => {
    if (typeof value === "number") return Number.isFinite(value) ? value : null;
    if (typeof value === "string" && value.trim() !== "") {
      const converted = Number(value);
      return Number.isFinite(converted) ? converted : null;
    }
    return null;
  };

  const raw = await ollamaGenerate(prompt);
  try {
    const p = JSON.parse(raw);

    // Boolean("false") is true, so string flags are compared textually.
    const relevantFlag = p.is_relevant;
    const isRelevant =
      typeof relevantFlag === "string"
        ? relevantFlag.trim().toLowerCase() === "true"
        : Boolean(relevantFlag);

    const confidence = toFiniteNumber(p.confidence) ?? 0;

    return {
      is_relevant: isRelevant,
      confidence: Math.min(1, Math.max(0, confidence)),
      source_evidence: String(p.source_evidence || ""),
      intel_type: p.intel_type || "supply_chain",
      title: String(p.title || "").substring(0, 200),
      summary: String(p.summary || ""),
      technologies: Array.isArray(p.technologies) ? p.technologies : [],
      buy_signal_implication: p.buy_signal_implication || "none",
      // `??` rather than `||`: a horizon of 0 months means "immediate" and must survive.
      impact_horizon_months: toFiniteNumber(p.impact_horizon_months) ?? 6,
      published_at: p.published_at || null,
    };
  } catch {
    // Covers malformed JSON as well as a literal `null` reply.
    return {
      is_relevant: false, confidence: 0, source_evidence: "parse error",
      intel_type: "supply_chain", title: "", summary: "", technologies: [],
      buy_signal_implication: "none", impact_horizon_months: 0, published_at: null,
    };
  }
}
|
|
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
// Public API — Main scrape function
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Outcome of one `scrapeWithLLM` run: the extracted data together with the
 * verdict of the rule-based validation step.
 */
export interface CrawlerLLMResult {
  /** Data extracted from the page (an all-null placeholder when the page was rejected as a non-product page). */
  extraction: StockExtractionResult;

  /** `true` when validation accepted the extraction. */
  validation_passed: boolean;

  /** Reasons the extraction was rejected; joined with "; " when logged as the failure reason. */
  validation_errors: string[];

  /** Non-fatal findings reported by validation. */
  validation_warnings: string[];
}
|
|
|
|
/**
 * Runs the full LLM scrape for one page: optional page-type detection,
 * structured extraction, then rule-based validation.
 *
 * @param html - Raw page HTML.
 * @param url - URL the HTML was fetched from.
 * @param options - Optional settings. `vendorSlug` selects the vendor profile
 *   used for prompt hints, `speedGbps` is handed to the validator, and
 *   `skipPageDetection` bypasses stage 1. `vendorId` and `transceiverIds` are
 *   accepted but not read by this function.
 * @returns The extraction plus validation verdict. When stage 1 decides the page
 *   is not a product page, an all-null extraction is returned with
 *   `validation_passed: false` and the single error "Not a product page".
 */
export async function scrapeWithLLM(
  html: string,
  url: string,
  options: {
    vendorSlug?: string;
    vendorId?: string;
    transceiverIds?: string[]; // candidate matches (pre-filtered by form_factor/speed)
    speedGbps?: number;
    skipPageDetection?: boolean; // set true if URL is known product page
  } = {}
): Promise<CrawlerLLMResult> {
  const { vendorSlug: slug, speedGbps: expectedSpeed, skipPageDetection: pageTypeKnown } = options;

  // Stage 1 — cheap page-type check, unless the caller vouches for the URL.
  if (!pageTypeKnown) {
    const verdict = await detectPageType(html, url, slug);
    if (!verdict.is_product_page) {
      const placeholder: StockExtractionResult = {
        is_product_page: false,
        confidence: verdict.confidence,
        source_evidence: verdict.evidence,
        price: null,
        currency: null,
        price_breaks: [],
        stock_level: "unknown",
        stock_quantity: null,
        incoming_quantity: null,
        incoming_eta: null,
        lead_time_days: null,
        moq: null,
        part_number: null,
        standard_name: null,
        form_factor: null,
        speed_gbps: null,
      };
      return {
        extraction: placeholder,
        validation_passed: false,
        validation_errors: ["Not a product page"],
        validation_warnings: [],
      };
    }
  }

  // Stage 2 — structured extraction.
  const extraction = await extractProductData(html, url, slug);

  // Stage 3 — rule-based sanity checks on the extracted values.
  const { passed, errors, warnings } = validateStockExtraction(extraction, expectedSpeed);

  return {
    extraction,
    validation_passed: passed,
    validation_errors: errors,
    validation_warnings: warnings,
  };
}
|
|
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
// Persist to DB — saves stock snapshot and logs the scrape
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Writes the outcome of a scrape to the database.
 *
 * A row is always inserted into `crawler_llm_log`. Rows in `stock_snapshots`
 * are inserted — one per entry in `transceiverIds`, sequentially — only when
 * validation passed and the extraction is flagged as a product page.
 *
 * @param result - Extraction and validation outcome to persist.
 * @param url - Page URL; stored in the log row and as the snapshot's source URL.
 * @param vendorId - Vendor identifier stored with every row.
 * @param transceiverIds - Transceivers the snapshot applies to.
 * @throws Whatever `pool.query` rejects with; no error handling is done here.
 */
export async function persistStockSnapshot(
  result: CrawlerLLMResult,
  url: string,
  vendorId: string,
  transceiverIds: string[]
): Promise<void> {
  const { extraction: data, validation_passed: passed } = result;

  const logSql = `INSERT INTO crawler_llm_log
       (url, vendor_id, is_product_page, extracted_data, confidence, validation_passed,
        failure_reason, model_used)
     VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`;

  const snapshotSql = `INSERT INTO stock_snapshots
         (transceiver_id, vendor_id, stock_level, stock_quantity, incoming_quantity,
          incoming_eta, lead_time_days, moq, price_breaks, source_url, crawler_confidence)
       VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)`;

  // The audit row is written unconditionally, before any early exit.
  const failureReason = passed ? null : result.validation_errors.join("; ");
  await pool.query(logSql, [
    url,
    vendorId,
    data.is_product_page,
    JSON.stringify(data),
    data.confidence,
    passed,
    failureReason,
    OLLAMA_MODEL,
  ]);

  // Snapshots are stored only for validated product pages.
  const storable = passed && data.is_product_page;
  if (!storable) return;

  for (const transceiverId of transceiverIds) {
    // An empty price-break list is stored as NULL rather than "[]".
    const priceBreaksJson =
      data.price_breaks.length > 0 ? JSON.stringify(data.price_breaks) : null;

    await pool.query(snapshotSql, [
      transceiverId,
      vendorId,
      data.stock_level,
      data.stock_quantity,
      data.incoming_quantity,
      data.incoming_eta,
      data.lead_time_days,
      data.moq,
      priceBreaksJson,
      url,
      data.confidence,
    ]);
  }
}
|