Rene Fichtmueller 49ccf9a5d2 feat: Procurement Intelligence Engine (WS0c)
- Migration 019: stock_snapshots, abc_classification, reorder_signals,
  product_lifecycle_events, market_intelligence, crawler_llm_log tables
- Seeded 7 market intel events (OFC 2026, AWS/Azure CapEx, Coherent lead times,
  EU TED tenders, ECOC 2026, IEEE 802.3df)
- Seeded 4 lifecycle events (Cisco SFP-10G-LR EOL, Juniper EOL,
  400ZR ratified, 800G MSA draft)
- Crawler LLM: core.ts (Ollama-based extractor), stock-schema.ts (typed schemas
  + vendor profiles for Flexoptix/FS.com/10Gtek/ATGBICS/ProLabs/Farnell/Mouser),
  validator.ts (rule-based sanity checks + cross-validation)
- market-intelligence.ts scraper: OFC/ECOC, LightReading, IEEE 802.3, EU TED,
  Farnell/Mouser lead times, FierceTelecom — weekly via pg-boss
- computeAbcClassification(): dynamic A/B/C classification from price obs +
  compat count + vendor breadth
- computeReorderSignals(): buy_now/wait/hold/monitor with reasons + signal strength
- API: GET /api/procurement/overview|signals|signals/:id|abc|market-intel|
  stock-trends/:id|lifecycle
- Dashboard: Procurement Intel tab with Reorder Signals, ABC table,
  Market Intel cards, Lifecycle Events
2026-04-01 22:04:33 +02:00

350 lines
14 KiB
TypeScript

/**
* Crawler LLM — Core extraction engine.
*
* Uses Ollama (local LLM) to extract structured product data from HTML.
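* Pipeline (two LLM stages followed by a rule-based check):
* 1. Page type detection (product vs. category) — cheap, fast
* 2. Structured data extraction — the model is asked for JSON and each field is coerced in code
* 3. Rule-based validation of the extracted fields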
*
* Vendor-specific profiles guide the LLM without hard-coding selectors.
*/
import { pool } from "../utils/db";
import type { StockExtractionResult, MarketIntelExtractionResult } from "./stock-schema";
import { VENDOR_PROFILES } from "./stock-schema";
import { validateStockExtraction } from "./validator";
/** Base URL of the Ollama HTTP API; a non-empty OLLAMA_HOST environment variable overrides the built-in default. */
const OLLAMA_HOST = process.env.OLLAMA_HOST
  ? process.env.OLLAMA_HOST
  : "http://192.168.178.169:11434";
/** Model name sent to Ollama; a non-empty CRAWLER_LLM_MODEL environment variable overrides the built-in default. */
const OLLAMA_MODEL = process.env.CRAWLER_LLM_MODEL
  ? process.env.CRAWLER_LLM_MODEL
  : "qwen2.5:14b";
/** Maximum number of HTML characters embedded in the extraction prompt, to keep the prompt manageable. */
const MAX_HTML_CHARS = 12000;
// ─────────────────────────────────────────────────────────────────────────────
// Ollama API call
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Sends one non-streaming generation request to the Ollama server.
 *
 * The request asks for JSON-formatted output (`format: "json"`), uses a
 * temperature of 0.1 and caps generation at 1024 predicted tokens.
 *
 * @param prompt - Full prompt text forwarded to the model.
 * @returns The `response` field of Ollama's JSON reply.
 * @throws Error when the HTTP status is not OK; the message contains the
 *   status code and the response body text.
 */
async function ollamaGenerate(prompt: string): Promise<string> {
  const endpoint = `${OLLAMA_HOST}/api/generate`;
  const payload = {
    model: OLLAMA_MODEL,
    prompt,
    stream: false,
    format: "json",
    options: { temperature: 0.1, num_predict: 1024 },
  };
  const reply = await fetch(endpoint, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
  });
  if (reply.ok) {
    const { response } = (await reply.json()) as { response: string };
    return response;
  }
  const detail = await reply.text();
  throw new Error(`Ollama error: ${reply.status} ${detail}`);
}
// ─────────────────────────────────────────────────────────────────────────────
// Stage 1: Page type detection (fast, binary)
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Stage 1 — asks the LLM whether the page is a single product page or a
 * category/listing page.
 *
 * Only the first 3000 characters of `html` are sent to the model. When a
 * vendor profile exists for `vendorSlug`, its product/category page signals
 * are appended to the prompt as hints.
 *
 * @param html - Raw page HTML.
 * @param url - Page URL, included in the prompt.
 * @param vendorSlug - Optional key into `VENDOR_PROFILES`.
 * @returns The model's verdict. If the reply cannot be parsed as JSON the
 *   result is `{ is_product_page: false, confidence: 0, evidence: "JSON parse failed" }`.
 *   Errors thrown by the Ollama call itself are not caught here.
 */
async function detectPageType(html: string, url: string, vendorSlug?: string): Promise<{
  is_product_page: boolean;
  confidence: number;
  evidence: string;
}> {
  const profile = vendorSlug ? VENDOR_PROFILES[vendorSlug] : null;
  const hints = profile
    ? `\nVendor hints — Product page signals: ${profile.product_page_signals.join(", ")}. Category page signals: ${profile.category_page_signals.join(", ")}.`
    : "";
  const prompt = `You are a web scraper assistant. Determine if this HTML is a single product page or a category/listing page.
URL: ${url}${hints}
HTML (truncated):
${html.substring(0, 3000)}
Respond with JSON only:
{
"is_product_page": true or false,
"confidence": 0.0 to 1.0,
"evidence": "brief quote from the HTML that supports your decision"
}`;
  const raw = await ollamaGenerate(prompt);
  try {
    const parsed = JSON.parse(raw);
    // Models sometimes emit booleans as strings. Boolean("false") is true, which
    // would send a listing page on to full extraction, so negative strings are
    // mapped to false explicitly; every other value keeps plain truthiness.
    const flag: unknown = parsed.is_product_page;
    const negativeWords = ["", "false", "no", "0", "null"];
    const isProductPage =
      typeof flag === "string"
        ? !negativeWords.includes(flag.trim().toLowerCase())
        : Boolean(flag);
    return {
      is_product_page: isProductPage,
      confidence: Number(parsed.confidence) || 0,
      evidence: String(parsed.evidence || ""),
    };
  } catch {
    // Best-effort: an unparseable reply is treated as "not a product page".
    return { is_product_page: false, confidence: 0, evidence: "JSON parse failed" };
  }
}
// ─────────────────────────────────────────────────────────────────────────────
// Stage 2: Full product extraction
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Stage 2 — asks the LLM to extract structured price/stock/product fields
 * from a product page.
 *
 * At most `MAX_HTML_CHARS` characters of `html` are sent. When a vendor
 * profile exists for `vendorSlug`, its name, currency and price/stock hints
 * are appended to the prompt.
 *
 * Field coercion applied to the model's reply:
 * - numeric fields become `null` when missing or not a finite number
 *   (e.g. "N/A"), so `NaN` never leaks into the result;
 * - `is_product_page` defaults to `true` when absent, and negative strings
 *   such as "false" are treated as `false`;
 * - `stock_level` falls back to "unknown" unless it is one of
 *   "in_stock" / "out_of_stock" / "limited";
 * - `price_breaks` keeps only entries whose `qty` and `price` are finite numbers.
 *
 * @param html - Raw page HTML.
 * @param url - Page URL, included in the prompt.
 * @param vendorSlug - Optional key into `VENDOR_PROFILES`.
 * @returns The coerced extraction. If the reply cannot be parsed as JSON, an
 *   all-null result with `is_product_page: false`, `confidence: 0` and
 *   `source_evidence: "JSON parse failed"` is returned. Errors thrown by the
 *   Ollama call itself are not caught here.
 */
async function extractProductData(
  html: string,
  url: string,
  vendorSlug?: string
): Promise<StockExtractionResult> {
  const profile = vendorSlug ? VENDOR_PROFILES[vendorSlug] : null;
  const hints = profile ? `
Vendor: ${profile.name} (${profile.currency})
Price hint: ${profile.price_hint || "find the main selling price"}
Stock hint: ${profile.stock_hint || "find availability status"}` : "";
  const prompt = `You are a product data extractor for optical transceiver products. Extract structured data from this product page HTML.
URL: ${url}${hints}
HTML (truncated to ${MAX_HTML_CHARS} chars):
${html.substring(0, MAX_HTML_CHARS)}
Extract and respond with JSON only — use null for any field you cannot find with confidence:
{
"is_product_page": true,
"confidence": 0.0 to 1.0,
"source_evidence": "brief quote from HTML supporting your extraction",
"price": number or null,
"currency": "USD" or "EUR" or "GBP" or "CNY" or null,
"price_breaks": [{"qty": number, "price": number}] or [],
"stock_level": "in_stock" or "out_of_stock" or "limited" or "unknown",
"stock_quantity": number or null,
"incoming_quantity": number or null,
"incoming_eta": "YYYY-MM-DD" or null,
"lead_time_days": number or null,
"moq": number or null,
"part_number": "exact part number string" or null,
"standard_name": "manufacturer's exact product name as written on the page" or null,
"form_factor": "SFP+" or "QSFP28" or "QSFP-DD" etc or null,
"speed_gbps": number or null
}
Rules:
- standard_name MUST be the manufacturer's exact product designation, not a generic description
- If you see "All Optical Transceivers" or similar category text as the name, set standard_name to null
- price_breaks only if there is a visible quantity/price table
- incoming_quantity: look for text like "X units incoming", "X im Zulauf", "Expected: X"
- Set confidence < 0.5 if you are guessing`;
  const raw = await ollamaGenerate(prompt);

  // Number("N/A") is NaN and Number("") is 0; neither is a real measurement.
  // Anything that is not a finite number (or a numeric string) becomes null,
  // which is the "not found" value the prompt asks the model to use.
  const toNumberOrNull = (value: unknown): number | null => {
    if (typeof value === "number") return Number.isFinite(value) ? value : null;
    if (typeof value === "string" && value.trim() !== "") {
      const numeric = Number(value);
      return Number.isFinite(numeric) ? numeric : null;
    }
    return null;
  };

  try {
    const parsed = JSON.parse(raw);

    // Boolean("false") is true, so string booleans from the model need explicit handling.
    const flag: unknown = parsed.is_product_page ?? true;
    const negativeWords = ["", "false", "no", "0", "null"];
    const isProductPage =
      typeof flag === "string"
        ? !negativeWords.includes(flag.trim().toLowerCase())
        : Boolean(flag);

    // Drop malformed price-break rows instead of passing them downstream.
    const priceBreaks = Array.isArray(parsed.price_breaks)
      ? parsed.price_breaks.flatMap((entry: unknown) => {
          if (typeof entry !== "object" || entry === null) return [];
          const row = entry as { qty?: unknown; price?: unknown };
          const qty = toNumberOrNull(row.qty);
          const price = toNumberOrNull(row.price);
          return qty !== null && price !== null ? [{ ...row, qty, price }] : [];
        })
      : [];

    return {
      is_product_page: isProductPage,
      confidence: Number(parsed.confidence) || 0,
      source_evidence: String(parsed.source_evidence || ""),
      price: toNumberOrNull(parsed.price),
      currency: parsed.currency || null,
      price_breaks: priceBreaks,
      stock_level: (["in_stock", "out_of_stock", "limited"].includes(parsed.stock_level))
        ? parsed.stock_level
        : "unknown",
      stock_quantity: toNumberOrNull(parsed.stock_quantity),
      incoming_quantity: toNumberOrNull(parsed.incoming_quantity),
      incoming_eta: parsed.incoming_eta || null,
      lead_time_days: toNumberOrNull(parsed.lead_time_days),
      moq: toNumberOrNull(parsed.moq),
      part_number: parsed.part_number || null,
      standard_name: parsed.standard_name || null,
      form_factor: parsed.form_factor || null,
      speed_gbps: toNumberOrNull(parsed.speed_gbps),
    };
  } catch {
    // Best-effort: an unparseable reply yields an empty, zero-confidence result.
    return {
      is_product_page: false,
      confidence: 0,
      source_evidence: "JSON parse failed",
      price: null, currency: null, price_breaks: [],
      stock_level: "unknown",
      stock_quantity: null, incoming_quantity: null, incoming_eta: null,
      lead_time_days: null, moq: null,
      part_number: null, standard_name: null, form_factor: null, speed_gbps: null,
    };
  }
}
// ─────────────────────────────────────────────────────────────────────────────
// Market intelligence extraction
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Asks the LLM to analyse a piece of text for market intelligence relevant
 * to transceiver procurement.
 *
 * Only the first 8000 characters of `text` are sent to the model.
 *
 * Field coercion applied to the model's reply:
 * - `is_relevant`: negative strings such as "false" are treated as `false`;
 * - `intel_type` defaults to "supply_chain", `buy_signal_implication` to "none";
 * - `title` is truncated to 200 characters;
 * - `impact_horizon_months` defaults to 6 only when missing or not a finite
 *   number — an explicit 0 (immediate impact) is preserved.
 *
 * @param text - Article/page text to analyse.
 * @param url - Source URL, included in the prompt.
 * @param sourceName - Human-readable source name, included in the prompt.
 * @returns The coerced analysis. If the reply cannot be parsed as JSON, a
 *   non-relevant, zero-confidence result with `source_evidence: "parse error"`
 *   is returned. Errors thrown by the Ollama call itself are not caught here.
 */
export async function extractMarketIntel(
  text: string,
  url: string,
  sourceName: string
): Promise<MarketIntelExtractionResult> {
  const prompt = `You are an optical transceiver market analyst. Analyze this text for market intelligence relevant to transceiver procurement.
Source: ${sourceName}
URL: ${url}
Text:
${text.substring(0, 8000)}
Respond with JSON only:
{
"is_relevant": true or false (false if nothing relevant to transceiver markets),
"confidence": 0.0 to 1.0,
"source_evidence": "brief quote supporting your analysis",
"intel_type": one of: "capex_cycle", "trade_show", "standard_ratified", "standard_draft", "distributor_lead_time", "supply_chain", "tender",
"title": "concise title (max 100 chars)",
"summary": "2-3 sentence summary of the key insight",
"technologies": ["400G", "QSFP-DD", etc — transceiver technologies mentioned],
"buy_signal_implication": one of: "buy_now", "wait", "hold", "monitor", "none",
"impact_horizon_months": estimated months until this affects the market (number),
"published_at": "YYYY-MM-DD" or null
}
Guidelines:
- buy_now: shortage, EOL, CapEx surge → order before prices rise
- wait: new standard coming → current products will drop in price
- hold: stable market, no urgency
- monitor: interesting but unclear impact
- impact_horizon_months: 0-3 for immediate, 3-12 for medium, 12+ for long-term`;
  const raw = await ollamaGenerate(prompt);
  try {
    const p = JSON.parse(raw);

    // Boolean("false") is true, so string booleans from the model need explicit handling.
    const relevantFlag: unknown = p.is_relevant;
    const negativeWords = ["", "false", "no", "0", "null"];
    const isRelevant =
      typeof relevantFlag === "string"
        ? !negativeWords.includes(relevantFlag.trim().toLowerCase())
        : Boolean(relevantFlag);

    // `Number(x) || 6` would turn a legitimate 0 ("immediate") into 6, so the
    // default is applied only when the value is absent or not a finite number.
    const rawHorizon: unknown = p.impact_horizon_months;
    const horizon = rawHorizon == null || rawHorizon === "" ? NaN : Number(rawHorizon);

    return {
      is_relevant: isRelevant,
      confidence: Number(p.confidence) || 0,
      source_evidence: String(p.source_evidence || ""),
      intel_type: p.intel_type || "supply_chain",
      title: String(p.title || "").substring(0, 200),
      summary: String(p.summary || ""),
      technologies: Array.isArray(p.technologies) ? p.technologies : [],
      buy_signal_implication: p.buy_signal_implication || "none",
      impact_horizon_months: Number.isFinite(horizon) ? horizon : 6,
      published_at: p.published_at || null,
    };
  } catch {
    // Best-effort: an unparseable reply is reported as not relevant.
    return {
      is_relevant: false, confidence: 0, source_evidence: "parse error",
      intel_type: "supply_chain", title: "", summary: "", technologies: [],
      buy_signal_implication: "none", impact_horizon_months: 0, published_at: null,
    };
  }
}
// ─────────────────────────────────────────────────────────────────────────────
// Public API — Main scrape function
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Result of one `scrapeWithLLM` run: the extracted fields together with the
 * outcome of the rule-based validation step.
 */
export interface CrawlerLLMResult {
/** Structured data extracted from the page (all-null fields when the page was rejected as a non-product page). */
extraction: StockExtractionResult;
/** Whether validation passed; `false` when the page was rejected before extraction. */
validation_passed: boolean;
/** Validation error messages; `["Not a product page"]` when page detection rejected the page. */
validation_errors: string[];
/** Validation warning messages. */
validation_warnings: string[];
}
/**
 * Runs the full scrape pipeline on one page: optional page-type detection,
 * LLM extraction, then rule-based validation.
 *
 * If page detection runs and reports a non-product page, extraction is
 * skipped and an all-null extraction is returned with
 * `validation_errors: ["Not a product page"]`.
 *
 * @param html - Raw page HTML.
 * @param url - Page URL.
 * @param options - `vendorSlug` selects vendor hints, `speedGbps` is passed to
 *   the validator, `skipPageDetection` bypasses stage 1. `vendorId` and
 *   `transceiverIds` are accepted but not read by this function.
 * @returns The extraction plus the validation verdict.
 */
export async function scrapeWithLLM(
  html: string,
  url: string,
  options: {
    vendorSlug?: string;
    vendorId?: string;
    transceiverIds?: string[]; // candidate matches (pre-filtered by form_factor/speed)
    speedGbps?: number;
    skipPageDetection?: boolean; // set true if URL is known product page
  } = {}
): Promise<CrawlerLLMResult> {
  const vendorSlug = options.vendorSlug;
  const expectedSpeed = options.speedGbps;

  // Stage 1 runs only when the caller has not vouched for the page type.
  const detected = options.skipPageDetection
    ? null
    : await detectPageType(html, url, vendorSlug);

  if (detected !== null && !detected.is_product_page) {
    const emptyExtraction: StockExtractionResult = {
      is_product_page: false,
      confidence: detected.confidence,
      source_evidence: detected.evidence,
      price: null,
      currency: null,
      price_breaks: [],
      stock_level: "unknown",
      stock_quantity: null,
      incoming_quantity: null,
      incoming_eta: null,
      lead_time_days: null,
      moq: null,
      part_number: null,
      standard_name: null,
      form_factor: null,
      speed_gbps: null,
    };
    return {
      extraction: emptyExtraction,
      validation_passed: false,
      validation_errors: ["Not a product page"],
      validation_warnings: [],
    };
  }

  // Stage 2: LLM extraction. Stage 3: rule-based validation of its output.
  const extraction = await extractProductData(html, url, vendorSlug);
  const { passed, errors, warnings } = validateStockExtraction(extraction, expectedSpeed);
  return {
    extraction,
    validation_passed: passed,
    validation_errors: errors,
    validation_warnings: warnings,
  };
}
// ─────────────────────────────────────────────────────────────────────────────
// Persist to DB — saves stock snapshot and logs the scrape
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Writes a scrape result to the database.
 *
 * A row is always inserted into `crawler_llm_log` (for audit/debug). Rows in
 * `stock_snapshots` are inserted — one per transceiver id, sequentially —
 * only when validation passed and the page was a product page.
 *
 * @param result - Outcome of `scrapeWithLLM`.
 * @param url - Page URL that was scraped.
 * @param vendorId - Vendor the page belongs to.
 * @param transceiverIds - Transceivers that receive a snapshot row each.
 */
export async function persistStockSnapshot(
  result: CrawlerLLMResult,
  url: string,
  vendorId: string,
  transceiverIds: string[]
): Promise<void> {
  const { extraction, validation_passed: passed } = result;

  const logSql = `INSERT INTO crawler_llm_log
(url, vendor_id, is_product_page, extracted_data, confidence, validation_passed,
failure_reason, model_used)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`;
  const failureReason = passed ? null : result.validation_errors.join("; ");
  const logParams = [
    url,
    vendorId,
    extraction.is_product_page,
    JSON.stringify(extraction),
    extraction.confidence,
    passed,
    failureReason,
    OLLAMA_MODEL,
  ];
  // The audit row is written regardless of the validation outcome.
  await pool.query(logSql, logParams);

  const shouldSnapshot = passed && extraction.is_product_page;
  if (!shouldSnapshot) return;

  const snapshotSql = `INSERT INTO stock_snapshots
(transceiver_id, vendor_id, stock_level, stock_quantity, incoming_quantity,
incoming_eta, lead_time_days, moq, price_breaks, source_url, crawler_confidence)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)`;
  // Builds the parameter list for one transceiver; an empty price-break list is stored as NULL.
  const snapshotParams = (transceiverId: string) => [
    transceiverId,
    vendorId,
    extraction.stock_level,
    extraction.stock_quantity,
    extraction.incoming_quantity,
    extraction.incoming_eta,
    extraction.lead_time_days,
    extraction.moq,
    extraction.price_breaks.length > 0 ? JSON.stringify(extraction.price_breaks) : null,
    url,
    extraction.confidence,
  ];
  for (const transceiverId of transceiverIds) {
    await pool.query(snapshotSql, snapshotParams(transceiverId));
  }
}