From 174078efdba1ac13b44968ea5b16f43bba5ff3ad Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Wed, 1 Apr 2026 17:27:55 +0200 Subject: [PATCH] =?UTF-8?q?feat:=20100%=20verified=20data=20=E2=80=94=20no?= =?UTF-8?q?=20invented=20prices,=20part=20numbers,=20or=20designations?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gatherBlogData(): - Fetches real prices from price_observations (last 30 days) per product - Filters transceivers by speed extracted from topic keywords - Enriches every product with verified_prices array + has_verified_price flag - Joins DB products with vector search results (DB first — they have real prices) contextData injection (blog.ts): - [PRODUCT] lines: exact standard_name, form_factor, speed, reach, connector, dBm specs, Watts - [VERIFIED PRICE] lines: real EUR/USD price, vendor, observed date, source URL - [NO VERIFIED PRICE IN DB]: explicit tag — LLM must not invent a number - [NO PRODUCT DATA AVAILABLE]: fallback when DB returns nothing fo-blog-pipeline.ts system prompt: - DATA INTEGRITY RULES block: prices/part numbers/vendors ONLY from context - Never approximate with ~€350 or 'typically $200-600' for specific products - Power specs only from [PRODUCT] data or REFERENCE VALUES STEP4 context instructions: - Explicit rules on how to use [VERIFIED PRICE] vs [NO VERIFIED PRICE] - Invented data = HARD FAIL in QA STEP9 QA — 3 new hard fail checks (30, 31, 32): - Check 30: invented prices → remove or replace with flexoptix.net reference - Check 31: invented part numbers → remove, use class name instead - Check 32: invented vendor names → remove if not in known list --- packages/api/src/llm/fo-blog-pipeline.ts | 31 +++- packages/api/src/routes/blog.ts | 176 ++++++++++++++++++----- 2 files changed, 170 insertions(+), 37 deletions(-) diff --git a/packages/api/src/llm/fo-blog-pipeline.ts b/packages/api/src/llm/fo-blog-pipeline.ts index bcf919a..14e73f3 100644 --- 
a/packages/api/src/llm/fo-blog-pipeline.ts +++ b/packages/api/src/llm/fo-blog-pipeline.ts @@ -91,6 +91,15 @@ BANNED STRUCTURAL PATTERNS: - OEM pricing for compatible optics — "400G DR4 at $2,000-5,000" is OEM pricing. Compatible vendor range (Flexoptix, FS, ProLabs) is typically $200-600. Always specify OEM vs compatible. - Markdown headers (##, ###, ####, **bold headers**) anywhere in the article body — write in plain text. No hash symbols, no asterisk headers. Section titles as plain sentence or not at all. +DATA INTEGRITY RULES (ABSOLUTE — harder than anything else on this list): +- EVERY price, part number, and product designation in the article MUST come from the CONTEXT DATA block below, tagged [VERIFIED PRICE] or [PRODUCT]. +- If a product has [NO VERIFIED PRICE IN DB], you MUST NOT write any price for it. Write "current pricing at flexoptix.net" instead. +- NEVER invent, estimate, or approximate a price. Not "~€350", not "around $400", not "typically $200-600 for compatible". Only real [VERIFIED PRICE] values from the context. +- NEVER invent a part number. If you don't see it in [PRODUCT] lines, don't use it. +- NEVER invent a vendor. Only use vendor names from the [PRODUCT] or [VERIFIED PRICE] lines. +- If the context has no products at all ([NO PRODUCT DATA AVAILABLE]), write the article without any specific product names or prices — use technology class names only (e.g., "400G DR4 optics" not "the Flexoptix FX-400DR4-001"). +- Power specs (dBm, Watts) may ONLY be cited if they appear in the [PRODUCT] data or in the REFERENCE VALUES section below. Never derive or estimate them. + HARD RULES (non-negotiable — article FAILS QA without these): 1. Start with a BRUTAL hook — not "If you're still..." but "You're about to sign a PO. Stop." 2. Include a "WHAT BREAKS IN PRODUCTION" section with at least 2 SPECIFIC failure scenarios: @@ -270,10 +279,18 @@ FORMATTING RULES (both styles — non-negotiable): MINIMUM 2000 words. No placeholders. No TODO markers. 
Complete article. +CONTEXT DATA RULES (read before writing a single word): +The context below contains REAL data from the Flexoptix product database. +- Lines starting with [PRODUCT] = real products. Use their exact names, form factors, speeds. +- Lines starting with [VERIFIED PRICE] = real prices observed in our DB. You MAY cite these. +- Lines with [NO VERIFIED PRICE IN DB] = no price data exists. Write "see flexoptix.net for current pricing" — NEVER invent a number. +- Lines with [NO PRODUCT DATA AVAILABLE] = DB returned nothing. Use technology class names only. +- ANY price, part number, or product name you write that is NOT in the context below is an invention. Inventions are HARD FAILS in QA. + Outline: {{OUTLINE}} -Context data: +Context data from Flexoptix database (verified — use exactly as provided): {{CONTEXT_DATA}}`; // ═══════════════════════════════════════════════════════ @@ -568,6 +585,18 @@ CALIBRATION FAILS (auto-reject — fix before returning): → STRONG: "You're about to spend $400,000 on optics. Here's how to accidentally turn it into a $2M problem." → If the hook lacks a concrete number or consequence, strengthen it. +30. INVENTED PRICES (HARD FAIL): Check EVERY price mentioned in the article. + → Any price that was NOT in the [VERIFIED PRICE] lines of the context data is invented. + → REMOVE invented prices. Replace with "see flexoptix.net for current pricing" or a technology-class range from the REFERENCE VALUES section. + → Exception: general ranges like "$200-600 for compatible 400G DR4" from the system prompt reference values are acceptable ONLY if no verified price exists for a specific product. + → If an exact price like "€312.50" or "$449.99" appears and it was NOT in the context — REMOVE IT. + +31. INVENTED PART NUMBERS (HARD FAIL): Check every part number, SKU, or model number. + → If it was NOT in the [PRODUCT] lines of the context data, it is invented. + → REMOVE invented part numbers. 
Replace with the product class name (e.g., "400G DR4 optic" not "FX-QSFPDD-400G-DR4-001"). + +32. INVENTED VENDOR NAMES: Any vendor cited that was NOT in the context data or in the system prompt reference list (Cisco, Juniper, Arista, Flexoptix, FS.com, ProLabs, InnoLight, Coherent, Lumentum) — REMOVE. + CRITICAL OUTPUT RULE: Return ONLY the fixed article text. NO review commentary. NO numbered issue lists. NO "Critical Review" section. NO "HARD FAIL CHECKS" header. NO markdown review structure. diff --git a/packages/api/src/routes/blog.ts b/packages/api/src/routes/blog.ts index 4ed66d0..6ca43b3 100644 --- a/packages/api/src/routes/blog.ts +++ b/packages/api/src/routes/blog.ts @@ -183,7 +183,10 @@ const BLOG_TEMPLATES: Record = { }; /** Gather data from vector collections for blog content — with PostgreSQL fallback. - * Topic-aware: strategy articles (hype_cycle, comparison) skip troubleshooting data. */ + * Topic-aware: strategy articles (hype_cycle, comparison) skip troubleshooting data. + * + * IMPORTANT: Always enriches products with REAL verified prices from price_observations. + * The LLM may ONLY use prices returned here — never invent pricing. */ async function gatherBlogData(keywords: string[], topic?: string): Promise<{ products: Array>; news: Array>; @@ -195,40 +198,33 @@ async function gatherBlogData(keywords: string[], topic?: string): Promise<{ // Strategy articles should NOT pull troubleshooting data (topic separation) const skipTroubleshooting = topic === "hype_cycle" || topic === "comparison" || topic === "new_product"; - // Try vector search first (requires Qdrant + embeddings) - try { - const [products, news, faq, troubleshooting] = await Promise.all([ - semanticSearch("product_embeddings", query, 10).catch(() => []), - semanticSearch("news_embeddings", query, 5).catch(() => []), - skipTroubleshooting ? Promise.resolve([]) : semanticSearch("faq_embeddings", query, 5).catch(() => []), - skipTroubleshooting ? 
Promise.resolve([]) : semanticSearch("troubleshooting_embeddings", query, 3).catch(() => []), - ]); + // Extract speed hints (e.g. "400G") from keywords for relevance filtering + const speedHints = keywords.join(" ").match(/\b(10|25|40|100|200|400|800|1600)G\b/gi) || []; + const speedGbps = speedHints.map(s => parseInt(s)).filter(Boolean); - const result = { - products: products.map((r) => ({ score: r.score, ...r.payload })), - news: news.map((r) => ({ score: r.score, ...r.payload })), - faq: faq.map((r) => ({ score: r.score, ...r.payload })), - troubleshooting: troubleshooting.map((r) => ({ score: r.score, ...r.payload })), - }; - - // If we got data from vector search, return it - if (result.products.length > 0 || result.news.length > 0) { - return result; - } - } catch { - console.log("Vector search unavailable, falling back to PostgreSQL"); - } - - // Fallback: query PostgreSQL directly for product and news data - const [productsDb, newsDb] = await Promise.all([ - pool.query( - `SELECT t.slug, t.form_factor, t.speed, t.reach_label, t.fiber_type, t.standard_name, - v.name as vendor + // ── Fetch real products with verified prices from DB ────────────────────── + // Primary: filter by keyword-extracted speed; fallback to top products by speed + const productQuery = speedGbps.length > 0 + ? 
`SELECT t.id, t.slug, t.form_factor, t.speed, t.speed_gbps, t.reach_label, + t.fiber_type, t.standard_name, t.connector, t.power_consumption_w, + t.tx_power_min_dbm, t.tx_power_max_dbm, t.rx_sensitivity_dbm, + v.name as vendor, v.type as vendor_type + FROM transceivers t + LEFT JOIN vendors v ON t.vendor_id = v.id + WHERE t.speed_gbps = ANY($1::int[]) + ORDER BY v.type = 'Compatible' DESC, t.speed_gbps DESC + LIMIT 20` + : `SELECT t.id, t.slug, t.form_factor, t.speed, t.speed_gbps, t.reach_label, + t.fiber_type, t.standard_name, t.connector, t.power_consumption_w, + t.tx_power_min_dbm, t.tx_power_max_dbm, t.rx_sensitivity_dbm, + v.name as vendor, v.type as vendor_type FROM transceivers t LEFT JOIN vendors v ON t.vendor_id = v.id ORDER BY t.speed_gbps DESC - LIMIT 15` - ).catch(() => ({ rows: [] })), + LIMIT 20`; + + const [productsDb, newsDb] = await Promise.all([ + pool.query(productQuery, speedGbps.length > 0 ? [speedGbps] : []).catch(() => ({ rows: [] })), pool.query( `SELECT title, source, category, published_at::text as date FROM news_articles @@ -237,8 +233,78 @@ async function gatherBlogData(keywords: string[], topic?: string): Promise<{ ).catch(() => ({ rows: [] })), ]); + // ── Enrich each product with real verified prices ───────────────────────── + const productIds = productsDb.rows.map((r: Record) => r.id).filter(Boolean); + let priceMap: Record> = {}; + + if (productIds.length > 0) { + const priceResult = await pool.query( + `SELECT po.transceiver_id, + v.name as vendor, + v.type as vendor_type, + po.price::float as price, + po.currency, + po.url, + po.time::text as observed_at + FROM price_observations po + JOIN vendors v ON po.source_vendor_id = v.id + WHERE po.transceiver_id = ANY($1::int[]) + AND po.time > NOW() - INTERVAL '30 days' + AND po.price IS NOT NULL + AND po.price > 0 + ORDER BY po.transceiver_id, po.time DESC`, + [productIds] + ).catch(() => ({ rows: [] })); + + // Group by transceiver_id — keep the most recent observation per vendor (rows arrive newest-first) + for (const row of 
priceResult.rows) { + const tid = String(row.transceiver_id); + if (!priceMap[tid]) priceMap[tid] = []; + // Deduplicate by vendor — keep most recent + if (!priceMap[tid].find((p) => p.vendor === row.vendor)) { + priceMap[tid].push({ + vendor: row.vendor, + price: row.price, + currency: row.currency || "EUR", + url: row.url || "", + observed_at: row.observed_at, + }); + } + } + } + + // Attach prices to products + const enrichedProducts = productsDb.rows.map((p: Record) => ({ + ...p, + verified_prices: priceMap[String(p.id)] || [], + has_verified_price: (priceMap[String(p.id)] || []).length > 0, + })); + + // Try vector search to supplement (but always use DB products as base — they have real prices) + try { + const [vectorProducts, news, faq, troubleshooting] = await Promise.all([ + semanticSearch("product_embeddings", query, 10).catch(() => []), + semanticSearch("news_embeddings", query, 5).catch(() => []), + skipTroubleshooting ? Promise.resolve([]) : semanticSearch("faq_embeddings", query, 5).catch(() => []), + skipTroubleshooting ? Promise.resolve([]) : semanticSearch("troubleshooting_embeddings", query, 3).catch(() => []), + ]); + + return { + // DB products first (they have real prices) — vector results supplemental only + products: [ + ...enrichedProducts, + ...vectorProducts.map((r) => ({ score: r.score, ...r.payload })), + ].slice(0, 20), + news: news.length > 0 ? 
news.map((r) => ({ score: r.score, ...r.payload })) : newsDb.rows, + faq: faq.map((r) => ({ score: r.score, ...r.payload })), + troubleshooting: troubleshooting.map((r) => ({ score: r.score, ...r.payload })), + }; + } catch { + // Vector search unavailable — use PostgreSQL only + } + return { - products: productsDb.rows, + products: enrichedProducts, news: newsDb.rows, faq: [], troubleshooting: [], @@ -968,10 +1034,48 @@ async function runLlmPipeline( // Warmup await generate("Test", "OK", { temperature: 0.1, maxTokens: 8, timeoutMs: 60000 }).catch(() => {}); - // Build context data string for injection - const contextData = data.products.slice(0, 15).map(p => - `${p.standard_name || p.slug}: ${p.form_factor} ${p.speed}, reach ${p.reach_label || "N/A"}, fiber ${p.fiber_type || "N/A"}, vendor ${p.vendor || "N/A"}${p.price ? `, ~€${p.price}` : ""}` - ).join("\n"); + // Build context data string for injection — REAL DB data only, never fabricated + type PriceEntry = { vendor: string; price: number; currency: string; url: string; observed_at: string }; + const contextLines: string[] = []; + + for (const p of data.products.slice(0, 20)) { + const prices = (p.verified_prices as PriceEntry[] | undefined) || []; + const hasPrice = prices.length > 0; + + // Build product line with real specs + let line = `[PRODUCT] ${p.standard_name || p.slug || "unknown"}`; + if (p.form_factor) line += ` | Form factor: ${p.form_factor}`; + if (p.speed) line += ` | Speed: ${p.speed}`; + if (p.reach_label) line += ` | Reach: ${p.reach_label}`; + if (p.fiber_type) line += ` | Fiber: ${p.fiber_type}`; + if (p.connector) line += ` | Connector: ${p.connector}`; + if (p.vendor) line += ` | Vendor: ${p.vendor}`; + if (p.vendor_type) line += ` (${p.vendor_type})`; + + // Optical specs if available + if (p.tx_power_min_dbm != null) line += ` | TX min: ${p.tx_power_min_dbm} dBm`; + if (p.tx_power_max_dbm != null) line += ` TX max: ${p.tx_power_max_dbm} dBm`; + if (p.rx_sensitivity_dbm != null) line 
+= ` | RX sensitivity: ${p.rx_sensitivity_dbm} dBm`; + if (p.power_consumption_w != null) line += ` | Power: ${p.power_consumption_w}W`; + + contextLines.push(line); + + // Append verified prices — clearly tagged as real DB observations + if (hasPrice) { + for (const pr of prices.slice(0, 3)) { + const date = pr.observed_at ? pr.observed_at.split("T")[0] : "recent"; + contextLines.push( + ` [VERIFIED PRICE] ${pr.currency} ${pr.price.toFixed(2)} — ${pr.vendor} (observed ${date}) ${pr.url ? `| ${pr.url}` : ""}` + ); + } + } else { + contextLines.push(` [NO VERIFIED PRICE IN DB — do NOT invent a price for this product]`); + } + } + + const contextData = contextLines.length > 0 + ? contextLines.join("\n") + : "[NO PRODUCT DATA AVAILABLE — do NOT invent product names, part numbers, or prices]"; // Get blog type config const blogType = BLOG_TYPES[selectedTopic as keyof typeof BLOG_TYPES] || BLOG_TYPES.tutorial;