From f616e0ebbefcf6ecd72d5a045bc002a467792dc2 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sat, 4 Apr 2026 07:50:01 +0200 Subject: [PATCH] feat: blog engine v4 (reduction+style-lock passes) + flexoptix scraper fixes Blog engine (fo-blog-pipeline.ts): - Add STEP8b_REDUCTION: cuts article 25-35%, removes repeated concepts - Add STEP8c_STYLE_LOCK: enforces tone consistency, fixes scope/OPM confusion, removes inline SKUs from article flow - Add Gold Standard 3 to calibration (Style B troubleshooting example 2026-04-04) - Pipeline now 12 steps (was 10), version bumped to v4-reduction-stylelock blog.ts: - Wire STEP8b and STEP8c into pipeline between Kill-AI-Tone and QA Check - Update progress tracking to 12 total steps - Update pipeline_version to 'v4-reduction-stylelock' flexoptix-catalog.ts: - Fix contentHash call: pass object directly, not JSON.stringify(object) - Add AEC catalog search (default form factor OSFP, 800G) - Enrich products already collected by the Phase 1 HTML scraper with GraphQL price, image and part number instead of skipping them db.ts: - price_verified=true set in content_hash early-return path (no new observation) - image_verified=true auto-set in findOrCreateScrapedTransceiver on INSERT/UPDATE (not included in this patch: the db.ts diff below only adds the price_verified update) --- packages/api/src/llm/fo-blog-pipeline.ts | 134 ++++++++++++++++++ packages/api/src/routes/blog.ts | 46 ++++-- .../scraper/src/scrapers/flexoptix-catalog.ts | 16 ++- packages/scraper/src/utils/db.ts | 5 + 4 files changed, 186 insertions(+), 15 deletions(-) diff --git a/packages/api/src/llm/fo-blog-pipeline.ts b/packages/api/src/llm/fo-blog-pipeline.ts index 640ef1d..1437c1f 100644 --- a/packages/api/src/llm/fo-blog-pipeline.ts +++ b/packages/api/src/llm/fo-blog-pipeline.ts @@ -955,6 +955,62 @@ KEY ELEMENTS OF THIS SECOND STYLE B EXAMPLE: - Ending reframes the whole topic without telling reader what to do - No bullet lists, no section headers, no numbered points +━━━ STYLE B GOLD EXAMPLE 3 (2026-04-04 validated — Troubleshooting 400G/800G) ━━━ +Topic: Troubleshooting high-density optics. NO sections, pure flow, "why things look fine until they don't". +Note: This example was rated 10/10 for STYLE. 
Use as reference for troubleshooting tutorial articles. + + "You're about to roll out a new batch of 400G optics. + + Quote is approved, hardware is in, lab tests looked clean. Everything points to a smooth deployment. + + That's usually the moment where things start getting interesting. + + Because 400G doesn't fail the way people expect. It doesn't just go down. It sort of works — and that's what makes it painful. + + Most teams come from 10G, 40G, maybe 100G. At those speeds, you can get away with a lot. Cabling doesn't have to be perfect. Connectors don't have to be spotless. Margins are forgiving. + + At 400G, that changes. + + Not dramatically. Just enough to expose everything that wasn't quite right before. + + So the first time you see it is usually not a hard failure. It's something subtle. + + A link comes up, but error counters start creeping. + Another one stays up, but behaves differently under load. + A third one just refuses to come up, even though everything looks correct. + + You start where everyone starts. Check config. Swap optics. Move ports. Nothing obvious fixes it. + + Eventually, someone looks at the physical layer properly. Not "looks clean". Actually checks it. + + And that's where the story usually turns. + + A slightly dirty MPO connector. A marginal patch panel. A link that technically fits within spec, but only just. + + At 100G, that would have passed unnoticed. At 400G, it doesn't. + + Polarity is the next one. It's one of those things people assume is correct because it always has been. Until it isn't. + + At 400G, one wrong assumption in your MPO layout is enough to keep a link completely down while everything else checks out. Optics are detected. Light levels look fine. Config is clean. Still no link. + + So you lose time looking at layers that aren't the problem, until someone traces the fiber path end-to-end and finds the mismatch. + + That's not an edge case. That's a standard failure mode. 
+ + [continues — breakouts, power, cost of wasted time — all in prose, no headers] + + 400G doesn't usually fail loudly. It fails quietly, inconsistently, and just enough to slow you down." + +KEY ELEMENTS OF THIS STYLE B EXAMPLE 3: + - Opens with a situation the reader recognizes: "lab tests looked clean" + - Error described as behavior, not scenario: "sort of works" not "#### Scenario: Link Flapping" + - Physical layer investigation described as a process, not a procedure + - Polarity: one sentence on the problem, one sentence on how you find it — no header, no bullet + - Measurement: "inspect the end-face" — no "verify <0.5 dB with a scope" (scope is visual only) + - Power mentioned as real-world consequence ("adds up quickly") not a section + - Ending: the cost is lost time, stated simply and directly + - ZERO section headers, ZERO bullet lists, ZERO numbered steps + WRONG PATTERNS (both styles — never produce): ❌ "Thoroughly Test Your PoE Budget:" (PoE = wrong context, checklist = wrong format) ❌ "QSFP-DD DR4 (Direct Attach)" (DR4 ≠ Direct Attach — DAC is Direct Attach Copper) @@ -1009,6 +1065,84 @@ POWER / LOSS BUDGET PRECISION (always apply): --- END GOLD STANDARD --- `; +// ═══════════════════════════════════════════════════════ +// STEP 8b: REDUCTION PASS — Remove 25-35% of content +// (2026-04-04: Added based on field feedback — articles were too long, +// repeated concepts, and "assembled" rather than written) +// ═══════════════════════════════════════════════════════ + +export const STEP8b_REDUCTION = `Cut this article by 25–35%. + +This is not optional. After the previous passes, the article has grown too long and repeats itself. +The goal is a tighter, more natural text — not a shorter version of the same article. 
+ +WHAT TO REMOVE: +- Any concept explained more than once (pick its best version, cut the rest) +- Sentences that restate what the previous sentence already said +- Paragraphs that add length without adding new information or new angle +- "Setting up" sentences that don't earn their space ("This is something engineers often overlook...") +- Transition sentences that bridge to the same point you already made +- The weakest scenario or example if there are more than three +- Any section that reads like a template ("Hidden Costs:", "When Not To Use:", etc.) — either integrate into narrative or cut + +WHAT TO KEEP: +- The single strongest version of each key insight +- Real-world moments that feel like something that actually happened +- Specific numbers, values, and examples — these carry weight +- Any line that a senior engineer would share or quote + +TONE RULE: After cutting, the article should feel tighter and MORE confident — not less. Shorter = stronger. + +DO NOT change the writing style or tone. Do not add new content. Do not add section headers. +Return only the reduced article — no commentary, no explanation of what you cut. + +Article: +{{ARTICLE}}`; + +// ═══════════════════════════════════════════════════════ +// STEP 8c: STYLE LOCK — Ensure tone consistency throughout +// (2026-04-04: Added based on field feedback — tone switched between +// engineer voice and consulting/formal language mid-article) +// ═══════════════════════════════════════════════════════ + +export const STEP8c_STYLE_LOCK = `Check this article for tone inconsistency and fix it. + +THE PROBLEM: The article starts with an engineer voice, then drifts into formal or consulting language mid-way. +This breaks the reader's trust. Once they notice the shift, the whole article feels fake. 
+ +SCAN FOR THESE TONE KILLERS: +- Paragraphs starting with "It is" or "This is" in a formal way after conversational sections +- Sentences using "typically", "often", "generally" where earlier sections used direct assertions +- Academic framing: "The challenge is often framed as...", "In practice, this tends to..." +- Corporate softening: "it is worth considering", "may be beneficial", "could potentially" +- Neutral advice after opinionated sections: "evaluate based on your requirements" +- Sudden textbook explanations in the middle of field narrative +- Passive voice appearing in an otherwise active-voice article + +HOW TO FIX: +- Match the tone of the FIRST paragraph throughout — if the opening is direct and specific, the rest must be too +- Convert passive voice to active: "links were found to be unstable" → "links went unstable" +- Convert hedging to assertion: "this may cause issues" → "this causes issues" +- Convert formal to conversational: "the operator is responsible for validation" → "you own the validation" +- If a section genuinely can't match the opening tone because the content is different — that section doesn't belong in this article. Cut it to one sentence or remove it. + +SCOPE vs OPM (measurement accuracy check — one of the most common tone violations): +- Any sentence where a scope is said to MEASURE loss or dB values: fix it. + WRONG: "verify <0.5 dB insertion loss with a scope" (scope is visual, not a loss meter) + CORRECT: "inspect with a scope for contamination; use an OPM or OTDR to measure actual insertion loss" +- This is a TECHNICAL accuracy fix, not just a tone fix. Getting this wrong destroys credibility with optical engineers. + +NO SKU RULE (fix if present): +- Remove any product SKU or model number that appears inline in the narrative text + (SKUs like "FX-400DR4-001", "QSFP-DD-400-DR4-001", etc. 
belong in product tables, not article flow) +- Replace with the technology class name: "400G DR4 optic" or "QSFP-DD DR4" +- Exception: if a specific product is cited from [VERIFIED PRICE] context data and is contextually necessary + +Return only the fixed article. No commentary. + +Article: +{{ARTICLE}}`; + /** * Injects the calibration gold standard into the system prompt. * Use sparingly — only when available Ollama context allows. diff --git a/packages/api/src/routes/blog.ts b/packages/api/src/routes/blog.ts index 6b0b68a..f099a3e 100644 --- a/packages/api/src/routes/blog.ts +++ b/packages/api/src/routes/blog.ts @@ -16,8 +16,8 @@ import { pool } from "../db/client"; const pipelineProgress = new Map(); function setProgress(draftId: string, step: number, label: string): void { - const pct = Math.round((step / 10) * 92) + 2; // 2%..94% during run, 100% on complete - pipelineProgress.set(draftId, { step, total: 10, label, pct }); + const pct = Math.round((step / 12) * 92) + 2; // 2%..94% during run, 100% on complete + pipelineProgress.set(draftId, { step, total: 12, label, pct }); } function clearProgress(draftId: string): void { @@ -1001,6 +1001,8 @@ async function runLlmPipeline( STEP6_TECHNICAL_DEEPENING, STEP7_OPINION_LAYER, STEP8_KILL_AI_TONE, + STEP8b_REDUCTION, + STEP8c_STYLE_LOCK, STEP9_QA_CHECK, STEP10_QUALITY_SCORE, BLOG_TYPES, @@ -1010,6 +1012,7 @@ async function runLlmPipeline( const LLM_OPTS = { temperature: 0.7, maxTokens: 6144, timeoutMs: 480000 }; const LLM_REFINE = { temperature: 0.4, maxTokens: 6144, timeoutMs: 480000 }; + const TOTAL_STEPS = 12; // 10 original + 8b Reduction + 8c Style Lock let stepsCompleted = 0; try { @@ -1158,18 +1161,37 @@ async function runLlmPipeline( ); stepsCompleted = 8; - // ═══ STEP 9: QA Check ═══ - console.log(" Step 9/10: QA Check..."); - setProgress(draftId, 9, "Step 9/10: QA Check"); - const step9 = await generate(systemPrompt, - STEP9_QA_CHECK.replace("{{ARTICLE}}", step8.text), + // ═══ STEP 8b: Reduction Pass 
═══ + console.log(" Step 9/12: Reduction Pass (remove 25-35%)..."); + setProgress(draftId, 9, "Step 9/12: Reduction Pass"); + const step8b = await generate(systemPrompt, + STEP8b_REDUCTION.replace("{{ARTICLE}}", step8.text), LLM_REFINE ); stepsCompleted = 9; + console.log(` After reduction: ${step8b.text.split(/\s+/).length} words (was ${step8.text.split(/\s+/).length})`); + + // ═══ STEP 8c: Style Lock ═══ + console.log(" Step 10/12: Style Lock (tone consistency + scope/SKU fixes)..."); + setProgress(draftId, 10, "Step 10/12: Style Lock"); + const step8c = await generate(systemPrompt, + STEP8c_STYLE_LOCK.replace("{{ARTICLE}}", step8b.text), + LLM_REFINE + ); + stepsCompleted = 10; + + // ═══ STEP 9: QA Check ═══ + console.log(" Step 11/12: QA Check..."); + setProgress(draftId, 11, "Step 11/12: QA Check"); + const step9 = await generate(systemPrompt, + STEP9_QA_CHECK.replace("{{ARTICLE}}", step8c.text), + LLM_REFINE + ); + stepsCompleted = 11; // ═══ STEP 10: Quality Score ═══ - console.log(" Step 10/10: Quality Score..."); - setProgress(draftId, 10, "Step 10/10: Quality Score"); + console.log(" Step 12/12: Quality Score..."); + setProgress(draftId, 12, "Step 12/12: Quality Score"); let autoQaScore: Record | null = null; try { const step10 = await generate(systemPrompt, @@ -1185,7 +1207,7 @@ async function runLlmPipeline( } catch { console.log(" Quality scoring skipped (parse error)"); } - stepsCompleted = 10; + stepsCompleted = 12; // Extract only the article from STEP9 output (QA returns review + fixed article) // Look for "COMPLETE FIXED ARTICLE" marker and take everything after it @@ -1222,8 +1244,8 @@ async function runLlmPipeline( await pool.query( `UPDATE blog_drafts SET draft_content = $1, word_count = $2, - generated_by = 'fo-blog-engine-v3', - pipeline_version = 'v3-flexoptix-style', + generated_by = 'fo-blog-engine-v4', + pipeline_version = 'v4-reduction-stylelock', pipeline_steps_completed = $3, auto_qa_score = $4, outline = $5, diff --git 
a/packages/scraper/src/scrapers/flexoptix-catalog.ts b/packages/scraper/src/scrapers/flexoptix-catalog.ts index 68336d5..20a7339 100644 --- a/packages/scraper/src/scrapers/flexoptix-catalog.ts +++ b/packages/scraper/src/scrapers/flexoptix-catalog.ts @@ -414,6 +414,7 @@ export async function scrapeFlexoptixCatalog(): Promise { { search: "SFP56", defaultFF: "SFP56", defaultGbps: 50 }, { search: "DAC", defaultFF: "SFP+", defaultGbps: 10 }, { search: "AOC", defaultFF: "SFP+", defaultGbps: 10 }, + { search: "AEC", defaultFF: "OSFP", defaultGbps: 800 }, { search: "breakout", defaultFF: "QSFP28", defaultGbps: 100 }, { search: "BiDi", defaultFF: "SFP", defaultGbps: 1 }, { search: "CWDM", defaultFF: "SFP", defaultGbps: 1 }, @@ -488,12 +489,12 @@ export async function scrapeFlexoptixCatalog(): Promise { || lower.includes("adapter") || lower.includes("attenuator") || lower.includes("coupler")) continue; const url = `${BASE}/en/${item.url_key}.html`; - if (allProducts.has(url)) continue; const formFactor = inferFormFactor(item.name, gq.defaultFF); const gbps = inferSpeed(item.name, gq.defaultGbps); const reach = detectReach(item.name); const price = item.price_range?.minimum_price?.final_price?.value; + const validPrice = price && price > 0 && price < 100000 ? price : undefined; const rawImg = item.small_image?.url; const imageUrl = rawImg && !rawImg.includes("placeholder") ? rawImg : undefined; @@ -502,11 +503,20 @@ export async function scrapeFlexoptixCatalog(): Promise { // The base SKU (before ":") is the canonical FLEXOPTIX part number const baseSku = item.sku.includes(":") ? 
item.sku.split(":")[0] : item.sku; + // If URL already in map (added by Phase 1 HTML scraper), enrich with GraphQL price/image + if (allProducts.has(url)) { + const existing = allProducts.get(url)!; + if (!existing.price && validPrice) existing.price = validPrice; + if (!existing.imageUrl && imageUrl) existing.imageUrl = imageUrl; + if (!existing.partNumber || existing.partNumber.length < baseSku.length) existing.partNumber = baseSku; + continue; + } + allProducts.set(url, { name: item.name, partNumber: baseSku, url, - price: price && price > 0 && price < 100000 ? price : undefined, + price: validPrice, currency: item.price_range?.minimum_price?.final_price?.currency || "EUR", formFactor, speed: speedLabel(gbps), @@ -557,7 +567,7 @@ export async function scrapeFlexoptixCatalog(): Promise { }); if (product.price && product.price > 0) { - const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber })); + const hash = contentHash({ price: product.price, part: product.partNumber }); const updated = await upsertPriceObservation({ transceiverId: txId, sourceVendorId: vendorId, diff --git a/packages/scraper/src/utils/db.ts b/packages/scraper/src/utils/db.ts index 8db97e7..4c10323 100644 --- a/packages/scraper/src/utils/db.ts +++ b/packages/scraper/src/utils/db.ts @@ -38,6 +38,11 @@ export async function upsertPriceObservation(params: { ); if (existing.rows.length > 0 && existing.rows[0].content_hash === params.contentHash) { + // Price unchanged — but still ensure price_verified is set (in case it wasn't before) + await pool.query( + `UPDATE transceivers SET price_verified = true WHERE id = $1 AND (price_verified IS NULL OR price_verified = false)`, + [params.transceiverId] + ); return false; // No change }