From 67310c8fe774f82771d5ba10a6e50d664b80dcd6 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Thu, 14 May 2026 12:29:17 +0200 Subject: [PATCH] fix(blog): SPA-aware URL blog generation + dynamic generated_by - fetchUrlContent() now extracts OG/meta tags (og:title, og:description, name="description", og:site_name) as fallback content for JS-rendered SPAs - Returns spaDetected=true when body text < 300 chars after stripping scripts - from-url endpoint skips gatherBlogData() product injection when SPA detected, preventing fo-blog-v10 from defaulting to optical networking domain - additionalContext now includes SPA warning instructing LLM not to default to optical transceiver topics unless the page is actually about that - generated_by in pipeline UPDATE query now uses active model name instead of hardcoded 'fo-blog-engine-v7' (reads getLlmProvider().ollamaModel) - Dashboard shows SPA warning toast when spa_detected=true in response - Response now includes spa_detected field for client awareness --- packages/api/src/routes/blog.ts | 110 ++++++++++++++++++++++++++------ packages/dashboard/index.html | 11 +++- 2 files changed, 98 insertions(+), 23 deletions(-) diff --git a/packages/api/src/routes/blog.ts b/packages/api/src/routes/blog.ts index db5843e..8f50625 100644 --- a/packages/api/src/routes/blog.ts +++ b/packages/api/src/routes/blog.ts @@ -1347,10 +1347,12 @@ async function runLlmPipeline( const finalIssues = validateArticle(draftContent); // Update the draft in DB (title updated to generated headline if available) + const pipelineModel = getLlmProvider(); + const pipelineGeneratedBy = `fo-blog-engine-${pipelineModel.ollamaModel || pipelineModel.provider || "llm"}`; await pool.query( `UPDATE blog_drafts SET title = $9, draft_content = $1, word_count = $2, - generated_by = 'fo-blog-engine-v7', + generated_by = $10, pipeline_version = 'v7', pipeline_steps_completed = $3, auto_qa_score = $4, @@ -1377,6 +1379,7 @@ async function runLlmPipeline( linkedinCharCount, 
draftId, finalTitle, + pipelineGeneratedBy, ], ); @@ -1528,8 +1531,17 @@ blogRouter.post("/generate", async (req: Request, res: Response) => { } }); -/** Fetch a URL and extract readable text content for use as LLM context. */ -async function fetchUrlContent(rawUrl: string): Promise<{ pageTitle: string; text: string }> { +/** Fetch a URL and extract readable text content for use as LLM context. + * Returns spaDetected=true when extracted body text is thin (< 300 chars), + * indicating a JavaScript Single Page Application where content is rendered client-side. + * In that case, metaDesc contains OG/meta description fallback text. + */ +async function fetchUrlContent(rawUrl: string): Promise<{ + pageTitle: string; + text: string; + spaDetected: boolean; + metaDesc: string; +}> { const response = await fetch(rawUrl, { headers: { "User-Agent": "TIPBot/1.0 blog-from-url (+https://tip.flexoptix.net)" }, signal: AbortSignal.timeout(20000), @@ -1545,12 +1557,33 @@ async function fetchUrlContent(rawUrl: string): Promise<{ pageTitle: string; tex const html = await response.text(); - // Extract page title + // --- Extract OG / meta tags for SPA fallback --- + const decodeEntities = (s: string) => + s.replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">") + .replace(/&#\d+;/g, "").replace(/&[a-z]+;/gi, " ").trim(); + + const ogTitle = + html.match(/<meta[^>]+property=["']og:title["'][^>]+content=["']([^"']{1,200})["']/i)?.[1] || + html.match(/<meta[^>]+content=["']([^"']{1,200})["'][^>]+property=["']og:title["']/i)?.[1] || ""; + + const ogDesc = + html.match(/<meta[^>]+property=["']og:description["'][^>]+content=["']([^"']{1,500})["']/i)?.[1] || + html.match(/<meta[^>]+content=["']([^"']{1,500})["'][^>]+property=["']og:description["']/i)?.[1] || + html.match(/<meta[^>]+name=["']description["'][^>]+content=["']([^"']{1,500})["']/i)?.[1] || + html.match(/<meta[^>]+content=["']([^"']{1,500})["'][^>]+name=["']description["']/i)?.[1] || ""; + + const ogSiteName = + 
html.match(/<meta[^>]+property=["']og:site_name["'][^>]+content=["']([^"']{1,100})["']/i)?.[1] || + html.match(/<meta[^>]+content=["']([^"']{1,100})["'][^>]+property=["']og:site_name["']/i)?.[1] || ""; + + // Extract page title: prefer OG title, then <title>, then <h1> const titleMatch = html.match(/<title[^>]*>([^<]{1,200})<\/title>/i); const h1Match = html.match(/<h1[^>]*>([^<]{1,150})<\/h1>/i); - const pageTitle = (titleMatch?.[1] || h1Match?.[1] || "") - .replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">") - .replace(/&#\d+;/g, "").replace(/&[a-z]+;/gi, " ").trim(); + const pageTitle = decodeEntities( + ogTitle || titleMatch?.[1] || h1Match?.[1] || "" + ); + + const metaDesc = decodeEntities(ogDesc); // Strip scripts, styles, SVG, navigation boilerplate let text = html @@ -1580,7 +1613,19 @@ async function fetchUrlContent(rawUrl: string): Promise<{ pageTitle: string; tex text = text.slice(0, 5000) + "\n[… content truncated for LLM context …]"; } - return { pageTitle, text }; + // Detect SPA: very little body text means JS renders the real content + const spaDetected = text.length < 300; + + // When SPA detected, enrich text with what we could extract from meta tags + if (spaDetected && (metaDesc || ogSiteName)) { + const parts: string[] = []; + if (ogSiteName) parts.push(`Site: ${ogSiteName}`); + if (pageTitle) parts.push(`Title: ${pageTitle}`); + if (metaDesc) parts.push(`Description: ${metaDesc}`); + text = parts.join("\n"); + } + + return { pageTitle, text, spaDetected, metaDesc }; } // POST /api/blog/from-url — Fetch URL, extract content, generate a blog from it @@ -1611,33 +1656,55 @@ blogRouter.post("/from-url", async (req: Request, res: Response) => { try { // Fetch page content server-side (no CORS issues) - const { pageTitle, text: extractedText } = await fetchUrlContent(url); + const { pageTitle, text: extractedText, spaDetected, metaDesc } = await fetchUrlContent(url); - console.log(`Blog from-url: fetched "${pageTitle}" from ${parsedUrl.hostname} 
(${extractedText.length} chars)`); + console.log( + `Blog from-url: fetched "${pageTitle}" from ${parsedUrl.hostname} ` + + `(${extractedText.length} chars${spaDetected ? ", SPA detected" : ""})` + ); + + // Build a rich additional_context from the URL content. + // When a SPA is detected (JS-rendered), body text is a shell — we rely on meta tags instead. + const spaWarning = spaDetected + ? `\nNOTE: This URL is a JavaScript Single Page Application. Only meta/OG data was available ` + + `server-side — the LLM should infer topic from the site name, title, and description above. ` + + `Do NOT default to optical networking topics unless the page is actually about that.` + : ""; - // Build a rich additional_context from the URL content const additionalContext = `SOURCE URL: ${url}\n` + `PAGE TITLE: ${pageTitle}\n` + `HOSTNAME: ${parsedUrl.hostname}\n` + + (metaDesc ? `META DESCRIPTION: ${metaDesc}\n` : "") + `\n--- EXTRACTED PAGE CONTENT ---\n` + - `${extractedText}\n` + + `${extractedText || "(No body text extractable — JavaScript-rendered SPA)"}\n` + `--- END PAGE CONTENT ---\n` + - `\nIMPORTANT: Use this content as factual background and editorial direction. ` + + spaWarning + + `\n\nIMPORTANT: Use this content as factual background and editorial direction. ` + + `The blog MUST be about the topic described above, NOT about optical transceivers or fiber unless explicitly relevant. ` + `Do NOT copy sentences verbatim. Write a Flexoptix-voice blog article using these facts and insights.`; const title = pageTitle || parsedUrl.hostname; const template = templates[Math.floor(Math.random() * templates.length)]; - const keywords = [ - ...template.seo_keywords, - "optical transceiver", "networking", - ].filter(Boolean); - const data = await gatherBlogData(keywords, selectedTopic); + // When SPA detected, skip optical transceiver product injection — it pollutes the LLM context + // with irrelevant product data and causes the model to default to its fine-tuning domain. 
+ // Use empty data so the pipeline focuses purely on the URL context provided above. + const keywords = spaDetected + ? [parsedUrl.hostname.replace(/^www\./, ""), pageTitle].filter(Boolean) + : [...template.seo_keywords, "optical transceiver", "networking"].filter(Boolean); + + const data = spaDetected + ? { products: [] as any[], news: [] as any[], faq: [] as any[], troubleshooting: [] as any[] } + : await gatherBlogData(keywords, selectedTopic); + const draftContent = generateTemplateDraft(title, selectedTopic, data); const wordCount = draftContent.split(/\s+/).length; const initialIssues = validateArticle(draftContent); + const activeModel = getLlmProvider(); + const generatedBy = `tip-blog-from-url-${activeModel.ollamaModel || activeModel.provider || "llm"}`; + const result = await pool.query( `INSERT INTO blog_drafts (title, topic, target_audience, outline, draft_content, data_sources, status, generated_by, word_count, seo_keywords) VALUES ($1, $2, $3, $4, $5, $6, 'draft', $7, $8, $9) @@ -1646,10 +1713,10 @@ blogRouter.post("/from-url", async (req: Request, res: Response) => { title, selectedTopic, template.target_audience, - JSON.stringify({ generation_method: "from-url", source_url: url, quality_issues: initialIssues }), + JSON.stringify({ generation_method: "from-url", source_url: url, spa_detected: spaDetected, quality_issues: initialIssues }), draftContent, - JSON.stringify({ source_url: url, extracted_chars: extractedText.length, products: data.products.length, news: data.news.length }), - "tip-blog-engine-url", + JSON.stringify({ source_url: url, extracted_chars: extractedText.length, spa_detected: spaDetected, products: data.products.length, news: data.news.length }), + generatedBy, wordCount, template.seo_keywords, ], @@ -1672,6 +1739,7 @@ blogRouter.post("/from-url", async (req: Request, res: Response) => { source_url: url, page_title: pageTitle, extracted_chars: extractedText.length, + spa_detected: spaDetected, draft: { id: draftId, title, diff 
--git a/packages/dashboard/index.html b/packages/dashboard/index.html index 2a4763b..d532fb8 100644 --- a/packages/dashboard/index.html +++ b/packages/dashboard/index.html @@ -5561,8 +5561,15 @@ function generateBlogFromUrl() { btn.disabled = false; btn.textContent = '🔗 Aus URL generieren'; if (data.success) { - status.textContent = '✓ ' + data.extracted_chars + ' Zeichen extrahiert — Pipeline läuft (~10 min)'; - showToast('✓ Pipeline gestartet', (data.page_title || url) + ' — wird in ~10 min fertig'); + var spaNote = data.spa_detected + ? ' ⚠️ SPA erkannt — Inhalt nur via Meta-Tags (JS-gerendert)' + : ' ✓ ' + data.extracted_chars + ' Zeichen extrahiert'; + status.textContent = spaNote + ' — Pipeline läuft (~10 min)'; + if (data.spa_detected) { + showToast('⚠️ SPA erkannt', (data.page_title || url) + ' — JavaScript-Seite, Inhalt via Meta-Tags. Pipeline läuft.'); + } else { + showToast('✓ Pipeline gestartet', (data.page_title || url) + ' — wird in ~10 min fertig'); + } document.getElementById('blog-from-url-input').value = ''; loadBlogDrafts(); pollBlogLlm(data.draft.id, 0);