From 67310c8fe774f82771d5ba10a6e50d664b80dcd6 Mon Sep 17 00:00:00 2001
From: Rene Fichtmueller <renefichtmueller@MacStudio-von-Rene-8.local>
Date: Thu, 14 May 2026 12:29:17 +0200
Subject: [PATCH] fix(blog): SPA-aware URL blog generation + dynamic
 generated_by

- fetchUrlContent() now extracts OG/meta tags (og:title, og:description,
  name="description", og:site_name) as fallback content for JS-rendered SPAs
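- Returns spaDetected=true when body text < 300 chars after stripping scripts,
  styles and boilerplate; in that case the returned text is rebuilt from the
  site name, title and description meta tags (when any are present)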
- from-url endpoint skips gatherBlogData() product injection when SPA detected,
  preventing fo-blog-v10 from defaulting to optical networking domain
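- additionalContext now includes SPA warning instructing LLM not to default
  to optical networking topics unless the page is actually about that; the
  closing IMPORTANT instruction (sent for every from-url request, SPA or not)
  also tells the model to stay on the page's topic rather than optical
  transceivers or fiber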
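- generated_by in pipeline UPDATE query is now 'fo-blog-engine-<model>' instead
  of hardcoded 'fo-blog-engine-v7' (model = getLlmProvider().ollamaModel,
  falling back to .provider, then "llm"); the from-url INSERT likewise writes
  'tip-blog-from-url-<model>' instead of 'tip-blog-engine-url'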
- Dashboard shows SPA warning toast when spa_detected=true in response
- Response now includes spa_detected field for client awareness
---
 packages/api/src/routes/blog.ts | 110 ++++++++++++++++++++++++++------
 packages/dashboard/index.html   |  11 +++-
 2 files changed, 98 insertions(+), 23 deletions(-)

diff --git a/packages/api/src/routes/blog.ts b/packages/api/src/routes/blog.ts
index db5843e..8f50625 100644
--- a/packages/api/src/routes/blog.ts
+++ b/packages/api/src/routes/blog.ts
@@ -1347,10 +1347,12 @@ async function runLlmPipeline(
     const finalIssues = validateArticle(draftContent);
 
     // Update the draft in DB (title updated to generated headline if available)
+    const pipelineModel = getLlmProvider();
+    const pipelineGeneratedBy = `fo-blog-engine-${pipelineModel.ollamaModel || pipelineModel.provider || "llm"}`;
     await pool.query(
       `UPDATE blog_drafts
        SET title = $9, draft_content = $1, word_count = $2,
-           generated_by = 'fo-blog-engine-v7',
+           generated_by = $10,
            pipeline_version = 'v7',
            pipeline_steps_completed = $3,
            auto_qa_score = $4,
@@ -1377,6 +1379,7 @@ async function runLlmPipeline(
         linkedinCharCount,
         draftId,
         finalTitle,
+        pipelineGeneratedBy,
       ],
     );
 
@@ -1528,8 +1531,17 @@ blogRouter.post("/generate", async (req: Request, res: Response) => {
   }
 });
 
-/** Fetch a URL and extract readable text content for use as LLM context. */
-async function fetchUrlContent(rawUrl: string): Promise<{ pageTitle: string; text: string }> {
+/** Fetch a URL and extract readable text content for use as LLM context.
+ *  Returns spaDetected=true when extracted body text is thin (< 300 chars),
+ *  indicating a JavaScript Single Page Application where content is rendered client-side.
+ *  metaDesc is always the decoded OG/meta description ("" if none); on SPA pages it also feeds the rebuilt text.
+ */
+async function fetchUrlContent(rawUrl: string): Promise<{
+  pageTitle: string;
+  text: string;
+  spaDetected: boolean;
+  metaDesc: string;
+}> {
   const response = await fetch(rawUrl, {
     headers: { "User-Agent": "TIPBot/1.0 blog-from-url (+https://tip.flexoptix.net)" },
     signal: AbortSignal.timeout(20000),
@@ -1545,12 +1557,33 @@ async function fetchUrlContent(rawUrl: string): Promise<{ pageTitle: string; tex
 
   const html = await response.text();
 
-  // Extract page title
+  // --- Extract OG / meta tags for SPA fallback ---
+  const decodeEntities = (s: string) =>
+    s.replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">")
+     .replace(/&#\d+;/g, "").replace(/&[a-z]+;/gi, " ").trim();
+
+  const ogTitle =
+    html.match(/<meta[^>]+property=["']og:title["'][^>]+content=["']([^"']{1,200})["']/i)?.[1] ||
+    html.match(/<meta[^>]+content=["']([^"']{1,200})["'][^>]+property=["']og:title["']/i)?.[1] || "";
+
+  const ogDesc =
+    html.match(/<meta[^>]+property=["']og:description["'][^>]+content=["']([^"']{1,500})["']/i)?.[1] ||
+    html.match(/<meta[^>]+content=["']([^"']{1,500})["'][^>]+property=["']og:description["']/i)?.[1] ||
+    html.match(/<meta[^>]+name=["']description["'][^>]+content=["']([^"']{1,500})["']/i)?.[1] ||
+    html.match(/<meta[^>]+content=["']([^"']{1,500})["'][^>]+name=["']description["']/i)?.[1] || "";
+
+  const ogSiteName =
+    html.match(/<meta[^>]+property=["']og:site_name["'][^>]+content=["']([^"']{1,100})["']/i)?.[1] ||
+    html.match(/<meta[^>]+content=["']([^"']{1,100})["'][^>]+property=["']og:site_name["']/i)?.[1] || "";
+
+  // Extract page title: prefer OG title, then <title>, then <h1>
   const titleMatch = html.match(/<title[^>]*>([^<]{1,200})<\/title>/i);
   const h1Match = html.match(/<h1[^>]*>([^<]{1,150})<\/h1>/i);
-  const pageTitle = (titleMatch?.[1] || h1Match?.[1] || "")
-    .replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">")
-    .replace(/&#\d+;/g, "").replace(/&[a-z]+;/gi, " ").trim();
+  const pageTitle = decodeEntities(
+    ogTitle || titleMatch?.[1] || h1Match?.[1] || ""
+  );
+
+  const metaDesc = decodeEntities(ogDesc);
 
   // Strip scripts, styles, SVG, navigation boilerplate
   let text = html
@@ -1580,7 +1613,19 @@ async function fetchUrlContent(rawUrl: string): Promise<{ pageTitle: string; tex
     text = text.slice(0, 5000) + "\n[… content truncated for LLM context …]";
   }
 
-  return { pageTitle, text };
+  // Detect SPA: very little body text means JS renders the real content
+  const spaDetected = text.length < 300;
+
+  // When SPA detected, enrich text with what we could extract from meta tags
+  if (spaDetected && (metaDesc || ogSiteName)) {
+    const parts: string[] = [];
+    if (ogSiteName) parts.push(`Site: ${ogSiteName}`);
+    if (pageTitle) parts.push(`Title: ${pageTitle}`);
+    if (metaDesc) parts.push(`Description: ${metaDesc}`);
+    text = parts.join("\n");
+  }
+
+  return { pageTitle, text, spaDetected, metaDesc };
 }
 
 // POST /api/blog/from-url — Fetch URL, extract content, generate a blog from it
@@ -1611,33 +1656,55 @@ blogRouter.post("/from-url", async (req: Request, res: Response) => {
 
   try {
     // Fetch page content server-side (no CORS issues)
-    const { pageTitle, text: extractedText } = await fetchUrlContent(url);
+    const { pageTitle, text: extractedText, spaDetected, metaDesc } = await fetchUrlContent(url);
 
-    console.log(`Blog from-url: fetched "${pageTitle}" from ${parsedUrl.hostname} (${extractedText.length} chars)`);
+    console.log(
+      `Blog from-url: fetched "${pageTitle}" from ${parsedUrl.hostname} ` +
+      `(${extractedText.length} chars${spaDetected ? ", SPA detected" : ""})`
+    );
+
+    // Build a rich additional_context from the URL content.
+    // When a SPA is detected (JS-rendered), body text is a shell — we rely on meta tags instead.
+    const spaWarning = spaDetected
+      ? `\nNOTE: This URL is a JavaScript Single Page Application. Only meta/OG data was available ` +
+        `server-side — the LLM should infer topic from the site name, title, and description above. ` +
+        `Do NOT default to optical networking topics unless the page is actually about that.`
+      : "";
 
-    // Build a rich additional_context from the URL content
     const additionalContext =
       `SOURCE URL: ${url}\n` +
       `PAGE TITLE: ${pageTitle}\n` +
       `HOSTNAME: ${parsedUrl.hostname}\n` +
+      (metaDesc ? `META DESCRIPTION: ${metaDesc}\n` : "") +
       `\n--- EXTRACTED PAGE CONTENT ---\n` +
-      `${extractedText}\n` +
+      `${extractedText || "(No body text extractable — JavaScript-rendered SPA)"}\n` +
       `--- END PAGE CONTENT ---\n` +
-      `\nIMPORTANT: Use this content as factual background and editorial direction. ` +
+      spaWarning +
+      `\n\nIMPORTANT: Use this content as factual background and editorial direction. ` +
+      `The blog MUST be about the topic described above, NOT about optical transceivers or fiber unless explicitly relevant. ` +
       `Do NOT copy sentences verbatim. Write a Flexoptix-voice blog article using these facts and insights.`;
 
     const title = pageTitle || parsedUrl.hostname;
     const template = templates[Math.floor(Math.random() * templates.length)];
-    const keywords = [
-      ...template.seo_keywords,
-      "optical transceiver", "networking",
-    ].filter(Boolean);
 
-    const data = await gatherBlogData(keywords, selectedTopic);
+    // When SPA detected, skip optical transceiver product injection — it pollutes the LLM context
+    // with irrelevant product data and causes the model to default to its fine-tuning domain.
+    // Use empty data so the pipeline focuses purely on the URL context provided above.
+    const keywords = spaDetected
+      ? [parsedUrl.hostname.replace(/^www\./, ""), pageTitle].filter(Boolean)
+      : [...template.seo_keywords, "optical transceiver", "networking"].filter(Boolean);
+
+    const data = spaDetected
+      ? { products: [] as any[], news: [] as any[], faq: [] as any[], troubleshooting: [] as any[] }
+      : await gatherBlogData(keywords, selectedTopic);
+
     const draftContent = generateTemplateDraft(title, selectedTopic, data);
     const wordCount = draftContent.split(/\s+/).length;
     const initialIssues = validateArticle(draftContent);
 
+    const activeModel = getLlmProvider();
+    const generatedBy = `tip-blog-from-url-${activeModel.ollamaModel || activeModel.provider || "llm"}`;
+
     const result = await pool.query(
       `INSERT INTO blog_drafts (title, topic, target_audience, outline, draft_content, data_sources, status, generated_by, word_count, seo_keywords)
        VALUES ($1, $2, $3, $4, $5, $6, 'draft', $7, $8, $9)
@@ -1646,10 +1713,10 @@ blogRouter.post("/from-url", async (req: Request, res: Response) => {
         title,
         selectedTopic,
         template.target_audience,
-        JSON.stringify({ generation_method: "from-url", source_url: url, quality_issues: initialIssues }),
+        JSON.stringify({ generation_method: "from-url", source_url: url, spa_detected: spaDetected, quality_issues: initialIssues }),
         draftContent,
-        JSON.stringify({ source_url: url, extracted_chars: extractedText.length, products: data.products.length, news: data.news.length }),
-        "tip-blog-engine-url",
+        JSON.stringify({ source_url: url, extracted_chars: extractedText.length, spa_detected: spaDetected, products: data.products.length, news: data.news.length }),
+        generatedBy,
         wordCount,
         template.seo_keywords,
       ],
@@ -1672,6 +1739,7 @@ blogRouter.post("/from-url", async (req: Request, res: Response) => {
       source_url: url,
       page_title: pageTitle,
       extracted_chars: extractedText.length,
+      spa_detected: spaDetected,
       draft: {
         id: draftId,
         title,
diff --git a/packages/dashboard/index.html b/packages/dashboard/index.html
index 2a4763b..d532fb8 100644
--- a/packages/dashboard/index.html
+++ b/packages/dashboard/index.html
@@ -5561,8 +5561,15 @@ function generateBlogFromUrl() {
     btn.disabled = false;
     btn.textContent = '🔗 Aus URL generieren';
     if (data.success) {
-      status.textContent = '✓ ' + data.extracted_chars + ' Zeichen extrahiert — Pipeline läuft (~10 min)';
-      showToast('✓ Pipeline gestartet', (data.page_title || url) + ' — wird in ~10 min fertig');
+      var spaNote = data.spa_detected
+        ? ' ⚠️ SPA erkannt — Inhalt nur via Meta-Tags (JS-gerendert)'
+        : ' ✓ ' + data.extracted_chars + ' Zeichen extrahiert';
+      status.textContent = spaNote + ' — Pipeline läuft (~10 min)';
+      if (data.spa_detected) {
+        showToast('⚠️ SPA erkannt', (data.page_title || url) + ' — JavaScript-Seite, Inhalt via Meta-Tags. Pipeline läuft.');
+      } else {
+        showToast('✓ Pipeline gestartet', (data.page_title || url) + ' — wird in ~10 min fertig');
+      }
       document.getElementById('blog-from-url-input').value = '';
       loadBlogDrafts();
       pollBlogLlm(data.draft.id, 0);