feat: Blog Engine — generate from URL (link → BlogLLM → article)

New POST /api/blog/from-url endpoint:
- Accepts url + topic in request body
- Fetches page server-side (no CORS, 20s timeout, redirect-follow)
- Strips script/style/nav/footer/svg; extracts readable text (~5000 chars)
- Extracts page title from <title> or <h1>
- Passes extracted content as structured additional_context to the existing 16-step FO blog pipeline (same as manual generation)
- Returns immediately; LLM pipeline runs async
- Validated: smoke test fetched flexoptix.net/en/blog/, 5040 chars, pipeline launched with llm_enhancing=true

New "🔗 Blog aus URL generieren" panel in dashboard:
- URL input (Enter key triggers generation)
- Blog-Typ dropdown (same 8 types as manual panel)
- Button shows loading state "⏳ Fetching…" during API call
- Status line shows extracted char count after success
- Reuses pollBlogLlm() for step-by-step progress polling
- Inline status field for error display without toast spam
2026-05-14 00:55:35 +02:00 · 2026-05-14 00:55:35 +02:00 · e0f9656684
commit e0f9656684
parent 9b8b03e783
2 changed files with 253 additions and 1 deletions
--- a/packages/api/src/routes/blog.ts
+++ b/packages/api/src/routes/blog.ts
@ -11,7 +11,7 @@
 */
 import { Router, Request, Response } from "express";
 import { pool } from "../db/client";
-import { setLlmProvider, getLlmProvider } from "../llm/client";
+import { setLlmProvider, getLlmProvider, refreshLlmAutoDiscovery } from "../llm/client";

 /** In-memory pipeline progress tracker — step updates pushed here, polled via GET /api/blog/:id/progress */
 const pipelineProgress = new Map<string, { step: number; total: number; label: string; pct: number }>();
@ -1528,6 +1528,171 @@ blogRouter.post("/generate", async (req: Request, res: Response) => {
  }
 });

+/**
+ * Returns true when `ip` is a literal address that must not be fetched on behalf of a user:
+ * IPv4 "this network", RFC 1918 private, CGNAT, loopback, link-local and multicast/reserved
+ * ranges, plus the IPv6 unspecified, loopback, IPv4-mapped, unique-local and link-local ranges.
+ * This is a pragmatic SSRF deny-list, not an exhaustive registry of special-purpose ranges.
+ */
+function isNonPublicIp(ip: string): boolean {
+  const v4 = ip.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/);
+  if (v4) {
+    const a = Number(v4[1]);
+    const b = Number(v4[2]);
+    return (
+      a === 0 || a === 10 || a === 127 ||
+      (a === 100 && b >= 64 && b <= 127) ||
+      (a === 169 && b === 254) ||
+      (a === 172 && b >= 16 && b <= 31) ||
+      (a === 192 && b === 168) ||
+      a >= 224
+    );
+  }
+  const v6 = ip.toLowerCase();
+  return (
+    v6 === "::" || v6 === "::1" ||
+    v6.startsWith("::ffff:") || // IPv4-mapped: rejected wholesale rather than unpacked
+    /^f[cd]/.test(v6) ||        // fc00::/7 unique-local
+    /^fe[89ab]/.test(v6)        // fe80::/10 link-local
+  );
+}
+
+/**
+ * Throws unless `target` is an http(s) URL whose host resolves exclusively to public addresses.
+ * NOTE: the address is resolved here and again by fetch(), so a DNS-rebinding attacker can still
+ * race the check; this blocks the straightforward cases (localhost, 10.x, 169.254.169.254, …).
+ */
+async function assertPublicHttpUrl(target: URL): Promise<void> {
+  if (target.protocol !== "http:" && target.protocol !== "https:") {
+    throw new Error(`Unsupported protocol: ${target.protocol}`);
+  }
+  const { lookup } = await import("node:dns/promises");
+  // URL.hostname keeps the brackets around IPv6 literals; the resolver wants them removed.
+  const host = target.hostname.replace(/^\[|\]$/g, "");
+  const addresses = await lookup(host, { all: true });
+  if (addresses.length === 0 || addresses.some((entry) => isNonPublicIp(entry.address))) {
+    throw new Error(`Refusing to fetch non-public address: ${target.hostname}`);
+  }
+}
+
+/**
+ * Decodes HTML character references in a single pass, so already-decoded output is never
+ * decoded a second time (`&amp;lt;` stays the literal text `&lt;`).
+ * Numeric references become their code point (invalid ones are dropped, U+00A0 becomes a space);
+ * the handful of known named references are mapped; any other named reference becomes a space.
+ */
+function decodeExtractedEntities(input: string): string {
+  const named = new Map<string, string>([
+    ["amp", "&"], ["lt", "<"], ["gt", ">"], ["quot", '"'], ["apos", "'"], ["nbsp", " "],
+  ]);
+  return input.replace(/&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi, (_whole: string, ref: string) => {
+    if (ref.startsWith("#")) {
+      const hex = ref[1] === "x" || ref[1] === "X";
+      const code = parseInt(ref.slice(hex ? 2 : 1), hex ? 16 : 10);
+      if (code === 0xa0) return " ";
+      const valid = code > 0 && code <= 0x10ffff && (code < 0xd800 || code > 0xdfff);
+      return valid ? String.fromCodePoint(code) : "";
+    }
+    return named.get(ref.toLowerCase()) ?? " ";
+  });
+}
+
+/**
+ * Fetch a URL and extract readable text content for use as LLM context.
+ *
+ * The URL comes from the request body, so every hop (the initial URL and each redirect target)
+ * is checked against a non-public-address deny-list before it is requested. Redirects are
+ * followed manually (at most 5) and the whole exchange shares one 20-second timeout.
+ *
+ * @param rawUrl Absolute http(s) URL of the page to fetch.
+ * @returns The page title (from <title>, else the first <h1>, else "") and the extracted text,
+ *          truncated to ~5000 characters.
+ * @throws Error on an invalid or non-public URL, too many redirects, a non-2xx response,
+ *         an unsupported content type, a timeout or a network failure.
+ */
+async function fetchUrlContent(rawUrl: string): Promise<{ pageTitle: string; text: string }> {
+  const maxRedirects = 5;
+  // Bounds the work done by the lazy [\s\S]*? patterns below on pathological pages.
+  const maxHtmlChars = 2000000;
+  const requestInit = {
+    headers: { "User-Agent": "TIPBot/1.0 blog-from-url (+https://tip.flexoptix.net)" },
+    signal: AbortSignal.timeout(20000),
+    // Manual redirects so that each Location target can be vetted before it is requested.
+    redirect: "manual" as const,
+  };
+
+  let target = new URL(rawUrl);
+  await assertPublicHttpUrl(target);
+  let response = await fetch(target.href, requestInit);
+  for (let hop = 0; [301, 302, 303, 307, 308].includes(response.status); hop++) {
+    const location = response.headers.get("location");
+    if (!location) break;
+    if (hop >= maxRedirects) throw new Error(`Too many redirects (more than ${maxRedirects})`);
+    // Best-effort release of the unread redirect body; a failure here must not abort the fetch.
+    await response.body?.cancel().catch(() => undefined);
+    target = new URL(location, target);
+    await assertPublicHttpUrl(target);
+    response = await fetch(target.href, requestInit);
+  }
+
+  if (!response.ok) throw new Error(`HTTP ${response.status} ${response.statusText}`);
+
+  const contentType = response.headers.get("content-type") || "";
+  if (!contentType.includes("text/html") && !contentType.includes("text/plain") && !contentType.includes("application/xhtml")) {
+    throw new Error(`Unsupported content type: ${contentType.split(";")[0]}`);
+  }
+
+  const html = (await response.text()).slice(0, maxHtmlChars);
+
+  // Extract page title; collapse whitespace so it stays a single line in prompts and logs
+  const titleMatch = html.match(/<title[^>]*>([^<]{1,200})<\/title>/i);
+  const h1Match = html.match(/<h1[^>]*>([^<]{1,150})<\/h1>/i);
+  const pageTitle = decodeExtractedEntities(titleMatch?.[1] || h1Match?.[1] || "")
+    .replace(/\s+/g, " ")
+    .trim();
+
+  // Strip scripts, styles, SVG, navigation boilerplate
+  const stripped = html
+    .replace(/<script[\s\S]*?<\/script>/gi, " ")
+    .replace(/<style[\s\S]*?<\/style>/gi, " ")
+    .replace(/<svg[\s\S]*?<\/svg>/gi, " ")
+    .replace(/<nav[\s\S]*?<\/nav>/gi, " ")
+    .replace(/<footer[\s\S]*?<\/footer>/gi, " ")
+    .replace(/<header[\s\S]*?<\/header>/gi, " ")
+    .replace(/<aside[\s\S]*?<\/aside>/gi, " ")
+    .replace(/<form[\s\S]*?<\/form>/gi, " ")
+    // Block elements → newlines
+    .replace(/<\/?(p|div|section|article|h[1-6]|li|br|hr|tr|td|th|blockquote|pre)[^>]*>/gi, "\n")
+    // Strip all remaining tags
+    .replace(/<[^>]{0,500}>/g, " ");
+
+  // Decode entities only after the tags are gone, then keep lines long enough to be prose
+  // (more than 30 chars). Dropping the short lines also removes every blank line.
+  let text = decodeExtractedEntities(stripped)
+    .split("\n").map((line) => line.trim()).filter((line) => line.length > 30).join("\n")
+    .trim();
+
+  // Limit to ~5000 chars — enough for LLM context, not so much it blows the prompt
+  if (text.length > 5000) {
+    text = text.slice(0, 5000) + "\n[… content truncated for LLM context …]";
+  }
+
+  return { pageTitle, text };
+}
+
+// POST /api/blog/from-url — Fetch URL, extract content, generate a blog from it
+// Body: { url: string, topic?: string }. Responds as soon as the template draft is stored;
+// the LLM pipeline (when the LLM health check passes) keeps running in the background.
+// 400 = missing/invalid url or unknown topic, 500 = fetch/extraction/database failure.
+blogRouter.post("/from-url", async (req: Request, res: Response) => {
+  // req.body is undefined when no body parser matched (e.g. wrong Content-Type); reading it
+  // defensively keeps that case a clean 400 instead of a throw outside the try/catch below.
+  const body = (req.body ?? {}) as { url?: unknown; topic?: unknown };
+  const url = typeof body.url === "string" ? body.url.trim() : "";
+
+  if (!url) {
+    res.status(400).json({ success: false, error: "url ist erforderlich" });
+    return;
+  }
+
+  // Validate URL — must be http/https
+  let parsedUrl: URL;
+  try {
+    parsedUrl = new URL(url);
+    if (!["http:", "https:"].includes(parsedUrl.protocol)) throw new Error("bad protocol");
+  } catch {
+    res.status(400).json({ success: false, error: "Ungültige URL — muss http:// oder https:// beginnen" });
+    return;
+  }
+
+  const selectedTopic = body.topic ? String(body.topic) : "technology_deep_dive";
+  // Own-property lookup only: assuming BLOG_TEMPLATES is a plain object map (defined outside
+  // this view), a topic such as "constructor" would otherwise resolve to an inherited member,
+  // pass the check and crash later with a 500 after the remote fetch.
+  const templates = Object.prototype.hasOwnProperty.call(BLOG_TEMPLATES, selectedTopic)
+    ? BLOG_TEMPLATES[selectedTopic]
+    : undefined;
+  if (!templates || templates.length === 0) {
+    res.status(400).json({ success: false, error: `Ungültiger Blog-Typ. Gültig: ${Object.keys(BLOG_TEMPLATES).join(", ")}` });
+    return;
+  }
+
+  try {
+    // Fetch page content server-side (no CORS issues)
+    const { pageTitle, text: extractedText } = await fetchUrlContent(url);
+
+    console.log(`Blog from-url: fetched "${pageTitle}" from ${parsedUrl.hostname} (${extractedText.length} chars)`);
+
+    // Build a rich additional_context from the URL content
+    const additionalContext =
+      `SOURCE URL: ${url}\n` +
+      `PAGE TITLE: ${pageTitle}\n` +
+      `HOSTNAME: ${parsedUrl.hostname}\n` +
+      `\n--- EXTRACTED PAGE CONTENT ---\n` +
+      `${extractedText}\n` +
+      `--- END PAGE CONTENT ---\n` +
+      `\nIMPORTANT: Use this content as factual background and editorial direction. ` +
+      `Do NOT copy sentences verbatim. Write a Flexoptix-voice blog article using these facts and insights.`;
+
+    // Fall back to the hostname when the page exposes no usable <title>/<h1>
+    const title = pageTitle || parsedUrl.hostname;
+    const template = templates[Math.floor(Math.random() * templates.length)];
+    const keywords = [
+      ...template.seo_keywords,
+      "optical transceiver", "networking",
+    ].filter(Boolean);
+
+    // Store a template-based draft first so the caller gets an id to poll immediately
+    const data = await gatherBlogData(keywords, selectedTopic);
+    const draftContent = generateTemplateDraft(title, selectedTopic, data);
+    const wordCount = draftContent.split(/\s+/).length;
+    const initialIssues = validateArticle(draftContent);
+
+    const result = await pool.query(
+      `INSERT INTO blog_drafts (title, topic, target_audience, outline, draft_content, data_sources, status, generated_by, word_count, seo_keywords)
+       VALUES ($1, $2, $3, $4, $5, $6, 'draft', $7, $8, $9)
+       RETURNING id, created_at`,
+      [
+        title,
+        selectedTopic,
+        template.target_audience,
+        JSON.stringify({ generation_method: "from-url", source_url: url, quality_issues: initialIssues }),
+        draftContent,
+        JSON.stringify({ source_url: url, extracted_chars: extractedText.length, products: data.products.length, news: data.news.length }),
+        "tip-blog-engine-url",
+        wordCount,
+        template.seo_keywords,
+      ],
+    );
+
+    const draftId = result.rows[0].id;
+
+    // Launch LLM pipeline with URL content as context (fire-and-forget; failures are only logged)
+    const health = await checkHealth().catch(() => ({ ok: false, model: "", error: "unreachable" }));
+    let llmStarted = false;
+    if (health.ok) {
+      llmStarted = true;
+      enqueueLlmPipeline(draftId, title, selectedTopic, template.target_audience, data, additionalContext).catch((err) => {
+        console.error(`Blog from-url LLM pipeline error: ${(err as Error).message}`);
+      });
+    }
+
+    res.json({
+      success: true,
+      source_url: url,
+      page_title: pageTitle,
+      extracted_chars: extractedText.length,
+      draft: {
+        id: draftId,
+        title,
+        topic: selectedTopic,
+        target_audience: template.target_audience,
+        word_count: wordCount,
+        generation_method: "from-url",
+        llm_enhancing: llmStarted,
+        created_at: result.rows[0].created_at,
+      },
+    });
+  } catch (err) {
+    // Anything can be thrown; only read .message from real Error instances
+    const msg = err instanceof Error ? err.message : String(err);
+    console.error(`Blog from-url error for ${url}: ${msg}`);
+    res.status(500).json({
+      success: false,
+      error: `URL konnte nicht verarbeitet werden: ${msg}`,
+    });
+  }
+});
+
 // GET /api/blog — List all drafts
 blogRouter.get("/", async (_req: Request, res: Response) => {
  try {
@ -1557,6 +1722,17 @@ blogRouter.post("/llm/reset-queue", (_req: Request, res: Response) => {
  res.json({ success: true, message: "Ollama queue reset — stuck requests cleared" });
 });

+// POST /api/blog/llm/refresh-discovery — re-run LLM auto-discovery on demand and report the active provider.
+// Handy right after Magatama adopts a new fo-blog-vN model; without this call discovery re-runs by itself every 10 min.
+blogRouter.post("/llm/refresh-discovery", async (_req: Request, res: Response) => {
+  try {
+    const active = await refreshLlmAutoDiscovery();
+    // The model name is appended only when one is set on the active configuration
+    const modelSuffix = active.ollamaModel ? ` (${active.ollamaModel})` : "";
+    const message = `Auto-discovery refreshed. Active: ${active.provider}${modelSuffix}`;
+    res.json({ success: true, active, message });
+  } catch (err) {
+    res.status(500).json({ success: false, error: (err as Error).message });
+  }
+});
+
 // POST /api/blog/llm/switch — Switch active LLM provider at runtime (no restart needed)
 // Body: { provider: "claude-code" | "anthropic" | "ollama", model?: "fo-blog-v10" | "qwen2.5:14b" | ... }
 blogRouter.post("/llm/switch", (req: Request, res: Response) => {
--- a/packages/dashboard/index.html
+++ b/packages/dashboard/index.html
@ -1439,6 +1439,40 @@
      <button onclick="generateBlogManual()" style="background:rgba(99,102,241,0.85);color:#fff;border:none;padding:8px 20px;border-radius:6px;cursor:pointer;font-size:0.82rem;font-weight:600">⚙️ Artikel generieren</button>
    </div><!-- end manual generation -->

+    <!-- URL → BLOG PANEL: URL input + blog type select; generateBlogFromUrl() reads both and writes progress to #blog-from-url-status -->
+    <div class="card" style="margin-bottom:1.25rem;border:1px solid rgba(16,185,129,0.35);background:var(--surface2)">
+      <div style="font-size:0.85rem;font-weight:700;color:var(--text-bright);margin-bottom:0.1rem">🔗 Blog aus URL generieren</div>
+      <div style="font-size:0.7rem;color:var(--text-dim);margin-bottom:0.75rem">Link eingeben → Inhalt wird automatisch extrahiert → BlogLLM schreibt einen Artikel daraus</div>
+      <div style="display:grid;grid-template-columns:1fr auto;gap:0.6rem;margin-bottom:0.65rem;align-items:end">
+        <div>
+          <!-- for= ties the label to its control (click-to-focus, screen-reader name) -->
+          <label for="blog-from-url-input" style="font-size:0.7rem;color:var(--text-dim);display:block;margin-bottom:3px">URL</label>
+          <input type="url" id="blog-from-url-input" placeholder="https://example.com/article-about-400g-transceivers"
+            style="width:100%;background:var(--surface);border:1px solid rgba(16,185,129,0.4);color:var(--text);padding:6px 10px;border-radius:6px;font-size:0.82rem;box-sizing:border-box"
+            onkeydown="if(event.key==='Enter')generateBlogFromUrl()">
+        </div>
+        <div>
+          <label for="blog-from-url-topic" style="font-size:0.7rem;color:var(--text-dim);display:block;margin-bottom:3px">Blog-Typ</label>
+          <select id="blog-from-url-topic" style="background:var(--surface);border:1px solid rgba(16,185,129,0.4);color:var(--text);padding:6px 10px;border-radius:6px;font-size:0.82rem;height:32px">
+            <option value="technology_deep_dive">Technology Deep Dive</option>
+            <option value="tutorial">Troubleshooting Tutorial</option>
+            <option value="migration_guide">Migration Guide</option>
+            <option value="market_alert">Market Alert</option>
+            <option value="buying_guide">Buying Guide</option>
+            <option value="comparison">Product Comparison</option>
+            <option value="competitor_analysis">Competitor Analysis</option>
+            <option value="hype_cycle">Hype Cycle / Strategy</option>
+          </select>
+        </div>
+      </div>
+      <div style="display:flex;align-items:center;gap:0.75rem">
+        <!-- type="button" keeps the button from submitting should the panel ever be wrapped in a <form> -->
+        <button type="button" onclick="generateBlogFromUrl()" id="blog-from-url-btn"
+          style="background:rgba(16,185,129,0.85);color:#fff;border:none;padding:8px 20px;border-radius:6px;cursor:pointer;font-size:0.82rem;font-weight:600">
+          🔗 Aus URL generieren
+        </button>
+        <!-- role/aria-live: status and error text written by the script is announced to assistive tech -->
+        <span id="blog-from-url-status" role="status" aria-live="polite" style="font-size:0.75rem;color:var(--text-dim)"></span>
+      </div>
+    </div><!-- end url→blog panel -->
+
    <!-- SLL INSIGHTS WIDGET -->
    <div class="card" style="margin-bottom:1rem;border:1px solid rgba(212,163,115,0.3);background:var(--surface2)">
      <div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:0.75rem">
@ -5503,6 +5537,48 @@ function generateBlogManual() {
  }).catch(function(err) { if (err.message !== 'Unauthorized') showToast('Netzwerkfehler', err.message, true); });
 }

+function generateBlogFromUrl() {
+  var url = (document.getElementById('blog-from-url-input').value || '').trim();
+  var topic = document.getElementById('blog-from-url-topic').value || 'technology_deep_dive';
+  var btn = document.getElementById('blog-from-url-btn');
+  var status = document.getElementById('blog-from-url-status');
+
+  if (!url) { showToast('Fehler', 'Bitte eine URL eingeben', true); return; }
+  try { new URL(url); } catch (e) { showToast('Fehler', 'Ungültige URL', true); return; }
+
+  btn.disabled = true;
+  btn.textContent = '⏳ Fetching…';
+  status.textContent = 'Seite wird abgerufen…';
+
+  var token = window.loadToken ? window.loadToken() : '';
+  fetch(API + '/api/blog/from-url', {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json', 'Authorization': 'Bearer ' + token },
+    body: JSON.stringify({ url: url, topic: topic })
+  })
+  .then(function(r) { if (r.status === 401) { handleAuthError(401); throw new Error('Unauthorized'); } return r.json(); })
+  .then(function(data) {
+    btn.disabled = false;
+    btn.textContent = '🔗 Aus URL generieren';
+    if (data.success) {
+      status.textContent = '✓ ' + data.extracted_chars + ' Zeichen extrahiert — Pipeline läuft (~10 min)';
+      showToast('✓ Pipeline gestartet', (data.page_title || url) + ' — wird in ~10 min fertig');
+      document.getElementById('blog-from-url-input').value = '';
+      loadBlogDrafts();
+      pollBlogLlm(data.draft.id, 0);
+    } else {
+      status.textContent = '✗ Fehler: ' + (data.error || 'Unbekannt');
+      showToast('Fehler', data.error || 'Unbekannter Fehler', true);
+    }
+  })
+  .catch(function(err) {
+    btn.disabled = false;
+    btn.textContent = '🔗 Aus URL generieren';
+    status.textContent = '';
+    if (err.message !== 'Unauthorized') showToast('Netzwerkfehler', err.message, true);
+  });
+}
+
 function pollBlogLlm(id, attempt) {
  if (attempt > 60) return; // max 10 min (60 × 10s)
  setTimeout(function() {