From e0f9656684506e4efbfa42d6db542d2bbe07d409 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Thu, 14 May 2026 00:55:35 +0200 Subject: [PATCH] =?UTF-8?q?feat:=20Blog=20Engine=20=E2=80=94=20generate=20?= =?UTF-8?q?from=20URL=20(link=20=E2=86=92=20BlogLLM=20=E2=86=92=20article)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New POST /api/blog/from-url endpoint: - Accepts url + topic in request body - Fetches page server-side (no CORS, 20s timeout, redirect-follow) - Strips script/style/nav/footer/svg; extracts readable text (~5000 chars) - Extracts page title from <title> or <h1> - Passes extracted content as structured additional_context to the existing 16-step FO blog pipeline (same as manual generation) - Returns immediately; LLM pipeline runs async - Validated: smoke test fetched flexoptix.net/en/blog/, 5040 chars, pipeline launched with llm_enhancing=true New "🔗 Blog aus URL generieren" panel in dashboard: - URL input (Enter key triggers generation) - Blog-Typ dropdown (same 8 types as manual panel) - Button shows loading state "⏳ Fetching…" during API call - Status line shows extracted char count after success - Reuses pollBlogLlm() for step-by-step progress polling - Inline status field for error display without toast spam --- packages/api/src/routes/blog.ts | 178 +++++++++++++++++++++++++++++++- packages/dashboard/index.html | 76 ++++++++++++++ 2 files changed, 253 insertions(+), 1 deletion(-) diff --git a/packages/api/src/routes/blog.ts b/packages/api/src/routes/blog.ts index 973921f..db5843e 100644 --- a/packages/api/src/routes/blog.ts +++ b/packages/api/src/routes/blog.ts @@ -11,7 +11,7 @@ */ import { Router, Request, Response } from "express"; import { pool } from "../db/client"; -import { setLlmProvider, getLlmProvider } from "../llm/client"; +import { setLlmProvider, getLlmProvider, refreshLlmAutoDiscovery } from "../llm/client"; /** In-memory pipeline progress tracker — step updates pushed here, polled via 
GET /api/blog/:id/progress */ const pipelineProgress = new Map<string, { step: number; total: number; label: string; pct: number }>(); @@ -1528,6 +1528,171 @@ blogRouter.post("/generate", async (req: Request, res: Response) => { } }); +/** Fetch a URL and extract readable text content for use as LLM context. */ +async function fetchUrlContent(rawUrl: string): Promise<{ pageTitle: string; text: string }> { + const response = await fetch(rawUrl, { + headers: { "User-Agent": "TIPBot/1.0 blog-from-url (+https://tip.flexoptix.net)" }, + signal: AbortSignal.timeout(20000), + redirect: "follow", + }); + + if (!response.ok) throw new Error(`HTTP ${response.status} ${response.statusText}`); + + const contentType = response.headers.get("content-type") || ""; + if (!contentType.includes("text/html") && !contentType.includes("text/plain") && !contentType.includes("application/xhtml")) { + throw new Error(`Unsupported content type: ${contentType.split(";")[0]}`); + } + + const html = await response.text(); + + // Extract page title + const titleMatch = html.match(/<title[^>]*>([^<]{1,200})<\/title>/i); + const h1Match = html.match(/<h1[^>]*>([^<]{1,150})<\/h1>/i); + const pageTitle = (titleMatch?.[1] || h1Match?.[1] || "") + .replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">") + .replace(/&#\d+;/g, "").replace(/&[a-z]+;/gi, " ").trim(); + + // Strip scripts, styles, SVG, navigation boilerplate + let text = html + .replace(/<script[\s\S]*?<\/script>/gi, " ") + .replace(/<style[\s\S]*?<\/style>/gi, " ") + .replace(/<svg[\s\S]*?<\/svg>/gi, " ") + .replace(/<nav[\s\S]*?<\/nav>/gi, " ") + .replace(/<footer[\s\S]*?<\/footer>/gi, " ") + .replace(/<header[\s\S]*?<\/header>/gi, " ") + .replace(/<aside[\s\S]*?<\/aside>/gi, " ") + .replace(/<form[\s\S]*?<\/form>/gi, " ") + // Block elements → newlines + .replace(/<\/?(p|div|section|article|h[1-6]|li|br|hr|tr|td|th|blockquote|pre)[^>]*>/gi, "\n") + // Strip all remaining tags + .replace(/<[^>]{0,500}>/g, " ") + // Decode common 
entities + .replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">") + .replace(/"/g, '"').replace(/'/g, "'").replace(/ /g, " ") + .replace(/&[a-z]+;/gi, " ") + // Collapse whitespace + .split("\n").map(l => l.trim()).filter(l => l.length > 30).join("\n") + .replace(/\n{3,}/g, "\n\n") + .trim(); + + // Limit to ~5000 chars — enough for LLM context, not so much it blows the prompt + if (text.length > 5000) { + text = text.slice(0, 5000) + "\n[… content truncated for LLM context …]"; + } + + return { pageTitle, text }; +} + +// POST /api/blog/from-url — Fetch URL, extract content, generate a blog from it +blogRouter.post("/from-url", async (req: Request, res: Response) => { + const { url, topic } = req.body as { url?: string; topic?: string }; + + if (!url) { + res.status(400).json({ success: false, error: "url ist erforderlich" }); + return; + } + + // Validate URL — must be http/https + let parsedUrl: URL; + try { + parsedUrl = new URL(url); + if (!["http:", "https:"].includes(parsedUrl.protocol)) throw new Error("bad protocol"); + } catch { + res.status(400).json({ success: false, error: "UngĂĽltige URL — muss http:// oder https:// beginnen" }); + return; + } + + const selectedTopic = topic || "technology_deep_dive"; + const templates = BLOG_TEMPLATES[selectedTopic]; + if (!templates) { + res.status(400).json({ success: false, error: `UngĂĽltiger Blog-Typ. 
GĂĽltig: ${Object.keys(BLOG_TEMPLATES).join(", ")}` }); + return; + } + + try { + // Fetch page content server-side (no CORS issues) + const { pageTitle, text: extractedText } = await fetchUrlContent(url); + + console.log(`Blog from-url: fetched "${pageTitle}" from ${parsedUrl.hostname} (${extractedText.length} chars)`); + + // Build a rich additional_context from the URL content + const additionalContext = + `SOURCE URL: ${url}\n` + + `PAGE TITLE: ${pageTitle}\n` + + `HOSTNAME: ${parsedUrl.hostname}\n` + + `\n--- EXTRACTED PAGE CONTENT ---\n` + + `${extractedText}\n` + + `--- END PAGE CONTENT ---\n` + + `\nIMPORTANT: Use this content as factual background and editorial direction. ` + + `Do NOT copy sentences verbatim. Write a Flexoptix-voice blog article using these facts and insights.`; + + const title = pageTitle || parsedUrl.hostname; + const template = templates[Math.floor(Math.random() * templates.length)]; + const keywords = [ + ...template.seo_keywords, + "optical transceiver", "networking", + ].filter(Boolean); + + const data = await gatherBlogData(keywords, selectedTopic); + const draftContent = generateTemplateDraft(title, selectedTopic, data); + const wordCount = draftContent.split(/\s+/).length; + const initialIssues = validateArticle(draftContent); + + const result = await pool.query( + `INSERT INTO blog_drafts (title, topic, target_audience, outline, draft_content, data_sources, status, generated_by, word_count, seo_keywords) + VALUES ($1, $2, $3, $4, $5, $6, 'draft', $7, $8, $9) + RETURNING id, created_at`, + [ + title, + selectedTopic, + template.target_audience, + JSON.stringify({ generation_method: "from-url", source_url: url, quality_issues: initialIssues }), + draftContent, + JSON.stringify({ source_url: url, extracted_chars: extractedText.length, products: data.products.length, news: data.news.length }), + "tip-blog-engine-url", + wordCount, + template.seo_keywords, + ], + ); + + const draftId = result.rows[0].id; + + // Launch LLM pipeline 
with URL content as context + const health = await checkHealth().catch(() => ({ ok: false, model: "", error: "unreachable" })); + let llmStarted = false; + if (health.ok) { + llmStarted = true; + enqueueLlmPipeline(draftId, title, selectedTopic, template.target_audience, data, additionalContext).catch((err) => { + console.error(`Blog from-url LLM pipeline error: ${(err as Error).message}`); + }); + } + + res.json({ + success: true, + source_url: url, + page_title: pageTitle, + extracted_chars: extractedText.length, + draft: { + id: draftId, + title, + topic: selectedTopic, + target_audience: template.target_audience, + word_count: wordCount, + generation_method: "from-url", + llm_enhancing: llmStarted, + created_at: result.rows[0].created_at, + }, + }); + } catch (err) { + const msg = (err as Error).message; + console.error(`Blog from-url error for ${url}: ${msg}`); + res.status(500).json({ + success: false, + error: `URL konnte nicht verarbeitet werden: ${msg}`, + }); + } +}); + // GET /api/blog — List all drafts blogRouter.get("/", async (_req: Request, res: Response) => { try { @@ -1557,6 +1722,17 @@ blogRouter.post("/llm/reset-queue", (_req: Request, res: Response) => { res.json({ success: true, message: "Ollama queue reset — stuck requests cleared" }); }); +// POST /api/blog/llm/refresh-discovery — Force auto-discovery to pick up newly-trained fo-blog-v* versions +// Useful right after Magatama adopts a new fo-blog-vN model. Otherwise runs every 10 min by itself. +blogRouter.post("/llm/refresh-discovery", async (_req: Request, res: Response) => { + try { + const active = await refreshLlmAutoDiscovery(); + res.json({ success: true, active, message: `Auto-discovery refreshed. Active: ${active.provider}${active.ollamaModel ? 
` (${active.ollamaModel})` : ""}` }); + } catch (err) { + res.status(500).json({ success: false, error: (err as Error).message }); + } +}); + // POST /api/blog/llm/switch — Switch active LLM provider at runtime (no restart needed) // Body: { provider: "claude-code" | "anthropic" | "ollama", model?: "fo-blog-v10" | "qwen2.5:14b" | ... } blogRouter.post("/llm/switch", (req: Request, res: Response) => { diff --git a/packages/dashboard/index.html b/packages/dashboard/index.html index 3bb04b1..2a4763b 100644 --- a/packages/dashboard/index.html +++ b/packages/dashboard/index.html @@ -1439,6 +1439,40 @@ <button onclick="generateBlogManual()" style="background:rgba(99,102,241,0.85);color:#fff;border:none;padding:8px 20px;border-radius:6px;cursor:pointer;font-size:0.82rem;font-weight:600">⚙️ Artikel generieren</button> </div><!-- end manual generation --> + <!-- URL → BLOG PANEL --> + <div class="card" style="margin-bottom:1.25rem;border:1px solid rgba(16,185,129,0.35);background:var(--surface2)"> + <div style="font-size:0.85rem;font-weight:700;color:var(--text-bright);margin-bottom:0.1rem">đź”— Blog aus URL generieren</div> + <div style="font-size:0.7rem;color:var(--text-dim);margin-bottom:0.75rem">Link eingeben → Inhalt wird automatisch extrahiert → BlogLLM schreibt einen Artikel daraus</div> + <div style="display:grid;grid-template-columns:1fr auto;gap:0.6rem;margin-bottom:0.65rem;align-items:end"> + <div> + <label style="font-size:0.7rem;color:var(--text-dim);display:block;margin-bottom:3px">URL</label> + <input type="url" id="blog-from-url-input" placeholder="https://example.com/article-about-400g-transceivers" + style="width:100%;background:var(--surface);border:1px solid rgba(16,185,129,0.4);color:var(--text);padding:6px 10px;border-radius:6px;font-size:0.82rem;box-sizing:border-box" + onkeydown="if(event.key==='Enter')generateBlogFromUrl()"> + </div> + <div> + <label style="font-size:0.7rem;color:var(--text-dim);display:block;margin-bottom:3px">Blog-Typ</label> + 
<select id="blog-from-url-topic" style="background:var(--surface);border:1px solid rgba(16,185,129,0.4);color:var(--text);padding:6px 10px;border-radius:6px;font-size:0.82rem;height:32px"> + <option value="technology_deep_dive">Technology Deep Dive</option> + <option value="tutorial">Troubleshooting Tutorial</option> + <option value="migration_guide">Migration Guide</option> + <option value="market_alert">Market Alert</option> + <option value="buying_guide">Buying Guide</option> + <option value="comparison">Product Comparison</option> + <option value="competitor_analysis">Competitor Analysis</option> + <option value="hype_cycle">Hype Cycle / Strategy</option> + </select> + </div> + </div> + <div style="display:flex;align-items:center;gap:0.75rem"> + <button onclick="generateBlogFromUrl()" id="blog-from-url-btn" + style="background:rgba(16,185,129,0.85);color:#fff;border:none;padding:8px 20px;border-radius:6px;cursor:pointer;font-size:0.82rem;font-weight:600"> + đź”— Aus URL generieren + </button> + <span id="blog-from-url-status" style="font-size:0.75rem;color:var(--text-dim)"></span> + </div> + </div><!-- end url→blog panel --> + <!-- SLL INSIGHTS WIDGET --> <div class="card" style="margin-bottom:1rem;border:1px solid rgba(212,163,115,0.3);background:var(--surface2)"> <div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:0.75rem"> @@ -5503,6 +5537,48 @@ function generateBlogManual() { }).catch(function(err) { if (err.message !== 'Unauthorized') showToast('Netzwerkfehler', err.message, true); }); } +function generateBlogFromUrl() { + var url = (document.getElementById('blog-from-url-input').value || '').trim(); + var topic = document.getElementById('blog-from-url-topic').value || 'technology_deep_dive'; + var btn = document.getElementById('blog-from-url-btn'); + var status = document.getElementById('blog-from-url-status'); + + if (!url) { showToast('Fehler', 'Bitte eine URL eingeben', true); return; } + try { new URL(url); } catch 
(e) { showToast('Fehler', 'UngĂĽltige URL', true); return; } + + btn.disabled = true; + btn.textContent = '⏳ Fetching…'; + status.textContent = 'Seite wird abgerufen…'; + + var token = window.loadToken ? window.loadToken() : ''; + fetch(API + '/api/blog/from-url', { + method: 'POST', + headers: { 'Content-Type': 'application/json', 'Authorization': 'Bearer ' + token }, + body: JSON.stringify({ url: url, topic: topic }) + }) + .then(function(r) { if (r.status === 401) { handleAuthError(401); throw new Error('Unauthorized'); } return r.json(); }) + .then(function(data) { + btn.disabled = false; + btn.textContent = 'đź”— Aus URL generieren'; + if (data.success) { + status.textContent = 'âś“ ' + data.extracted_chars + ' Zeichen extrahiert — Pipeline läuft (~10 min)'; + showToast('âś“ Pipeline gestartet', (data.page_title || url) + ' — wird in ~10 min fertig'); + document.getElementById('blog-from-url-input').value = ''; + loadBlogDrafts(); + pollBlogLlm(data.draft.id, 0); + } else { + status.textContent = 'âś— Fehler: ' + (data.error || 'Unbekannt'); + showToast('Fehler', data.error || 'Unbekannter Fehler', true); + } + }) + .catch(function(err) { + btn.disabled = false; + btn.textContent = 'đź”— Aus URL generieren'; + status.textContent = ''; + if (err.message !== 'Unauthorized') showToast('Netzwerkfehler', err.message, true); + }); +} + function pollBlogLlm(id, attempt) { if (attempt > 60) return; // max 10 min (60 Ă— 10s) setTimeout(function() {