feat: Blog Engine — generate from URL (link → BlogLLM → article)

New POST /api/blog/from-url endpoint:
- Accepts url + topic in request body
- Fetches page server-side (no CORS, 20s timeout, redirect-follow)
- Strips script/style/svg/nav/footer/header/aside/form; extracts readable text (~5000 chars)
- Extracts page title from <title> or <h1>
- Passes extracted content as structured additional_context to the
  existing 16-step FO blog pipeline (same as manual generation)
- Responds once the page is fetched and the template draft is stored; the LLM pipeline then runs asynchronously in the background
- Validated: smoke test fetched flexoptix.net/en/blog/, 5040 chars,
  pipeline launched with llm_enhancing=true

New "🔗 Blog aus URL generieren" panel in dashboard:
- URL input (Enter key triggers generation)
- Blog-Typ dropdown (same 8 types as manual panel)
- Button shows loading state "⏳ Fetching…" during API call
- Status line shows extracted char count after success
- Reuses pollBlogLlm() for step-by-step progress polling
- Inline status field for error display without toast spam
This commit is contained in:
Rene Fichtmueller 2026-05-14 00:55:35 +02:00
parent 9b8b03e783
commit e0f9656684
2 changed files with 253 additions and 1 deletions

View File

@ -11,7 +11,7 @@
*/
import { Router, Request, Response } from "express";
import { pool } from "../db/client";
import { setLlmProvider, getLlmProvider } from "../llm/client";
import { setLlmProvider, getLlmProvider, refreshLlmAutoDiscovery } from "../llm/client";
/** In-memory pipeline progress tracker — step updates pushed here, polled via GET /api/blog/:id/progress */
const pipelineProgress = new Map<string, { step: number; total: number; label: string; pct: number }>();
@ -1528,6 +1528,171 @@ blogRouter.post("/generate", async (req: Request, res: Response) => {
}
});
/**
 * Fetch a URL and extract readable text content for use as LLM context.
 *
 * The page is requested with a 20 s timeout and redirects are followed. Only
 * `text/html`, `text/plain` and `application/xhtml` responses are accepted.
 * Markup is reduced to plain text with regular expressions (no DOM parser):
 * script/style/comment/svg/nav/footer/header/aside/form blocks are dropped,
 * block-level tags become line breaks, remaining tags are removed, entities
 * are decoded, and lines of 30 characters or fewer are discarded as boilerplate.
 *
 * NOTE(review): this function fetches whatever URL it is given and follows
 * redirects; it performs no host validation. Callers that pass user-supplied
 * URLs should restrict the target (allowlist / reject private address ranges).
 *
 * @param rawUrl Absolute URL of the page to fetch.
 * @returns `pageTitle` — decoded `<title>` (or first `<h1>`, or `""`);
 *          `text` — extracted text, capped at 5000 characters plus a truncation marker.
 * @throws Error on a non-2xx response or an unsupported content type; errors
 *         raised by `fetch` itself (network failure, timeout) propagate unchanged.
 */
async function fetchUrlContent(rawUrl: string): Promise<{ pageTitle: string; text: string }> {
  // Named entities mapped to a concrete character. A Map (not an object literal)
  // so that names like "constructor" cannot resolve to inherited properties.
  const namedEntities = new Map<string, string>([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
    ["nbsp", " "],
  ]);

  // Decode all entities in ONE pass. Decoding `&amp;` in a separate earlier step
  // would turn `&amp;lt;` into `&lt;` and then into `<` (double unescaping).
  const decodeEntities = (input: string): string =>
    input.replace(/&(#x[0-9a-f]{1,6}|#\d{1,7}|[a-z][a-z0-9]*);/gi, (_match: string, body: string) => {
      if (body.charAt(0) === "#") {
        const isHex = body.charAt(1) === "x" || body.charAt(1) === "X";
        const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
        const isInvalid =
          Number.isNaN(codePoint) ||
          codePoint === 0 ||
          codePoint > 0x10ffff ||
          (codePoint >= 0xd800 && codePoint <= 0xdfff);
        return isInvalid ? " " : String.fromCodePoint(codePoint);
      }
      // Unknown named entities degrade to a space so adjacent words stay separated.
      return namedEntities.get(body.toLowerCase()) ?? " ";
    });

  const response = await fetch(rawUrl, {
    headers: { "User-Agent": "TIPBot/1.0 blog-from-url (+https://tip.flexoptix.net)" },
    signal: AbortSignal.timeout(20000),
    redirect: "follow",
  });
  if (!response.ok) throw new Error(`HTTP ${response.status} ${response.statusText}`);

  const contentType = response.headers.get("content-type") || "";
  if (!contentType.includes("text/html") && !contentType.includes("text/plain") && !contentType.includes("application/xhtml")) {
    throw new Error(`Unsupported content type: ${contentType.split(";")[0]}`);
  }
  const html = await response.text();

  // Extract page title: prefer <title>, fall back to the first <h1>.
  const titleMatch = html.match(/<title[^>]*>([^<]{1,200})<\/title>/i);
  const h1Match = html.match(/<h1[^>]*>([^<]{1,150})<\/h1>/i);
  const pageTitle = decodeEntities(titleMatch?.[1] || h1Match?.[1] || "")
    // Titles are often wrapped over several lines in the markup.
    .replace(/\s+/g, " ")
    .trim();

  // Strip scripts, styles, comments, SVG and navigation boilerplate.
  let text = html
    .replace(/<script[\s\S]*?<\/script>/gi, " ")
    .replace(/<style[\s\S]*?<\/style>/gi, " ")
    // After script/style (which may contain a literal "<!--"): remove whole comments,
    // including ones longer than the generic tag pattern below can match.
    .replace(/<!--[\s\S]*?-->/g, " ")
    .replace(/<svg[\s\S]*?<\/svg>/gi, " ")
    .replace(/<nav[\s\S]*?<\/nav>/gi, " ")
    .replace(/<footer[\s\S]*?<\/footer>/gi, " ")
    .replace(/<header[\s\S]*?<\/header>/gi, " ")
    .replace(/<aside[\s\S]*?<\/aside>/gi, " ")
    .replace(/<form[\s\S]*?<\/form>/gi, " ")
    // Block elements → newlines
    .replace(/<\/?(p|div|section|article|h[1-6]|li|br|hr|tr|td|th|blockquote|pre)[^>]*>/gi, "\n")
    // Strip all remaining tags
    .replace(/<[^>]{0,500}>/g, " ");

  // Decode entities only after tags are gone, so escaped markup (`&lt;b&gt;`)
  // survives as literal text instead of being stripped as a tag.
  text = decodeEntities(text)
    // Collapse whitespace; short lines are treated as menu/label boilerplate.
    .split("\n").map((line) => line.trim()).filter((line) => line.length > 30).join("\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();

  // Limit to ~5000 chars — enough for LLM context, not so much it blows the prompt
  if (text.length > 5000) {
    text = text.slice(0, 5000) + "\n[… content truncated for LLM context …]";
  }
  return { pageTitle, text };
}
// POST /api/blog/from-url — Fetch URL, extract content, generate a blog from it
//
// Body: { url: string; topic?: string } — topic defaults to "technology_deep_dive".
// Flow: validate input → fetch + extract page text → build a template draft and
// store it in blog_drafts → if the LLM health check passes, start the LLM pipeline
// without awaiting it → respond with the draft metadata.
// Responses: 400 for invalid url/topic, 500 for any failure while fetching or storing.
//
// NOTE(review): only the URL scheme is validated here. The host is not checked, so
// this endpoint will fetch internal/private addresses if asked to (SSRF). Consider
// an allowlist or rejecting private ranges before calling fetchUrlContent.
blogRouter.post("/from-url", async (req: Request, res: Response) => {
  // req.body can be undefined when no body parser populated it; default to {} so
  // validation answers 400 instead of throwing outside the try/catch below.
  const body = (req.body ?? {}) as { url?: unknown; topic?: unknown };
  const rawUrl = body.url;
  if (!rawUrl) {
    res.status(400).json({ success: false, error: "url ist erforderlich" });
    return;
  }

  // Validate URL — must be a string with http/https scheme
  let parsedUrl: URL;
  let url: string;
  try {
    // Reject non-strings explicitly: `new URL(["http://x"])` would otherwise coerce and pass.
    if (typeof rawUrl !== "string") throw new Error("url must be a string");
    parsedUrl = new URL(rawUrl);
    if (!["http:", "https:"].includes(parsedUrl.protocol)) throw new Error("bad protocol");
    url = rawUrl;
  } catch {
    res.status(400).json({ success: false, error: "Ungültige URL — muss http:// oder https:// beginnen" });
    return;
  }

  const rawTopic = body.topic || "technology_deep_dive";
  // Non-string topics map to "" so they fall through to the 400 below.
  const selectedTopic = typeof rawTopic === "string" ? rawTopic : "";
  // Own-property lookup only: a plain index would resolve inherited keys such as
  // "constructor" or "__proto__", pass the guard, and crash later with a 500.
  const templates = Object.prototype.hasOwnProperty.call(BLOG_TEMPLATES, selectedTopic)
    ? BLOG_TEMPLATES[selectedTopic]
    : undefined;
  if (!templates) {
    res.status(400).json({ success: false, error: `Ungültiger Blog-Typ. Gültig: ${Object.keys(BLOG_TEMPLATES).join(", ")}` });
    return;
  }

  try {
    // Fetch page content server-side (no CORS issues)
    const { pageTitle, text: extractedText } = await fetchUrlContent(url);
    console.log(`Blog from-url: fetched "${pageTitle}" from ${parsedUrl.hostname} (${extractedText.length} chars)`);

    // Build a rich additional_context from the URL content
    const additionalContext =
      `SOURCE URL: ${url}\n` +
      `PAGE TITLE: ${pageTitle}\n` +
      `HOSTNAME: ${parsedUrl.hostname}\n` +
      `\n--- EXTRACTED PAGE CONTENT ---\n` +
      `${extractedText}\n` +
      `--- END PAGE CONTENT ---\n` +
      `\nIMPORTANT: Use this content as factual background and editorial direction. ` +
      `Do NOT copy sentences verbatim. Write a Flexoptix-voice blog article using these facts and insights.`;

    // Fall back to the hostname when the page exposes no usable title.
    const title = pageTitle || parsedUrl.hostname;
    const template = templates[Math.floor(Math.random() * templates.length)];
    const keywords = [
      ...template.seo_keywords,
      "optical transceiver", "networking",
    ].filter(Boolean);

    const data = await gatherBlogData(keywords, selectedTopic);
    const draftContent = generateTemplateDraft(title, selectedTopic, data);
    const wordCount = draftContent.split(/\s+/).length;
    const initialIssues = validateArticle(draftContent);

    const result = await pool.query(
      `INSERT INTO blog_drafts (title, topic, target_audience, outline, draft_content, data_sources, status, generated_by, word_count, seo_keywords)
VALUES ($1, $2, $3, $4, $5, $6, 'draft', $7, $8, $9)
RETURNING id, created_at`,
      [
        title,
        selectedTopic,
        template.target_audience,
        JSON.stringify({ generation_method: "from-url", source_url: url, quality_issues: initialIssues }),
        draftContent,
        JSON.stringify({ source_url: url, extracted_chars: extractedText.length, products: data.products.length, news: data.news.length }),
        "tip-blog-engine-url",
        wordCount,
        template.seo_keywords,
      ],
    );
    const draftId = result.rows[0].id;

    // Launch LLM pipeline with URL content as context.
    // A failed health check is not an error: the template draft stays stored and
    // the response reports llm_enhancing=false.
    const health = await checkHealth().catch(() => ({ ok: false, model: "", error: "unreachable" }));
    let llmStarted = false;
    if (health.ok) {
      llmStarted = true;
      // Intentionally not awaited — the pipeline continues after the response is sent.
      enqueueLlmPipeline(draftId, title, selectedTopic, template.target_audience, data, additionalContext).catch((err) => {
        console.error(`Blog from-url LLM pipeline error: ${err instanceof Error ? err.message : String(err)}`);
      });
    }

    res.json({
      success: true,
      source_url: url,
      page_title: pageTitle,
      extracted_chars: extractedText.length,
      draft: {
        id: draftId,
        title,
        topic: selectedTopic,
        target_audience: template.target_audience,
        word_count: wordCount,
        generation_method: "from-url",
        llm_enhancing: llmStarted,
        created_at: result.rows[0].created_at,
      },
    });
  } catch (err) {
    // Thrown values are not guaranteed to be Error instances.
    const msg = err instanceof Error ? err.message : String(err);
    console.error(`Blog from-url error for ${url}: ${msg}`);
    res.status(500).json({
      success: false,
      error: `URL konnte nicht verarbeitet werden: ${msg}`,
    });
  }
});
// GET /api/blog — List all drafts
blogRouter.get("/", async (_req: Request, res: Response) => {
try {
@ -1557,6 +1722,17 @@ blogRouter.post("/llm/reset-queue", (_req: Request, res: Response) => {
res.json({ success: true, message: "Ollama queue reset — stuck requests cleared" });
});
// POST /api/blog/llm/refresh-discovery — Force auto-discovery to pick up newly-trained fo-blog-v* versions
// Useful right after Magatama adopts a new fo-blog-vN model. Otherwise runs every 10 min by itself.
// Responds with the active provider returned by refreshLlmAutoDiscovery(), or 500 plus the error message.
blogRouter.post("/llm/refresh-discovery", async (_req: Request, res: Response) => {
  try {
    const active = await refreshLlmAutoDiscovery();
    // The model name is appended in parentheses only when one is set.
    const modelSuffix = active.ollamaModel ? ` (${active.ollamaModel})` : "";
    const message = `Auto-discovery refreshed. Active: ${active.provider}${modelSuffix}`;
    res.json({ success: true, active, message });
  } catch (err) {
    const error = (err as Error).message;
    res.status(500).json({ success: false, error });
  }
});
// POST /api/blog/llm/switch — Switch active LLM provider at runtime (no restart needed)
// Body: { provider: "claude-code" | "anthropic" | "ollama", model?: "fo-blog-v10" | "qwen2.5:14b" | ... }
blogRouter.post("/llm/switch", (req: Request, res: Response) => {

View File

@ -1439,6 +1439,40 @@
<button onclick="generateBlogManual()" style="background:rgba(99,102,241,0.85);color:#fff;border:none;padding:8px 20px;border-radius:6px;cursor:pointer;font-size:0.82rem;font-weight:600">⚙️ Artikel generieren</button>
</div><!-- end manual generation -->
<!-- URL → BLOG PANEL -->
<!-- Lets the user paste a link and pick a blog type; generateBlogFromUrl() reads both
     fields by id and writes progress/errors into #blog-from-url-status. -->
<div class="card" style="margin-bottom:1.25rem;border:1px solid rgba(16,185,129,0.35);background:var(--surface2)">
  <div style="font-size:0.85rem;font-weight:700;color:var(--text-bright);margin-bottom:0.1rem">🔗 Blog aus URL generieren</div>
  <div style="font-size:0.7rem;color:var(--text-dim);margin-bottom:0.75rem">Link eingeben → Inhalt wird automatisch extrahiert → BlogLLM schreibt einen Artikel daraus</div>
  <div style="display:grid;grid-template-columns:1fr auto;gap:0.6rem;margin-bottom:0.65rem;align-items:end">
    <div>
      <!-- "for" ties the label to its control so clicking it focuses the field and
           assistive technology announces the field name. -->
      <label for="blog-from-url-input" style="font-size:0.7rem;color:var(--text-dim);display:block;margin-bottom:3px">URL</label>
      <input type="url" id="blog-from-url-input" placeholder="https://example.com/article-about-400g-transceivers"
        style="width:100%;background:var(--surface);border:1px solid rgba(16,185,129,0.4);color:var(--text);padding:6px 10px;border-radius:6px;font-size:0.82rem;box-sizing:border-box"
        onkeydown="if(event.key==='Enter')generateBlogFromUrl()">
    </div>
    <div>
      <label for="blog-from-url-topic" style="font-size:0.7rem;color:var(--text-dim);display:block;margin-bottom:3px">Blog-Typ</label>
      <select id="blog-from-url-topic" style="background:var(--surface);border:1px solid rgba(16,185,129,0.4);color:var(--text);padding:6px 10px;border-radius:6px;font-size:0.82rem;height:32px">
        <option value="technology_deep_dive">Technology Deep Dive</option>
        <option value="tutorial">Troubleshooting Tutorial</option>
        <option value="migration_guide">Migration Guide</option>
        <option value="market_alert">Market Alert</option>
        <option value="buying_guide">Buying Guide</option>
        <option value="comparison">Product Comparison</option>
        <option value="competitor_analysis">Competitor Analysis</option>
        <option value="hype_cycle">Hype Cycle / Strategy</option>
      </select>
    </div>
  </div>
  <div style="display:flex;align-items:center;gap:0.75rem">
    <!-- Explicit type="button": a button without a type acts as a submit button if it ever ends up inside a form. -->
    <button type="button" onclick="generateBlogFromUrl()" id="blog-from-url-btn"
      style="background:rgba(16,185,129,0.85);color:#fff;border:none;padding:8px 20px;border-radius:6px;cursor:pointer;font-size:0.82rem;font-weight:600">
      🔗 Aus URL generieren
    </button>
    <span id="blog-from-url-status" style="font-size:0.75rem;color:var(--text-dim)"></span>
  </div>
</div><!-- end url→blog panel -->
<!-- SLL INSIGHTS WIDGET -->
<div class="card" style="margin-bottom:1rem;border:1px solid rgba(212,163,115,0.3);background:var(--surface2)">
<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:0.75rem">
@ -5503,6 +5537,48 @@ function generateBlogManual() {
}).catch(function(err) { if (err.message !== 'Unauthorized') showToast('Netzwerkfehler', err.message, true); });
}
/**
 * Submit the "Blog aus URL generieren" panel.
 *
 * Reads the URL and blog type from the panel, POSTs them to /api/blog/from-url,
 * and reflects progress in the button label and the inline status field. On
 * success the draft list is reloaded; step polling via pollBlogLlm() is started
 * only when the server reports that the LLM pipeline was actually launched.
 */
function generateBlogFromUrl() {
  var input = document.getElementById('blog-from-url-input');
  var url = (input.value || '').trim();
  var topic = document.getElementById('blog-from-url-topic').value || 'technology_deep_dive';
  var btn = document.getElementById('blog-from-url-btn');
  var status = document.getElementById('blog-from-url-status');

  // Pressing Enter in the URL field calls this function directly, so the disabled
  // button alone does not prevent a second request while one is still in flight.
  if (btn.disabled) return;

  if (!url) { showToast('Fehler', 'Bitte eine URL eingeben', true); return; }
  try { new URL(url); } catch (e) { showToast('Fehler', 'Ungültige URL', true); return; }

  // Restore the idle button state once the request has settled.
  function resetButton() {
    btn.disabled = false;
    btn.textContent = '🔗 Aus URL generieren';
  }

  btn.disabled = true;
  btn.textContent = '⏳ Fetching…';
  status.textContent = 'Seite wird abgerufen…';

  var token = window.loadToken ? window.loadToken() : '';
  fetch(API + '/api/blog/from-url', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json', 'Authorization': 'Bearer ' + token },
    body: JSON.stringify({ url: url, topic: topic })
  })
    .then(function(r) { if (r.status === 401) { handleAuthError(401); throw new Error('Unauthorized'); } return r.json(); })
    .then(function(data) {
      resetButton();
      if (!data.success) {
        status.textContent = '✗ Fehler: ' + (data.error || 'Unbekannt');
        showToast('Fehler', data.error || 'Unbekannter Fehler', true);
        return;
      }

      var draft = data.draft || {};
      input.value = '';
      loadBlogDrafts();

      // The server sets llm_enhancing=false when the LLM was unreachable; in that case
      // only the template draft exists, so do not promise a pipeline or poll for it.
      if (draft.llm_enhancing === false) {
        status.textContent = '✓ ' + data.extracted_chars + ' Zeichen extrahiert — Entwurf gespeichert, LLM nicht erreichbar';
        showToast('Entwurf gespeichert', (data.page_title || url) + ' — LLM nicht erreichbar, nur Template-Entwurf', true);
        return;
      }

      status.textContent = '✓ ' + data.extracted_chars + ' Zeichen extrahiert — Pipeline läuft (~10 min)';
      showToast('✓ Pipeline gestartet', (data.page_title || url) + ' — wird in ~10 min fertig');
      // Guard against a response without a draft id rather than throwing into .catch().
      if (draft.id !== undefined && draft.id !== null) pollBlogLlm(draft.id, 0);
    })
    .catch(function(err) {
      resetButton();
      status.textContent = '';
      if (err.message !== 'Unauthorized') showToast('Netzwerkfehler', err.message, true);
    });
}
function pollBlogLlm(id, attempt) {
if (attempt > 60) return; // max 10 min (60 × 10s)
setTimeout(function() {