fix(blog): SPA-aware URL blog generation + dynamic generated_by
- fetchUrlContent() now extracts OG/meta tags (og:title, og:description, name="description", og:site_name) as fallback content for JS-rendered SPAs
- fetchUrlContent() returns spaDetected=true when the body text is shorter than 300 chars after stripping scripts
- from-url endpoint skips gatherBlogData() product injection when an SPA is detected, preventing fo-blog-v10 from defaulting to the optical networking domain
- additionalContext now includes an SPA warning instructing the LLM not to default to optical transceiver topics unless the page is actually about that
- generated_by in the pipeline UPDATE query now uses the active model name (read from getLlmProvider().ollamaModel) instead of the hardcoded 'fo-blog-engine-v7'
- Dashboard shows an SPA warning toast when spa_detected=true in the response
- Response now includes a spa_detected field for client awareness
This commit is contained in:
parent
e0f9656684
commit
67310c8fe7
@ -1347,10 +1347,12 @@ async function runLlmPipeline(
|
||||
const finalIssues = validateArticle(draftContent);
|
||||
|
||||
// Update the draft in DB (title updated to generated headline if available)
|
||||
const pipelineModel = getLlmProvider();
|
||||
const pipelineGeneratedBy = `fo-blog-engine-${pipelineModel.ollamaModel || pipelineModel.provider || "llm"}`;
|
||||
await pool.query(
|
||||
`UPDATE blog_drafts
|
||||
SET title = $9, draft_content = $1, word_count = $2,
|
||||
generated_by = 'fo-blog-engine-v7',
|
||||
generated_by = $10,
|
||||
pipeline_version = 'v7',
|
||||
pipeline_steps_completed = $3,
|
||||
auto_qa_score = $4,
|
||||
@ -1377,6 +1379,7 @@ async function runLlmPipeline(
|
||||
linkedinCharCount,
|
||||
draftId,
|
||||
finalTitle,
|
||||
pipelineGeneratedBy,
|
||||
],
|
||||
);
|
||||
|
||||
@ -1528,8 +1531,17 @@ blogRouter.post("/generate", async (req: Request, res: Response) => {
|
||||
}
|
||||
});
|
||||
|
||||
/** Fetch a URL and extract readable text content for use as LLM context. */
|
||||
async function fetchUrlContent(rawUrl: string): Promise<{ pageTitle: string; text: string }> {
|
||||
/** Fetch a URL and extract readable text content for use as LLM context.
|
||||
* Returns spaDetected=true when extracted body text is thin (< 300 chars),
|
||||
* indicating a JavaScript Single Page Application where content is rendered client-side.
|
||||
* In that case, metaDesc contains OG/meta description fallback text.
|
||||
*/
|
||||
async function fetchUrlContent(rawUrl: string): Promise<{
|
||||
pageTitle: string;
|
||||
text: string;
|
||||
spaDetected: boolean;
|
||||
metaDesc: string;
|
||||
}> {
|
||||
const response = await fetch(rawUrl, {
|
||||
headers: { "User-Agent": "TIPBot/1.0 blog-from-url (+https://tip.flexoptix.net)" },
|
||||
signal: AbortSignal.timeout(20000),
|
||||
@ -1545,12 +1557,33 @@ async function fetchUrlContent(rawUrl: string): Promise<{ pageTitle: string; tex
|
||||
|
||||
const html = await response.text();
|
||||
|
||||
// Extract page title
|
||||
// --- Extract OG / meta tags for SPA fallback ---
|
||||
/**
 * Decode the small set of HTML entities that matter for titles and meta
 * descriptions, then trim surrounding whitespace.
 *
 * - The named entities for ampersand, less-than and greater-than are replaced
 *   by the character they stand for (lowercase names only).
 * - Numeric decimal references (`#` followed by digits) are dropped.
 * - Any other alphabetic named entity is replaced by a single space.
 *
 * Everything is handled in ONE pass over the input, so text produced by
 * decoding one entity is never re-interpreted as the start of another
 * (an escaped ampersand followed by `lt;` stays literal text instead of
 * collapsing into `<`).
 *
 * The entity names live in a lookup table rather than in regex literals so
 * that the source stays correct even if it is passed through an HTML
 * unescaping step.
 *
 * @param s Raw text captured from the HTML document.
 * @returns The decoded, trimmed text.
 */
const decodeEntities = (s: string) => {
  // Map (not a plain object) so names like "constructor" cannot hit the prototype.
  const namedEntities = new Map<string, string>([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
  ]);

  return s
    .replace(/&(#\d+|[a-z]+);/gi, (_whole: string, ref: string) => {
      // Numeric references are stripped rather than decoded.
      if (ref.startsWith("#")) return "";
      // Unknown (or differently-cased) named entities become a space.
      return namedEntities.get(ref) ?? " ";
    })
    .trim();
};
|
||||
|
||||
const ogTitle =
|
||||
html.match(/<meta[^>]+property=["']og:title["'][^>]+content=["']([^"']{1,200})["']/i)?.[1] ||
|
||||
html.match(/<meta[^>]+content=["']([^"']{1,200})["'][^>]+property=["']og:title["']/i)?.[1] || "";
|
||||
|
||||
const ogDesc =
|
||||
html.match(/<meta[^>]+property=["']og:description["'][^>]+content=["']([^"']{1,500})["']/i)?.[1] ||
|
||||
html.match(/<meta[^>]+content=["']([^"']{1,500})["'][^>]+property=["']og:description["']/i)?.[1] ||
|
||||
html.match(/<meta[^>]+name=["']description["'][^>]+content=["']([^"']{1,500})["']/i)?.[1] ||
|
||||
html.match(/<meta[^>]+content=["']([^"']{1,500})["'][^>]+name=["']description["']/i)?.[1] || "";
|
||||
|
||||
const ogSiteName =
|
||||
html.match(/<meta[^>]+property=["']og:site_name["'][^>]+content=["']([^"']{1,100})["']/i)?.[1] ||
|
||||
html.match(/<meta[^>]+content=["']([^"']{1,100})["'][^>]+property=["']og:site_name["']/i)?.[1] || "";
|
||||
|
||||
// Extract page title: prefer OG title, then <title>, then <h1>
|
||||
const titleMatch = html.match(/<title[^>]*>([^<]{1,200})<\/title>/i);
|
||||
const h1Match = html.match(/<h1[^>]*>([^<]{1,150})<\/h1>/i);
|
||||
const pageTitle = (titleMatch?.[1] || h1Match?.[1] || "")
|
||||
.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">")
|
||||
.replace(/&#\d+;/g, "").replace(/&[a-z]+;/gi, " ").trim();
|
||||
const pageTitle = decodeEntities(
|
||||
ogTitle || titleMatch?.[1] || h1Match?.[1] || ""
|
||||
);
|
||||
|
||||
const metaDesc = decodeEntities(ogDesc);
|
||||
|
||||
// Strip scripts, styles, SVG, navigation boilerplate
|
||||
let text = html
|
||||
@ -1580,7 +1613,19 @@ async function fetchUrlContent(rawUrl: string): Promise<{ pageTitle: string; tex
|
||||
text = text.slice(0, 5000) + "\n[… content truncated for LLM context …]";
|
||||
}
|
||||
|
||||
return { pageTitle, text };
|
||||
// Detect SPA: very little body text means JS renders the real content
|
||||
const spaDetected = text.length < 300;
|
||||
|
||||
// When SPA detected, enrich text with what we could extract from meta tags
|
||||
if (spaDetected && (metaDesc || ogSiteName)) {
|
||||
const parts: string[] = [];
|
||||
if (ogSiteName) parts.push(`Site: ${ogSiteName}`);
|
||||
if (pageTitle) parts.push(`Title: ${pageTitle}`);
|
||||
if (metaDesc) parts.push(`Description: ${metaDesc}`);
|
||||
text = parts.join("\n");
|
||||
}
|
||||
|
||||
return { pageTitle, text, spaDetected, metaDesc };
|
||||
}
|
||||
|
||||
// POST /api/blog/from-url — Fetch URL, extract content, generate a blog from it
|
||||
@ -1611,33 +1656,55 @@ blogRouter.post("/from-url", async (req: Request, res: Response) => {
|
||||
|
||||
try {
|
||||
// Fetch page content server-side (no CORS issues)
|
||||
const { pageTitle, text: extractedText } = await fetchUrlContent(url);
|
||||
const { pageTitle, text: extractedText, spaDetected, metaDesc } = await fetchUrlContent(url);
|
||||
|
||||
console.log(`Blog from-url: fetched "${pageTitle}" from ${parsedUrl.hostname} (${extractedText.length} chars)`);
|
||||
console.log(
|
||||
`Blog from-url: fetched "${pageTitle}" from ${parsedUrl.hostname} ` +
|
||||
`(${extractedText.length} chars${spaDetected ? ", SPA detected" : ""})`
|
||||
);
|
||||
|
||||
// Build a rich additional_context from the URL content.
|
||||
// When a SPA is detected (JS-rendered), body text is a shell — we rely on meta tags instead.
|
||||
const spaWarning = spaDetected
|
||||
? `\nNOTE: This URL is a JavaScript Single Page Application. Only meta/OG data was available ` +
|
||||
`server-side — the LLM should infer topic from the site name, title, and description above. ` +
|
||||
`Do NOT default to optical networking topics unless the page is actually about that.`
|
||||
: "";
|
||||
|
||||
// Build a rich additional_context from the URL content
|
||||
const additionalContext =
|
||||
`SOURCE URL: ${url}\n` +
|
||||
`PAGE TITLE: ${pageTitle}\n` +
|
||||
`HOSTNAME: ${parsedUrl.hostname}\n` +
|
||||
(metaDesc ? `META DESCRIPTION: ${metaDesc}\n` : "") +
|
||||
`\n--- EXTRACTED PAGE CONTENT ---\n` +
|
||||
`${extractedText}\n` +
|
||||
`${extractedText || "(No body text extractable — JavaScript-rendered SPA)"}\n` +
|
||||
`--- END PAGE CONTENT ---\n` +
|
||||
`\nIMPORTANT: Use this content as factual background and editorial direction. ` +
|
||||
spaWarning +
|
||||
`\n\nIMPORTANT: Use this content as factual background and editorial direction. ` +
|
||||
`The blog MUST be about the topic described above, NOT about optical transceivers or fiber unless explicitly relevant. ` +
|
||||
`Do NOT copy sentences verbatim. Write a Flexoptix-voice blog article using these facts and insights.`;
|
||||
|
||||
const title = pageTitle || parsedUrl.hostname;
|
||||
const template = templates[Math.floor(Math.random() * templates.length)];
|
||||
const keywords = [
|
||||
...template.seo_keywords,
|
||||
"optical transceiver", "networking",
|
||||
].filter(Boolean);
|
||||
|
||||
const data = await gatherBlogData(keywords, selectedTopic);
|
||||
// When SPA detected, skip optical transceiver product injection — it pollutes the LLM context
|
||||
// with irrelevant product data and causes the model to default to its fine-tuning domain.
|
||||
// Use empty data so the pipeline focuses purely on the URL context provided above.
|
||||
const keywords = spaDetected
|
||||
? [parsedUrl.hostname.replace(/^www\./, ""), pageTitle].filter(Boolean)
|
||||
: [...template.seo_keywords, "optical transceiver", "networking"].filter(Boolean);
|
||||
|
||||
const data = spaDetected
|
||||
? { products: [] as any[], news: [] as any[], faq: [] as any[], troubleshooting: [] as any[] }
|
||||
: await gatherBlogData(keywords, selectedTopic);
|
||||
|
||||
const draftContent = generateTemplateDraft(title, selectedTopic, data);
|
||||
const wordCount = draftContent.split(/\s+/).length;
|
||||
const initialIssues = validateArticle(draftContent);
|
||||
|
||||
const activeModel = getLlmProvider();
|
||||
const generatedBy = `tip-blog-from-url-${activeModel.ollamaModel || activeModel.provider || "llm"}`;
|
||||
|
||||
const result = await pool.query(
|
||||
`INSERT INTO blog_drafts (title, topic, target_audience, outline, draft_content, data_sources, status, generated_by, word_count, seo_keywords)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, 'draft', $7, $8, $9)
|
||||
@ -1646,10 +1713,10 @@ blogRouter.post("/from-url", async (req: Request, res: Response) => {
|
||||
title,
|
||||
selectedTopic,
|
||||
template.target_audience,
|
||||
JSON.stringify({ generation_method: "from-url", source_url: url, quality_issues: initialIssues }),
|
||||
JSON.stringify({ generation_method: "from-url", source_url: url, spa_detected: spaDetected, quality_issues: initialIssues }),
|
||||
draftContent,
|
||||
JSON.stringify({ source_url: url, extracted_chars: extractedText.length, products: data.products.length, news: data.news.length }),
|
||||
"tip-blog-engine-url",
|
||||
JSON.stringify({ source_url: url, extracted_chars: extractedText.length, spa_detected: spaDetected, products: data.products.length, news: data.news.length }),
|
||||
generatedBy,
|
||||
wordCount,
|
||||
template.seo_keywords,
|
||||
],
|
||||
@ -1672,6 +1739,7 @@ blogRouter.post("/from-url", async (req: Request, res: Response) => {
|
||||
source_url: url,
|
||||
page_title: pageTitle,
|
||||
extracted_chars: extractedText.length,
|
||||
spa_detected: spaDetected,
|
||||
draft: {
|
||||
id: draftId,
|
||||
title,
|
||||
|
||||
@ -5561,8 +5561,15 @@ function generateBlogFromUrl() {
|
||||
btn.disabled = false;
|
||||
btn.textContent = '🔗 Aus URL generieren';
|
||||
if (data.success) {
|
||||
status.textContent = '✓ ' + data.extracted_chars + ' Zeichen extrahiert — Pipeline läuft (~10 min)';
|
||||
var spaNote = data.spa_detected
|
||||
? ' ⚠️ SPA erkannt — Inhalt nur via Meta-Tags (JS-gerendert)'
|
||||
: ' ✓ ' + data.extracted_chars + ' Zeichen extrahiert';
|
||||
status.textContent = spaNote + ' — Pipeline läuft (~10 min)';
|
||||
if (data.spa_detected) {
|
||||
showToast('⚠️ SPA erkannt', (data.page_title || url) + ' — JavaScript-Seite, Inhalt via Meta-Tags. Pipeline läuft.');
|
||||
} else {
|
||||
showToast('✓ Pipeline gestartet', (data.page_title || url) + ' — wird in ~10 min fertig');
|
||||
}
|
||||
document.getElementById('blog-from-url-input').value = '';
|
||||
loadBlogDrafts();
|
||||
pollBlogLlm(data.draft.id, 0);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user