Rene Fichtmueller 4e813024f1 fix: serialize Claude API calls via queue to prevent 429 rate-limit spam
Tier-1 Anthropic API has 40K TPM — with ~20K tokens per pipeline step,
concurrent calls immediately hit the limit. enqueueClaude() serializes
all generateClaude() calls so only one runs at a time, eliminating
the flood of 429-retry-429-retry loops.
2026-04-06 00:57:03 +02:00

292 lines
10 KiB
TypeScript

/**
* LLM client for blog generation — supports Ollama (local) and Anthropic Claude (API).
*
* Provider selection:
* BLOG_LLM_PROVIDER=anthropic → Claude Sonnet/Haiku via Anthropic API
* BLOG_LLM_PROVIDER=ollama → qwen2.5 on local Ollama (default)
*
* Claude is strongly recommended for blog generation — qwen2.5:14b cannot
* follow complex multi-constraint prompts (mode collapse).
*/
// Provider configuration, read once from the environment at module load.
// `||` (not `??`) is used on purpose here: an empty-string value falls back
// to the default just like an unset variable does.

/** Base URL of the Ollama HTTP server. */
const OLLAMA_URL = process.env["OLLAMA_URL"] || "http://localhost:11434";
/** Model name sent to Ollama for both generate and chat requests. */
const LLM_MODEL = process.env["OLLAMA_LLM_MODEL"] || "qwen2.5:14b";
/** Anthropic API key; the empty string means "not configured". */
const ANTHROPIC_API_KEY = process.env["ANTHROPIC_API_KEY"] || "";
/** Model name sent to the Anthropic Messages API. */
const ANTHROPIC_MODEL = process.env["ANTHROPIC_MODEL"] || "claude-sonnet-4-20250514";
/** Selected provider; any value other than "anthropic" routes to Ollama. */
const BLOG_LLM_PROVIDER = process.env["BLOG_LLM_PROVIDER"] || "ollama";
/**
 * Provider-independent result of a single LLM call.
 * Field semantics follow Ollama's response; the Claude provider converts
 * its own values into the same shape.
 */
interface LlmResponse {
/** Generated text (for Claude: all "text" content blocks concatenated). */
text: string;
/** Model name as reported back by the provider. */
model: string;
/** Duration in nanoseconds (the Claude provider converts its measured milliseconds to ns). */
totalDuration: number;
/** Ollama's `eval_count`, or the output-token count for Claude. */
evalCount: number;
}
/**
 * Pause for the given number of milliseconds.
 *
 * @param ms - Delay passed to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
// ═══════════════════════════════════════════════════════
// ANTHROPIC CLAUDE PROVIDER
// ═══════════════════════════════════════════════════════
// Claude API calls are serialized to stay within the tokens-per-minute limit.
// Tier-1 allows 40,000 TPM; at roughly 20K tokens per pipeline step only one
// call at a time is safe.

/** Tail of the promise chain; each new task is chained behind it. */
let claudeQueue: Promise<unknown> = Promise.resolve();

/**
 * Run `fn` after every previously enqueued task has settled.
 *
 * @param fn - Task to run; it is invoked with no arguments once its turn comes.
 * @returns A promise that settles exactly like the promise returned by `fn`.
 */
function enqueueClaude<T>(fn: () => Promise<T>): Promise<T> {
  const task: Promise<T> = claudeQueue.then(() => fn());
  // The stored tail must never reject, otherwise one failed task would make
  // every later `.then` skip its callback. The caller still sees the failure
  // through `task`.
  const ignore = (): void => {};
  claudeQueue = task.then(ignore, ignore);
  return task;
}
/**
 * Generate a completion with Anthropic Claude through the Messages API.
 *
 * The whole call, including any rate-limit retries, runs as a single task on
 * the Claude queue, so at most one request is in flight at a time.
 *
 * @param systemPrompt - Sent as the request's `system` field.
 * @param userPrompt - Sent as the single user message.
 * @param options - `temperature` (default 0.7), `maxTokens` (default 4096) and
 *   `timeoutMs` (default 300000), the timeout applied to each HTTP attempt.
 * @returns The concatenated text blocks, the model name reported by the API,
 *   the duration of the successful attempt in nanoseconds, and the output-token
 *   count as `evalCount`.
 * @throws Error when ANTHROPIC_API_KEY is not set, or when the API answers with
 *   a non-OK status (a 429 only after all retries are used up). Errors raised
 *   by `fetch` itself, including the timeout abort, propagate unchanged.
 */
async function generateClaude(
  systemPrompt: string,
  userPrompt: string,
  options?: { temperature?: number; maxTokens?: number; timeoutMs?: number },
): Promise<LlmResponse> {
  if (!ANTHROPIC_API_KEY) {
    throw new Error("ANTHROPIC_API_KEY not set — cannot use Claude provider");
  }

  // Waits between attempts after a 429 (70 s in total, i.e. longer than the
  // one-minute window of a tokens-per-minute limit).
  const retryDelaysMs = [10000, 20000, 40000];

  return enqueueClaude(async () => {
    // Retries MUST stay inside this queued task. Calling generateClaude()
    // again from here would enqueue the retry behind the very task that is
    // awaiting it, so neither promise could ever settle and the queue would
    // hang for every later caller.
    for (let attempt = 0; ; attempt++) {
      // Measured per attempt so the reported duration excludes back-off sleeps.
      const startTime = Date.now();
      const resp = await fetch("https://api.anthropic.com/v1/messages", {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "x-api-key": ANTHROPIC_API_KEY,
          "anthropic-version": "2023-06-01",
        },
        body: JSON.stringify({
          model: ANTHROPIC_MODEL,
          max_tokens: options?.maxTokens ?? 4096,
          temperature: options?.temperature ?? 0.7,
          system: systemPrompt,
          messages: [{ role: "user", content: userPrompt }],
        }),
        signal: AbortSignal.timeout(options?.timeoutMs ?? 300000),
      });

      if (!resp.ok) {
        // Reading the body also releases the connection before a retry.
        const errText = await resp.text();
        const delayMs = retryDelaysMs[attempt];
        if (resp.status === 429 && delayMs !== undefined) {
          console.log(
            `[LLM] Claude 429 — retrying in ${delayMs / 1000}s (attempt ${attempt + 1}/${retryDelaysMs.length})...`,
          );
          await sleep(delayMs);
          continue;
        }
        throw new Error(`Claude API failed: ${resp.status} ${errText.slice(0, 200)}`);
      }

      const data = await resp.json() as {
        content: Array<{ type: string; text: string }>;
        model: string;
        usage: { input_tokens: number; output_tokens: number };
      };
      // Only "text" blocks carry generated prose; other block types are skipped.
      const text = data.content
        .filter((c) => c.type === "text")
        .map((c) => c.text)
        .join("");
      const duration = Date.now() - startTime;
      console.log(`[LLM] Claude ${data.model}: ${data.usage.input_tokens}+${data.usage.output_tokens} tokens, ${duration}ms`);
      return {
        text,
        model: data.model,
        totalDuration: duration * 1_000_000, // ms → ns, matching Ollama's unit
        evalCount: data.usage.output_tokens,
      };
    }
  });
}
// ═══════════════════════════════════════════════════════
// OLLAMA PROVIDER (existing)
// ═══════════════════════════════════════════════════════
/** Longest time (15 minutes) a request may wait for its turn before it is rejected instead of run. */
const OLLAMA_QUEUE_STALL_MS = 15 * 60 * 1000;
/** Tail of the promise chain that serializes Ollama requests. */
let ollamaQueue: Promise<unknown> = Promise.resolve();
/** Number of requests that have been enqueued and have not settled yet. */
let queueDepth = 0;

/**
 * Start a fresh, empty queue and zero the depth counter.
 *
 * Requests already chained on the previous queue are not cancelled; they are
 * simply no longer waited for by requests enqueued after the reset.
 */
export function resetOllamaQueue(): void {
  ollamaQueue = Promise.resolve();
  queueDepth = 0;
  console.log("[LLM] Queue reset — previous stuck requests cleared");
}

/** @returns The number of Ollama requests currently waiting or running. */
export function getQueueDepth(): number {
  return queueDepth;
}

/**
 * Run `fn` once every previously enqueued Ollama request has settled.
 *
 * If this request had to wait longer than OLLAMA_QUEUE_STALL_MS for its turn,
 * `fn` is not invoked and the returned promise rejects instead.
 *
 * @param fn - Task to run; invoked with no arguments when its turn comes.
 * @returns A promise that settles like `fn`'s promise, or rejects with
 *   "Queue auto-reset: previous request timed out" when the wait was too long.
 */
function enqueueOllama<T>(fn: () => Promise<T>): Promise<T> {
  queueDepth++;
  // Captured per request: a shared "last enqueue" timestamp would be refreshed
  // by every later caller and hide how long THIS request has been waiting.
  const enqueuedAt = Date.now();

  const result = ollamaQueue.then(() => {
    if (Date.now() - enqueuedAt > OLLAMA_QUEUE_STALL_MS) {
      console.warn("[LLM] Queue auto-reset after 15min stall");
      throw new Error("Queue auto-reset: previous request timed out");
    }
    return fn();
  });

  // Exactly one decrement per request, whatever the outcome. The clamp keeps
  // the counter non-negative when resetOllamaQueue() zeroed it in the meantime.
  const release = (): void => {
    queueDepth = Math.max(0, queueDepth - 1);
  };
  // The stored tail never rejects, so one failure cannot block later requests.
  ollamaQueue = result.then(release, release);
  return result;
}
/**
 * Generate a completion with the local Ollama server (`/api/generate`).
 *
 * The request runs as one task on the Ollama queue. A 429 answer is retried
 * inside that task after 15 s, 30 s and 60 s before giving up.
 *
 * @param systemPrompt - Sent as the request's `system` field.
 * @param userPrompt - Sent as the request's `prompt` field.
 * @param options - `temperature` (default 0.7), `maxTokens` (default 4096,
 *   sent as `num_predict`) and `timeoutMs` (default 300000) per HTTP attempt.
 * @returns Ollama's response text, model name, total duration and eval count.
 * @throws Error on a non-OK status, or on 429 once all retries are exhausted.
 */
async function generateOllama(
  systemPrompt: string,
  userPrompt: string,
  options?: { temperature?: number; maxTokens?: number; timeoutMs?: number },
): Promise<LlmResponse> {
  const runWithRetries = async (): Promise<LlmResponse> => {
    const backoffMs = [15000, 30000, 60000];
    const maxRetries = backoffMs.length;

    for (let retry = 0; retry <= maxRetries; retry += 1) {
      // Every attempt after the first is preceded by its back-off wait.
      if (retry > 0) {
        const waitMs = backoffMs[retry - 1];
        console.log(`Blog LLM: 429 rate-limit — retrying in ${waitMs / 1000}s (attempt ${retry}/${maxRetries})`);
        await sleep(waitMs);
      }

      const requestBody = JSON.stringify({
        model: LLM_MODEL,
        prompt: userPrompt,
        system: systemPrompt,
        stream: false,
        options: {
          temperature: options?.temperature ?? 0.7,
          num_predict: options?.maxTokens ?? 4096,
        },
      });
      const resp = await fetch(`${OLLAMA_URL}/api/generate`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: requestBody,
        signal: AbortSignal.timeout(options?.timeoutMs ?? 300000),
      });

      if (resp.status === 429) {
        if (retry === maxRetries) {
          throw new Error(`Ollama generate failed: 429 Too Many Requests (all retries exhausted)`);
        }
        continue;
      }
      if (!resp.ok) {
        const detail = await resp.text();
        throw new Error(`Ollama generate failed: ${resp.status} ${detail}`);
      }

      const payload = await resp.json() as {
        response: string;
        model: string;
        total_duration: number;
        eval_count: number;
      };
      const { response, model, total_duration: totalDuration, eval_count: evalCount } = payload;
      return { text: response, model, totalDuration, evalCount };
    }

    // The last iteration always returns or throws; this only satisfies the type checker.
    throw new Error("Ollama generate: unreachable");
  };

  return enqueueOllama(runWithRetries);
}
// ═══════════════════════════════════════════════════════
// PUBLIC API — auto-routes to configured provider
// ═══════════════════════════════════════════════════════
/**
 * Generate a completion with the configured provider.
 *
 * Claude is used only when BLOG_LLM_PROVIDER is "anthropic" AND an API key is
 * set; in every other case, including a missing key, the call goes to Ollama.
 *
 * @param systemPrompt - System instructions for the model.
 * @param userPrompt - The user prompt.
 * @param options - Optional `temperature`, `maxTokens` and `timeoutMs`,
 *   forwarded unchanged to the selected provider.
 * @returns The provider's response.
 */
export async function generate(
  systemPrompt: string,
  userPrompt: string,
  options?: { temperature?: number; maxTokens?: number; timeoutMs?: number },
): Promise<LlmResponse> {
  const useClaude = BLOG_LLM_PROVIDER === "anthropic" && Boolean(ANTHROPIC_API_KEY);
  const provider = useClaude ? generateClaude : generateOllama;
  return provider(systemPrompt, userPrompt, options);
}
/**
 * Chat-style generation with message history.
 *
 * Always talks to Ollama (`/api/chat`) through the Ollama queue, regardless of
 * BLOG_LLM_PROVIDER. The HTTP request times out after 300000 ms.
 *
 * @param messages - Conversation so far, sent to Ollama as-is.
 * @param options - `temperature` (default 0.7) and `maxTokens` (default 4096,
 *   sent as `num_predict`).
 * @returns The assistant message content plus Ollama's model and timing fields.
 * @throws Error when Ollama answers with a non-OK status.
 */
export async function chat(
  messages: ReadonlyArray<{ role: "system" | "user" | "assistant"; content: string }>,
  options?: { temperature?: number; maxTokens?: number },
): Promise<LlmResponse> {
  const runChat = async (): Promise<LlmResponse> => {
    // Built when the task actually runs, not when it is enqueued.
    const requestBody = JSON.stringify({
      model: LLM_MODEL,
      messages,
      stream: false,
      options: {
        temperature: options?.temperature ?? 0.7,
        num_predict: options?.maxTokens ?? 4096,
      },
    });
    const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: requestBody,
      signal: AbortSignal.timeout(300000),
    });

    if (!resp.ok) {
      const detail = await resp.text();
      throw new Error(`Ollama chat failed: ${resp.status} ${detail}`);
    }

    const payload = await resp.json() as {
      message: { content: string };
      model: string;
      total_duration: number;
      eval_count: number;
    };
    const { message, model, total_duration: totalDuration, eval_count: evalCount } = payload;
    return { text: message.content, model, totalDuration, evalCount };
  };

  return enqueueOllama(runChat);
}
/**
 * Check whether the configured LLM provider is available.
 *
 * - Anthropic (BLOG_LLM_PROVIDER is "anthropic" and an API key is set): sends
 *   a minimal 5-token Messages request with a 10 s timeout. This request does
 *   not go through the Claude queue.
 * - Ollama (every other case): lists the installed models via `/api/tags`
 *   (5 s timeout) and looks for the configured model.
 *
 * Never throws; every failure is reported through `ok: false` and `error`.
 *
 * @returns `ok`, the configured model name, the provider that was checked,
 *   and an `error` description whenever `ok` is false.
 */
export async function checkHealth(): Promise<{ ok: boolean; model: string; provider: string; error?: string }> {
  const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  if (BLOG_LLM_PROVIDER === "anthropic" && ANTHROPIC_API_KEY) {
    try {
      // Quick validation: a tiny real request proves key, model and network work.
      const resp = await fetch("https://api.anthropic.com/v1/messages", {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "x-api-key": ANTHROPIC_API_KEY,
          "anthropic-version": "2023-06-01",
        },
        body: JSON.stringify({
          model: ANTHROPIC_MODEL,
          max_tokens: 5,
          messages: [{ role: "user", content: "hi" }],
        }),
        signal: AbortSignal.timeout(10000),
      });
      if (!resp.ok) {
        // Report the status, as the Ollama branch does, so callers can tell
        // a bad key (401) from rate limiting (429) or an outage (5xx).
        return { ok: false, model: ANTHROPIC_MODEL, provider: "anthropic", error: `HTTP ${resp.status}` };
      }
      return { ok: true, model: ANTHROPIC_MODEL, provider: "anthropic" };
    } catch (err) {
      return { ok: false, model: ANTHROPIC_MODEL, provider: "anthropic", error: describe(err) };
    }
  }

  try {
    const resp = await fetch(`${OLLAMA_URL}/api/tags`, { signal: AbortSignal.timeout(5000) });
    if (!resp.ok) return { ok: false, model: LLM_MODEL, provider: "ollama", error: `HTTP ${resp.status}` };
    const data = await resp.json() as { models?: Array<{ name: string }> };
    // Ollama lists models as "name:tag" and treats a missing tag as ":latest".
    // Matching the full name avoids reporting healthy when only a different
    // tag of the same model family (e.g. another size) is installed.
    const wanted = LLM_MODEL.includes(":") ? LLM_MODEL : `${LLM_MODEL}:latest`;
    const hasModel = (data.models ?? []).some((m) => m.name === wanted);
    return { ok: hasModel, model: LLM_MODEL, provider: "ollama", error: hasModel ? undefined : `Model ${LLM_MODEL} not found` };
  } catch (err) {
    return { ok: false, model: LLM_MODEL, provider: "ollama", error: describe(err) };
  }
}