From bf34096d480f94f5fba95954d4eb0e4dc1bb633e Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Tue, 31 Mar 2026 21:45:46 +0200 Subject: [PATCH] fix(llm): add 429 retry with exponential backoff + ollamaQueue concurrency guard --- packages/api/src/llm/client.ts | 172 ++++++++++++++++++++------------- 1 file changed, 106 insertions(+), 66 deletions(-) diff --git a/packages/api/src/llm/client.ts b/packages/api/src/llm/client.ts index 7463549..b74b327 100644 --- a/packages/api/src/llm/client.ts +++ b/packages/api/src/llm/client.ts @@ -3,6 +3,7 @@ * * Uses qwen2.5:14b on Mac Studio (.213) for text generation. * Supports streaming and non-streaming modes. + * Includes 429 retry with exponential backoff + server-side concurrency guard. */ const OLLAMA_URL = process.env.OLLAMA_URL || "http://localhost:11434"; @@ -15,46 +16,83 @@ interface LlmResponse { evalCount: number; } -/** Generate text from a system prompt + user prompt */ +/** Sleep helper */ +function sleep(ms: number): Promise<void> { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +/** + * Server-side concurrency guard — Ollama processes one generation at a time. + * Queue ensures sequential execution even with multiple concurrent API requests.
+ */ +let ollamaQueue: Promise<unknown> = Promise.resolve(); + +function enqueueOllama<T>(fn: () => Promise<T>): Promise<T> { + const result = ollamaQueue.then(fn); + // Keep queue alive even if fn throws (attach no-op error handler on chain) + ollamaQueue = result.catch(() => {}); + return result; +} + +/** Generate text from a system prompt + user prompt — with 429 retry/backoff + queue */ export async function generate( systemPrompt: string, userPrompt: string, options?: { temperature?: number; maxTokens?: number; timeoutMs?: number }, ): Promise<LlmResponse> { - const resp = await fetch(`${OLLAMA_URL}/api/generate`, { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ - model: LLM_MODEL, - prompt: userPrompt, - system: systemPrompt, - stream: false, - options: { - temperature: options?.temperature ?? 0.7, - num_predict: options?.maxTokens ?? 4096, - }, - }), - signal: AbortSignal.timeout(options?.timeoutMs ?? 180000), + return enqueueOllama(async () => { + const RETRY_DELAYS = [15000, 30000, 60000]; // 15s, 30s, 60s + + for (let attempt = 0; attempt <= RETRY_DELAYS.length; attempt++) { + if (attempt > 0) { + const delay = RETRY_DELAYS[attempt - 1]; + console.log(`Blog LLM: 429 rate-limit — retrying in ${delay / 1000}s (attempt ${attempt}/${RETRY_DELAYS.length})`); + await sleep(delay); + } + + const resp = await fetch(`${OLLAMA_URL}/api/generate`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + model: LLM_MODEL, + prompt: userPrompt, + system: systemPrompt, + stream: false, + options: { + temperature: options?.temperature ?? 0.7, + num_predict: options?.maxTokens ?? 4096, + }, + }), + signal: AbortSignal.timeout(options?.timeoutMs ??
300000), + }); + + if (resp.status === 429) { + if (attempt < RETRY_DELAYS.length) continue; + throw new Error(`Ollama generate failed: 429 Too Many Requests (all retries exhausted)`); + } + + if (!resp.ok) { + const errText = await resp.text(); + throw new Error(`Ollama generate failed: ${resp.status} ${errText}`); + } + + const data = await resp.json() as { + response: string; + model: string; + total_duration: number; + eval_count: number; + }; + + return { + text: data.response, + model: data.model, + totalDuration: data.total_duration, + evalCount: data.eval_count, + }; + } + + throw new Error("Ollama generate: unreachable"); }); - - if (!resp.ok) { - const errText = await resp.text(); - throw new Error(`Ollama generate failed: ${resp.status} ${errText}`); - } - - const data = await resp.json() as { - response: string; - model: string; - total_duration: number; - eval_count: number; - }; - - return { - text: data.response, - model: data.model, - totalDuration: data.total_duration, - evalCount: data.eval_count, - }; } /** Chat-style generation with message history */ @@ -62,39 +100,41 @@ export async function chat( messages: ReadonlyArray<{ role: "system" | "user" | "assistant"; content: string }>, options?: { temperature?: number; maxTokens?: number }, ): Promise<LlmResponse> { - const resp = await fetch(`${OLLAMA_URL}/api/chat`, { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ - model: LLM_MODEL, - messages, - stream: false, - options: { - temperature: options?.temperature ?? 0.7, - num_predict: options?.maxTokens ?? 4096, - }, - }), - signal: AbortSignal.timeout(120000), + return enqueueOllama(async () => { + const resp = await fetch(`${OLLAMA_URL}/api/chat`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + model: LLM_MODEL, + messages, + stream: false, + options: { + temperature: options?.temperature ?? 0.7, + num_predict: options?.maxTokens ??
4096, + }, + }), + signal: AbortSignal.timeout(300000), + }); + + if (!resp.ok) { + const errText = await resp.text(); + throw new Error(`Ollama chat failed: ${resp.status} ${errText}`); + } + + const data = await resp.json() as { + message: { content: string }; + model: string; + total_duration: number; + eval_count: number; + }; + + return { + text: data.message.content, + model: data.model, + totalDuration: data.total_duration, + evalCount: data.eval_count, + }; }); - - if (!resp.ok) { - const errText = await resp.text(); - throw new Error(`Ollama chat failed: ${resp.status} ${errText}`); - } - - const data = await resp.json() as { - message: { content: string }; - model: string; - total_duration: number; - eval_count: number; - }; - - return { - text: data.message.content, - model: data.model, - totalDuration: data.total_duration, - evalCount: data.eval_count, - }; } /** Check if Ollama is available and model is loaded */