fix(llm): add 429 retry with exponential backoff + ollamaQueue concurrency guard

2026-03-31 21:45:46 +02:00 · 2026-03-31 21:45:46 +02:00 · ef0b0bb148
commit ef0b0bb148
parent 01ad16464d
1 changed files with 106 additions and 66 deletions
--- a/packages/api/src/llm/client.ts
+++ b/packages/api/src/llm/client.ts
@ -3,6 +3,7 @@
 *
 * Uses qwen2.5:14b on Mac Studio (.213) for text generation.
 * Supports streaming and non-streaming modes.
+ * Includes 429 retry with exponential backoff + server-side concurrency guard.
 */

 const OLLAMA_URL = process.env.OLLAMA_URL || "http://localhost:11434";
@ -15,46 +16,83 @@ interface LlmResponse {
  evalCount: number;
 }

-/** Generate text from a system prompt + user prompt */
+/** Resolve after `ms` milliseconds (timer-based pause for async code). */
+function sleep(ms: number): Promise<void> {
+  return new Promise<void>((wake) => { setTimeout(wake, ms); });
+}
+
+/**
+ * Tail of the Ollama work chain; calls are appended so they run one at a time.
+ */
+let ollamaQueue: Promise<unknown> = Promise.resolve();
+
+/** Run `fn` once all previously queued work has settled; returns its outcome. */
+function enqueueOllama<T>(fn: () => Promise<T>): Promise<T> {
+  const outcome = ollamaQueue.then(fn);
+  // A failed task must not break the chain, so the stored tail never rejects.
+  ollamaQueue = outcome.then(undefined, () => {});
+  return outcome;
+}
+
+/**
+ * Generate text from a system prompt + user prompt — with 429 retry/backoff + queue.
+ *
+ * Goes through `enqueueOllama`; an HTTP 429 is retried after 15s, 30s and 60s, and
+ * those waits happen inside the queued task.
+ *
+ * @param options - temperature (default 0.7), maxTokens (default 4096, sent as
+ *   `num_predict`), timeoutMs (per-attempt abort timeout, default 300000 ms)
+ * @throws Error on any non-OK status, or on 429 once all retries are exhausted
+ */
 export async function generate(
  systemPrompt: string,
  userPrompt: string,
  options?: { temperature?: number; maxTokens?: number; timeoutMs?: number },
 ): Promise<LlmResponse> {
-  const resp = await fetch(`${OLLAMA_URL}/api/generate`, {
-    method: "POST",
-    headers: { "Content-Type": "application/json" },
-    body: JSON.stringify({
-      model: LLM_MODEL,
-      prompt: userPrompt,
-      system: systemPrompt,
-      stream: false,
-      options: {
-        temperature: options?.temperature ?? 0.7,
-        num_predict: options?.maxTokens ?? 4096,
-      },
-    }),
-    signal: AbortSignal.timeout(options?.timeoutMs ?? 180000),
+  return enqueueOllama(async () => {
+    const RETRY_DELAYS = [15000, 30000, 60000]; // 15s, 30s, 60s
+    // The payload is identical on every attempt, so serialize it once.
+    const body = JSON.stringify({
+      model: LLM_MODEL,
+      prompt: userPrompt,
+      system: systemPrompt,
+      stream: false,
+      options: { temperature: options?.temperature ?? 0.7, num_predict: options?.maxTokens ?? 4096 },
+    });
+
+    for (let attempt = 0; ; attempt++) {
+      const resp = await fetch(`${OLLAMA_URL}/api/generate`, {
+        method: "POST",
+        headers: { "Content-Type": "application/json" },
+        body,
+        signal: AbortSignal.timeout(options?.timeoutMs ?? 300000),
+      });
+
+      if (resp.status === 429) {
+        // Discard the unread body so the connection is released before waiting.
+        await resp.body?.cancel().catch(() => undefined);
+        const delay = RETRY_DELAYS[attempt];
+        if (delay === undefined) {
+          throw new Error(`Ollama generate failed: 429 Too Many Requests (all retries exhausted)`);
+        }
+        console.log(`Blog LLM: 429 rate-limit — retrying in ${delay / 1000}s (attempt ${attempt + 1}/${RETRY_DELAYS.length})`);
+        await sleep(delay);
+        continue;
+      }
+      if (!resp.ok) {
+        const errText = await resp.text();
+        throw new Error(`Ollama generate failed: ${resp.status} ${errText}`);
+      }
+
+      const data = await resp.json() as { response: string; model: string; total_duration: number; eval_count: number };
+      return {
+        text: data.response,
+        model: data.model,
+        totalDuration: data.total_duration,
+        evalCount: data.eval_count,
+      };
+    }
  });
-
-  if (!resp.ok) {
-    const errText = await resp.text();
-    throw new Error(`Ollama generate failed: ${resp.status} ${errText}`);
-  }
-
-  const data = await resp.json() as {
-    response: string;
-    model: string;
-    total_duration: number;
-    eval_count: number;
-  };
-
-  return {
-    text: data.response,
-    model: data.model,
-    totalDuration: data.total_duration,
-    evalCount: data.eval_count,
-  };
 }

 /** Chat-style generation with message history */
@ -62,39 +100,41 @@ export async function chat(
  messages: ReadonlyArray<{ role: "system" | "user" | "assistant"; content: string }>,
  options?: { temperature?: number; maxTokens?: number },
 ): Promise<LlmResponse> {
-  const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
-    method: "POST",
-    headers: { "Content-Type": "application/json" },
-    body: JSON.stringify({
-      model: LLM_MODEL,
-      messages,
-      stream: false,
-      options: {
-        temperature: options?.temperature ?? 0.7,
-        num_predict: options?.maxTokens ?? 4096,
-      },
-    }),
-    signal: AbortSignal.timeout(120000),
+  return enqueueOllama(async () => {
+    // Serialize the request up front; sampling options fall back to 0.7 / 4096.
+    const payload = JSON.stringify({
+      model: LLM_MODEL,
+      messages,
+      stream: false,
+      options: {
+        temperature: options?.temperature ?? 0.7,
+        num_predict: options?.maxTokens ?? 4096,
+      },
+    });
+    const reply = await fetch(`${OLLAMA_URL}/api/chat`, {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: payload,
+      signal: AbortSignal.timeout(300000),
+    });
+
+    if (!reply.ok) {
+      throw new Error(`Ollama chat failed: ${reply.status} ${await reply.text()}`);
+    }
+
+    const parsed = await reply.json() as {
+      message: { content: string };
+      model: string;
+      total_duration: number;
+      eval_count: number;
+    };
+    return {
+      text: parsed.message.content,
+      model: parsed.model,
+      totalDuration: parsed.total_duration,
+      evalCount: parsed.eval_count,
+    };
  });
-
-  if (!resp.ok) {
-    const errText = await resp.text();
-    throw new Error(`Ollama chat failed: ${resp.status} ${errText}`);
-  }
-
-  const data = await resp.json() as {
-    message: { content: string };
-    model: string;
-    total_duration: number;
-    eval_count: number;
-  };
-
-  return {
-    text: data.message.content,
-    model: data.model,
-    totalDuration: data.total_duration,
-    evalCount: data.eval_count,
-  };
 }

 /** Check if Ollama is available and model is loaded */