fix(llm): add 429 retry with exponential backoff + ollamaQueue concurrency guard

2026-03-31 21:45:46 +02:00 · 2026-03-31 21:45:46 +02:00 · ef0b0bb148
commit ef0b0bb148
parent 01ad16464d
1 changed file with 106 additions and 66 deletions
--- a/packages/api/src/llm/client.ts
+++ b/packages/api/src/llm/client.ts
@ -3,6 +3,7 @@
 *
 * Uses qwen2.5:14b on Mac Studio (.213) for text generation.
 * Supports streaming and non-streaming modes.
 * Includes 429 retry with exponential backoff + server-side concurrency guard.
 */
 // Base URL of the Ollama HTTP API. Taken from the OLLAMA_URL environment variable;
 // `||` (not `??`) means an empty value also falls back to the local default below.
 const OLLAMA_URL = process.env.OLLAMA_URL || "http://localhost:11434";
@ -15,46 +16,83 @@ interface LlmResponse {
  evalCount: number;
 }
-/** Generate text from a system prompt + user prompt */
+/** Sleep helper */
 function sleep(ms: number): Promise<void> {
  // Resolves (never rejects) once a timer of `ms` milliseconds has fired.
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
 }
 /**
 * Server-side concurrency guard — Ollama processes one generation at a time.
 * Queue ensures sequential execution even with multiple concurrent API requests.
 *
 * `ollamaQueue` is the tail of a promise chain: each task starts only after the
 * previously enqueued task has settled (fulfilled or rejected). The tail always
 * fulfils with `undefined`, so it neither forwards one task's result to the next
 * task nor keeps that result referenced after the task has finished.
 */
 let ollamaQueue: Promise<unknown> = Promise.resolve();
 /**
 * Run `fn` after every previously enqueued task has settled.
 *
 * @param fn Task to run; it is invoked with no arguments.
 * @returns A promise that settles exactly like the promise returned by `fn`.
 *          A rejection reaches the caller but does not stall later tasks.
 */
 function enqueueOllama<T>(fn: () => Promise<T>): Promise<T> {
  // Call through a wrapper: passing `fn` straight to `.then` would hand it the
  // previous task's resolved value as an argument.
  const result = ollamaQueue.then(() => fn());
  // Discard both outcomes so the chain stays alive after a failure and the tail
  // does not retain the task's return value.
  const release = (): void => {};
  ollamaQueue = result.then(release, release);
  return result;
 }
 /**
 * Generate text from a system prompt + user prompt — with 429 retry/backoff + queue.
 *
 * The whole request, including any retry sleeps, runs inside the callback passed to
 * `enqueueOllama`, so calls queued after this one wait until it has finished.
 * It POSTs to `${OLLAMA_URL}/api/generate` with `stream: false`.
 *
 * On HTTP 429 the request is repeated after 15 s, 30 s and 60 s (at most four
 * attempts in total). Other non-OK statuses are not retried.
 *
 * @param systemPrompt Sent as the `system` field of the request body.
 * @param userPrompt Sent as the `prompt` field of the request body.
 * @param options Optional tuning: `temperature` (default 0.7), `maxTokens` (sent as
 *   `num_predict`, default 4096) and `timeoutMs` for each fetch attempt (default
 *   300000 on the added side of this diff; the removed line used 180000).
 * @returns `text`, `model`, `totalDuration` and `evalCount`, copied from the response
 *   fields `response`, `model`, `total_duration` and `eval_count`.
 * @throws Error when the final attempt still returns 429, or when any attempt returns
 *   another non-OK status (the message carries the status and the response body text).
 */
 export async function generate(
  systemPrompt: string,
  userPrompt: string,
  options?: { temperature?: number; maxTokens?: number; timeoutMs?: number },
 ): Promise<LlmResponse> {
-  const resp = await fetch(`${OLLAMA_URL}/api/generate`, {
+  return enqueueOllama(async () => {
-    method: "POST",
+    const RETRY_DELAYS = [15000, 30000, 60000]; // 15s, 30s, 60s
-    headers: { "Content-Type": "application/json" },
+
-    body: JSON.stringify({
+    for (let attempt = 0; attempt <= RETRY_DELAYS.length; attempt++) {
-      model: LLM_MODEL,
+      if (attempt > 0) {
-      prompt: userPrompt,
+        const delay = RETRY_DELAYS[attempt - 1];
-      system: systemPrompt,
+        console.log(`Blog LLM: 429 rate-limit — retrying in ${delay / 1000}s (attempt ${attempt}/${RETRY_DELAYS.length})`);
-      stream: false,
+        await sleep(delay);
-      options: {
+      }
-        temperature: options?.temperature ?? 0.7,
+
-        num_predict: options?.maxTokens ?? 4096,
+      const resp = await fetch(`${OLLAMA_URL}/api/generate`, {
-      },
+        method: "POST",
-    }),
+        headers: { "Content-Type": "application/json" },
-    signal: AbortSignal.timeout(options?.timeoutMs ?? 180000),
+        body: JSON.stringify({
          model: LLM_MODEL,
          prompt: userPrompt,
          system: systemPrompt,
          stream: false,
          options: {
            temperature: options?.temperature ?? 0.7,
            num_predict: options?.maxTokens ?? 4096,
          },
        }),
        // A fresh timeout signal is created for every attempt.
        signal: AbortSignal.timeout(options?.timeoutMs ?? 300000),
      });
      // 429: loop again (the next iteration sleeps first) unless every delay has been used.
      // The 429 response body is not read before retrying.
      if (resp.status === 429) {
        if (attempt < RETRY_DELAYS.length) continue;
        throw new Error(`Ollama generate failed: 429 Too Many Requests (all retries exhausted)`);
      }
      // Any other non-OK status fails immediately, without a retry.
      if (!resp.ok) {
        const errText = await resp.text();
        throw new Error(`Ollama generate failed: ${resp.status} ${errText}`);
      }
      // The JSON shape is asserted with `as`, not validated at runtime.
      const data = await resp.json() as {
        response: string;
        model: string;
        total_duration: number;
        eval_count: number;
      };
      return {
        text: data.response,
        model: data.model,
        totalDuration: data.total_duration,
        evalCount: data.eval_count,
      };
    }
    // Every loop iteration returns, continues or throws, so this line only serves
    // as the callback's terminal statement.
    throw new Error("Ollama generate: unreachable");
  });
  // NOTE(review): the statements below repeat the response handling already done
  // inside the queued callback, come after the `return`, and use `resp`/`data`
  // outside the callback's scope. They look like pre-change lines that lost their
  // `-` marker in this rendering — confirm they are not part of the committed file.
  if (!resp.ok) {
    const errText = await resp.text();
    throw new Error(`Ollama generate failed: ${resp.status} ${errText}`);
  }
  const data = await resp.json() as {
    response: string;
    model: string;
    total_duration: number;
    eval_count: number;
  };
  return {
    text: data.response,
    model: data.model,
    totalDuration: data.total_duration,
    evalCount: data.eval_count,
  };
 }
 /**
 * Chat-style generation with message history.
 *
 * Runs inside `enqueueOllama`, so it is serialized with every other queued call.
 * POSTs the messages to `${OLLAMA_URL}/api/chat` with `stream: false`.
 * This function has no 429 retry: any non-OK status throws at once. The fetch
 * timeout is fixed at 300000 ms on the added side of this diff (the removed line
 * used 120000); `options` has no timeout field.
 *
 * @param messages Conversation history, sent unchanged as the `messages` field.
 * @param options Optional tuning: `temperature` (default 0.7) and `maxTokens`
 *   (sent as `num_predict`, default 4096).
 * @returns `text` taken from `message.content`, plus `model`, `totalDuration` and
 *   `evalCount` copied from `model`, `total_duration` and `eval_count`.
 * @throws Error on any non-OK status; the message carries the status and body text.
 */
@ -62,39 +100,41 @@ export async function chat(
  messages: ReadonlyArray<{ role: "system" | "user" | "assistant"; content: string }>,
  options?: { temperature?: number; maxTokens?: number },
 ): Promise<LlmResponse> {
-  const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
+  return enqueueOllama(async () => {
-    method: "POST",
+    const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
-    headers: { "Content-Type": "application/json" },
+      method: "POST",
-    body: JSON.stringify({
+      headers: { "Content-Type": "application/json" },
-      model: LLM_MODEL,
+      body: JSON.stringify({
-      messages,
+        model: LLM_MODEL,
-      stream: false,
+        messages,
-      options: {
+        stream: false,
-        temperature: options?.temperature ?? 0.7,
+        options: {
-        num_predict: options?.maxTokens ?? 4096,
+          temperature: options?.temperature ?? 0.7,
-      },
+          num_predict: options?.maxTokens ?? 4096,
-    }),
+        },
-    signal: AbortSignal.timeout(120000),
+      }),
      signal: AbortSignal.timeout(300000),
    });
    // Every non-OK status, 429 included, is reported as an error without a retry.
    if (!resp.ok) {
      const errText = await resp.text();
      throw new Error(`Ollama chat failed: ${resp.status} ${errText}`);
    }
    // The JSON shape is asserted with `as`, not validated at runtime.
    const data = await resp.json() as {
      message: { content: string };
      model: string;
      total_duration: number;
      eval_count: number;
    };
    return {
      text: data.message.content,
      model: data.model,
      totalDuration: data.total_duration,
      evalCount: data.eval_count,
    };
  });
  // NOTE(review): the statements below repeat the response handling already done
  // inside the queued callback, come after the `return`, and use `resp`/`data`
  // outside the callback's scope. They look like pre-change lines that lost their
  // `-` marker in this rendering — confirm they are not part of the committed file.
  if (!resp.ok) {
    const errText = await resp.text();
    throw new Error(`Ollama chat failed: ${resp.status} ${errText}`);
  }
  const data = await resp.json() as {
    message: { content: string };
    model: string;
    total_duration: number;
    eval_count: number;
  };
  return {
    text: data.message.content,
    model: data.model,
    totalDuration: data.total_duration,
    evalCount: data.eval_count,
  };
 }
 /** Check if Ollama is available and model is loaded */