fix(llm): add 429 retry with exponential backoff + ollamaQueue concurrency guard

2026-03-31 21:45:46 +02:00 · 2026-03-31 21:45:46 +02:00 · bf34096d48
commit bf34096d48
parent 58751b896d
1 changed files with 106 additions and 66 deletions
--- a/packages/api/src/llm/client.ts
+++ b/packages/api/src/llm/client.ts
@ -3,6 +3,7 @@
 *
 * Uses qwen2.5:14b on Mac Studio (.213) for text generation.
 * Supports streaming and non-streaming modes.
 * Includes 429 retry with exponential backoff + server-side concurrency guard.
 */
 const OLLAMA_URL = process.env.OLLAMA_URL || "http://localhost:11434";
@ -15,12 +16,40 @@ interface LlmResponse {
  evalCount: number;
 }
-/** Generate text from a system prompt + user prompt */
+/** Sleep helper */
 function sleep(ms: number): Promise<void> {
  // Wrap a one-shot timer in a promise: it settles with no value once the
  // `ms`-millisecond timeout fires, so callers can simply `await sleep(ms)`.
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
 }
 /**
 * Serialises work sent to Ollama (which handles one generation at a time).
 * `ollamaQueue` is the tail of a promise chain; every task handed to
 * `enqueueOllama` starts only after the previously enqueued task has settled,
 * so concurrent API requests are executed strictly one after another.
 */
 let ollamaQueue: Promise<unknown> = Promise.resolve();
 function enqueueOllama<T>(fn: () => Promise<T>): Promise<T> {
  // Handler that swallows a rejection so the shared tail never stays rejected.
  const ignoreFailure = (): void => {};
  // Chain the task behind whatever is currently last in line.
  const pending = ollamaQueue.then(fn);
  // The new tail settles when `pending` does; a failure is absorbed here so
  // later tasks still run, while the caller still sees it through `pending`.
  ollamaQueue = pending.then(undefined, ignoreFailure);
  return pending;
 }
 /** Generate text from a system prompt + user prompt — with 429 retry/backoff + queue */
 export async function generate(
  systemPrompt: string,
  userPrompt: string,
  options?: { temperature?: number; maxTokens?: number; timeoutMs?: number },
 ): Promise<LlmResponse> {
  return enqueueOllama(async () => {
    const RETRY_DELAYS = [15000, 30000, 60000]; // 15s, 30s, 60s
    for (let attempt = 0; attempt <= RETRY_DELAYS.length; attempt++) {
      if (attempt > 0) {
        const delay = RETRY_DELAYS[attempt - 1];
        console.log(`Blog LLM: 429 rate-limit — retrying in ${delay / 1000}s (attempt ${attempt}/${RETRY_DELAYS.length})`);
        await sleep(delay);
      }
      const resp = await fetch(`${OLLAMA_URL}/api/generate`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
@ -34,9 +63,14 @@ export async function generate(
            num_predict: options?.maxTokens ?? 4096,
          },
        }),
-    signal: AbortSignal.timeout(options?.timeoutMs ?? 180000),
+        signal: AbortSignal.timeout(options?.timeoutMs ?? 300000),
      });
      if (resp.status === 429) {
        if (attempt < RETRY_DELAYS.length) continue;
        throw new Error(`Ollama generate failed: 429 Too Many Requests (all retries exhausted)`);
      }
      if (!resp.ok) {
        const errText = await resp.text();
        throw new Error(`Ollama generate failed: ${resp.status} ${errText}`);
@ -55,6 +89,10 @@ export async function generate(
        totalDuration: data.total_duration,
        evalCount: data.eval_count,
      };
    }
    throw new Error("Ollama generate: unreachable");
  });
 }
 /** Chat-style generation with message history */
@ -62,6 +100,7 @@ export async function chat(
  messages: ReadonlyArray<{ role: "system" | "user" | "assistant"; content: string }>,
  options?: { temperature?: number; maxTokens?: number },
 ): Promise<LlmResponse> {
  return enqueueOllama(async () => {
    const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
@ -74,7 +113,7 @@ export async function chat(
          num_predict: options?.maxTokens ?? 4096,
        },
      }),
-    signal: AbortSignal.timeout(120000),
+      signal: AbortSignal.timeout(300000),
    });
    if (!resp.ok) {
@ -95,6 +134,7 @@ export async function chat(
      totalDuration: data.total_duration,
      evalCount: data.eval_count,
    };
  });
 }
 /** Check if Ollama is available and model is loaded */