fix(llm): add 429 retry with exponential backoff + ollamaQueue concurrency guard

This commit is contained in:
Rene Fichtmueller 2026-03-31 21:45:46 +02:00
parent 58751b896d
commit bf34096d48

View File

@ -3,6 +3,7 @@
* *
* Uses qwen2.5:14b on Mac Studio (.213) for text generation. * Uses qwen2.5:14b on Mac Studio (.213) for text generation.
* Supports streaming and non-streaming modes. * Supports streaming and non-streaming modes.
* Includes 429 retry with exponential backoff + server-side concurrency guard.
*/ */
const OLLAMA_URL = process.env.OLLAMA_URL || "http://localhost:11434"; const OLLAMA_URL = process.env.OLLAMA_URL || "http://localhost:11434";
@ -15,12 +16,40 @@ interface LlmResponse {
evalCount: number; evalCount: number;
} }
/** Generate text from a system prompt + user prompt */ /** Sleep helper */
function sleep(ms: number): Promise<void> {
  // Promise wrapper around setTimeout: settles (with no value) once the
  // timer for `ms` milliseconds fires. Never rejects.
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Server-side concurrency guard — Ollama processes one generation at a time.
 *
 * `ollamaQueue` is the tail of a promise chain. Every task passed to
 * `enqueueOllama` is chained onto that tail, so a task only starts after all
 * previously enqueued tasks have settled, even when several API requests
 * arrive concurrently.
 */
let ollamaQueue: Promise<unknown> = Promise.resolve();

/**
 * Run `fn` after every previously enqueued task has settled.
 *
 * @param fn - Async task to run once it reaches the head of the queue.
 * @returns A promise that settles with the outcome of `fn`; a rejection from
 *          `fn` is still delivered to the caller.
 */
function enqueueOllama<T>(fn: () => Promise<T>): Promise<T> {
  const pending: Promise<T> = ollamaQueue.then(fn);
  // The stored tail swallows rejections so that one failed task does not
  // prevent later tasks from running; `pending` itself is left untouched.
  ollamaQueue = pending.catch(() => {});
  return pending;
}
/** Generate text from a system prompt + user prompt — with 429 retry/backoff + queue */
export async function generate( export async function generate(
systemPrompt: string, systemPrompt: string,
userPrompt: string, userPrompt: string,
options?: { temperature?: number; maxTokens?: number; timeoutMs?: number }, options?: { temperature?: number; maxTokens?: number; timeoutMs?: number },
): Promise<LlmResponse> { ): Promise<LlmResponse> {
return enqueueOllama(async () => {
const RETRY_DELAYS = [15000, 30000, 60000]; // 15s, 30s, 60s
for (let attempt = 0; attempt <= RETRY_DELAYS.length; attempt++) {
if (attempt > 0) {
const delay = RETRY_DELAYS[attempt - 1];
console.log(`Blog LLM: 429 rate-limit — retrying in ${delay / 1000}s (attempt ${attempt}/${RETRY_DELAYS.length})`);
await sleep(delay);
}
const resp = await fetch(`${OLLAMA_URL}/api/generate`, { const resp = await fetch(`${OLLAMA_URL}/api/generate`, {
method: "POST", method: "POST",
headers: { "Content-Type": "application/json" }, headers: { "Content-Type": "application/json" },
@ -34,9 +63,14 @@ export async function generate(
num_predict: options?.maxTokens ?? 4096, num_predict: options?.maxTokens ?? 4096,
}, },
}), }),
signal: AbortSignal.timeout(options?.timeoutMs ?? 180000), signal: AbortSignal.timeout(options?.timeoutMs ?? 300000),
}); });
if (resp.status === 429) {
if (attempt < RETRY_DELAYS.length) continue;
throw new Error(`Ollama generate failed: 429 Too Many Requests (all retries exhausted)`);
}
if (!resp.ok) { if (!resp.ok) {
const errText = await resp.text(); const errText = await resp.text();
throw new Error(`Ollama generate failed: ${resp.status} ${errText}`); throw new Error(`Ollama generate failed: ${resp.status} ${errText}`);
@ -55,6 +89,10 @@ export async function generate(
totalDuration: data.total_duration, totalDuration: data.total_duration,
evalCount: data.eval_count, evalCount: data.eval_count,
}; };
}
throw new Error("Ollama generate: unreachable");
});
} }
/** Chat-style generation with message history */ /** Chat-style generation with message history */
@ -62,6 +100,7 @@ export async function chat(
messages: ReadonlyArray<{ role: "system" | "user" | "assistant"; content: string }>, messages: ReadonlyArray<{ role: "system" | "user" | "assistant"; content: string }>,
options?: { temperature?: number; maxTokens?: number }, options?: { temperature?: number; maxTokens?: number },
): Promise<LlmResponse> { ): Promise<LlmResponse> {
return enqueueOllama(async () => {
const resp = await fetch(`${OLLAMA_URL}/api/chat`, { const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
method: "POST", method: "POST",
headers: { "Content-Type": "application/json" }, headers: { "Content-Type": "application/json" },
@ -74,7 +113,7 @@ export async function chat(
num_predict: options?.maxTokens ?? 4096, num_predict: options?.maxTokens ?? 4096,
}, },
}), }),
signal: AbortSignal.timeout(120000), signal: AbortSignal.timeout(300000),
}); });
if (!resp.ok) { if (!resp.ok) {
@ -95,6 +134,7 @@ export async function chat(
totalDuration: data.total_duration, totalDuration: data.total_duration,
evalCount: data.eval_count, evalCount: data.eval_count,
}; };
});
} }
/** Check if Ollama is available and model is loaded */ /** Check if Ollama is available and model is loaded */