fix(llm): add 429 retry with exponential backoff + ollamaQueue concurrency guard
This commit is contained in:
parent
01ad16464d
commit
ef0b0bb148
@ -3,6 +3,7 @@
|
||||
*
|
||||
* Uses qwen2.5:14b on Mac Studio (.213) for text generation.
|
||||
* Supports streaming and non-streaming modes.
|
||||
* generate() retries HTTP 429 with exponential backoff (15s, 30s, 60s); generate() and chat() both run through a server-side concurrency guard.
|
||||
*/
|
||||
|
||||
// Base URL of the Ollama HTTP API. Falls back to the local default when the
// OLLAMA_URL environment variable is unset or empty (`||` treats "" as missing).
const OLLAMA_URL = process.env.OLLAMA_URL || "http://localhost:11434";
|
||||
@ -15,46 +16,83 @@ interface LlmResponse {
|
||||
evalCount: number;
|
||||
}
|
||||
|
||||
/** Generate text from a system prompt + user prompt */
|
||||
/**
 * Pause helper: returns a promise that resolves (with no value) once a timer
 * of `ms` milliseconds has fired.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(() => done(), ms);
  });
}
|
||||
|
||||
/**
 * Server-side concurrency guard. Ollama is treated as handling one generation
 * at a time, so every call is appended to a single promise chain and runs
 * only after the previously enqueued call has settled.
 */
let ollamaQueue: Promise<unknown> = Promise.resolve();

/**
 * Append `fn` to the Ollama queue.
 *
 * @param fn - Task to run once all earlier queued tasks have settled.
 * @returns A promise that settles with the outcome of `fn` (value or rejection).
 */
function enqueueOllama<T>(fn: () => Promise<T>): Promise<T> {
  const pending = ollamaQueue.then(() => fn());
  // The chain tail must never reject, otherwise one failed task would make
  // every later `.then` skip its task. Map both outcomes to a resolved tail.
  ollamaQueue = pending.then(
    () => undefined,
    () => undefined,
  );
  return pending;
}
|
||||
|
||||
/**
 * Generate text from a system prompt + user prompt.
 *
 * The request runs through `enqueueOllama`, so it starts only after earlier
 * queued Ollama calls have settled. An HTTP 429 reply is retried after 15s,
 * 30s and then 60s. The waits happen inside the queued task, so calls queued
 * behind this one stay blocked while it backs off.
 *
 * @param systemPrompt - Sent as the `system` field of the request.
 * @param userPrompt - Sent as the `prompt` field of the request.
 * @param options - `temperature` (default 0.7), `maxTokens` (sent as
 *   `num_predict`, default 4096) and `timeoutMs` (abort timeout applied to
 *   each attempt separately, default 300000).
 * @returns The generated text plus the model name, total duration and eval
 *   count reported in the response.
 * @throws Error when 429 persists after every retry, or on any other non-OK
 *   status. Errors raised by `fetch` itself (including the timeout abort)
 *   propagate unchanged and are not retried.
 */
export async function generate(
  systemPrompt: string,
  userPrompt: string,
  options?: { temperature?: number; maxTokens?: number; timeoutMs?: number },
): Promise<LlmResponse> {
  const RETRY_DELAYS = [15000, 30000, 60000]; // 15s, 30s, 60s
  const timeoutMs = options?.timeoutMs ?? 300000;

  // The payload is identical for every attempt, so serialise it once.
  const requestBody = JSON.stringify({
    model: LLM_MODEL,
    prompt: userPrompt,
    system: systemPrompt,
    stream: false,
    options: {
      temperature: options?.temperature ?? 0.7,
      num_predict: options?.maxTokens ?? 4096,
    },
  });

  return enqueueOllama(async () => {
    // `attempt` counts the 429 replies seen so far; the loop exits only via
    // `return` (success) or `throw` (failure).
    for (let attempt = 0; ; attempt++) {
      const resp = await fetch(`${OLLAMA_URL}/api/generate`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: requestBody,
        signal: AbortSignal.timeout(timeoutMs),
      });

      if (resp.status === 429) {
        // Discard the unread body so the connection is released before we
        // wait. Best-effort: a failed cancel must not mask the 429 handling.
        await resp.body?.cancel().catch(() => undefined);

        const delay = RETRY_DELAYS[attempt];
        if (delay === undefined) {
          throw new Error(`Ollama generate failed: 429 Too Many Requests (all retries exhausted)`);
        }
        console.log(`Blog LLM: 429 rate-limit — retrying in ${delay / 1000}s (attempt ${attempt + 1}/${RETRY_DELAYS.length})`);
        await sleep(delay);
        continue;
      }

      if (!resp.ok) {
        const errText = await resp.text();
        throw new Error(`Ollama generate failed: ${resp.status} ${errText}`);
      }

      const data = await resp.json() as {
        response: string;
        model: string;
        total_duration: number;
        eval_count: number;
      };

      return {
        text: data.response,
        model: data.model,
        totalDuration: data.total_duration,
        evalCount: data.eval_count,
      };
    }
  });
}
|
||||
|
||||
/**
 * Chat-style generation with message history.
 *
 * The request runs through `enqueueOllama`, so it starts only after earlier
 * queued Ollama calls have settled. Unlike `generate`, an HTTP 429 reply is
 * not retried here; it is reported like any other non-OK status.
 *
 * @param messages - Conversation history, sent unchanged as the `messages`
 *   field of the request.
 * @param options - `temperature` (default 0.7), `maxTokens` (sent as
 *   `num_predict`, default 4096) and `timeoutMs` (abort timeout for the
 *   request, default 300000).
 * @returns The assistant message content plus the model name, total duration
 *   and eval count reported in the response.
 * @throws Error on any non-OK status. Errors raised by `fetch` itself
 *   (including the timeout abort) propagate unchanged.
 */
export async function chat(
  messages: ReadonlyArray<{ role: "system" | "user" | "assistant"; content: string }>,
  options?: { temperature?: number; maxTokens?: number; timeoutMs?: number },
): Promise<LlmResponse> {
  return enqueueOllama(async () => {
    const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        model: LLM_MODEL,
        messages,
        stream: false,
        options: {
          temperature: options?.temperature ?? 0.7,
          num_predict: options?.maxTokens ?? 4096,
        },
      }),
      signal: AbortSignal.timeout(options?.timeoutMs ?? 300000),
    });

    if (!resp.ok) {
      const errText = await resp.text();
      throw new Error(`Ollama chat failed: ${resp.status} ${errText}`);
    }

    const data = await resp.json() as {
      message: { content: string };
      model: string;
      total_duration: number;
      eval_count: number;
    };

    return {
      text: data.message.content,
      model: data.model,
      totalDuration: data.total_duration,
      evalCount: data.eval_count,
    };
  });
}
|
||||
|
||||
/** Check if Ollama is available and model is loaded */
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user