fix(llm): add 429 retry with exponential backoff + ollamaQueue concurrency guard
This commit is contained in:
parent
01ad16464d
commit
ef0b0bb148
@ -3,6 +3,7 @@
|
|||||||
*
|
*
|
||||||
* Uses qwen2.5:14b on Mac Studio (.213) for text generation.
|
* Uses qwen2.5:14b on Mac Studio (.213) for text generation.
|
||||||
* Supports streaming and non-streaming modes.
|
* Supports streaming and non-streaming modes.
|
||||||
|
* Includes 429 retry with exponential backoff + server-side concurrency guard.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
const OLLAMA_URL = process.env.OLLAMA_URL || "http://localhost:11434";
|
const OLLAMA_URL = process.env.OLLAMA_URL || "http://localhost:11434";
|
||||||
@ -15,46 +16,83 @@ interface LlmResponse {
|
|||||||
evalCount: number;
|
evalCount: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Generate text from a system prompt + user prompt */
|
/** Sleep helper */
|
||||||
|
function sleep(ms: number): Promise<void> {
|
||||||
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Server-side concurrency guard — Ollama processes one generation at a time.
|
||||||
|
* Queue ensures sequential execution even with multiple concurrent API requests.
|
||||||
|
*/
|
||||||
|
let ollamaQueue: Promise<unknown> = Promise.resolve();
|
||||||
|
|
||||||
|
function enqueueOllama<T>(fn: () => Promise<T>): Promise<T> {
|
||||||
|
const result = ollamaQueue.then(fn);
|
||||||
|
// Keep queue alive even if fn throws (attach no-op error handler on chain)
|
||||||
|
ollamaQueue = result.catch(() => {});
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Generate text from a system prompt + user prompt — with 429 retry/backoff + queue */
|
||||||
export async function generate(
|
export async function generate(
|
||||||
systemPrompt: string,
|
systemPrompt: string,
|
||||||
userPrompt: string,
|
userPrompt: string,
|
||||||
options?: { temperature?: number; maxTokens?: number; timeoutMs?: number },
|
options?: { temperature?: number; maxTokens?: number; timeoutMs?: number },
|
||||||
): Promise<LlmResponse> {
|
): Promise<LlmResponse> {
|
||||||
const resp = await fetch(`${OLLAMA_URL}/api/generate`, {
|
return enqueueOllama(async () => {
|
||||||
method: "POST",
|
const RETRY_DELAYS = [15000, 30000, 60000]; // 15s, 30s, 60s
|
||||||
headers: { "Content-Type": "application/json" },
|
|
||||||
body: JSON.stringify({
|
for (let attempt = 0; attempt <= RETRY_DELAYS.length; attempt++) {
|
||||||
model: LLM_MODEL,
|
if (attempt > 0) {
|
||||||
prompt: userPrompt,
|
const delay = RETRY_DELAYS[attempt - 1];
|
||||||
system: systemPrompt,
|
console.log(`Blog LLM: 429 rate-limit — retrying in ${delay / 1000}s (attempt ${attempt}/${RETRY_DELAYS.length})`);
|
||||||
stream: false,
|
await sleep(delay);
|
||||||
options: {
|
}
|
||||||
temperature: options?.temperature ?? 0.7,
|
|
||||||
num_predict: options?.maxTokens ?? 4096,
|
const resp = await fetch(`${OLLAMA_URL}/api/generate`, {
|
||||||
},
|
method: "POST",
|
||||||
}),
|
headers: { "Content-Type": "application/json" },
|
||||||
signal: AbortSignal.timeout(options?.timeoutMs ?? 180000),
|
body: JSON.stringify({
|
||||||
|
model: LLM_MODEL,
|
||||||
|
prompt: userPrompt,
|
||||||
|
system: systemPrompt,
|
||||||
|
stream: false,
|
||||||
|
options: {
|
||||||
|
temperature: options?.temperature ?? 0.7,
|
||||||
|
num_predict: options?.maxTokens ?? 4096,
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
signal: AbortSignal.timeout(options?.timeoutMs ?? 300000),
|
||||||
|
});
|
||||||
|
|
||||||
|
if (resp.status === 429) {
|
||||||
|
if (attempt < RETRY_DELAYS.length) continue;
|
||||||
|
throw new Error(`Ollama generate failed: 429 Too Many Requests (all retries exhausted)`);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!resp.ok) {
|
||||||
|
const errText = await resp.text();
|
||||||
|
throw new Error(`Ollama generate failed: ${resp.status} ${errText}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const data = await resp.json() as {
|
||||||
|
response: string;
|
||||||
|
model: string;
|
||||||
|
total_duration: number;
|
||||||
|
eval_count: number;
|
||||||
|
};
|
||||||
|
|
||||||
|
return {
|
||||||
|
text: data.response,
|
||||||
|
model: data.model,
|
||||||
|
totalDuration: data.total_duration,
|
||||||
|
evalCount: data.eval_count,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new Error("Ollama generate: unreachable");
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!resp.ok) {
|
|
||||||
const errText = await resp.text();
|
|
||||||
throw new Error(`Ollama generate failed: ${resp.status} ${errText}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
const data = await resp.json() as {
|
|
||||||
response: string;
|
|
||||||
model: string;
|
|
||||||
total_duration: number;
|
|
||||||
eval_count: number;
|
|
||||||
};
|
|
||||||
|
|
||||||
return {
|
|
||||||
text: data.response,
|
|
||||||
model: data.model,
|
|
||||||
totalDuration: data.total_duration,
|
|
||||||
evalCount: data.eval_count,
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Chat-style generation with message history */
|
/** Chat-style generation with message history */
|
||||||
@ -62,39 +100,41 @@ export async function chat(
|
|||||||
messages: ReadonlyArray<{ role: "system" | "user" | "assistant"; content: string }>,
|
messages: ReadonlyArray<{ role: "system" | "user" | "assistant"; content: string }>,
|
||||||
options?: { temperature?: number; maxTokens?: number },
|
options?: { temperature?: number; maxTokens?: number },
|
||||||
): Promise<LlmResponse> {
|
): Promise<LlmResponse> {
|
||||||
const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
|
return enqueueOllama(async () => {
|
||||||
method: "POST",
|
const resp = await fetch(`${OLLAMA_URL}/api/chat`, {
|
||||||
headers: { "Content-Type": "application/json" },
|
method: "POST",
|
||||||
body: JSON.stringify({
|
headers: { "Content-Type": "application/json" },
|
||||||
model: LLM_MODEL,
|
body: JSON.stringify({
|
||||||
messages,
|
model: LLM_MODEL,
|
||||||
stream: false,
|
messages,
|
||||||
options: {
|
stream: false,
|
||||||
temperature: options?.temperature ?? 0.7,
|
options: {
|
||||||
num_predict: options?.maxTokens ?? 4096,
|
temperature: options?.temperature ?? 0.7,
|
||||||
},
|
num_predict: options?.maxTokens ?? 4096,
|
||||||
}),
|
},
|
||||||
signal: AbortSignal.timeout(120000),
|
}),
|
||||||
|
signal: AbortSignal.timeout(300000),
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!resp.ok) {
|
||||||
|
const errText = await resp.text();
|
||||||
|
throw new Error(`Ollama chat failed: ${resp.status} ${errText}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const data = await resp.json() as {
|
||||||
|
message: { content: string };
|
||||||
|
model: string;
|
||||||
|
total_duration: number;
|
||||||
|
eval_count: number;
|
||||||
|
};
|
||||||
|
|
||||||
|
return {
|
||||||
|
text: data.message.content,
|
||||||
|
model: data.model,
|
||||||
|
totalDuration: data.total_duration,
|
||||||
|
evalCount: data.eval_count,
|
||||||
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!resp.ok) {
|
|
||||||
const errText = await resp.text();
|
|
||||||
throw new Error(`Ollama chat failed: ${resp.status} ${errText}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
const data = await resp.json() as {
|
|
||||||
message: { content: string };
|
|
||||||
model: string;
|
|
||||||
total_duration: number;
|
|
||||||
eval_count: number;
|
|
||||||
};
|
|
||||||
|
|
||||||
return {
|
|
||||||
text: data.message.content,
|
|
||||||
model: data.model,
|
|
||||||
totalDuration: data.total_duration,
|
|
||||||
evalCount: data.eval_count,
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Check if Ollama is available and model is loaded */
|
/** Check if Ollama is available and model is loaded */
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user