// Change note: the gateway and learning DB clients now prefer the DATABASE_URL connection string over the individual DB_* env vars, matching the ecosystem.config.cjs convention. The Ollama health-check timeout was raised from 5 s to 15 s to absorb Cloudflare tunnel latency.
// File info: 132 lines, 4.3 KiB, TypeScript.
import type { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
|
|
import { getOllamaBaseUrl } from '../pipeline/router.js';
|
|
import { getAllBreakerStates } from '../circuit-breaker/ollama-breaker.js';
|
|
import { query } from '../db/client.js';
|
|
import { getPgBoss } from '../queue/pg-boss-client.js';
|
|
import { logger } from '../observability/logger.js';
|
|
|
|
/**
 * Body of the `GET /health` response.
 *
 * `status` is the aggregate verdict; `checks` carries the individual probe
 * results that the verdict was derived from.
 */
interface HealthStatus {
  /** Aggregate verdict across all probes. */
  status: 'ok' | 'degraded' | 'down';
  /** ISO-8601 timestamp taken when the report was assembled. */
  timestamp: string;
  /** Per-dependency probe results. */
  checks: {
    /** Ollama HTTP probe; `latency_ms` is the measured round-trip time. */
    ollama: { status: 'ok' | 'down'; latency_ms?: number; error?: string };
    /** Database connectivity probe. */
    database: { status: 'ok' | 'down'; error?: string };
    /** Job-queue probe; `unknown` is reported when no queue client is available. */
    queue: { status: 'ok' | 'down' | 'unknown'; depth?: number; error?: string };
    /** Number of review_queue rows that have no decision yet. */
    review_queue: { unreviewed_count: number };
    /** Circuit-breaker state keyed by breaker name. */
    circuit_breakers: Record<string, 'closed' | 'open' | 'half-open'>;
  };
}
|
|
|
|
/**
 * Probes an Ollama server by issuing `GET {baseUrl}/api/tags`.
 *
 * Never rejects: network errors, timeouts and non-2xx responses are all
 * reported as `{ status: 'down', error }`. `latency_ms` is the elapsed time
 * from the start of the call and is reported on every path, so a timeout can
 * be told apart from an immediate connection failure.
 *
 * @param baseUrl - Base URL of the Ollama server; `/api/tags` is appended verbatim.
 * @param timeoutMs - Abort the request after this many milliseconds (default 15000).
 * @returns The probe result with measured latency and, on failure, an error message.
 */
async function checkOllama(
  baseUrl: string,
  timeoutMs = 15000,
): Promise<{ status: 'ok' | 'down'; latency_ms?: number; error?: string }> {
  const start = Date.now();
  try {
    const response = await fetch(`${baseUrl}/api/tags`, {
      signal: AbortSignal.timeout(timeoutMs),
    });
    const latency_ms = Date.now() - start;

    // Only the status line is needed. Discard the unread body so the
    // connection is released promptly instead of waiting for garbage
    // collection; a failure to cancel must not change the probe result.
    await response.body?.cancel().catch(() => undefined);

    if (!response.ok) {
      return { status: 'down', error: `HTTP ${response.status}`, latency_ms };
    }
    return { status: 'ok', latency_ms };
  } catch (err) {
    return {
      status: 'down',
      error: err instanceof Error ? err.message : 'Unknown error',
      latency_ms: Date.now() - start,
    };
  }
}
|
|
|
|
/**
 * Verifies database connectivity by running `SELECT 1` through the imported
 * `query` helper.
 *
 * Never rejects: any error thrown by the query is converted into
 * `{ status: 'down', error }`.
 *
 * @returns `{ status: 'ok' }` when the query succeeds, otherwise `down` with the error message.
 */
async function checkDatabase(): Promise<{ status: 'ok' | 'down'; error?: string }> {
  try {
    await query('SELECT 1');
    return { status: 'ok' };
  } catch (err) {
    return { status: 'down', error: err instanceof Error ? err.message : 'Unknown error' };
  }
}
|
|
|
|
/**
 * Reports the depth of the `llm-batch` queue.
 *
 * Returns `unknown` when `getPgBoss()` yields no client. Never rejects: an
 * error from the queue client is converted into `{ status: 'down', error }`.
 *
 * @returns `ok` with `depth` (waiting plus active jobs), `unknown`, or `down` with an error message.
 */
async function checkQueue(): Promise<{ status: 'ok' | 'down' | 'unknown'; depth?: number; error?: string }> {
  const boss = getPgBoss();
  if (!boss) return { status: 'unknown' };

  try {
    // Per the pg-boss docs, `before` counts jobs in every state preceding the
    // named one, so `before: 'completed'` already covers waiting (created,
    // retry) and active jobs. Adding a second `before: 'active'` count on top
    // would count the waiting jobs twice.
    const depth = await boss.getQueueSize('llm-batch', { before: 'completed' });
    return { status: 'ok', depth: depth ?? 0 };
  } catch (err) {
    return { status: 'down', error: err instanceof Error ? err.message : 'Unknown error' };
  }
}
|
|
|
|
/**
 * Counts `review_queue` rows whose `decision` is still NULL.
 *
 * Best-effort: if the query fails, the failure is logged and 0 is returned so
 * that the health report can still be produced.
 *
 * @returns The number of unreviewed rows, or 0 when the count cannot be obtained.
 */
async function getReviewQueueCount(): Promise<number> {
  try {
    const result = await query<{ count: string }>(
      'SELECT COUNT(*) as count FROM review_queue WHERE decision IS NULL',
    );
    // The row type declares `count` as a string, so parse it; fall back to 0
    // when there is no row or the value is not numeric.
    const parsed = Number.parseInt(result.rows[0]?.count ?? '0', 10);
    return Number.isNaN(parsed) ? 0 : parsed;
  } catch (err) {
    // Keep the fallback, but leave a trace: a silent 0 is indistinguishable
    // from a genuinely empty review queue.
    logger.warn({ err }, 'Failed to count unreviewed review_queue rows');
    return 0;
  }
}
|
|
|
|
/**
 * Registers the health endpoints on the given Fastify instance.
 *
 * - `GET /health` runs the Ollama, database, queue and review-queue probes
 *   concurrently and returns a full `HealthStatus` report. It answers 503 when
 *   Ollama or the database is down and 200 otherwise (including `degraded`).
 * - `GET /health/live` is a liveness probe that performs no dependency checks.
 * - `GET /health/ready` is a readiness probe that answers 503 while the
 *   database is unavailable.
 *
 * @param fastify - Instance to register the routes on.
 */
export async function healthRoute(fastify: FastifyInstance): Promise<void> {
  fastify.get(
    '/health',
    async (_request: FastifyRequest, reply: FastifyReply) => {
      const ollamaBaseUrl = getOllamaBaseUrl();

      // The probes are independent of each other, so run them in parallel.
      const [ollamaCheck, dbCheck, queueCheck, reviewCount] = await Promise.all([
        checkOllama(ollamaBaseUrl),
        checkDatabase(),
        checkQueue(),
        getReviewQueueCount(),
      ]);

      const breakerStates = getAllBreakerStates();

      // Ollama and the database are hard dependencies; a failing queue or an
      // open circuit breaker only degrades the service.
      const isDown = ollamaCheck.status === 'down' || dbCheck.status === 'down';
      const isDegraded = queueCheck.status === 'down' || Object.values(breakerStates).some((s) => s === 'open');

      const status: HealthStatus['status'] = isDown ? 'down' : isDegraded ? 'degraded' : 'ok';

      const health: HealthStatus = {
        status,
        timestamp: new Date().toISOString(),
        checks: {
          ollama: ollamaCheck,
          database: dbCheck,
          queue: queueCheck,
          review_queue: { unreviewed_count: reviewCount },
          circuit_breakers: breakerStates,
        },
      };

      // Only a hard-down state fails the request; degraded still answers 200.
      const statusCode = isDown ? 503 : 200;
      if (status !== 'ok') {
        logger.warn({ status, checks: health.checks }, 'Health check degraded');
      }

      return reply.status(statusCode).send(health);
    },
  );

  // Kubernetes-style liveness probe (minimal check)
  fastify.get(
    '/health/live',
    async (_request: FastifyRequest, reply: FastifyReply) => {
      return reply.send({ status: 'alive', ts: Date.now() });
    },
  );

  // Kubernetes-style readiness probe
  fastify.get(
    '/health/ready',
    async (_request: FastifyRequest, reply: FastifyReply) => {
      const dbCheck = await checkDatabase();
      if (dbCheck.status === 'down') {
        return reply.status(503).send({ status: 'not ready', reason: 'database unavailable' });
      }
      return reply.send({ status: 'ready' });
    },
  );
}
|