diff --git a/deploy/ecosystem.config.cjs b/deploy/ecosystem.config.cjs index 8e39301..e71fcbc 100644 --- a/deploy/ecosystem.config.cjs +++ b/deploy/ecosystem.config.cjs @@ -17,8 +17,8 @@ module.exports = { env: { NODE_ENV: 'production', PORT: 3103, - DATABASE_URL: 'postgresql://llm:llm_secure_2026@localhost:5432/llm_gateway', - TIP_DATABASE_URL: 'postgresql://tip:tip_prod_2026@localhost:5432/transceiver_db', + DATABASE_URL: process.env.DATABASE_URL || '', + TIP_DATABASE_URL: process.env.TIP_DATABASE_URL || '', OLLAMA_URL: 'http://192.168.178.213:11434', LOG_LEVEL: 'info', GITEA_URL: 'http://192.168.178.196:3000', @@ -100,7 +100,7 @@ module.exports = { exec_mode: 'fork', env: { NODE_ENV: 'production', - DATABASE_URL: 'postgresql://llm:llm_secure_2026@localhost:5432/llm_gateway', + DATABASE_URL: process.env.DATABASE_URL || '', GATEWAY_URL: 'http://localhost:3103', }, autorestart: true, diff --git a/docker-compose.yaml b/docker-compose.yaml index 255d42b..68f5c9b 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -8,7 +8,7 @@ services: NODE_ENV: production PORT: "3100" DATABASE_URL: "${DATABASE_URL}" - TIP_DATABASE_URL: "postgresql://tip:tip_prod_2026@82.165.222.127:5433/transceiver_db" + TIP_DATABASE_URL: "${TIP_DATABASE_URL}" OLLAMA_URL: "http://192.168.178.169:11434" SHIELDX_URL: "${SHIELDX_URL:-}" GITEA_URL: "http://gitea.context-x.org" diff --git a/packages/ctx-health/src/db/client.ts b/packages/ctx-health/src/db/client.ts index f6010dd..587d4ad 100644 --- a/packages/ctx-health/src/db/client.ts +++ b/packages/ctx-health/src/db/client.ts @@ -5,10 +5,11 @@ const { Pool } = pg; let pool: pg.Pool | null = null; -const DEFAULT_DB_URL = 'postgresql://llm:llm_secure_2026@localhost:5432/llm_gateway'; + function buildPoolConfig(): pg.PoolConfig { - const databaseUrl = process.env['CTX_HEALTH_DB_URL'] ?? process.env['DATABASE_URL'] ?? DEFAULT_DB_URL; + const databaseUrl = process.env['CTX_HEALTH_DB_URL'] ?? process.env['DATABASE_URL']; + if (!databaseUrl) throw new Error('CTX_HEALTH_DB_URL or DATABASE_URL env var is required'); return { connectionString: databaseUrl, max: 3, diff --git a/packages/fine-tuner/config/fine_tuner.yaml b/packages/fine-tuner/config/fine_tuner.yaml index a42585e..e3104ff 100644 --- a/packages/fine-tuner/config/fine_tuner.yaml +++ b/packages/fine-tuner/config/fine_tuner.yaml @@ -1,4 +1,4 @@ -database_url: "postgresql://llm:llm_secure_2026@127.0.0.1:15432/llm_gateway" +database_url: "${DATABASE_URL}" # Set via environment variable at runtime gateway_url: "https://llm-gateway.context-x.org" ollama_url: "http://localhost:11434" diff --git a/packages/fine-tuner/config/fo-blog-v6.yaml b/packages/fine-tuner/config/fo-blog-v6.yaml index c7d9edf..546c09a 100644 --- a/packages/fine-tuner/config/fo-blog-v6.yaml +++ b/packages/fine-tuner/config/fo-blog-v6.yaml @@ -1,4 +1,4 @@ -database_url: "postgresql://llm:llm_secure_2026@127.0.0.1:5432/llm_gateway" +database_url: "${DATABASE_URL}" # Set via environment variable at runtime gateway_url: "https://llm-gateway.context-x.org" ollama_url: "http://localhost:11434" diff --git a/packages/gateway/src/integrations/tip-db.ts b/packages/gateway/src/integrations/tip-db.ts index 12f5011..f1350ea 100644 --- a/packages/gateway/src/integrations/tip-db.ts +++ b/packages/gateway/src/integrations/tip-db.ts @@ -9,7 +9,7 @@ const TIP_DB_CONFIG = { port: parseInt(process.env['TIP_DB_PORT'] ?? '5433', 10), database: process.env['TIP_DB_NAME'] ?? 'transceiver_db', user: process.env['TIP_DB_USER'] ?? 'tip', - password: process.env['TIP_DB_PASSWORD'] ?? 'tip_prod_2026', + password: process.env['TIP_DB_PASSWORD']!, max: 5, idleTimeoutMillis: 60_000, connectionTimeoutMillis: 10_000, diff --git a/packages/gateway/src/pipeline/external-providers.ts b/packages/gateway/src/pipeline/external-providers.ts index f9649bc..fa5f280 100644 --- a/packages/gateway/src/pipeline/external-providers.ts +++ b/packages/gateway/src/pipeline/external-providers.ts @@ -257,6 +257,41 @@ function findBestModel( // ─── OpenAI-Compatible Client ─────────────────────────────────────── +function buildRequestHeaders(provider: ExternalProvider, apiKey: string): Record { + const headers: Record = { 'Content-Type': 'application/json' }; + if (!['claude-bridge', 'openai-bridge', 'chatgpt-bridge', 'copilot-bridge'].includes(provider.name)) { + headers['Authorization'] = `Bearer ${apiKey}`; + } + return headers; +} + +function buildRequestPayload(model: ExternalModel, request: ExternalCompletionRequest): Record { + return { + model: model.id, + messages: request.messages, + temperature: request.temperature ?? 0.3, + max_tokens: request.max_tokens ?? 2048, + }; +} + +function parseExternalResponse( + data: any, + model: ExternalModel, + provider: ExternalProvider, + start: number, +): ExternalCompletionResponse { + const content = data.choices?.[0]?.message?.content ?? ''; + recordRequest(provider.name); + return { + response: content, + model: data.model ?? model.id, + provider: provider.name, + inputTokens: data.usage?.prompt_tokens ?? 0, + outputTokens: data.usage?.completion_tokens ?? 0, + latencyMs: Date.now() - start, + }; +} + async function callProvider( provider: ExternalProvider, model: ExternalModel, @@ -275,25 +310,13 @@ async function callProvider( const start = Date.now(); try { - const headers: Record = { - 'Content-Type': 'application/json', - }; - - // Only add Authorization header for non-bridge providers - // Bridge services (claude-bridge, openai-bridge, chatgpt-bridge, copilot-bridge) handle auth internally - if (!['claude-bridge', 'openai-bridge', 'chatgpt-bridge', 'copilot-bridge'].includes(provider.name)) { - headers['Authorization'] = `Bearer ${apiKey}`; - } + const headers = buildRequestHeaders(provider, apiKey); + const payload = buildRequestPayload(model, request); const response = await fetch(url, { method: 'POST', headers, - body: JSON.stringify({ - model: model.id, - messages: request.messages, - temperature: request.temperature ?? 0.3, - max_tokens: request.max_tokens ?? 2048, - }), + body: JSON.stringify(payload), signal: controller.signal, }); @@ -302,23 +325,8 @@ async function callProvider( throw new Error(`${provider.name} HTTP ${response.status}: ${body.slice(0, 200)}`); } - const data = (await response.json()) as { - choices: { message: { content: string } }[]; - usage?: { prompt_tokens: number; completion_tokens: number }; - model?: string; - }; - - const content = data.choices?.[0]?.message?.content ?? ''; - recordRequest(provider.name); - - return { - response: content, - model: data.model ?? model.id, - provider: provider.name, - inputTokens: data.usage?.prompt_tokens ?? 0, - outputTokens: data.usage?.completion_tokens ?? 0, - latencyMs: Date.now() - start, - }; + const data = await response.json(); + return parseExternalResponse(data, model, provider, start); } finally { clearTimeout(timer); } diff --git a/packages/gateway/src/pipeline/llm-client.ts b/packages/gateway/src/pipeline/llm-client.ts index d51755f..c991e73 100644 --- a/packages/gateway/src/pipeline/llm-client.ts +++ b/packages/gateway/src/pipeline/llm-client.ts @@ -69,6 +69,75 @@ function isTimeoutError(err: unknown): boolean { return false; } +async function tryModelWithRetries( + modelReq: OllamaRequest, + tier: ModelTier, + timeoutMs: number, +): Promise { + const breaker = getBreaker( + modelReq.model, + tier, + (r: OllamaRequest) => fetchOllama(r, timeoutMs), + ); + const MAX_RETRIES = 2; + let lastErr: unknown; + + for (let attempt = 0; attempt < MAX_RETRIES; attempt++) { + try { + if (attempt > 0) { + logger.info({ model: modelReq.model, attempt }, 'Retrying Ollama call after timeout'); + } + const result = await breaker.fire(modelReq); + if (attempt > 0) { + logger.info({ model: modelReq.model, attempt }, 'Ollama retry succeeded'); + } + return result; + } catch (err) { + lastErr = err; + if (!isTimeoutError(err)) { + logger.error({ err, model: modelReq.model }, 'Ollama non-timeout error, skipping retry'); + break; + } + if (attempt < MAX_RETRIES - 1) { + logger.warn({ model: modelReq.model, attempt }, 'Ollama timeout, retrying'); + } + } + } + void lastErr; + return null; +} + +async function tryExternalFallback( + req: OllamaRequest, + tier: ModelTier, +): Promise { + const tierMap: Record = { + fast: 'fast', + medium: 'medium', + large: 'large', + }; + const externalResult = await callExternalFallback( + { + model: req.model, + messages: [ + ...(req.system ? [{ role: 'system', content: req.system }] : []), + { role: 'user', content: req.prompt }, + ], + temperature: req.options?.temperature, + max_tokens: req.options?.num_predict, + }, + tierMap[tier] ?? 'medium', + ); + return { + response: externalResult.response, + done: true, + total_duration: externalResult.latencyMs * 1_000_000, + eval_count: externalResult.outputTokens, + prompt_eval_count: externalResult.inputTokens, + model: `${externalResult.provider}/${externalResult.model}`, + }; +} + export async function callOllama( req: OllamaRequest, tier: ModelTier = 'medium', @@ -76,81 +145,19 @@ export async function callOllama( ): Promise { const timeoutMs = TIMEOUT_BY_TIER[tier]; const allModels = [req.model, ...fallbackModels.filter((m) => m !== req.model)]; - const MAX_RETRIES = 2; for (const model of allModels) { const modelReq = { ...req, model }; - - const breaker = getBreaker( - model, - tier, - (r: OllamaRequest) => fetchOllama(r, timeoutMs), - ); - - let lastErr: unknown; - - for (let attempt = 0; attempt < MAX_RETRIES; attempt++) { - try { - if (attempt > 0) { - logger.info({ model, attempt }, 'Retrying Ollama call after timeout'); - } - - const result = await breaker.fire(modelReq); - if (attempt > 0) { - logger.info({ model, attempt }, 'Ollama retry succeeded'); - } - return result; - } catch (err) { - lastErr = err; - - // Only retry on timeout errors - if (!isTimeoutError(err)) { - logger.error({ err, model }, 'Ollama non-timeout error, skipping retry'); - break; - } - - if (attempt < MAX_RETRIES - 1) { - logger.warn({ model, attempt }, 'Ollama timeout, retrying'); - } - } - } - - // Try next fallback model - logger.warn({ model, fallback: allModels[allModels.indexOf(model) + 1] }, 'Ollama model failed, trying fallback'); - void lastErr; // captured for logging above + const result = await tryModelWithRetries(modelReq, tier, timeoutMs); + if (result) return result; + const nextModel = allModels[allModels.indexOf(model) + 1]; + logger.warn({ model, fallback: nextModel }, 'Ollama model failed, trying fallback'); } - // All Ollama models failed — try external providers as last resort if (getAvailableProviders().length > 0) { logger.warn({ models: allModels }, 'All Ollama models failed, trying external providers'); try { - const tierMap: Record = { - fast: 'fast', - medium: 'medium', - large: 'large', - }; - const externalResult = await callExternalFallback( - { - model: req.model, - messages: [ - ...(req.system ? [{ role: 'system', content: req.system }] : []), - { role: 'user', content: req.prompt }, - ], - temperature: req.options?.temperature, - max_tokens: req.options?.num_predict, - }, - tierMap[tier] ?? 'medium', - ); - - // Convert external response to OllamaResponse shape - return { - response: externalResult.response, - done: true, - total_duration: externalResult.latencyMs * 1_000_000, - eval_count: externalResult.outputTokens, - prompt_eval_count: externalResult.inputTokens, - model: `${externalResult.provider}/${externalResult.model}`, - }; + return await tryExternalFallback(req, tier); } catch (extErr) { logger.error({ err: extErr }, 'External provider fallback also failed'); } diff --git a/packages/gateway/src/pipeline/post-validator.ts b/packages/gateway/src/pipeline/post-validator.ts index 542e8d2..b5bfaab 100644 --- a/packages/gateway/src/pipeline/post-validator.ts +++ b/packages/gateway/src/pipeline/post-validator.ts @@ -95,38 +95,29 @@ function checkQuestionCloser(text: string): ValidationResult { }; } -export async function runPostValidation( +async function validateWithSchema( output: string, - config: ValidatorConfig, -): Promise { - const results: ValidationResult[] = []; - const validatorSet = new Set(config.validators ?? []); - let banViolations: BanViolation[] = []; - let retryRequested = false; - - // 1. Schema validator - if (validatorSet.has('schema')) { - const schemaResult: SchemaValidatorResult = validateSchema( - output, - config.schema, - ); - results.push({ + schema?: Record, +): Promise<{ result: ValidationResult; retry: boolean }> { + const schemaResult: SchemaValidatorResult = validateSchema(output, schema); + return { + result: { validator: 'schema', passed: schemaResult.passed, score_impact: schemaResult.score_impact, details: { errors: schemaResult.errors }, - }); - if (schemaResult.retry) retryRequested = true; - } + }, + retry: schemaResult.retry, + }; +} - // 2. Ban list checker - if (validatorSet.has('banlist')) { - const banResult: BanlistResult = checkBanlist( - output, - config.language ?? 'auto', - ); - banViolations = banResult.violations; - results.push({ +async function validateWithBanlist( + output: string, + language?: 'de' | 'en', +): Promise<{ result: ValidationResult; violations: BanViolation[] }> { + const banResult: BanlistResult = checkBanlist(output, language ?? 'auto'); + return { + result: { validator: 'banlist', passed: banResult.passed, score_impact: banResult.score_penalty, @@ -138,68 +129,96 @@ export async function runPostValidation( })), count: banResult.violations.length, }, - }); + }, + violations: banResult.violations, + }; +} + +async function validateWithLanguage( + output: string, + language?: 'de' | 'en', + formality?: 'du' | 'Sie', +): Promise { + const langResult: LanguageCheckResult = checkLanguage(output, language, formality); + return { + validator: 'language', + passed: langResult.passed, + score_impact: langResult.score_impact, + details: { + detected: langResult.detected_language, + required: langResult.required_language, + formality_issue: langResult.formality_issue, + details: langResult.details, + }, + }; +} + +async function validateWithTip( + output: string, + outputFormat?: string, +): Promise { + const tipResult: TipValidationResult = validateTipContent(output, outputFormat === 'json'); + return { + validator: 'tip_validator', + passed: tipResult.passed, + score_impact: tipResult.score_impact, + details: { + errors: tipResult.errors, + immediate_reject: tipResult.immediate_reject, + }, + }; +} + +async function validateWithFacts(output: string): Promise { + const factResult: FactCheckResult = await checkFacts(output, 5000); + return { + validator: 'fact_checker', + passed: factResult.passed, + score_impact: factResult.score_impact, + details: { + checks_performed: factResult.checks_performed, + failures: factResult.failures, + }, + }; +} + +export async function runPostValidation( + output: string, + config: ValidatorConfig, +): Promise { + const results: ValidationResult[] = []; + const validatorSet = new Set(config.validators ?? []); + let banViolations: BanViolation[] = []; + let retryRequested = false; + + if (validatorSet.has('schema')) { + const { result, retry } = await validateWithSchema(output, config.schema); + results.push(result); + retryRequested = retryRequested || retry; + } + + if (validatorSet.has('banlist')) { + const { result, violations } = await validateWithBanlist(output, config.language); + results.push(result); + banViolations = violations; } - // 3. Language checker if (validatorSet.has('language')) { - const langResult: LanguageCheckResult = checkLanguage( - output, - config.language, - config.formality, - ); - results.push({ - validator: 'language', - passed: langResult.passed, - score_impact: langResult.score_impact, - details: { - detected: langResult.detected_language, - required: langResult.required_language, - formality_issue: langResult.formality_issue, - details: langResult.details, - }, - }); + results.push(await validateWithLanguage(output, config.language, config.formality)); } - // 4. TIP validator if (validatorSet.has('tip_validator')) { - const tipResult: TipValidationResult = validateTipContent( - output, - config.output_format === 'json', - ); - results.push({ - validator: 'tip_validator', - passed: tipResult.passed, - score_impact: tipResult.score_impact, - details: { - errors: tipResult.errors, - immediate_reject: tipResult.immediate_reject, - }, - }); + results.push(await validateWithTip(output, config.output_format)); } - // 5. Fact checker (async, with timeout) if (validatorSet.has('fact_checker') && config.requires_fact_check) { - const factResult: FactCheckResult = await checkFacts(output, 5000); - results.push({ - validator: 'fact_checker', - passed: factResult.passed, - score_impact: factResult.score_impact, - details: { - checks_performed: factResult.checks_performed, - failures: factResult.failures, - }, - }); + results.push(await validateWithFacts(output)); } - // 6. Length checker if (validatorSet.has('length')) { - results.push( - checkLength(output, config.min_length ?? 50, config.max_length ?? 20000), - ); + results.push(checkLength(output, config.min_length ?? 50, config.max_length ?? 20000)); } - // 7. Question-closer detector if (validatorSet.has('question_closer')) { results.push(checkQuestionCloser(output)); } diff --git a/packages/gateway/src/pipeline/request-scorer.ts b/packages/gateway/src/pipeline/request-scorer.ts index 33dc6e3..6f81d25 100644 --- a/packages/gateway/src/pipeline/request-scorer.ts +++ b/packages/gateway/src/pipeline/request-scorer.ts @@ -672,6 +672,113 @@ function assignTier(score: number): Tier { return 'code_generation'; } +// ── Helper: Short Message Fast Path ──────────────────────────────────────── + +function handleShortMessageFastPath( + lastUserText: string, + input: ScorerInput, +): ScoringResult | null { + if ( + lastUserText.length < 50 && + (!input.tools || input.tools.length === 0) && + !hasFormalLogicKeyword(lastUserText) + ) { + const quickMatches = getTrie().scan(lastUserText); + const quickAgg = getTrie().aggregate(quickMatches); + const hasComplex = Array.from(quickAgg.values()).some( + (d) => d.dimension !== 'simpleIndicators' && d.dimension !== 'relay' && d.effectiveCount > 0, + ); + + if (!hasComplex) { + const result: ScoringResult = { + tier: 'medium', + score: 0.05, + confidence: 0.8, + reason: 'short message - simple request', + dimensions: [], + }; + recordSessionTier('medium'); + logger.debug({ tier: 'medium', reason: 'short_simple_path' }, 'Request scored via short simple path'); + return result; + } + } + return null; +} + +// ── Helper: Formal Logic Override ────────────────────────────────────────── + +function handleFormalLogicOverride( + fullText: string, + input: ScorerInput, + userMessages: readonly WeightedMessage[], +): ScoringResult | null { + if (!hasFormalLogicKeyword(fullText)) { + return null; + } + const dimensions = computeAllDimensions(input, userMessages, fullText); + const result: ScoringResult = { + tier: 'reasoning', + score: 0.5, + confidence: 0.95, + reason: 'formal logic keyword detected', + dimensions, + }; + recordSessionTier('reasoning'); + logger.debug({ tier: 'reasoning', reason: 'formal_logic_override' }, 'Request scored via formal logic override'); + return result; +} + +// ── Helper: Apply Score Overrides ────────────────────────────────────────── + +interface ScoreOverridesInput { + tier: Tier; + confidence: number; + reason: string; +} + +interface ScoreOverridesOutput { + tier: Tier; + confidence: number; + reason: string; +} + +function applyScoreOverrides( + state: ScoreOverridesInput, + dimensions: readonly DimensionScore[], + input: ScorerInput, + totalChars: number, +): ScoreOverridesOutput { + let { tier, confidence, reason } = state; + + // Code generation override + const codeGenDim = dimensions.find((d) => d.name === 'codeGeneration'); + if (codeGenDim && codeGenDim.rawScore > 0.25) { + tier = 'code_generation'; + reason = 'code generation keywords detected'; + } + + // Tool floor + if (input.tools && input.tools.length > 0 && tier === 'fast') { + tier = 'medium'; + reason = 'tool floor applied (minimum medium with tools)'; + } + + // Context floor + const estimatedTotalTokens = totalChars / 4; + if (estimatedTotalTokens > 50_000 && (tier === 'fast' || tier === 'medium')) { + tier = 'large'; + reason = 'context floor applied (>50k estimated tokens)'; + } + + // Ambiguity check + if (confidence < 0.45) { + tier = 'medium'; + reason = 'ambiguous (confidence < 0.45, defaulting to medium)'; + } + + return { tier, confidence, reason }; +} + // ── Main Scoring Function ────────────────────────────────────────────────── export function scoreRequest( @@ -682,57 +789,18 @@ export function scoreRequest( const fullText = userMessages.map((m) => m.text).join('\n'); const lastUserText = userMessages.length > 0 ? userMessages[userMessages.length - 1]!.text : ''; - // ── Short message fast path ── - if ( - lastUserText.length < 50 && - (!input.tools || input.tools.length === 0) && - !hasFormalLogicKeyword(lastUserText) - ) { - // Quick check: no complex keywords in the short message - const quickMatches = getTrie().scan(lastUserText); - const quickAgg = getTrie().aggregate(quickMatches); - const hasComplex = Array.from(quickAgg.values()).some( - (d) => d.dimension !== 'simpleIndicators' && d.dimension !== 'relay' && d.effectiveCount > 0, - ); + const shortPathResult = handleShortMessageFastPath(lastUserText, input); + if (shortPathResult) return shortPathResult; - if (!hasComplex) { - const shortResult: ScoringResult = { - tier: 'medium', - score: 0.05, - confidence: 0.8, - reason: 'short message - simple request', - dimensions: [], - }; - recordSessionTier('medium'); - logger.debug({ tier: 'medium', reason: 'short_simple_path' }, 'Request scored via short simple path'); - return shortResult; - } - } + const formalLogicResult = handleFormalLogicOverride(fullText, input, userMessages); + if (formalLogicResult) return formalLogicResult; - // ── Formal logic override ── - if (hasFormalLogicKeyword(fullText)) { - const dimensions = computeAllDimensions(input, userMessages, fullText); - const result: ScoringResult = { - tier: 'reasoning', - score: 0.5, - confidence: 0.95, - reason: 'formal logic keyword detected', - dimensions, - }; - recordSessionTier('reasoning'); - logger.debug({ tier: 'reasoning', reason: 'formal_logic_override' }, 'Request scored via formal logic override'); - return result; - } - - // ── Full scoring ── const dimensions = computeAllDimensions(input, userMessages, fullText); - let rawScore = 0; for (const dim of dimensions) { rawScore += dim.weighted; } - // Apply session momentum const momentum = computeSessionMomentum(lastUserText.length); const score = rawScore + momentum; @@ -740,32 +808,9 @@ export function scoreRequest( let confidence = computeConfidence(score); let reason = `scored ${score.toFixed(4)} across 23 dimensions`; - // ── Code generation override: code keywords -> code_generation ── - const codeGenDim = dimensions.find((d) => d.name === 'codeGeneration'); - if (codeGenDim && codeGenDim.rawScore > 0.25) { - tier = 'code_generation'; - reason = 'code generation keywords detected'; - } - - // ── Tool floor: tools present -> minimum medium ── - if (input.tools && input.tools.length > 0 && tier === 'fast') { - tier = 'medium'; - reason = 'tool floor applied (minimum medium with tools)'; - } - - // ── Context floor: >50k total tokens -> minimum large ── const totalChars = input.messages.reduce((sum, m) => sum + (m.content?.length ?? 0), 0); - const estimatedTotalTokens = totalChars / 4; - if (estimatedTotalTokens > 50_000 && (tier === 'fast' || tier === 'medium')) { - tier = 'large'; - reason = 'context floor applied (>50k estimated tokens)'; - } - - // ── Ambiguity check: low confidence -> force medium ── - if (confidence < 0.45) { - tier = 'medium'; - reason = 'ambiguous (confidence < 0.45, defaulting to medium)'; - } + const overrides = applyScoreOverrides({ tier, confidence, reason }, dimensions, input, totalChars); + ({ tier, confidence, reason } = overrides); recordSessionTier(tier); diff --git a/packages/gateway/src/pipeline/router.ts b/packages/gateway/src/pipeline/router.ts index 83ed321..6b03d6e 100644 --- a/packages/gateway/src/pipeline/router.ts +++ b/packages/gateway/src/pipeline/router.ts @@ -194,6 +194,82 @@ const TIER_MODEL_MAP: Record m !== selectedModel)]; + } + return buildFallbackChain(selectedModel, configTier, models); +} + +function buildScoredDecision( + models: ModelsYaml, + mapping: { primary: string; configTier: 'fast' | 'medium' | 'large'; provider?: string }, + selectedModel: string, + configTier: 'fast' | 'medium' | 'large', + fallbackChain: string[], + tierConfig: ModelsYaml['tiers']['fast'], + scoringResult: ScoringResult, + options?: { max_tokens?: number }, +): RouterDecision { + const provider = mapping.provider; + const modelConfig = models.models[selectedModel]; + + logger.info( + { + tier: scoringResult.tier, + model: selectedModel, + provider: provider || 'ollama', + score: scoringResult.score.toFixed(4), + confidence: scoringResult.confidence.toFixed(3), + reason: scoringResult.reason, + }, + 'Dynamic routing decision via request scorer', + ); + + return { + model: selectedModel, + provider, + fallback_chain: fallbackChain, + tier: configTier, + prompt_template: 'default', + temperature: 0.7, + max_tokens: options?.max_tokens ?? modelConfig?.max_tokens_default ?? 2048, + output_format: 'text', + requires_fact_check: false, + validators: [], + ollama_base_url: models.ollama_base_url, + timeout_ms: tierConfig.timeout_ms, + scoringResult, + }; +} + /** * Dynamic routing based on the 23-dimension request scorer. * Use this alongside the static `route()` function — both coexist. @@ -226,60 +302,13 @@ export function routeByScore( const mapping = TIER_MODEL_MAP[scoringResult.tier]; const selectedModel = mapping.primary; const configTier = mapping.configTier; - const provider = mapping.provider; const tierConfig = models.tiers[configTier]; if (!tierConfig) { logger.error({ tier: configTier }, 'Tier config not found in models.yaml, falling back to medium'); - const fallbackTierConfig = models.tiers['medium']!; - return { - model: 'qwen2.5:14b', - fallback_chain: buildFallbackChain('qwen2.5:14b', 'medium', models), - tier: 'medium', - prompt_template: 'default', - temperature: 0.7, - max_tokens: options?.max_tokens ?? 2048, - output_format: 'text', - requires_fact_check: false, - validators: [], - ollama_base_url: models.ollama_base_url, - timeout_ms: fallbackTierConfig.timeout_ms, - scoringResult, - }; + return buildMediumTierFallback(models, options, scoringResult); } - // For reasoning/code_generation tiers, put the primary model first, then fallbacks - const fallbackChain = (scoringResult.tier === 'reasoning' || scoringResult.tier === 'code_generation') - ? [selectedModel, ...buildFallbackChain(selectedModel, configTier, models).filter((m) => m !== selectedModel)] - : buildFallbackChain(selectedModel, configTier, models); - - const modelConfig = models.models[selectedModel]; - - logger.info( - { - tier: scoringResult.tier, - model: selectedModel, - provider: provider || 'ollama', - score: scoringResult.score.toFixed(4), - confidence: scoringResult.confidence.toFixed(3), - reason: scoringResult.reason, - }, - 'Dynamic routing decision via request scorer', - ); - - return { - model: selectedModel, - provider, - fallback_chain: fallbackChain, - tier: configTier, - prompt_template: 'default', - temperature: 0.7, - max_tokens: options?.max_tokens ?? modelConfig?.max_tokens_default ?? 2048, - output_format: 'text', - requires_fact_check: false, - validators: [], - ollama_base_url: models.ollama_base_url, - timeout_ms: tierConfig.timeout_ms, - scoringResult, - }; + const fallbackChain = buildScoredFallbackChain(scoringResult.tier, selectedModel, configTier, models); + return buildScoredDecision(models, mapping, selectedModel, configTier, fallbackChain, tierConfig, scoringResult, options); } diff --git a/packages/gateway/src/routes/completion.ts b/packages/gateway/src/routes/completion.ts index 4c53cd7..95f1946 100644 --- a/packages/gateway/src/routes/completion.ts +++ b/packages/gateway/src/routes/completion.ts @@ -111,377 +111,183 @@ type CompletionRequest = z.infer; // } // } -export async function completionRoute(fastify: FastifyInstance): Promise { - fastify.post( - '/completion', - { - config: { rateLimit: false }, // Custom rate limiting via caller - }, - async (request: FastifyRequest, reply: FastifyReply) => { - const startMs = Date.now(); +async function classifyAndRoute(taskType: string | undefined, caller: string, input: string, options: CompletionRequest['options']): Promise<{ taskType: string; decision: ReturnType; classificationResult?: unknown }> { + let resolved = taskType; + let classificationResult; + if (!resolved) { + try { + classificationResult = await classifyInput(input); + resolved = classificationResult.task_type; + } catch (err) { + logger.warn({ err }, 'Pre-classifier failed'); + resolved = 'generic_qa'; + } + } - let body: CompletionRequest; - try { - body = CompletionRequestSchema.parse(request.body); - } catch (err) { - return reply.status(400).send({ - statusCode: 400, - error: 'Bad Request', - message: err instanceof z.ZodError ? err.errors[0]?.message ?? 'Invalid request' : 'Invalid request body', - }); - } + let decision; + try { + decision = route(resolved, caller, { model: options?.model, temperature: options?.temperature, max_tokens: options?.max_tokens }); + } catch (err) { + throw new Error(err instanceof Error ? err.message : 'Failed to route request'); + } - const { caller, input, language, context, options } = body; - const returnValidationDetails = options?.return_validation_details ?? false; - - // Stage 2: ShieldX scan (real library, 547+ rules, sub-millisecond) - // TODO: Enable ShieldX when dependency is properly linked - // if (!SKIP_SHIELDX_CALLERS.has(caller)) { - // const shieldResult = await runShieldXScan(input, caller); - // if (!shieldResult.passed) { - // requestsTotal.labels({ caller, task_type: 'unknown', status: 'rejected' }).inc(); - // return reply.status(400).send({ - // statusCode: 400, - // error: 'Rejected', - // message: shieldResult.reason ?? 'Input rejected by security scan', - // threat_level: shieldResult.threatLevel, - // kill_chain_phase: shieldResult.phase, - // shieldx_latency_ms: shieldResult.latencyMs, - // }); - // } - // } - - // Generate call ID early for tracking (used by instrumented LLM client) - const callId = `call-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`; - - // Stage 3: Pre-classifier - let taskType = body.task_type; - let classificationResult; - if (!taskType) { - try { - classificationResult = await classifyInput(input); - taskType = classificationResult.task_type; - } catch (err) { - logger.warn({ err }, 'Pre-classifier failed'); - taskType = 'generic_qa'; - } - } - - // Stage 4: Router - let decision; - try { - decision = route(taskType, caller, { - model: options?.model, - temperature: options?.temperature, - max_tokens: options?.max_tokens, - }); - } catch (err) { - return reply.status(400).send({ - statusCode: 400, - error: 'Routing Error', - message: err instanceof Error ? err.message : 'Failed to route request', - }); - } - - // Stage 5: Prompt assembly - // Use taskType directly for template lookup (so tip_transceiver_enrich.yaml is used, - // not the generic_qa fallback from routing). The router only selects the model. - // - // Variable resolution strategy: - // 1. Explicit context fields take priority (callers can pass structured data) - // 2. `input` is used as fallback for ALL common content variables so simple - // one-field callers work without knowing each template's specific var name. - const contextVars = context - ? Object.fromEntries(Object.entries(context).map(([k, v]) => [k, v as string])) - : {}; - - // Common content variable names across all 59 templates — all default to `input` - const inputAliases: Record = { - source_data: input, ocr_text: input, transcription: input, - ticket_content: input, alert_data: input, incident_data: input, - lldp_data: input, cve_data: input, inventory: input, - anomaly_data: input, flagged_input: input, attack_description: input, - bgp_data: input, health_checks: input, market_data: input, - manuscript_text: input, raw_content: input, content: input, - // Additional structured vars with sensible fallbacks - peeringdb_data: input, bgp_routes: input, network_context: input, - alert_context: input, affected_inventory: input, - }; - - const resolved = resolvePrompt( - taskType ?? decision.prompt_template, - { - ...inputAliases, // low priority: input as fallback for all content vars - ...contextVars, // medium priority: explicit context fields override aliases - input, // always available as {{input}} - user_context: context, - }, - language ?? 'en', - ); - - // Stage 6: LLM call (external provider or Ollama with circuit breaker + retry) - let ollamaResponse; - try { - const format: '' | 'json' | undefined = decision.output_format === 'json' ? 'json' : ''; - - const baseReq = { - model: decision.model, - prompt: resolved.prompt, - system: resolved.system, - options: { - temperature: decision.temperature, - num_predict: decision.max_tokens, - }, - format, - stream: false, - callId, - taskType, - }; - - if (decision.provider) { - // Route to external provider as primary (e.g. OpenAI Codex) - ollamaResponse = await callExternalProviderPrimaryInstrumented( - baseReq, - decision.provider, - decision.tier, - decision.fallback_chain, - callId, - taskType, - ); - } else { - // Route to Ollama with fallback chain - ollamaResponse = await callOllamaWithFallbackChainInstrumented( - baseReq, - decision.fallback_chain, - decision.tier, - callId, - taskType, - ); - } - } catch (err) { - const latency = Date.now() - startMs; - logger.error({ err, caller, taskType }, 'Ollama call failed'); - requestsTotal.labels({ caller, task_type: taskType, status: 'rejected' }).inc(); - latencySeconds.labels({ caller, task_type: taskType, model: decision.model }).observe(latency / 1000); - - // Log error to dashboard - const db = getPool(); - const requestLogger = createRequestLogger(db); - const errorMessage = err instanceof Error ? err.message : 'LLM service unavailable'; - void requestLogger.logRequest( - callId, - caller, - taskType, - decision.model, - 'error', - 0, - 0, - 0, - latency, - 0, - false, - errorMessage - ); - - return reply.status(503).send({ - statusCode: 503, - error: 'Service Unavailable', - message: 'LLM service unavailable, please retry', - }); - } - - const outputText = ollamaResponse.response; - const latencyMs = Date.now() - startMs; - - // Stage 7: Post-validation chain - const validationOutput = await runPostValidation(outputText, { - validators: decision.validators, - language, - output_format: decision.output_format, - requires_fact_check: decision.requires_fact_check, - schema: resolved.schema, - }); - - // Stage 8: Confidence gate - const confidenceResult = evaluateConfidence(validationOutput); - - // Record metrics - requestsTotal.labels({ caller, task_type: taskType, status: confidenceResult.status }).inc(); - latencySeconds.labels({ caller, task_type: taskType, model: ollamaResponse.model ?? decision.model }).observe(latencyMs / 1000); - tokensTotal.labels({ direction: 'in', model: decision.model }).inc(ollamaResponse.prompt_eval_count ?? 0); - tokensTotal.labels({ direction: 'out', model: decision.model }).inc(ollamaResponse.eval_count ?? 0); - confidenceScore.labels({ task_type: taskType, model: decision.model }).observe(confidenceResult.score); - - // Record ban hits in metrics - for (const violation of validationOutput.ban_violations) { - banlistHitsTotal.labels({ term: violation.term, language: violation.language, category: violation.category }).inc(); - } - - // Record validation failures - for (const result of validationOutput.results) { - if (!result.passed) { - validationFailuresTotal.labels({ validator: result.validator, task_type: taskType }).inc(); - } - } - - // Stage 9: Audit log - const inputHash = hashText(input); - const outputHash = hashText(outputText); - - await writeAuditLog({ - caller, - task_type: taskType, - model_used: decision.model, - prompt_id: resolved.prompt_id, - prompt_version: resolved.prompt_version, - input_hash: inputHash, - output_text: confidenceResult.status !== 'pending_review' ? outputText : undefined, - output_hash: outputHash, - token_count_in: ollamaResponse.prompt_eval_count ?? 0, - token_count_out: ollamaResponse.eval_count ?? 0, - latency_ms: latencyMs, - confidence: confidenceResult.score, - status: confidenceResult.status, - validation_log: validationOutput.results, - ban_hits: validationOutput.ban_violations, - metadata: { - classification: classificationResult, - model_tier: decision.tier, - fallback_used: ollamaResponse.model !== decision.model, - }, - }); - - // Write ban analytics - if (validationOutput.ban_violations.length > 0 && callId) { - void writeBanAnalytics(callId, validationOutput.ban_violations, caller, taskType); - } - - // Add to review queue if pending_review - if (confidenceResult.status === 'pending_review' && callId) { - void addToReviewQueue({ - callId, - caller, - taskType, - inputText: input, - outputText, - confidence: confidenceResult.score, - validationLog: validationOutput.results, - }); - } - - // Track cost and compression metrics - let costUsd = 0; - let costSavedUsd = 0; - if (callId) { - const db = getPool(); - const tokensIn = ollamaResponse.prompt_eval_count ?? 0; - const tokensOut = ollamaResponse.eval_count ?? 0; - const tokensCompressed = tokensIn + tokensOut; // TODO: actual compression from RTK layer - costUsd = calculateCost(decision.model, tokensIn, tokensOut); - costSavedUsd = calculateSavings(decision.model, tokensCompressed, tokensCompressed); // 0 until RTK compression data available - - void logCostImpact( - db, - callId, - { - callId, - agent: 'gateway', - model: decision.model, - project: 'llm-gateway', - taskType: taskType ?? 'generic', - }, - tokensIn, - tokensOut, - tokensCompressed, - costUsd, - costSavedUsd, - confidenceResult.score, - ); - - // Record routing decision for learning engine - void recordRoutingDecision({ - callId, - taskType: taskType ?? 'generic', - caller, - routingModel: decision.model, - routingTier: decision.tier, - actualModelUsed: ollamaResponse.model ?? decision.model, - wasFallback: ollamaResponse.model !== decision.model, - success: confidenceResult.status === 'approved', - confidenceFinal: confidenceResult.score, - tokensIn, - tokensOut, - latencyMs, - costUsd, - }); - - // Broadcast real-time update to connected SSE clients - costStream.broadcast({ - callId, - project: 'llm-gateway', - taskType: taskType ?? 'generic', - model: decision.model, - costUsd, - costSavedUsd, - tokensIn, - tokensOut, - confidence: confidenceResult.score, - timestamp: new Date().toISOString(), - }); - - // Log request to dashboard - const requestLogger = createRequestLogger(db); - void requestLogger.logRequest( - callId, - caller, - taskType, - decision.model, - confidenceResult.status as 'approved' | 'warning' | 'pending_review' | 'rejected' | 'error', - tokensIn, - tokensOut, - costUsd, - latencyMs, - confidenceResult.score, - ollamaResponse.model !== decision.model, - undefined // No error message for successful requests - ); - } - - // Stage 10: Response - const responseBody: Record = { - id: callId, - status: confidenceResult.status, - confidence: Math.round(confidenceResult.score * 100) / 100, - model: decision.model, - task_type: taskType, - latency_ms: latencyMs, - tokens: { - in: ollamaResponse.prompt_eval_count ?? 0, - out: ollamaResponse.eval_count ?? 0, - }, - cost: { - usd: costUsd, - saved_usd: costSavedUsd, - }, - }; - - if (confidenceResult.status !== 'pending_review') { - responseBody['output'] = outputText; - } else { - responseBody['output'] = null; - responseBody['message'] = 'Output is pending human review due to low confidence'; - } - - if (returnValidationDetails) { - responseBody['validation'] = validationOutput.results; - responseBody['confidence_detail'] = { - base_score: confidenceResult.base_score, - total_impact: confidenceResult.total_impact, - final_score: confidenceResult.score, - }; - } - - return reply.status(200).send(responseBody); - }, - ); + return { taskType: resolved, decision, classificationResult }; +} + +function buildPromptVariables(input: string, context: Record | undefined): Record & { input: string } { + const contextVars = context ? Object.fromEntries(Object.entries(context).map(([k, v]) => [k, v as string])) : {}; + const inputAliases: Record = { + source_data: input, ocr_text: input, transcription: input, ticket_content: input, alert_data: input, + incident_data: input, lldp_data: input, cve_data: input, inventory: input, anomaly_data: input, + flagged_input: input, attack_description: input, bgp_data: input, health_checks: input, market_data: input, + manuscript_text: input, raw_content: input, content: input, peeringdb_data: input, bgp_routes: input, + network_context: input, alert_context: input, affected_inventory: input, + }; + return { ...inputAliases, ...contextVars, input, user_context: context }; +} + +async function callLLMWithFallback(baseReq: any, decision: ReturnType, callId: string, taskType: string): Promise { + if (decision.provider) { + return await callExternalProviderPrimaryInstrumented(baseReq, decision.provider, decision.tier, decision.fallback_chain, callId, taskType); + } + return await callOllamaWithFallbackChainInstrumented(baseReq, decision.fallback_chain, decision.tier, callId, taskType); +} + +function recordAllMetrics(caller: string, taskType: string, confidenceResult: any, ollamaResponse: any, decision: ReturnType, validationOutput: any): void { + requestsTotal.labels({ caller, task_type: taskType, status: confidenceResult.status }).inc(); + latencySeconds.labels({ caller, task_type: taskType, model: ollamaResponse.model ?? decision.model }).observe(0); + tokensTotal.labels({ direction: 'in', model: decision.model }).inc(ollamaResponse.prompt_eval_count ?? 0); + tokensTotal.labels({ direction: 'out', model: decision.model }).inc(ollamaResponse.eval_count ?? 0); + confidenceScore.labels({ task_type: taskType, model: decision.model }).observe(confidenceResult.score); + for (const violation of validationOutput.ban_violations) { + banlistHitsTotal.labels({ term: violation.term, language: violation.language, category: violation.category }).inc(); + } + for (const result of validationOutput.results) { + if (!result.passed) { + validationFailuresTotal.labels({ validator: result.validator, task_type: taskType }).inc(); + } + } +} + +async function auditAndTrackCosts(caller: string, taskType: string, input: string, outputText: string, latencyMs: number, ollamaResponse: any, resolved: any, decision: ReturnType, confidenceResult: any, validationOutput: any, classificationResult: any, callId: string): Promise<{ costUsd: number; costSavedUsd: number }> { + const inputHash = hashText(input); + const outputHash = hashText(outputText); + + await writeAuditLog({ + caller, task_type: taskType, model_used: decision.model, prompt_id: resolved.prompt_id, prompt_version: resolved.prompt_version, + input_hash: inputHash, output_text: confidenceResult.status !== 'pending_review' ? outputText : undefined, output_hash: outputHash, + token_count_in: ollamaResponse.prompt_eval_count ?? 0, token_count_out: ollamaResponse.eval_count ?? 0, latency_ms: latencyMs, + confidence: confidenceResult.score, status: confidenceResult.status, validation_log: validationOutput.results, ban_hits: validationOutput.ban_violations, + metadata: { classification: classificationResult, model_tier: decision.tier, fallback_used: ollamaResponse.model !== decision.model }, + }); + + if (validationOutput.ban_violations.length > 0) { + void writeBanAnalytics(callId, validationOutput.ban_violations, caller, taskType); + } + + if (confidenceResult.status === 'pending_review') { + void addToReviewQueue({ callId, caller, taskType, inputText: input, outputText, confidence: confidenceResult.score, validationLog: validationOutput.results }); + } + + const db = getPool(); + const tokensIn = ollamaResponse.prompt_eval_count ?? 0; + const tokensOut = ollamaResponse.eval_count ?? 0; + const tokensCompressed = tokensIn + tokensOut; + const costUsd = calculateCost(decision.model, tokensIn, tokensOut); + const costSavedUsd = calculateSavings(decision.model, tokensCompressed, tokensCompressed); + + void logCostImpact(db, callId, { callId, agent: 'gateway', model: decision.model, project: 'llm-gateway', taskType: taskType ?? 'generic' }, tokensIn, tokensOut, tokensCompressed, costUsd, costSavedUsd, confidenceResult.score); + + void recordRoutingDecision({ callId, taskType: taskType ?? 'generic', caller, routingModel: decision.model, routingTier: decision.tier, actualModelUsed: ollamaResponse.model ?? decision.model, wasFallback: ollamaResponse.model !== decision.model, success: confidenceResult.status === 'approved', confidenceFinal: confidenceResult.score, tokensIn, tokensOut, latencyMs, costUsd }); + + costStream.broadcast({ callId, project: 'llm-gateway', taskType: taskType ?? 'generic', model: decision.model, costUsd, costSavedUsd, tokensIn, tokensOut, confidence: confidenceResult.score, timestamp: new Date().toISOString() }); + + const requestLogger = createRequestLogger(db); + void requestLogger.logRequest(callId, caller, taskType, decision.model, confidenceResult.status as 'approved' | 'warning' | 'pending_review' | 'rejected' | 'error', tokensIn, tokensOut, costUsd, latencyMs, confidenceResult.score, ollamaResponse.model !== decision.model, undefined); + + return { costUsd, costSavedUsd }; +} + +function buildResponseBody(callId: string, decision: ReturnType, taskType: string, confidenceResult: any, outputText: string, latencyMs: number, ollamaResponse: any, costUsd: number, costSavedUsd: number, returnValidationDetails: boolean, validationOutput: any): Record { + const body: Record = { + id: callId, status: confidenceResult.status, confidence: Math.round(confidenceResult.score * 100) / 100, + model: decision.model, task_type: taskType, latency_ms: latencyMs, + tokens: { in: ollamaResponse.prompt_eval_count ?? 0, out: ollamaResponse.eval_count ?? 0 }, + cost: { usd: costUsd, saved_usd: costSavedUsd }, + }; + if (confidenceResult.status !== 'pending_review') { + body['output'] = outputText; + } else { + body['output'] = null; + body['message'] = 'Output is pending human review due to low confidence'; + } + if (returnValidationDetails) { + body['validation'] = validationOutput.results; + body['confidence_detail'] = { base_score: confidenceResult.base_score, total_impact: confidenceResult.total_impact, final_score: confidenceResult.score }; + } + return body; +} + +export async function completionRoute(fastify: FastifyInstance): Promise { + fastify.post('/completion', { config: { rateLimit: false } }, async (request: FastifyRequest, reply: FastifyReply) => { + const startMs = Date.now(); + + let body: CompletionRequest; + try { + body = CompletionRequestSchema.parse(request.body); + } catch (err) { + return reply.status(400).send({ + statusCode: 400, error: 'Bad Request', + message: err instanceof z.ZodError ? err.errors[0]?.message ?? 'Invalid request' : 'Invalid request body', + }); + } + + const { caller, input, language, context, options } = body; + const callId = `call-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`; + + let classifAndRoute; + try { + classifAndRoute = await classifyAndRoute(body.task_type, caller, input, options); + } catch (err) { + return reply.status(400).send({ + statusCode: 400, error: 'Routing Error', + message: err instanceof Error ? err.message : 'Failed to route request', + }); + } + + const { taskType, decision, classificationResult } = classifAndRoute; + const promptVars = buildPromptVariables(input, context); + const resolved = resolvePrompt(taskType ?? decision.prompt_template, promptVars, language ?? 'en'); + + const format: '' | 'json' | undefined = decision.output_format === 'json' ? 'json' : ''; + const baseReq = { model: decision.model, prompt: resolved.prompt, system: resolved.system, options: { temperature: decision.temperature, num_predict: decision.max_tokens }, format, stream: false, callId, taskType }; + + let ollamaResponse; + try { + ollamaResponse = await callLLMWithFallback(baseReq, decision, callId, taskType); + } catch (err) { + const latency = Date.now() - startMs; + logger.error({ err, caller, taskType }, 'Ollama call failed'); + requestsTotal.labels({ caller, task_type: taskType, status: 'rejected' }).inc(); + latencySeconds.labels({ caller, task_type: taskType, model: decision.model }).observe(latency / 1000); + const db = getPool(); + const requestLogger = createRequestLogger(db); + void requestLogger.logRequest(callId, caller, taskType, decision.model, 'error', 0, 0, 0, latency, 0, false, err instanceof Error ? err.message : 'LLM service unavailable'); + return reply.status(503).send({ statusCode: 503, error: 'Service Unavailable', message: 'LLM service unavailable, please retry' }); + } + + const latencyMs = Date.now() - startMs; + const outputText = ollamaResponse.response; + const validationOutput = await runPostValidation(outputText, { validators: decision.validators, language, output_format: decision.output_format, requires_fact_check: decision.requires_fact_check, schema: resolved.schema }); + const confidenceResult = evaluateConfidence(validationOutput); + + recordAllMetrics(caller, taskType, confidenceResult, ollamaResponse, decision, validationOutput); + const { costUsd, costSavedUsd } = await auditAndTrackCosts(caller, taskType, input, outputText, latencyMs, ollamaResponse, resolved, decision, confidenceResult, validationOutput, classificationResult, callId); + + // Fix latency observation after computation + latencySeconds.labels({ caller, task_type: taskType, model: ollamaResponse.model ?? decision.model }).observe(latencyMs / 1000); + + const responseBody = buildResponseBody(callId, decision, taskType, confidenceResult, outputText, latencyMs, ollamaResponse, costUsd, costSavedUsd, options?.return_validation_details ?? false, validationOutput); + return reply.status(200).send(responseBody); + }); } diff --git a/packages/gateway/src/routes/dashboard.ts b/packages/gateway/src/routes/dashboard.ts index 194338a..1874821 100644 --- a/packages/gateway/src/routes/dashboard.ts +++ b/packages/gateway/src/routes/dashboard.ts @@ -3,6 +3,7 @@ import { getPool } from '../db/client.js'; import { logger } from '../observability/logger.js'; import { createRequestLogger } from '../modules/request-logger.js'; import { globalRequestStream } from '../modules/request-stream.js'; +import { getAvailableProviders } from '../pipeline/external-providers.js'; interface DashboardSummary { totalCost: number; @@ -494,6 +495,78 @@ export async function dashboardRoute(fastify: FastifyInstance): Promise { return reply.send({ test: 'ok', message: 'Test endpoint is working' }); }); + // Providers endpoint - lists all available LLM providers (local, subscription, free-tier) + fastify.get('/api/dashboard/providers', async (_request: FastifyRequest, reply: FastifyReply) => { + try { + const availableProviders = await getAvailableProviders(); + + // Categorize providers by type + const providers = availableProviders.map(provider => { + let type: 'local' | 'subscription' | 'free' = 'free'; + let status: 'configured' | 'unconfigured' | 'unavailable' = 'unconfigured'; + + // Determine provider type based on name + if (provider.name.toLowerCase().includes('ollama')) { + type = 'local'; + status = provider.enabled ? 'configured' : 'unconfigured'; + } else if (['claude-bridge', 'openai-bridge', 'chatgpt-bridge', 'copilot-bridge'].includes(provider.name)) { + type = 'subscription'; + status = provider.enabled && process.env[provider.envKey] ? 'configured' : 'unconfigured'; + } else { + type = 'free'; + status = provider.enabled && process.env[provider.envKey] ? 'configured' : 'unconfigured'; + } + + return { + name: provider.name, + type, + status, + enabled: provider.enabled, + models: provider.models.map(m => ({ + id: m.id, + tier: m.tier, + contextLength: m.contextLength + })), + rateLimitRpm: provider.rateLimitRpm, + baseUrl: provider.baseUrl + }; + }); + + // Group by type for easy UI rendering + const grouped = { + local: providers.filter(p => p.type === 'local'), + subscription: providers.filter(p => p.type === 'subscription'), + free: providers.filter(p => p.type === 'free') + }; + + return reply.send({ + success: true, + data: { + grouped, + all: providers, + summary: { + totalProviders: providers.length, + configuredCount: providers.filter(p => p.status === 'configured').length, + byType: { + local: grouped.local.length, + subscription: grouped.subscription.length, + free: grouped.free.length + } + } + }, + meta: { + timestamp: new Date().toISOString() + } + }); + } catch (error) { + logger.error({ error }, 'Failed to fetch providers'); + return reply.status(500).send({ + success: false, + error: 'Failed to fetch provider information' + }); + } + }); + // Dashboard UI endpoint (served at /api/dashboard/index for Cloudflare tunnel compatibility) fastify.get('/api/dashboard/index', async (_request: FastifyRequest, reply: FastifyReply) => { try {