/**
 * Prompt Optimizer — uses the LLM to improve its own prompts.
 *
 * Algorithm:
 * 1. For each active task_type with at least 20 calls in the last 7 days:
 *    - Pull up to 5 highest + 5 lowest confidence outputs
 *    - Pull human-edited gold examples reviewed in the same window
 *    - Pull the top 5 ban_list violations for this task_type
 * 2. Send to LLM (internal-prompt-improve) for analysis
 * 3. Store candidate improved prompt
 * 4. Auto-apply for non-sensitive task_types if confidence delta >= 0.3
 * 5. Queue for human review otherwise (sensitive task_types, or a lower delta)
 */

import { readFileSync, writeFileSync } from 'fs';
import { fileURLToPath } from 'url';
import { join, resolve } from 'path';
import yaml from 'js-yaml';
import { query, withTransaction } from '../db/client.js';
import { callGateway } from '../gateway-client.js';
import { logger } from '../observability/logger.js';
import { bumpMinorVersion } from '../few-shot-curator/index.js';

// ─── Constants ──────────────────────────────────────────────────────────────

const _dir = fileURLToPath(new URL('.', import.meta.url));
const _defaultTemplatesDir = resolve(join(_dir, '..', '..', '..', 'gateway', 'prompts', 'templates'));
const TEMPLATES_DIR = process.env['TEMPLATES_DIR'] ?? _defaultTemplatesDir;

// Task types that MUST have human review before prompt updates go live
const SENSITIVE_TASK_TYPES = new Set([
  'linkedin-post-de',
  'newsletter-dispatch-de',
  'infra-x-edit-review',
]);

const MIN_CALLS_FOR_OPTIMIZATION = 20;
const MIN_CONFIDENCE_DELTA_FOR_AUTO_APPLY = 0.3;
const LOOKBACK_DAYS = 7;

// A candidate prompt shorter than this fraction of the current prompt is rejected.
const MIN_PROMPT_LENGTH_RATIO = 0.8;

// ─── Types ──────────────────────────────────────────────────────────────────

interface SampleOutput {
  id: string;
  task_type: string;
  // Comes from a LEFT JOIN on review_queue, so it is NULL when no review row exists.
  input_text: string | null;
  output_text: string;
  confidence: number;
}

interface GoldEdit {
  input_text: string;
  // The gold query does not filter this column for NULL, so treat it as nullable.
  original_output: string | null;
  edited_output: string;
  reviewer_notes: string | null;
}

interface BanViolation {
  term: string;
  count: number;
}

interface LlmImprovementResponse {
  analysis: {
    main_problems: string[];
    main_strengths: string[];
  };
  improved_system_prompt: string;
  changes_made: string[];
  expected_improvements: string[];
}

interface PromptTemplate {
  id: string;
  version: string;
  system_prompt?: string;
  system_prompt_de?: string;
  negative_examples?: Array<{ input: string; bad_output: string; why_bad: string }>;
  [key: string]: unknown;
}

// ─── Template I/O ────────────────────────────────────────────────────────────

/** Returns true when `err` carries the Node.js "file does not exist" error code. */
function isMissingFileError(err: unknown): boolean {
  return typeof err === 'object' && err !== null && (err as { code?: unknown }).code === 'ENOENT';
}

/**
 * Loads the YAML template for a task type from TEMPLATES_DIR.
 *
 * The file name is the task type with every `-` replaced by `_`, plus `.yaml`.
 *
 * @param taskType Task type whose template should be loaded.
 * @returns The parsed template and its path, or `null` when the file is missing,
 *          unreadable, unparsable, or does not contain a YAML mapping.
 */
function loadTemplateForTask(taskType: string): { template: PromptTemplate; filePath: string } | null {
  const normalized = taskType.replace(/-/g, '_');
  const filePath = join(TEMPLATES_DIR, `${normalized}.yaml`);

  try {
    const content = readFileSync(filePath, 'utf-8');
    const loaded: unknown = yaml.load(content);

    // yaml.load yields undefined/scalars/arrays for empty or odd files; only a mapping is a template.
    if (typeof loaded !== 'object' || loaded === null || Array.isArray(loaded)) {
      logger.warn({ taskType, filePath }, 'Prompt template is not a YAML mapping, skipping');
      return null;
    }

    return { template: loaded as PromptTemplate, filePath };
  } catch (err) {
    // A missing template just means there is nothing to optimize; anything else is worth surfacing.
    if (!isMissingFileError(err)) {
      logger.warn({ err, taskType, filePath }, 'Failed to load prompt template');
    }
    return null;
  }
}

/**
 * Serializes a template to YAML and writes it to `filePath`, replacing the file.
 *
 * @param filePath Destination path of the template file.
 * @param template Template to persist.
 */
function writeTemplate(filePath: string, template: PromptTemplate): void {
  const content = yaml.dump(template, { lineWidth: 120, quotingType: '"' });
  writeFileSync(filePath, content, 'utf-8');
}

// ─── Data gathering ──────────────────────────────────────────────────────────

/**
 * Collects the evidence the improver LLM needs for one task type.
 *
 * @param taskType Task type to gather samples for.
 * @returns High/low confidence samples, human edits and ban violations from the
 *          lookback window, or `null` when the task type has fewer than
 *          MIN_CALLS_FOR_OPTIMIZATION calls or no samples on either side.
 */
async function gatherTaskData(taskType: string): Promise<{
  positive: SampleOutput[];
  negative: SampleOutput[];
  gold: GoldEdit[];
  banViolations: BanViolation[];
} | null> {
  // Check call volume
  const volumeResult = await query<{ cnt: number }>(
    `SELECT COUNT(*)::int AS cnt
     FROM llm_calls
     WHERE task_type = $1
       AND created_at > now() - interval '${LOOKBACK_DAYS} days'`,
    [taskType],
  );

  // Number() accepts both a numeric and a string-encoded count from the driver.
  const volume = Number(volumeResult.rows[0]?.cnt ?? 0);
  if (!Number.isFinite(volume) || volume < MIN_CALLS_FOR_OPTIMIZATION) return null;

  // Positive examples (highest confidence)
  const posResult = await query<SampleOutput>(
    `SELECT lc.id, lc.task_type, rq.input_text, lc.output_text, lc.confidence::float as confidence
     FROM llm_calls lc
     LEFT JOIN review_queue rq ON rq.call_id = lc.id
     WHERE lc.task_type = $1
       AND lc.confidence >= 8.0
       AND lc.status = 'approved'
       AND lc.output_text IS NOT NULL
       AND lc.created_at > now() - interval '${LOOKBACK_DAYS} days'
     ORDER BY lc.confidence DESC
     LIMIT 5`,
    [taskType],
  );

  // Negative examples (lowest confidence)
  const negResult = await query<SampleOutput>(
    `SELECT lc.id, lc.task_type, rq.input_text, lc.output_text, lc.confidence::float as confidence
     FROM llm_calls lc
     LEFT JOIN review_queue rq ON rq.call_id = lc.id
     WHERE lc.task_type = $1
       AND lc.confidence <= 5.0
       AND lc.output_text IS NOT NULL
       AND lc.created_at > now() - interval '${LOOKBACK_DAYS} days'
     ORDER BY lc.confidence ASC
     LIMIT 5`,
    [taskType],
  );

  // Gold examples from human edits
  const goldResult = await query<GoldEdit>(
    `SELECT rq.input_text, rq.output_text as original_output, rq.edited_output, rq.reviewer_notes
     FROM review_queue rq
     WHERE rq.task_type = $1
       AND rq.decision = 'edited'
       AND rq.edited_output IS NOT NULL
       AND rq.reviewed_at > now() - interval '${LOOKBACK_DAYS} days'`,
    [taskType],
  );

  // Ban violations for this task type
  const banResult = await query<BanViolation>(
    `SELECT term, COUNT(*)::int as count
     FROM ban_analytics
     WHERE task_type = $1
       AND created_at > now() - interval '${LOOKBACK_DAYS} days'
     GROUP BY term
     ORDER BY count DESC
     LIMIT 5`,
    [taskType],
  );

  // Without any sample on either side there is nothing for the improver to compare.
  if (posResult.rows.length === 0 && negResult.rows.length === 0) return null;

  return {
    positive: posResult.rows,
    negative: negResult.rows,
    gold: goldResult.rows,
    banViolations: banResult.rows,
  };
}

// ─── LLM improvement call ───────────────────────────────────────────────────

/**
 * Builds the JSON payload sent to the `internal-prompt-improve` task.
 *
 * Sample outputs are truncated to 400 characters and human edits to 200
 * characters per side to keep the request small.
 *
 * @param currentPrompt System prompt currently in the template.
 * @param positive High-confidence samples.
 * @param negative Low-confidence samples.
 * @param gold Human-edited outputs.
 * @param banViolations Most frequent banned terms.
 * @returns The JSON-encoded request body.
 */
function buildImprovementPrompt(
  currentPrompt: string,
  positive: SampleOutput[],
  negative: SampleOutput[],
  gold: GoldEdit[],
  banViolations: BanViolation[],
): string {
  const formatSample = (s: SampleOutput, idx: number) =>
    `[${idx + 1}] Confidence: ${s.confidence.toFixed(1)}\n${s.output_text.slice(0, 400)}`;

  // original_output may be NULL in the database; fall back to an empty string instead of throwing.
  const formatGold = (g: GoldEdit, idx: number) =>
    `[${idx + 1}] Human edit:\nOriginal: ${(g.original_output ?? '').slice(0, 200)}\nCorrected: ${g.edited_output.slice(0, 200)}${g.reviewer_notes ? `\nNote: ${g.reviewer_notes}` : ''}`;

  return JSON.stringify({
    current_system_prompt: currentPrompt,
    positive_examples: positive.map(formatSample).join('\n\n'),
    negative_examples: negative.map(formatSample).join('\n\n'),
    human_edits: gold.map(formatGold).join('\n\n'),
    ban_violations: banViolations.map((b) => `"${b.term}" (${b.count} times)`).join(', '),
  });
}

/** Keeps only the string entries of `value`; anything that is not an array becomes `[]`. */
function toStringArray(value: unknown): string[] {
  return Array.isArray(value) ? value.filter((v): v is string => typeof v === 'string') : [];
}

/**
 * Parses and validates the improver's JSON output.
 *
 * List fields that are missing or malformed are normalized to empty arrays so
 * later `.join(...)` calls cannot throw.
 *
 * @param raw Raw text returned by the gateway.
 * @returns The normalized response, or `null` when `improved_system_prompt` is not
 *          a non-empty string or `analysis` is not an object.
 * @throws SyntaxError when `raw` is not valid JSON.
 */
function parseImprovementResponse(raw: string): LlmImprovementResponse | null {
  const parsed: unknown = JSON.parse(raw);
  if (typeof parsed !== 'object' || parsed === null) return null;

  const fields = parsed as Record<string, unknown>;
  const improvedPrompt = fields['improved_system_prompt'];
  const analysis = fields['analysis'];

  if (typeof improvedPrompt !== 'string' || improvedPrompt.length === 0) return null;
  if (typeof analysis !== 'object' || analysis === null) return null;

  const analysisFields = analysis as Record<string, unknown>;
  return {
    analysis: {
      main_problems: toStringArray(analysisFields['main_problems']),
      main_strengths: toStringArray(analysisFields['main_strengths']),
    },
    improved_system_prompt: improvedPrompt,
    changes_made: toStringArray(fields['changes_made']),
    expected_improvements: toStringArray(fields['expected_improvements']),
  };
}

/**
 * Asks the gateway's `internal-prompt-improve` task for an improved prompt.
 *
 * @param input JSON payload produced by buildImprovementPrompt.
 * @returns The validated response, or `null` when the call fails, the output is
 *          not valid JSON, or the response is malformed. Never throws.
 */
async function callPromptImprover(input: string): Promise<LlmImprovementResponse | null> {
  try {
    const result = await callGateway({
      taskType: 'internal-prompt-improve',
      input,
      caller: 'internal',
    });

    const parsed = parseImprovementResponse(result.output);
    if (!parsed) {
      logger.warn({ output: result.output.slice(0, 200) }, 'Malformed LLM improvement response');
      return null;
    }

    return parsed;
  } catch (err) {
    logger.error({ err }, 'Prompt improvement LLM call failed');
    return null;
  }
}

// ─── Test improved prompt ────────────────────────────────────────────────────

/**
 * Estimates the confidence delta of a candidate prompt WITHOUT running it.
 *
 * This is a keyword heuristic, not a measurement: the candidate scores
 * +0.2 for containing "NEVER" or "DO NOT", +0.15 for "ALWAYS" or "MUST", and
 * +0.1 for being longer than 200 characters. The sum is then scaled by the
 * number of test inputs considered (at most 3) divided by 3, so fewer
 * low-confidence samples yield a smaller estimate.
 *
 * @param taskType Task type of the candidate (currently unused).
 * @param newPrompt Candidate system prompt.
 * @param testInputs Low-confidence samples; only their count (capped at 3) is used.
 * @returns The estimated delta, or 0 when there are no test inputs.
 */
async function testImprovedPrompt(
  taskType: string,
  newPrompt: string,
  testInputs: SampleOutput[],
): Promise<number> {
  if (testInputs.length === 0) return 0;

  // A real evaluation would run the gateway with the candidate prompt against
  // these inputs; here only the number of inputs feeds the estimate.
  const inputs = testInputs.slice(0, 3);
  let totalConfDelta = 0;

  // Heuristic: explicit prohibitions and explicit positive guidance in the
  // candidate are taken as signs that it addresses ban violations and gold edits.
  const hasNewProhibitions = newPrompt.includes('NEVER') || newPrompt.includes('DO NOT');
  const hasPositiveGuidance = newPrompt.includes('ALWAYS') || newPrompt.includes('MUST');

  totalConfDelta += hasNewProhibitions ? 0.2 : 0;
  totalConfDelta += hasPositiveGuidance ? 0.15 : 0;
  totalConfDelta += newPrompt.length > 200 ? 0.1 : 0;

  return totalConfDelta / 3 * inputs.length;
}

// ─── Apply prompt change ─────────────────────────────────────────────────────

/**
 * Writes the improved prompt into the template file and records the new version.
 *
 * Steps, in order: bump the template's minor version, overwrite the template
 * file, insert a `prompt_versions` row (ignored on a prompt_id/version conflict),
 * and mark the `prompt_candidates` row as auto-applied.
 *
 * NOTE(review): the file write and the two database statements are not atomic —
 * if a statement fails, the template file has already been replaced. Consider
 * wrapping the database work in a transaction and restoring the file on failure.
 *
 * @param taskType Task type the template belongs to.
 * @param template Template as currently loaded from disk.
 * @param filePath Path of the template file to overwrite.
 * @param improvement Validated improver response.
 * @param currentPromptKey Template field that receives the improved prompt.
 * @param candidateId Id of the `prompt_candidates` row to mark as applied.
 */
async function applyPromptCandidate(
  taskType: string,
  template: PromptTemplate,
  filePath: string,
  improvement: LlmImprovementResponse,
  currentPromptKey: 'system_prompt' | 'system_prompt_de',
  candidateId: string,
): Promise<void> {
  const newVersion = bumpMinorVersion(template.version);

  const updatedTemplate: PromptTemplate = {
    ...template,
    version: newVersion,
    [currentPromptKey]: improvement.improved_system_prompt,
  };

  writeTemplate(filePath, updatedTemplate);

  // Record in prompt_versions, storing exactly what ended up on disk
  const templateYaml = readFileSync(filePath, 'utf-8');
  await query(
    `INSERT INTO prompt_versions (prompt_id, version, task_type, template_yaml, active, deployed_by, notes)
     VALUES ($1, $2, $3, $4, true, 'prompt-optimizer', $5)
     ON CONFLICT (prompt_id, version) DO NOTHING`,
    [
      template.id,
      newVersion,
      taskType,
      templateYaml,
      improvement.changes_made.join('; '),
    ],
  );

  // Mark candidate as applied
  await query(
    `UPDATE prompt_candidates
     SET auto_applied = true, applied_at = now(), candidate_version = $1
     WHERE id = $2`,
    [newVersion, candidateId],
  );

  logger.info(
    { taskType, version: newVersion, changes: improvement.changes_made },
    'Prompt candidate auto-applied',
  );
}

// ─── Main job ────────────────────────────────────────────────────────────────

/**
 * Runs one pass of the prompt optimizer over every recently active task type.
 *
 * Task types starting with `internal-` or `pre_classify` are excluded. For each
 * remaining task type a candidate prompt is generated and stored; it is then
 * either auto-applied (non-sensitive and estimated delta at or above the
 * threshold) or queued in `review_queue` for a human decision. A failure for one
 * task type is logged and does not stop the others.
 *
 * @throws If the initial query for active task types fails.
 */
export async function runPromptOptimizer(): Promise<void> {
  const startedAt = Date.now();
  logger.info('Prompt optimizer job started');

  // Get all distinct active task_types from recent calls
  const taskTypesResult = await query<{ task_type: string }>(
    `SELECT DISTINCT task_type
     FROM llm_calls
     WHERE created_at > now() - interval '${LOOKBACK_DAYS} days'
       AND task_type NOT LIKE 'internal-%'
       AND task_type NOT LIKE 'pre_classify%'
     ORDER BY task_type`,
  );

  const taskTypes = taskTypesResult.rows.map((r) => r.task_type);
  logger.info({ count: taskTypes.length }, 'Found active task types');

  let versionsCreated = 0;
  let autoApplied = 0;
  let pendingReview = 0;

  for (const taskType of taskTypes) {
    try {
      const data = await gatherTaskData(taskType);
      if (!data) continue;

      const loaded = loadTemplateForTask(taskType);
      if (!loaded) continue;

      const { template, filePath } = loaded;

      // NOTE(review): only `system_prompt` is optimized; templates that define
      // just `system_prompt_de` are skipped here — confirm this is intended.
      const currentPrompt = typeof template.system_prompt === 'string' ? template.system_prompt : '';
      if (!currentPrompt) continue;

      // Build and send improvement request
      const input = buildImprovementPrompt(
        currentPrompt,
        data.positive,
        data.negative,
        data.gold,
        data.banViolations,
      );

      const improvement = await callPromptImprover(input);
      if (!improvement) continue;

      // Validate: new prompt must be at least 80% as long as the current one
      if (improvement.improved_system_prompt.length < currentPrompt.length * MIN_PROMPT_LENGTH_RATIO) {
        logger.warn({ taskType }, 'Improved prompt is too short, skipping');
        continue;
      }

      // Estimate confidence delta
      const estimatedDelta = await testImprovedPrompt(
        taskType,
        improvement.improved_system_prompt,
        data.negative,
      );

      const newVersion = bumpMinorVersion(template.version);

      // Store candidate
      const insertResult = await query<{ id: string }>(
        `INSERT INTO prompt_candidates
           (template_id, current_version, candidate_version, current_system_prompt,
            candidate_system_prompt, improvement_rationale, changes_made,
            expected_improvements, test_confidence_delta)
         VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
         RETURNING id`,
        [
          template.id,
          template.version,
          newVersion,
          currentPrompt,
          improvement.improved_system_prompt,
          improvement.analysis.main_problems.join('; '),
          improvement.changes_made,
          improvement.expected_improvements,
          estimatedDelta,
        ],
      );

      const candidateId = insertResult.rows[0]?.id;
      if (!candidateId) continue;

      versionsCreated++;

      const isSensitive = SENSITIVE_TASK_TYPES.has(taskType);
      const meetsAutoApplyThreshold = estimatedDelta >= MIN_CONFIDENCE_DELTA_FOR_AUTO_APPLY;

      if (!isSensitive && meetsAutoApplyThreshold) {
        await applyPromptCandidate(
          taskType,
          template,
          filePath,
          improvement,
          'system_prompt',
          candidateId,
        );
        autoApplied++;
      } else {
        // Queue for human review
        const humanReviewInput = [
          `Task type: ${taskType}`,
          `Current version: ${template.version} → Proposed: ${newVersion}`,
          `Problems identified: ${improvement.analysis.main_problems.join(', ')}`,
          `Changes: ${improvement.changes_made.join(', ')}`,
          '',
          'CURRENT PROMPT:',
          currentPrompt.slice(0, 500),
          '',
          'PROPOSED PROMPT:',
          improvement.improved_system_prompt.slice(0, 500),
        ].join('\n');

        await query(
          `INSERT INTO review_queue
             (call_id, caller, task_type, input_text, output_text, confidence, validation_log)
           VALUES (NULL, 'prompt-optimizer', $1, $2, $3, $4, '[]')`,
          [taskType, humanReviewInput, improvement.improved_system_prompt, estimatedDelta],
        );

        pendingReview++;
        logger.info(
          { taskType, reason: isSensitive ? 'sensitive' : 'low-delta' },
          'Prompt candidate queued for human review',
        );
      }
    } catch (err) {
      // One failing task type must not abort the rest of the run.
      logger.error({ err, taskType }, 'Prompt optimizer failed for task type');
    }
  }

  const durationMs = Date.now() - startedAt;
  logger.info(
    { versionsCreated, autoApplied, pendingReview, durationMs },
    'Prompt optimizer job completed',
  );
}