#!/usr/bin/env python3
"""Load blog-training-alpaca.jsonl into PostgreSQL learning_corpus table via psql."""

import json
import math
import os
import subprocess
import sys
import tempfile
from pathlib import Path
from urllib.parse import unquote, urlsplit

JSONL_FILE = Path(__file__).parent.parent / "data" / "blog-training-alpaca.jsonl"
# NOTE(review): the connection string (including the password) is hard-coded;
# consider sourcing it from the environment or a secrets store.
DB_URL = "postgresql://llm:llm_secure_2026@127.0.0.1:15432/llm_gateway"
TASK_TYPE = "tip_blog"
PSQL_BIN = "/opt/homebrew/bin/psql"
DEFAULT_QUALITY_SCORE = 8.0
DEFAULT_PG_PORT = "5432"


def parse_db_url(url):
    """Split a ``postgresql://`` URL into its connection components.

    Args:
        url: Connection URL of the form
            ``postgresql://user:password@host:port/database``.

    Returns:
        A ``(user, password, host, port, database)`` tuple of strings.
        User and password are percent-decoded; the port falls back to
        ``DEFAULT_PG_PORT`` when the URL does not carry one.
    """
    parts = urlsplit(url)
    port = str(parts.port) if parts.port is not None else DEFAULT_PG_PORT
    return (
        unquote(parts.username or ""),
        unquote(parts.password or ""),
        parts.hostname or "",
        port,
        parts.path.lstrip("/"),
    )


def sql_literal(text):
    """Return *text* as a single-quoted SQL string literal.

    Embedded single quotes are doubled, which is the escaping the SQL
    standard defines for string literals.
    """
    return "'" + text.replace("'", "''") + "'"


def coerce_quality_score(value):
    """Validate *value* as a finite number and return it as a ``float``.

    The score is written into the SQL text unquoted, so anything that is
    not a plain finite number must be rejected rather than interpolated.

    Raises:
        ValueError: If *value* is a bool, is not convertible to ``float``,
            or is NaN/infinite.
    """
    if isinstance(value, bool):
        raise ValueError(f"quality_score must be numeric, got {value!r}")
    try:
        score = float(value)
    except (TypeError, ValueError) as err:
        raise ValueError(f"quality_score must be numeric, got {value!r}") from err
    if not math.isfinite(score):
        raise ValueError(f"quality_score must be finite, got {value!r}")
    return score


def load_samples(path):
    """Read a JSONL file and return one parsed object per non-blank line.

    Raises:
        FileNotFoundError: If *path* does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def build_sql_statements(samples, task_type=TASK_TYPE):
    """Build the SQL script that inserts *samples* inside one transaction.

    Each sample is a dict with optional keys ``instruction``, ``input``,
    ``output``, ``quality_score`` and ``source``. The script also selects
    the row count for *task_type* before and after the inserts.

    Args:
        samples: Iterable of sample dicts.
        task_type: Value stored in the ``task_type`` column and in ``tags``.

    Returns:
        A list of SQL statements, starting with ``BEGIN;`` and ending with
        ``COMMIT;``.

    Raises:
        ValueError: If a sample's ``quality_score`` is not a finite number.
    """
    task_lit = sql_literal(task_type)
    statements = [
        "BEGIN;",
        f"SELECT COUNT(*) as before_count FROM learning_corpus WHERE task_type = {task_lit};",
    ]

    for sample in samples:
        prompt_text = sample.get("instruction", "")
        if sample.get("input"):
            prompt_text += f"\n{sample['input']}"
        completion_text = sample.get("output", "")
        quality_score = coerce_quality_score(
            sample.get("quality_score", DEFAULT_QUALITY_SCORE)
        )
        source = sample.get("source", "unknown")
        tags = f"ARRAY[{sql_literal(source)}, {task_lit}]"
        statements.append(
            "INSERT INTO learning_corpus "
            "(task_type, prompt_text, completion_text, quality_score, tags) "
            f"VALUES ({task_lit}, {sql_literal(prompt_text)}, "
            f"{sql_literal(completion_text)}, {quality_score}, {tags});"
        )

    statements.append(
        f"SELECT COUNT(*) as after_count FROM learning_corpus WHERE task_type = {task_lit};"
    )
    statements.append("COMMIT;")
    return statements


def run_psql(sql_text, user, password, host, port, database):
    """Execute *sql_text* with psql and return the ``CompletedProcess``.

    The SQL is written to a private temporary file that is always removed
    afterwards. psql is run with ``ON_ERROR_STOP=1`` so that a failing
    statement yields a non-zero exit status instead of being ignored.
    """
    # Extend the parent environment rather than replacing it, so psql still
    # sees HOME, locale settings, etc.
    env = {**os.environ, "PGPASSWORD": password}

    # NamedTemporaryFile gives an unpredictable, owner-only path.
    with tempfile.NamedTemporaryFile(
        "w", suffix=".sql", encoding="utf-8", delete=False
    ) as tmp:
        tmp_path = Path(tmp.name)
        try:
            tmp.write(sql_text)
        except BaseException:
            tmp_path.unlink(missing_ok=True)
            raise

    try:
        return subprocess.run(
            [
                PSQL_BIN,
                "-v",
                "ON_ERROR_STOP=1",
                "-h",
                host,
                "-p",
                port,
                "-U",
                user,
                "-d",
                database,
                "-f",
                str(tmp_path),
            ],
            capture_output=True,
            text=True,
            env=env,
        )
    finally:
        tmp_path.unlink(missing_ok=True)


def main():
    """Load the JSONL samples into ``learning_corpus``; return an exit code."""
    db_user, db_pass, host, port, db_name = parse_db_url(DB_URL)

    print("šŸ”„ Loading blog training data into PostgreSQL...")
    print(f" File: {JSONL_FILE}")
    print(f" Host: {host}:{port}")
    print(f" Database: {db_name}")
    print()

    try:
        samples = load_samples(JSONL_FILE)
    except FileNotFoundError:
        print(f"āŒ File not found: {JSONL_FILE}")
        return 1

    sql_text = "\n".join(build_sql_statements(samples))
    result = run_psql(sql_text, db_user, db_pass, host, port, db_name)

    if result.returncode != 0:
        print("āŒ Error loading data:")
        print(result.stderr)
        return 1

    print(result.stdout)
    print(f"\nāœ… Loaded {len(samples)} samples into learning_corpus (task_type={TASK_TYPE})")
    print("šŸŽÆ Next: python3 scripts/manual_trigger.py --general --force")
    return 0


if __name__ == "__main__":
    sys.exit(main())