#!/usr/bin/env bash
#
# create_fixture.sh - Generate a 100-chunk database fixture for MPEMBED testing
#
# This script extracts a representative sample of chunks from the production database
# (maproom-postgres) using stratified sampling to ensure diverse coverage of:
# - File types: TypeScript, Rust, Markdown
# - Chunk kinds: functions, classes, interfaces, modules, markdown sections
#
# The fixture preserves OpenAI embeddings (when they exist) and all FK relationships.
#
# Usage:
#   ./scripts/create_fixture.sh
#
# Output:
#   tests/fixtures/mpembed_baseline_100.sql
#
# Requirements:
#   - PostgreSQL client tools (psql, pg_dump)
#   - Access to maproom-postgres database
#   - Write permission to tests/fixtures/

set -euo pipefail

# Configuration
DB_URL="postgresql://maproom:maproom@maproom-postgres:5432/maproom"
OUTPUT_FILE="tests/fixtures/mpembed_baseline_100.sql"
TEMP_IDS_FILE=$(mktemp)
TEMP_FILE_IDS=$(mktemp)

# Target distribution (100 chunks total)
# Based on production data availability:
# - TypeScript: 221 available → sample 50 (22.6%)
# - Rust: 4,994 available → sample 30 (0.6%)
# - Markdown: 17,822 available → sample 20 (0.1%)
TS_COUNT=50
RUST_COUNT=30
MD_COUNT=20
TOTAL_COUNT=100

echo "============================================"
echo "Creating MPEMBED baseline fixture"
echo "============================================"
echo ""
echo "Target: $TOTAL_COUNT chunks"
echo "  - TypeScript: $TS_COUNT"
echo "  - Rust: $RUST_COUNT"
echo "  - Markdown: $MD_COUNT"
echo ""

# Step 1: Select representative chunks using stratified sampling
echo "Step 1: Selecting chunks with stratified sampling..."

psql "$DB_URL" -t -A -c "
WITH ts_sample AS (
  -- Sample TypeScript chunks with diverse kinds
  SELECT c.id, c.file_id
  FROM maproom.chunks c
  JOIN maproom.files f ON c.file_id = f.id
  WHERE f.language = 'ts'
    AND c.kind IN ('func', 'class', 'module', 'type', 'component', 'hook')
  ORDER BY random()
  LIMIT $TS_COUNT
),
rust_sample AS (
  -- Sample Rust chunks with diverse kinds
  SELECT c.id, c.file_id
  FROM maproom.chunks c
  JOIN maproom.files f ON c.file_id = f.id
  WHERE f.language = 'rs'
    AND c.kind IN ('func', 'module', 'struct', 'impl', 'use', 'trait', 'enum')
  ORDER BY random()
  LIMIT $RUST_COUNT
),
md_sample AS (
  -- Sample Markdown chunks with diverse kinds
  SELECT c.id, c.file_id
  FROM maproom.chunks c
  JOIN maproom.files f ON c.file_id = f.id
  WHERE f.language = 'md'
    AND c.kind IN ('markdown_section', 'heading_2', 'heading_3', 'code_block', 'heading_4')
  ORDER BY random()
  LIMIT $MD_COUNT
),
all_samples AS (
  SELECT id, file_id FROM ts_sample
  UNION ALL
  SELECT id, file_id FROM rust_sample
  UNION ALL
  SELECT id, file_id FROM md_sample
)
SELECT id FROM all_samples;
" > "$TEMP_IDS_FILE"

SELECTED_COUNT=$(wc -l < "$TEMP_IDS_FILE")
echo "  Selected $SELECTED_COUNT chunks"

if [ "$SELECTED_COUNT" -lt "$TOTAL_COUNT" ]; then
  echo "  WARNING: Only found $SELECTED_COUNT chunks (target was $TOTAL_COUNT)"
  echo "  This may indicate insufficient data in the database"
fi

# Step 2: Get unique file IDs for selected chunks
echo ""
echo "Step 2: Identifying required files..."

psql "$DB_URL" -t -A -c "
SELECT DISTINCT f.id
FROM maproom.files f
JOIN maproom.chunks c ON c.file_id = f.id
WHERE c.id IN ($(paste -sd, "$TEMP_IDS_FILE"))
ORDER BY f.id;
" > "$TEMP_FILE_IDS"

FILE_COUNT=$(wc -l < "$TEMP_FILE_IDS")
echo "  Found $FILE_COUNT unique files"

# Step 3: Export fixture
echo ""
echo "Step 3: Exporting fixture to $OUTPUT_FILE..."

# Create SQL header with metadata
cat > "$OUTPUT_FILE" << 'EOF'
-- MPEMBED Baseline Fixture (100 chunks)
-- Generated by: crates/maproom/scripts/create_fixture.sh
-- Purpose: Fast-loading test fixture for multi-provider embedding migration
--
-- Contains:
--   - 50 TypeScript chunks (functions, classes, modules)
--   - 30 Rust chunks (functions, modules, structs, impls, uses)
--   - 20 Markdown chunks (sections, headings, code blocks)
--
-- Preserves:
--   - OpenAI embeddings (code_embedding, text_embedding) when present
--   - All FK relationships (chunks → files → commits → repos/worktrees)
--
-- Usage:
--   psql $MAPROOM_DATABASE_URL < tests/fixtures/mpembed_baseline_100.sql

BEGIN;

-- Temporarily disable triggers for faster loading
SET session_replication_role = replica;

EOF

# Export related tables in dependency order using COPY format
# This ensures proper handling of special characters, newlines, etc.

echo "  Exporting repos..."
{
  echo "\COPY maproom.repos FROM stdin;"
  psql "$DB_URL" -c "
    COPY (
      SELECT r.*
      FROM maproom.repos r
      JOIN maproom.files f ON f.repo_id = r.id
      WHERE f.id IN ($(cat "$TEMP_FILE_IDS" | paste -sd,))
      GROUP BY r.id
      ORDER BY r.id
    ) TO STDOUT;
  "
  echo "\."
  echo ""
} >> "$OUTPUT_FILE"

echo "  Exporting worktrees..."
{
  echo "\COPY maproom.worktrees FROM stdin;"
  psql "$DB_URL" -c "
    COPY (
      SELECT w.*
      FROM maproom.worktrees w
      JOIN maproom.files f ON f.worktree_id = w.id
      WHERE f.id IN ($(cat "$TEMP_FILE_IDS" | paste -sd,))
      GROUP BY w.id
      ORDER BY w.id
    ) TO STDOUT;
  "
  echo "\."
  echo ""
} >> "$OUTPUT_FILE"

echo "  Exporting commits..."
{
  echo "\COPY maproom.commits FROM stdin;"
  psql "$DB_URL" -c "
    COPY (
      SELECT cm.*
      FROM maproom.commits cm
      JOIN maproom.files f ON f.commit_id = cm.id
      WHERE f.id IN ($(cat "$TEMP_FILE_IDS" | paste -sd,))
      GROUP BY cm.id
      ORDER BY cm.id
    ) TO STDOUT;
  "
  echo "\."
  echo ""
} >> "$OUTPUT_FILE"

echo "  Exporting files..."
{
  echo "\COPY maproom.files FROM stdin;"
  psql "$DB_URL" -c "
    COPY (
      SELECT f.*
      FROM maproom.files f
      WHERE f.id IN ($(cat "$TEMP_FILE_IDS" | paste -sd,))
      ORDER BY f.id
    ) TO STDOUT;
  "
  echo "\."
  echo ""
} >> "$OUTPUT_FILE"

echo "  Exporting chunks..."
{
  echo "\COPY maproom.chunks FROM stdin;"
  psql "$DB_URL" -c "
    COPY (
      SELECT c.*
      FROM maproom.chunks c
      WHERE c.id IN ($(cat "$TEMP_IDS_FILE" | paste -sd,))
      ORDER BY c.id
    ) TO STDOUT;
  "
  echo "\."
  echo ""
} >> "$OUTPUT_FILE"

# Re-enable triggers and commit
cat >> "$OUTPUT_FILE" << 'EOF'
-- Re-enable triggers
SET session_replication_role = DEFAULT;

-- Update sequences to avoid conflicts
SELECT setval('maproom.repos_id_seq', (SELECT MAX(id) FROM maproom.repos));
SELECT setval('maproom.worktrees_id_seq', (SELECT MAX(id) FROM maproom.worktrees));
SELECT setval('maproom.commits_id_seq', (SELECT MAX(id) FROM maproom.commits));
SELECT setval('maproom.files_id_seq', (SELECT MAX(id) FROM maproom.files));
SELECT setval('maproom.chunks_id_seq', (SELECT MAX(id) FROM maproom.chunks));

COMMIT;

-- Verification queries
\echo ''
\echo '=== Fixture Statistics ==='
\echo ''

SELECT
  f.language,
  COUNT(*) as chunk_count
FROM maproom.chunks c
JOIN maproom.files f ON c.file_id = f.id
GROUP BY f.language
ORDER BY chunk_count DESC;

\echo ''

SELECT
  c.kind::text,
  COUNT(*) as count
FROM maproom.chunks c
GROUP BY c.kind
ORDER BY count DESC
LIMIT 10;

\echo ''

SELECT
  COUNT(*) as total_chunks,
  COUNT(CASE WHEN code_embedding IS NOT NULL THEN 1 END) as with_code_emb,
  COUNT(CASE WHEN text_embedding IS NOT NULL THEN 1 END) as with_text_emb
FROM maproom.chunks;

\echo ''
\echo 'Fixture loaded successfully!'
EOF

# Cleanup
rm -f "$TEMP_IDS_FILE" "$TEMP_FILE_IDS"

# Get file size
FILE_SIZE=$(du -h "$OUTPUT_FILE" | cut -f1)

echo ""
echo "============================================"
echo "Fixture created successfully!"
echo "============================================"
echo ""
echo "Location: $OUTPUT_FILE"
echo "Size: $FILE_SIZE"
echo ""
echo "Verification:"
echo "  - Chunks exported: $SELECTED_COUNT"
echo "  - Files required: $FILE_COUNT"
echo ""
echo "To load fixture:"
echo "  psql \$MAPROOM_DATABASE_URL < $OUTPUT_FILE"
echo ""