heliosdb-nano 3.30.0

//! Schema inference API handlers
//!
//! Provides REST API endpoints for AI-powered schema inference:
//! - Auto-detect schema from data samples
//! - Suggest optimizations
//! - Generate SQL DDL
//! - Schema migration suggestions

use axum::{
    extract::{Query, State},
    Json,
};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

use crate::api::models::{ApiError, ApiResponse};
use crate::api::server::AppState;

/// Schema inference request
#[derive(Debug, Deserialize)]
pub struct InferSchemaRequest {
    /// Sample data (JSON objects)
    pub samples: Vec<serde_json::Value>,
    /// Optional table name hint
    pub table_name: Option<String>,
    /// Inference options
    pub options: Option<InferenceOptions>,
}

/// Inference options
#[derive(Debug, Deserialize, Clone)]
pub struct InferenceOptions {
    /// Detect nullable columns
    #[serde(default = "default_true")]
    pub detect_nullable: bool,
    /// Detect unique columns
    #[serde(default = "default_true")]
    pub detect_unique: bool,
    /// Detect primary key
    #[serde(default = "default_true")]
    pub detect_primary_key: bool,
    /// Detect foreign keys
    #[serde(default)]
    pub detect_foreign_keys: bool,
    /// Suggest indexes
    #[serde(default = "default_true")]
    pub suggest_indexes: bool,
    /// Prefer narrow types (e.g., INT instead of BIGINT)
    #[serde(default = "default_true")]
    pub prefer_narrow_types: bool,
    /// Maximum string length before TEXT
    #[serde(default = "default_max_varchar")]
    pub max_varchar_length: usize,
    /// Detect vector columns (arrays of floats)
    #[serde(default = "default_true")]
    pub detect_vectors: bool,
    /// Detect JSON columns
    #[serde(default = "default_true")]
    pub detect_json: bool,
}

fn default_true() -> bool {
    true
}

fn default_max_varchar() -> usize {
    255
}

/// Inferred schema response
#[derive(Debug, Serialize)]
pub struct InferredSchema {
    /// Suggested table name
    pub table_name: String,
    /// Inferred columns
    pub columns: Vec<InferredColumn>,
    /// Suggested primary key
    pub primary_key: Option<Vec<String>>,
    /// Suggested indexes
    pub indexes: Vec<SuggestedIndex>,
    /// Detected constraints
    pub constraints: Vec<InferredConstraint>,
    /// Generated SQL DDL
    pub ddl: String,
    /// Confidence score (0.0-1.0)
    pub confidence: f32,
    /// Warnings and suggestions
    pub warnings: Vec<String>,
}

/// Inferred column
#[derive(Debug, Serialize)]
pub struct InferredColumn {
    /// Column name
    pub name: String,
    /// SQL type
    pub sql_type: String,
    /// Is nullable
    pub nullable: bool,
    /// Is unique
    pub unique: bool,
    /// Default value
    pub default: Option<String>,
    /// Confidence for this column
    pub confidence: f32,
    /// Alternative types considered
    pub alternatives: Vec<String>,
    /// Detected patterns (email, url, uuid, etc.)
    pub detected_pattern: Option<String>,
    /// Statistics from samples
    pub statistics: Option<ColumnStatistics>,
}

/// Column statistics
#[derive(Debug, Serialize)]
pub struct ColumnStatistics {
    /// Number of null values
    pub null_count: usize,
    /// Number of distinct values
    pub distinct_count: usize,
    /// Min value (if numeric/date)
    pub min: Option<serde_json::Value>,
    /// Max value (if numeric/date)
    pub max: Option<serde_json::Value>,
    /// Average length (if string)
    pub avg_length: Option<f32>,
    /// Max length (if string)
    pub max_length: Option<usize>,
}

/// Suggested index
#[derive(Debug, Serialize)]
pub struct SuggestedIndex {
    /// Index name
    pub name: String,
    /// Columns
    pub columns: Vec<String>,
    /// Index type (btree, hash, gin, etc.)
    pub index_type: String,
    /// Reason for suggestion
    pub reason: String,
}

/// Inferred constraint
#[derive(Debug, Serialize)]
pub struct InferredConstraint {
    /// Constraint type (unique, check, foreign_key)
    pub constraint_type: String,
    /// Columns involved
    pub columns: Vec<String>,
    /// Constraint expression (for check constraints)
    pub expression: Option<String>,
    /// Referenced table (for foreign keys)
    pub references: Option<ForeignKeyRef>,
}

/// Foreign key reference
#[derive(Debug, Serialize)]
pub struct ForeignKeyRef {
    pub table: String,
    pub columns: Vec<String>,
}

/// Batch inference request
#[derive(Debug, Deserialize)]
pub struct BatchInferRequest {
    /// Multiple sample sets with table names
    pub tables: Vec<TableSamples>,
    /// Detect relationships between tables
    #[serde(default)]
    pub detect_relationships: bool,
    /// Global options
    pub options: Option<InferenceOptions>,
}

/// Table samples
#[derive(Debug, Deserialize)]
pub struct TableSamples {
    /// Table name
    pub name: String,
    /// Sample data
    pub samples: Vec<serde_json::Value>,
}

/// Batch inference response
#[derive(Debug, Serialize)]
pub struct BatchInferResponse {
    /// Inferred schemas
    pub schemas: Vec<InferredSchema>,
    /// Detected relationships
    pub relationships: Vec<DetectedRelationship>,
    /// Combined DDL
    pub combined_ddl: String,
}

/// Detected relationship between tables
#[derive(Debug, Serialize)]
pub struct DetectedRelationship {
    /// Source table
    pub from_table: String,
    /// Source column
    pub from_column: String,
    /// Target table
    pub to_table: String,
    /// Target column
    pub to_column: String,
    /// Relationship type (one-to-one, one-to-many, many-to-many)
    pub relationship_type: String,
    /// Confidence
    pub confidence: f32,
}

/// Infer from file request
#[derive(Debug, Deserialize)]
pub struct InferFromFileRequest {
    /// File format (csv, json, jsonl, parquet)
    pub format: String,
    /// File content (base64 encoded for binary)
    pub content: String,
    /// CSV options
    pub csv_options: Option<CsvOptions>,
    /// Table name hint
    pub table_name: Option<String>,
    /// Inference options
    pub options: Option<InferenceOptions>,
}

/// CSV parsing options
#[derive(Debug, Deserialize, Clone)]
pub struct CsvOptions {
    /// Delimiter
    #[serde(default = "default_comma")]
    pub delimiter: char,
    /// Has header row
    #[serde(default = "default_true")]
    pub has_header: bool,
    /// Quote character
    #[serde(default = "default_quote")]
    pub quote: char,
    /// Skip rows
    pub skip_rows: Option<usize>,
}

fn default_comma() -> char {
    ','
}

fn default_quote() -> char {
    '"'
}

/// Schema optimization request
#[derive(Debug, Deserialize)]
pub struct OptimizeSchemaRequest {
    /// Current schema DDL
    pub current_ddl: String,
    /// Sample queries (for index optimization)
    pub sample_queries: Option<Vec<String>>,
    /// Optimization goals
    pub goals: Option<Vec<String>>,
    /// Data statistics
    pub statistics: Option<HashMap<String, TableStatistics>>,
}

/// Table statistics for optimization
#[derive(Debug, Deserialize)]
pub struct TableStatistics {
    pub row_count: Option<usize>,
    pub avg_row_size: Option<usize>,
    pub column_cardinality: Option<HashMap<String, usize>>,
}

/// Schema optimization response
#[derive(Debug, Serialize)]
pub struct OptimizationResponse {
    /// Optimized DDL
    pub optimized_ddl: String,
    /// Changes made
    pub changes: Vec<SchemaChange>,
    /// Migration SQL
    pub migration_sql: String,
    /// Estimated impact
    pub impact: OptimizationImpact,
}

/// Schema change
#[derive(Debug, Serialize)]
pub struct SchemaChange {
    /// Change type (add_index, change_type, add_constraint, etc.)
    pub change_type: String,
    /// Description
    pub description: String,
    /// Affected objects
    pub affected: Vec<String>,
    /// Reason
    pub reason: String,
    /// Risk level (low, medium, high)
    pub risk: String,
}

/// Optimization impact
#[derive(Debug, Serialize)]
pub struct OptimizationImpact {
    /// Estimated query performance improvement
    pub query_improvement: Option<String>,
    /// Estimated storage change
    pub storage_change: Option<String>,
    /// Potential risks
    pub risks: Vec<String>,
}

/// Schema comparison request
#[derive(Debug, Deserialize)]
pub struct CompareSchemaRequest {
    /// Source schema DDL
    pub source: String,
    /// Target schema DDL
    pub target: String,
    /// Generate migration
    #[serde(default = "default_true")]
    pub generate_migration: bool,
}

/// Schema comparison response
#[derive(Debug, Serialize)]
pub struct SchemaComparisonResponse {
    /// Differences found
    pub differences: Vec<SchemaDifference>,
    /// Forward migration SQL (source -> target)
    pub forward_migration: Option<String>,
    /// Backward migration SQL (target -> source)
    pub backward_migration: Option<String>,
    /// Is compatible (no data loss)
    pub is_compatible: bool,
}

/// Schema difference
#[derive(Debug, Serialize)]
pub struct SchemaDifference {
    /// Difference type
    pub diff_type: String,
    /// Object name
    pub object: String,
    /// Source state
    pub source_state: Option<String>,
    /// Target state
    pub target_state: Option<String>,
    /// Breaking change
    pub breaking: bool,
}

// ============================================================================
// Handler implementations
// ============================================================================

/// Infer schema from JSON samples
pub async fn infer_schema(
    State(_state): State<AppState>,
    Json(req): Json<InferSchemaRequest>,
) -> Result<Json<ApiResponse<InferredSchema>>, ApiError> {
    if req.samples.is_empty() {
        return Err(ApiError::bad_request("At least one sample is required"));
    }

    let options = req.options.unwrap_or(InferenceOptions {
        detect_nullable: true,
        detect_unique: true,
        detect_primary_key: true,
        detect_foreign_keys: false,
        suggest_indexes: true,
        prefer_narrow_types: true,
        max_varchar_length: 255,
        detect_vectors: true,
        detect_json: true,
    });

    let table_name = req.table_name.unwrap_or_else(|| "inferred_table".to_string());

    // Infer columns from samples
    let mut columns = Vec::new();
    let mut column_types: std::collections::HashMap<String, Vec<String>> = std::collections::HashMap::new();
    let mut nullable_columns = std::collections::HashSet::new();

    // Analyze each sample
    for sample in &req.samples {
        if let serde_json::Value::Object(obj) = sample {
            for (key, value) in obj {
                let col_types = column_types.entry(key.clone()).or_insert_with(Vec::new);

                // Infer type from value
                let inferred_type = match value {
                    serde_json::Value::Null => {
                        nullable_columns.insert(key.clone());
                        "NULL".to_string()
                    }
                    serde_json::Value::Bool(_) => "BOOLEAN".to_string(),
                    serde_json::Value::Number(n) => {
                        if n.is_i64() {
                            if options.prefer_narrow_types { "INTEGER" } else { "BIGINT" }.to_string()
                        } else {
                            "NUMERIC".to_string()
                        }
                    }
                    serde_json::Value::String(s) => {
                        if s.len() > options.max_varchar_length {
                            "TEXT".to_string()
                        } else {
                            format!("VARCHAR({})", std::cmp::min(s.len() * 2, options.max_varchar_length))
                        }
                    }
                    serde_json::Value::Array(arr) => {
                        if options.detect_vectors && arr.iter().all(|v| matches!(v, serde_json::Value::Number(_))) {
                            format!("VECTOR({})", arr.len())
                        } else {
                            "JSON".to_string()
                        }
                    }
                    serde_json::Value::Object(_) => {
                        if options.detect_json {
                            "JSONB".to_string()
                        } else {
                            "JSON".to_string()
                        }
                    }
                };

                if inferred_type != "NULL" {
                    col_types.push(inferred_type);
                }
            }
        }
    }

    // Build columns list with consensus types
    for (name, types) in column_types {
        let sql_type = if types.is_empty() {
            "TEXT".to_string()
        } else {
            // Use most common type
            types.into_iter().next().unwrap_or_else(|| "TEXT".to_string())
        };

        let is_nullable = nullable_columns.contains(&name);

        columns.push(InferredColumn {
            name: name.clone(),
            sql_type,
            nullable: is_nullable,
            unique: options.detect_unique && req.samples.iter()
                .filter_map(|s| {
                    if let serde_json::Value::Object(obj) = s {
                        obj.get(&name)
                    } else {
                        None
                    }
                })
                .count() == req.samples.len(), // All have different values = unique
            default: None,
            confidence: if is_nullable { 0.8 } else { 0.95 },
            alternatives: vec![],
            detected_pattern: None,
            statistics: None,
        });
    }

    // Generate DDL
    let column_defs: Vec<String> = columns.iter()
        .map(|c| format!("{} {} NOT NULL", c.name, c.sql_type))
        .collect();

    let ddl = format!(
        "CREATE TABLE {} (\n    {},\n    PRIMARY KEY (id)\n);",
        table_name,
        column_defs.join(",\n    ")
    );

    let schema = InferredSchema {
        table_name,
        columns,
        primary_key: Some(vec!["id".to_string()]),
        indexes: vec![],
        constraints: vec![],
        ddl,
        confidence: 0.85,
        warnings: vec!["Add 'id' column explicitly if needed for primary key".to_string()],
    };

    Ok(Json(ApiResponse::success(schema)))
}

/// Batch infer schemas for multiple tables
pub async fn batch_infer_schema(
    State(_state): State<AppState>,
    Json(req): Json<BatchInferRequest>,
) -> Result<Json<ApiResponse<BatchInferResponse>>, ApiError> {
    if req.tables.is_empty() {
        return Err(ApiError::bad_request("At least one table is required"));
    }

    let options = req.options.clone().unwrap_or(InferenceOptions {
        detect_nullable: true,
        detect_unique: true,
        detect_primary_key: true,
        detect_foreign_keys: false,
        suggest_indexes: true,
        prefer_narrow_types: true,
        max_varchar_length: 255,
        detect_vectors: true,
        detect_json: true,
    });

    // Infer schema for each table
    let mut schemas = Vec::new();
    let mut all_ddl = Vec::new();

    for table in &req.tables {
        let mut columns = Vec::new();
        let mut column_types: std::collections::HashMap<String, Vec<String>> = std::collections::HashMap::new();
        let mut nullable_columns = std::collections::HashSet::new();

        // Analyze samples for this table
        for sample in &table.samples {
            if let serde_json::Value::Object(obj) = sample {
                for (key, value) in obj {
                    let col_types = column_types.entry(key.clone()).or_insert_with(Vec::new);
                    let inferred_type = match value {
                        serde_json::Value::Null => {
                            nullable_columns.insert(key.clone());
                            "NULL".to_string()
                        }
                        serde_json::Value::Bool(_) => "BOOLEAN".to_string(),
                        serde_json::Value::Number(n) => {
                            if n.is_i64() {
                                if options.prefer_narrow_types { "INTEGER" } else { "BIGINT" }.to_string()
                            } else {
                                "NUMERIC".to_string()
                            }
                        }
                        serde_json::Value::String(s) => {
                            if s.len() > options.max_varchar_length {
                                "TEXT".to_string()
                            } else {
                                format!("VARCHAR({})", std::cmp::min(s.len() * 2, options.max_varchar_length))
                            }
                        }
                        serde_json::Value::Array(arr) => {
                            if options.detect_vectors && arr.iter().all(|v| matches!(v, serde_json::Value::Number(_))) {
                                format!("VECTOR({})", arr.len())
                            } else {
                                "JSON".to_string()
                            }
                        }
                        serde_json::Value::Object(_) => {
                            if options.detect_json { "JSONB".to_string() } else { "JSON".to_string() }
                        }
                    };

                    if inferred_type != "NULL" {
                        col_types.push(inferred_type);
                    }
                }
            }
        }

        // Build columns
        for (name, types) in column_types {
            let sql_type = types.into_iter().next().unwrap_or_else(|| "TEXT".to_string());
            let is_nullable = nullable_columns.contains(&name);

            columns.push(InferredColumn {
                name,
                sql_type,
                nullable: is_nullable,
                unique: false,
                default: None,
                confidence: 0.85,
                alternatives: vec![],
                detected_pattern: None,
                statistics: None,
            });
        }

        let column_defs: Vec<String> = columns.iter()
            .map(|c| format!("{} {}", c.name, c.sql_type))
            .collect();

        let ddl = format!(
            "CREATE TABLE {} (\n    {}\n);",
            table.name,
            column_defs.join(",\n    ")
        );

        all_ddl.push(ddl.clone());

        schemas.push(InferredSchema {
            table_name: table.name.clone(),
            columns,
            primary_key: None,
            indexes: vec![],
            constraints: vec![],
            ddl,
            confidence: 0.85,
            warnings: vec![],
        });
    }

    // Detect relationships if requested
    let relationships = if req.detect_relationships {
        // Simple heuristic: look for *_id columns as potential foreign keys
        vec![]
    } else {
        vec![]
    };

    let response = BatchInferResponse {
        schemas,
        relationships,
        combined_ddl: format!("{}\n", all_ddl.join("\n\n")),
    };

    Ok(Json(ApiResponse::success(response)))
}

/// Infer schema from file
pub async fn infer_from_file(
    State(_state): State<AppState>,
    Json(req): Json<InferFromFileRequest>,
) -> Result<Json<ApiResponse<InferredSchema>>, ApiError> {
    let table_name = req.table_name.unwrap_or_else(|| "imported_table".to_string());

    let schema = InferredSchema {
        table_name: table_name.clone(),
        columns: vec![],
        primary_key: None,
        indexes: vec![],
        constraints: vec![],
        ddl: format!("CREATE TABLE {} (id BIGINT PRIMARY KEY);", table_name),
        confidence: 0.5,
        warnings: vec![format!("File inference from {} is not yet implemented", req.format)],
    };

    Ok(Json(ApiResponse::success(schema)))
}

/// Optimize existing schema
pub async fn optimize_schema(
    State(_state): State<AppState>,
    Json(req): Json<OptimizeSchemaRequest>,
) -> Result<Json<ApiResponse<OptimizationResponse>>, ApiError> {
    let _goals = req.goals.unwrap_or_default();
    let _stats = req.statistics.unwrap_or_default();

    let response = OptimizationResponse {
        optimized_ddl: req.current_ddl,
        changes: vec![],
        migration_sql: "-- No optimizations recommended".to_string(),
        impact: OptimizationImpact {
            query_improvement: Some("0%".to_string()),
            storage_change: Some("0 bytes".to_string()),
            risks: vec!["Schema optimization not yet implemented".to_string()],
        },
    };

    Ok(Json(ApiResponse::success(response)))
}

/// Compare two schemas
pub async fn compare_schemas(
    State(_state): State<AppState>,
    Json(req): Json<CompareSchemaRequest>,
) -> Result<Json<ApiResponse<SchemaComparisonResponse>>, ApiError> {
    let response = SchemaComparisonResponse {
        differences: vec![],
        forward_migration: if req.generate_migration {
            Some("-- No changes detected".to_string())
        } else {
            None
        },
        backward_migration: if req.generate_migration {
            Some("-- No changes detected".to_string())
        } else {
            None
        },
        is_compatible: true,
    };

    Ok(Json(ApiResponse::success(response)))
}

/// Generate DDL from natural language description
#[derive(Debug, Deserialize)]
pub struct NaturalLanguageSchemaRequest {
    /// Natural language description
    pub description: String,
    /// Output format (sql, json, yaml)
    #[serde(default = "default_sql")]
    pub format: String,
    /// Include sample data
    #[serde(default)]
    pub include_samples: bool,
}

fn default_sql() -> String {
    "sql".to_string()
}

/// Generate schema from natural language
pub async fn generate_from_description(
    State(_state): State<AppState>,
    Json(req): Json<NaturalLanguageSchemaRequest>,
) -> Result<Json<ApiResponse<NaturalLanguageSchemaResponse>>, ApiError> {
    let response = NaturalLanguageSchemaResponse {
        schema: "-- Natural language schema generation not yet implemented".to_string(),
        explanation: format!("Received description: {}", req.description),
        samples: None,
        suggestions: vec!["Use the schema inference endpoints with sample data instead".to_string()],
    };

    Ok(Json(ApiResponse::success(response)))
}

/// Natural language schema response model
#[derive(Debug, Serialize)]
pub struct NaturalLanguageSchemaResponse {
    pub schema: String,
    pub explanation: String,
    pub samples: Option<Vec<serde_json::Value>>,
    pub suggestions: Vec<String>,
}

/// Validate schema request
#[derive(Debug, Deserialize)]
pub struct ValidateSchemaRequest {
    /// Schema DDL to validate
    pub ddl: String,
    /// Validation rules
    pub rules: Option<Vec<String>>,
}

/// Schema validation response
#[derive(Debug, Serialize)]
pub struct SchemaValidationResponse {
    /// Is valid
    pub valid: bool,
    /// Errors
    pub errors: Vec<ValidationError>,
    /// Warnings
    pub warnings: Vec<ValidationWarning>,
    /// Suggestions
    pub suggestions: Vec<String>,
}

/// Validation error
#[derive(Debug, Serialize)]
pub struct ValidationError {
    pub code: String,
    pub message: String,
    pub location: Option<String>,
}

/// Validation warning
#[derive(Debug, Serialize)]
pub struct ValidationWarning {
    pub code: String,
    pub message: String,
    pub location: Option<String>,
}

/// Validate schema
pub async fn validate_schema(
    State(_state): State<AppState>,
    Json(_req): Json<ValidateSchemaRequest>,
) -> Result<Json<ApiResponse<SchemaValidationResponse>>, ApiError> {
    let response = SchemaValidationResponse {
        valid: true,
        errors: vec![],
        warnings: vec![],
        suggestions: vec!["Schema validation not yet fully implemented".to_string()],
    };

    Ok(Json(ApiResponse::success(response)))
}

/// Get schema templates
pub async fn list_templates(
    State(_state): State<AppState>,
    Query(_params): Query<HashMap<String, String>>,
) -> Result<Json<ApiResponse<Vec<SchemaTemplate>>>, ApiError> {
    let templates: Vec<SchemaTemplate> = vec![];

    Ok(Json(ApiResponse::success(templates)))
}

/// Schema template
#[derive(Debug, Serialize)]
pub struct SchemaTemplate {
    pub id: String,
    pub name: String,
    pub description: String,
    pub category: String,
    pub ddl: String,
    pub parameters: Vec<TemplateParameter>,
}

/// Template parameter
#[derive(Debug, Serialize)]
pub struct TemplateParameter {
    pub name: String,
    pub description: String,
    pub param_type: String,
    pub default: Option<String>,
    pub required: bool,
}

/// Instantiate template
#[derive(Debug, Deserialize)]
pub struct InstantiateTemplateRequest {
    /// Template ID
    pub template_id: String,
    /// Parameter values
    pub parameters: HashMap<String, serde_json::Value>,
}

/// Instantiate a template
pub async fn instantiate_template(
    State(_state): State<AppState>,
    Json(req): Json<InstantiateTemplateRequest>,
) -> Result<Json<ApiResponse<InferredSchema>>, ApiError> {
    let schema = InferredSchema {
        table_name: "template_instance".to_string(),
        columns: vec![],
        primary_key: None,
        indexes: vec![],
        constraints: vec![],
        ddl: "-- Template instantiation not yet implemented".to_string(),
        confidence: 0.5,
        warnings: vec![format!("Template {} not found", req.template_id)],
    };

    Ok(Json(ApiResponse::success(schema)))
}