rhei-core 1.5.0

//! Table-schema registry and identifier-validation utilities.
//!
//! Every table that participates in CDC replication must be registered with a
//! [`SchemaRegistry`] before the engine starts. The registry stores the Arrow
//! schema and primary-key columns for each table, validates all identifiers for
//! SQL-injection safety, and can be persisted to disk so registrations survive
//! process restarts.
//!
//! # Typical usage
//!
//! 1. Create a [`SchemaRegistry`] (or load one with [`SchemaRegistry::load_from_disk`]).
//! 2. For each replicated table, call [`SchemaRegistry::register`] with a
//!    validated [`TableSchema`].
//! 3. Pass the registry (cheaply cloned via its inner `Arc`) to the
//!    `CdcSyncEngine` and `OlapBackend`.
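//! 4. When schema evolution occurs, call [`SchemaRegistry::add_column`] or
//!    [`SchemaRegistry::drop_column`] to update the registered schema. These
//!    methods change the registry only; the matching DDL must also be applied
//!    to the OLAP engine so that the two stay in sync.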

use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::{Arc, RwLock};

use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit};
use serde::{Deserialize, Serialize};

use crate::error::CoreError;

/// Check that `name` is safe to splice into generated SQL as an identifier.
///
/// An identifier is accepted only when it is non-empty and every byte is an
/// ASCII letter, an ASCII digit or an underscore (`[A-Za-z0-9_]`). Anything
/// else — whitespace, quotes, punctuation, non-ASCII text — is rejected, which
/// rules out SQL injection through identifier interpolation in DDL/DML.
///
/// Within this module it is used by [`TableSchema::validate`] and
/// [`SchemaRegistry::add_column`]; the crate's design notes say the CDC
/// converter runs it again as a defense-in-depth measure.
///
/// # Errors
///
/// Returns [`CoreError::SchemaValidation`] when `name` is empty, or when it
/// contains any byte outside `[A-Za-z0-9_]`.
pub fn validate_identifier(name: &str) -> Result<(), CoreError> {
    // Byte-level test is sufficient: every byte of a multi-byte UTF-8 sequence
    // is >= 0x80 and therefore never matches one of the allowed ASCII ranges.
    let is_safe_byte = |b: u8| matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_');

    match name {
        "" => Err(CoreError::SchemaValidation(
            "identifier must not be empty".to_string(),
        )),
        _ if name.bytes().any(|b| !is_safe_byte(b)) => Err(CoreError::SchemaValidation(format!(
            "identifier '{}' contains invalid characters (only [A-Za-z0-9_] allowed)",
            name
        ))),
        _ => Ok(()),
    }
}

/// Schema definition for a table tracked by the HTAP engine.
///
/// Bundles the table name, its Arrow column schema and the list of
/// primary-key column names. The struct performs no checking on its own:
/// [`TableSchema::validate`] enforces the identifier and primary-key rules,
/// and [`SchemaRegistry::register`] runs that validation before storing the
/// schema.
#[derive(Debug, Clone)]
pub struct TableSchema {
    /// Table name (must match the OLTP table name).
    ///
    /// Also used as the lookup key inside [`SchemaRegistry`].
    pub name: String,
    /// Arrow schema describing the column types.
    pub arrow_schema: SchemaRef,
    /// Column names that form the primary key (required for UPDATE/DELETE propagation).
    ///
    /// Must be non-empty and every entry must name a field of `arrow_schema`
    /// for [`TableSchema::validate`] to pass.
    pub primary_key: Vec<String>,
}

impl TableSchema {
    /// Assemble a `TableSchema` from its parts without checking them.
    ///
    /// * `name` — table name; expected to be a valid SQL identifier (see
    ///   [`validate_identifier`]).
    /// * `schema` — Arrow schema describing the table's columns.
    /// * `primary_key` — names of the columns that uniquely identify a row;
    ///   expected to be non-empty and to refer only to columns in `schema`.
    ///
    /// None of these expectations are enforced here. Either call
    /// [`TableSchema::validate`] yourself, or hand the value to
    /// [`SchemaRegistry::register`], which validates before inserting.
    pub fn new(name: impl Into<String>, schema: SchemaRef, primary_key: Vec<String>) -> Self {
        let name = name.into();
        Self {
            name,
            arrow_schema: schema,
            primary_key,
        }
    }

    /// Check that this schema is safe and complete enough to be registered.
    ///
    /// The checks run in this order and stop at the first failure:
    /// 1. The table name is a safe SQL identifier (`[A-Za-z0-9_]`).
    /// 2. Every Arrow field name is a safe SQL identifier.
    /// 3. The primary key lists at least one column.
    /// 4. Every primary-key column is a safe identifier and is present in the
    ///    Arrow schema.
    ///
    /// # Errors
    ///
    /// Returns [`CoreError::SchemaValidation`] describing the first check that
    /// failed.
    pub fn validate(&self) -> Result<(), CoreError> {
        let table = self.name.as_str();
        validate_identifier(table)?;

        // Column names: report the first field whose name is not a safe
        // identifier, using a message that also names the table.
        let offending = self
            .arrow_schema
            .fields()
            .iter()
            .find(|col| validate_identifier(col.name()).is_err());
        if let Some(col) = offending {
            return Err(CoreError::SchemaValidation(format!(
                "column name '{}' in table '{}' contains invalid characters",
                col.name(),
                table
            )));
        }

        // Primary key: must exist, and every member must be a known column.
        if self.primary_key.is_empty() {
            return Err(CoreError::SchemaValidation(format!(
                "table '{}' must have at least one primary key column",
                table
            )));
        }
        self.primary_key.iter().try_for_each(|pk_col| {
            validate_identifier(pk_col)?;
            match self.arrow_schema.field_with_name(pk_col) {
                Ok(_) => Ok(()),
                Err(_) => Err(CoreError::SchemaValidation(format!(
                    "primary key column '{}' not found in schema for table '{}'",
                    pk_col, table
                ))),
            }
        })
    }
}

/// Thread-safe registry of table schemas.
///
/// All tables that should be replicated from OLTP to OLAP must be registered here
/// with their Arrow schema and primary key definition.
///
/// The table map lives behind an `Arc<RwLock<..>>`, so cloning a
/// `SchemaRegistry` produces another handle to the *same* map rather than a
/// copy of it. Each stored schema is itself an `Arc<TableSchema>`, so lookups
/// hand out reference-counted pointers instead of copying schemas.
#[derive(Debug, Clone)]
pub struct SchemaRegistry {
    // Table name -> registered schema; shared between all clones of the registry.
    tables: Arc<RwLock<HashMap<String, Arc<TableSchema>>>>,
}

impl SchemaRegistry {
    /// Build a registry with no tables registered.
    pub fn new() -> Self {
        let tables = Arc::new(RwLock::new(HashMap::new()));
        Self { tables }
    }

    /// Add a new table to the registry.
    ///
    /// The schema is checked with [`TableSchema::validate`] first and is then
    /// stored behind an `Arc`, so later [`SchemaRegistry::get`] calls only bump
    /// a reference count.
    ///
    /// # Errors
    ///
    /// Returns [`CoreError::SchemaValidation`] if validation fails, or
    /// [`CoreError::TableAlreadyRegistered`] if a table with the same name is
    /// already present (the existing entry is left untouched).
    ///
    /// # Panics
    ///
    /// Panics if the internal lock has been poisoned.
    pub fn register(&self, schema: TableSchema) -> Result<(), CoreError> {
        schema.validate()?;
        let mut guard = self.tables.write().unwrap();
        if let Entry::Vacant(slot) = guard.entry(schema.name.clone()) {
            slot.insert(Arc::new(schema));
            return Ok(());
        }
        Err(CoreError::TableAlreadyRegistered(schema.name))
    }

    /// Fetch the schema registered under `table_name`.
    ///
    /// The returned value is a clone of the stored `Arc`, not a copy of the
    /// schema itself.
    ///
    /// # Errors
    ///
    /// Returns [`CoreError::TableNotFound`] if no table with that name is
    /// registered.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock has been poisoned.
    pub fn get(&self, table_name: &str) -> Result<Arc<TableSchema>, CoreError> {
        let guard = self.tables.read().unwrap();
        match guard.get(table_name) {
            Some(found) => Ok(Arc::clone(found)),
            None => Err(CoreError::TableNotFound(table_name.to_string())),
        }
    }

    /// Return the names of all registered tables.
    ///
    /// The names come straight from the underlying `HashMap`, so their order
    /// is unspecified.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock has been poisoned.
    pub fn table_names(&self) -> Vec<String> {
        let guard = self.tables.read().unwrap();
        guard.keys().map(|name| name.to_owned()).collect()
    }

    /// Delete a table from the registry, handing back the schema it had.
    ///
    /// # Errors
    ///
    /// Returns [`CoreError::TableNotFound`] if `table_name` is not registered.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock has been poisoned.
    pub fn unregister(&self, table_name: &str) -> Result<Arc<TableSchema>, CoreError> {
        let removed = self.tables.write().unwrap().remove(table_name);
        removed.ok_or_else(|| CoreError::TableNotFound(table_name.to_string()))
    }

    /// Swap in a new schema for a table that is already registered (used
    /// during schema evolution).
    ///
    /// `schema` is validated before the swap. Only the registry is changed:
    /// no DDL is applied to any engine, so callers must also invoke the
    /// corresponding `OlapEngine::add_column` / `OlapEngine::drop_column`
    /// methods.
    ///
    /// # Errors
    ///
    /// Returns [`CoreError::SchemaValidation`] if the new schema is invalid,
    /// or [`CoreError::TableNotFound`] if no table named `schema.name` is
    /// registered (nothing is inserted in that case).
    ///
    /// # Panics
    ///
    /// Panics if the internal lock has been poisoned.
    pub fn update(&self, schema: TableSchema) -> Result<(), CoreError> {
        schema.validate()?;
        let mut guard = self.tables.write().unwrap();
        match guard.get_mut(schema.name.as_str()) {
            Some(slot) => {
                *slot = Arc::new(schema);
                Ok(())
            }
            None => Err(CoreError::TableNotFound(schema.name)),
        }
    }

    /// Append a nullable column to a registered table's schema.
    ///
    /// The new field goes at the end of the field list and is always marked
    /// nullable, so rows written before the column existed remain valid.
    /// Only the registry entry is replaced; previously handed-out
    /// `Arc<TableSchema>` values keep describing the old schema.
    ///
    /// Returns the `Arc` of the updated schema.
    ///
    /// # Errors
    ///
    /// Returns [`CoreError::SchemaValidation`] if `column_name` is not a safe
    /// identifier or already exists in the table, and
    /// [`CoreError::TableNotFound`] if the table is not registered.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock has been poisoned.
    pub fn add_column(
        &self,
        table_name: &str,
        column_name: &str,
        data_type: DataType,
    ) -> Result<Arc<TableSchema>, CoreError> {
        validate_identifier(column_name)?;

        // The write lock is held for the whole read-modify-write so that no
        // other writer can slip in between the checks and the commit.
        let mut guard = self.tables.write().unwrap();
        let current = match guard.get(table_name) {
            Some(schema) => schema,
            None => return Err(CoreError::TableNotFound(table_name.to_string())),
        };

        let already_present = current.arrow_schema.field_with_name(column_name).is_ok();
        if already_present {
            return Err(CoreError::SchemaValidation(format!(
                "column '{}' already exists in table '{}'",
                column_name, table_name
            )));
        }

        // Copy the existing fields and tack the new (nullable) one on the end.
        let mut widened: Vec<Field> = current
            .arrow_schema
            .fields()
            .iter()
            .map(|f| f.as_ref().clone())
            .collect();
        widened.push(Field::new(column_name, data_type, true));
        let key_columns = current.primary_key.clone();

        Ok(commit_schema(&mut guard, table_name, widened, key_columns))
    }

    /// Remove a column from a registered table's schema.
    ///
    /// Returns the `Arc` of the updated schema. Note that the SQLite OLTP
    /// engine requires CDC triggers to be torn down **before** issuing
    /// `DROP COLUMN` (SQLite rejects `DROP COLUMN` while triggers reference
    /// the column).
    ///
    /// # Errors
    ///
    /// Returns [`CoreError::TableNotFound`] if the table is not registered,
    /// and [`CoreError::SchemaValidation`] if `column_name` is part of the
    /// primary key or does not exist in the schema. The primary-key check
    /// runs first.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock has been poisoned.
    pub fn drop_column(
        &self,
        table_name: &str,
        column_name: &str,
    ) -> Result<Arc<TableSchema>, CoreError> {
        let mut guard = self.tables.write().unwrap();
        let current = match guard.get(table_name) {
            Some(schema) => schema,
            None => return Err(CoreError::TableNotFound(table_name.to_string())),
        };

        // Primary-key columns are needed for UPDATE/DELETE propagation and
        // may never be dropped.
        let is_key_column = current.primary_key.iter().any(|pk| pk == column_name);
        if is_key_column {
            return Err(CoreError::SchemaValidation(format!(
                "cannot drop primary key column '{}' from table '{}'",
                column_name, table_name
            )));
        }

        let is_known_column = current.arrow_schema.field_with_name(column_name).is_ok();
        if !is_known_column {
            return Err(CoreError::SchemaValidation(format!(
                "column '{}' not found in table '{}'",
                column_name, table_name
            )));
        }

        // Keep every field except the one being dropped.
        let remaining: Vec<Field> = current
            .arrow_schema
            .fields()
            .iter()
            .filter_map(|f| (f.name() != column_name).then(|| f.as_ref().clone()))
            .collect();
        let key_columns = current.primary_key.clone();

        Ok(commit_schema(&mut guard, table_name, remaining, key_columns))
    }
}

/// Build a new `TableSchema` from `fields` + `primary_key`, insert it into `tables`,
/// and return the `Arc`. Extracted to deduplicate the identical tail of `add_column`
/// and `drop_column`.
fn commit_schema(
    tables: &mut HashMap<String, Arc<TableSchema>>,
    table_name: &str,
    fields: Vec<Field>,
    primary_key: Vec<String>,
) -> Arc<TableSchema> {
    let schema = Arc::new(TableSchema {
        name: table_name.to_string(),
        arrow_schema: Arc::new(Schema::new(fields)),
        primary_key,
    });
    tables.insert(table_name.to_string(), schema.clone());
    schema
}

impl Default for SchemaRegistry {
    fn default() -> Self {
        Self::new()
    }
}

// ---------------------------------------------------------------------------
// Schema persistence helpers
// ---------------------------------------------------------------------------

/// Serializable representation of a `TableSchema` for on-disk storage.
///
/// Arrow types are encoded as human-readable strings. A `(name, type_string,
/// nullable)` tuple is sufficient to reconstruct the Arrow `Field`.
///
/// Only what is listed here is stored: the table name, the primary-key column
/// names, and each field's name, type string and nullability. Nothing else
/// from the Arrow schema is part of this representation.
#[derive(Debug, Serialize, Deserialize)]
struct PersistedSchema {
    // Table name, copied from `TableSchema::name`.
    name: String,
    // Primary-key column names, copied from `TableSchema::primary_key`.
    primary_key: Vec<String>,
    /// Each entry is `(column_name, arrow_type_string, nullable)`.
    ///
    /// The type string is produced by `arrow_type_to_str` and decoded by
    /// `arrow_type_from_str`.
    fields: Vec<(String, String, bool)>,
}

/// Render an Arrow `DataType` as the portable string used in the persisted
/// registry file.
///
/// Supported types are the integer, float, boolean, string, binary, date and
/// null types listed below, plus timestamps. Timestamps are written as
/// `timestamp_<unit>[<tz>]`, where `<unit>` is `s`, `ms`, `us` or `ns` and
/// `<tz>` is the timezone string (empty when the timestamp has none).
///
/// Any other type (List, Struct, …) is not expected in a registered table
/// schema. It is rendered as `unknown:<Debug output>`, a string that
/// `arrow_type_from_str` deliberately rejects with a clear error.
fn arrow_type_to_str(dt: &DataType) -> String {
    let fixed_name = match dt {
        // Parameterised: unit and optional timezone are part of the string.
        DataType::Timestamp(unit, tz) => {
            let unit_tag = match unit {
                TimeUnit::Second => "s",
                TimeUnit::Millisecond => "ms",
                TimeUnit::Microsecond => "us",
                TimeUnit::Nanosecond => "ns",
            };
            let zone = tz.as_deref().unwrap_or("");
            return format!("timestamp_{unit_tag}[{zone}]");
        }

        DataType::Null => "null",
        DataType::Boolean => "boolean",

        DataType::Int8 => "int8",
        DataType::Int16 => "int16",
        DataType::Int32 => "int32",
        DataType::Int64 => "int64",
        DataType::UInt8 => "uint8",
        DataType::UInt16 => "uint16",
        DataType::UInt32 => "uint32",
        DataType::UInt64 => "uint64",

        DataType::Float16 => "float16",
        DataType::Float32 => "float32",
        DataType::Float64 => "float64",

        DataType::Utf8 => "utf8",
        DataType::LargeUtf8 => "large_utf8",
        DataType::Binary => "binary",
        DataType::LargeBinary => "large_binary",

        DataType::Date32 => "date32",
        DataType::Date64 => "date64",

        // Unsupported: emit a marker the decoder will refuse.
        other => return format!("unknown:{other:?}"),
    };
    fixed_name.to_string()
}

/// Parse a type string written by [`arrow_type_to_str`] back into an Arrow
/// `DataType`.
///
/// Plain type names (`int64`, `utf8`, …) map one-to-one. Timestamps use the
/// form `timestamp_<unit>[<tz>]`; an empty `<tz>` — or no bracket section at
/// all — yields a timestamp without a timezone.
///
/// # Errors
///
/// Returns [`CoreError::SchemaValidation`] when:
/// * the string is not a known type name and does not start with
///   `timestamp_` (this includes the `unknown:…` fallback strings),
/// * a timestamp string has an opening `[` but does not end with `]`, or
/// * the timestamp unit is not one of `s`, `ms`, `us`, `ns`.
fn arrow_type_from_str(s: &str) -> Result<DataType, CoreError> {
    // Plain names first; anything else must be a `timestamp_…` string. No
    // plain name starts with `timestamp_`, so the order of the two checks
    // does not affect the result.
    let rest = match s {
        "null" => return Ok(DataType::Null),
        "boolean" => return Ok(DataType::Boolean),

        "int8" => return Ok(DataType::Int8),
        "int16" => return Ok(DataType::Int16),
        "int32" => return Ok(DataType::Int32),
        "int64" => return Ok(DataType::Int64),
        "uint8" => return Ok(DataType::UInt8),
        "uint16" => return Ok(DataType::UInt16),
        "uint32" => return Ok(DataType::UInt32),
        "uint64" => return Ok(DataType::UInt64),

        "float16" => return Ok(DataType::Float16),
        "float32" => return Ok(DataType::Float32),
        "float64" => return Ok(DataType::Float64),

        "utf8" => return Ok(DataType::Utf8),
        "large_utf8" => return Ok(DataType::LargeUtf8),
        "binary" => return Ok(DataType::Binary),
        "large_binary" => return Ok(DataType::LargeBinary),

        "date32" => return Ok(DataType::Date32),
        "date64" => return Ok(DataType::Date64),

        other => match other.strip_prefix("timestamp_") {
            Some(rest) => rest,
            None => {
                return Err(CoreError::SchemaValidation(format!(
                    "cannot deserialize unknown Arrow type string '{other}'"
                )))
            }
        },
    };

    // `rest` is now e.g. `us[UTC]`, `s[]` or just `ms`. Split at the first
    // `[`; when a bracket section is present it must be closed by a `]` that
    // is the very last character.
    let (unit_str, tz_part): (&str, Option<Arc<str>>) = match rest.split_once('[') {
        None => (rest, None),
        Some((unit, bracketed)) => match bracketed.strip_suffix(']') {
            None => {
                return Err(CoreError::SchemaValidation(format!(
                    "malformed timestamp type string '{s}': missing closing ']'"
                )))
            }
            // Empty brackets mean "no timezone".
            Some("") => (unit, None),
            Some(zone) => (unit, Some(Arc::from(zone))),
        },
    };

    let unit = match unit_str {
        "s" => TimeUnit::Second,
        "ms" => TimeUnit::Millisecond,
        "us" => TimeUnit::Microsecond,
        "ns" => TimeUnit::Nanosecond,
        other => {
            return Err(CoreError::SchemaValidation(format!(
                "unknown timestamp unit '{other}'"
            )))
        }
    };

    Ok(DataType::Timestamp(unit, tz_part))
}

impl SchemaRegistry {
    /// Persist the current registry contents to a JSON file at `path`.
    ///
    /// The registry is snapshotted under the read lock and the lock is
    /// released before any serialization or disk I/O, so concurrent writers
    /// are never blocked on the filesystem. Tables are written sorted by name,
    /// which makes the file content stable across saves of the same registry.
    ///
    /// The file is written atomically: the JSON goes to `<path>.tmp`, is
    /// flushed to disk, and is then renamed over `path`, so a crash mid-write
    /// cannot leave a truncated or corrupt file at `path`. If any step fails
    /// the temp file is removed on a best-effort basis.
    ///
    /// Only each field's name, type and nullability are stored. A column whose
    /// type `arrow_type_to_str` does not support is written with an
    /// `unknown:…` type string, which [`SchemaRegistry::load_from_disk`] will
    /// later reject.
    ///
    /// # Errors
    ///
    /// Returns [`CoreError::SchemaValidation`] if serialization fails or the
    /// file cannot be written, synced or renamed (e.g., permission denied,
    /// disk full).
    ///
    /// # Panics
    ///
    /// Panics if the internal lock has been poisoned.
    pub fn save_to_disk(&self, path: &str) -> Result<(), CoreError> {
        // Take the snapshot, then drop the guard explicitly so the lock is not
        // held across serialization and file I/O.
        let tables = self.tables.read().unwrap();
        let mut persisted: Vec<PersistedSchema> = tables
            .values()
            .map(|ts| PersistedSchema {
                name: ts.name.clone(),
                primary_key: ts.primary_key.clone(),
                fields: ts
                    .arrow_schema
                    .fields()
                    .iter()
                    .map(|f| {
                        (
                            f.name().clone(),
                            arrow_type_to_str(f.data_type()),
                            f.is_nullable(),
                        )
                    })
                    .collect(),
            })
            .collect();
        drop(tables);

        // `HashMap` iteration order is arbitrary; sort so that saving the same
        // registry twice yields the same file.
        persisted.sort_by(|a, b| a.name.cmp(&b.name));

        let json = serde_json::to_string_pretty(&persisted).map_err(|e| {
            CoreError::SchemaValidation(format!("failed to serialize schema registry: {e}"))
        })?;

        // Atomic write: write to a temp file, flush it to disk, then rename.
        let tmp_path = format!("{path}.tmp");
        let write_tmp = || -> std::io::Result<()> {
            let mut file = std::fs::File::create(&tmp_path)?;
            std::io::Write::write_all(&mut file, json.as_bytes())?;
            // Without this the rename could reach the disk before the data
            // does, leaving an empty file at `path` after a power loss.
            file.sync_all()
        };
        if let Err(e) = write_tmp() {
            // Best effort: do not leave a partial temp file behind.
            let _ = std::fs::remove_file(&tmp_path);
            return Err(CoreError::SchemaValidation(format!(
                "failed to write schema registry to '{tmp_path}': {e}"
            )));
        }
        if let Err(e) = std::fs::rename(&tmp_path, path) {
            let _ = std::fs::remove_file(&tmp_path);
            return Err(CoreError::SchemaValidation(format!(
                "failed to rename schema registry file '{tmp_path}' -> '{path}': {e}"
            )));
        }

        Ok(())
    }

    /// Load a previously persisted registry from a JSON file at `path`.
    ///
    /// Returns an empty [`SchemaRegistry`] if the file does not exist (fresh
    /// start). This design means that the first run behaves identically to
    /// subsequent runs — no special-casing is needed at call sites.
    ///
    /// Only a genuine "not found" result is treated as a fresh start. Any
    /// other failure to read the file (for example a permission error) is
    /// reported, so an unreadable registry is never mistaken for an empty one.
    ///
    /// # Errors
    ///
    /// Returns [`CoreError::SchemaValidation`] if the file exists but cannot
    /// be read, is not valid JSON, contains an unknown Arrow type string, or
    /// contains schemas that fail [`TableSchema::validate`]. Returns
    /// [`CoreError::TableAlreadyRegistered`] if the file lists the same table
    /// name more than once.
    pub fn load_from_disk(path: &str) -> Result<SchemaRegistry, CoreError> {
        // Read first and inspect the error kind instead of probing with
        // `Path::exists()`: `exists()` reports `false` for *any* metadata
        // error, and a separate check would also race with the read.
        let json = match std::fs::read_to_string(path) {
            Ok(contents) => contents,
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                return Ok(SchemaRegistry::new());
            }
            Err(e) => {
                return Err(CoreError::SchemaValidation(format!(
                    "failed to read schema registry from '{path}': {e}"
                )));
            }
        };

        let persisted: Vec<PersistedSchema> = serde_json::from_str(&json).map_err(|e| {
            CoreError::SchemaValidation(format!("failed to parse schema registry at '{path}': {e}"))
        })?;

        let registry = SchemaRegistry::new();
        for ps in persisted {
            // Rebuild each Arrow field; the first undecodable type string
            // aborts the whole load.
            let fields: Vec<Field> = ps
                .fields
                .iter()
                .map(|(name, type_str, nullable)| {
                    arrow_type_from_str(type_str).map(|dt| Field::new(name.as_str(), dt, *nullable))
                })
                .collect::<Result<_, _>>()?;

            let schema = Arc::new(Schema::new(fields));
            let table_schema = TableSchema::new(ps.name, schema, ps.primary_key);
            // `register` validates identifiers & PK — if the file was written by us it
            // will always pass, but we want a clear error if the file was hand-edited.
            registry.register(table_schema)?;
        }

        Ok(registry)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use arrow::datatypes::{DataType, Field, Schema};

    /// Two-column table `t` with an `Int64` primary key `id` and a nullable
    /// `val` column of the requested type.
    fn simple_schema(col_type: DataType) -> TableSchema {
        TableSchema::new(
            "t",
            Arc::new(Schema::new(vec![
                Field::new("id", DataType::Int64, false),
                Field::new("val", col_type, true),
            ])),
            vec!["id".to_string()],
        )
    }

    // -----------------------------------------------------------------------
    // SchemaRegistry::register — duplicate registrations
    // -----------------------------------------------------------------------

    #[test]
    fn register_idempotent_matching_schema() {
        // Despite the test name, `SchemaRegistry::register` is NOT idempotent:
        // registering the same table twice is rejected even when the schema is
        // identical. Idempotency is implemented at the HtapEngine layer; this
        // test pins the low-level behaviour that HtapEngine must handle.
        let registry = SchemaRegistry::new();
        let schema = simple_schema(DataType::Utf8);
        registry.register(schema.clone()).unwrap();

        // Second call with the same schema — must report TableAlreadyRegistered.
        let result = registry.register(schema);
        assert!(
            matches!(result, Err(CoreError::TableAlreadyRegistered(_))),
            "expected TableAlreadyRegistered, got {result:?}"
        );
    }

    #[test]
    fn register_detects_conflict() {
        // A second registration with a *different* schema for the same table
        // name is rejected with the same error as an identical one; the
        // registry itself does not compare schemas.
        let registry = SchemaRegistry::new();
        registry.register(simple_schema(DataType::Utf8)).unwrap();

        let conflicting = simple_schema(DataType::Int32); // same table, different column type
        let result = registry.register(conflicting);
        assert!(
            matches!(result, Err(CoreError::TableAlreadyRegistered(_))),
            "expected TableAlreadyRegistered for conflicting schema, got {result:?}"
        );
    }

    // -----------------------------------------------------------------------
    // arrow_type_to_str / arrow_type_from_str
    // -----------------------------------------------------------------------

    #[test]
    fn arrow_type_from_str_valid_timestamp() {
        // Well-formed strings must decode to the expected type, and encoding
        // that type must give back the exact same string (full round trip).
        let cases = [
            ("timestamp_s[]", DataType::Timestamp(TimeUnit::Second, None)),
            (
                "timestamp_ms[UTC]",
                DataType::Timestamp(TimeUnit::Millisecond, Some(Arc::from("UTC"))),
            ),
            (
                "timestamp_us[America/New_York]",
                DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from("America/New_York"))),
            ),
            (
                "timestamp_ns[]",
                DataType::Timestamp(TimeUnit::Nanosecond, None),
            ),
        ];
        for (s, expected) in cases {
            let got = arrow_type_from_str(s).unwrap_or_else(|e| panic!("parse '{s}' failed: {e}"));
            assert_eq!(got, expected, "decode mismatch for '{s}'");
            assert_eq!(
                arrow_type_to_str(&expected),
                s,
                "encode mismatch for {expected:?}"
            );
        }
    }

    #[test]
    fn arrow_type_from_str_missing_close_bracket() {
        // `timestamp_us[UTC` — has `[` but no closing `]` → must error, not panic.
        let result = arrow_type_from_str("timestamp_us[UTC");
        assert!(
            matches!(result, Err(CoreError::SchemaValidation(ref msg)) if msg.contains("missing closing ']'")),
            "expected SchemaValidation error for missing ']', got {result:?}"
        );
    }

    #[test]
    fn arrow_type_from_str_empty_bracket_no_close() {
        // `timestamp_us[` — opening bracket with nothing after it → must error, not panic.
        let result = arrow_type_from_str("timestamp_us[");
        assert!(
            matches!(result, Err(CoreError::SchemaValidation(ref msg)) if msg.contains("missing closing ']'")),
            "expected SchemaValidation error for 'timestamp_us[', got {result:?}"
        );
    }

    #[test]
    fn arrow_type_unsupported_type_is_rejected_on_decode() {
        // Types outside the supported set are encoded with an `unknown:`
        // marker, and the decoder must refuse that marker rather than guess.
        let encoded = arrow_type_to_str(&DataType::Duration(TimeUnit::Second));
        assert!(
            encoded.starts_with("unknown:"),
            "expected 'unknown:' fallback, got '{encoded}'"
        );

        let result = arrow_type_from_str(&encoded);
        assert!(
            matches!(result, Err(CoreError::SchemaValidation(ref msg)) if msg.contains("unknown Arrow type string")),
            "expected SchemaValidation error for '{encoded}', got {result:?}"
        );
    }
}