uni-common 2.0.5

// SPDX-License-Identifier: Apache-2.0
// Copyright 2024-2026 Dragonscale Team

use crate::core::edge_type::{
    MAX_SCHEMA_TYPE_ID, VIRTUAL_EDGE_TYPE_ID_SENTINEL, VIRTUAL_EDGE_TYPE_ID_START,
    is_schemaless_edge_type, make_schemaless_id,
};
use crate::sync::{acquire_read, acquire_write};
use anyhow::{Result, anyhow};
use chrono::{DateTime, Utc};
use object_store::ObjectStore;
use object_store::ObjectStoreExt;
use object_store::local::LocalFileSystem;
use object_store::path::Path as ObjectStorePath;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::Path;
use std::sync::{Arc, RwLock};

/// Lifecycle state recorded on a schema element.
///
/// Stored on [`PropertyMeta`], [`LabelMeta`] and [`EdgeTypeMeta`]; when the
/// `state` field is absent from a serialized document, serde fills in
/// [`SchemaElementState::Active`] (see `default_state`).
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[non_exhaustive]
pub enum SchemaElementState {
    /// Default state of an element.
    Active,
    /// The element has been hidden.
    /// NOTE(review): how hidden elements are treated by readers is decided
    /// outside this file — confirm against the callers.
    Hidden {
        /// Time at which the element entered this state.
        since: DateTime<Utc>,
        /// Identifier of the last snapshot in which the element was active.
        last_active_snapshot: String, // SnapshotId
    },
    /// The element has been tombstoned.
    Tombstone {
        /// Time at which the element entered this state.
        since: DateTime<Utc>,
    },
}

use arrow_schema::{DataType as ArrowDataType, Field, Fields, TimeUnit};

/// Builds the canonical Arrow struct layout used to encode a DateTime value.
///
/// The struct keeps timezone information next to the instant itself and has
/// three nullable children, in this order:
/// - `nanos_since_epoch`: nanosecond-precision Arrow timestamp (no timezone
///   attached) holding nanoseconds since the Unix epoch (UTC)
/// - `offset_seconds`: `Int32` offset from UTC in seconds (e.g., +3600 for +01:00)
/// - `timezone_name`: optional IANA timezone name (e.g., "America/New_York")
pub fn datetime_struct_fields() -> Fields {
    let nanos_since_epoch = Field::new(
        "nanos_since_epoch",
        ArrowDataType::Timestamp(TimeUnit::Nanosecond, None),
        true,
    );
    let offset_seconds = Field::new("offset_seconds", ArrowDataType::Int32, true);
    let timezone_name = Field::new("timezone_name", ArrowDataType::Utf8, true);

    Fields::from(vec![nanos_since_epoch, offset_seconds, timezone_name])
}

/// Builds the canonical Arrow struct layout used to encode a Time value.
///
/// The struct keeps the timezone offset next to the time of day and has two
/// nullable children, in this order:
/// - `nanos_since_midnight`: nanosecond-precision Arrow `Time64` holding
///   nanoseconds since midnight (0-86,399,999,999,999)
/// - `offset_seconds`: `Int32` offset from UTC in seconds (e.g., +3600 for +01:00)
pub fn time_struct_fields() -> Fields {
    let nanos_since_midnight = Field::new(
        "nanos_since_midnight",
        ArrowDataType::Time64(TimeUnit::Nanosecond),
        true,
    );
    let offset_seconds = Field::new("offset_seconds", ArrowDataType::Int32, true);

    Fields::from(vec![nanos_since_midnight, offset_seconds])
}

/// Reports whether `arrow_dt` is exactly the canonical DateTime struct.
///
/// Only an Arrow `Struct` whose fields equal [`datetime_struct_fields`] matches;
/// every other Arrow type yields `false`.
pub fn is_datetime_struct(arrow_dt: &ArrowDataType) -> bool {
    match arrow_dt {
        ArrowDataType::Struct(fields) => *fields == datetime_struct_fields(),
        _ => false,
    }
}

/// Reports whether `arrow_dt` is exactly the canonical Time struct.
///
/// Only an Arrow `Struct` whose fields equal [`time_struct_fields`] matches;
/// every other Arrow type yields `false`.
pub fn is_time_struct(arrow_dt: &ArrowDataType) -> bool {
    match arrow_dt {
        ArrowDataType::Struct(fields) => *fields == time_struct_fields(),
        _ => false,
    }
}

/// CRDT variant a schema can declare for a property (see [`DataType::Crdt`]).
///
/// The canonical string name of each variant is exposed by
/// [`CrdtType::type_name`] and is identical to the variant identifier.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[non_exhaustive]
pub enum CrdtType {
    GCounter,
    GSet,
    ORSet,
    LWWRegister,
    LWWMap,
    Rga,
    VectorClock,
    VCRegister,
}

impl CrdtType {
    /// Returns the canonical variant name for this CRDT type.
    ///
    /// The returned strings must stay in sync with `uni_crdt::Crdt::type_name`,
    /// so a written CRDT value can be validated against its schema-declared
    /// variant (see uni-store's write-time CRDT enforcement).
    ///
    /// # Examples
    /// ```
    /// use uni_common::core::schema::CrdtType;
    /// assert_eq!(CrdtType::GCounter.type_name(), "GCounter");
    /// ```
    #[must_use]
    pub fn type_name(&self) -> &'static str {
        match self {
            CrdtType::GCounter => "GCounter",
            CrdtType::GSet => "GSet",
            CrdtType::ORSet => "ORSet",
            CrdtType::LWWRegister => "LWWRegister",
            CrdtType::LWWMap => "LWWMap",
            CrdtType::Rga => "Rga",
            CrdtType::VectorClock => "VectorClock",
            CrdtType::VCRegister => "VCRegister",
        }
    }
}

/// Coordinate layout of a [`DataType::Point`] column.
///
/// `DataType::to_arrow` maps each variant to a struct of non-nullable
/// `Float64` coordinates followed by a non-nullable `crs` string field.
#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq)]
pub enum PointType {
    /// Stored as `latitude`, `longitude`, `crs`.
    Geographic,  // WGS84
    /// Stored as `x`, `y`, `crs`.
    Cartesian2D, // x, y
    /// Stored as `x`, `y`, `z`, `crs`.
    Cartesian3D, // x, y, z
}

/// Logical column type a schema property can declare.
///
/// The Arrow representation of each variant is given by `DataType::to_arrow`;
/// `DataType::accepts` decides which runtime values a column of that type takes.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[non_exhaustive]
pub enum DataType {
    String,
    Int32,
    Int64,
    Float32,
    Float64,
    Bool,
    /// Nanosecond timestamp tagged with the "UTC" timezone in Arrow.
    Timestamp,
    Date,
    /// Encoded as the struct returned by [`time_struct_fields`].
    Time,
    /// Encoded as the struct returned by [`datetime_struct_fields`].
    DateTime,
    Duration,
    /// Dynamically-typed value; stored as binary in Arrow.
    CypherValue,
    /// Raw byte buffer.
    Bytes,
    Point(PointType),
    /// Fixed-length list of `Float32` with `dimensions` elements.
    Vector { dimensions: usize },
    /// Stored as a 24-byte fixed-size binary in Arrow.
    Btic,
    /// CRDT value of the given variant; stored as binary in Arrow.
    Crdt(CrdtType),
    /// Homogeneous list of the inner type.
    List(Box<DataType>),
    /// Map from the first type (key) to the second type (value).
    Map(Box<DataType>, Box<DataType>),
}

impl DataType {
    // Alias for compatibility/convenience if needed, but preferable to use exact types.
    #[allow(non_upper_case_globals)]
    pub const Float: DataType = DataType::Float64;
    #[allow(non_upper_case_globals)]
    pub const Int: DataType = DataType::Int64;

    pub fn to_arrow(&self) -> ArrowDataType {
        match self {
            DataType::String => ArrowDataType::Utf8,
            DataType::Int32 => ArrowDataType::Int32,
            DataType::Int64 => ArrowDataType::Int64,
            DataType::Float32 => ArrowDataType::Float32,
            DataType::Float64 => ArrowDataType::Float64,
            DataType::Bool => ArrowDataType::Boolean,
            DataType::Timestamp => {
                ArrowDataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into()))
            }
            DataType::Date => ArrowDataType::Date32,
            DataType::Time => ArrowDataType::Struct(time_struct_fields()),
            DataType::DateTime => ArrowDataType::Struct(datetime_struct_fields()),
            DataType::Duration => ArrowDataType::LargeBinary, // Lance doesn't support Interval(MonthDayNano); use CypherValue codec
            DataType::CypherValue => ArrowDataType::LargeBinary, // MessagePack-tagged binary encoding
            DataType::Bytes => ArrowDataType::LargeBinary, // raw byte buffer (no codec wrapping)
            DataType::Point(pt) => match pt {
                PointType::Geographic => ArrowDataType::Struct(Fields::from(vec![
                    Field::new("latitude", ArrowDataType::Float64, false),
                    Field::new("longitude", ArrowDataType::Float64, false),
                    Field::new("crs", ArrowDataType::Utf8, false),
                ])),
                PointType::Cartesian2D => ArrowDataType::Struct(Fields::from(vec![
                    Field::new("x", ArrowDataType::Float64, false),
                    Field::new("y", ArrowDataType::Float64, false),
                    Field::new("crs", ArrowDataType::Utf8, false),
                ])),
                PointType::Cartesian3D => ArrowDataType::Struct(Fields::from(vec![
                    Field::new("x", ArrowDataType::Float64, false),
                    Field::new("y", ArrowDataType::Float64, false),
                    Field::new("z", ArrowDataType::Float64, false),
                    Field::new("crs", ArrowDataType::Utf8, false),
                ])),
            },
            DataType::Vector { dimensions } => ArrowDataType::FixedSizeList(
                Arc::new(Field::new("item", ArrowDataType::Float32, true)),
                *dimensions as i32,
            ),
            DataType::Btic => ArrowDataType::FixedSizeBinary(24),
            DataType::Crdt(_) => ArrowDataType::Binary, // Store CRDT as binary MessagePack
            DataType::List(inner) => {
                ArrowDataType::List(Arc::new(Field::new("item", inner.to_arrow(), true)))
            }
            DataType::Map(key, value) => ArrowDataType::List(Arc::new(Field::new(
                "item",
                ArrowDataType::Struct(Fields::from(vec![
                    Field::new("key", key.to_arrow(), false),
                    Field::new("value", value.to_arrow(), true),
                ])),
                true,
            ))),
        }
    }

    /// Returns `true` if `value` is directly storable in this column type without loss.
    ///
    /// This is the schema-level type guard used by the write path. `Value::Null` is
    /// always accepted — column nullability is enforced separately by the `nullable`
    /// flag, not here. `CypherValue`, `Crdt`, and `Point` columns accept any value.
    /// For every other declared type, only the `Value` variants that the storage layer
    /// persists *without silently nulling* are accepted (see the per-type converters in
    /// `uni-store`'s `arrow_convert`), plus the intentional lossless widenings
    /// `Int`→`Float`, `Int`→`Int32`, and `Temporal`→`Timestamp`.
    ///
    /// A `Value::String` destined for a `Date`/`Time`/`DateTime`/`Duration` column is
    /// intentionally *not* accepted here: the write path first coerces such strings into
    /// the proper `Temporal` value (matching the Cypher temporal constructors), then the
    /// coerced value passes this check. This keeps `accepts` a pure, allocation-free
    /// predicate.
    ///
    /// # Examples
    /// ```
    /// use uni_common::core::schema::DataType;
    /// use uni_common::Value;
    ///
    /// assert!(DataType::Float64.accepts(&Value::Int(3))); // Int widens to Float
    /// assert!(DataType::Bool.accepts(&Value::Null)); // Null always accepted
    /// assert!(!DataType::DateTime.accepts(&Value::String("2026-01-01T00:00:00Z".into())));
    /// ```
    pub fn accepts(&self, value: &crate::value::Value) -> bool {
        use crate::value::{TemporalValue, Value};

        // Null is universally accepted; nullability is a separate concern.
        if matches!(value, Value::Null) {
            return true;
        }

        match self {
            // Opaque / dynamically-typed columns accept any value.
            DataType::CypherValue | DataType::Crdt(_) | DataType::Point(_) => true,

            DataType::String => matches!(value, Value::String(_)),
            DataType::Int32 | DataType::Int64 => matches!(value, Value::Int(_)),
            // Int widens to Float losslessly for the ranges we care about.
            DataType::Float32 | DataType::Float64 => {
                matches!(value, Value::Int(_) | Value::Float(_))
            }
            DataType::Bool => matches!(value, Value::Bool(_)),

            // Non-struct timestamp column: storage parses strings and accepts ints,
            // so both are lossless here (unlike the DateTime struct column below).
            DataType::Timestamp => matches!(
                value,
                Value::String(_)
                    | Value::Int(_)
                    | Value::Temporal(
                        TemporalValue::DateTime { .. } | TemporalValue::LocalDateTime { .. }
                    )
            ),
            DataType::DateTime => matches!(
                value,
                Value::Temporal(
                    TemporalValue::DateTime { .. } | TemporalValue::LocalDateTime { .. }
                )
            ),
            DataType::Date => {
                matches!(
                    value,
                    Value::Int(_) | Value::Temporal(TemporalValue::Date { .. })
                )
            }
            DataType::Time => matches!(
                value,
                Value::Int(_)
                    | Value::Temporal(TemporalValue::Time { .. } | TemporalValue::LocalTime { .. })
            ),
            DataType::Duration => {
                matches!(value, Value::Temporal(TemporalValue::Duration { .. }))
            }
            DataType::Bytes => matches!(value, Value::Bytes(_)),
            // FixedSizeBinary(24) converter accepts the Btic temporal, raw strings, and lists.
            DataType::Btic => matches!(
                value,
                Value::String(_) | Value::List(_) | Value::Temporal(TemporalValue::Btic { .. })
            ),
            DataType::Vector { .. } => matches!(value, Value::Vector(_) | Value::List(_)),
            DataType::List(_) => matches!(value, Value::List(_)),
            DataType::Map(_, _) => matches!(value, Value::Map(_)),
        }
    }
}

/// Serde default for `created_at`: the current UTC time at the moment of
/// deserialization (used by [`LabelMeta`] when the field is missing).
fn default_created_at() -> DateTime<Utc> {
    Utc::now()
}

/// Serde default for `state` fields: a missing state deserializes as
/// [`SchemaElementState::Active`].
fn default_state() -> SchemaElementState {
    SchemaElementState::Active
}

/// Serde default for `PropertyMeta::added_in`: a missing value deserializes as
/// schema version 1.
fn default_version_1() -> u32 {
    1
}

/// Schema metadata for a single property.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PropertyMeta {
    /// Declared column type of the property.
    pub r#type: DataType,
    /// Whether the property may hold null.
    pub nullable: bool,
    /// Schema version that introduced the property; defaults to 1 when absent
    /// from the serialized form.
    #[serde(default = "default_version_1")]
    pub added_in: u32, // SchemaVersion
    /// Lifecycle state; defaults to `Active` when absent.
    #[serde(default = "default_state")]
    pub state: SchemaElementState,
    /// Optional generation expression; `None` when absent.
    /// NOTE(review): the expression language is defined outside this file.
    #[serde(default)]
    pub generation_expression: Option<String>,
    /// Optional free-text description; omitted from the serialized form when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
}

/// Schema metadata for a single label.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct LabelMeta {
    /// Numeric identifier of the label.
    pub id: u16, // LabelId
    /// Creation time; when absent from the serialized form it is filled with the
    /// current UTC time at deserialization.
    #[serde(default = "default_created_at")]
    pub created_at: DateTime<Utc>,
    /// Lifecycle state; defaults to `Active` when absent.
    #[serde(default = "default_state")]
    pub state: SchemaElementState,
    /// Optional free-text description; omitted from the serialized form when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
}

/// Schema metadata for a single schema-defined edge type.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct EdgeTypeMeta {
    /// See [`crate::core::edge_type::EdgeTypeId`] for bit-layout details.
    pub id: u32,
    /// Label names recorded for the source end of the edge.
    pub src_labels: Vec<String>,
    /// Label names recorded for the destination end of the edge.
    pub dst_labels: Vec<String>,
    /// Lifecycle state; defaults to `Active` when absent.
    #[serde(default = "default_state")]
    pub state: SchemaElementState,
    /// Optional free-text description; omitted from the serialized form when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
}

/// Kind of rule a [`Constraint`] expresses, together with its parameters.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[non_exhaustive]
pub enum ConstraintType {
    /// Uniqueness over the listed properties.
    Unique { properties: Vec<String> },
    /// Presence of the named property.
    Exists { property: String },
    /// An expression stored as text.
    /// NOTE(review): the expression language is evaluated outside this file.
    Check { expression: String },
}

/// Schema element a [`Constraint`] is attached to, identified by name.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[non_exhaustive]
pub enum ConstraintTarget {
    /// A label, by name.
    Label(String),
    /// An edge type, by name.
    EdgeType(String),
}

/// A named constraint stored in [`Schema::constraints`].
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Constraint {
    /// Name of the constraint.
    pub name: String,
    /// What the constraint checks.
    pub constraint_type: ConstraintType,
    /// Label or edge type the constraint applies to.
    pub target: ConstraintTarget,
    /// Whether the constraint is enabled.
    pub enabled: bool,
}

/// Bidirectional registry for dynamically-assigned schemaless edge type IDs.
///
/// Edge types not defined in the schema are assigned IDs at runtime with
/// bit 31 set (see [`crate::core::edge_type`]). This registry maintains
/// the name-to-ID and ID-to-name mappings for those types. Names are keyed
/// exactly as given (case-sensitive); a separate case-insensitive lookup is
/// offered by `id_by_name_case_insensitive`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SchemalessEdgeTypeRegistry {
    /// Edge type name → assigned schemaless ID.
    name_to_id: HashMap<String, u32>,
    /// Assigned schemaless ID → edge type name (inverse of `name_to_id`).
    id_to_name: HashMap<u32, String>,
    /// Next local ID to assign (0 is reserved for invalid).
    next_local_id: u32,
}

impl SchemalessEdgeTypeRegistry {
    /// Creates an empty registry. The first local ID handed out is 1, because
    /// 0 is reserved for "invalid".
    pub fn new() -> Self {
        Self {
            name_to_id: HashMap::new(),
            id_to_name: HashMap::new(),
            next_local_id: 1,
        }
    }

    /// Returns the schemaless ID for `type_name`, assigning a new one if needed.
    ///
    /// Names are matched exactly (case-sensitive). A new ID is derived from the
    /// next local counter via `make_schemaless_id`, and both direction maps are
    /// updated together so they stay inverse to each other.
    pub fn get_or_assign_id(&mut self, type_name: &str) -> u32 {
        if let Some(&id) = self.name_to_id.get(type_name) {
            return id;
        }

        let id = make_schemaless_id(self.next_local_id);
        self.next_local_id += 1;

        self.name_to_id.insert(type_name.to_string(), id);
        self.id_to_name.insert(id, type_name.to_string());

        id
    }

    /// Looks up the edge type name for a schemaless ID.
    pub fn type_name_by_id(&self, type_id: u32) -> Option<&str> {
        self.id_to_name.get(&type_id).map(String::as_str)
    }

    /// Returns `true` if `type_name` has already been assigned a schemaless ID.
    pub fn contains(&self, type_name: &str) -> bool {
        self.name_to_id.contains_key(type_name)
    }

    /// Looks up the edge type ID for `type_name` with ASCII case-insensitive matching.
    ///
    /// An exact (case-sensitive) match always wins and is answered with a
    /// single hash lookup. Otherwise every registered name is compared
    /// case-insensitively, and if several names match, the lowest ID is
    /// returned. Because names are registered case-sensitively, "KNOWS" and
    /// "knows" can both exist; picking by hash iteration order would make the
    /// answer vary between runs.
    pub fn id_by_name_case_insensitive(&self, type_name: &str) -> Option<u32> {
        if let Some(&id) = self.name_to_id.get(type_name) {
            return Some(id);
        }

        self.name_to_id
            .iter()
            .filter(|(name, _)| name.eq_ignore_ascii_case(type_name))
            .map(|(_, &id)| id)
            .min()
    }

    /// Returns all registered schemaless type IDs, in no particular order.
    pub fn all_type_ids(&self) -> Vec<u32> {
        self.id_to_name.keys().copied().collect()
    }

    /// Returns true if the registry has no schemaless types.
    pub fn is_empty(&self) -> bool {
        self.name_to_id.is_empty()
    }
}

/// The default registry is the empty registry produced by
/// `SchemalessEdgeTypeRegistry::new`.
impl Default for SchemalessEdgeTypeRegistry {
    // Delegates to `new()` so the local-ID counter starts at 1 rather than
    // the 0 a derived `Default` would give.
    fn default() -> Self {
        Self::new()
    }
}

/// Lowest label ID of the virtual (catalog-resolved) range.
///
/// IDs in `VIRTUAL_LABEL_ID_START..VIRTUAL_LABEL_ID_SENTINEL` belong to
/// plugin-registered `CatalogProvider`s; the planner allocates them lazily
/// through `PluginRegistry::register_virtual_label`. Native label allocation
/// (`SchemaManager::add_label`) refuses IDs in this range.
pub const VIRTUAL_LABEL_ID_START: u16 = 0xFF00;
/// "No label" sentinel. It sits just past the virtual range and is never an
/// allocatable ID.
pub const VIRTUAL_LABEL_ID_SENTINEL: u16 = 0xFFFF;

/// Tells whether `id` falls in the virtual (catalog-resolved) range, i.e.
/// `VIRTUAL_LABEL_ID_START <= id < VIRTUAL_LABEL_ID_SENTINEL`. The sentinel
/// itself is not a virtual label ID.
#[inline]
pub fn is_virtual_label_id(id: u16) -> bool {
    matches!(id, VIRTUAL_LABEL_ID_START..VIRTUAL_LABEL_ID_SENTINEL)
}

/// In-memory representation of the serialized schema document.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Schema {
    /// Version number of this schema (starts at 1 for `Schema::default()`).
    pub schema_version: u32,
    /// Label name → label metadata.
    pub labels: HashMap<String, LabelMeta>,
    /// Edge type name → metadata for schema-defined edge types.
    pub edge_types: HashMap<String, EdgeTypeMeta>,
    /// Two-level map ending in property name → property metadata.
    /// NOTE(review): the outer key is presumably a label or edge type name —
    /// confirm against the writers of this map.
    pub properties: HashMap<String, HashMap<String, PropertyMeta>>,
    /// Index definitions; empty when absent from the serialized form.
    #[serde(default)]
    pub indexes: Vec<IndexDefinition>,
    /// Constraints; empty when absent from the serialized form.
    #[serde(default)]
    pub constraints: Vec<Constraint>,
    /// Registry for schemaless edge types (dynamically assigned IDs)
    #[serde(default)]
    pub schemaless_registry: SchemalessEdgeTypeRegistry,
}

impl Default for Schema {
    /// Produces an empty schema at version 1: no labels, edge types,
    /// properties, indexes or constraints, and a fresh schemaless registry.
    fn default() -> Self {
        let schemaless_registry = SchemalessEdgeTypeRegistry::new();

        Self {
            schema_version: 1,
            labels: HashMap::default(),
            edge_types: HashMap::default(),
            properties: HashMap::default(),
            indexes: Vec::default(),
            constraints: Vec::default(),
            schemaless_registry,
        }
    }
}

impl Schema {
    /// Returns the label name for a given label ID.
    ///
    /// Performs a linear scan over all labels. This is efficient because
    /// the number of labels in a schema is typically small.
    pub fn label_name_by_id(&self, label_id: u16) -> Option<&str> {
        self.labels
            .iter()
            .find(|(_, meta)| meta.id == label_id)
            .map(|(name, _)| name.as_str())
    }

    /// Returns the label ID for a given label name (exact, case-sensitive match).
    pub fn label_id_by_name(&self, label_name: &str) -> Option<u16> {
        self.labels.get(label_name).map(|meta| meta.id)
    }

    /// Returns the edge type name for a given type ID.
    ///
    /// Only schema-defined edge types are searched. Performs a linear scan over
    /// all edge types. This is efficient because the number of edge types in a
    /// schema is typically small.
    pub fn edge_type_name_by_id(&self, type_id: u32) -> Option<&str> {
        self.edge_types
            .iter()
            .find(|(_, meta)| meta.id == type_id)
            .map(|(name, _)| name.as_str())
    }

    /// Returns the edge type ID for a given type name (exact, case-sensitive
    /// match, schema-defined types only).
    pub fn edge_type_id_by_name(&self, type_name: &str) -> Option<u32> {
        self.edge_types.get(type_name).map(|meta| meta.id)
    }

    /// Returns the vector index configuration for a given label and property.
    ///
    /// Only indexes whose status is [`IndexStatus::Online`] are considered; the
    /// first matching definition in `indexes` is returned.
    ///
    /// Performs a linear scan over all indexes. This is efficient because
    /// the number of indexes in a schema is typically small.
    pub fn vector_index_for_property(
        &self,
        label: &str,
        property: &str,
    ) -> Option<&VectorIndexConfig> {
        self.indexes.iter().find_map(|idx| match idx {
            IndexDefinition::Vector(config)
                if config.label == label
                    && config.property == property
                    && config.metadata.status == IndexStatus::Online =>
            {
                Some(config)
            }
            _ => None,
        })
    }

    /// Returns the full-text index configuration for a given label and property.
    ///
    /// A full-text index covers one or more properties. This returns the config
    /// if the specified property is among the indexed properties and the index
    /// status is [`IndexStatus::Online`].
    pub fn fulltext_index_for_property(
        &self,
        label: &str,
        property: &str,
    ) -> Option<&FullTextIndexConfig> {
        self.indexes.iter().find_map(|idx| match idx {
            IndexDefinition::FullText(config)
                if config.label == label
                    && config.properties.iter().any(|p| p == property)
                    && config.metadata.status == IndexStatus::Online =>
            {
                Some(config)
            }
            _ => None,
        })
    }

    /// Get label metadata with ASCII case-insensitive lookup.
    ///
    /// This allows queries to match labels regardless of case. An exact
    /// (case-sensitive) match always wins and costs a single hash lookup.
    /// Otherwise all labels are scanned, and if several names differ only by
    /// case, the one with the lowest ID is returned so the answer does not
    /// depend on hash-map iteration order.
    pub fn get_label_case_insensitive(&self, name: &str) -> Option<&LabelMeta> {
        self.labels.get(name).or_else(|| {
            self.labels
                .iter()
                .filter(|(k, _)| k.eq_ignore_ascii_case(name))
                .map(|(_, meta)| meta)
                .min_by_key(|meta| meta.id)
        })
    }

    /// Get label ID with case-insensitive lookup.
    ///
    /// Same matching rules as [`get_label_case_insensitive`](Self::get_label_case_insensitive).
    pub fn label_id_by_name_case_insensitive(&self, label_name: &str) -> Option<u16> {
        self.get_label_case_insensitive(label_name)
            .map(|meta| meta.id)
    }

    /// Get edge type metadata with ASCII case-insensitive lookup.
    ///
    /// This allows queries to match edge types regardless of case. An exact
    /// (case-sensitive) match always wins and costs a single hash lookup.
    /// Otherwise all schema-defined edge types are scanned, and if several
    /// names differ only by case, the one with the lowest ID is returned so the
    /// answer does not depend on hash-map iteration order.
    pub fn get_edge_type_case_insensitive(&self, name: &str) -> Option<&EdgeTypeMeta> {
        self.edge_types.get(name).or_else(|| {
            self.edge_types
                .iter()
                .filter(|(k, _)| k.eq_ignore_ascii_case(name))
                .map(|(_, meta)| meta)
                .min_by_key(|meta| meta.id)
        })
    }

    /// Get edge type ID with case-insensitive lookup (schema-defined types only).
    pub fn edge_type_id_by_name_case_insensitive(&self, type_name: &str) -> Option<u32> {
        self.get_edge_type_case_insensitive(type_name)
            .map(|meta| meta.id)
    }

    /// Get edge type ID with case-insensitive lookup, checking both
    /// schema-defined and schemaless registries.
    ///
    /// Schema-defined types take precedence; the schemaless registry is only
    /// consulted when no schema-defined type matches.
    pub fn edge_type_id_unified_case_insensitive(&self, type_name: &str) -> Option<u32> {
        self.edge_type_id_by_name_case_insensitive(type_name)
            .or_else(|| {
                self.schemaless_registry
                    .id_by_name_case_insensitive(type_name)
            })
    }

    /// Returns the edge type ID for `type_name`, checking the schema first
    /// and falling back to the schemaless registry (assigning a new ID if needed).
    ///
    /// Requires `&mut self` because it may assign a new schemaless ID.
    /// Use [`edge_type_id_by_name`](Self::edge_type_id_by_name) for read-only schema lookups.
    pub fn get_or_assign_edge_type_id(&mut self, type_name: &str) -> u32 {
        if let Some(id) = self.edge_type_id_by_name(type_name) {
            return id;
        }
        self.schemaless_registry.get_or_assign_id(type_name)
    }

    /// Returns the edge type name for `type_id`, checking both the schema
    /// and schemaless registries. Returns an owned `String` because the
    /// name may come from either registry.
    ///
    /// The registry to consult is chosen by `is_schemaless_edge_type(type_id)`;
    /// the other registry is never searched for that ID.
    pub fn edge_type_name_by_id_unified(&self, type_id: u32) -> Option<String> {
        if is_schemaless_edge_type(type_id) {
            self.schemaless_registry
                .type_name_by_id(type_id)
                .map(str::to_owned)
        } else {
            self.edge_type_name_by_id(type_id).map(str::to_owned)
        }
    }

    /// Returns all edge type IDs, including both schema-defined and schemaless types,
    /// sorted in ascending order.
    /// Used when MATCH queries don't specify an edge type and need to scan all edges.
    pub fn all_edge_type_ids(&self) -> Vec<u32> {
        let mut ids: Vec<u32> = self.edge_types.values().map(|m| m.id).collect();
        ids.extend(self.schemaless_registry.all_type_ids());
        ids.sort_unstable();
        ids
    }
}

/// Lifecycle status of an index.
///
/// The default status is [`IndexStatus::Online`], which is also what a
/// serialized [`IndexMetadata`] without a `status` field deserializes to.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
pub enum IndexStatus {
    /// Index is up-to-date and available for queries.
    #[default]
    Online,
    /// Index is currently being rebuilt.
    Building,
    /// Index is outdated and scheduled for rebuild.
    Stale,
    /// Index rebuild failed after exhausting retries.
    Failed,
}

/// Metadata tracking the lifecycle state of an index.
///
/// Every field is optional in the serialized form; the default value is an
/// `Online` index with no recorded build time or row count.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
pub struct IndexMetadata {
    /// Current lifecycle status.
    #[serde(default)]
    pub status: IndexStatus,
    /// When the index was last successfully built.
    #[serde(default)]
    pub last_built_at: Option<DateTime<Utc>>,
    /// Row count of the dataset when the index was last built.
    #[serde(default)]
    pub row_count_at_build: Option<u64>,
}

/// One index definition stored in [`Schema::indexes`].
///
/// Serialized as an internally tagged enum: the variant name is written to a
/// `type` field alongside the fields of the variant's configuration struct.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(tag = "type")]
#[non_exhaustive]
pub enum IndexDefinition {
    /// Vector index over a single property.
    Vector(VectorIndexConfig),
    /// Full-text index over one or more properties.
    FullText(FullTextIndexConfig),
    /// Scalar index over one or more properties.
    Scalar(ScalarIndexConfig),
    /// Inverted index over a single property.
    Inverted(InvertedIndexConfig),
    /// Full-text index over a JSON column.
    JsonFullText(JsonFtsIndexConfig),
}

impl IndexDefinition {
    /// Returns the index name for any variant.
    pub fn name(&self) -> &str {
        match self {
            IndexDefinition::Vector(c) => &c.name,
            IndexDefinition::FullText(c) => &c.name,
            IndexDefinition::Scalar(c) => &c.name,
            IndexDefinition::Inverted(c) => &c.name,
            IndexDefinition::JsonFullText(c) => &c.name,
        }
    }

    /// Returns the label this index is defined on.
    pub fn label(&self) -> &str {
        match self {
            IndexDefinition::Vector(c) => &c.label,
            IndexDefinition::FullText(c) => &c.label,
            IndexDefinition::Scalar(c) => &c.label,
            IndexDefinition::Inverted(c) => &c.label,
            IndexDefinition::JsonFullText(c) => &c.label,
        }
    }

    /// Returns a reference to the index lifecycle metadata.
    pub fn metadata(&self) -> &IndexMetadata {
        match self {
            IndexDefinition::Vector(c) => &c.metadata,
            IndexDefinition::FullText(c) => &c.metadata,
            IndexDefinition::Scalar(c) => &c.metadata,
            IndexDefinition::Inverted(c) => &c.metadata,
            IndexDefinition::JsonFullText(c) => &c.metadata,
        }
    }

    /// Returns a mutable reference to the index lifecycle metadata.
    pub fn metadata_mut(&mut self) -> &mut IndexMetadata {
        match self {
            IndexDefinition::Vector(c) => &mut c.metadata,
            IndexDefinition::FullText(c) => &mut c.metadata,
            IndexDefinition::Scalar(c) => &mut c.metadata,
            IndexDefinition::Inverted(c) => &mut c.metadata,
            IndexDefinition::JsonFullText(c) => &mut c.metadata,
        }
    }
}

/// Configuration of an inverted index over a single property.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct InvertedIndexConfig {
    /// Name of the index.
    pub name: String,
    /// Label the index is defined on.
    pub label: String,
    /// Indexed property.
    pub property: String,
    /// Normalization switch; defaults to `true` when absent.
    /// NOTE(review): the normalization applied is defined by the index builder,
    /// not in this file.
    #[serde(default = "default_normalize")]
    pub normalize: bool,
    /// Per-document term limit; defaults to 10,000 when absent.
    #[serde(default = "default_max_terms_per_doc")]
    pub max_terms_per_doc: usize,
    /// Lifecycle metadata; defaults to an `Online` index with no build info.
    #[serde(default)]
    pub metadata: IndexMetadata,
}

/// Serde default for `InvertedIndexConfig::normalize`: enabled.
fn default_normalize() -> bool {
    true
}

/// Serde default for `InvertedIndexConfig::max_terms_per_doc`: 10,000.
fn default_max_terms_per_doc() -> usize {
    10_000
}

/// Configuration of a vector index over a single property.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct VectorIndexConfig {
    /// Name of the index.
    pub name: String,
    /// Label the index is defined on.
    pub label: String,
    /// Indexed property.
    pub property: String,
    /// Index structure and its tuning parameters.
    pub index_type: VectorIndexType,
    /// Distance metric used by the index.
    pub metric: DistanceMetric,
    /// Optional embedding settings for this index; `None` when not configured.
    pub embedding_config: Option<EmbeddingConfig>,
    /// Lifecycle metadata; defaults to an `Online` index with no build info.
    #[serde(default)]
    pub metadata: IndexMetadata,
}

/// Embedding settings attached to a [`VectorIndexConfig`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct EmbeddingConfig {
    /// Model alias in the Uni-Xervo catalog (for example: "embed/default").
    pub alias: String,
    /// Names of the properties the embedding is derived from.
    /// NOTE(review): how multiple properties are combined is decided by the
    /// embedding pipeline, not in this file.
    pub source_properties: Vec<String>,
    /// Batch size used when embedding.
    pub batch_size: usize,
    /// Prefix prepended to text before embedding during auto-embed (document side).
    /// Example: `"search_document: "` for Nomic models. Include any trailing space.
    #[serde(default)]
    pub document_prefix: Option<String>,
    /// Prefix prepended to text before embedding during query-time embed calls.
    /// Example: `"search_query: "` for Nomic models. Include any trailing space.
    #[serde(default)]
    pub query_prefix: Option<String>,
}

/// Structure of a vector index together with its tuning parameters.
///
/// Fields marked `#[serde(default)]` are optional in the serialized form and
/// deserialize to `None` when absent.
/// NOTE(review): the meaning of each parameter is defined by the index
/// implementation that consumes this configuration, not by this file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[non_exhaustive]
pub enum VectorIndexType {
    /// No tuning parameters.
    Flat,
    IvfFlat {
        num_partitions: u32,
    },
    IvfPq {
        num_partitions: u32,
        num_sub_vectors: u32,
        bits_per_subvector: u8,
    },
    IvfSq {
        num_partitions: u32,
    },
    IvfRq {
        num_partitions: u32,
        // Optional in the serialized form.
        #[serde(default)]
        num_bits: Option<u8>,
    },
    HnswFlat {
        m: u32,
        ef_construction: u32,
        // Optional in the serialized form.
        #[serde(default)]
        num_partitions: Option<u32>,
    },
    HnswSq {
        m: u32,
        ef_construction: u32,
        // Optional in the serialized form.
        #[serde(default)]
        num_partitions: Option<u32>,
    },
    HnswPq {
        m: u32,
        ef_construction: u32,
        num_sub_vectors: u32,
        // Optional in the serialized form.
        #[serde(default)]
        num_partitions: Option<u32>,
    },
}

/// Distance metric for vector comparison.
///
/// For every metric, `DistanceMetric::compute_distance` returns a value where
/// lower means more similar.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[non_exhaustive]
pub enum DistanceMetric {
    /// `1.0 - cosine_similarity`.
    Cosine,
    /// Squared Euclidean distance.
    L2,
    /// Negative dot product.
    Dot,
}

impl DistanceMetric {
    /// Computes the distance between two vectors using this metric.
    ///
    /// All metrics follow LanceDB conventions so that lower values indicate
    /// greater similarity:
    /// - **L2**: squared Euclidean distance.
    /// - **Cosine**: `1.0 - cosine_similarity` (range \[0, 2\]).
    /// - **Dot**: negative dot product.
    ///
    /// # Panics
    ///
    /// Panics if `a` and `b` have different lengths.
    pub fn compute_distance(&self, a: &[f32], b: &[f32]) -> f32 {
        assert_eq!(a.len(), b.len(), "vector dimension mismatch");
        match self {
            DistanceMetric::L2 => a.iter().zip(b).map(|(x, y)| (x - y).powi(2)).sum(),
            DistanceMetric::Cosine => {
                let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
                let norm_a: f32 = a.iter().map(|x| x.powi(2)).sum::<f32>().sqrt();
                let norm_b: f32 = b.iter().map(|x| x.powi(2)).sum::<f32>().sqrt();
                let denom = norm_a * norm_b;
                if denom == 0.0 { 1.0 } else { 1.0 - dot / denom }
            }
            DistanceMetric::Dot => {
                let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
                -dot
            }
        }
    }
}

/// Configuration of a full-text index over one or more properties.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct FullTextIndexConfig {
    /// Name of the index.
    pub name: String,
    /// Label the index is defined on.
    pub label: String,
    /// Properties covered by the index.
    pub properties: Vec<String>,
    /// Tokenizer used by the index.
    pub tokenizer: TokenizerConfig,
    /// Whether the index is built with positions.
    pub with_positions: bool,
    /// Lifecycle metadata; defaults to an `Online` index with no build info.
    #[serde(default)]
    pub metadata: IndexMetadata,
}

/// Tokenizer selection for a [`FullTextIndexConfig`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[non_exhaustive]
pub enum TokenizerConfig {
    Standard,
    Whitespace,
    /// N-gram tokenizer with the given minimum and maximum gram sizes.
    Ngram { min: u8, max: u8 },
    /// Tokenizer referenced by name.
    /// NOTE(review): where custom names are resolved is outside this file.
    Custom { name: String },
}

/// Configuration of a full-text index over a JSON column.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct JsonFtsIndexConfig {
    /// Name of the index.
    pub name: String,
    /// Label the index is defined on.
    pub label: String,
    /// Indexed column.
    pub column: String,
    /// Paths within the column; empty when absent from the serialized form.
    /// NOTE(review): presumably an empty list means "all paths" — confirm with
    /// the index builder.
    #[serde(default)]
    pub paths: Vec<String>,
    /// Whether the index is built with positions; `false` when absent.
    #[serde(default)]
    pub with_positions: bool,
    /// Lifecycle metadata; defaults to an `Online` index with no build info.
    #[serde(default)]
    pub metadata: IndexMetadata,
}

/// Configuration of a scalar index over one or more properties.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct ScalarIndexConfig {
    /// Name of the index.
    pub name: String,
    /// Label the index is defined on.
    pub label: String,
    /// Properties covered by the index.
    pub properties: Vec<String>,
    /// Index structure.
    pub index_type: ScalarIndexType,
    /// Optional filter expression stored as text; `None` when not set.
    pub where_clause: Option<String>,
    /// Lifecycle metadata; defaults to an `Online` index with no build info.
    #[serde(default)]
    pub metadata: IndexMetadata,
}

/// Structure used by a [`ScalarIndexConfig`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[non_exhaustive]
pub enum ScalarIndexType {
    BTree,
    Hash,
    Bitmap,
    LabelList,
}

/// Owns the current [`Schema`] together with the object-store location it is
/// loaded from.
pub struct SchemaManager {
    /// Object store holding the serialized schema.
    store: Arc<dyn ObjectStore>,
    /// Path of the schema document inside `store`.
    path: ObjectStorePath,
    /// Current schema. The `Arc` lets readers take a cheap clone of the whole
    /// schema while the lock guards replacement of the pointer.
    schema: RwLock<Arc<Schema>>,
}

impl SchemaManager {
    /// Load the schema stored at a local filesystem `path`.
    ///
    /// The file's parent directory becomes the root of a [`LocalFileSystem`]
    /// object store and the file name becomes the object path inside it; the
    /// read itself is delegated to [`Self::load_from_store`].
    ///
    /// A bare relative file name such as `"schema.json"` has an *empty*
    /// parent component (`Path::parent` returns `Some("")`), which cannot be
    /// canonicalized into a store prefix. It is therefore resolved against
    /// the current directory (`.`).
    ///
    /// # Errors
    ///
    /// * `"Invalid schema path"` when `path` has no parent (e.g. `/` or `""`).
    /// * `"Invalid schema filename"` when `path` has no final component.
    /// * `"Invalid utf8 filename"` when the file name is not valid UTF-8.
    /// * Any error from creating the local store or from
    ///   [`Self::load_from_store`].
    pub async fn load(path: impl AsRef<Path>) -> Result<Self> {
        let path = path.as_ref();
        let parent = match path.parent() {
            // Single-component relative path: use the working directory.
            Some(dir) if dir.as_os_str().is_empty() => Path::new("."),
            Some(dir) => dir,
            None => return Err(anyhow!("Invalid schema path")),
        };
        let filename = path
            .file_name()
            .ok_or_else(|| anyhow!("Invalid schema filename"))?
            .to_str()
            .ok_or_else(|| anyhow!("Invalid utf8 filename"))?;

        let store = Arc::new(LocalFileSystem::new_with_prefix(parent)?);
        let obj_path = ObjectStorePath::from(filename);

        Self::load_from_store(store, &obj_path).await
    }

    pub async fn load_from_store(
        store: Arc<dyn ObjectStore>,
        path: &ObjectStorePath,
    ) -> Result<Self> {
        match store.get(path).await {
            Ok(result) => {
                let bytes = result.bytes().await?;
                let content = String::from_utf8(bytes.to_vec())?;
                let mut schema: Schema = serde_json::from_str(&content)?;
                // Self-heal catalogs that grew super-linearly under the
                // pre-fix `add_index` (issue rustic-ai/uni-db#63). Collapse
                // duplicate index entries by name, keeping the *last*
                // occurrence — matches the upsert semantics in `add_index`
                // and preserves whatever metadata the most recent rebuild
                // wrote. The dedup persists on the next mutation that
                // calls `save()`.
                let original_len = schema.indexes.len();
                if original_len > 0 {
                    let mut seen: std::collections::HashSet<String> =
                        std::collections::HashSet::with_capacity(original_len);
                    let mut dedup: Vec<IndexDefinition> = schema
                        .indexes
                        .iter()
                        .rev()
                        .filter(|idx| seen.insert(idx.name().to_string()))
                        .cloned()
                        .collect();
                    dedup.reverse();
                    if dedup.len() != original_len {
                        tracing::warn!(
                            collapsed = original_len - dedup.len(),
                            kept = dedup.len(),
                            "schema.indexes: collapsed duplicate entries on load (issue #63)"
                        );
                        schema.indexes = dedup;
                    }
                }
                Ok(Self {
                    store,
                    path: path.clone(),
                    schema: RwLock::new(Arc::new(schema)),
                })
            }
            Err(object_store::Error::NotFound { .. }) => Ok(Self {
                store,
                path: path.clone(),
                schema: RwLock::new(Arc::new(Schema::default())),
            }),
            Err(e) => Err(anyhow::Error::from(e)),
        }
    }

    /// Serialize the current schema as pretty-printed JSON and write it to
    /// this manager's `path` in its object store.
    ///
    /// The read lock is held only while serializing and is released before
    /// the asynchronous write starts.
    ///
    /// # Errors
    ///
    /// Returns an error if the read lock cannot be acquired, serialization
    /// fails, or the store rejects the write.
    pub async fn save(&self) -> Result<()> {
        let json = {
            let guard = acquire_read(&self.schema, "schema")?;
            serde_json::to_string_pretty(&**guard)?
        };
        self.store.put(&self.path, json.into()).await?;
        Ok(())
    }

    /// Location of the schema object inside this manager's object store.
    pub fn path(&self) -> &ObjectStorePath {
        &self.path
    }

    /// Return a shared handle to the current schema.
    ///
    /// The returned `Arc` is a snapshot: later mutations through this manager
    /// install or copy-on-write a new value and do not change it.
    ///
    /// # Panics
    ///
    /// Panics if the schema lock is poisoned.
    pub fn schema(&self) -> Arc<Schema> {
        let guard = self
            .schema
            .read()
            .expect("Schema lock poisoned - a thread panicked while holding it");
        Arc::clone(&*guard)
    }

    /// Uppercase every function name in `expr` so that expressions differing
    /// only in function-name case normalize to the same text.
    ///
    /// An identifier starts at an alphabetic character and continues through
    /// alphanumerics and `_`. It counts as a function name only when the very
    /// next character is `(`; everything else (property names, operators,
    /// whitespace, a leading `_`) is copied through unchanged.
    ///
    /// Examples: `"lower(email)"` -> `"LOWER(email)"`,
    /// `"trim(name)"` -> `"TRIM(name)"`.
    fn normalize_function_names(expr: &str) -> String {
        let mut out = String::with_capacity(expr.len());
        let mut rest = expr;

        while let Some(first) = rest.chars().next() {
            if !first.is_alphabetic() {
                // Not the start of an identifier: copy one char and move on.
                out.push(first);
                rest = &rest[first.len_utf8()..];
                continue;
            }

            // Identifier spans up to the first char that is neither
            // alphanumeric nor '_'.
            let end = rest
                .find(|c: char| !(c.is_alphanumeric() || c == '_'))
                .unwrap_or(rest.len());
            let (ident, tail) = rest.split_at(end);

            if tail.starts_with('(') {
                out.push_str(&ident.to_uppercase());
            } else {
                out.push_str(ident); // property names stay as written
            }
            rest = tail;
        }

        out
    }

    /// Derive the internal column name for an expression index.
    ///
    /// The name has the shape `_gen_<sanitized>_<hash>`:
    /// * `<sanitized>` is the function-name-normalized expression with every
    ///   non-alphanumeric character replaced by `_` and leading/trailing `_`
    ///   trimmed;
    /// * `<hash>` is the lowercase-hex FNV-1a 64-bit hash of the normalized
    ///   expression's bytes, which keeps names distinct for expressions that
    ///   sanitize to the same string (e.g. "a+b" and "a-b" both become "a_b").
    ///
    /// IMPORTANT: FNV-1a is used because it is stable across Rust versions
    /// and platforms. `DefaultHasher` gives no such guarantee, and a changed
    /// hash after a compiler upgrade could break persistent data.
    pub fn generated_column_name(expr: &str) -> String {
        // FNV-1a 64-bit parameters.
        const FNV_OFFSET_BASIS: u64 = 14695981039346656037;
        const FNV_PRIME: u64 = 1099511628211;

        // Case-insensitive on function names: normalize before anything else.
        let canonical = Self::normalize_function_names(expr);

        let underscored: String = canonical
            .chars()
            .map(|c| if c.is_alphanumeric() { c } else { '_' })
            .collect();
        let readable = underscored.trim_matches('_');

        let digest = canonical.bytes().fold(FNV_OFFSET_BASIS, |acc, byte| {
            (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
        });

        format!("_gen_{}_{:x}", readable, digest)
    }

    /// Swap the in-memory schema for `new_schema`.
    ///
    /// Only the in-memory value changes; nothing is written to the store
    /// here. Handles previously obtained from `schema()` keep pointing at the
    /// old value.
    ///
    /// # Panics
    ///
    /// Panics if the schema lock is poisoned.
    pub fn replace_schema(&self, new_schema: Schema) {
        let replacement = Arc::new(new_schema);
        let mut slot = self
            .schema
            .write()
            .expect("Schema lock poisoned - a thread panicked while holding it");
        *slot = replacement;
    }

    /// Build a fork-scoped manager whose schema is `primary ⊕ overlay`.
    ///
    /// Used by `UniInner::at_fork` to give a forked session a schema view
    /// that includes any labels/edge-types/properties the fork has
    /// introduced on top of primary. The returned manager owns its own
    /// in-memory `Arc<Schema>` — mutations to it never reach primary's
    /// in-memory schema. It shares primary's store and path, so it is *not*
    /// intended for `.save()`; fork-overlay persistence is owned by the
    /// registry layer (`catalog/fork_schemas/{fork_id}.json`).
    ///
    /// Merge rules: overlay labels and edge types are inserted by name
    /// (replacing a same-named primary entry); each overlay property is
    /// inserted under its owner as an `Active` property stamped with the
    /// merged schema's `schema_version`, with no generation expression and
    /// no description.
    ///
    /// In Phase 1 the delta is always empty, so the merge is a clone.
    /// Phase 2 starts populating it when on-the-fly label creation lands.
    #[must_use]
    pub fn with_overlay(&self, overlay: &crate::core::fork::SchemaDelta) -> Arc<Self> {
        // Start from a private copy of primary's current schema.
        let mut merged = (*self.schema()).clone();

        if !overlay.is_empty() {
            for (name, label) in &overlay.added_labels {
                merged.labels.insert(name.clone(), label.clone());
            }
            for (name, edge_type) in &overlay.added_edge_types {
                merged.edge_types.insert(name.clone(), edge_type.clone());
            }

            let added_in = merged.schema_version;
            for addition in &overlay.added_properties {
                let meta = PropertyMeta {
                    r#type: addition.data_type.clone(),
                    nullable: addition.nullable,
                    added_in,
                    state: SchemaElementState::Active,
                    generation_expression: None,
                    description: None,
                };
                merged
                    .properties
                    .entry(addition.owner.clone())
                    .or_default()
                    .insert(addition.property.clone(), meta);
            }
        }

        let forked = Self {
            store: self.store.clone(),
            path: self.path.clone(),
            schema: RwLock::new(Arc::new(merged)),
        };
        Arc::new(forked)
    }

    /// Return one more than the highest label id currently in the schema
    /// (`1` when there are no labels).
    ///
    /// This only inspects a snapshot; it does not reserve the id.
    // NOTE(review): unlike `add_label_with_desc`, this applies no upper-bound
    // check before adding 1 — confirm callers cannot reach the top of `u16`.
    pub fn next_label_id(&self) -> u16 {
        let snapshot = self.schema();
        let highest = snapshot
            .labels
            .values()
            .map(|label| label.id)
            .max()
            .unwrap_or(0);
        highest + 1
    }

    /// Return one more than the highest edge type id currently in the schema
    /// (`1` when there are no edge types).
    ///
    /// This only inspects a snapshot; it does not reserve the id.
    ///
    /// # Panics
    ///
    /// Panics with "Schema edge type ID exhaustion" when the highest id is
    /// already at or above `MAX_SCHEMA_TYPE_ID`.
    // NOTE(review): `add_edge_type_with_desc` bounds new ids by
    // `VIRTUAL_EDGE_TYPE_ID_START` whereas this uses `MAX_SCHEMA_TYPE_ID` —
    // confirm the two ceilings are meant to differ.
    pub fn next_type_id(&self) -> u32 {
        let snapshot = self.schema();
        let highest = snapshot
            .edge_types
            .values()
            .map(|edge_type| edge_type.id)
            .max()
            .unwrap_or(0);

        // Ensure we stay in schema'd ID space (bit 31 = 0)
        assert!(
            highest < MAX_SCHEMA_TYPE_ID,
            "Schema edge type ID exhaustion"
        );

        highest + 1
    }

    /// Register a new label without a description.
    ///
    /// Shorthand for [`Self::add_label_with_desc`] with `description = None`;
    /// returns the newly assigned label id, or that method's error.
    pub fn add_label(&self, name: &str) -> Result<u16> {
        self.add_label_with_desc(name, None)
    }

    /// Register a new label and return its id.
    ///
    /// The id is one more than the highest id among existing labels (`1` for
    /// the first label). The label is created `Active`, stamped with the
    /// current UTC time, and carries the optional `description`. Only the
    /// in-memory schema changes; nothing is written to the store here.
    ///
    /// # Errors
    ///
    /// * The write lock cannot be acquired.
    /// * A label called `name` already exists.
    /// * The next id would fall at or above `VIRTUAL_LABEL_ID_START`.
    pub fn add_label_with_desc(&self, name: &str, description: Option<String>) -> Result<u16> {
        let mut guard = acquire_write(&self.schema, "schema")?;
        let schema = Arc::make_mut(&mut *guard);
        if schema.labels.contains_key(name) {
            return Err(anyhow!("Label '{}' already exists", name));
        }

        // Saturate rather than overflow: if an existing id is already
        // `u16::MAX`, a plain `+ 1` would panic in debug builds and wrap to 0
        // in release builds, slipping past the range check below.
        let id = schema
            .labels
            .values()
            .map(|l| l.id)
            .max()
            .unwrap_or(0)
            .saturating_add(1);
        if id >= VIRTUAL_LABEL_ID_START {
            return Err(anyhow!(
                "Native label space exhausted (next id {id:#x} would enter the \
                 virtual range {VIRTUAL_LABEL_ID_START:#x}..{VIRTUAL_LABEL_ID_SENTINEL:#x} \
                 reserved for catalog-resolved labels)"
            ));
        }
        schema.labels.insert(
            name.to_string(),
            LabelMeta {
                id,
                created_at: Utc::now(),
                state: SchemaElementState::Active,
                description,
            },
        );
        Ok(id)
    }

    /// Register a new edge type without a description.
    ///
    /// Shorthand for [`Self::add_edge_type_with_desc`] with
    /// `description = None`; returns the newly assigned edge type id, or that
    /// method's error.
    pub fn add_edge_type(
        &self,
        name: &str,
        src_labels: Vec<String>,
        dst_labels: Vec<String>,
    ) -> Result<u32> {
        self.add_edge_type_with_desc(name, src_labels, dst_labels, None)
    }

    /// Register a new edge type and return its id.
    ///
    /// The id is one more than the highest id among existing edge types (`1`
    /// for the first one). The edge type is created `Active` with the given
    /// source/destination label lists and optional `description`. The label
    /// lists are stored as given; this method does not check that the labels
    /// exist. Only the in-memory schema changes.
    ///
    /// # Errors
    ///
    /// * The write lock cannot be acquired.
    /// * An edge type called `name` already exists.
    /// * The next id would fall at or above `VIRTUAL_EDGE_TYPE_ID_START`.
    pub fn add_edge_type_with_desc(
        &self,
        name: &str,
        src_labels: Vec<String>,
        dst_labels: Vec<String>,
        description: Option<String>,
    ) -> Result<u32> {
        let mut guard = acquire_write(&self.schema, "schema")?;
        let schema = Arc::make_mut(&mut *guard);
        if schema.edge_types.contains_key(name) {
            return Err(anyhow!("Edge type '{}' already exists", name));
        }

        // Saturate rather than overflow: if an existing id is already
        // `u32::MAX`, a plain `+ 1` would panic in debug builds and wrap to 0
        // in release builds, slipping past the range check below.
        let id = schema
            .edge_types
            .values()
            .map(|t| t.id)
            .max()
            .unwrap_or(0)
            .saturating_add(1);

        // Stay in the schema-defined sub-range (bit 31 = 0, and below the
        // virtual reservation `VIRTUAL_EDGE_TYPE_ID_START`). `add_edge_type`
        // delegates here, so both entry points share this ceiling.
        if id >= VIRTUAL_EDGE_TYPE_ID_START {
            return Err(anyhow!(
                "Native edge type space exhausted (next id {id:#x} would enter the \
                 virtual range {VIRTUAL_EDGE_TYPE_ID_START:#x}..{VIRTUAL_EDGE_TYPE_ID_SENTINEL:#x} \
                 reserved for catalog-resolved edge types)"
            ));
        }

        schema.edge_types.insert(
            name.to_string(),
            EdgeTypeMeta {
                id,
                src_labels,
                dst_labels,
                state: SchemaElementState::Active,
                description,
            },
        );
        Ok(id)
    }

    /// Delegates to [`Schema::get_or_assign_edge_type_id`].
    pub fn get_or_assign_edge_type_id(&self, type_name: &str) -> u32 {
        let mut guard = acquire_write(&self.schema, "schema")
            .expect("Schema lock poisoned - a thread panicked while holding it");
        let schema = Arc::make_mut(&mut *guard);
        schema.get_or_assign_edge_type_id(type_name)
    }

    /// Delegates to [`Schema::edge_type_name_by_id_unified`] under the read
    /// lock.
    ///
    /// # Panics
    ///
    /// Panics if the read lock cannot be acquired.
    pub fn edge_type_name_by_id_unified(&self, type_id: u32) -> Option<String> {
        let guard = acquire_read(&self.schema, "schema")
            .expect("Schema lock poisoned - a thread panicked while holding it");
        let resolved = guard.edge_type_name_by_id_unified(type_id);
        resolved
    }

    /// Declare a property on a label or edge type without a description.
    ///
    /// Shorthand for [`Self::add_property_with_desc`] with
    /// `description = None`; see that method for validation and errors.
    pub fn add_property(
        &self,
        label_or_type: &str,
        prop_name: &str,
        data_type: DataType,
        nullable: bool,
    ) -> Result<()> {
        self.add_property_with_desc(label_or_type, prop_name, data_type, nullable, None)
    }

    /// Declare a user property on a label or edge type.
    ///
    /// The name is checked with `validate_property_name` before the lock is
    /// taken. The property is recorded as `Active`, stamped with the current
    /// `schema_version`, and has no generation expression.
    ///
    /// The owner's property map is created on demand; this method does not
    /// check that `label_or_type` is a declared label or edge type. Only the
    /// in-memory schema changes.
    ///
    /// # Errors
    ///
    /// * `prop_name` is rejected by `validate_property_name`.
    /// * The write lock cannot be acquired.
    /// * `prop_name` already exists on `label_or_type`.
    pub fn add_property_with_desc(
        &self,
        label_or_type: &str,
        prop_name: &str,
        data_type: DataType,
        nullable: bool,
        description: Option<String>,
    ) -> Result<()> {
        validate_property_name(prop_name)?;

        let mut guard = acquire_write(&self.schema, "schema")?;
        let schema = Arc::make_mut(&mut *guard);
        let added_in = schema.schema_version;
        let owner_props = schema
            .properties
            .entry(label_or_type.to_string())
            .or_default();

        if owner_props.contains_key(prop_name) {
            return Err(anyhow!(
                "Property '{}' already exists for '{}'",
                prop_name,
                label_or_type
            ));
        }

        let meta = PropertyMeta {
            r#type: data_type,
            nullable,
            added_in,
            state: SchemaElementState::Active,
            generation_expression: None,
            description,
        };
        owner_props.insert(prop_name.to_string(), meta);
        Ok(())
    }

    /// Declare a generated property whose value is defined by `expr`.
    ///
    /// System-generated `_gen_*` columns bypass the underscore-prefix rule
    /// but must still avoid storage-layer column-name collisions, so only
    /// `validate_reserved_property_name` is applied. The property is always
    /// nullable, `Active`, stamped with the current `schema_version`, and has
    /// no description.
    ///
    /// The owner's property map is created on demand. Only the in-memory
    /// schema changes.
    ///
    /// # Errors
    ///
    /// * `prop_name` is one of the reserved storage column names.
    /// * The write lock cannot be acquired.
    /// * `prop_name` already exists on `label_or_type`.
    pub fn add_generated_property(
        &self,
        label_or_type: &str,
        prop_name: &str,
        data_type: DataType,
        expr: String,
    ) -> Result<()> {
        validate_reserved_property_name(prop_name)?;

        let mut guard = acquire_write(&self.schema, "schema")?;
        let schema = Arc::make_mut(&mut *guard);
        let added_in = schema.schema_version;
        let owner_props = schema
            .properties
            .entry(label_or_type.to_string())
            .or_default();

        if owner_props.contains_key(prop_name) {
            return Err(anyhow!("Property '{}' already exists", prop_name));
        }

        let meta = PropertyMeta {
            r#type: data_type,
            nullable: true,
            added_in,
            state: SchemaElementState::Active,
            generation_expression: Some(expr),
            description: None,
        };
        owner_props.insert(prop_name.to_string(), meta);
        Ok(())
    }

    /// Set or clear (`None`) the description of an existing label.
    ///
    /// # Errors
    ///
    /// Fails if the write lock cannot be acquired or no label called `name`
    /// exists.
    pub fn set_label_description(&self, name: &str, description: Option<String>) -> Result<()> {
        let mut guard = acquire_write(&self.schema, "schema")?;
        match Arc::make_mut(&mut *guard).labels.get_mut(name) {
            Some(meta) => {
                meta.description = description;
                Ok(())
            }
            None => Err(anyhow!("Label '{}' does not exist", name)),
        }
    }

    /// Set or clear (`None`) the description of an existing edge type.
    ///
    /// # Errors
    ///
    /// Fails if the write lock cannot be acquired or no edge type called
    /// `name` exists.
    pub fn set_edge_type_description(&self, name: &str, description: Option<String>) -> Result<()> {
        let mut guard = acquire_write(&self.schema, "schema")?;
        match Arc::make_mut(&mut *guard).edge_types.get_mut(name) {
            Some(meta) => {
                meta.description = description;
                Ok(())
            }
            None => Err(anyhow!("Edge type '{}' does not exist", name)),
        }
    }

    /// Set or clear (`None`) the description of an existing property.
    ///
    /// `entity` is the key of the owning property map (a label or edge type
    /// name as used by `add_property`).
    ///
    /// # Errors
    ///
    /// Fails if the write lock cannot be acquired, `entity` has no property
    /// map, or `prop_name` is not declared on it.
    pub fn set_property_description(
        &self,
        entity: &str,
        prop_name: &str,
        description: Option<String>,
    ) -> Result<()> {
        let mut guard = acquire_write(&self.schema, "schema")?;
        let schema = Arc::make_mut(&mut *guard);

        let Some(props) = schema.properties.get_mut(entity) else {
            return Err(anyhow!("Entity '{}' does not exist", entity));
        };
        let Some(meta) = props.get_mut(prop_name) else {
            return Err(anyhow!(
                "Property '{}' does not exist on '{}'",
                prop_name,
                entity
            ));
        };

        meta.description = description;
        Ok(())
    }

    /// Register an index definition on the schema, **upsert by name**.
    ///
    /// When an entry with the same `IndexDefinition::name()` is already
    /// present it is overwritten in place (keeping its position); otherwise
    /// the definition is appended. Repeat invocation is therefore
    /// idempotent, which lets `SchemaBuilder::apply()` be re-applied without
    /// bloating `schema.indexes`, and lets the rebuild epilogue inside every
    /// `IndexManager::create_*_index` re-record metadata updates without
    /// duplicating entries (issue rustic-ai/uni-db#63).
    ///
    /// # Errors
    ///
    /// Fails only if the write lock cannot be acquired.
    pub fn add_index(&self, index_def: IndexDefinition) -> Result<()> {
        let mut guard = acquire_write(&self.schema, "schema")?;
        let schema = Arc::make_mut(&mut *guard);

        let slot = schema
            .indexes
            .iter()
            .position(|existing| existing.name() == index_def.name());
        match slot {
            Some(at) => schema.indexes[at] = index_def,
            None => schema.indexes.push(index_def),
        }
        Ok(())
    }

    /// Return a clone of the first index definition called `name`, or `None`
    /// when there is no such index.
    ///
    /// # Panics
    ///
    /// Panics if the schema lock is poisoned.
    pub fn get_index(&self, name: &str) -> Option<IndexDefinition> {
        let guard = self.schema.read().expect("Schema lock poisoned");
        for def in guard.indexes.iter() {
            if def.name() == name {
                return Some(def.clone());
            }
        }
        None
    }

    /// Updates the lifecycle metadata for an index by name.
    ///
    /// `f` is called once, under the write lock, with a mutable reference to
    /// the matching index's `IndexMetadata`, so callers can update status,
    /// timestamps, etc. It is not called when the index is missing.
    ///
    /// # Errors
    ///
    /// Fails if the write lock cannot be acquired or no index called
    /// `index_name` exists.
    pub fn update_index_metadata(
        &self,
        index_name: &str,
        f: impl FnOnce(&mut IndexMetadata),
    ) -> Result<()> {
        let mut guard = acquire_write(&self.schema, "schema")?;
        let schema = Arc::make_mut(&mut *guard);

        match schema
            .indexes
            .iter_mut()
            .find(|def| def.name() == index_name)
        {
            Some(def) => {
                f(def.metadata_mut());
                Ok(())
            }
            None => Err(anyhow!("Index '{}' not found", index_name)),
        }
    }

    /// Remove the first index definition called `name`, preserving the order
    /// of the remaining entries.
    ///
    /// # Errors
    ///
    /// Fails if the write lock cannot be acquired or no index called `name`
    /// exists.
    pub fn remove_index(&self, name: &str) -> Result<()> {
        let mut guard = acquire_write(&self.schema, "schema")?;
        let schema = Arc::make_mut(&mut *guard);

        let at = schema
            .indexes
            .iter()
            .position(|def| def.name() == name)
            .ok_or_else(|| anyhow!("Index '{}' not found", name))?;
        schema.indexes.remove(at);
        Ok(())
    }

    /// Append a constraint to the schema.
    ///
    /// Unlike `add_index`, this is not an upsert: constraint names must be
    /// unique.
    ///
    /// # Errors
    ///
    /// Fails if the write lock cannot be acquired or a constraint with the
    /// same name already exists.
    pub fn add_constraint(&self, constraint: Constraint) -> Result<()> {
        let mut guard = acquire_write(&self.schema, "schema")?;
        let schema = Arc::make_mut(&mut *guard);

        let name_taken = schema
            .constraints
            .iter()
            .any(|existing| existing.name == constraint.name);
        if name_taken {
            Err(anyhow!("Constraint '{}' already exists", constraint.name))
        } else {
            schema.constraints.push(constraint);
            Ok(())
        }
    }

    /// Remove the constraint called `name`.
    ///
    /// With `if_exists = true` a missing constraint is not an error.
    ///
    /// # Errors
    ///
    /// Fails if the write lock cannot be acquired, or if the constraint is
    /// missing and `if_exists` is `false`.
    pub fn drop_constraint(&self, name: &str, if_exists: bool) -> Result<()> {
        let mut guard = acquire_write(&self.schema, "schema")?;
        let schema = Arc::make_mut(&mut *guard);

        match schema.constraints.iter().position(|c| c.name == name) {
            Some(at) => {
                schema.constraints.remove(at);
                Ok(())
            }
            None if if_exists => Ok(()),
            None => Err(anyhow!("Constraint '{}' not found", name)),
        }
    }

    /// Remove a property declaration from a label or edge type.
    ///
    /// The entry is deleted from the property map outright (labels and edge
    /// types, by contrast, are tombstoned by their `drop_*` methods).
    ///
    /// # Errors
    ///
    /// Fails if the write lock cannot be acquired, `label_or_type` has no
    /// property map, or `prop_name` is not declared on it.
    pub fn drop_property(&self, label_or_type: &str, prop_name: &str) -> Result<()> {
        let mut guard = acquire_write(&self.schema, "schema")?;
        let schema = Arc::make_mut(&mut *guard);

        let Some(props) = schema.properties.get_mut(label_or_type) else {
            return Err(anyhow!("Label or Edge Type '{}' not found", label_or_type));
        };

        match props.remove(prop_name) {
            Some(_) => Ok(()),
            None => Err(anyhow!(
                "Property '{}' not found for '{}'",
                prop_name,
                label_or_type
            )),
        }
    }

    pub fn rename_property(
        &self,
        label_or_type: &str,
        old_name: &str,
        new_name: &str,
    ) -> Result<()> {
        let mut guard = acquire_write(&self.schema, "schema")?;
        let schema = Arc::make_mut(&mut *guard);
        if let Some(props) = schema.properties.get_mut(label_or_type) {
            if let Some(meta) = props.remove(old_name) {
                if props.contains_key(new_name) {
                    // Rollback removal? Or just error.
                    props.insert(old_name.to_string(), meta); // Restore
                    return Err(anyhow!("Property '{}' already exists", new_name));
                }
                props.insert(new_name.to_string(), meta);
                Ok(())
            } else {
                Err(anyhow!(
                    "Property '{}' not found for '{}'",
                    old_name,
                    label_or_type
                ))
            }
        } else {
            Err(anyhow!("Label or Edge Type '{}' not found", label_or_type))
        }
    }

    /// Mark a label as dropped.
    ///
    /// The label entry stays in the schema with its state set to
    /// `Tombstone { since: now }`. Its properties are left in place; they are
    /// implicitly tombstoned by the label. With `if_exists = true` a missing
    /// label is not an error.
    ///
    /// # Errors
    ///
    /// Fails if the write lock cannot be acquired, or if the label is missing
    /// and `if_exists` is `false`.
    pub fn drop_label(&self, name: &str, if_exists: bool) -> Result<()> {
        let mut guard = acquire_write(&self.schema, "schema")?;
        let schema = Arc::make_mut(&mut *guard);

        match schema.labels.get_mut(name) {
            Some(meta) => {
                meta.state = SchemaElementState::Tombstone { since: Utc::now() };
                Ok(())
            }
            None if if_exists => Ok(()),
            None => Err(anyhow!("Label '{}' not found", name)),
        }
    }

    /// Mark an edge type as dropped.
    ///
    /// The edge type entry stays in the schema with its state set to
    /// `Tombstone { since: now }`. Its properties are left in place; they are
    /// implicitly tombstoned by the edge type. With `if_exists = true` a
    /// missing edge type is not an error.
    ///
    /// # Errors
    ///
    /// Fails if the write lock cannot be acquired, or if the edge type is
    /// missing and `if_exists` is `false`.
    pub fn drop_edge_type(&self, name: &str, if_exists: bool) -> Result<()> {
        let mut guard = acquire_write(&self.schema, "schema")?;
        let schema = Arc::make_mut(&mut *guard);

        match schema.edge_types.get_mut(name) {
            Some(meta) => {
                meta.state = SchemaElementState::Tombstone { since: Utc::now() };
                Ok(())
            }
            None if if_exists => Ok(()),
            None => Err(anyhow!("Edge Type '{}' not found", name)),
        }
    }
}

/// Validate identifier names to prevent injection and ensure compatibility.
///
/// Checks, in order:
/// 1. length is between 1 and 64;
/// 2. the first character is a letter or `_`;
/// 3. every character is alphanumeric or `_`;
/// 4. the name is not a reserved word (compared case-insensitively by
///    uppercasing the name).
///
/// # Errors
///
/// Returns an error describing the first rule that is violated.
pub fn validate_identifier(name: &str) -> Result<()> {
    const RESERVED: &[&str] = &[
        "MATCH", "CREATE", "DELETE", "SET", "RETURN", "WHERE", "MERGE", "CALL", "YIELD", "WITH",
        "UNION", "ORDER", "LIMIT",
    ];

    // NOTE(review): `len()` counts UTF-8 bytes while the message says
    // "characters"; the two differ for non-ASCII names, which the alphabetic
    // checks below otherwise admit. Confirm which unit is intended.
    if !(1..=64).contains(&name.len()) {
        return Err(anyhow!("Identifier '{}' must be 1-64 characters", name));
    }

    let mut chars = name.chars();

    // Leading character: letter or underscore only (no digits).
    let leading_ok = chars
        .next()
        .is_some_and(|c| c.is_alphabetic() || c == '_');
    if !leading_ok {
        return Err(anyhow!(
            "Identifier '{}' must start with letter or underscore",
            name
        ));
    }

    // The leading character already passed a stricter test, so only the
    // remainder needs the alphanumeric-or-underscore check.
    if chars.any(|c| !(c.is_alphanumeric() || c == '_')) {
        return Err(anyhow!(
            "Identifier '{}' must contain only alphanumeric and underscore",
            name
        ));
    }

    let upper = name.to_uppercase();
    if RESERVED.iter().any(|keyword| *keyword == upper) {
        return Err(anyhow!("Identifier '{}' cannot be a reserved word", name));
    }

    Ok(())
}

/// Reject user-declared property names that collide with internal Arrow
/// column names used by the storage layer.
///
/// Two rules apply: any name starting with `_` is refused outright, and the
/// remaining names are checked against the fixed reserved list in
/// `validate_reserved_property_name`.
///
/// Without this, declaring a property named e.g. `ext_id` produces an Arrow
/// schema with two `ext_id` fields at flush time, which Lance rejects with
/// "Duplicate field name" — silently losing all in-session writes on shutdown.
pub fn validate_property_name(name: &str) -> Result<()> {
    match name.strip_prefix('_') {
        Some(_) => Err(anyhow!(
            "Property name '{}' is reserved: names starting with '_' are reserved by the storage layer",
            name
        )),
        None => validate_reserved_property_name(name),
    }
}

/// Reject names that collide with storage-layer Arrow column names.
///
/// Called by `validate_property_name` (the user-facing path) and directly by
/// `add_generated_property` (the system-generated `_gen_*` path). The latter
/// has to skip the underscore-prefix rule but must still refuse the fixed
/// names listed below. Matching is exact and case-sensitive.
fn validate_reserved_property_name(name: &str) -> Result<()> {
    // Unprefixed columns that storage appends next to user properties in the
    // per-label vertex (`storage/vertex.rs`), per-edge-type edge
    // (`storage/edge.rs`), and per-edge-type delta (`storage/delta.rs`) Arrow
    // schemas. A user property with one of these names yields a duplicate
    // Arrow field and a Lance "Duplicate field name" error at flush time.
    // Columns that exist only in fixed-schema main tables (`type`,
    // `props_json`, `labels`) are deliberately absent: user properties are
    // never appended to those tables, so they cannot collide.
    const RESERVED_PROPS: &[&str] = &[
        "ext_id",
        "overflow_json",
        "eid",
        "src_vid",
        "dst_vid",
        "op",
        // Internal planner sentinel: the column-name marker that
        // `mark_set_item_variables` (uni-query::query::planner) uses to ask
        // for narrow structural projection without full-schema expansion.
        // Listed defensively so the internal `add_generated_property` path
        // cannot create a colliding column. `validate_property_name` already
        // rejects it through the underscore-prefix rule, so for user-facing
        // names this entry is belt-and-suspenders.
        "__set_struct__",
    ];

    let is_reserved = RESERVED_PROPS.iter().any(|reserved| *reserved == name);
    if is_reserved {
        Err(anyhow!(
            "Property name '{}' is reserved by the storage layer; please choose a different name",
            name
        ))
    } else {
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::value::{TemporalValue, Value};
    use object_store::local::LocalFileSystem;
    use tempfile::tempdir;

    // Pins the `DataType::accepts` compatibility matrix: which `Value`
    // variants each declared column type admits and which it must refuse.
    #[test]
    fn test_datatype_accepts_matrix() {
        // Fresh epoch-zero UTC DateTime value for each assertion that needs one.
        let dt = || TemporalValue::DateTime {
            nanos_since_epoch: 0,
            offset_seconds: 0,
            timezone_name: None,
        };

        // Null is accepted by every type (nullability checked separately).
        for ty in [
            DataType::String,
            DataType::Int64,
            DataType::Bool,
            DataType::DateTime,
            DataType::Float64,
        ] {
            assert!(ty.accepts(&Value::Null), "{ty:?} must accept Null");
        }

        // Exact-type matches.
        assert!(DataType::String.accepts(&Value::String("x".into())));
        assert!(DataType::Int64.accepts(&Value::Int(1)));
        assert!(DataType::Bool.accepts(&Value::Bool(true)));
        assert!(DataType::DateTime.accepts(&Value::Temporal(dt())));

        // Intentional lossless widenings remain allowed.
        assert!(
            DataType::Float64.accepts(&Value::Int(3)),
            "Int widens to Float"
        );
        assert!(DataType::Int32.accepts(&Value::Int(3)), "Int fits Int32");
        assert!(DataType::Timestamp.accepts(&Value::Temporal(dt())));
        assert!(
            DataType::Timestamp.accepts(&Value::String("2026-01-01T00:00:00Z".into())),
            "storage parses strings for non-struct Timestamp columns"
        );

        // The #68 data-loss cases must be rejected (coercion handles strings separately).
        assert!(
            !DataType::DateTime.accepts(&Value::String("2026-01-01T00:00:00Z".into())),
            "String into a DateTime struct column nulls silently — reject here"
        );
        assert!(!DataType::Bool.accepts(&Value::Int(1)));
        assert!(!DataType::Int64.accepts(&Value::Bool(true)));
        assert!(!DataType::Int64.accepts(&Value::Float(1.5)));
        assert!(
            !DataType::String.accepts(&Value::Int(10)),
            "no implicit stringification"
        );
        assert!(!DataType::Duration.accepts(&Value::String("P1D".into())));

        // Opaque columns accept anything.
        assert!(DataType::CypherValue.accepts(&Value::Map(Default::default())));
    }

    // End-to-end round trip: start from a missing schema file, add a label,
    // a property and an edge type, save, then reload from the same store and
    // check that the label and property survived.
    #[tokio::test]
    async fn test_schema_management() -> Result<()> {
        let dir = tempdir()?;
        let store = Arc::new(LocalFileSystem::new_with_prefix(dir.path())?);
        let path = ObjectStorePath::from("schema.json");
        // The file does not exist yet, so this starts from `Schema::default()`.
        let manager = SchemaManager::load_from_store(store.clone(), &path).await?;

        // Labels
        let lid = manager.add_label("Person")?;
        assert_eq!(lid, 1);
        assert!(manager.add_label("Person").is_err());

        // Properties
        manager.add_property("Person", "name", DataType::String, false)?;
        assert!(
            manager
                .add_property("Person", "name", DataType::String, false)
                .is_err()
        );

        // Edge types
        let tid = manager.add_edge_type("knows", vec!["Person".into()], vec!["Person".into()])?;
        assert_eq!(tid, 1);

        manager.save().await?;
        // Check file exists
        assert!(store.get(&path).await.is_ok());

        // A second manager over the same store must see the persisted state.
        let manager2 = SchemaManager::load_from_store(store, &path).await?;
        assert!(manager2.schema().labels.contains_key("Person"));
        assert!(
            manager2
                .schema()
                .properties
                .get("Person")
                .unwrap()
                .contains_key("name")
        );

        Ok(())
    }

    // Reserved-name coverage for both property entry points: the user-facing
    // `add_property` (underscore-prefix rule + fixed reserved list) and the
    // internal `add_generated_property` (fixed reserved list only).
    #[tokio::test]
    async fn test_reserved_property_names_rejected() -> Result<()> {
        let dir = tempdir()?;
        let store = Arc::new(LocalFileSystem::new_with_prefix(dir.path())?);
        let path = ObjectStorePath::from("schema.json");
        let manager = SchemaManager::load_from_store(store, &path).await?;

        manager.add_label("Tiny")?;

        // Unprefixed reserved names — these collide with internal Arrow
        // columns in storage tables and previously caused Lance
        // "Duplicate field name" errors at flush time.
        for reserved in &["ext_id", "overflow_json", "eid", "src_vid", "dst_vid", "op"] {
            let err = manager
                .add_property("Tiny", reserved, DataType::String, true)
                .expect_err(&format!("expected '{reserved}' to be rejected"));
            assert!(
                err.to_string().contains("reserved"),
                "error for '{reserved}' should mention 'reserved', got: {err}"
            );
        }

        // Planner sentinel through the user-facing path. The underscore-prefix
        // rule fires first here, so this alone does not exercise the
        // RESERVED_PROPS entry.
        let err = manager
            .add_property("Tiny", "__set_struct__", DataType::String, true)
            .expect_err("expected '__set_struct__' to be rejected");
        assert!(
            err.to_string().contains("reserved"),
            "__set_struct__ rejection should mention 'reserved', got: {err}"
        );

        // Planner sentinel through the internal generated-property path,
        // which skips the underscore-prefix rule. Only the RESERVED_PROPS
        // entry can reject it here, so match on that rule's wording.
        let err = manager
            .add_generated_property(
                "Tiny",
                "__set_struct__",
                DataType::String,
                "lower(name)".into(),
            )
            .expect_err("expected generated '__set_struct__' to be rejected");
        assert!(
            err.to_string().contains("please choose a different name"),
            "generated __set_struct__ should hit the reserved-list rule, got: {err}"
        );

        // Leading-underscore pattern rule.
        for reserved in &["_vid", "_uid", "_eid", "_version", "_created_at"] {
            assert!(
                manager
                    .add_property("Tiny", reserved, DataType::String, true)
                    .is_err(),
                "expected '{reserved}' to be rejected"
            );
        }

        // Names that merely contain a reserved substring should still be
        // accepted.
        manager.add_property("Tiny", "ext_id_foo", DataType::String, true)?;
        manager.add_property("Tiny", "user_op", DataType::String, true)?;
        manager.add_property("Tiny", "type_name", DataType::String, true)?;

        // Same check applies to edge-type properties (single dispatch).
        manager.add_edge_type("knows", vec!["Tiny".into()], vec!["Tiny".into()])?;
        assert!(
            manager
                .add_property("knows", "src_vid", DataType::Int64, true)
                .is_err()
        );

        // And to generated properties.
        assert!(
            manager
                .add_generated_property(
                    "Tiny",
                    "ext_id",
                    DataType::String,
                    "concat('x', name)".into()
                )
                .is_err()
        );

        // Generated `_gen_*` columns are exempt from the underscore-prefix
        // rule and must be accepted.
        manager.add_generated_property(
            "Tiny",
            "_gen_LOWER_name_0",
            DataType::String,
            "lower(name)".into(),
        )?;

        Ok(())
    }

    /// Function names are upper-cased whatever their input casing, including
    /// when calls are nested; the argument text is left as written.
    #[test]
    fn test_normalize_function_names() {
        // (input expression, expected normalized expression)
        let cases = [
            ("lower(email)", "LOWER(email)"),
            ("LOWER(email)", "LOWER(email)"),
            ("Lower(email)", "LOWER(email)"),
            ("trim(lower(email))", "TRIM(LOWER(email))"),
        ];

        for (expr, want) in cases {
            assert_eq!(
                SchemaManager::normalize_function_names(expr),
                want,
                "input: {expr}"
            );
        }
    }

    /// The generated column name must not depend on how the function name in
    /// the source expression is cased.
    #[test]
    fn test_generated_column_name_case_insensitive() {
        let canonical = SchemaManager::generated_column_name("lower(email)");

        // Every other casing has to yield exactly the same column name.
        for variant in ["LOWER(email)", "Lower(email)"] {
            assert_eq!(canonical, SchemaManager::generated_column_name(variant));
        }

        assert!(canonical.starts_with("_gen_LOWER_email_"));
    }

    /// An index definition serialized without a `metadata` key must still
    /// deserialize, reporting `Online` status and no build timestamp or
    /// row count.
    #[test]
    fn test_index_metadata_serde_backward_compat() {
        // Legacy payload: note that there is no "metadata" entry.
        let legacy_json = r#"{
            "type": "Scalar",
            "name": "idx_person_name",
            "label": "Person",
            "properties": ["name"],
            "index_type": "BTree",
            "where_clause": null
        }"#;

        let parsed: IndexDefinition = serde_json::from_str(legacy_json).unwrap();
        let metadata = parsed.metadata();

        assert_eq!(metadata.status, IndexStatus::Online);
        assert_eq!(metadata.last_built_at, None);
        assert_eq!(metadata.row_count_at_build, None);
    }

    /// Explicitly populated index metadata survives a JSON
    /// serialize/deserialize round trip.
    #[test]
    fn test_index_metadata_serde_roundtrip() {
        let built_at = Utc::now();
        let metadata = IndexMetadata {
            status: IndexStatus::Building,
            last_built_at: Some(built_at),
            row_count_at_build: Some(42),
        };
        let original = IndexDefinition::Scalar(ScalarIndexConfig {
            name: "idx_test".to_string(),
            label: "Test".to_string(),
            properties: vec!["prop".to_string()],
            index_type: ScalarIndexType::BTree,
            where_clause: None,
            metadata,
        });

        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: IndexDefinition = serde_json::from_str(&encoded).unwrap();

        let restored = decoded.metadata();
        assert_eq!(restored.status, IndexStatus::Building);
        assert_eq!(restored.row_count_at_build, Some(42));
        assert!(restored.last_built_at.is_some());
    }

    /// `update_index_metadata` applies the closure to the named index's
    /// metadata and returns an error when no index has that name.
    #[tokio::test]
    async fn test_update_index_metadata() -> Result<()> {
        let tmp = tempdir()?;
        let object_store = Arc::new(LocalFileSystem::new_with_prefix(tmp.path())?);
        let schema_path = ObjectStorePath::from("schema.json");
        let manager = SchemaManager::load_from_store(object_store, &schema_path).await?;

        manager.add_label("Person")?;
        manager.add_index(IndexDefinition::Scalar(ScalarIndexConfig {
            name: "idx_test".to_string(),
            label: "Person".to_string(),
            properties: vec!["name".to_string()],
            index_type: ScalarIndexType::BTree,
            where_clause: None,
            metadata: Default::default(),
        }))?;

        // Default metadata reports the index as Online.
        let before = manager.get_index("idx_test").unwrap();
        assert_eq!(before.metadata().status, IndexStatus::Online);

        // Switch it to Building and record a row count in one update.
        manager.update_index_metadata("idx_test", |meta| {
            meta.status = IndexStatus::Building;
            meta.row_count_at_build = Some(100);
        })?;

        let after = manager.get_index("idx_test").unwrap();
        let after_meta = after.metadata();
        assert_eq!(after_meta.status, IndexStatus::Building);
        assert_eq!(after_meta.row_count_at_build, Some(100));

        // Updating an index that does not exist is an error.
        let missing = manager.update_index_metadata("nope", |_| {});
        assert!(missing.is_err());

        Ok(())
    }

    /// `add_index` upserts by name (issue rustic-ai/uni-db#63): calling it
    /// again with the same `IndexDefinition::name()` replaces the stored
    /// entry in place instead of appending, and the stored metadata follows
    /// the most recent definition. A different name appends a new entry.
    #[tokio::test]
    async fn test_add_index_is_upsert_by_name() -> Result<()> {
        let tmp = tempdir()?;
        let object_store = Arc::new(LocalFileSystem::new_with_prefix(tmp.path())?);
        let schema_path = ObjectStorePath::from("schema.json");
        let manager = SchemaManager::load_from_store(object_store, &schema_path).await?;
        manager.add_label("Person")?;

        let building_cfg = ScalarIndexConfig {
            name: "idx_test".to_string(),
            label: "Person".to_string(),
            properties: vec!["name".to_string()],
            index_type: ScalarIndexType::BTree,
            where_clause: None,
            metadata: IndexMetadata {
                status: IndexStatus::Building,
                ..Default::default()
            },
        };

        // First insertion creates the entry.
        manager.add_index(IndexDefinition::Scalar(building_cfg.clone()))?;
        assert_eq!(manager.schema().indexes.len(), 1);

        // Adding the identical definition again must not grow the list.
        manager.add_index(IndexDefinition::Scalar(building_cfg.clone()))?;
        assert_eq!(
            manager.schema().indexes.len(),
            1,
            "duplicate add_index by name must not append"
        );

        // Same name, new metadata: replaced in place, length unchanged.
        let mut online_cfg = building_cfg;
        online_cfg.metadata.status = IndexStatus::Online;
        online_cfg.metadata.row_count_at_build = Some(42);
        manager.add_index(IndexDefinition::Scalar(online_cfg))?;
        assert_eq!(manager.schema().indexes.len(), 1);

        let stored = manager.get_index("idx_test").unwrap();
        let stored_meta = stored.metadata();
        assert_eq!(stored_meta.status, IndexStatus::Online);
        assert_eq!(stored_meta.row_count_at_build, Some(42));

        // A *different* name is appended as a second entry.
        manager.add_index(IndexDefinition::Scalar(ScalarIndexConfig {
            name: "idx_other".to_string(),
            label: "Person".to_string(),
            properties: vec!["age".to_string()],
            index_type: ScalarIndexType::BTree,
            where_clause: None,
            metadata: IndexMetadata::default(),
        }))?;
        assert_eq!(manager.schema().indexes.len(), 2);

        Ok(())
    }

    /// `load_from_store` self-heals catalogs bloated by the pre-fix
    /// `add_index`: index entries sharing a name collapse to a single entry,
    /// and the *last* definition for that name is the one kept.
    #[tokio::test]
    async fn test_load_dedups_bloated_indexes() -> Result<()> {
        let tmp = tempdir()?;
        let store = Arc::new(LocalFileSystem::new_with_prefix(tmp.path())?);
        let path = ObjectStorePath::from("schema.json");

        // Hand-craft the on-disk schema: 50 index entries that all share one
        // name. Only the final entry carries distinct metadata, which lets
        // the assertions below pin "last writer wins".
        let mut bloated = Schema::default();
        bloated.labels.insert(
            "Person".to_string(),
            LabelMeta {
                id: 1,
                created_at: chrono::Utc::now(),
                state: SchemaElementState::Active,
                description: None,
            },
        );
        let dup_index = |status: IndexStatus, count: Option<u64>| {
            IndexDefinition::Scalar(ScalarIndexConfig {
                name: "idx_dup".to_string(),
                label: "Person".to_string(),
                properties: vec!["name".to_string()],
                index_type: ScalarIndexType::BTree,
                where_clause: None,
                metadata: IndexMetadata {
                    status,
                    row_count_at_build: count,
                    ..Default::default()
                },
            })
        };
        bloated
            .indexes
            .extend(std::iter::repeat_with(|| dup_index(IndexStatus::Building, None)).take(49));
        bloated
            .indexes
            .push(dup_index(IndexStatus::Online, Some(123)));

        let json = serde_json::to_string_pretty(&bloated)?;
        store.put(&path, json.into()).await?;

        let manager = SchemaManager::load_from_store(store, &path).await?;
        let loaded = manager.schema();
        assert_eq!(
            loaded.indexes.len(),
            1,
            "load() must collapse 50 duplicates by name to 1"
        );

        // The survivor is the final pushed entry (Online, 123).
        let survivor = &loaded.indexes[0];
        assert_eq!(survivor.metadata().status, IndexStatus::Online);
        assert_eq!(survivor.metadata().row_count_at_build, Some(123));

        Ok(())
    }

    /// `vector_index_for_property` does not return a vector index whose
    /// status is `Stale`; once the same index is marked `Online` it is
    /// returned.
    #[test]
    fn test_vector_index_for_property_skips_non_online() {
        let mut schema = Schema::default();
        schema.labels.insert(
            "Document".to_string(),
            LabelMeta {
                id: 1,
                created_at: chrono::Utc::now(),
                state: SchemaElementState::Active,
                description: None,
            },
        );

        // Register a single vector index that starts out Stale.
        let stale_index = IndexDefinition::Vector(VectorIndexConfig {
            name: "vec_doc_embedding".to_string(),
            label: "Document".to_string(),
            property: "embedding".to_string(),
            index_type: VectorIndexType::Flat,
            metric: DistanceMetric::Cosine,
            embedding_config: None,
            metadata: IndexMetadata {
                status: IndexStatus::Stale,
                ..Default::default()
            },
        });
        schema.indexes.push(stale_index);

        // While Stale, the lookup must come back empty.
        let while_stale = schema.vector_index_for_property("Document", "embedding");
        assert!(while_stale.is_none());

        // Flip the only index to Online; the lookup must now find it.
        if let Some(IndexDefinition::Vector(cfg)) = schema.indexes.first_mut() {
            cfg.metadata.status = IndexStatus::Online;
        }
        let once_online = schema.vector_index_for_property("Document", "embedding");
        assert!(once_online.is_some());
        assert_eq!(once_online.unwrap().metric, DistanceMetric::Cosine);
    }

    /// An overlay built from an empty delta starts with the primary's labels,
    /// and labels added to the overlay afterwards do not show up in the
    /// primary.
    #[tokio::test]
    async fn with_overlay_empty_clones_primary_in_isolation() -> Result<()> {
        use crate::core::fork::SchemaDelta;

        let tmp = tempdir()?;
        let object_store = Arc::new(LocalFileSystem::new_with_prefix(tmp.path())?);
        let schema_path = ObjectStorePath::from("schema.json");
        let primary = SchemaManager::load_from_store(object_store, &schema_path).await?;
        primary.add_label("Person")?;

        let empty_delta = SchemaDelta::empty();
        let forked = primary.with_overlay(&empty_delta);
        assert_eq!(forked.schema().labels.len(), 1);

        // Phase 1 invariant: a mutation through the overlay manager stays in
        // the overlay and never reaches the primary's schema.
        forked.add_label("Forked")?;
        let overlay_schema = forked.schema();
        assert!(overlay_schema.labels.contains_key("Forked"));
        let primary_schema = primary.schema();
        assert!(!primary_schema.labels.contains_key("Forked"));

        Ok(())
    }

    #[tokio::test]
    async fn with_overlay_merges_added_labels_and_edge_types() -> Result<()> {
        use crate::core::fork::SchemaDelta;
        use chrono::Utc;

        let dir = tempdir()?;
        let store = Arc::new(LocalFileSystem::new_with_prefix(dir.path())?);
        let path = ObjectStorePath::from("schema.json");
        let primary = SchemaManager::load_from_store(store, &path).await?;
        primary.add_label("Existing")?;

        let label_meta = LabelMeta {
            id: 99,
            created_at: Utc::now(),
            state: SchemaElementState::Active,
            description: None,
        };
        let edge_meta = EdgeTypeMeta {
            id: 99,
            src_labels: vec!["NewLabel".into()],
            dst_labels: vec!["NewLabel".into()],
            state: SchemaElementState::Active,
            description: None,
        };
        let delta = SchemaDelta {
            added_labels: vec![("NewLabel".to_string(), label_meta)],
            added_edge_types: vec![("NewEdge".to_string(), edge_meta)],
            added_properties: vec![],
        };

        let overlay = primary.with_overlay(&delta);
        let merged = overlay.schema();
        assert!(merged.labels.contains_key("Existing"));
        assert!(merged.labels.contains_key("NewLabel"));
        assert!(merged.edge_types.contains_key("NewEdge"));

        // Primary unchanged.
        assert!(!primary.schema().labels.contains_key("NewLabel"));
        Ok(())
    }
}