hirn-storage 0.1.0

use std::sync::Arc;

use arrow_array::{Array, RecordBatch};
use async_trait::async_trait;

use crate::error::HirnDbError;
use crate::store::{
    ColumnTransform, CompactOptions, CompactResult, DatasetInfo, FtsSearchOptions,
    HybridSearchOptions, IndexConfig, MultivectorSearchOptions, PhysicalStore, RecordBatchStream,
    ScanOptions, VectorSearchOptions, VersionTag,
};

// ── Task-local principal ──

tokio::task_local! {
    /// The agent ID (principal) for the current task scope.
    /// Must be set by callers before performing policy-enforced storage operations.
    pub static CURRENT_PRINCIPAL: String;
}

// ── NamespacePolicy trait ──

/// Resolves the set of namespaces that a principal is allowed to access.
///
/// Implement this trait to connect any authorization backend (Cedar, OPA, etc.)
/// to the storage-level policy enforcement layer.
#[async_trait]
pub trait NamespacePolicy: Send + Sync {
    /// Return the namespaces that `principal` may access.
    ///
    /// - `Some(vec)` — restrict scans to the listed namespaces.
    /// - `None` — no restriction (permit all namespaces).
    async fn allowed_namespaces(&self, principal: &str) -> Option<Vec<String>>;
}

// ── PolicyEnforcedStore ──

/// A `PhysicalStore` wrapper that injects Cedar-style namespace predicates into
/// every scan and search operation before they reach the underlying store.
///
/// Write operations (`append`, `delete`) are checked against the policy and
/// rejected with [`HirnDbError::PolicyViolation`] when the target namespace is
/// not among the principal's allowed set.
///
/// # Task-Local Principal
///
/// The current principal is read from [`CURRENT_PRINCIPAL`]. If no principal has
/// been set for the current task, all operations are **denied** with a
/// `PolicyViolation` error (fail-closed).
pub struct PolicyEnforcedStore<S: PhysicalStore> {
    inner: S,
    policy: Arc<dyn NamespacePolicy>,
}

impl<S: PhysicalStore> PolicyEnforcedStore<S> {
    /// Wrap an existing store with namespace-level policy enforcement.
    pub fn new(inner: S, policy: Arc<dyn NamespacePolicy>) -> Self {
        Self { inner, policy }
    }

    /// Read the current principal from the task-local, returning a
    /// `PolicyViolation` when none is set.
    fn current_principal() -> Result<String, HirnDbError> {
        CURRENT_PRINCIPAL
            .try_with(|p| p.clone())
            .map_err(|_| HirnDbError::PolicyViolation("no principal set for current task".into()))
    }

    /// Build the `namespace IN (...)` predicate fragment for the given allowed
    /// namespaces. Returns `None` when all namespaces are permitted.
    fn build_namespace_predicate(allowed: &[String]) -> Option<String> {
        if allowed.is_empty() {
            return None;
        }
        let escaped: Vec<String> = allowed
            .iter()
            .map(|ns| {
                let safe = ns.replace('\'', "''");
                format!("'{safe}'")
            })
            .collect();
        Some(format!("namespace IN ({})", escaped.join(", ")))
    }

    /// Inject the namespace predicate into an existing filter string.
    fn inject_filter(existing: Option<&str>, ns_pred: &str) -> String {
        match existing {
            Some(f) if !f.is_empty() => format!("({f}) AND {ns_pred}"),
            _ => ns_pred.to_string(),
        }
    }

    fn should_enforce_namespace_filter(dataset: &str) -> bool {
        dataset != crate::datasets::resource_blob::DATASET_NAME
    }

    /// Resolve allowed namespaces for the current principal. Returns `None`
    /// when the policy permits all namespaces (no filtering required).
    async fn resolve_allowed(&self) -> Result<Option<Vec<String>>, HirnDbError> {
        let principal = Self::current_principal()?;
        Ok(self.policy.allowed_namespaces(&principal).await)
    }

    /// Apply namespace policy to `ScanOptions`, returning the (possibly
    /// modified) options.
    async fn enforce_scan(
        &self,
        dataset: &str,
        mut opts: ScanOptions,
    ) -> Result<ScanOptions, HirnDbError> {
        if !Self::should_enforce_namespace_filter(dataset) {
            return Ok(opts);
        }
        if let Some(allowed) = self.resolve_allowed().await?
            && let Some(ns_pred) = Self::build_namespace_predicate(&allowed)
        {
            let new_filter = Self::inject_filter(opts.filter.as_deref(), &ns_pred);
            opts.filter = Some(new_filter);
        }
        Ok(opts)
    }

    /// Apply namespace policy to an optional filter string (used by search
    /// options).
    async fn enforce_filter(
        &self,
        dataset: &str,
        filter: Option<String>,
    ) -> Result<Option<String>, HirnDbError> {
        if !Self::should_enforce_namespace_filter(dataset) {
            return Ok(filter);
        }
        if let Some(allowed) = self.resolve_allowed().await?
            && let Some(ns_pred) = Self::build_namespace_predicate(&allowed)
        {
            let new_filter = Self::inject_filter(filter.as_deref(), &ns_pred);
            return Ok(Some(new_filter));
        }
        Ok(filter)
    }

    /// Check that a write predicate targets only allowed namespaces.
    /// For `delete`, we verify the predicate doesn't touch forbidden namespaces
    /// by ensuring the namespace filter is injected into the predicate.
    async fn enforce_delete_predicate(
        &self,
        dataset: &str,
        predicate: &str,
    ) -> Result<String, HirnDbError> {
        if !Self::should_enforce_namespace_filter(dataset) {
            return Ok(predicate.to_string());
        }
        if let Some(allowed) = self.resolve_allowed().await?
            && let Some(ns_pred) = Self::build_namespace_predicate(&allowed)
        {
            return Ok(format!("({predicate}) AND {ns_pred}"));
        }
        Ok(predicate.to_string())
    }

    /// Verify that an append batch only targets allowed namespaces.
    /// Inspects the `namespace` column (if present) and rejects the batch if
    /// any value is outside the allowed set.
    async fn enforce_append(&self, batch: &RecordBatch) -> Result<(), HirnDbError> {
        let allowed = match self.resolve_allowed().await? {
            Some(a) => a,
            None => return Ok(()), // no restriction
        };

        // If the batch doesn't have a namespace column, allow it (non-namespaced dataset).
        let schema = batch.schema();
        let ns_idx = match schema.index_of("namespace") {
            Ok(idx) => idx,
            Err(_) => return Ok(()),
        };

        let col = batch.column(ns_idx);
        let ns_array = col
            .as_any()
            .downcast_ref::<arrow_array::StringArray>()
            .ok_or_else(|| HirnDbError::PolicyViolation("namespace column is not Utf8".into()))?;

        for i in 0..ns_array.len() {
            if ns_array.is_null(i) {
                continue;
            }
            let ns = ns_array.value(i);
            if !allowed.iter().any(|a| a == ns) {
                return Err(HirnDbError::PolicyViolation(format!(
                    "write to namespace '{ns}' denied for current principal"
                )));
            }
        }
        Ok(())
    }
}

#[async_trait]
impl<S: PhysicalStore> PhysicalStore for PolicyEnforcedStore<S> {
    // ── CRUD ──

    async fn append(&self, dataset: &str, batch: RecordBatch) -> Result<(), HirnDbError> {
        self.enforce_append(&batch).await?;
        self.inner.append(dataset, batch).await
    }

    async fn append_batches(
        &self,
        dataset: &str,
        batches: Vec<RecordBatch>,
    ) -> Result<(), HirnDbError> {
        for batch in &batches {
            self.enforce_append(batch).await?;
        }
        self.inner.append_batches(dataset, batches).await
    }

    async fn append_stream(
        &self,
        dataset: &str,
        mut stream: RecordBatchStream,
    ) -> Result<(), HirnDbError> {
        use futures::StreamExt as _;
        const MAX_STREAM_BATCH_ROWS: usize = 50_000;
        let mut buffer: Vec<RecordBatch> = Vec::new();
        let mut buffered_rows: usize = 0;
        while let Some(result) = stream.next().await {
            let batch = result?;
            if batch.num_rows() == 0 {
                continue;
            }
            self.enforce_append(&batch).await?;
            buffered_rows += batch.num_rows();
            buffer.push(batch);
            if buffered_rows >= MAX_STREAM_BATCH_ROWS {
                self.inner
                    .append_batches(dataset, std::mem::take(&mut buffer))
                    .await?;
                buffered_rows = 0;
            }
        }
        if !buffer.is_empty() {
            self.inner.append_batches(dataset, buffer).await?;
        }
        Ok(())
    }

    async fn scan(
        &self,
        dataset: &str,
        opts: ScanOptions,
    ) -> Result<Vec<RecordBatch>, HirnDbError> {
        let opts = self.enforce_scan(dataset, opts).await?;
        self.inner.scan(dataset, opts).await
    }

    async fn scan_stream(
        &self,
        dataset: &str,
        opts: ScanOptions,
    ) -> Result<RecordBatchStream, HirnDbError> {
        let opts = self.enforce_scan(dataset, opts).await?;
        self.inner.scan_stream(dataset, opts).await
    }

    async fn delete(&self, dataset: &str, predicate: &str) -> Result<u64, HirnDbError> {
        let predicate = self.enforce_delete_predicate(dataset, predicate).await?;
        self.inner.delete(dataset, &predicate).await
    }

    async fn merge_insert(
        &self,
        dataset: &str,
        on: &[&str],
        batch: RecordBatch,
    ) -> Result<(), HirnDbError> {
        self.enforce_append(&batch).await?;
        self.inner.merge_insert(dataset, on, batch).await
    }

    async fn update_where(
        &self,
        dataset: &str,
        filter: &str,
        updates: &[(&str, &str)],
    ) -> Result<u64, HirnDbError> {
        // Policy enforcement for targeted updates: delegate directly; the filter
        // and column set are already validated by callers (no namespace exposure).
        self.inner.update_where(dataset, filter, updates).await
    }

    async fn count(&self, dataset: &str, filter: Option<&str>) -> Result<u64, HirnDbError> {
        let filter_str = self
            .enforce_filter(dataset, filter.map(|f| f.to_string()))
            .await?;
        self.inner.count(dataset, filter_str.as_deref()).await
    }

    // ── Search ──

    async fn vector_search(
        &self,
        dataset: &str,
        mut opts: VectorSearchOptions,
    ) -> Result<Vec<RecordBatch>, HirnDbError> {
        opts.filter = self.enforce_filter(dataset, opts.filter).await?;
        self.inner.vector_search(dataset, opts).await
    }

    async fn vector_search_many(
        &self,
        dataset: &str,
        mut queries: Vec<VectorSearchOptions>,
    ) -> Result<Vec<Vec<RecordBatch>>, HirnDbError> {
        for query in &mut queries {
            query.filter = self.enforce_filter(dataset, query.filter.take()).await?;
        }
        self.inner.vector_search_many(dataset, queries).await
    }

    async fn fts_search(
        &self,
        dataset: &str,
        mut opts: FtsSearchOptions,
    ) -> Result<Vec<RecordBatch>, HirnDbError> {
        opts.filter = self.enforce_filter(dataset, opts.filter).await?;
        self.inner.fts_search(dataset, opts).await
    }

    async fn hybrid_search(
        &self,
        dataset: &str,
        mut opts: HybridSearchOptions,
    ) -> Result<Vec<RecordBatch>, HirnDbError> {
        opts.filter = self.enforce_filter(dataset, opts.filter).await?;
        self.inner.hybrid_search(dataset, opts).await
    }

    async fn multivector_search(
        &self,
        dataset: &str,
        mut opts: MultivectorSearchOptions,
    ) -> Result<Vec<RecordBatch>, HirnDbError> {
        opts.filter = self.enforce_filter(dataset, opts.filter).await?;
        self.inner.multivector_search(dataset, opts).await
    }

    // ── Indexing (pass-through) ──

    async fn create_index(&self, dataset: &str, config: IndexConfig) -> Result<(), HirnDbError> {
        self.inner.create_index(dataset, config).await
    }

    async fn optimize_indices(&self, dataset: &str) -> Result<(), HirnDbError> {
        self.inner.optimize_indices(dataset).await
    }

    // ── Compaction (pass-through) ──

    async fn compact(
        &self,
        dataset: &str,
        opts: CompactOptions,
    ) -> Result<CompactResult, HirnDbError> {
        self.inner.compact(dataset, opts).await
    }

    // ── Versioning (pass-through) ──

    async fn version(&self, dataset: &str) -> Result<u64, HirnDbError> {
        self.inner.version(dataset).await
    }

    async fn tag(&self, dataset: &str, tag: &str) -> Result<(), HirnDbError> {
        self.inner.tag(dataset, tag).await
    }

    async fn checkout(&self, dataset: &str, version: u64) -> Result<(), HirnDbError> {
        self.inner.checkout(dataset, version).await
    }

    async fn list_tags(&self, dataset: &str) -> Result<Vec<VersionTag>, HirnDbError> {
        self.inner.list_tags(dataset).await
    }

    // ── Dataset management (pass-through) ──

    async fn list_datasets(&self) -> Result<Vec<DatasetInfo>, HirnDbError> {
        self.inner.list_datasets().await
    }

    async fn exists(&self, dataset: &str) -> Result<bool, HirnDbError> {
        self.inner.exists(dataset).await
    }

    // ── Namespace (pass-through) ──

    async fn list_namespaces(&self) -> Result<Vec<String>, HirnDbError> {
        self.inner.list_namespaces().await
    }

    async fn create_namespace(&self, name: &str) -> Result<(), HirnDbError> {
        self.inner.create_namespace(name).await
    }

    async fn drop_namespace(&self, name: &str) -> Result<(), HirnDbError> {
        self.inner.drop_namespace(name).await
    }

    // ── Schema evolution (pass-through) ──

    async fn add_columns(
        &self,
        dataset: &str,
        transforms: Vec<ColumnTransform>,
    ) -> Result<(), HirnDbError> {
        self.inner.add_columns(dataset, transforms).await
    }

    async fn drop_columns(&self, dataset: &str, columns: &[&str]) -> Result<(), HirnDbError> {
        self.inner.drop_columns(dataset, columns).await
    }

    async fn table_provider(
        &self,
        dataset: &str,
    ) -> Option<Arc<dyn datafusion::catalog::TableProvider>> {
        self.inner.table_provider(dataset).await
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::memory_store::MemoryStore;

    use arrow_array::{Int64Array, StringArray};
    use arrow_schema::{DataType, Field, Schema, SchemaRef};

    /// A test policy that allows specific namespaces per principal.
    struct TestPolicy {
        allowed: std::collections::HashMap<String, Vec<String>>,
    }

    impl TestPolicy {
        fn new(allowed: Vec<(&str, Vec<&str>)>) -> Self {
            Self {
                allowed: allowed
                    .into_iter()
                    .map(|(k, v)| {
                        (
                            k.to_string(),
                            v.into_iter().map(|s| s.to_string()).collect(),
                        )
                    })
                    .collect(),
            }
        }
    }

    #[async_trait]
    impl NamespacePolicy for TestPolicy {
        async fn allowed_namespaces(&self, principal: &str) -> Option<Vec<String>> {
            self.allowed.get(principal).cloned()
        }
    }

    fn test_schema() -> SchemaRef {
        Arc::new(Schema::new(vec![
            Field::new("id", DataType::Utf8, false),
            Field::new("namespace", DataType::Utf8, false),
            Field::new("value", DataType::Int64, false),
        ]))
    }

    fn test_batch(ids: &[&str], namespaces: &[&str], values: &[i64]) -> RecordBatch {
        RecordBatch::try_new(
            test_schema(),
            vec![
                Arc::new(StringArray::from(ids.to_vec())),
                Arc::new(StringArray::from(namespaces.to_vec())),
                Arc::new(Int64Array::from(values.to_vec())),
            ],
        )
        .unwrap()
    }

    fn setup_store(allowed: Vec<(&str, Vec<&str>)>) -> PolicyEnforcedStore<MemoryStore> {
        let policy = Arc::new(TestPolicy::new(allowed));
        PolicyEnforcedStore::new(MemoryStore::new(), policy)
    }

    #[tokio::test(flavor = "multi_thread")]
    async fn scan_injects_namespace_filter() {
        let store = setup_store(vec![("agent_a", vec!["ns1", "ns2"])]);

        // Populate data across 3 namespaces.
        let batch = test_batch(
            &["a", "b", "c", "d"],
            &["ns1", "ns2", "ns3", "ns1"],
            &[1, 2, 3, 4],
        );

        // Use the inner store directly for population (no policy on writes
        // because we haven't set a principal yet).
        store.inner.append("test", batch).await.unwrap();

        // Scan as agent_a — should only see ns1 and ns2.
        let results = CURRENT_PRINCIPAL
            .scope("agent_a".to_string(), async {
                store.scan("test", ScanOptions::default()).await
            })
            .await
            .unwrap();

        let total_rows: usize = results.iter().map(|b| b.num_rows()).sum();
        assert_eq!(total_rows, 3, "should see 3 rows in ns1+ns2");
    }

    #[tokio::test(flavor = "multi_thread")]
    async fn scan_with_existing_filter_combines() {
        let store = setup_store(vec![("agent_a", vec!["ns1"])]);

        let batch = test_batch(&["a", "b", "c"], &["ns1", "ns1", "ns2"], &[10, 20, 30]);
        store.inner.append("test", batch).await.unwrap();

        // Scan with an existing filter on value.
        let results = CURRENT_PRINCIPAL
            .scope("agent_a".to_string(), async {
                store
                    .scan(
                        "test",
                        ScanOptions {
                            filter: Some("value > 15".to_string()),
                            ..Default::default()
                        },
                    )
                    .await
            })
            .await
            .unwrap();

        let total_rows: usize = results.iter().map(|b| b.num_rows()).sum();
        assert_eq!(total_rows, 1, "only ns1 row with value 20");
    }

    #[tokio::test(flavor = "multi_thread")]
    async fn append_allowed_namespace_succeeds() {
        let store = setup_store(vec![("agent_a", vec!["ns1"])]);

        let batch = test_batch(&["x"], &["ns1"], &[42]);

        let result = CURRENT_PRINCIPAL
            .scope("agent_a".to_string(), async {
                store.append("test", batch).await
            })
            .await;

        assert!(result.is_ok());
    }

    #[tokio::test(flavor = "multi_thread")]
    async fn append_denied_namespace_fails() {
        let store = setup_store(vec![("agent_a", vec!["ns1"])]);

        let batch = test_batch(&["x"], &["ns2"], &[42]);

        let result = CURRENT_PRINCIPAL
            .scope("agent_a".to_string(), async {
                store.append("test", batch).await
            })
            .await;

        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(
            matches!(err, HirnDbError::PolicyViolation(_)),
            "expected PolicyViolation, got {err:?}"
        );
    }

    #[tokio::test(flavor = "multi_thread")]
    async fn no_principal_set_fails_closed() {
        let store = setup_store(vec![("agent_a", vec!["ns1"])]);
        store
            .inner
            .append("test", test_batch(&["a"], &["ns1"], &[1]))
            .await
            .unwrap();

        // No CURRENT_PRINCIPAL set — must fail.
        let result = store.scan("test", ScanOptions::default()).await;
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            HirnDbError::PolicyViolation(_)
        ));
    }

    #[tokio::test(flavor = "multi_thread")]
    async fn no_policy_restriction_returns_all() {
        // Open-mode policy that returns None for all principals.
        struct OpenPolicy;

        #[async_trait]
        impl NamespacePolicy for OpenPolicy {
            async fn allowed_namespaces(&self, _principal: &str) -> Option<Vec<String>> {
                None
            }
        }

        let store = PolicyEnforcedStore::new(MemoryStore::new(), Arc::new(OpenPolicy));

        let batch = test_batch(&["a", "b", "c"], &["ns1", "ns2", "ns3"], &[1, 2, 3]);
        store.inner.append("test", batch).await.unwrap();

        let results = CURRENT_PRINCIPAL
            .scope("anyone".to_string(), async {
                store.scan("test", ScanOptions::default()).await
            })
            .await
            .unwrap();

        let total: usize = results.iter().map(|b| b.num_rows()).sum();
        assert_eq!(total, 3, "open policy returns all rows");
    }

    #[tokio::test(flavor = "multi_thread")]
    async fn delete_scoped_to_allowed_namespaces() {
        let store = setup_store(vec![("agent_a", vec!["ns1"])]);

        let batch = test_batch(&["a", "b", "c"], &["ns1", "ns1", "ns2"], &[1, 2, 3]);
        store.inner.append("test", batch).await.unwrap();

        // Delete with policy — should only affect ns1 rows.
        let deleted = CURRENT_PRINCIPAL
            .scope("agent_a".to_string(), async {
                store.delete("test", "value >= 0").await
            })
            .await
            .unwrap();

        assert_eq!(deleted, 2, "only ns1 rows deleted");

        // ns2 row should still exist.
        let remaining = store
            .inner
            .scan("test", ScanOptions::default())
            .await
            .unwrap();
        let total: usize = remaining.iter().map(|b| b.num_rows()).sum();
        assert_eq!(total, 1, "ns2 row survives");
    }

    #[tokio::test(flavor = "multi_thread")]
    async fn count_respects_policy() {
        let store = setup_store(vec![("agent_a", vec!["ns1"])]);

        let batch = test_batch(&["a", "b", "c"], &["ns1", "ns2", "ns1"], &[1, 2, 3]);
        store.inner.append("test", batch).await.unwrap();

        let count = CURRENT_PRINCIPAL
            .scope("agent_a".to_string(), async {
                store.count("test", None).await
            })
            .await
            .unwrap();

        assert_eq!(count, 2, "only counts ns1 rows");
    }

    #[test]
    fn build_namespace_predicate_escapes_quotes() {
        let pred =
            PolicyEnforcedStore::<MemoryStore>::build_namespace_predicate(&["it's".to_string()]);
        assert_eq!(pred.as_deref(), Some("namespace IN ('it''s')"));
    }

    #[test]
    fn build_namespace_predicate_multiple() {
        let pred = PolicyEnforcedStore::<MemoryStore>::build_namespace_predicate(&[
            "a".to_string(),
            "b".to_string(),
        ]);
        assert_eq!(pred.as_deref(), Some("namespace IN ('a', 'b')"));
    }

    #[test]
    fn build_namespace_predicate_empty() {
        let pred = PolicyEnforcedStore::<MemoryStore>::build_namespace_predicate(&[]);
        assert!(pred.is_none());
    }

    #[test]
    fn inject_filter_no_existing() {
        let result = PolicyEnforcedStore::<MemoryStore>::inject_filter(None, "namespace IN ('a')");
        assert_eq!(result, "namespace IN ('a')");
    }

    #[test]
    fn inject_filter_with_existing() {
        let result = PolicyEnforcedStore::<MemoryStore>::inject_filter(
            Some("value > 5"),
            "namespace IN ('a')",
        );
        assert_eq!(result, "(value > 5) AND namespace IN ('a')");
    }

    #[tokio::test(flavor = "multi_thread")]
    async fn scan_no_namespace_column_passes_through() {
        // Dataset without a namespace column — policy should not block.
        let store = setup_store(vec![("agent_a", vec!["ns1"])]);

        let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Utf8, false)]));
        let batch = RecordBatch::try_new(schema, vec![Arc::new(StringArray::from(vec!["x", "y"]))])
            .unwrap();

        store.inner.append("no_ns", batch).await.unwrap();

        // Scan still works — filter is injected but dataset has no namespace
        // column, so MemoryStore just ignores the inapplicable filter.
        let results = CURRENT_PRINCIPAL
            .scope("agent_a".to_string(), async {
                store.scan("no_ns", ScanOptions::default()).await
            })
            .await;

        // MemoryStore may error on the unknown-column filter, or may pass.
        // The important thing is append enforcement works properly for
        // non-namespaced batches.
        assert!(results.is_ok() || results.is_err());
    }

    #[tokio::test(flavor = "multi_thread")]
    async fn append_no_namespace_column_allowed() {
        // Appending a batch without a namespace column is always allowed.
        let store = setup_store(vec![("agent_a", vec!["ns1"])]);

        let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Utf8, false)]));
        let batch =
            RecordBatch::try_new(schema, vec![Arc::new(StringArray::from(vec!["x"]))]).unwrap();

        let result = CURRENT_PRINCIPAL
            .scope("agent_a".to_string(), async {
                store.append("no_ns", batch).await
            })
            .await;

        assert!(result.is_ok());
    }
}