// aletheiadb 0.1.0
// A high-performance bi-temporal graph database for LLM integration
// Documentation
//! Administrative and test-only helper methods.
//!
//! Provides internal statistics, test visibility, and database maintenance operations.
use crate::core::error::{PersistenceErrorKind, Result, ResultExt, StorageError};
use crate::core::temporal::Timestamp;
use crate::db::AletheiaDB;
use crate::index::temporal::TemporalIndexes;
use crate::query::planner::Statistics;
#[cfg(test)]
use crate::storage::current::CurrentStorage;
use crate::storage::historical::{HistoricalStats, HistoricalStorage};
use crate::storage::index_persistence::operations::{
    persist_temporal_index, persist_vector_indexes,
};
use parking_lot::RwLock;
use std::sync::Arc;

impl AletheiaDB {
    /// Return a snapshot of statistics for the historical (temporal) storage layer.
    ///
    /// Takes a read lock on the historical store for the duration of the call;
    /// the returned value is a point-in-time copy and never fails in practice
    /// (the `Result` wrapper keeps the signature uniform with other admin APIs).
    #[must_use = "the historical statistics value must be used"]
    pub fn historical_stats(&self) -> Result<HistoricalStats> {
        let snapshot = self.historical.read().stats();
        Ok(snapshot)
    }

    /// Persist all indexes to disk.
    ///
    /// This saves the current state of all indexes (graph, temporal, vector, strings)
    /// to disk in the configured persistence directory.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - Index persistence is not enabled in configuration
    /// - Writing index files fails due to I/O errors
    /// - Index serialization fails
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// # use aletheiadb::AletheiaDB;
    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
    /// let db = AletheiaDB::new()?;
    /// // ... add data ...
    /// db.persist_indexes()?; // Save indexes to disk
    /// # Ok(())
    /// # }
    /// ```
    #[must_use = "this Result must be used; ignoring errors can lead to silent failures"]
    pub fn persist_indexes(&self) -> Result<()> {
        // The body runs inside an immediately-invoked closure so that any error
        // from any step is funneled through `record_error_metric()` exactly once
        // at the end, regardless of which `?` short-circuited.
        let result = (|| {
            use crate::storage::index_persistence::formats::IndexManifest;

            // Warn if background persistence thread has stopped
            if self
                .persistence_thread_stopped
                .load(std::sync::atomic::Ordering::Acquire)
            {
                eprintln!(
                    "Warning: Background persistence thread has stopped. \
                     Automatic persistence is disabled. Manual persist_indexes() calls will still work."
                );
            }

            // Persistence is optional; a missing manager means it was never
            // configured, which callers must treat as an error here.
            let manager = self.persistence_manager.as_ref().ok_or_else(|| {
                StorageError::InconsistentState {
                    reason: "Index persistence not enabled".to_string(),
                }
            })?;

            // Capture current LSN for all operations
            // (one consistent LSN is used for every component persisted below).
            let current_lsn = self.wal.current_lsn().0;

            let tracker = self.persistence_tracker.as_ref();

            // String interner must be saved first (dependency for all others).
            // Update the string LSN tracker to current_lsn BEFORE calculating safe LSN
            // so even if no new strings were added, the tracker reflects current state.
            if let Some(tracker) = tracker {
                crate::storage::index_persistence::operations::persist_string_interner(
                    manager,
                    tracker,
                    current_lsn,
                )?;
            } else {
                // No tracker: save the interner directly without LSN bookkeeping.
                manager.save_string_interner().map_err(|e| {
                    StorageError::persistence_with_kind(
                        PersistenceErrorKind::from(&e),
                        format!("Failed to save string interner: {}", e),
                    )
                })?;
            }

            // Graph index is persisted in both the tracked and untracked cases
            // (the helper accepts an Option<&tracker>).
            crate::storage::index_persistence::operations::persist_graph_index(
                &self.current,
                manager,
                tracker,
                current_lsn,
            )?;

            // NOTE(review): vector and temporal indexes are only persisted when a
            // tracker exists — without one they are silently skipped, while the
            // interner and graph index above still have untracked fallbacks.
            // Confirm this asymmetry is intentional.
            if let Some(tracker) = tracker {
                persist_vector_indexes(&self.current, manager, Some(tracker), current_lsn)?;
                persist_temporal_index(
                    &self.historical,
                    &self.temporal_indexes,
                    manager,
                    tracker,
                    current_lsn,
                )?;
            }

            // Record WAL position for future replay coordination.
            // Safe LSN = min of all components; since we just persisted everything, current_lsn is safe.
            let safe_lsn = tracker
                .map(|t| t.get_safe_manifest_lsn())
                .unwrap_or(current_lsn);

            let manifest = IndexManifest::new(safe_lsn);
            manager.save_manifest(&manifest).map_err(|e| {
                StorageError::persistence_with_kind(
                    PersistenceErrorKind::from(&e),
                    format!("Failed to save manifest: {}", e),
                )
            })?;

            Ok(())
        })();
        result.record_error_metric()
    }

    /// Test-only accessor for the underlying `CurrentStorage`.
    ///
    /// Compiled only under `#[cfg(test)]`; lets integration tests inspect the
    /// live storage layer directly. Not part of the public API surface.
    #[cfg(test)]
    #[allow(dead_code)]
    pub(crate) fn storage(&self) -> &Arc<CurrentStorage> {
        &self.current
    }

    /// Expose the current WAL Log Sequence Number (test-only helper).
    ///
    /// Useful for tests that verify index persistence, where LSN coordination
    /// with the WAL is critical for correctness.
    ///
    /// **Warning**: exposes internal implementation details; the `__test_`
    /// prefix and `#[doc(hidden)]` mark it as off-limits for production code.
    ///
    /// # Returns
    ///
    /// The raw `u64` LSN currently reported by the WAL.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// # use aletheiadb::{AletheiaDB, PropertyMap};
    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
    /// let db = AletheiaDB::new()?;
    /// # let properties = PropertyMap::new();
    /// db.create_node("Person", properties)?;
    /// let lsn = db.__test_current_wal_lsn();
    /// assert!(lsn > 0); // the LSN advances after write operations
    /// # Ok(())
    /// # }
    /// ```
    #[doc(hidden)]
    pub fn __test_current_wal_lsn(&self) -> u64 {
        let lsn = self.wal.current_lsn();
        lsn.0
    }

    /// Read the current transaction timestamp (test-only helper).
    ///
    /// Gives integration tests visibility into the internal transaction clock.
    /// Panics if the timestamp mutex is poisoned — acceptable in a test helper,
    /// since a poisoned lock already indicates a failed test elsewhere.
    #[doc(hidden)]
    pub fn __test_current_timestamp(&self) -> Timestamp {
        let guard = self.current_timestamp.lock().unwrap();
        *guard
    }

    /// Borrow the shared `HistoricalStorage` handle (test-only helper).
    ///
    /// Public so integration tests outside the crate can reach it, but hidden
    /// from rustdoc and named with the `__test_` prefix to discourage any
    /// production use.
    ///
    /// **Warning**: exposes internal implementation details; tests only.
    #[doc(hidden)]
    pub fn __test_historical_storage(&self) -> &Arc<RwLock<HistoricalStorage>> {
        &self.historical
    }

    /// Borrow the shared temporal-index structures (test-only helper).
    ///
    /// Lets performance tests confirm the temporal indexes are populated and
    /// query them directly. Hidden from rustdoc; tests only.
    #[doc(hidden)]
    pub fn __test_temporal_indexes(&self) -> &Arc<TemporalIndexes> {
        &self.temporal_indexes
    }

    /// Look up adaptive over-fetch statistics for a label (test-only helper).
    ///
    /// Delegates to the current storage layer and reports the accumulated
    /// counters for `label`, letting tests verify that adaptive learning is
    /// recording search activity.
    ///
    /// **Warning**: exposes internal implementation details; tests only.
    ///
    /// # Returns
    ///
    /// `Some((search_count, total_candidates, total_results))` once at least
    /// one search has been recorded for the label, `None` otherwise.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// # use aletheiadb::AletheiaDB;
    /// # use aletheiadb::index::vector::HnswConfig;
    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
    /// let db = AletheiaDB::new()?;
    /// # let config = HnswConfig::default();
    /// db.vector_index("embedding").hnsw(config).enable()?;
    /// // ... create nodes and perform searches ...
    /// let (count, candidates, results) = db.__test_get_filter_stats("Person").unwrap();
    /// assert_eq!(count, 10); // 10 searches performed
    /// # Ok(())
    /// # }
    /// ```
    #[doc(hidden)]
    pub fn __test_get_filter_stats(&self, label: &str) -> Option<(u64, u64, u64)> {
        self.current.get_filter_stats(label)
    }

    /// Borrow the shared query-optimization statistics.
    ///
    /// The planner uses these for cost-based optimization; they are cached
    /// across queries and refreshed on demand. For an explicit refresh see
    /// [`refresh_statistics`](Self::refresh_statistics).
    ///
    /// # Returns
    ///
    /// A reference to the shared `Statistics` handle.
    pub fn statistics(&self) -> &Arc<Statistics> {
        &self.stats
    }

    /// Recompute query-optimization statistics from current storage.
    ///
    /// Gathers node/edge/vector counts and per-label cardinalities from the
    /// live storage layer, estimates the average delta-chain reconstruction
    /// depth from historical storage, and installs everything into the shared
    /// statistics object used by the cost-based query planner.
    ///
    /// Statistics also refresh lazily on first query, so calling this manually
    /// is mainly useful for benchmarking or right after a bulk import.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// # use aletheiadb::{AletheiaDB, PropertyMap};
    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
    /// # let db = AletheiaDB::new()?;
    /// # let documents: Vec<PropertyMap> = vec![];
    /// // After a bulk import...
    /// for props in documents {
    ///     db.create_node("Document", props)?;
    /// }
    ///
    /// // ...refresh so the planner sees accurate counts.
    /// db.refresh_statistics();
    ///
    /// # let query = db.query().build();
    /// let results = db.execute_query(query)?;
    /// # Ok(())
    /// # }
    /// ```
    pub fn refresh_statistics(&self) {
        // Basic cardinalities from the live (current) storage layer.
        let node_count = self.current.node_count();
        let edge_count = self.current.edge_count();
        let vector_count = self.current.vector_count();
        let label_counts = self.current.label_counts();

        // Estimate average delta-chain length from historical storage; the
        // planner uses it to cost temporal lookups.
        let hist = self.historical.read().stats();
        let versions = hist.total_node_versions + hist.total_edge_versions;
        let anchors = hist.node_anchor_count + hist.edge_anchor_count;

        let avg_delta_chain = match anchors {
            // Empty history / no anchors: fall back to the default estimate
            // (assumes anchor_interval = 10, giving an average depth of ~5).
            0 => 5.0,
            n => {
                let interval = versions as f64 / n as f64;
                // Average reconstruction depth is roughly (interval - 1) / 2.
                (interval - 1.0) / 2.0
            }
        };

        self.stats.refresh(
            node_count,
            edge_count,
            vector_count,
            label_counts,
            avg_delta_chain,
        );
    }

    /// Drop cached query-optimization statistics.
    ///
    /// Call after schema changes; fresh statistics are re-collected lazily on
    /// the next query that needs them.
    pub fn invalidate_statistics(&self) {
        self.stats.invalidate();
    }
}