sqlitegraph 2.2.3

Embedded graph database with full ACID transactions, HNSW vector search, dual backend support, and comprehensive graph algorithms library
Documentation
// HNSW Index Internal Helpers
//
// This module contains internal helper methods for HnswIndex.
// Includes layer management, ID translation, validation, and layer insertion.
//
// Note: This file is included via include! macro in index.rs
// All imports are inherited from the parent module

// Note: HnswIndex is defined in the parent (index.rs)
// This file is included via include! macro, so types are available in scope

impl HnswIndex {
    /// Validate the HNSW configuration
    pub(crate) fn validate_config(config: &crate::hnsw::config::HnswConfig) -> Result<(), crate::hnsw::errors::HnswError> {
        use crate::hnsw::errors::{HnswError, HnswIndexError};

        if config.dimension == 0 {
            return Err(HnswError::Index(HnswIndexError::InvalidNodeId(0)));
        }
        if config.m == 0 {
            return Err(HnswError::Index(HnswIndexError::InvalidNodeId(0)));
        }
        if config.ef_construction < config.m {
            return Err(HnswError::Index(HnswIndexError::InvalidNodeId(0)));
        }
        if config.ef_search == 0 {
            return Err(HnswError::Index(HnswIndexError::InvalidNodeId(0)));
        }
        if config.ml == 0 {
            return Err(HnswError::Index(HnswIndexError::InvalidNodeId(0)));
        }
        Ok(())
    }

    /// Determine which layer to insert a vector into using exponential distribution
    pub(crate) fn determine_insertion_level(&mut self) -> usize {
        if self.config.enable_multilayer {
            if let Some(distributor) = &mut self.level_distributor {
                distributor.sample_level_internal()
            } else {
                0 // Fallback to single-layer if distributor not initialized
            }
        } else {
            0 // Single-layer mode for backward compatibility
        }
    }

    /// Insert a vector into a specific layer
    pub(crate) fn insert_into_layer(&mut self, vector_id: u64, level: usize) -> Result<(), crate::hnsw::errors::HnswError> {
        use crate::hnsw::errors::{HnswError, HnswIndexError};

        // Determine local node ID based on mode
        // In multi-layer mode: use LayerMappings for ID translation
        // In single-layer mode: direct conversion (vector_id - 1)
        let node_id = if let Some(manager) = &mut self.multi_layer_manager {
            // Multi-layer mode: use LayerMappings to get local ID
            manager.get_local_id(vector_id, level)
                .ok_or(HnswError::Index(HnswIndexError::NodeNotFound(vector_id)))?
        } else {
            // Single-layer mode: direct 1-based to 0-based conversion
            vector_id - 1
        };

        // Ensure layer exists, create if necessary
        while level >= self.layers.len() {
            let new_level = self.layers.len() as u8;
            let max_connections = (self.config.m / 2_usize.pow(new_level as u32)).max(1);
            self.layers.push(crate::hnsw::layer::HnswLayer::new(new_level, max_connections));
        }

        // Add the node to the layer first (this makes it an entry point if it's one of the first nodes)
        {
            let layer = &mut self.layers[level];
            layer.add_node(node_id)?;
        }

        // For base layer, no need to connect to existing entry points if this is the first node
        if level == 0 && self.layers[level].node_count() == 1 {
            return Ok(());
        }

        // PROPER HNSW INSERTION: Search for nearest neighbors, then connect to them
        // 1. Load vectors for distance computation
        let vectors = self.load_vectors_as_array()?;

        // 2. Get the vector data for the new node
        let new_vector = &vectors[node_id as usize];

        // 3. Find entry points to start the search
        let global_entry_points = self.get_layer_entry_points(level);
        let entry_points: Vec<u64> = if let Some(manager) = &self.multi_layer_manager {
            // Multi-layer mode: use LayerMappings for ID translation
            global_entry_points
                .into_iter()
                .filter_map(|global_id| manager.get_local_id(global_id, level))
                .collect()
        } else {
            // Single-layer mode: direct 1-based to 0-based conversion
            global_entry_points
                .into_iter()
                .map(|global_id| global_id - 1)
                .collect()
        };

        if entry_points.is_empty() {
            // No entry points yet, this must be the first node
            return Ok(());
        }

        // 4. Search for the nearest neighbors to the new vector
        let ef = self.config.ef_construction;
        let search_result = self.search_engine.search_layer(
            &self.layers[level],
            new_vector,
            &vectors,
            &entry_points,
            ef,
        )?;

        // 5. Select top M neighbors (limited by max_connections)
        let candidates = search_result.neighbors();
        let distances = search_result.distances();
        let m = self.layers[level].max_connections();

        // Build distance map for the new node's neighbors
        let mut neighbor_distances = std::collections::HashMap::new();
        for (i, &neighbor_id) in candidates.iter().enumerate() {
            if i < m {
                neighbor_distances.insert(neighbor_id, distances[i]);
            }
        }

        // 6. Add connections from new node to its nearest neighbors
        // The new node gets connections to its M nearest neighbors
        let layer = &mut self.layers[level];
        for &neighbor_id in neighbor_distances.keys() {
            if neighbor_id != node_id {
                layer.add_one_way_connection(node_id, neighbor_id)?;
            }
        }

        // Prune the new node's connections by distance (keeps closest)
        layer.prune_connections_by_distance(node_id, &neighbor_distances);

        // 7. Add reverse connections from neighbors to the new node
        // We use a more lenient pruning strategy for reverse connections to ensure
        // the graph remains well-connected. Only prune if we exceed 2*M connections.
        let max_reverse_conns = (m * 2).max(32); // More lenient limit
        for (&neighbor_id, _dist_to_neighbor) in neighbor_distances.iter() {
            if neighbor_id != node_id {
                // Add reverse connection (existing node -> new node)
                layer.add_one_way_connection(neighbor_id, node_id)?;

                // Only prune if significantly over limit
                if let Ok(existing_conns) = layer.get_connections(neighbor_id) {
                    if existing_conns.len() > max_reverse_conns {
                        // Build distance map with small distance for new node to keep it
                        let mut reverse_distances = std::collections::HashMap::new();
                        reverse_distances.insert(node_id, 0.0); // Keep new connection
                        for &existing_id in existing_conns {
                            if existing_id != node_id {
                                reverse_distances.insert(existing_id, 1.0); // Existing connections
                            }
                        }
                        layer.prune_connections_by_distance(neighbor_id, &reverse_distances);
                    }
                }
            }
        }

        Ok(())
    }

    /// Get entry points for a specific layer
    ///
    /// Returns global vector IDs (1-based) for vectors that are entry points
    /// in the specified layer. In multi-layer mode, uses the manager to
    /// translate local node IDs to global vector IDs.
    pub(crate) fn get_layer_entry_points(&self, level: usize) -> Vec<u64> {
        if self.layers.is_empty() {
            return Vec::new();
        }

        if level == self.layers.len() - 1 {
            // Top layer: return all entry points (already 1-based vector IDs)
            self.entry_points.clone()
        } else if level == 0 {
            // Base layer: use its own entry points
            let layer_entry_points = self.layers[level].get_entry_points();
            if let Some(manager) = &self.multi_layer_manager {
                // Multi-layer mode: convert local node IDs to global vector IDs
                layer_entry_points
                    .iter()
                    .filter_map(|&local_id| manager.get_global_id(level, local_id))
                    .collect()
            } else {
                // Single-layer mode: direct 0-based to 1-based conversion
                layer_entry_points
                    .iter()
                    .map(|&node_id| node_id + 1)
                    .collect()
            }
        } else {
            // Intermediate layers: use entry points from the layer above
            if level + 1 < self.layers.len() {
                let layer_entry_points = self.layers[level + 1].get_entry_points();
                if let Some(manager) = &self.multi_layer_manager {
                    // Multi-layer mode: convert local node IDs to global vector IDs
                    layer_entry_points
                        .iter()
                        .filter_map(|&local_id| manager.get_global_id(level + 1, local_id))
                        .collect()
                } else {
                    // Single-layer mode: direct 0-based to 1-based conversion
                    layer_entry_points
                        .iter()
                        .map(|&node_id| node_id + 1)
                        .collect()
                }
            } else {
                Vec::new()
            }
        }
    }

    /// Get local ID for a global vector ID in a specific layer
    ///
    /// In multi-layer mode, uses LayerMappings for ID translation.
    /// In single-layer mode, uses direct 1-based to 0-based conversion.
    ///
    /// # Arguments
    ///
    /// * `vector_id` - Global vector ID (1-based)
    /// * `layer_id` - Layer ID (0-based)
    ///
    /// # Returns
    ///
    /// Local node ID for the layer
    pub(crate) fn get_local_id_for_layer(&self, vector_id: u64, layer_id: usize) -> Result<u64, crate::hnsw::errors::HnswError> {
        use crate::hnsw::errors::{HnswError, HnswIndexError};

        if let Some(manager) = &self.multi_layer_manager {
            manager.get_local_id(vector_id, layer_id)
                .ok_or(HnswError::Index(HnswIndexError::NodeNotFound(vector_id)))
        } else {
            // Single-layer mode: direct conversion
            Ok(vector_id - 1)
        }
    }

    /// Get global ID for a local node ID in a specific layer
    ///
    /// In multi-layer mode, uses LayerMappings for ID translation.
    /// In single-layer mode, uses direct 0-based to 1-based conversion.
    ///
    /// # Arguments
    ///
    /// * `layer_id` - Layer ID (0-based)
    /// * `local_id` - Local node ID (0-based)
    ///
    /// # Returns
    ///
    /// Global vector ID (1-based)
    pub(crate) fn get_global_id_for_layer(&self, layer_id: usize, local_id: u64) -> Result<u64, crate::hnsw::errors::HnswError> {
        use crate::hnsw::errors::{HnswError, HnswIndexError};

        if let Some(manager) = &self.multi_layer_manager {
            manager.get_global_id(layer_id, local_id)
                .ok_or(HnswError::Index(HnswIndexError::InvalidNodeId(local_id)))
        } else {
            // Single-layer mode: direct conversion
            Ok(local_id + 1)
        }
    }

    /// Load all vectors as a 0-indexed array for layer operations
    ///
    /// Creates a vectors array where vectors[node_id] = vector_data,
    /// allowing efficient lookup by layer-local node IDs.
    ///
    /// # Returns
    ///
    /// Vectors array indexed by 0-based node IDs
    pub(crate) fn load_vectors_as_array(&self) -> Result<Vec<Vec<f32>>, crate::hnsw::errors::HnswError> {
        let vector_ids = self.storage.list_vectors()?;
        let max_vector_id = vector_ids.iter().copied().max().unwrap_or(0);

        let mut vectors_array = vec![vec![]; max_vector_id as usize + 1];
        for vector_id in vector_ids {
            if let Ok(Some(vector)) = self.storage.get_vector(vector_id) {
                let node_id = (vector_id - 1) as usize;
                if node_id < vectors_array.len() {
                    vectors_array[node_id] = vector;
                }
            }
        }

        Ok(vectors_array)
    }
}