stoolap 0.4.0

High-performance embedded SQL database with MVCC, time-travel queries, and full ACID compliance
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
// Copyright 2025 Stoolap Contributors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Index trait for database indexes
//!

use crate::common::I64Map;
use crate::core::{DataType, IndexEntry, IndexType, Operator, Result, RowIdVec, Value};
use crate::storage::expression::Expression;

/// Index represents an abstract index for a column or set of columns
///
/// This trait defines the interface for index operations including
/// lookups, range queries, and batch operations.
///
/// # Index Types
///
/// - **BTree**: For high-cardinality columns (> 5% unique values)
/// - **Bitmap**: For low-cardinality columns (< 5% unique values)
/// - **BTree**: For range queries and ordered access
pub trait Index: Send + Sync {
    /// Returns the name of the index
    fn name(&self) -> &str;

    /// Returns the name of the table this index belongs to
    fn table_name(&self) -> &str;

    /// Builds the index from existing data
    fn build(&mut self) -> Result<()>;

    /// Adds values to the index with the given row IDs
    ///
    /// # Arguments
    /// * `values` - The column values to index (one per indexed column)
    /// * `row_id` - The row ID in the table
    /// * `ref_id` - The reference ID in the index
    ///
    /// Note: Uses `&self` with interior mutability for thread-safe concurrent access
    fn add(&self, values: &[Value], row_id: i64, ref_id: i64) -> Result<()>;

    /// Adds multiple entries to the index in a single batch operation
    ///
    /// # Arguments
    /// * `entries` - Map from row_id to column values
    fn add_batch(&self, entries: &I64Map<Vec<Value>>) -> Result<()>;

    /// Removes values from the index
    ///
    /// # Arguments
    /// * `values` - The column values to remove
    /// * `row_id` - The row ID in the table
    /// * `ref_id` - The reference ID in the index
    ///
    /// Note: Uses `&self` with interior mutability for thread-safe concurrent access
    fn remove(&self, values: &[Value], row_id: i64, ref_id: i64) -> Result<()>;

    /// Removes multiple entries from the index in a single batch operation
    ///
    /// # Arguments
    /// * `entries` - Map from row_id to column values
    fn remove_batch(&self, entries: &I64Map<Vec<Value>>) -> Result<()>;

    /// Adds multiple entries from a slice with single lock acquisition
    ///
    /// This is the most efficient batch method - avoids intermediate allocations
    /// and acquires write locks only once for the entire batch.
    ///
    /// # Arguments
    /// * `entries` - Slice of (row_id, values) pairs
    ///
    /// # Performance
    /// - Single lock acquisition for entire batch (vs N acquisitions for N calls to add())
    /// - No intermediate HashMap construction
    /// - Values are borrowed, not cloned (caller owns the data)
    fn add_batch_slice(&self, entries: &[(i64, &[Value])]) -> Result<()> {
        // Default implementation: delegate to add() for backwards compatibility
        // Concrete implementations should override for single-lock optimization
        for &(row_id, values) in entries {
            self.add(values, row_id, row_id)?;
        }
        Ok(())
    }

    /// Removes multiple entries from a slice with single lock acquisition
    ///
    /// This is the most efficient batch removal method - avoids intermediate allocations
    /// and acquires write locks only once for the entire batch.
    ///
    /// # Arguments
    /// * `entries` - Slice of (row_id, values) pairs
    ///
    /// # Performance
    /// - Single lock acquisition for entire batch (vs N acquisitions for N calls to remove())
    /// - No intermediate HashMap construction
    /// - Values are borrowed, not cloned (caller owns the data)
    fn remove_batch_slice(&self, entries: &[(i64, &[Value])]) -> Result<()> {
        // Default implementation: delegate to remove() for backwards compatibility
        // Concrete implementations should override for single-lock optimization
        for &(row_id, values) in entries {
            self.remove(values, row_id, row_id)?;
        }
        Ok(())
    }

    /// Returns the column IDs for this index
    fn column_ids(&self) -> &[i32];

    /// Returns the column names for this index
    fn column_names(&self) -> &[String];

    /// Returns the data types for the indexed columns
    fn data_types(&self) -> &[DataType];

    /// Returns the type of index (BTree, Bitmap, Hash)
    fn index_type(&self) -> IndexType;

    /// Returns true if this is a unique index
    fn is_unique(&self) -> bool;

    /// Finds all entries where the columns equal the given values
    ///
    /// # Arguments
    /// * `values` - The values to search for
    ///
    /// # Returns
    /// Vector of index entries matching the values
    fn find(&self, values: &[Value]) -> Result<Vec<IndexEntry>>;

    /// Finds all entries where the columns are in the given range
    ///
    /// # Arguments
    /// * `min` - Minimum values (inclusive if min_inclusive is true)
    /// * `max` - Maximum values (inclusive if max_inclusive is true)
    /// * `min_inclusive` - Whether to include the minimum boundary
    /// * `max_inclusive` - Whether to include the maximum boundary
    ///
    /// # Returns
    /// Vector of index entries within the range
    fn find_range(
        &self,
        min: &[Value],
        max: &[Value],
        min_inclusive: bool,
        max_inclusive: bool,
    ) -> Result<Vec<IndexEntry>>;

    /// Finds all entries matching the given operator and values
    ///
    /// # Arguments
    /// * `op` - The comparison operator
    /// * `values` - The values to compare against
    ///
    /// # Returns
    /// Vector of matching index entries
    fn find_with_operator(&self, op: Operator, values: &[Value]) -> Result<Vec<IndexEntry>>;

    /// Returns row IDs with the given values (convenience method)
    ///
    /// This is a simplified version of `find` that returns only row IDs.
    /// Returns a pooled RowIdVec that is automatically recycled on drop.
    fn get_row_ids_equal(&self, values: &[Value]) -> RowIdVec {
        let mut row_ids = RowIdVec::new();
        self.get_row_ids_equal_into(values, &mut row_ids);
        row_ids
    }

    /// Appends row IDs with the given values to the provided buffer
    ///
    /// This enables callers to reuse the vector allocation across multiple calls.
    fn get_row_ids_equal_into(&self, values: &[Value], buffer: &mut Vec<i64>) {
        // Default implementation: delegate to find (allocates)
        // Override in concrete indexes for zero-allocation
        if let Ok(entries) = self.find(values) {
            buffer.reserve(entries.len());
            for entry in entries {
                buffer.push(entry.row_id);
            }
        }
    }

    /// Returns row IDs with values in the given range (convenience method)
    ///
    /// This is a simplified version of `find_range` that returns only row IDs.
    /// Returns a pooled RowIdVec that is automatically recycled on drop.
    fn get_row_ids_in_range(
        &self,
        min_value: &[Value],
        max_value: &[Value],
        include_min: bool,
        include_max: bool,
    ) -> RowIdVec {
        let mut row_ids = RowIdVec::new();
        self.get_row_ids_in_range_into(
            min_value,
            max_value,
            include_min,
            include_max,
            &mut row_ids,
        );
        row_ids
    }

    /// Appends row IDs with values in the given range to the provided buffer
    ///
    /// This enables callers to reuse the vector allocation across multiple calls.
    fn get_row_ids_in_range_into(
        &self,
        min_value: &[Value],
        max_value: &[Value],
        include_min: bool,
        include_max: bool,
        buffer: &mut Vec<i64>,
    ) {
        // Default implementation: delegate to find_range (allocates)
        // Override in concrete indexes for zero-allocation
        if let Ok(entries) = self.find_range(min_value, max_value, include_min, include_max) {
            buffer.reserve(entries.len());
            for entry in entries {
                buffer.push(entry.row_id);
            }
        }
    }

    /// Returns row IDs for values in the given list (IN clause optimization)
    ///
    /// For hash indexes, this does O(k) equality lookups where k is the list size.
    /// For btree indexes, this does O(k * log n) lookups.
    /// Much faster than `get_filtered_row_ids` which may scan the entire index.
    ///
    /// # Arguments
    /// * `value_list` - List of values to match (each is a single column value)
    ///
    /// # Returns
    /// Pooled RowIdVec of row IDs matching any value in the list
    fn get_row_ids_in(&self, value_list: &[Value]) -> RowIdVec {
        let mut results = RowIdVec::new();
        self.get_row_ids_in_into(value_list, &mut results);
        results
    }

    /// Appends row IDs for values in the given list to the provided buffer
    ///
    /// This enables callers to reuse the vector allocation across multiple calls.
    fn get_row_ids_in_into(&self, value_list: &[Value], buffer: &mut Vec<i64>) {
        // Default implementation: do multiple equality lookups
        for value in value_list {
            self.get_row_ids_equal_into(std::slice::from_ref(value), buffer);
        }
    }

    /// Returns row IDs that match the given expression
    ///
    /// # Arguments
    /// * `expr` - The expression to evaluate
    ///
    /// # Returns
    /// Pooled RowIdVec of row IDs matching the expression
    fn get_filtered_row_ids(&self, expr: &dyn Expression) -> RowIdVec;

    /// Returns the minimum value in the index (for MIN aggregate optimization)
    ///
    /// This enables O(1) or O(log n) MIN queries on indexed columns
    /// instead of O(n) full table scans.
    fn get_min_value(&self) -> Option<Value> {
        None // Default implementation - override in concrete indexes
    }

    /// Returns the maximum value in the index (for MAX aggregate optimization)
    ///
    /// This enables O(1) or O(log n) MAX queries on indexed columns
    /// instead of O(n) full table scans.
    fn get_max_value(&self) -> Option<Value> {
        None // Default implementation - override in concrete indexes
    }

    /// Returns all unique values in the index (for ORDER BY optimization)
    ///
    /// This enables sorted iteration through unique values for TOP-N queries.
    fn get_all_values(&self) -> Vec<Value> {
        Vec::new() // Default implementation - override in concrete indexes
    }

    /// Returns the count of distinct non-null values in the index
    ///
    /// This enables O(1) COUNT(DISTINCT col) queries on indexed columns
    /// without cloning all values. Per SQL standard, NULL values are excluded.
    ///
    /// Returns None if the index doesn't support this optimization.
    fn get_distinct_count_excluding_null(&self) -> Option<usize> {
        None // Default implementation - override in concrete indexes
    }

    /// Returns row IDs in sorted order by index value (for ORDER BY optimization)
    ///
    /// This enables efficient ORDER BY queries by iterating through the B-tree
    /// in order, avoiding the need to sort the entire result set.
    ///
    /// # Arguments
    /// * `ascending` - If true, return in ascending order; if false, descending
    /// * `limit` - Maximum number of row IDs to return
    /// * `offset` - Number of row IDs to skip before returning
    ///
    /// # Returns
    /// Vector of row IDs in sorted order, or None if the index doesn't support ordered access
    fn get_row_ids_ordered(
        &self,
        _ascending: bool,
        _limit: usize,
        _offset: usize,
    ) -> Option<Vec<i64>> {
        None // Default implementation - only B-tree indexes support this
    }

    /// Returns grouped row IDs in sorted order by index value (for GROUP BY optimization)
    ///
    /// This enables streaming GROUP BY by iterating through the B-tree in order,
    /// processing one group at a time without needing a hash map.
    ///
    /// # Returns
    /// Vector of (group_value, row_ids) pairs in sorted order,
    /// or None if the index doesn't support ordered group access
    fn get_grouped_row_ids(&self) -> Option<Vec<(Value, Vec<i64>)>> {
        None // Default implementation - only B-tree indexes support this
    }

    /// Iterates through groups in sorted order, calling the callback for each group.
    ///
    /// This is a zero-allocation alternative to `get_grouped_row_ids` that processes
    /// groups one at a time without collecting all groups upfront. This is more
    /// efficient when:
    /// - Early termination is possible (LIMIT)
    /// - Only a few groups pass filtering (HAVING)
    ///
    /// # Arguments
    /// * `callback` - Called for each group with (value, row_ids). Return:
    ///   - `Ok(true)` to continue to next group
    ///   - `Ok(false)` to stop iteration early
    ///   - `Err(e)` to stop and propagate the error
    ///
    /// # Returns
    /// - `Some(Ok(()))` if iteration completed or stopped early with Ok(false)
    /// - `Some(Err(e))` if callback returned an error
    /// - `None` if the index doesn't support ordered group access
    fn for_each_group(
        &self,
        _callback: &mut dyn FnMut(&Value, &[i64]) -> Result<bool>,
    ) -> Option<Result<()>> {
        None // Default implementation - only B-tree indexes support this
    }

    /// Searches for the k nearest neighbors to a query vector.
    ///
    /// Only implemented by HNSW indexes. Returns `None` for non-vector indexes.
    ///
    /// # Arguments
    /// * `query` - The query vector as Extension(Vector) value
    /// * `k` - Number of nearest neighbors to return
    /// * `ef_search` - Search beam width (higher = better recall, slower)
    ///
    /// # Returns
    /// `Some(Vec<(row_id, distance)>)` sorted by distance (closest first),
    /// or `None` if this index doesn't support nearest neighbor search.
    fn search_nearest(
        &self,
        _query: &Value,
        _k: usize,
        _ef_search: usize,
    ) -> Option<Vec<(i64, f64)>> {
        None
    }

    /// Returns the HNSW distance metric as u8 (0=L2, 1=Cosine, 2=IP).
    /// Non-HNSW indexes return None.
    fn hnsw_distance_metric(&self) -> Option<u8> {
        None
    }

    /// Returns the HNSW M parameter (max connections per node per layer).
    /// Non-HNSW indexes return None.
    fn hnsw_m(&self) -> Option<u16> {
        None
    }

    /// Returns the HNSW ef_construction parameter (build beam width).
    /// Non-HNSW indexes return None.
    fn hnsw_ef_construction(&self) -> Option<u16> {
        None
    }

    /// Returns the default ef_search for HNSW indexes.
    /// Non-HNSW indexes return None.
    fn default_ef_search(&self) -> Option<usize> {
        None
    }

    /// Clears all data from the index without closing it.
    /// Used by TRUNCATE for O(1) memory release.
    fn clear(&self) -> Result<()> {
        Ok(())
    }

    /// Performs post-deletion maintenance on the index.
    ///
    /// Called by the background cleanup cycle after deleted rows have been
    /// removed via `remove_batch_slice`. Most index types are no-ops because
    /// `remove` already reclaims storage. HNSW overrides this to compact
    /// the graph when the tombstone ratio exceeds a threshold.
    fn cleanup(&self) -> Result<()> {
        Ok(())
    }

    /// Downcast to concrete type for type-specific operations (e.g., HNSW graph serialization).
    fn as_any(&self) -> &dyn std::any::Any;

    /// Closes the index and releases any resources
    fn close(&mut self) -> Result<()>;
}

#[cfg(test)]
mod tests {
    // Index tests will be implemented when we have concrete implementations
    // For now, just verify the trait compiles correctly

    use super::*;

    // Verify trait is object-safe
    fn _assert_object_safe(_: &dyn Index) {}
}