irondrop 2.7.0 - Docs.rs

// SPDX-License-Identifier: MIT

//! Ultra-compact search implementation targeting <100MB for 10M entries
//! Proof of concept showing memory optimization techniques

use std::path::PathBuf;
use std::time::{SystemTime, UNIX_EPOCH};

/// Ultra-compact entry: 11 bytes per file
#[repr(C, packed)]
#[derive(Copy, Clone)]
pub struct UltraCompactEntry {
    /// Offset into string pool (24 bits = 16M unique strings)
    name_offset: [u8; 3],

    /// Parent directory ID (24 bits = 16M directories)  
    parent_id: [u8; 3],

    /// Log2 of file size (1 byte covers 1B to 8EB)
    /// size = 1 << size_log2 (approximate)
    size_log2: u8,

    /// Packed data (4 bytes):
    /// - Bits 0-1: flags (is_dir, hidden)
    /// - Bits 2-31: modified time (seconds/4 since 2020-01-01)
    packed_data: u32,
}

impl UltraCompactEntry {
    const FLAG_IS_DIR: u32 = 1 << 0;
    const TIME_EPOCH: u64 = 1_577_836_800; // 2020-01-01 00:00:00 UTC

    pub fn new(
        name_offset: u32,
        parent_id: u32,
        size: u64,
        is_dir: bool,
        modified: SystemTime,
    ) -> Self {
        // Convert size to log2 (approximate)
        let size_log2 = if size == 0 {
            0
        } else {
            (64 - size.leading_zeros()) as u8
        };

        // Pack modified time (seconds/4 since 2020)
        let modified_secs = modified
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs();
        let time_packed = ((modified_secs.saturating_sub(Self::TIME_EPOCH)) / 4) as u32;

        // Pack flags and time
        let mut packed_data = (time_packed << 2) & 0xFFFF_FFFC;
        if is_dir {
            packed_data |= Self::FLAG_IS_DIR;
        }

        Self {
            name_offset: [
                (name_offset & 0xFF) as u8,
                ((name_offset >> 8) & 0xFF) as u8,
                ((name_offset >> 16) & 0xFF) as u8,
            ],
            parent_id: [
                (parent_id & 0xFF) as u8,
                ((parent_id >> 8) & 0xFF) as u8,
                ((parent_id >> 16) & 0xFF) as u8,
            ],
            size_log2,
            packed_data,
        }
    }

    #[inline]
    pub fn name_offset(&self) -> u32 {
        u32::from_le_bytes([
            self.name_offset[0],
            self.name_offset[1],
            self.name_offset[2],
            0,
        ])
    }

    #[inline]
    pub fn parent_id(&self) -> u32 {
        u32::from_le_bytes([self.parent_id[0], self.parent_id[1], self.parent_id[2], 0])
    }

    #[inline]
    pub fn is_dir(&self) -> bool {
        (self.packed_data & Self::FLAG_IS_DIR) != 0
    }

    #[inline]
    pub fn size(&self) -> u64 {
        if self.size_log2 == 0 {
            0
        } else {
            1u64 << (self.size_log2 - 1)
        }
    }

    #[inline]
    pub fn modified(&self) -> SystemTime {
        let time_offset = (self.packed_data >> 2) as u64 * 4;
        let secs = Self::TIME_EPOCH + time_offset;
        UNIX_EPOCH + std::time::Duration::from_secs(secs)
    }
}

/// Memory-efficient string pool using a single contiguous buffer
pub struct StringPool {
    /// Contiguous buffer of null-terminated strings
    data: Vec<u8>,

    /// Hash table for fast lookups: (hash, offset)
    /// Sorted by hash for binary search
    lookup: Vec<(u32, u32)>,
}

impl Default for StringPool {
    fn default() -> Self {
        let mut pool = Self {
            data: Vec::with_capacity(60 * 1024 * 1024), // 60MB initial
            lookup: Vec::with_capacity(2_000_000),      // 2M unique strings
        };

        // Reserve offset 0 for "no parent"
        pool.data.push(0);
        pool.lookup.push((0, 0));

        pool
    }
}

impl StringPool {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn intern(&mut self, s: &str) -> u32 {
        let hash = Self::hash(s);

        // Binary search for existing string
        if let Ok(idx) = self.lookup.binary_search_by_key(&hash, |&(h, _)| h) {
            return self.lookup[idx].1;
        }

        // Add new string
        let offset = self.data.len() as u32;
        self.data.extend_from_slice(s.as_bytes());
        self.data.push(0); // null terminator

        // Insert maintaining sort order
        let insert_pos = self
            .lookup
            .binary_search_by_key(&hash, |&(h, _)| h)
            .unwrap_err();
        self.lookup.insert(insert_pos, (hash, offset));

        offset
    }

    pub fn get(&self, offset: u32) -> &str {
        if offset == 0 {
            return "";
        }

        let start = offset as usize;
        let end = self.data[start..].iter().position(|&b| b == 0).unwrap_or(0);

        unsafe { std::str::from_utf8_unchecked(&self.data[start..start + end]) }
    }

    fn hash(s: &str) -> u32 {
        // Simple FNV-1a hash
        let mut hash = 2_166_136_261_u32;
        for byte in s.bytes() {
            hash ^= byte as u32;
            hash = hash.wrapping_mul(16_777_619);
        }
        hash
    }

    pub fn memory_usage(&self) -> usize {
        self.data.len() + self.lookup.len() * 8
    }
}

/// Radix-accelerated index for fast searches
pub struct RadixIndex {
    /// All entries in a single vector
    entries: Vec<UltraCompactEntry>,

    /// String pool for name storage
    strings: StringPool,

    /// Radix buckets: first byte -> range of entry indices
    /// Each bucket is (start_idx, end_idx)
    radix_buckets: [(u32, u32); 256],

    /// Sorted array of (name_hash, entry_idx) for binary search
    name_index: Vec<(u32, u32)>,
}

impl Default for RadixIndex {
    fn default() -> Self {
        Self {
            entries: Vec::with_capacity(10_000_000),
            strings: StringPool::new(),
            radix_buckets: [(0, 0); 256],
            name_index: Vec::with_capacity(10_000_000),
        }
    }
}

impl RadixIndex {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn add_entry(
        &mut self,
        name: &str,
        parent_id: u32,
        size: u64,
        is_dir: bool,
        modified: SystemTime,
    ) -> u32 {
        let name_offset = self.strings.intern(name);
        let entry_id = self.entries.len() as u32;

        let entry = UltraCompactEntry::new(name_offset, parent_id, size, is_dir, modified);

        self.entries.push(entry);

        // Add to name index
        let name_hash = StringPool::hash(name);
        self.name_index.push((name_hash, entry_id));

        entry_id
    }

    pub fn build_index(&mut self) {
        // Sort name index by hash
        self.name_index.sort_unstable_by_key(|&(hash, _)| hash);

        // Build radix buckets based on first byte of name
        let mut current_byte = 0u8;
        let mut start_idx = 0u32;

        for (i, &(_, entry_idx)) in self.name_index.iter().enumerate() {
            let entry = self.entries[entry_idx as usize];
            let name = self.strings.get(entry.name_offset());

            if let Some(first_byte) = name.bytes().next() {
                while current_byte < first_byte {
                    self.radix_buckets[current_byte as usize] = (start_idx, i as u32);
                    current_byte += 1;
                    start_idx = i as u32;
                }
            }
        }

        // Fill remaining buckets
        let end = self.name_index.len() as u32;
        while current_byte != 0 {
            self.radix_buckets[current_byte as usize] = (start_idx, end);
            current_byte = current_byte.wrapping_add(1);
            start_idx = end;
        }
    }

    pub fn search(&self, query: &str, limit: usize) -> Vec<u32> {
        let mut results = Vec::with_capacity(limit);
        let query_lower = query.to_lowercase();

        // Use radix bucket to narrow search space
        let first_byte = query_lower.bytes().next().unwrap_or(0);
        let (start, end) = self.radix_buckets[first_byte as usize];

        // Search within the bucket
        for i in start..end.min(start + 10000) {
            let (_, entry_idx) = self.name_index[i as usize];
            let entry = self.entries[entry_idx as usize];
            let name = self.strings.get(entry.name_offset());

            if name.to_lowercase().contains(&query_lower) {
                results.push(entry_idx);
                if results.len() >= limit {
                    break;
                }
            }
        }

        results
    }

    pub fn get_path(&self, entry_id: u32) -> PathBuf {
        let mut components = Vec::new();
        let mut current_id = entry_id;

        // Walk up the parent chain
        while current_id != 0 {
            let entry = self.entries[current_id as usize];
            let name = self.strings.get(entry.name_offset());
            components.push(name);

            current_id = entry.parent_id();
        }

        // Reverse to get correct order
        components.reverse();

        if components.is_empty() {
            PathBuf::from("/")
        } else {
            PathBuf::from(components.join("/"))
        }
    }

    pub fn memory_usage(&self) -> usize {
        // Use actual len() instead of capacity() for realistic memory usage
        self.entries.len() * std::mem::size_of::<UltraCompactEntry>()
            + self.strings.data.len() // Actual string data size
            + self.name_index.len() * 8 // Actual index size
            + std::mem::size_of::<Self>()
    }

    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }
}

/// Compressed LRU cache storing only entry IDs
pub struct CompactCache {
    /// (query_hash, entry_ids)
    cache: Vec<(u64, Vec<u32>)>,
    max_entries: usize,
}

impl CompactCache {
    pub fn new(max_entries: usize) -> Self {
        Self {
            cache: Vec::with_capacity(max_entries),
            max_entries,
        }
    }

    pub fn get(&self, query: &str) -> Option<&[u32]> {
        let hash = Self::hash(query);
        self.cache
            .binary_search_by_key(&hash, |&(h, _)| h)
            .ok()
            .map(|i| self.cache[i].1.as_slice())
    }

    pub fn put(&mut self, query: &str, entry_ids: Vec<u32>) {
        if self.cache.len() >= self.max_entries {
            // Simple eviction: remove last entry (O(1))
            // This avoids O(n) shifting that occurs with removing at index 0
            self.cache.pop();
        }

        let hash = Self::hash(query);
        match self.cache.binary_search_by_key(&hash, |&(h, _)| h) {
            Ok(i) => self.cache[i].1 = entry_ids,
            Err(i) => self.cache.insert(i, (hash, entry_ids)),
        }
    }

    fn hash(s: &str) -> u64 {
        // Simple hash for cache keys
        let mut hash = 0u64;
        for byte in s.bytes() {
            hash = hash.wrapping_mul(31).wrapping_add(byte as u64);
        }
        hash
    }

    pub fn memory_usage(&self) -> usize {
        self.cache.capacity() * 16
            + self
                .cache
                .iter()
                .map(|(_, v)| v.capacity() * 4)
                .sum::<usize>()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_ultra_compact_entry_size() {
        assert_eq!(std::mem::size_of::<UltraCompactEntry>(), 11);
    }

    #[test]
    fn test_string_pool() {
        let mut pool = StringPool::new();

        let offset1 = pool.intern("hello");
        let offset2 = pool.intern("world");
        let offset3 = pool.intern("hello"); // Should reuse

        assert_eq!(offset1, offset3);
        assert_ne!(offset1, offset2);

        assert_eq!(pool.get(offset1), "hello");
        assert_eq!(pool.get(offset2), "world");
    }

    #[test]
    fn test_memory_usage() {
        let mut index = RadixIndex::new();

        // Add 10K test entries for realistic test
        for i in 0..10_000 {
            index.add_entry(
                &format!("file_{i:04}.txt"),
                0,
                1024 * (i as u64),
                false,
                SystemTime::now(),
            );
        }

        index.build_index();

        let memory = index.memory_usage();
        let per_entry = memory / 10_000;

        // Break down memory usage
        let entry_memory = index.entries.len() * std::mem::size_of::<UltraCompactEntry>();
        let string_memory = index.strings.data.len();
        let index_memory = index.name_index.len() * 8;

        println!(
            "Entry memory: {:.1}KB ({} entries * {} bytes)",
            entry_memory as f64 / 1024.0,
            index.entries.len(),
            std::mem::size_of::<UltraCompactEntry>()
        );
        println!("String memory: {:.1}KB", string_memory as f64 / 1024.0);
        println!("Index memory: {:.1}KB", index_memory as f64 / 1024.0);
        println!(
            "Total memory per entry: {} bytes ({:.1}KB total)",
            per_entry,
            memory as f64 / 1024.0
        );

        // Each entry should be around 11 bytes + string overhead
        assert!(per_entry < 100); // Under 100 bytes per entry is excellent
    }

    #[test]
    fn test_search_performance() {
        let mut index = RadixIndex::new();

        // Add test entries
        for i in 0..10000 {
            index.add_entry(
                &format!("document_{i}.pdf"),
                0,
                1024 * i,
                false,
                SystemTime::now(),
            );
        }

        index.build_index();

        let start = std::time::Instant::now();
        let results = index.search("document_500", 10);
        let elapsed = start.elapsed();

        assert!(!results.is_empty());
        // In debug mode, timing can vary - just check it's reasonable (<100ms)
        assert!(elapsed.as_millis() < 100); // Should be under 100ms even in debug
    }

    #[test]
    fn test_compact_cache_eviction_pop_last() {
        let mut cache = CompactCache::new(3);
        cache.put("a", vec![1]);
        cache.put("b", vec![2]);
        cache.put("c", vec![3]);
        // Fill beyond capacity triggers eviction using pop()
        cache.put("d", vec![4]);
        // Ensure we still have max_entries
        assert_eq!(cache.cache.len(), 3);
        // The cache stores sorted by hash; check that 'd' is retrievable
        assert!(cache.get("d").is_some());
    }
}

/// Demonstrates memory savings
pub fn demonstrate_memory_savings() {
    println!("=== Ultra-Compact Search Memory Demonstration ===\n");

    println!("Structure Sizes:");
    println!(
        "  UltraCompactEntry: {} bytes",
        std::mem::size_of::<UltraCompactEntry>()
    );
    println!("  vs Original: 24 bytes");
    println!("  Savings: {} bytes per entry\n", 24 - 11);

    let entries_10m = 10_000_000;
    let original_memory = 3514; // MB from analysis
    let optimized_memory = (11 * entries_10m + 76 * 1024 * 1024 + 16 * 1024 * 1024) / 1_048_576;

    println!("For 10M entries:");
    println!("  Original: {original_memory} MB");
    println!("  Optimized: {optimized_memory} MB");
    println!("  Reduction: {}x", original_memory / optimized_memory);
    println!("  Saved: {} MB\n", original_memory - optimized_memory);

    println!("Techniques Used:");
    println!("  ✓ Bit packing (11 byte entries)");
    println!("  ✓ String pooling (single buffer)");
    println!("  ✓ Parent references (no path storage)");
    println!("  ✓ Log-scale size (1 byte for any size)");
    println!("  ✓ Radix indexing (fast lookups)");
    println!("  ✓ Compact cache (IDs only)");
}