siftdb_core/
fst_index.rs

1use std::path::{Path, PathBuf};
2use std::collections::HashMap;
3use std::fs::File;
4use std::io::{BufWriter, BufReader};
5use fst::{IntoStreamer, Streamer, Map, MapBuilder};
6use anyhow::{Result, Context};
7use memmap2::Mmap;
8
9/// FST-based path index for fast path prefix queries and globbing
10#[derive(Debug)]
11pub struct PathFSTIndex {
12    /// Maps path -> file_handle as FST (using Mmap for efficiency)
13    path_map: Option<Map<Mmap>>,
14    /// Reverse lookup: file_handle -> path
15    handle_to_path: HashMap<u32, PathBuf>,
16}
17
18impl PathFSTIndex {
19    /// Create a new empty FST index
20    pub fn new() -> Self {
21        Self {
22            path_map: None,
23            handle_to_path: HashMap::new(),
24        }
25    }
26
27    /// Build FST index from path->handle mappings
28    pub fn build_from_paths<P: AsRef<Path>>(
29        paths: HashMap<PathBuf, u32>,
30        output_path: P,
31    ) -> Result<Self> {
32        // Sort paths for FST building
33        let mut sorted_paths: Vec<_> = paths.iter().collect();
34        sorted_paths.sort_by(|a, b| a.0.cmp(b.0));
35
36        // Build FST
37        let file = File::create(&output_path)?;
38        let mut builder = MapBuilder::new(BufWriter::new(file))?;
39
40        let mut handle_to_path = HashMap::new();
41
42        for (path, &handle) in sorted_paths {
43            let path_str = path.to_string_lossy();
44            let path_bytes = path_str.as_bytes();
45            
46            builder.insert(path_bytes, handle as u64)?;
47            handle_to_path.insert(handle, path.clone());
48        }
49
50        builder.finish()?;
51
52        // Load the built FST
53        let file = File::open(&output_path)?;
54        let mmap = unsafe { memmap2::Mmap::map(&file)? };
55        let path_map = Map::new(mmap)?;
56
57        Ok(Self {
58            path_map: Some(path_map),
59            handle_to_path,
60        })
61    }
62
63    /// Load existing FST index from file
64    pub fn load_from_file<P: AsRef<Path>>(
65        fst_path: P,
66        json_path: P,
67    ) -> Result<Self> {
68        // Load FST
69        let file = File::open(&fst_path)?;
70        let mmap = unsafe { memmap2::Mmap::map(&file)? };
71        let path_map = Map::new(mmap)?;
72
73        // Load reverse mapping from JSON
74        let file = File::open(&json_path)?;
75        let reader = BufReader::new(file);
76        let handle_to_path: HashMap<u32, PathBuf> = serde_json::from_reader(reader)
77            .context("Failed to parse handle-to-path mapping")?;
78
79        Ok(Self {
80            path_map: Some(path_map),
81            handle_to_path,
82        })
83    }
84
85    /// Save reverse mapping to JSON file
86    pub fn save_reverse_mapping<P: AsRef<Path>>(&self, path: P) -> Result<()> {
87        let file = File::create(path)?;
88        let writer = BufWriter::new(file);
89        serde_json::to_writer_pretty(writer, &self.handle_to_path)?;
90        Ok(())
91    }
92
93    /// Get file handle for exact path
94    pub fn get_handle(&self, path: &Path) -> Option<u32> {
95        let path_str = path.to_string_lossy();
96        self.path_map.as_ref()?
97            .get(path_str.as_bytes())
98            .map(|h| h as u32)
99    }
100
101    /// Get path for file handle
102    pub fn get_path(&self, handle: u32) -> Option<&PathBuf> {
103        self.handle_to_path.get(&handle)
104    }
105
106    /// Find all paths matching a prefix
107    pub fn find_by_prefix(&self, prefix: &str) -> Vec<(PathBuf, u32)> {
108        let mut results = Vec::new();
109        
110        if let Some(ref path_map) = self.path_map {
111            let prefix_bytes = prefix.as_bytes();
112            let mut stream = path_map.range().ge(prefix_bytes).into_stream();
113            
114            while let Some((path_bytes, handle)) = stream.next() {
115                if !path_bytes.starts_with(prefix_bytes) {
116                    break; // FST is sorted, so we can break early
117                }
118                
119                if let Ok(path_str) = std::str::from_utf8(path_bytes) {
120                    let path = PathBuf::from(path_str);
121                    results.push((path, handle as u32));
122                }
123            }
124        }
125        
126        results
127    }
128
129    /// Find all paths matching a glob pattern using FST acceleration
130    pub fn find_by_glob(&self, pattern: &str) -> Result<Vec<(PathBuf, u32)>> {
131        let glob = globset::Glob::new(pattern)?;
132        let matcher = glob.compile_matcher();
133        
134        let mut results = Vec::new();
135        
136        // If pattern has a clear prefix, use FST range query
137        if let Some(prefix) = extract_prefix(pattern) {
138            let candidates = self.find_by_prefix(&prefix);
139            for (path, handle) in candidates {
140                if matcher.is_match(&path) {
141                    results.push((path, handle));
142                }
143            }
144        } else if let Some(ref path_map) = self.path_map {
145            // Fall back to full scan (still faster than linear due to FST structure)
146            let mut stream = path_map.into_stream();
147            while let Some((path_bytes, handle)) = stream.next() {
148                if let Ok(path_str) = std::str::from_utf8(path_bytes) {
149                    let path = PathBuf::from(path_str);
150                    if matcher.is_match(&path) {
151                        results.push((path, handle as u32));
152                    }
153                }
154            }
155        }
156        
157        Ok(results)
158    }
159
160    /// Get total number of indexed paths
161    pub fn len(&self) -> usize {
162        self.handle_to_path.len()
163    }
164
165    /// Check if index is empty
166    pub fn is_empty(&self) -> bool {
167        self.handle_to_path.is_empty()
168    }
169}
170
171/// Extract a prefix from a glob pattern for FST acceleration
172/// Examples: "src/**/*.rs" -> Some("src/")
173///          "**/*.rs" -> None (no useful prefix)
174fn extract_prefix(pattern: &str) -> Option<String> {
175    let mut prefix = String::new();
176    let chars: Vec<char> = pattern.chars().collect();
177    
178    for &ch in &chars {
179        match ch {
180            '*' | '?' | '[' | '{' => break, // Stop at first wildcard
181            _ => prefix.push(ch),
182        }
183    }
184    
185    // Only return prefix if it's useful (at least one directory level)
186    if prefix.contains('/') && !prefix.is_empty() {
187        // Find the last directory separator to get a clean directory prefix
188        if let Some(pos) = prefix.rfind('/') {
189            Some(prefix[..=pos].to_string())
190        } else {
191            None
192        }
193    } else {
194        None
195    }
196}
197
198#[cfg(test)]
199mod tests {
200    use super::*;
201    use tempfile::TempDir;
202
203    #[test]
204    fn test_extract_prefix() {
205        assert_eq!(extract_prefix("src/**/*.rs"), Some("src/".to_string()));
206        assert_eq!(extract_prefix("src/main.rs"), Some("src/".to_string()));
207        assert_eq!(extract_prefix("**/*.rs"), None);
208        assert_eq!(extract_prefix("*.rs"), None);
209        assert_eq!(extract_prefix("tests/unit/*.rs"), Some("tests/unit/".to_string()));
210    }
211
212    #[test]
213    fn test_fst_index_basic() -> Result<()> {
214        let temp_dir = TempDir::new()?;
215        let fst_path = temp_dir.path().join("paths.fst");
216        
217        let mut paths = HashMap::new();
218        paths.insert(PathBuf::from("src/main.rs"), 1);
219        paths.insert(PathBuf::from("src/lib.rs"), 2);
220        paths.insert(PathBuf::from("tests/test.rs"), 3);
221        
222        let index = PathFSTIndex::build_from_paths(paths, &fst_path)?;
223        
224        // Test exact lookup
225        assert_eq!(index.get_handle(&PathBuf::from("src/main.rs")), Some(1));
226        assert_eq!(index.get_handle(&PathBuf::from("nonexistent")), None);
227        
228        // Test prefix search
229        let src_files = index.find_by_prefix("src/");
230        assert_eq!(src_files.len(), 2);
231        
232        Ok(())
233    }
234}