Skip to main content

ix/
string_pool.rs

1//! Path string pool with prefix deduplication.
2//!
3//! Saves space by storing common directory prefixes once.
4
5use crate::error::{Error, Result};
6use std::collections::HashMap;
7use std::io::{Seek, Write};
8use std::path::Path;
9
10pub struct StringPool {
11    prefixes: Vec<String>,
12    prefix_map: HashMap<String, u16>,
13    // Full path -> (offset in pool section, total length)
14    path_info: HashMap<String, (u32, u16)>,
15}
16
17impl Default for StringPool {
18    fn default() -> Self {
19        Self::new()
20    }
21}
22
23impl StringPool {
24    pub fn new() -> Self {
25        let prefixes = vec!["".to_string()];
26        let mut prefix_map = HashMap::new();
27        prefix_map.insert("".to_string(), 0);
28
29        Self {
30            prefixes,
31            prefix_map,
32            path_info: HashMap::new(),
33        }
34    }
35
36    /// Add a path to the pool. During indexing, we just collect unique paths.
37    /// Real prefix deduplication happens during serialization or via pre-added prefixes.
38    pub fn add_path(&mut self, path: &Path) {
39        let path_str = path.to_string_lossy().to_string();
40        self.path_info.entry(path_str).or_insert((0, 0));
41    }
42
43    /// Set the prefixes to be used for deduplication.
44    pub fn set_prefixes(&mut self, prefixes: Vec<String>) {
45        self.prefixes = vec!["".to_string()];
46        self.prefix_map = HashMap::new();
47        self.prefix_map.insert("".to_string(), 0);
48
49        for p in prefixes {
50            if p.is_empty() {
51                continue;
52            }
53            let id = self.prefixes.len() as u16;
54            self.prefix_map.insert(p.clone(), id);
55            self.prefixes.push(p);
56        }
57    }
58
59    pub fn get_info(&self, path: &Path) -> (u32, u16) {
60        let path_str = path.to_string_lossy();
61        *self.path_info.get(path_str.as_ref()).unwrap_or(&(0, 0))
62    }
63
64    pub fn serialize<W: Write + Seek>(&mut self, mut w: W) -> std::io::Result<()> {
65        let start_pos = w.stream_position()?;
66
67        // Write prefix count
68        w.write_all(&(self.prefixes.len() as u32).to_le_bytes())?;
69
70        // Write prefix table
71        for (i, p) in self.prefixes.iter().enumerate() {
72            w.write_all(&(i as u16).to_le_bytes())?;
73            w.write_all(&(p.len() as u16).to_le_bytes())?;
74            w.write_all(p.as_bytes())?;
75        }
76
77        // Align to 4 bytes for path entries
78        let current = w.stream_position()?;
79        let padding = (4 - (current % 4)) % 4;
80        for _ in 0..padding {
81            w.write_all(&[0])?;
82        }
83
84        // Write path entries and record their offsets
85        let paths: Vec<String> = self.path_info.keys().cloned().collect();
86        for path_str in paths {
87            let offset = (w.stream_position()? - start_pos) as u32;
88
89            // Find longest matching prefix
90            let mut best_prefix_id = 0u16;
91            let mut best_prefix_len = 0;
92
93            for (prefix, &id) in &self.prefix_map {
94                if path_str.starts_with(prefix) && prefix.len() > best_prefix_len {
95                    best_prefix_id = id;
96                    best_prefix_len = prefix.len();
97                }
98            }
99
100            let suffix = &path_str[best_prefix_len..];
101            w.write_all(&best_prefix_id.to_le_bytes())?;
102            w.write_all(&(suffix.len() as u16).to_le_bytes())?;
103            w.write_all(suffix.as_bytes())?;
104
105            self.path_info
106                .insert(path_str.clone(), (offset, path_str.len() as u16));
107        }
108
109        Ok(())
110    }
111}
112
113pub struct StringPoolReader<'a> {
114    data: &'a [u8],
115    prefixes: Vec<&'a [u8]>,
116}
117
118impl<'a> StringPoolReader<'a> {
119    pub fn new(data: &'a [u8]) -> Result<Self> {
120        if data.len() < 4 {
121            return Err(Error::StringPoolOutOfBounds);
122        }
123        let prefix_count = data[0..4]
124            .try_into()
125            .ok()
126            .map(u32::from_le_bytes)
127            .unwrap_or(0) as usize;
128        let mut prefixes = Vec::with_capacity(prefix_count);
129        let mut pos = 4;
130
131        for _ in 0..prefix_count {
132            if pos + 4 > data.len() {
133                return Err(Error::StringPoolOutOfBounds);
134            }
135            let _id = data[pos..pos + 2]
136                .try_into()
137                .ok()
138                .map(u16::from_le_bytes)
139                .unwrap_or(0);
140            let len = data[pos + 2..pos + 4]
141                .try_into()
142                .ok()
143                .map(u16::from_le_bytes)
144                .unwrap_or(0) as usize;
145            pos += 4;
146            if pos + len > data.len() {
147                return Err(Error::StringPoolOutOfBounds);
148            }
149            prefixes.push(&data[pos..pos + len]);
150            pos += len;
151        }
152
153        Ok(Self { data, prefixes })
154    }
155
156    pub fn resolve(&self, offset: u32) -> Result<String> {
157        let pos = offset as usize;
158        if pos + 4 > self.data.len() {
159            return Err(Error::StringPoolOutOfBounds);
160        }
161
162        let prefix_id = self.data[pos..pos + 2]
163            .try_into()
164            .ok()
165            .map(u16::from_le_bytes)
166            .unwrap_or(0) as usize;
167        let suffix_len = self.data[pos + 2..pos + 4]
168            .try_into()
169            .ok()
170            .map(u16::from_le_bytes)
171            .unwrap_or(0) as usize;
172
173        if prefix_id >= self.prefixes.len() {
174            return Err(Error::StringPoolOutOfBounds);
175        }
176
177        let prefix = self.prefixes[prefix_id];
178        let suffix_pos = pos + 4;
179        if suffix_pos + suffix_len > self.data.len() {
180            return Err(Error::StringPoolOutOfBounds);
181        }
182        let suffix = &self.data[suffix_pos..suffix_pos + suffix_len];
183
184        let mut res = String::with_capacity(prefix.len() + suffix.len());
185        res.push_str(std::str::from_utf8(prefix).map_err(|_| Error::InvalidPath)?);
186        res.push_str(std::str::from_utf8(suffix).map_err(|_| Error::InvalidPath)?);
187
188        Ok(res)
189    }
190}
191
192#[cfg(test)]
193mod tests {
194    use super::*;
195    use std::io::Cursor;
196
197    #[test]
198    fn roundtrip() {
199        let mut pool = StringPool::new();
200        pool.set_prefixes(vec!["/home/user/".to_string(), "/var/log/".to_string()]);
201        pool.add_path(Path::new("/home/user/file.rs"));
202        pool.add_path(Path::new("/var/log/syslog"));
203        pool.add_path(Path::new("/other/path"));
204
205        let mut buf = Cursor::new(Vec::new());
206        pool.serialize(&mut buf).unwrap();
207
208        let data = buf.into_inner();
209        let reader = StringPoolReader::new(&data).unwrap();
210
211        let (off1, _) = pool.get_info(Path::new("/home/user/file.rs"));
212        assert_eq!(reader.resolve(off1).unwrap(), "/home/user/file.rs");
213
214        let (off2, _) = pool.get_info(Path::new("/var/log/syslog"));
215        assert_eq!(reader.resolve(off2).unwrap(), "/var/log/syslog");
216
217        let (off3, _) = pool.get_info(Path::new("/other/path"));
218        assert_eq!(reader.resolve(off3).unwrap(), "/other/path");
219    }
220}