Skip to main content

ix/
string_pool.rs

1//! Path string pool with prefix deduplication.
2//!
3//! Saves space by storing common directory prefixes once.
4
5use crate::error::{Error, Result};
6use std::collections::HashMap;
7use std::io::{Seek, Write};
8use std::path::Path;
9
10/// Path string pool with prefix deduplication.
11///
12/// Stores common directory prefixes and stores only the suffix for each path.
13pub struct StringPool {
14    prefixes: Vec<String>,
15    prefix_map: HashMap<String, u16>,
16    path_info: HashMap<String, (u32, u16)>,
17}
18
19impl Default for StringPool {
20    fn default() -> Self {
21        Self::new()
22    }
23}
24
25impl StringPool {
26    /// Create an empty string pool with a default empty-string prefix.
27    #[must_use]
28    pub fn new() -> Self {
29        let prefixes = vec![String::new()];
30        let mut prefix_map = HashMap::new();
31        prefix_map.insert(String::new(), 0);
32
33        Self {
34            prefixes,
35            prefix_map,
36            path_info: HashMap::new(),
37        }
38    }
39
40    /// Add a path to the pool. During indexing, we just collect unique paths.
41    ///
42    /// Real prefix deduplication happens during serialization or via pre-added prefixes.
43    pub fn add_path(&mut self, path: &Path) {
44        let path_str = path.to_string_lossy().to_string();
45        self.path_info.entry(path_str).or_insert((0, 0));
46    }
47
48    /// Set the prefixes to be used for deduplication.
49    pub fn set_prefixes(&mut self, prefixes: Vec<String>) {
50        self.prefixes = vec![String::new()];
51        self.prefix_map = HashMap::new();
52        self.prefix_map.insert(String::new(), 0);
53
54        for p in prefixes {
55            if p.is_empty() {
56                continue;
57            }
58            let id = u16::try_from(self.prefixes.len()).unwrap_or(0);
59            self.prefix_map.insert(p.clone(), id);
60            self.prefixes.push(p);
61        }
62    }
63
64    /// Get the serialized offset and total length info for a path.
65    #[must_use]
66    pub fn get_info(&self, path: &Path) -> (u32, u16) {
67        let path_str = path.to_string_lossy();
68        *self.path_info.get(path_str.as_ref()).unwrap_or(&(0, 0))
69    }
70
71    /// Serialize the string pool to the writer.
72    ///
73    /// # Errors
74    ///
75    /// Returns an I/O error if the writer fails.
76    #[allow(clippy::as_conversions)] // binary format: suffix/filename len fits in u16
77    pub fn serialize<W: Write + Seek>(&mut self, mut w: W) -> std::io::Result<()> {
78        let start_pos = w.stream_position()?;
79
80        w.write_all(
81            &u32::try_from(self.prefixes.len())
82                .unwrap_or(0)
83                .to_le_bytes(),
84        )?;
85
86        for (i, p) in self.prefixes.iter().enumerate() {
87            w.write_all(&u16::try_from(i).unwrap_or(0).to_le_bytes())?;
88            w.write_all(&u16::try_from(p.len()).unwrap_or(0).to_le_bytes())?;
89            w.write_all(p.as_bytes())?;
90        }
91
92        let current = w.stream_position()?;
93        let padding = (4 - (current % 4)) % 4;
94        for _ in 0..padding {
95            w.write_all(&[0])?;
96        }
97
98        let paths: Vec<String> = self.path_info.keys().cloned().collect();
99        for path_str in paths {
100            let offset = u32::try_from(w.stream_position()? - start_pos).unwrap_or(0);
101
102            let mut best_prefix_id = 0u16;
103            let mut best_prefix_len = 0;
104
105            for (prefix, &id) in &self.prefix_map {
106                if path_str.starts_with(prefix) && prefix.len() > best_prefix_len {
107                    best_prefix_id = id;
108                    best_prefix_len = prefix.len();
109                }
110            }
111
112            let suffix = path_str.get(best_prefix_len..).unwrap_or("");
113            w.write_all(&best_prefix_id.to_le_bytes())?;
114            w.write_all(&(suffix.len() as u16).to_le_bytes())?;
115            w.write_all(suffix.as_bytes())?;
116
117            self.path_info
118                .insert(path_str.clone(), (offset, path_str.len() as u16));
119        }
120
121        Ok(())
122    }
123}
124
125/// Read-only reader for a serialized string pool.
126pub struct StringPoolReader<'a> {
127    data: &'a [u8],
128    prefixes: Vec<&'a [u8]>,
129}
130
131impl<'a> StringPoolReader<'a> {
132    /// Create a new reader from serialized string pool data.
133    ///
134    /// # Errors
135    ///
136    /// Returns an error if the data is too small or the prefix table is
137    /// truncated/invalid.
138    #[allow(clippy::as_conversions)] // binary format: usize quantities within u16 range
139    #[allow(clippy::indexing_slicing)] // length checks guarantee valid range
140    pub fn new(data: &'a [u8]) -> Result<Self> {
141        if data.len() < 4 {
142            return Err(Error::StringPoolOutOfBounds);
143        }
144        let prefix_count = data[0..4].try_into().ok().map_or(0, u32::from_le_bytes) as usize;
145        let mut prefixes = Vec::with_capacity(prefix_count);
146        let mut pos = 4;
147
148        for _ in 0..prefix_count {
149            if pos + 4 > data.len() {
150                return Err(Error::StringPoolOutOfBounds);
151            }
152            let _id = data[pos..pos + 2]
153                .try_into()
154                .ok()
155                .map_or(0, u16::from_le_bytes);
156            let len = data[pos + 2..pos + 4]
157                .try_into()
158                .ok()
159                .map_or(0, u16::from_le_bytes) as usize;
160            pos += 4;
161            if pos + len > data.len() {
162                return Err(Error::StringPoolOutOfBounds);
163            }
164            prefixes.push(&data[pos..pos + len]);
165            pos += len;
166        }
167
168        Ok(Self { data, prefixes })
169    }
170
171    /// Resolve a string from the pool at the given offset.
172    ///
173    /// # Errors
174    ///
175    /// Returns an error if the offset is out of bounds, prefix ID is invalid,
176    /// or the stored bytes are not valid UTF-8.
177    #[allow(clippy::as_conversions)] // binary format: offset/size within u16 range
178    #[allow(clippy::indexing_slicing)] // length checks above guarantee valid range
179    pub fn resolve(&self, offset: u32) -> Result<String> {
180        let pos = offset as usize;
181        if pos + 4 > self.data.len() {
182            return Err(Error::StringPoolOutOfBounds);
183        }
184
185        let prefix_id = self.data[pos..pos + 2]
186            .try_into()
187            .ok()
188            .map_or(0, u16::from_le_bytes) as usize;
189        let suffix_len = self.data[pos + 2..pos + 4]
190            .try_into()
191            .ok()
192            .map_or(0, u16::from_le_bytes) as usize;
193
194        if prefix_id >= self.prefixes.len() {
195            return Err(Error::StringPoolOutOfBounds);
196        }
197
198        let prefix = self.prefixes[prefix_id];
199        let suffix_pos = pos + 4;
200        if suffix_pos + suffix_len > self.data.len() {
201            return Err(Error::StringPoolOutOfBounds);
202        }
203        let suffix = &self.data[suffix_pos..suffix_pos + suffix_len];
204
205        let mut res = String::with_capacity(prefix.len() + suffix.len());
206        res.push_str(std::str::from_utf8(prefix).map_err(|_| Error::InvalidPath)?);
207        res.push_str(std::str::from_utf8(suffix).map_err(|_| Error::InvalidPath)?);
208
209        Ok(res)
210    }
211}
212
213#[cfg(test)]
214#[allow(clippy::as_conversions, clippy::unwrap_used, clippy::indexing_slicing)]
215mod tests {
216    use super::*;
217    use std::io::Cursor;
218
219    #[test]
220    fn roundtrip() {
221        let mut pool = StringPool::new();
222        pool.set_prefixes(vec!["/home/user/".to_string(), "/var/log/".to_string()]);
223        pool.add_path(Path::new("/home/user/file.rs"));
224        pool.add_path(Path::new("/var/log/syslog"));
225        pool.add_path(Path::new("/other/path"));
226
227        let mut buf = Cursor::new(Vec::new());
228        pool.serialize(&mut buf).unwrap();
229
230        let data = buf.into_inner();
231        let reader = StringPoolReader::new(&data).unwrap();
232
233        let (off1, _) = pool.get_info(Path::new("/home/user/file.rs"));
234        assert_eq!(reader.resolve(off1).unwrap(), "/home/user/file.rs");
235
236        let (off2, _) = pool.get_info(Path::new("/var/log/syslog"));
237        assert_eq!(reader.resolve(off2).unwrap(), "/var/log/syslog");
238
239        let (off3, _) = pool.get_info(Path::new("/other/path"));
240        assert_eq!(reader.resolve(off3).unwrap(), "/other/path");
241    }
242}