Skip to main content

ix/
string_pool.rs

1//! Path string pool with prefix deduplication.
2//!
3//! Saves space by storing common directory prefixes once.
4
5use crate::error::{Error, Result};
6use std::collections::HashMap;
7use std::io::{Seek, Write};
8use std::path::Path;
9
10/// Path string pool with prefix deduplication.
11///
12/// Stores common directory prefixes and stores only the suffix for each path.
13pub struct StringPool {
14    prefixes: Vec<String>,
15    prefix_map: HashMap<String, u16>,
16    path_info: HashMap<String, (u32, u16)>,
17}
18
19impl Default for StringPool {
20    fn default() -> Self {
21        Self::new()
22    }
23}
24
25impl StringPool {
26    /// Create an empty string pool with a default empty-string prefix.
27    #[must_use]
28    pub fn new() -> Self {
29        let prefixes = vec![String::new()];
30        let mut prefix_map = HashMap::new();
31        prefix_map.insert(String::new(), 0);
32
33        Self {
34            prefixes,
35            prefix_map,
36            path_info: HashMap::new(),
37        }
38    }
39
40    /// Add a path to the pool. During indexing, we just collect unique paths.
41    ///
42    /// Real prefix deduplication happens during serialization or via pre-added prefixes.
43    pub fn add_path(&mut self, path: &Path) {
44        let path_str = path.to_string_lossy().to_string();
45        self.path_info.entry(path_str).or_insert((0, 0));
46    }
47
48    /// Set the prefixes to be used for deduplication.
49    pub fn set_prefixes(&mut self, prefixes: Vec<String>) {
50        self.prefixes = vec![String::new()];
51        self.prefix_map = HashMap::new();
52        self.prefix_map.insert(String::new(), 0);
53
54        for p in prefixes {
55            if p.is_empty() {
56                continue;
57            }
58            let id = u16::try_from(self.prefixes.len()).unwrap_or(0);
59            self.prefix_map.insert(p.clone(), id);
60            self.prefixes.push(p);
61        }
62    }
63
64    /// Get the serialized offset and total length info for a path.
65    #[must_use]
66    pub fn get_info(&self, path: &Path) -> (u32, u16) {
67        let path_str = path.to_string_lossy();
68        *self.path_info.get(path_str.as_ref()).unwrap_or(&(0, 0))
69    }
70
71    /// Serialize the string pool to the writer.
72    ///
73    /// # Errors
74    ///
75    /// Returns an I/O error if the writer fails.
76    #[allow(clippy::as_conversions)] // binary format: suffix/filename len fits in u16
77    pub fn serialize<W: Write + Seek>(&mut self, mut w: W) -> std::io::Result<()> {
78        let start_pos = w.stream_position()?;
79
80        w.write_all(&u32::try_from(self.prefixes.len()).unwrap_or(0).to_le_bytes())?;
81
82        for (i, p) in self.prefixes.iter().enumerate() {
83            w.write_all(&u16::try_from(i).unwrap_or(0).to_le_bytes())?;
84            w.write_all(&u16::try_from(p.len()).unwrap_or(0).to_le_bytes())?;
85            w.write_all(p.as_bytes())?;
86        }
87
88        let current = w.stream_position()?;
89        let padding = (4 - (current % 4)) % 4;
90        for _ in 0..padding {
91            w.write_all(&[0])?;
92        }
93
94        let paths: Vec<String> = self.path_info.keys().cloned().collect();
95        for path_str in paths {
96            let offset = u32::try_from(w.stream_position()? - start_pos).unwrap_or(0);
97
98            let mut best_prefix_id = 0u16;
99            let mut best_prefix_len = 0;
100
101            for (prefix, &id) in &self.prefix_map {
102                if path_str.starts_with(prefix) && prefix.len() > best_prefix_len {
103                    best_prefix_id = id;
104                    best_prefix_len = prefix.len();
105                }
106            }
107
108            let suffix = path_str.get(best_prefix_len..).unwrap_or("");
109            w.write_all(&best_prefix_id.to_le_bytes())?;
110            w.write_all(&(suffix.len() as u16).to_le_bytes())?;
111            w.write_all(suffix.as_bytes())?;
112
113            self.path_info
114                .insert(path_str.clone(), (offset, path_str.len() as u16));
115        }
116
117        Ok(())
118    }
119}
120
121/// Read-only reader for a serialized string pool.
122pub struct StringPoolReader<'a> {
123    data: &'a [u8],
124    prefixes: Vec<&'a [u8]>,
125}
126
127impl<'a> StringPoolReader<'a> {
128    /// Create a new reader from serialized string pool data.
129    ///
130    /// # Errors
131    ///
132    /// Returns an error if the data is too small or the prefix table is
133    /// truncated/invalid.
134    #[allow(clippy::as_conversions)] // binary format: usize quantities within u16 range
135    #[allow(clippy::indexing_slicing)] // length checks guarantee valid range
136    pub fn new(data: &'a [u8]) -> Result<Self> {
137        if data.len() < 4 {
138            return Err(Error::StringPoolOutOfBounds);
139        }
140        let prefix_count = data[0..4]
141            .try_into()
142            .ok()
143            .map_or(0, u32::from_le_bytes) as usize;
144        let mut prefixes = Vec::with_capacity(prefix_count);
145        let mut pos = 4;
146
147        for _ in 0..prefix_count {
148            if pos + 4 > data.len() {
149                return Err(Error::StringPoolOutOfBounds);
150            }
151            let _id = data[pos..pos + 2]
152                .try_into()
153                .ok()
154                .map_or(0, u16::from_le_bytes);
155            let len = data[pos + 2..pos + 4]
156                .try_into()
157                .ok()
158                .map_or(0, u16::from_le_bytes) as usize;
159            pos += 4;
160            if pos + len > data.len() {
161                return Err(Error::StringPoolOutOfBounds);
162            }
163            prefixes.push(&data[pos..pos + len]);
164            pos += len;
165        }
166
167        Ok(Self { data, prefixes })
168    }
169
170    /// Resolve a string from the pool at the given offset.
171    ///
172    /// # Errors
173    ///
174    /// Returns an error if the offset is out of bounds, prefix ID is invalid,
175    /// or the stored bytes are not valid UTF-8.
176    #[allow(clippy::as_conversions)] // binary format: offset/size within u16 range
177    #[allow(clippy::indexing_slicing)] // length checks above guarantee valid range
178    pub fn resolve(&self, offset: u32) -> Result<String> {
179        let pos = offset as usize;
180        if pos + 4 > self.data.len() {
181            return Err(Error::StringPoolOutOfBounds);
182        }
183
184        let prefix_id = self.data[pos..pos + 2]
185            .try_into()
186            .ok()
187            .map_or(0, u16::from_le_bytes) as usize;
188        let suffix_len = self.data[pos + 2..pos + 4]
189            .try_into()
190            .ok()
191            .map_or(0, u16::from_le_bytes) as usize;
192
193        if prefix_id >= self.prefixes.len() {
194            return Err(Error::StringPoolOutOfBounds);
195        }
196
197        let prefix = self.prefixes[prefix_id];
198        let suffix_pos = pos + 4;
199        if suffix_pos + suffix_len > self.data.len() {
200            return Err(Error::StringPoolOutOfBounds);
201        }
202        let suffix = &self.data[suffix_pos..suffix_pos + suffix_len];
203
204        let mut res = String::with_capacity(prefix.len() + suffix.len());
205        res.push_str(std::str::from_utf8(prefix).map_err(|_| Error::InvalidPath)?);
206        res.push_str(std::str::from_utf8(suffix).map_err(|_| Error::InvalidPath)?);
207
208        Ok(res)
209    }
210}
211
212#[cfg(test)]
213#[allow(clippy::as_conversions, clippy::unwrap_used, clippy::indexing_slicing)]
214mod tests {
215    use super::*;
216    use std::io::Cursor;
217
218    #[test]
219    fn roundtrip() {
220        let mut pool = StringPool::new();
221        pool.set_prefixes(vec!["/home/user/".to_string(), "/var/log/".to_string()]);
222        pool.add_path(Path::new("/home/user/file.rs"));
223        pool.add_path(Path::new("/var/log/syslog"));
224        pool.add_path(Path::new("/other/path"));
225
226        let mut buf = Cursor::new(Vec::new());
227        pool.serialize(&mut buf).unwrap();
228
229        let data = buf.into_inner();
230        let reader = StringPoolReader::new(&data).unwrap();
231
232        let (off1, _) = pool.get_info(Path::new("/home/user/file.rs"));
233        assert_eq!(reader.resolve(off1).unwrap(), "/home/user/file.rs");
234
235        let (off2, _) = pool.get_info(Path::new("/var/log/syslog"));
236        assert_eq!(reader.resolve(off2).unwrap(), "/var/log/syslog");
237
238        let (off3, _) = pool.get_info(Path::new("/other/path"));
239        assert_eq!(reader.resolve(off3).unwrap(), "/other/path");
240    }
241}