1use std::io::{self, Read, Write, Seek, SeekFrom};
50use std::path::{Path, PathBuf};
51
/// Magic bytes stored in the `magic` field of a `DictionarySerializationHeader`.
/// `DictionarySerializationHeader::read` rejects input that does not start with them.
const MAGIC: &[u8; 8] = b"SSHIDX01";

/// Magic bytes stored in the `magic` field of an `MphfContainerHeader`.
/// `MphfContainerHeader::read` rejects input that does not start with them.
const MPHF_MAGIC: &[u8; 8] = b"SSHIMH01";

/// `(major, minor)` version stamped into newly built index headers. On read,
/// only the major component is compared; a differing minor is accepted.
const FORMAT_VERSION: (u32, u32) = (3, 0);
/// `(major, minor)` version stamped into newly built MPHF container headers.
/// On read, only the major component is compared; a differing minor is accepted.
const MPHF_FORMAT_VERSION: (u32, u32) = (1, 0);
62
63#[derive(Clone, Debug)]
65pub struct DictionarySerializationHeader {
66 pub magic: [u8; 8],
68 pub version_major: u32,
70 pub version_minor: u32,
72 pub k: usize,
74 pub m: usize,
76 pub canonical: bool,
78 pub num_mphf_partitions: u32,
80}
81
82impl DictionarySerializationHeader {
83 pub fn new(k: usize, m: usize, canonical: bool, num_mphf_partitions: u32) -> Self {
85 Self {
86 magic: *MAGIC,
87 version_major: FORMAT_VERSION.0,
88 version_minor: FORMAT_VERSION.1,
89 k,
90 m,
91 canonical,
92 num_mphf_partitions,
93 }
94 }
95
96 pub fn write(&self, writer: &mut dyn Write) -> io::Result<()> {
98 writer.write_all(&self.magic)?;
99 writer.write_all(&self.version_major.to_le_bytes())?;
100 writer.write_all(&self.version_minor.to_le_bytes())?;
101 writer.write_all(&(self.k as u64).to_le_bytes())?;
102 writer.write_all(&(self.m as u64).to_le_bytes())?;
103 writer.write_all(&[self.canonical as u8])?;
104 writer.write_all(&self.num_mphf_partitions.to_le_bytes())?;
105 Ok(())
106 }
107
108 pub fn read(reader: &mut dyn Read) -> io::Result<Self> {
110 let mut magic = [0u8; 8];
111 reader.read_exact(&mut magic)?;
112
113 if &magic != MAGIC {
114 return Err(io::Error::new(
115 io::ErrorKind::InvalidData,
116 "Invalid magic number for SSHash index file",
117 ));
118 }
119
120 let mut version_major_bytes = [0u8; 4];
121 let mut version_minor_bytes = [0u8; 4];
122 let mut k_bytes = [0u8; 8];
123 let mut m_bytes = [0u8; 8];
124 let mut canonical_bytes = [0u8; 1];
125 let mut num_partitions_bytes = [0u8; 4];
126
127 reader.read_exact(&mut version_major_bytes)?;
128 reader.read_exact(&mut version_minor_bytes)?;
129 reader.read_exact(&mut k_bytes)?;
130 reader.read_exact(&mut m_bytes)?;
131 reader.read_exact(&mut canonical_bytes)?;
132 reader.read_exact(&mut num_partitions_bytes)?;
133
134 let version_major = u32::from_le_bytes(version_major_bytes);
135 let version_minor = u32::from_le_bytes(version_minor_bytes);
136
137 if version_major != FORMAT_VERSION.0 {
138 return Err(io::Error::new(
139 io::ErrorKind::InvalidData,
140 format!(
141 "Incompatible format version: {}.{}, expected {}.{}",
142 version_major, version_minor, FORMAT_VERSION.0, FORMAT_VERSION.1
143 ),
144 ));
145 }
146
147 Ok(Self {
148 magic,
149 version_major,
150 version_minor,
151 k: u64::from_le_bytes(k_bytes) as usize,
152 m: u64::from_le_bytes(m_bytes) as usize,
153 canonical: canonical_bytes[0] != 0,
154 num_mphf_partitions: u32::from_le_bytes(num_partitions_bytes),
155 })
156 }
157}
158
/// One row of the offset table stored in an MPHF container.
///
/// Serialized as 20 little-endian bytes: `partition_id` (`u32`), then
/// `byte_offset` (`u64`), then `byte_size` (`u64`).
#[derive(Clone, Copy, Debug)]
pub struct MphfPartitionEntry {
    /// Index of the partition this row describes.
    pub partition_id: u32,
    /// Stream position at which the partition's serialized data starts.
    pub byte_offset: u64,
    /// Length of the partition's serialized data; `write_mphf_container`
    /// records 0 for an absent partition.
    pub byte_size: u64,
}

impl MphfPartitionEntry {
    /// Emits the three fields, in declaration order, as little-endian bytes.
    fn write(&self, writer: &mut dyn Write) -> io::Result<()> {
        let id = self.partition_id.to_le_bytes();
        let offset = self.byte_offset.to_le_bytes();
        let size = self.byte_size.to_le_bytes();

        for field in [&id[..], &offset[..], &size[..]] {
            writer.write_all(field)?;
        }
        Ok(())
    }

    /// Decodes one entry; fails with the reader's error (typically
    /// `UnexpectedEof`) when fewer than 20 bytes are available.
    fn read(reader: &mut dyn Read) -> io::Result<Self> {
        let mut raw_id = [0u8; 4];
        let mut raw_offset = [0u8; 8];
        let mut raw_size = [0u8; 8];

        reader.read_exact(&mut raw_id)?;
        reader.read_exact(&mut raw_offset)?;
        reader.read_exact(&mut raw_size)?;

        let partition_id = u32::from_le_bytes(raw_id);
        let byte_offset = u64::from_le_bytes(raw_offset);
        let byte_size = u64::from_le_bytes(raw_size);
        Ok(Self {
            partition_id,
            byte_offset,
            byte_size,
        })
    }
}
196
197#[derive(Clone, Debug)]
206pub struct MphfContainerHeader {
207 pub magic: [u8; 8],
209 pub version_major: u32,
211 pub version_minor: u32,
213 pub num_partitions: u32,
215}
216
217impl MphfContainerHeader {
218 pub fn new(num_partitions: u32) -> Self {
220 Self {
221 magic: *MPHF_MAGIC,
222 version_major: MPHF_FORMAT_VERSION.0,
223 version_minor: MPHF_FORMAT_VERSION.1,
224 num_partitions,
225 }
226 }
227
228 pub fn write(&self, writer: &mut dyn Write) -> io::Result<()> {
230 writer.write_all(&self.magic)?;
231 writer.write_all(&self.version_major.to_le_bytes())?;
232 writer.write_all(&self.version_minor.to_le_bytes())?;
233 writer.write_all(&self.num_partitions.to_le_bytes())?;
234 Ok(())
235 }
236
237 pub fn read(reader: &mut dyn Read) -> io::Result<Self> {
239 let mut magic = [0u8; 8];
240 reader.read_exact(&mut magic)?;
241
242 if &magic != MPHF_MAGIC {
243 return Err(io::Error::new(
244 io::ErrorKind::InvalidData,
245 "Invalid magic number for SSHash MPHF container file",
246 ));
247 }
248
249 let mut version_major_bytes = [0u8; 4];
250 let mut version_minor_bytes = [0u8; 4];
251 let mut num_partitions_bytes = [0u8; 4];
252
253 reader.read_exact(&mut version_major_bytes)?;
254 reader.read_exact(&mut version_minor_bytes)?;
255 reader.read_exact(&mut num_partitions_bytes)?;
256
257 let version_major = u32::from_le_bytes(version_major_bytes);
258 let version_minor = u32::from_le_bytes(version_minor_bytes);
259
260 if version_major != MPHF_FORMAT_VERSION.0 {
261 return Err(io::Error::new(
262 io::ErrorKind::InvalidData,
263 format!(
264 "Incompatible MPHF format version: {}.{}, expected {}.{}",
265 version_major, version_minor, MPHF_FORMAT_VERSION.0, MPHF_FORMAT_VERSION.1
266 ),
267 ));
268 }
269
270 Ok(Self {
271 magic,
272 version_major,
273 version_minor,
274 num_partitions: u32::from_le_bytes(num_partitions_bytes),
275 })
276 }
277}
278
/// Returns the path of the index file derived from `base`.
///
/// * `base` already ends in `.ssi` — returned unchanged.
/// * `base` has no extension (or an empty one) — `.ssi` becomes its extension.
/// * `base` has some other extension `ext` — `.ssi` is stacked on top, giving
///   `<stem>.<ext>.ssi`.
///
/// The extension is handled as an `OsString` throughout, so bytes that are not
/// valid UTF-8 are carried over verbatim instead of being replaced by U+FFFD.
/// A `base` without a final file-name component (e.g. `/` or `..`) is returned
/// unchanged, because there is no file name to attach an extension to.
pub fn index_file_path<P: AsRef<Path>>(base: P) -> PathBuf {
    let mut path = base.as_ref().to_path_buf();

    // Decide on the new extension first so the borrow of `path` taken by
    // `extension()` has ended before `set_extension` mutates it.
    let replacement = match path.extension() {
        Some(ext) if ext == "ssi" => None,
        Some(ext) if !ext.is_empty() => {
            let mut stacked = ext.to_os_string();
            stacked.push(".ssi");
            Some(stacked)
        }
        _ => Some(std::ffi::OsString::from("ssi")),
    };

    if let Some(ext) = replacement {
        path.set_extension(ext);
    }
    path
}
294
295pub fn mphf_container_path<P: AsRef<Path>>(base: P) -> PathBuf {
297 let base_path = index_file_path(base);
298 let mut container_path = base_path.clone();
299 let filename = format!("{}.mphf", base_path.file_name().unwrap().to_string_lossy());
300 container_path.pop();
301 container_path.push(filename);
302 container_path
303}
304
/// Error type for serialization routines that can fail for reasons other than
/// plain I/O.
#[derive(Debug)]
pub enum SerializationError {
    /// An underlying I/O operation failed.
    Io(io::Error),
    /// Any other failure, described by a free-form message.
    Other(String),
}

/// Lets `?` lift an `io::Error` into a `SerializationError::Io`.
impl From<io::Error> for SerializationError {
    fn from(err: io::Error) -> Self {
        SerializationError::Io(err)
    }
}

impl std::fmt::Display for SerializationError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            SerializationError::Io(e) => write!(f, "IO error: {}", e),
            SerializationError::Other(s) => write!(f, "{}", s),
        }
    }
}

impl std::error::Error for SerializationError {
    /// Exposes the wrapped `io::Error` so callers walking the error chain
    /// (or downcasting) can reach the original cause.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            SerializationError::Io(err) => Some(err),
            SerializationError::Other(_) => None,
        }
    }
}

/// Shorthand for results whose error type is [`SerializationError`].
pub type SerializationResult<T> = Result<T, SerializationError>;
333
334pub fn write_mphf_container<W: Write + Seek>(
345 writer: &mut W,
346 mphfs: &[Option<&crate::mphf_config::Mphf>],
347) -> io::Result<Vec<MphfPartitionEntry>> {
348 let num_partitions = mphfs.len() as u32;
349
350 let header = MphfContainerHeader::new(num_partitions);
352 header.write(writer)?;
353
354 let mut offset_table = Vec::new();
356 let offset_table_start = writer.stream_position()?;
357
358 for i in 0..num_partitions {
360 let entry = MphfPartitionEntry {
361 partition_id: i,
362 byte_offset: 0, byte_size: 0, };
365 entry.write(writer)?;
366 }
367
368 let _data_start = writer.stream_position()?;
369
370 for (partition_id, mphf_opt) in mphfs.iter().enumerate() {
372 let byte_offset = writer.stream_position()?;
373
374 if let Some(mphf) = mphf_opt {
375 let mut mphf_buffer = Vec::new();
377 mphf.write(&mut mphf_buffer)?;
378 let byte_size = mphf_buffer.len() as u64;
379
380 writer.write_all(&mphf_buffer)?;
382
383 offset_table.push(MphfPartitionEntry {
385 partition_id: partition_id as u32,
386 byte_offset,
387 byte_size,
388 });
389 } else {
390 offset_table.push(MphfPartitionEntry {
392 partition_id: partition_id as u32,
393 byte_offset,
394 byte_size: 0,
395 });
396 }
397 }
398
399 writer.seek(SeekFrom::Start(offset_table_start))?;
401 for entry in &offset_table {
402 entry.write(writer)?;
403 }
404
405 writer.seek(SeekFrom::End(0))?;
407
408 Ok(offset_table)
409}
410
411pub fn read_mphf_container<R: Read + Seek>(
415 reader: &mut R,
416) -> io::Result<Vec<Option<crate::mphf_config::Mphf>>> {
417 let header = MphfContainerHeader::read(reader)?;
419
420 let mut offset_table = Vec::with_capacity(header.num_partitions as usize);
422 for _ in 0..header.num_partitions {
423 offset_table.push(MphfPartitionEntry::read(reader)?);
424 }
425
426 let mut mphfs: Vec<Option<crate::mphf_config::Mphf>> = (0..header.num_partitions).map(|_| None).collect();
428
429 for entry in offset_table {
430 if entry.byte_size > 0 {
431 reader.seek(SeekFrom::Start(entry.byte_offset))?;
432 let mphf = crate::mphf_config::read_mphf(reader)?;
433 mphfs[entry.partition_id as usize] = Some(mphf);
434 }
435 }
436
437 Ok(mphfs)
438}
439
#[cfg(test)]
mod tests {
    use super::*;

    /// An index header written to memory decodes back to the same field values.
    #[test]
    fn test_header_roundtrip() {
        let original = DictionarySerializationHeader::new(31, 13, true, 2);

        let mut bytes = Vec::new();
        original.write(&mut bytes).unwrap();
        let decoded = DictionarySerializationHeader::read(&mut bytes.as_slice()).unwrap();

        assert_eq!(original.k, decoded.k);
        assert_eq!(original.m, decoded.m);
        assert_eq!(original.canonical, decoded.canonical);
        assert_eq!(original.num_mphf_partitions, decoded.num_mphf_partitions);
    }

    /// The partition count of a container header survives a write/read cycle.
    #[test]
    fn test_mphf_container_header_roundtrip() {
        let original = MphfContainerHeader::new(5);

        let mut bytes = Vec::new();
        original.write(&mut bytes).unwrap();
        let decoded = MphfContainerHeader::read(&mut bytes.as_slice()).unwrap();

        assert_eq!(original.num_partitions, decoded.num_partitions);
    }

    /// Every field of an offset-table entry survives a write/read cycle.
    #[test]
    fn test_mphf_partition_entry_roundtrip() {
        let original = MphfPartitionEntry {
            partition_id: 3,
            byte_offset: 1024,
            byte_size: 512,
        };

        let mut bytes = Vec::new();
        original.write(&mut bytes).unwrap();
        let decoded = MphfPartitionEntry::read(&mut bytes.as_slice()).unwrap();

        assert_eq!(original.partition_id, decoded.partition_id);
        assert_eq!(original.byte_offset, decoded.byte_offset);
        assert_eq!(original.byte_size, decoded.byte_size);
    }

    /// The derived paths carry the `.ssi` and `.ssi.mphf` suffixes, and the
    /// container path has no per-partition `.mphf.0` style suffix.
    #[test]
    fn test_file_path_construction() {
        let base = Path::new("/tmp/my_index");

        let index_path = index_file_path(base);
        assert!(index_path.to_string_lossy().ends_with("my_index.ssi"));

        let container_path = mphf_container_path(base);
        let container_text = container_path.to_string_lossy();
        assert!(container_text.contains("my_index.ssi.mphf"));
        assert!(!container_text.contains(".mphf.0"));
    }
}