1use std::io::{self, Read, Write, Seek, SeekFrom};
50use std::path::{Path, PathBuf};
51
/// Eight-byte signature that opens a serialized dictionary header
/// (`DictionarySerializationHeader`); reading rejects any other value.
const MAGIC: &[u8; 8] = b"SSHIDX01";

/// Eight-byte signature that opens an MPHF container header (`MphfContainerHeader`).
/// The older `SSHIMH01` signature is recognized on read only to report that the
/// index must be rebuilt.
const MPHF_MAGIC: &[u8; 8] = b"SSHIMH02";

/// `(major, minor)` format version stamped into new dictionary headers.
/// Reading fails when the stored major version differs from this one.
const FORMAT_VERSION: (u32, u32) = (3, 0);
/// `(major, minor)` format version stamped into new MPHF container headers.
/// Reading fails when the stored major version differs from this one.
const MPHF_FORMAT_VERSION: (u32, u32) = (2, 0);
62
63#[derive(Clone, Debug)]
65pub struct DictionarySerializationHeader {
66 pub magic: [u8; 8],
68 pub version_major: u32,
70 pub version_minor: u32,
72 pub k: usize,
74 pub m: usize,
76 pub canonical: bool,
78 pub num_mphf_partitions: u32,
80}
81
82impl DictionarySerializationHeader {
83 pub fn new(k: usize, m: usize, canonical: bool, num_mphf_partitions: u32) -> Self {
85 Self {
86 magic: *MAGIC,
87 version_major: FORMAT_VERSION.0,
88 version_minor: FORMAT_VERSION.1,
89 k,
90 m,
91 canonical,
92 num_mphf_partitions,
93 }
94 }
95
96 pub fn write(&self, writer: &mut dyn Write) -> io::Result<()> {
98 writer.write_all(&self.magic)?;
99 writer.write_all(&self.version_major.to_le_bytes())?;
100 writer.write_all(&self.version_minor.to_le_bytes())?;
101 writer.write_all(&(self.k as u64).to_le_bytes())?;
102 writer.write_all(&(self.m as u64).to_le_bytes())?;
103 writer.write_all(&[self.canonical as u8])?;
104 writer.write_all(&self.num_mphf_partitions.to_le_bytes())?;
105 Ok(())
106 }
107
108 pub fn read(reader: &mut dyn Read) -> io::Result<Self> {
110 let mut magic = [0u8; 8];
111 reader.read_exact(&mut magic)?;
112
113 if &magic != MAGIC {
114 return Err(io::Error::new(
115 io::ErrorKind::InvalidData,
116 "Invalid magic number for SSHash index file",
117 ));
118 }
119
120 let mut version_major_bytes = [0u8; 4];
121 let mut version_minor_bytes = [0u8; 4];
122 let mut k_bytes = [0u8; 8];
123 let mut m_bytes = [0u8; 8];
124 let mut canonical_bytes = [0u8; 1];
125 let mut num_partitions_bytes = [0u8; 4];
126
127 reader.read_exact(&mut version_major_bytes)?;
128 reader.read_exact(&mut version_minor_bytes)?;
129 reader.read_exact(&mut k_bytes)?;
130 reader.read_exact(&mut m_bytes)?;
131 reader.read_exact(&mut canonical_bytes)?;
132 reader.read_exact(&mut num_partitions_bytes)?;
133
134 let version_major = u32::from_le_bytes(version_major_bytes);
135 let version_minor = u32::from_le_bytes(version_minor_bytes);
136
137 if version_major != FORMAT_VERSION.0 {
138 return Err(io::Error::new(
139 io::ErrorKind::InvalidData,
140 format!(
141 "Incompatible format version: {}.{}, expected {}.{}",
142 version_major, version_minor, FORMAT_VERSION.0, FORMAT_VERSION.1
143 ),
144 ));
145 }
146
147 Ok(Self {
148 magic,
149 version_major,
150 version_minor,
151 k: u64::from_le_bytes(k_bytes) as usize,
152 m: u64::from_le_bytes(m_bytes) as usize,
153 canonical: canonical_bytes[0] != 0,
154 num_mphf_partitions: u32::from_le_bytes(num_partitions_bytes),
155 })
156 }
157}
158
/// One row of the MPHF container's offset table.
///
/// Serialized as 20 bytes, little-endian: `u32` partition id, `u64` byte
/// offset, `u64` byte size.
#[derive(Clone, Copy, Debug)]
pub struct MphfPartitionEntry {
    /// Index of the partition this row describes.
    pub partition_id: u32,
    /// Stream position at which the partition's serialized MPHF starts.
    pub byte_offset: u64,
    /// Length in bytes of the serialized MPHF; 0 when the partition has none.
    pub byte_size: u64,
}

impl MphfPartitionEntry {
    /// Writes the three fields to `writer`, one `write_all` call per field.
    fn write(&self, writer: &mut dyn Write) -> io::Result<()> {
        let id = self.partition_id.to_le_bytes();
        let offset = self.byte_offset.to_le_bytes();
        let size = self.byte_size.to_le_bytes();

        for field in &[&id[..], &offset[..], &size[..]] {
            writer.write_all(field)?;
        }
        Ok(())
    }

    /// Reads the three fields from `reader` in the order `write` emits them.
    fn read(reader: &mut dyn Read) -> io::Result<Self> {
        let mut raw_id = [0u8; 4];
        reader.read_exact(&mut raw_id)?;

        let mut raw_offset = [0u8; 8];
        reader.read_exact(&mut raw_offset)?;

        let mut raw_size = [0u8; 8];
        reader.read_exact(&mut raw_size)?;

        let partition_id = u32::from_le_bytes(raw_id);
        let byte_offset = u64::from_le_bytes(raw_offset);
        let byte_size = u64::from_le_bytes(raw_size);
        Ok(Self {
            partition_id,
            byte_offset,
            byte_size,
        })
    }
}
196
197#[derive(Clone, Debug)]
206pub struct MphfContainerHeader {
207 pub magic: [u8; 8],
209 pub version_major: u32,
211 pub version_minor: u32,
213 pub num_partitions: u32,
215}
216
217impl MphfContainerHeader {
218 pub fn new(num_partitions: u32) -> Self {
220 Self {
221 magic: *MPHF_MAGIC,
222 version_major: MPHF_FORMAT_VERSION.0,
223 version_minor: MPHF_FORMAT_VERSION.1,
224 num_partitions,
225 }
226 }
227
228 pub fn write(&self, writer: &mut dyn Write) -> io::Result<()> {
230 writer.write_all(&self.magic)?;
231 writer.write_all(&self.version_major.to_le_bytes())?;
232 writer.write_all(&self.version_minor.to_le_bytes())?;
233 writer.write_all(&self.num_partitions.to_le_bytes())?;
234 Ok(())
235 }
236
237 pub fn read(reader: &mut dyn Read) -> io::Result<Self> {
239 let mut magic = [0u8; 8];
240 reader.read_exact(&mut magic)?;
241
242 if &magic != MPHF_MAGIC {
243 if &magic == b"SSHIMH01" {
245 return Err(io::Error::new(
246 io::ErrorKind::InvalidData,
247 "MPHF container is v1 format (SSHIMH01). Please rebuild the index — v2 (PartitionedMphf) is required.",
248 ));
249 }
250 return Err(io::Error::new(
251 io::ErrorKind::InvalidData,
252 "Invalid magic number for SSHash MPHF container file",
253 ));
254 }
255
256 let mut version_major_bytes = [0u8; 4];
257 let mut version_minor_bytes = [0u8; 4];
258 let mut num_partitions_bytes = [0u8; 4];
259
260 reader.read_exact(&mut version_major_bytes)?;
261 reader.read_exact(&mut version_minor_bytes)?;
262 reader.read_exact(&mut num_partitions_bytes)?;
263
264 let version_major = u32::from_le_bytes(version_major_bytes);
265 let version_minor = u32::from_le_bytes(version_minor_bytes);
266
267 if version_major != MPHF_FORMAT_VERSION.0 {
268 return Err(io::Error::new(
269 io::ErrorKind::InvalidData,
270 format!(
271 "Incompatible MPHF format version: {}.{}, expected {}.{}",
272 version_major, version_minor, MPHF_FORMAT_VERSION.0, MPHF_FORMAT_VERSION.1
273 ),
274 ));
275 }
276
277 Ok(Self {
278 magic,
279 version_major,
280 version_minor,
281 num_partitions: u32::from_le_bytes(num_partitions_bytes),
282 })
283 }
284}
285
/// Returns the path of the index file derived from `base`.
///
/// * A path whose extension is already `ssi` is returned unchanged.
/// * A path without an extension (or with an empty one) gets `.ssi`.
/// * Any other extension is kept and `.ssi` is appended after it
///   (`reads.fa` becomes `reads.fa.ssi`).
///
/// The extension is handled as an `OsString`, so extensions that are not valid
/// UTF-8 are preserved byte-for-byte instead of being rewritten with
/// replacement characters. A path without a final file-name component (for
/// example an empty path) is returned unchanged, because there is nothing to
/// attach an extension to.
pub fn index_file_path<P: AsRef<Path>>(base: P) -> PathBuf {
    let mut path = base.as_ref().to_path_buf();
    let ext = path
        .extension()
        .map(|e| e.to_os_string())
        .unwrap_or_default();

    if ext == "ssi" {
        return path;
    }

    if ext.is_empty() {
        path.set_extension("ssi");
    } else {
        // Keep the existing extension and stack `.ssi` on top of it.
        let mut stacked = ext;
        stacked.push(".ssi");
        path.set_extension(stacked);
    }
    path
}
301
302pub fn mphf_container_path<P: AsRef<Path>>(base: P) -> PathBuf {
304 let base_path = index_file_path(base);
305 let mut container_path = base_path.clone();
306 let filename = format!("{}.mphf", base_path.file_name().unwrap().to_string_lossy());
307 container_path.pop();
308 container_path.push(filename);
309 container_path
310}
311
/// Error type for serialization routines: either a wrapped I/O failure or a
/// free-form message.
#[derive(Debug)]
pub enum SerializationError {
    /// An underlying I/O operation failed.
    Io(io::Error),
    /// Any other failure, described by the contained message.
    Other(String),
}

impl From<io::Error> for SerializationError {
    /// Wraps an I/O error so that `?` can be used on `io::Result` values.
    fn from(err: io::Error) -> Self {
        Self::Io(err)
    }
}

impl std::fmt::Display for SerializationError {
    /// I/O errors are rendered with an `IO error: ` prefix; other errors print
    /// their message verbatim.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Io(cause) => write!(f, "IO error: {}", cause),
            Self::Other(message) => f.write_str(message),
        }
    }
}

impl std::error::Error for SerializationError {}

/// Shorthand for results whose error type is [`SerializationError`].
pub type SerializationResult<T> = Result<T, SerializationError>;
340
341pub fn write_mphf_container<W: Write + Seek>(
352 writer: &mut W,
353 mphfs: &[Option<&crate::partitioned_mphf::PartitionedMphf>],
354) -> io::Result<Vec<MphfPartitionEntry>> {
355 let num_partitions = mphfs.len() as u32;
356
357 let header = MphfContainerHeader::new(num_partitions);
359 header.write(writer)?;
360
361 let mut offset_table = Vec::new();
363 let offset_table_start = writer.stream_position()?;
364
365 for i in 0..num_partitions {
367 let entry = MphfPartitionEntry {
368 partition_id: i,
369 byte_offset: 0, byte_size: 0, };
372 entry.write(writer)?;
373 }
374
375 let _data_start = writer.stream_position()?;
376
377 for (partition_id, mphf_opt) in mphfs.iter().enumerate() {
379 let byte_offset = writer.stream_position()?;
380
381 if let Some(pmphf) = mphf_opt {
382 let mut mphf_buffer = Vec::new();
384 pmphf.write_to(&mut mphf_buffer)?;
385 let byte_size = mphf_buffer.len() as u64;
386
387 writer.write_all(&mphf_buffer)?;
389
390 offset_table.push(MphfPartitionEntry {
392 partition_id: partition_id as u32,
393 byte_offset,
394 byte_size,
395 });
396 } else {
397 offset_table.push(MphfPartitionEntry {
399 partition_id: partition_id as u32,
400 byte_offset,
401 byte_size: 0,
402 });
403 }
404 }
405
406 writer.seek(SeekFrom::Start(offset_table_start))?;
408 for entry in &offset_table {
409 entry.write(writer)?;
410 }
411
412 writer.seek(SeekFrom::End(0))?;
414
415 Ok(offset_table)
416}
417
418pub fn read_mphf_container<R: Read + Seek>(
422 reader: &mut R,
423) -> io::Result<Vec<Option<crate::partitioned_mphf::PartitionedMphf>>> {
424 let header = MphfContainerHeader::read(reader)?;
426
427 let mut offset_table = Vec::with_capacity(header.num_partitions as usize);
429 for _ in 0..header.num_partitions {
430 offset_table.push(MphfPartitionEntry::read(reader)?);
431 }
432
433 let mut mphfs: Vec<Option<crate::partitioned_mphf::PartitionedMphf>> =
435 (0..header.num_partitions).map(|_| None).collect();
436
437 for entry in offset_table {
438 if entry.byte_size > 0 {
439 reader.seek(SeekFrom::Start(entry.byte_offset))?;
440 let pmphf = crate::partitioned_mphf::PartitionedMphf::read_from(reader)?;
441 mphfs[entry.partition_id as usize] = Some(pmphf);
442 }
443 }
444
445 Ok(mphfs)
446}
447
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes a dictionary header and returns the raw bytes.
    fn dictionary_header_bytes() -> Vec<u8> {
        let mut buffer = Vec::new();
        DictionarySerializationHeader::new(31, 13, true, 2)
            .write(&mut buffer)
            .unwrap();
        buffer
    }

    #[test]
    fn test_header_roundtrip() {
        let header = DictionarySerializationHeader::new(31, 13, true, 2);

        let mut buffer = Vec::new();
        header.write(&mut buffer).unwrap();

        let header2 = DictionarySerializationHeader::read(&mut buffer.as_slice()).unwrap();

        assert_eq!(header.magic, header2.magic);
        assert_eq!(header.version_major, header2.version_major);
        assert_eq!(header.version_minor, header2.version_minor);
        assert_eq!(header.k, header2.k);
        assert_eq!(header.m, header2.m);
        assert_eq!(header.canonical, header2.canonical);
        assert_eq!(header.num_mphf_partitions, header2.num_mphf_partitions);
    }

    #[test]
    fn test_header_rejects_bad_magic() {
        let mut buffer = dictionary_header_bytes();
        buffer[0] ^= 0xff;

        let err = DictionarySerializationHeader::read(&mut buffer.as_slice()).unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
    }

    #[test]
    fn test_header_rejects_unknown_major_version() {
        let mut buffer = dictionary_header_bytes();
        // The major version is the first u32 after the 8-byte magic.
        buffer[8..12].copy_from_slice(&(FORMAT_VERSION.0 + 1).to_le_bytes());

        let err = DictionarySerializationHeader::read(&mut buffer.as_slice()).unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
    }

    #[test]
    fn test_header_rejects_truncated_input() {
        let buffer = dictionary_header_bytes();

        let err = DictionarySerializationHeader::read(&mut &buffer[..buffer.len() - 1])
            .unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof);
    }

    #[test]
    fn test_mphf_container_header_roundtrip() {
        let header = MphfContainerHeader::new(5);
        let mut buffer = Vec::new();
        header.write(&mut buffer).unwrap();

        let header2 = MphfContainerHeader::read(&mut buffer.as_slice()).unwrap();
        assert_eq!(header.magic, header2.magic);
        assert_eq!(header.version_major, header2.version_major);
        assert_eq!(header.version_minor, header2.version_minor);
        assert_eq!(header.num_partitions, header2.num_partitions);
    }

    #[test]
    fn test_mphf_container_header_reports_v1_format() {
        let mut buffer = Vec::new();
        MphfContainerHeader::new(5).write(&mut buffer).unwrap();
        buffer[..8].copy_from_slice(b"SSHIMH01");

        let err = MphfContainerHeader::read(&mut buffer.as_slice()).unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
        assert!(err.to_string().contains("SSHIMH01"));
    }

    #[test]
    fn test_mphf_partition_entry_roundtrip() {
        let entry = MphfPartitionEntry {
            partition_id: 3,
            byte_offset: 1024,
            byte_size: 512,
        };

        let mut buffer = Vec::new();
        entry.write(&mut buffer).unwrap();
        // 4-byte id + 8-byte offset + 8-byte size.
        assert_eq!(buffer.len(), 20);

        let entry2 = MphfPartitionEntry::read(&mut buffer.as_slice()).unwrap();
        assert_eq!(entry.partition_id, entry2.partition_id);
        assert_eq!(entry.byte_offset, entry2.byte_offset);
        assert_eq!(entry.byte_size, entry2.byte_size);
    }

    #[test]
    fn test_file_path_construction() {
        let base = Path::new("/tmp/my_index");

        // Exact comparisons: substring checks would also accept malformed paths.
        assert_eq!(index_file_path(base), PathBuf::from("/tmp/my_index.ssi"));
        assert_eq!(
            index_file_path("/tmp/my_index.ssi"),
            PathBuf::from("/tmp/my_index.ssi")
        );
        assert_eq!(
            index_file_path("/tmp/reads.fa"),
            PathBuf::from("/tmp/reads.fa.ssi")
        );

        // Equality with the exact path also rules out any suffix after `.mphf`.
        assert_eq!(
            mphf_container_path(base),
            PathBuf::from("/tmp/my_index.ssi.mphf")
        );
        assert_eq!(
            mphf_container_path("/tmp/reads.fa"),
            PathBuf::from("/tmp/reads.fa.ssi.mphf")
        );
    }
}