Skip to main content

sshash_lib/
serialization.rs

1//! Serialization and deserialization support for Dictionary
2//!
3//! This module provides efficient zero-copy serialization using epserde for sux-rs types
4//! (BitFieldVec, etc.) combined with native pthash serialization for MPHF functions.
5//!
6//! # File Format
7//!
8//! The serialization uses a two-file approach:
9//!
10//! **Main Index File** (`index.ssi`):
11//! - DictionarySerializationHeader (magic, version, k, m, canonical, num_mphf_partitions)
12//! - SpectrumPreservingStringSet (epserde format)
13//! - SparseAndSkewIndex (epserde format, excluding MPHF)
14//!
15//! **MPHF Container File** (`index.ssi.mphf`):
16//! ```text
17//! MphfContainerHeader
18//!   ├─ magic: "SSHIMH01"
19//!   ├─ version_major: u32
20//!   ├─ version_minor: u32
21//!   └─ num_partitions: u32
22//! Offset Table ([num_partitions] entries):
23//!   ├─ MphfPartitionEntry 0
24//!   │  ├─ partition_id: u32
25//!   │  ├─ byte_offset: u64
26//!   │  └─ byte_size: u64
27//!   ├─ MphfPartitionEntry 1
28//!   └─ ...
29//! Data Section (variable length):
30//!   ├─ MPHF partition 0 (raw fmph::GOFunction serialization)
31//!   ├─ MPHF partition 1
32//!   └─ ...
33//! ```
34//!
35//! # Benefits of Single MPHF Container
36//!
37//! - **Scalability**: Works with 1 or 1000 partitions equally well (single file)
38//! - **Random access**: Offset table enables seeking to any partition
39//! - **Memory mappable**: Entire container can be mmap'd
40//! - **Efficient**: No per-file overhead, compact layout
41//! - **Clean separation**: MPHF container is independent binary format
42//!
43//! # Zero-Copy Deserialization
44//!
45//! When deserializing, sux-rs types are handled by epserde:
46//! - `BitFieldVec<Vec<usize>>` deserializes as `BitFieldVec<&[usize]>` (ε-copy)
47//! - The deserialized Dictionary can be memory-mapped for instant loading
48
49use std::io::{self, Read, Write, Seek, SeekFrom};
50use std::path::{Path, PathBuf};
51
/// Magic bytes for the SSHash index format
const MAGIC: &[u8; 8] = b"SSHIDX01";

/// Magic bytes for the SSHash MPHF container format
const MPHF_MAGIC: &[u8; 8] = b"SSHIMH01";

/// Main index file format version: (major, minor)
/// Increment major on breaking changes, minor on compatible changes.
/// Header readers in this module reject a file only when its major number differs.
const FORMAT_VERSION: (u32, u32) = (3, 0);

/// MPHF container file format version: (major, minor)
/// Versioned separately from the main index file; as with `FORMAT_VERSION`,
/// only a differing major number causes the container header reader to fail.
const MPHF_FORMAT_VERSION: (u32, u32) = (1, 0);
62
63/// Header for the serialized Dictionary
64#[derive(Clone, Debug)]
65pub struct DictionarySerializationHeader {
66    /// Magic number for format identification ("SSHIDX01")
67    pub magic: [u8; 8],
68    /// Format version (major, minor)
69    pub version_major: u32,
70    /// Format version minor number
71    pub version_minor: u32,
72    /// K-mer size
73    pub k: usize,
74    /// Minimizer size
75    pub m: usize,
76    /// Whether canonical mode is enabled
77    pub canonical: bool,
78    /// Number of MPHF partitions (for heavy buckets)
79    pub num_mphf_partitions: u32,
80}
81
82impl DictionarySerializationHeader {
83    /// Create a new header
84    pub fn new(k: usize, m: usize, canonical: bool, num_mphf_partitions: u32) -> Self {
85        Self {
86            magic: *MAGIC,
87            version_major: FORMAT_VERSION.0,
88            version_minor: FORMAT_VERSION.1,
89            k,
90            m,
91            canonical,
92            num_mphf_partitions,
93        }
94    }
95
96    /// Write header to a writer
97    pub fn write(&self, writer: &mut dyn Write) -> io::Result<()> {
98        writer.write_all(&self.magic)?;
99        writer.write_all(&self.version_major.to_le_bytes())?;
100        writer.write_all(&self.version_minor.to_le_bytes())?;
101        writer.write_all(&(self.k as u64).to_le_bytes())?;
102        writer.write_all(&(self.m as u64).to_le_bytes())?;
103        writer.write_all(&[self.canonical as u8])?;
104        writer.write_all(&self.num_mphf_partitions.to_le_bytes())?;
105        Ok(())
106    }
107
108    /// Read header from a reader
109    pub fn read(reader: &mut dyn Read) -> io::Result<Self> {
110        let mut magic = [0u8; 8];
111        reader.read_exact(&mut magic)?;
112
113        if &magic != MAGIC {
114            return Err(io::Error::new(
115                io::ErrorKind::InvalidData,
116                "Invalid magic number for SSHash index file",
117            ));
118        }
119
120        let mut version_major_bytes = [0u8; 4];
121        let mut version_minor_bytes = [0u8; 4];
122        let mut k_bytes = [0u8; 8];
123        let mut m_bytes = [0u8; 8];
124        let mut canonical_bytes = [0u8; 1];
125        let mut num_partitions_bytes = [0u8; 4];
126
127        reader.read_exact(&mut version_major_bytes)?;
128        reader.read_exact(&mut version_minor_bytes)?;
129        reader.read_exact(&mut k_bytes)?;
130        reader.read_exact(&mut m_bytes)?;
131        reader.read_exact(&mut canonical_bytes)?;
132        reader.read_exact(&mut num_partitions_bytes)?;
133
134        let version_major = u32::from_le_bytes(version_major_bytes);
135        let version_minor = u32::from_le_bytes(version_minor_bytes);
136
137        if version_major != FORMAT_VERSION.0 {
138            return Err(io::Error::new(
139                io::ErrorKind::InvalidData,
140                format!(
141                    "Incompatible format version: {}.{}, expected {}.{}",
142                    version_major, version_minor, FORMAT_VERSION.0, FORMAT_VERSION.1
143                ),
144            ));
145        }
146
147        Ok(Self {
148            magic,
149            version_major,
150            version_minor,
151            k: u64::from_le_bytes(k_bytes) as usize,
152            m: u64::from_le_bytes(m_bytes) as usize,
153            canonical: canonical_bytes[0] != 0,
154            num_mphf_partitions: u32::from_le_bytes(num_partitions_bytes),
155        })
156    }
157}
158
/// One row of the MPHF container offset table.
///
/// Serialized as 20 bytes: `partition_id` (`u32`), `byte_offset` (`u64`) and
/// `byte_size` (`u64`), each little-endian, in that order.
#[derive(Clone, Copy, Debug)]
pub struct MphfPartitionEntry {
    /// Partition ID
    pub partition_id: u32,
    /// Byte offset in the container file where this MPHF starts
    pub byte_offset: u64,
    /// Size in bytes of the serialized MPHF
    pub byte_size: u64,
}

impl MphfPartitionEntry {
    /// Emit the three fields to `writer`, little-endian, in declaration order.
    fn write(&self, writer: &mut dyn Write) -> io::Result<()> {
        let id = self.partition_id.to_le_bytes();
        let offset = self.byte_offset.to_le_bytes();
        let size = self.byte_size.to_le_bytes();

        for field in [&id[..], &offset[..], &size[..]].iter() {
            writer.write_all(field)?;
        }
        Ok(())
    }

    /// Decode one entry from `reader`; fails if fewer than 20 bytes remain.
    fn read(reader: &mut dyn Read) -> io::Result<Self> {
        let mut id = [0u8; 4];
        reader.read_exact(&mut id)?;
        let partition_id = u32::from_le_bytes(id);

        let mut offset = [0u8; 8];
        reader.read_exact(&mut offset)?;
        let byte_offset = u64::from_le_bytes(offset);

        let mut size = [0u8; 8];
        reader.read_exact(&mut size)?;
        let byte_size = u64::from_le_bytes(size);

        Ok(Self {
            partition_id,
            byte_offset,
            byte_size,
        })
    }
}
196
197/// Header for the MPHF container file
198///
199/// The container format is:
200/// ```text
201/// MphfContainerHeader
202/// offset_table: [MphfPartitionEntry; num_partitions]
203/// data_section: (serialized MPHF data concatenated)
204/// ```
205#[derive(Clone, Debug)]
206pub struct MphfContainerHeader {
207    /// Magic number for format identification ("SSHIMH01")
208    pub magic: [u8; 8],
209    /// Format version (major, minor)
210    pub version_major: u32,
211    /// Format version minor number
212    pub version_minor: u32,
213    /// Number of MPHF partitions in this container
214    pub num_partitions: u32,
215}
216
217impl MphfContainerHeader {
218    /// Create a new MPHF container header
219    pub fn new(num_partitions: u32) -> Self {
220        Self {
221            magic: *MPHF_MAGIC,
222            version_major: MPHF_FORMAT_VERSION.0,
223            version_minor: MPHF_FORMAT_VERSION.1,
224            num_partitions,
225        }
226    }
227
228    /// Write header to a writer
229    pub fn write(&self, writer: &mut dyn Write) -> io::Result<()> {
230        writer.write_all(&self.magic)?;
231        writer.write_all(&self.version_major.to_le_bytes())?;
232        writer.write_all(&self.version_minor.to_le_bytes())?;
233        writer.write_all(&self.num_partitions.to_le_bytes())?;
234        Ok(())
235    }
236
237    /// Read header from a reader
238    pub fn read(reader: &mut dyn Read) -> io::Result<Self> {
239        let mut magic = [0u8; 8];
240        reader.read_exact(&mut magic)?;
241
242        if &magic != MPHF_MAGIC {
243            return Err(io::Error::new(
244                io::ErrorKind::InvalidData,
245                "Invalid magic number for SSHash MPHF container file",
246            ));
247        }
248
249        let mut version_major_bytes = [0u8; 4];
250        let mut version_minor_bytes = [0u8; 4];
251        let mut num_partitions_bytes = [0u8; 4];
252
253        reader.read_exact(&mut version_major_bytes)?;
254        reader.read_exact(&mut version_minor_bytes)?;
255        reader.read_exact(&mut num_partitions_bytes)?;
256
257        let version_major = u32::from_le_bytes(version_major_bytes);
258        let version_minor = u32::from_le_bytes(version_minor_bytes);
259
260        if version_major != MPHF_FORMAT_VERSION.0 {
261            return Err(io::Error::new(
262                io::ErrorKind::InvalidData,
263                format!(
264                    "Incompatible MPHF format version: {}.{}, expected {}.{}",
265                    version_major, version_minor, MPHF_FORMAT_VERSION.0, MPHF_FORMAT_VERSION.1
266                ),
267            ));
268        }
269
270        Ok(Self {
271            magic,
272            version_major,
273            version_minor,
274            num_partitions: u32::from_le_bytes(num_partitions_bytes),
275        })
276    }
277}
278
/// Build the main index file path from a base path.
///
/// - A base whose extension is already `ssi` is returned unchanged.
/// - A base with no (or an empty) extension gets `.ssi` as its extension.
/// - A base with any other extension keeps it and gets `.ssi` appended
///   (`reads.fa` becomes `reads.fa.ssi`).
///
/// The extension is handled as an OS string, so file names that are not valid
/// UTF-8 are preserved byte for byte. A base without a file name component
/// (for example an empty path) is returned unchanged.
pub fn index_file_path<P: AsRef<Path>>(base: P) -> PathBuf {
    let mut path = base.as_ref().to_path_buf();

    // Work out the replacement extension first; it must be owned because
    // `set_extension` needs `path` mutably.
    let new_extension = match path.extension() {
        Some(ext) if ext == "ssi" => None,
        Some(ext) if !ext.is_empty() => {
            let mut combined = ext.to_os_string();
            combined.push(".ssi");
            Some(combined)
        }
        _ => Some(std::ffi::OsString::from("ssi")),
    };

    if let Some(ext) = new_extension {
        path.set_extension(ext);
    }
    path
}
294
295/// Build the MPHF container file path from a base path
296pub fn mphf_container_path<P: AsRef<Path>>(base: P) -> PathBuf {
297    let base_path = index_file_path(base);
298    let mut container_path = base_path.clone();
299    let filename = format!("{}.mphf", base_path.file_name().unwrap().to_string_lossy());
300    container_path.pop();
301    container_path.push(filename);
302    container_path
303}
304
/// Failures reported by the serialization layer.
#[derive(Debug)]
pub enum SerializationError {
    /// An underlying I/O operation failed
    Io(io::Error),
    /// Any other failure, described by a free-form message
    Other(String),
}

impl From<io::Error> for SerializationError {
    /// Wrap an I/O error so `?` can be used in functions returning
    /// `SerializationResult`.
    fn from(source: io::Error) -> Self {
        Self::Io(source)
    }
}

impl std::fmt::Display for SerializationError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Io(source) => write!(f, "IO error: {source}"),
            Self::Other(message) => write!(f, "{message}"),
        }
    }
}

impl std::error::Error for SerializationError {}

/// Shorthand for results whose error type is [`SerializationError`]
pub type SerializationResult<T> = Result<T, SerializationError>;
333
334/// Helper functions for MPHF container operations
335///
336/// Write MPHFs to a container format
337///
338/// Creates a container file with:
339/// - Header with num_partitions
340/// - Offset table (partition_id, byte_offset, byte_size) for each partition
341/// - Serialized MPHF data concatenated
342///
343/// Returns the offset table for reference
344pub fn write_mphf_container<W: Write + Seek>(
345    writer: &mut W,
346    mphfs: &[Option<&crate::mphf_config::Mphf>],
347) -> io::Result<Vec<MphfPartitionEntry>> {
348    let num_partitions = mphfs.len() as u32;
349
350    // Write header
351    let header = MphfContainerHeader::new(num_partitions);
352    header.write(writer)?;
353
354    // Calculate and write offset table (with placeholders for now)
355    let mut offset_table = Vec::new();
356    let offset_table_start = writer.stream_position()?;
357
358    // Write placeholder offset table
359    for i in 0..num_partitions {
360        let entry = MphfPartitionEntry {
361            partition_id: i,
362            byte_offset: 0, // Will be updated
363            byte_size: 0,   // Will be updated
364        };
365        entry.write(writer)?;
366    }
367
368    let _data_start = writer.stream_position()?;
369
370    // Serialize MPHFs and track their positions
371    for (partition_id, mphf_opt) in mphfs.iter().enumerate() {
372        let byte_offset = writer.stream_position()?;
373
374        if let Some(mphf) = mphf_opt {
375            // Serialize the MPHF to a temporary buffer to get the size
376            let mut mphf_buffer = Vec::new();
377            mphf.write(&mut mphf_buffer)?;
378            let byte_size = mphf_buffer.len() as u64;
379
380            // Write the serialized MPHF
381            writer.write_all(&mphf_buffer)?;
382
383            // Record the entry
384            offset_table.push(MphfPartitionEntry {
385                partition_id: partition_id as u32,
386                byte_offset,
387                byte_size,
388            });
389        } else {
390            // Empty partition
391            offset_table.push(MphfPartitionEntry {
392                partition_id: partition_id as u32,
393                byte_offset,
394                byte_size: 0,
395            });
396        }
397    }
398
399    // Go back and write the actual offset table
400    writer.seek(SeekFrom::Start(offset_table_start))?;
401    for entry in &offset_table {
402        entry.write(writer)?;
403    }
404
405    // Seek to end for any further writes
406    writer.seek(SeekFrom::End(0))?;
407
408    Ok(offset_table)
409}
410
411/// Read MPHFs from a container format
412///
413/// Returns a vector of Option<Mphf> indexed by partition ID
414pub fn read_mphf_container<R: Read + Seek>(
415    reader: &mut R,
416) -> io::Result<Vec<Option<crate::mphf_config::Mphf>>> {
417    // Read header
418    let header = MphfContainerHeader::read(reader)?;
419
420    // Read offset table
421    let mut offset_table = Vec::with_capacity(header.num_partitions as usize);
422    for _ in 0..header.num_partitions {
423        offset_table.push(MphfPartitionEntry::read(reader)?);
424    }
425
426    // Read MPHFs
427    let mut mphfs: Vec<Option<crate::mphf_config::Mphf>> = (0..header.num_partitions).map(|_| None).collect();
428
429    for entry in offset_table {
430        if entry.byte_size > 0 {
431            reader.seek(SeekFrom::Start(entry.byte_offset))?;
432            let mphf = crate::mphf_config::read_mphf(reader)?;
433            mphfs[entry.partition_id as usize] = Some(mphf);
434        }
435    }
436
437    Ok(mphfs)
438}
439
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Serialize a valid dictionary header into a fresh buffer.
    fn header_bytes() -> Vec<u8> {
        let mut buffer = Vec::new();
        DictionarySerializationHeader::new(31, 13, true, 2)
            .write(&mut buffer)
            .unwrap();
        buffer
    }

    #[test]
    fn test_header_roundtrip() {
        let header = DictionarySerializationHeader::new(31, 13, true, 2);
        let buffer = header_bytes();

        // magic (8) + major (4) + minor (4) + k (8) + m (8) + canonical (1) + partitions (4)
        assert_eq!(buffer.len(), 37);

        let header2 = DictionarySerializationHeader::read(&mut buffer.as_slice()).unwrap();

        assert_eq!(header2.magic, *MAGIC);
        assert_eq!((header2.version_major, header2.version_minor), FORMAT_VERSION);
        assert_eq!(header.k, header2.k);
        assert_eq!(header.m, header2.m);
        assert_eq!(header.canonical, header2.canonical);
        assert_eq!(header.num_mphf_partitions, header2.num_mphf_partitions);
    }

    #[test]
    fn test_header_rejects_bad_magic() {
        let mut buffer = header_bytes();
        buffer[0] ^= 0xFF;

        let err = DictionarySerializationHeader::read(&mut buffer.as_slice()).unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
    }

    #[test]
    fn test_header_rejects_incompatible_major_version() {
        let mut buffer = header_bytes();
        // The major version occupies the four bytes right after the magic.
        buffer[8..12].copy_from_slice(&(FORMAT_VERSION.0 + 1).to_le_bytes());

        let err = DictionarySerializationHeader::read(&mut buffer.as_slice()).unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
    }

    #[test]
    fn test_header_rejects_truncated_input() {
        let buffer = header_bytes();
        let mut short: &[u8] = &buffer[..buffer.len() - 1];

        let err = DictionarySerializationHeader::read(&mut short).unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof);
    }

    #[test]
    fn test_mphf_container_header_roundtrip() {
        let header = MphfContainerHeader::new(5);
        let mut buffer = Vec::new();
        header.write(&mut buffer).unwrap();

        // magic (8) + major (4) + minor (4) + partitions (4)
        assert_eq!(buffer.len(), 20);

        let header2 = MphfContainerHeader::read(&mut buffer.as_slice()).unwrap();
        assert_eq!(header2.magic, *MPHF_MAGIC);
        assert_eq!(
            (header2.version_major, header2.version_minor),
            MPHF_FORMAT_VERSION
        );
        assert_eq!(header.num_partitions, header2.num_partitions);
    }

    #[test]
    fn test_mphf_container_header_rejects_index_magic() {
        // A main index header must not be accepted as an MPHF container.
        let buffer = header_bytes();

        let err = MphfContainerHeader::read(&mut buffer.as_slice()).unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
    }

    #[test]
    fn test_mphf_partition_entry_roundtrip() {
        let entry = MphfPartitionEntry {
            partition_id: 3,
            byte_offset: 1024,
            byte_size: 512,
        };

        let mut buffer = Vec::new();
        entry.write(&mut buffer).unwrap();

        // partition_id (4) + byte_offset (8) + byte_size (8)
        assert_eq!(buffer.len(), 20);

        let entry2 = MphfPartitionEntry::read(&mut buffer.as_slice()).unwrap();
        assert_eq!(entry.partition_id, entry2.partition_id);
        assert_eq!(entry.byte_offset, entry2.byte_offset);
        assert_eq!(entry.byte_size, entry2.byte_size);
    }

    #[test]
    fn test_mphf_container_roundtrip_with_empty_partitions() {
        let mut cursor = Cursor::new(Vec::new());
        let table = write_mphf_container(&mut cursor, &[None, None, None]).unwrap();

        // 20-byte header followed by three 20-byte table entries and no data.
        assert_eq!(cursor.get_ref().len(), 80);
        assert_eq!(cursor.position(), 80);
        assert_eq!(table.len(), 3);
        for (expected_id, entry) in table.iter().enumerate() {
            assert_eq!(entry.partition_id as usize, expected_id);
            assert_eq!(entry.byte_offset, 80);
            assert_eq!(entry.byte_size, 0);
        }

        cursor.set_position(0);
        let mphfs = read_mphf_container(&mut cursor).unwrap();
        assert_eq!(mphfs.len(), 3);
        assert!(mphfs.iter().all(Option::is_none));
    }

    #[test]
    fn test_file_path_construction() {
        let base = Path::new("/tmp/my_index");

        assert_eq!(index_file_path(base), PathBuf::from("/tmp/my_index.ssi"));
        // An existing `.ssi` extension is kept as is.
        assert_eq!(
            index_file_path("/tmp/my_index.ssi"),
            PathBuf::from("/tmp/my_index.ssi")
        );
        // Any other extension is preserved and `.ssi` is appended.
        assert_eq!(
            index_file_path("/tmp/reads.fa"),
            PathBuf::from("/tmp/reads.fa.ssi")
        );

        // Single container file next to the index, no per-partition suffix.
        assert_eq!(
            mphf_container_path(base),
            PathBuf::from("/tmp/my_index.ssi.mphf")
        );
    }
}