Skip to main content

sshash_lib/
serialization.rs

1//! Serialization and deserialization support for Dictionary
2//!
3//! This module provides efficient zero-copy serialization using epserde for sux-rs types
4//! (BitFieldVec, etc.) combined with native pthash serialization for MPHF functions.
5//!
6//! # File Format
7//!
8//! The serialization uses a two-file approach:
9//!
10//! **Main Index File** (`index.ssi`):
11//! - DictionarySerializationHeader (magic, version, k, m, canonical, num_mphf_partitions)
12//! - SpectrumPreservingStringSet (epserde format)
13//! - SparseAndSkewIndex (epserde format, excluding MPHF)
14//!
15//! **MPHF Container File** (`index.ssi.mphf`):
16//! ```text
17//! MphfContainerHeader
//!   ├─ magic: "SSHIMH02"
19//!   ├─ version_major: u32
20//!   ├─ version_minor: u32
21//!   └─ num_partitions: u32
22//! Offset Table ([num_partitions] entries):
23//!   ├─ MphfPartitionEntry 0
24//!   │  ├─ partition_id: u32
25//!   │  ├─ byte_offset: u64
26//!   │  └─ byte_size: u64
27//!   ├─ MphfPartitionEntry 1
28//!   └─ ...
29//! Data Section (variable length):
//!   ├─ MPHF partition 0 (serialized PartitionedMphf)
31//!   ├─ MPHF partition 1
32//!   └─ ...
33//! ```
34//!
35//! # Benefits of Single MPHF Container
36//!
37//! - **Scalability**: Works with 1 or 1000 partitions equally well (single file)
38//! - **Random access**: Offset table enables seeking to any partition
39//! - **Memory mappable**: Entire container can be mmap'd
40//! - **Efficient**: No per-file overhead, compact layout
41//! - **Clean separation**: MPHF container is independent binary format
42//!
43//! # Zero-Copy Deserialization
44//!
45//! When deserializing, sux-rs types are handled by epserde:
46//! - `BitFieldVec<Vec<usize>>` deserializes as `BitFieldVec<&[usize]>` (ε-copy)
47//! - The deserialized Dictionary can be memory-mapped for instant loading
48
49use std::io::{self, Read, Write, Seek, SeekFrom};
50use std::path::{Path, PathBuf};
51
/// Magic bytes for the SSHash index format.
///
/// Written as the first 8 bytes of the main index header and compared on read.
const MAGIC: &[u8; 8] = b"SSHIDX01";

/// Magic bytes for the SSHash MPHF container format (v2: PartitionedMphf).
///
/// Written as the first 8 bytes of the MPHF container header and compared on read.
const MPHF_MAGIC: &[u8; 8] = b"SSHIMH02";

/// Main index file format version: (major, minor).
/// Increment major on breaking changes, minor on compatible changes.
/// Readers in this module reject a differing major version; the minor version is not checked.
const FORMAT_VERSION: (u32, u32) = (3, 0);
/// MPHF container file format version: (major, minor), versioned independently of
/// `FORMAT_VERSION`. Readers in this module reject a differing major version.
const MPHF_FORMAT_VERSION: (u32, u32) = (2, 0);
62
63/// Header for the serialized Dictionary
64#[derive(Clone, Debug)]
65pub struct DictionarySerializationHeader {
66    /// Magic number for format identification ("SSHIDX01")
67    pub magic: [u8; 8],
68    /// Format version (major, minor)
69    pub version_major: u32,
70    /// Format version minor number
71    pub version_minor: u32,
72    /// K-mer size
73    pub k: usize,
74    /// Minimizer size
75    pub m: usize,
76    /// Whether canonical mode is enabled
77    pub canonical: bool,
78    /// Number of MPHF partitions (for heavy buckets)
79    pub num_mphf_partitions: u32,
80}
81
82impl DictionarySerializationHeader {
83    /// Create a new header
84    pub fn new(k: usize, m: usize, canonical: bool, num_mphf_partitions: u32) -> Self {
85        Self {
86            magic: *MAGIC,
87            version_major: FORMAT_VERSION.0,
88            version_minor: FORMAT_VERSION.1,
89            k,
90            m,
91            canonical,
92            num_mphf_partitions,
93        }
94    }
95
96    /// Write header to a writer
97    pub fn write(&self, writer: &mut dyn Write) -> io::Result<()> {
98        writer.write_all(&self.magic)?;
99        writer.write_all(&self.version_major.to_le_bytes())?;
100        writer.write_all(&self.version_minor.to_le_bytes())?;
101        writer.write_all(&(self.k as u64).to_le_bytes())?;
102        writer.write_all(&(self.m as u64).to_le_bytes())?;
103        writer.write_all(&[self.canonical as u8])?;
104        writer.write_all(&self.num_mphf_partitions.to_le_bytes())?;
105        Ok(())
106    }
107
108    /// Read header from a reader
109    pub fn read(reader: &mut dyn Read) -> io::Result<Self> {
110        let mut magic = [0u8; 8];
111        reader.read_exact(&mut magic)?;
112
113        if &magic != MAGIC {
114            return Err(io::Error::new(
115                io::ErrorKind::InvalidData,
116                "Invalid magic number for SSHash index file",
117            ));
118        }
119
120        let mut version_major_bytes = [0u8; 4];
121        let mut version_minor_bytes = [0u8; 4];
122        let mut k_bytes = [0u8; 8];
123        let mut m_bytes = [0u8; 8];
124        let mut canonical_bytes = [0u8; 1];
125        let mut num_partitions_bytes = [0u8; 4];
126
127        reader.read_exact(&mut version_major_bytes)?;
128        reader.read_exact(&mut version_minor_bytes)?;
129        reader.read_exact(&mut k_bytes)?;
130        reader.read_exact(&mut m_bytes)?;
131        reader.read_exact(&mut canonical_bytes)?;
132        reader.read_exact(&mut num_partitions_bytes)?;
133
134        let version_major = u32::from_le_bytes(version_major_bytes);
135        let version_minor = u32::from_le_bytes(version_minor_bytes);
136
137        if version_major != FORMAT_VERSION.0 {
138            return Err(io::Error::new(
139                io::ErrorKind::InvalidData,
140                format!(
141                    "Incompatible format version: {}.{}, expected {}.{}",
142                    version_major, version_minor, FORMAT_VERSION.0, FORMAT_VERSION.1
143                ),
144            ));
145        }
146
147        Ok(Self {
148            magic,
149            version_major,
150            version_minor,
151            k: u64::from_le_bytes(k_bytes) as usize,
152            m: u64::from_le_bytes(m_bytes) as usize,
153            canonical: canonical_bytes[0] != 0,
154            num_mphf_partitions: u32::from_le_bytes(num_partitions_bytes),
155        })
156    }
157}
158
/// One row of the MPHF container offset table.
///
/// Serialized as 20 bytes: `partition_id` (u32), `byte_offset` (u64) and `byte_size` (u64),
/// each little-endian and written back to back.
#[derive(Clone, Copy, Debug)]
pub struct MphfPartitionEntry {
    /// Partition ID
    pub partition_id: u32,
    /// Byte offset in the container file where this MPHF starts
    pub byte_offset: u64,
    /// Size in bytes of the serialized MPHF
    pub byte_size: u64,
}

impl MphfPartitionEntry {
    /// Serialize this entry to `writer`, field by field in declaration order.
    fn write(&self, writer: &mut dyn Write) -> io::Result<()> {
        let Self {
            partition_id,
            byte_offset,
            byte_size,
        } = *self;
        writer.write_all(&partition_id.to_le_bytes())?;
        writer.write_all(&byte_offset.to_le_bytes())?;
        writer.write_all(&byte_size.to_le_bytes())
    }

    /// Deserialize one entry from `reader`; fails if fewer than 20 bytes are available.
    fn read(reader: &mut dyn Read) -> io::Result<Self> {
        /// Pulls exactly `N` bytes off the reader.
        fn take<const N: usize>(reader: &mut dyn Read) -> io::Result<[u8; N]> {
            let mut chunk = [0u8; N];
            reader.read_exact(&mut chunk)?;
            Ok(chunk)
        }

        let partition_id = u32::from_le_bytes(take(reader)?);
        let byte_offset = u64::from_le_bytes(take(reader)?);
        let byte_size = u64::from_le_bytes(take(reader)?);

        Ok(Self {
            partition_id,
            byte_offset,
            byte_size,
        })
    }
}
196
197/// Header for the MPHF container file
198///
199/// The container format is:
200/// ```text
201/// MphfContainerHeader
202/// offset_table: [MphfPartitionEntry; num_partitions]
203/// data_section: (serialized MPHF data concatenated)
204/// ```
205#[derive(Clone, Debug)]
206pub struct MphfContainerHeader {
207    /// Magic number for format identification ("SSHIMH01")
208    pub magic: [u8; 8],
209    /// Format version (major, minor)
210    pub version_major: u32,
211    /// Format version minor number
212    pub version_minor: u32,
213    /// Number of MPHF partitions in this container
214    pub num_partitions: u32,
215}
216
217impl MphfContainerHeader {
218    /// Create a new MPHF container header
219    pub fn new(num_partitions: u32) -> Self {
220        Self {
221            magic: *MPHF_MAGIC,
222            version_major: MPHF_FORMAT_VERSION.0,
223            version_minor: MPHF_FORMAT_VERSION.1,
224            num_partitions,
225        }
226    }
227
228    /// Write header to a writer
229    pub fn write(&self, writer: &mut dyn Write) -> io::Result<()> {
230        writer.write_all(&self.magic)?;
231        writer.write_all(&self.version_major.to_le_bytes())?;
232        writer.write_all(&self.version_minor.to_le_bytes())?;
233        writer.write_all(&self.num_partitions.to_le_bytes())?;
234        Ok(())
235    }
236
237    /// Read header from a reader
238    pub fn read(reader: &mut dyn Read) -> io::Result<Self> {
239        let mut magic = [0u8; 8];
240        reader.read_exact(&mut magic)?;
241
242        if &magic != MPHF_MAGIC {
243            // Check for old v1 format
244            if &magic == b"SSHIMH01" {
245                return Err(io::Error::new(
246                    io::ErrorKind::InvalidData,
247                    "MPHF container is v1 format (SSHIMH01). Please rebuild the index — v2 (PartitionedMphf) is required.",
248                ));
249            }
250            return Err(io::Error::new(
251                io::ErrorKind::InvalidData,
252                "Invalid magic number for SSHash MPHF container file",
253            ));
254        }
255
256        let mut version_major_bytes = [0u8; 4];
257        let mut version_minor_bytes = [0u8; 4];
258        let mut num_partitions_bytes = [0u8; 4];
259
260        reader.read_exact(&mut version_major_bytes)?;
261        reader.read_exact(&mut version_minor_bytes)?;
262        reader.read_exact(&mut num_partitions_bytes)?;
263
264        let version_major = u32::from_le_bytes(version_major_bytes);
265        let version_minor = u32::from_le_bytes(version_minor_bytes);
266
267        if version_major != MPHF_FORMAT_VERSION.0 {
268            return Err(io::Error::new(
269                io::ErrorKind::InvalidData,
270                format!(
271                    "Incompatible MPHF format version: {}.{}, expected {}.{}",
272                    version_major, version_minor, MPHF_FORMAT_VERSION.0, MPHF_FORMAT_VERSION.1
273                ),
274            ));
275        }
276
277        Ok(Self {
278            magic,
279            version_major,
280            version_minor,
281            num_partitions: u32::from_le_bytes(num_partitions_bytes),
282        })
283    }
284}
285
/// Build the main index file path from a base path.
///
/// * A path whose extension is already `ssi` is returned unchanged.
/// * A path with no extension (or an empty one, e.g. `name.`) gets the extension `ssi`.
/// * Any other extension is kept and `.ssi` is appended (`reads.fa` -> `reads.fa.ssi`).
///
/// The extension is handled as an `OsStr`, so bytes that are not valid UTF-8 are preserved
/// instead of being replaced during a lossy string conversion. A path without a final file
/// name component (such as an empty path or `/`) is returned unchanged, because
/// `PathBuf::set_extension` does nothing in that case.
pub fn index_file_path<P: AsRef<Path>>(base: P) -> PathBuf {
    let mut path = base.as_ref().to_path_buf();

    // `None` means the path already ends in `.ssi` and must be left alone.
    let new_extension: Option<std::ffi::OsString> = match path.extension() {
        Some(ext) if ext == std::ffi::OsStr::new("ssi") => None,
        Some(ext) if !ext.is_empty() => {
            let mut combined = ext.to_os_string();
            combined.push(".ssi");
            Some(combined)
        }
        _ => Some(std::ffi::OsString::from("ssi")),
    };

    if let Some(ext) = new_extension {
        path.set_extension(ext);
    }
    path
}
301
302/// Build the MPHF container file path from a base path
303pub fn mphf_container_path<P: AsRef<Path>>(base: P) -> PathBuf {
304    let base_path = index_file_path(base);
305    let mut container_path = base_path.clone();
306    let filename = format!("{}.mphf", base_path.file_name().unwrap().to_string_lossy());
307    container_path.pop();
308    container_path.push(filename);
309    container_path
310}
311
/// Serialization errors
#[derive(Debug)]
pub enum SerializationError {
    /// I/O error during serialization
    Io(io::Error),
    /// Other serialization error, described by a free-form message
    Other(String),
}

/// Lets `?` convert an `io::Error` into [`SerializationError::Io`].
impl From<io::Error> for SerializationError {
    fn from(err: io::Error) -> Self {
        SerializationError::Io(err)
    }
}

impl std::fmt::Display for SerializationError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            SerializationError::Io(e) => write!(f, "IO error: {}", e),
            SerializationError::Other(s) => write!(f, "{}", s),
        }
    }
}

impl std::error::Error for SerializationError {
    /// Exposes the wrapped `io::Error` so callers walking the error chain (or downcasting)
    /// can reach the original failure; `Other` has no underlying cause.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            SerializationError::Io(e) => Some(e),
            SerializationError::Other(_) => None,
        }
    }
}

/// Result type for serialization operations
pub type SerializationResult<T> = Result<T, SerializationError>;
340
341/// Helper functions for MPHF container operations
342///
343/// Write MPHFs to a container format
344///
345/// Creates a container file with:
346/// - Header with num_partitions
347/// - Offset table (partition_id, byte_offset, byte_size) for each partition
348/// - Serialized MPHF data concatenated
349///
350/// Returns the offset table for reference
351pub fn write_mphf_container<W: Write + Seek>(
352    writer: &mut W,
353    mphfs: &[Option<&crate::partitioned_mphf::PartitionedMphf>],
354) -> io::Result<Vec<MphfPartitionEntry>> {
355    let num_partitions = mphfs.len() as u32;
356
357    // Write header
358    let header = MphfContainerHeader::new(num_partitions);
359    header.write(writer)?;
360
361    // Calculate and write offset table (with placeholders for now)
362    let mut offset_table = Vec::new();
363    let offset_table_start = writer.stream_position()?;
364
365    // Write placeholder offset table
366    for i in 0..num_partitions {
367        let entry = MphfPartitionEntry {
368            partition_id: i,
369            byte_offset: 0, // Will be updated
370            byte_size: 0,   // Will be updated
371        };
372        entry.write(writer)?;
373    }
374
375    let _data_start = writer.stream_position()?;
376
377    // Serialize PartitionedMphfs and track their positions
378    for (partition_id, mphf_opt) in mphfs.iter().enumerate() {
379        let byte_offset = writer.stream_position()?;
380
381        if let Some(pmphf) = mphf_opt {
382            // Serialize the PartitionedMphf to a temporary buffer to get the size
383            let mut mphf_buffer = Vec::new();
384            pmphf.write_to(&mut mphf_buffer)?;
385            let byte_size = mphf_buffer.len() as u64;
386
387            // Write the serialized PartitionedMphf
388            writer.write_all(&mphf_buffer)?;
389
390            // Record the entry
391            offset_table.push(MphfPartitionEntry {
392                partition_id: partition_id as u32,
393                byte_offset,
394                byte_size,
395            });
396        } else {
397            // Empty partition
398            offset_table.push(MphfPartitionEntry {
399                partition_id: partition_id as u32,
400                byte_offset,
401                byte_size: 0,
402            });
403        }
404    }
405
406    // Go back and write the actual offset table
407    writer.seek(SeekFrom::Start(offset_table_start))?;
408    for entry in &offset_table {
409        entry.write(writer)?;
410    }
411
412    // Seek to end for any further writes
413    writer.seek(SeekFrom::End(0))?;
414
415    Ok(offset_table)
416}
417
418/// Read PartitionedMphfs from a container format
419///
420/// Returns a vector of Option<PartitionedMphf> indexed by partition ID
421pub fn read_mphf_container<R: Read + Seek>(
422    reader: &mut R,
423) -> io::Result<Vec<Option<crate::partitioned_mphf::PartitionedMphf>>> {
424    // Read header
425    let header = MphfContainerHeader::read(reader)?;
426
427    // Read offset table
428    let mut offset_table = Vec::with_capacity(header.num_partitions as usize);
429    for _ in 0..header.num_partitions {
430        offset_table.push(MphfPartitionEntry::read(reader)?);
431    }
432
433    // Read PartitionedMphfs
434    let mut mphfs: Vec<Option<crate::partitioned_mphf::PartitionedMphf>> =
435        (0..header.num_partitions).map(|_| None).collect();
436
437    for entry in offset_table {
438        if entry.byte_size > 0 {
439            reader.seek(SeekFrom::Start(entry.byte_offset))?;
440            let pmphf = crate::partitioned_mphf::PartitionedMphf::read_from(reader)?;
441            mphfs[entry.partition_id as usize] = Some(pmphf);
442        }
443    }
444
445    Ok(mphfs)
446}
447
#[cfg(test)]
mod tests {
    use super::*;

    /// A dictionary header written to memory reads back with the same parameters.
    #[test]
    fn test_header_roundtrip() {
        let original = DictionarySerializationHeader::new(31, 13, true, 2);

        let mut bytes = Vec::new();
        original.write(&mut bytes).unwrap();
        let decoded = DictionarySerializationHeader::read(&mut bytes.as_slice()).unwrap();

        assert_eq!(original.k, decoded.k);
        assert_eq!(original.m, decoded.m);
        assert_eq!(original.canonical, decoded.canonical);
        assert_eq!(original.num_mphf_partitions, decoded.num_mphf_partitions);
    }

    /// The MPHF container header keeps its partition count across a write/read cycle.
    #[test]
    fn test_mphf_container_header_roundtrip() {
        let original = MphfContainerHeader::new(5);

        let mut bytes = Vec::new();
        original.write(&mut bytes).unwrap();
        let decoded = MphfContainerHeader::read(&mut bytes.as_slice()).unwrap();

        assert_eq!(original.num_partitions, decoded.num_partitions);
    }

    /// Every field of an offset-table entry survives serialization.
    #[test]
    fn test_mphf_partition_entry_roundtrip() {
        let original = MphfPartitionEntry {
            partition_id: 3,
            byte_offset: 1024,
            byte_size: 512,
        };

        let mut bytes = Vec::new();
        original.write(&mut bytes).unwrap();
        let decoded = MphfPartitionEntry::read(&mut bytes.as_slice()).unwrap();

        assert_eq!(original.partition_id, decoded.partition_id);
        assert_eq!(original.byte_offset, decoded.byte_offset);
        assert_eq!(original.byte_size, decoded.byte_size);
    }

    /// Both file paths are derived from the same base name.
    #[test]
    fn test_file_path_construction() {
        let base = Path::new("/tmp/my_index");

        let index_name = index_file_path(base).to_string_lossy().into_owned();
        assert!(index_name.ends_with("my_index.ssi"));

        let mphf_name = mphf_container_path(base).to_string_lossy().into_owned();
        assert!(mphf_name.contains("my_index.ssi.mphf"));
        // Single file, no partition ID
        assert!(!mphf_name.contains(".mphf.0"));
    }
}
504}