matchy_paraglob/
offset_format.rs

1//! Offset-based binary format for zero-copy memory mapping
2//!
3//! This module defines the binary format used for serializing and loading
4//! Paraglob pattern matchers. The format uses byte offsets instead of pointers,
5//! allowing it to be memory-mapped and used directly without deserialization.
6//!
7//! # Format Overview
8//!
9//! The format consists of C-compatible packed structs that can be cast directly
10//! from bytes. All references use byte offsets from the start of the buffer.
11//!
12//! # Layout
13//!
14//! ```text
15//! [Header: ParaglobHeader (v5: 112 bytes)]
//! [AC Nodes: ACNodeHot array (v4+; legacy ACNode array before v4)]
17//! [AC Edges: ACEdge arrays (variable, referenced by nodes)]
18//! [AC Pattern IDs: u32 arrays (variable, referenced by nodes)]
19//! [Pattern Entries: PatternEntry array]
20//! [Pattern Strings: null-terminated UTF-8]
21//! [Meta-word mappings: MetaWordMapping array]
22//! [Pattern reference arrays: u32 arrays]
23//! [Single wildcards: SingleWildcard array]
24//! [Glob Segments: GlobSegmentIndex + segment data (v5+)]
25//! [Data section: optional (v2+)]
26//! [Data mappings: optional (v2+)]
27//! [AC Literal Mapping: optional (v3+)]
28//! ```
29//!
30//! # Design Principles
31//!
32//! 1. **Alignment**: All structs are properly aligned for direct casting
33//! 2. **Offsets**: All references use u32 byte offsets (4GB limit)
34//! 3. **Zero-copy**: Can read directly from mmap without parsing
//! 4. **Portability**: Little-endian u32/u16/u8 only (standard on x86/ARM)
36
37use std::mem;
38use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout};
39
/// Magic bytes identifying Paraglob binary format.
///
/// Written into `ParaglobHeader::magic` by `ParaglobHeader::new` and compared
/// against it by `ParaglobHeader::validate`.
pub const MAGIC: &[u8; 8] = b"PARAGLOB";

/// Current format version (v5: serialized glob segments for zero-copy loading).
///
/// This is the only version `ParaglobHeader::validate` accepts.
pub const MATCHY_FORMAT_VERSION: u32 = 5;

/// Previous format version (v4: AC nodes stored as 20-byte `ACNodeHot`
/// instead of the 32-byte `ACNode`).
#[allow(dead_code)] // Kept for reference and potential migration code
pub const MATCHY_FORMAT_VERSION_V4: u32 = 4;

/// Previous format version (v3: adds AC literal mapping for zero-copy loading)
#[allow(dead_code)] // Kept for reference and potential migration code
pub const MATCHY_FORMAT_VERSION_V3: u32 = 3;

/// Previous format version (v2: adds data section support)
#[allow(dead_code)] // Kept for reference and potential migration code
pub const MATCHY_FORMAT_VERSION_V2: u32 = 2;

/// Previous format version (v1: patterns only, no data)
#[allow(dead_code)] // Kept for reference and potential migration code
pub const MATCHY_FORMAT_VERSION_V1: u32 = 1;
61
/// Main header for serialized Paraglob database (112 bytes, 4-byte aligned)
///
/// This header appears at the start of every serialized Paraglob file.
/// All offsets are relative to the start of the buffer.
///
/// The struct is `#[repr(C)]` and consists of the 8-byte magic followed by
/// 26 four-byte words (25 `u32` fields plus the `endianness`/`reserved`
/// bytes), so it contains no implicit padding. Field order and types are the
/// on-disk layout and must not be changed.
///
/// # Version History
/// - v1 (72 bytes): Original format, patterns only
/// - v2 (96 bytes): Adds data section support for pattern-associated data
/// - v3 (104 bytes): Adds AC literal mapping for O(1) zero-copy loading
/// - v4 (104 bytes): Uses ACNodeHot (20-byte) instead of ACNode (32-byte) - BREAKING
/// - v5 (112 bytes): Adds serialized glob segments for zero-copy loading
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct ParaglobHeader {
    /// Magic bytes: "PARAGLOB" (see `MAGIC`)
    pub magic: [u8; 8],

    /// Format version (currently 5, see `MATCHY_FORMAT_VERSION`)
    pub version: u32,

    /// Match mode: 0=CaseSensitive, 1=CaseInsensitive
    pub match_mode: u32,

    // AC Automaton section
    /// Number of nodes in the AC trie
    pub ac_node_count: u32,

    /// Offset to first AC node
    pub ac_nodes_offset: u32,

    /// Total size of AC edges data
    pub ac_edges_size: u32,

    /// Total size of AC pattern ID arrays
    pub ac_patterns_size: u32,

    // Pattern section
    /// Total number of original glob patterns
    pub pattern_count: u32,

    /// Offset to pattern entry array (array of `PatternEntry`)
    pub patterns_offset: u32,

    /// Offset to pattern strings area
    pub pattern_strings_offset: u32,

    /// Total size of pattern strings
    pub pattern_strings_size: u32,

    // Meta-word mapping section
    /// Number of meta-word to pattern mappings
    pub meta_word_mapping_count: u32,

    /// Offset to meta-word mapping array (array of `MetaWordMapping`)
    pub meta_word_mappings_offset: u32,

    /// Total size of pattern reference arrays
    pub pattern_refs_size: u32,

    /// Number of pure wildcard patterns (no literals)
    pub wildcard_count: u32,

    /// Total size of the entire serialized buffer (bytes)
    pub total_buffer_size: u32,

    /// Endianness marker: 0x01=little-endian, 0x02=big-endian, 0x00=legacy (assume little-endian)
    /// Database is always stored in little-endian format.
    /// This field indicates the endianness of the system that created the file.
    /// On big-endian systems, all multi-byte values are byte-swapped on read.
    ///
    /// `ParaglobHeader::new` always writes 0x01.
    /// NOTE(review): "always stored little-endian" and "endianness of the
    /// creating system" read as conflicting statements - confirm which one
    /// the writer actually implements.
    pub endianness: u8,

    /// Reserved for future use (written as zero by `ParaglobHeader::new`)
    pub reserved: [u8; 3],

    // ===== v2 ADDITIONS (24 bytes) =====
    /// Offset to data section (0 = no data section)
    /// Points to MMDB-encoded data or other serialized data
    pub data_section_offset: u32,

    /// Size of data section in bytes (0 = no data)
    ///
    /// `has_data_section` tests this field only.
    pub data_section_size: u32,

    /// Offset to pattern→data mapping table (0 = no mappings)
    /// Each mapping is a PatternDataMapping struct
    pub mapping_table_offset: u32,

    /// Number of pattern→data mappings
    /// Should equal pattern_count if all patterns have data
    pub mapping_count: u32,

    /// Data type flags:
    /// - Bit 0: inline data (1) vs external references (0)
    /// - Bit 1-31: reserved
    pub data_flags: u32,

    /// Reserved for future v2+ features
    pub reserved_v2: u32,

    // ===== v3 ADDITIONS (8 bytes) =====
    /// Offset to AC literal→pattern mapping table (0 = no mapping, requires reconstruction)
    /// Points to serialized `HashMap<u32, Vec<u32>>` for instant loading
    /// Format: `[entry_count: u32]` followed by entries of:
    ///   `[literal_id: u32][pattern_count: u32][pattern_id: u32, ...]`
    pub ac_literal_map_offset: u32,

    /// Number of entries in AC literal mapping table
    /// 0 = v1/v2 file, requires reconstruct_literal_mapping()
    pub ac_literal_map_count: u32,

    // ===== v5 ADDITIONS (8 bytes) =====
    /// Offset to glob segment index (0 = no segments, use lazy parsing)
    /// Points to array of GlobSegmentIndex structs (one per pattern)
    pub glob_segments_offset: u32,

    /// Total size of glob segment data (index + segment structures + string data)
    pub glob_segments_size: u32,
}
179
/// Transition-storage strategy of an AC automaton node.
///
/// The discriminant is the `u8` stored in a node's `state_kind` field; it
/// selects how the node's outgoing transitions are stored and looked up.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StateKind {
    /// No transitions (terminal state only)
    Empty = 0,
    /// Single transition - stored inline in node (75-80% of states)
    One = 1,
    /// 2-8 transitions - sparse edge array (10-15% of states)
    Sparse = 2,
    /// 9+ transitions - dense lookup table (2-5% of states)
    Dense = 3,
}

impl StateKind {
    /// Byte-indexed decode table: slot `d` holds the variant whose
    /// discriminant is `d`, every other slot is `None`.
    const LOOKUP: [Option<Self>; 256] = {
        let mut slots: [Option<Self>; 256] = [None; 256];
        slots[Self::Empty as usize] = Some(Self::Empty);
        slots[Self::One as usize] = Some(Self::One);
        slots[Self::Sparse as usize] = Some(Self::Sparse);
        slots[Self::Dense as usize] = Some(Self::Dense);
        slots
    };

    /// Decode a serialized state-kind byte.
    ///
    /// Returns `None` for any byte that is not a known discriminant (4..=255).
    /// The lookup is a single array index; a `u8` can never be out of range
    /// for the 256-entry table.
    #[inline(always)]
    #[must_use]
    pub const fn from_u8(value: u8) -> Option<Self> {
        let slot = value as usize;
        Self::LOOKUP[slot]
    }
}
214
215// Re-export ACNodeHot from matchy-ac
216pub use matchy_ac::ACNodeHot;
217
/// AC Automaton node (32 bytes, 4-byte aligned) - DEPRECATED
///
/// Legacy 32-byte node structure. Kept for backward compatibility with old file formats.
/// New code should use ACNodeHot (20 bytes) for better cache performance.
///
/// Represents a single node in the Aho-Corasick trie with state-specific encoding.
/// All child references are stored as offsets to allow zero-copy loading.
///
/// # State Encoding
///
/// The node uses different encodings based on transition count:
/// - **Empty** (0 transitions): No additional data needed
/// - **One** (1 transition): Character and target stored inline (no indirection!)
/// - **Sparse** (2-8 transitions): Offset to edge array, linear search
/// - **Dense** (9+ transitions): Offset to 256-entry lookup table, O(1) access
///
/// # Layout
///
/// `#[repr(C)]` with every field naturally aligned; the explicit `reserved_*`
/// fields fill what would otherwise be padding, so the struct has no implicit
/// padding bytes. Field order and types are the on-disk layout.
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct ACNode {
    /// Unique node ID
    pub node_id: u32,

    /// Offset to failure link node (0 = root)
    pub failure_offset: u32,

    /// State encoding type (StateKind enum, stored as its `u8` discriminant)
    pub state_kind: u8,

    /// Depth from root node
    pub depth: u8,

    /// Is this a terminal/final state? (1=yes, 0=no)
    pub is_final: u8,

    /// Reserved for future flags
    pub reserved_flags: u8,

    /// ONE encoding: character for single transition
    pub one_char: u8,

    /// Reserved for alignment
    pub reserved_one: [u8; 3],

    /// SPARSE/DENSE encoding: offset-based lookup (4 bytes)
    /// - SPARSE: offset to ACEdge array
    /// - DENSE: offset to DenseLookup table
    /// - ONE: target offset for single transition
    pub edges_offset: u32,

    /// Number of edges (SPARSE/DENSE states only)
    pub edge_count: u16,

    /// Reserved for alignment
    pub reserved_edge: u16,

    /// Offset to pattern ID array
    pub patterns_offset: u32,

    /// Number of pattern IDs at this node
    pub pattern_count: u16,

    /// Reserved for alignment
    pub reserved_pattern: u16,
}
// Total: node_id(4) + failure_offset(4) + state_kind/depth/is_final/reserved(4)
//        + one_char/reserved_one(4) + edges_offset(4) + edge_count/reserved(4)
//        + patterns_offset(4) + pattern_count/reserved(4)
//        = 4+4+4+4+4+4+4+4 = 32 bytes ✓
285
/// AC Automaton edge (8 bytes, 4-byte aligned)
///
/// Represents a transition from one node to another on a specific character.
/// Used by SPARSE state encoding.
///
/// `#[repr(C)]`: one character byte, three explicit reserved bytes, then the
/// 4-byte target offset, so there is no implicit padding.
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct ACEdge {
    /// Input character (0-255)
    pub character: u8,

    /// Reserved for alignment (written as zero by `ACEdge::new`)
    pub reserved: [u8; 3],

    /// Offset to target node
    pub target_offset: u32,
}
302
/// Dense lookup table for states with many transitions (1024 bytes, 64-byte aligned)
///
/// Used by DENSE state encoding for O(1) transition lookup.
/// Each entry is a target node offset (0 = no transition).
///
/// **Cache-line alignment**: The 64-byte alignment ensures this structure starts on a
/// cache line boundary, preventing cache line splits and improving memory access performance
/// by 5-15% for dense state lookups. The structure size remains 1024 bytes; only the
/// placement in memory changes (average 32 bytes padding per instance).
///
/// Unlike the other format structs in this module, this type derives only
/// `Debug`, `Clone` and `Copy` (no zerocopy traits).
#[repr(C, align(64))]
#[derive(Debug, Clone, Copy)]
pub struct DenseLookup {
    /// Target offsets indexed by character (0-255)
    /// 0 means no transition for that character
    pub targets: [u32; 256],
}
319
/// Pattern entry (16 bytes, 4-byte aligned)
///
/// Metadata about a single glob pattern in the database.
///
/// `#[repr(C)]` with explicit reserved bytes after `pattern_type`, so the
/// struct has no implicit padding.
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct PatternEntry {
    /// Pattern ID (matches IDs used in AC automaton)
    pub pattern_id: u32,

    /// Pattern type: 0=Literal, 1=Glob
    pub pattern_type: u8,

    /// Reserved for alignment (written as zero by `PatternEntry::new`)
    pub reserved: [u8; 3],

    /// Offset to pattern string (null-terminated UTF-8)
    pub pattern_string_offset: u32,

    /// Length of pattern string (not including null)
    pub pattern_string_length: u32,
}
341
/// Meta-word to pattern mapping (12 bytes, 4-byte aligned)
///
/// Maps a meta-word (literal segment from AC automaton) to all patterns
/// that contain it. Used for hybrid AC + glob matching.
///
/// Three consecutive `u32` fields under `#[repr(C)]`; no padding.
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct MetaWordMapping {
    /// Meta-word string offset
    pub meta_word_offset: u32,

    /// Offset to array of pattern IDs (u32[])
    pub pattern_ids_offset: u32,

    /// Number of patterns containing this meta-word
    /// (presumably the element count of the array at `pattern_ids_offset` - confirm against the writer)
    pub pattern_count: u32,
}
358
/// Single wildcard entry (8 bytes, 4-byte aligned)
///
/// Represents a pattern with only wildcards (*, ?) and no literals.
/// These must be checked separately since they don't have AC matches.
///
/// Two consecutive `u32` fields under `#[repr(C)]`; no padding.
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct SingleWildcard {
    /// Pattern ID
    pub pattern_id: u32,

    /// Offset to pattern string
    pub pattern_string_offset: u32,
}
372
/// Pattern-to-data mapping entry (12 bytes, 4-byte aligned)
///
/// Maps a pattern ID to associated data. Used in v2 format.
/// The data can be inline (stored in data section) or external
/// (reference to MMDB data section).
///
/// Entries of this type form the table located by the header's
/// `mapping_table_offset` / `mapping_count` fields.
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct PatternDataMapping {
    /// Pattern ID this mapping applies to
    pub pattern_id: u32,

    /// Offset to data in data section (or external offset)
    /// Interpretation depends on data_flags in header
    pub data_offset: u32,

    /// Size of data in bytes (0 = use data section's size encoding)
    pub data_size: u32,
}
391
/// Glob segment index entry (8 bytes, 4-byte aligned)
///
/// Points to the glob segment data for a specific pattern.
/// One entry exists for each pattern in the database.
///
/// The array of these entries is located by the header's
/// `glob_segments_offset` field.
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct GlobSegmentIndex {
    /// Offset to first GlobSegmentHeader for this pattern
    /// Relative to start of buffer
    pub first_segment_offset: u32,

    /// Number of segments in this pattern
    pub segment_count: u16,

    /// Reserved for alignment
    pub reserved: u16,
}
409
/// Glob segment header (12 bytes, 4-byte aligned)
///
/// Describes a single segment of a glob pattern (Literal, Star, Question, or CharClass).
/// Followed immediately by segment-specific data (string bytes or CharClassItem array).
///
/// NOTE(review): the sentence above says the data follows the header, while
/// `data_offset` stores an explicit buffer-relative location - confirm
/// against the serializer which of the two readers should rely on.
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct GlobSegmentHeader {
    /// Segment type:
    /// - 0: Literal(String)
    /// - 1: Star
    /// - 2: Question
    /// - 3: CharClass
    pub segment_type: u8,

    /// Flags (for CharClass: bit 0 = negated)
    pub flags: u8,

    /// Reserved for alignment
    pub reserved: u16,

    /// Length of associated data in bytes
    /// - Literal: string byte length
    /// - Star/Question: 0
    /// - CharClass: number of CharClassItem entries * 12
    ///   (12 is the size of one `CharClassItemEncoded`)
    pub data_len: u32,

    /// Offset to associated data (relative to start of buffer)
    /// - Literal: offset to UTF-8 string bytes
    /// - Star/Question: unused (0)
    /// - CharClass: offset to CharClassItemEncoded array
    pub data_offset: u32,
}
442
/// Encoded character class item (12 bytes, 4-byte aligned)
///
/// Represents either a single character or a character range in a glob character class.
///
/// `#[repr(C)]`: one tag byte, three explicit reserved bytes, then two `u32`
/// values, so there is no implicit padding.
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct CharClassItemEncoded {
    /// Item type:
    /// - 0: Char(char1)
    /// - 1: Range(char1, char2)
    pub item_type: u8,

    /// Reserved for alignment
    pub reserved: [u8; 3],

    /// First character (or only character for Char variant)
    /// (presumably a Unicode scalar value stored as `u32` - confirm against the encoder)
    pub char1: u32,

    /// Second character (for Range variant only, 0 for Char)
    pub char2: u32,
}
463
// Compile-time size assertions to ensure struct layout.
// Each `const _` is evaluated during compilation, so a layout change that
// alters a size (or DenseLookup's alignment) fails the build.
const _: () = assert!(mem::size_of::<ParaglobHeader>() == 112); // v5: 8-byte magic + 26 four-byte words (25 u32 fields + endianness/reserved bytes)
const _: () = assert!(mem::size_of::<ACNodeHot>() == 20); // With one_target field: 4 + 4*4 = 20 bytes
const _: () = assert!(mem::size_of::<ACNode>() == 32); // Legacy: 2 per cache line
const _: () = assert!(mem::size_of::<ACEdge>() == 8);
const _: () = assert!(mem::size_of::<DenseLookup>() == 1024); // 256 * 4 bytes
const _: () = assert!(mem::align_of::<DenseLookup>() == 64); // Cache-line alignment for performance
const _: () = assert!(mem::size_of::<PatternEntry>() == 16);
const _: () = assert!(mem::size_of::<MetaWordMapping>() == 12);
const _: () = assert!(mem::size_of::<SingleWildcard>() == 8);
const _: () = assert!(mem::size_of::<PatternDataMapping>() == 12);
const _: () = assert!(mem::size_of::<GlobSegmentIndex>() == 8);
const _: () = assert!(mem::size_of::<GlobSegmentHeader>() == 12);
const _: () = assert!(mem::size_of::<CharClassItemEncoded>() == 12);
478
479impl Default for ParaglobHeader {
480    fn default() -> Self {
481        Self::new()
482    }
483}
484
485impl PatternDataMapping {
486    /// Create a new pattern-to-data mapping
487    #[must_use]
488    pub fn new(pattern_id: u32, data_offset: u32, data_size: u32) -> Self {
489        Self {
490            pattern_id,
491            data_offset,
492            data_size,
493        }
494    }
495}
496
497impl ParaglobHeader {
498    /// Create a new v3 header with magic and version
499    #[must_use]
500    pub fn new() -> Self {
501        Self {
502            magic: *MAGIC,
503            version: MATCHY_FORMAT_VERSION,
504            match_mode: 0,
505            ac_node_count: 0,
506            ac_nodes_offset: 0,
507            ac_edges_size: 0,
508            ac_patterns_size: 0,
509            pattern_count: 0,
510            patterns_offset: 0,
511            pattern_strings_offset: 0,
512            pattern_strings_size: 0,
513            meta_word_mapping_count: 0,
514            meta_word_mappings_offset: 0,
515            pattern_refs_size: 0,
516            wildcard_count: 0,
517            total_buffer_size: 0,
518            endianness: 0x01, // Little-endian marker (reserved for future use)
519            reserved: [0; 3],
520            // v2 fields
521            data_section_offset: 0,
522            data_section_size: 0,
523            mapping_table_offset: 0,
524            mapping_count: 0,
525            data_flags: 0,
526            reserved_v2: 0,
527            // v3 fields
528            ac_literal_map_offset: 0,
529            ac_literal_map_count: 0,
530            // v5 fields
531            glob_segments_offset: 0,
532            glob_segments_size: 0,
533        }
534    }
535
536    /// Validate header magic and version
537    pub fn validate(&self) -> Result<(), &'static str> {
538        if &self.magic != MAGIC {
539            return Err("Invalid magic bytes");
540        }
541        if self.version != MATCHY_FORMAT_VERSION {
542            return Err("Unsupported version - only v5 format supported");
543        }
544        Ok(())
545    }
546
547    /// Validate that all header offsets are within buffer bounds
548    #[allow(dead_code)] // Reserved for validation feature
549    pub fn validate_offsets(&self, buffer_len: usize) -> Result<(), &'static str> {
550        // Validate AC literal mapping offset if present
551        if self.has_ac_literal_mapping() {
552            let offset = self.ac_literal_map_offset as usize;
553            if offset >= buffer_len {
554                return Err("AC literal map offset out of bounds");
555            }
556        }
557
558        // Validate data section if present
559        if self.has_data_section() {
560            let start = self.data_section_offset as usize;
561            let size = self.data_section_size as usize;
562            if start.checked_add(size).is_none_or(|end| end > buffer_len) {
563                return Err("Data section out of bounds");
564            }
565        }
566
567        // Validate mapping table if present
568        if self.mapping_count > 0 {
569            let offset = self.mapping_table_offset as usize;
570            if offset >= buffer_len {
571                return Err("Mapping table offset out of bounds");
572            }
573        }
574
575        // Validate AC nodes section
576        if self.ac_node_count > 0 {
577            let offset = self.ac_nodes_offset as usize;
578            let size = (self.ac_node_count as usize) * mem::size_of::<ACNode>();
579            if offset.checked_add(size).is_none_or(|end| end > buffer_len) {
580                return Err("AC nodes section out of bounds");
581            }
582        }
583
584        // Validate patterns section
585        if self.pattern_count > 0 {
586            let offset = self.patterns_offset as usize;
587            let size = (self.pattern_count as usize) * mem::size_of::<PatternEntry>();
588            if offset.checked_add(size).is_none_or(|end| end > buffer_len) {
589                return Err("Patterns section out of bounds");
590            }
591        }
592
593        // Validate pattern strings section
594        if self.pattern_strings_size > 0 {
595            let start = self.pattern_strings_offset as usize;
596            let size = self.pattern_strings_size as usize;
597            if start.checked_add(size).is_none_or(|end| end > buffer_len) {
598                return Err("Pattern strings section out of bounds");
599            }
600        }
601
602        // Validate meta-word mappings
603        if self.meta_word_mapping_count > 0 {
604            let offset = self.meta_word_mappings_offset as usize;
605            let size = (self.meta_word_mapping_count as usize) * mem::size_of::<MetaWordMapping>();
606            if offset.checked_add(size).is_none_or(|end| end > buffer_len) {
607                return Err("Meta-word mappings section out of bounds");
608            }
609        }
610
611        Ok(())
612    }
613
614    /// Check if this file has a data section
615    #[must_use]
616    pub fn has_data_section(&self) -> bool {
617        self.data_section_size > 0
618    }
619
620    /// Check if this file has a pre-built AC literal mapping (v3+)
621    #[must_use]
622    pub fn has_ac_literal_mapping(&self) -> bool {
623        self.ac_literal_map_count > 0 && self.ac_literal_map_offset > 0
624    }
625
626    /// Check if data is inline (true) or external references (false)
627    #[allow(dead_code)] // Reserved for future use
628    #[must_use]
629    pub fn has_inline_data(&self) -> bool {
630        (self.data_flags & 0x1) != 0
631    }
632
633    /// Check if this file has pre-built glob segments (v5+)
634    #[allow(dead_code)] // Reserved for v5 format implementation
635    #[must_use]
636    pub fn has_glob_segments(&self) -> bool {
637        self.glob_segments_size > 0 && self.glob_segments_offset > 0
638    }
639}
640
641impl ACNode {
642    /// Create a new node with default EMPTY encoding
643    #[allow(dead_code)]
644    #[must_use]
645    pub fn new(node_id: u32, depth: u8) -> Self {
646        Self {
647            node_id,
648            failure_offset: 0,
649            state_kind: StateKind::Empty as u8,
650            depth,
651            is_final: 0,
652            reserved_flags: 0,
653            one_char: 0,
654            reserved_one: [0; 3],
655            edges_offset: 0,
656            edge_count: 0,
657            reserved_edge: 0,
658            patterns_offset: 0,
659            pattern_count: 0,
660            reserved_pattern: 0,
661        }
662    }
663}
664
665impl ACEdge {
666    /// Create a new edge
667    #[allow(dead_code)] // Used by builder code in other crates
668    #[must_use]
669    pub fn new(character: u8, target_offset: u32) -> Self {
670        Self {
671            character,
672            reserved: [0; 3],
673            target_offset,
674        }
675    }
676}
677
678impl PatternEntry {
679    /// Create a new pattern entry
680    #[must_use]
681    pub fn new(pattern_id: u32, pattern_type: u8) -> Self {
682        Self {
683            pattern_id,
684            pattern_type,
685            reserved: [0; 3],
686            pattern_string_offset: 0,
687            pattern_string_length: 0,
688        }
689    }
690}
691
/// Helper to read a struct by value from a byte buffer at an offset
///
/// The value is copied out with an unaligned read, so `offset` does not need
/// to satisfy `T`'s alignment.
///
/// # Safety
///
/// Caller must ensure:
/// - offset + `size_of::<T>`() <= buffer.len() (only checked in debug builds)
/// - Bytes represent a valid T
#[allow(dead_code)]
#[must_use]
pub unsafe fn read_struct<T: Copy>(buffer: &[u8], offset: usize) -> T {
    // checked_add keeps the assertion itself from overflowing on huge offsets.
    debug_assert!(offset
        .checked_add(mem::size_of::<T>())
        .is_some_and(|end| end <= buffer.len()));
    // SAFETY: the caller guarantees `offset + size_of::<T>()` is in bounds and
    // that the bytes form a valid `T`; `read_unaligned` has no alignment
    // requirement.
    unsafe { buffer.as_ptr().add(offset).cast::<T>().read_unaligned() }
}
707
/// Helper to borrow a slice of structs directly from a byte buffer
///
/// No data is copied: the returned slice aliases `buffer`.
///
/// # Safety
///
/// Caller must ensure:
/// - offset + `size_of::<T>`() * count <= buffer.len()
/// - `buffer.as_ptr() + offset` is aligned for `T` (required by
///   `slice::from_raw_parts`; unlike `read_struct`, this is NOT an unaligned read)
/// - Buffer contains valid T values
///
/// The bounds and alignment requirements are checked in debug builds only.
#[allow(dead_code)]
#[must_use]
pub unsafe fn read_struct_slice<T: Copy>(buffer: &[u8], offset: usize, count: usize) -> &[T] {
    // Checked arithmetic keeps the assertion itself from overflowing.
    debug_assert!(mem::size_of::<T>()
        .checked_mul(count)
        .and_then(|bytes| offset.checked_add(bytes))
        .is_some_and(|end| end <= buffer.len()));
    // SAFETY: the caller guarantees `offset` is within the buffer.
    let ptr = unsafe { buffer.as_ptr().add(offset) }.cast::<T>();
    debug_assert!(ptr.is_aligned(), "read_struct_slice: misaligned offset for T");
    // SAFETY: the caller guarantees the range is in bounds, aligned for `T`,
    // and holds `count` valid `T` values; the lifetime is tied to `buffer`.
    unsafe { std::slice::from_raw_parts(ptr, count) }
}
722
/// Read a null-terminated UTF-8 string starting at `offset`.
///
/// The returned `&str` borrows from `buffer` and excludes the terminator.
///
/// # Errors
///
/// - `"Offset out of bounds"` if `offset` is at or past the end of the buffer
/// - `"String not null-terminated"` if no zero byte follows `offset`
/// - `"Invalid UTF-8"` if the bytes before the terminator are not valid UTF-8
pub fn read_cstring(buffer: &[u8], offset: usize) -> Result<&str, &'static str> {
    // `get(offset..)` also succeeds for offset == len, hence the emptiness guard.
    let tail = match buffer.get(offset..) {
        Some(rest) if !rest.is_empty() => rest,
        _ => return Err("Offset out of bounds"),
    };

    // Locate the terminator relative to the start of the string.
    let terminator = tail
        .iter()
        .position(|&byte| byte == 0)
        .ok_or("String not null-terminated")?;

    std::str::from_utf8(&tail[..terminator]).map_err(|_| "Invalid UTF-8")
}
745
/// Helper to read a UTF-8 string from buffer with known length (FAST PATH)
///
/// This is much faster than `read_cstring` because it doesn't scan for the null terminator.
/// Use this when you have the string length from PatternEntry.pattern_string_length.
///
/// # Errors
///
/// - `"Offset + length out of bounds"` if the range does not fit in `buffer`
///   (including when `offset + length` overflows `usize`)
/// - `"Invalid UTF-8"` if the bytes are not valid UTF-8
///
/// # Safety
///
/// The function is declared `unsafe` for API compatibility. Bounds and UTF-8
/// validity are both checked here and reported as errors, so the only thing
/// the caller has to get right is that `length` is the intended string length.
#[inline]
#[allow(dead_code)]
pub unsafe fn read_cstring_with_len(
    buffer: &[u8],
    offset: usize,
    length: usize,
) -> Result<&str, &'static str> {
    // checked_add + get: an overflowing or out-of-range request becomes an
    // error instead of a panic (debug) or a wrapped index (release).
    let bytes = offset
        .checked_add(length)
        .and_then(|end| buffer.get(offset..end))
        .ok_or("Offset + length out of bounds")?;

    // Direct slice without scanning for null terminator
    std::str::from_utf8(bytes).map_err(|_| "Invalid UTF-8")
}
771
/// Borrow `length` bytes at `offset` as `&str` without validating UTF-8
/// (ULTRA-FAST PATH).
///
/// Skips both the null-terminator scan and UTF-8 validation. Intended for hot
/// query paths where the strings were validated when the database was built.
/// The slice access itself is still bounds-checked and panics when the range
/// is outside `buffer`.
///
/// # Safety
///
/// Caller must ensure:
/// - offset + length <= buffer.len()
/// - Bytes are DEFINITELY valid UTF-8 (undefined behavior if not!)
/// - Length is correct
#[inline]
#[allow(dead_code)]
#[must_use]
pub unsafe fn read_str_unchecked(buffer: &[u8], offset: usize, length: usize) -> &str {
    let end = offset + length;
    debug_assert!(end <= buffer.len());
    let raw = &buffer[offset..end];
    // SAFETY: Caller guarantees valid UTF-8
    unsafe { std::str::from_utf8_unchecked(raw) }
}
791
/// Helper to read a UTF-8 string from buffer with known length (SAFE PATH - validates UTF-8)
///
/// This validates UTF-8 on every read. Use for untrusted databases.
/// Slower than `read_str_unchecked` but prevents undefined behavior.
///
/// # Errors
///
/// - `"Offset + length out of bounds"` if the range does not fit in `buffer`
///   (including when `offset + length` overflows `usize`)
/// - `"Invalid UTF-8"` if the bytes are not valid UTF-8
///
/// # Safety
///
/// The function is declared `unsafe` for API compatibility. Bounds and UTF-8
/// validity are both checked here and reported as errors, so the only thing
/// the caller has to get right is that `length` is the intended string length.
#[inline]
#[allow(dead_code)]
pub unsafe fn read_str_checked(
    buffer: &[u8],
    offset: usize,
    length: usize,
) -> Result<&str, &'static str> {
    // Untrusted offsets can make `offset + length` overflow; checked_add and
    // `get` turn that into an error instead of a panic or a wrapped index.
    let bytes = offset
        .checked_add(length)
        .and_then(|end| buffer.get(offset..end))
        .ok_or("Offset + length out of bounds")?;
    std::str::from_utf8(bytes).map_err(|_| "Invalid UTF-8")
}
816
#[cfg(test)]
mod tests {
    //! Layout, header-validation and reader-helper tests for the offset format.
    use super::*;

    #[test]
    fn test_header_size() {
        assert_eq!(mem::size_of::<ParaglobHeader>(), 112); // v5: 8-byte magic + 26 four-byte words
        assert_eq!(mem::align_of::<ParaglobHeader>(), 4);
    }

    #[test]
    fn test_node_size() {
        assert_eq!(mem::size_of::<ACNode>(), 32);
        assert_eq!(mem::align_of::<ACNode>(), 4);
    }

    #[test]
    fn test_edge_size() {
        assert_eq!(mem::size_of::<ACEdge>(), 8);
        assert_eq!(mem::align_of::<ACEdge>(), 4);
    }

    #[test]
    fn test_pattern_entry_size() {
        assert_eq!(mem::size_of::<PatternEntry>(), 16);
        assert_eq!(mem::align_of::<PatternEntry>(), 4);
    }

    #[test]
    fn test_header_validation() {
        let mut header = ParaglobHeader::new();
        assert!(header.validate().is_ok());
        assert_eq!(header.version, MATCHY_FORMAT_VERSION);

        header.magic = *b"INVALID!";
        assert!(header.validate().is_err());

        header.magic = *MAGIC;
        header.version = 999;
        assert!(header.validate().is_err());

        // Only the current version (v5) is valid; every older version is rejected.
        header.version = MATCHY_FORMAT_VERSION_V1;
        assert!(header.validate().is_err());

        header.version = MATCHY_FORMAT_VERSION_V2;
        assert!(header.validate().is_err());

        header.version = MATCHY_FORMAT_VERSION_V3;
        assert!(header.validate().is_err());

        header.version = MATCHY_FORMAT_VERSION_V4;
        assert!(header.validate().is_err());

        header.version = MATCHY_FORMAT_VERSION;
        assert!(header.validate().is_ok());
    }

    #[test]
    fn test_v3_features() {
        let mut header = ParaglobHeader::new();
        assert_eq!(header.version, MATCHY_FORMAT_VERSION);
        assert!(!header.has_data_section());
        assert!(!header.has_inline_data());
        assert!(!header.has_ac_literal_mapping());

        // Add data section
        header.data_section_size = 1024;
        assert!(header.has_data_section());

        // Set inline data flag
        header.data_flags = 0x1;
        assert!(header.has_inline_data());

        // Add AC literal mapping
        header.ac_literal_map_offset = 1000;
        header.ac_literal_map_count = 50;
        assert!(header.has_ac_literal_mapping());
    }

    #[test]
    fn test_read_struct() {
        let mut buffer = vec![0u8; 112]; // v5 header size
        let header = ParaglobHeader::new();

        // Write header to buffer
        // SAFETY: buffer is exactly 112 bytes (v5 header size) and
        // ParaglobHeader is #[repr(C)] with size 112. A Vec<u8> allocation is
        // only guaranteed 1-byte alignment, so the write must be unaligned.
        unsafe {
            let ptr = buffer.as_mut_ptr().cast::<ParaglobHeader>();
            ptr.write_unaligned(header);
        }

        // Read it back
        // SAFETY: buffer contains the valid ParaglobHeader bytes written above
        // and is large enough; read_struct performs an unaligned read.
        let read_header: ParaglobHeader = unsafe { read_struct(&buffer, 0) };
        assert_eq!(read_header.magic, *MAGIC);
        assert_eq!(read_header.version, MATCHY_FORMAT_VERSION);
        assert_eq!(read_header.version, 5);
    }

    #[test]
    fn test_read_cstring() {
        let buffer = b"hello\0world\0\0";

        let s1 = read_cstring(buffer, 0).unwrap();
        assert_eq!(s1, "hello");

        let s2 = read_cstring(buffer, 6).unwrap();
        assert_eq!(s2, "world");

        let s3 = read_cstring(buffer, 12).unwrap();
        assert_eq!(s3, "");
    }

    #[test]
    fn test_read_cstring_errors() {
        // Offset at or beyond the end of the buffer
        assert_eq!(read_cstring(b"abc\0", 4), Err("Offset out of bounds"));
        // No terminator after the offset
        assert_eq!(read_cstring(b"abc", 0), Err("String not null-terminated"));
        // Terminated but not UTF-8
        assert_eq!(read_cstring(&[0xff, 0], 0), Err("Invalid UTF-8"));
    }
}
matchy_paraglob/offset_format.rs

matchy_paraglob/
offset_format.rs