matchy_paraglob/offset_format.rs
1//! Offset-based binary format for zero-copy memory mapping
2//!
3//! This module defines the binary format used for serializing and loading
4//! Paraglob pattern matchers. The format uses byte offsets instead of pointers,
5//! allowing it to be memory-mapped and used directly without deserialization.
6//!
7//! # Format Overview
8//!
9//! The format consists of C-compatible packed structs that can be cast directly
10//! from bytes. All references use byte offsets from the start of the buffer.
11//!
12//! # Layout
13//!
14//! ```text
15//! [Header: ParaglobHeader (v5: 112 bytes)]
//! [AC Nodes: ACNodeHot array (v4+; legacy ACNode in v1-v3)]
17//! [AC Edges: ACEdge arrays (variable, referenced by nodes)]
18//! [AC Pattern IDs: u32 arrays (variable, referenced by nodes)]
19//! [Pattern Entries: PatternEntry array]
20//! [Pattern Strings: null-terminated UTF-8]
21//! [Meta-word mappings: MetaWordMapping array]
22//! [Pattern reference arrays: u32 arrays]
23//! [Single wildcards: SingleWildcard array]
24//! [Glob Segments: GlobSegmentIndex + segment data (v5+)]
25//! [Data section: optional (v2+)]
26//! [Data mappings: optional (v2+)]
27//! [AC Literal Mapping: optional (v3+)]
28//! ```
29//!
30//! # Design Principles
31//!
32//! 1. **Alignment**: All structs are properly aligned for direct casting
33//! 2. **Offsets**: All references use u32 byte offsets (4GB limit)
34//! 3. **Zero-copy**: Can read directly from mmap without parsing
//! 4. **Portability**: Little-endian u8/u16/u32 fields only (standard on x86/ARM)
36
37use std::mem;
38use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout};
39
/// Magic bytes identifying Paraglob binary format.
///
/// Stored in `ParaglobHeader::magic` and compared by `ParaglobHeader::validate`.
pub const MAGIC: &[u8; 8] = b"PARAGLOB";

/// Current format version (v5: serialized glob segments for zero-copy loading).
///
/// This is the only version `ParaglobHeader::validate` accepts.
pub const MATCHY_FORMAT_VERSION: u32 = 5;

/// Previous format version (v4: uses the 20-byte ACNodeHot instead of the 32-byte ACNode)
#[allow(dead_code)] // Kept for reference and potential migration code
pub const MATCHY_FORMAT_VERSION_V4: u32 = 4;

/// Previous format version (v3: adds AC literal mapping for zero-copy loading)
#[allow(dead_code)] // Kept for reference and potential migration code
pub const MATCHY_FORMAT_VERSION_V3: u32 = 3;

/// Previous format version (v2: adds data section support)
#[allow(dead_code)] // Kept for reference and potential migration code
pub const MATCHY_FORMAT_VERSION_V2: u32 = 2;

/// Previous format version (v1: patterns only, no data)
#[allow(dead_code)] // Kept for reference and potential migration code
pub const MATCHY_FORMAT_VERSION_V1: u32 = 1;
61
/// Main header for serialized Paraglob database (112 bytes, 4-byte aligned)
///
/// This header appears at the start of every serialized Paraglob file.
/// All offsets are relative to the start of the buffer.
///
/// The struct is `#[repr(C)]` and its size is pinned to 112 bytes by a
/// compile-time assertion in this module, so fields must never be reordered,
/// resized, or removed; new fields may only be appended together with a
/// version bump.
///
/// # Version History
/// - v1 (72 bytes): Original format, patterns only
/// - v2 (96 bytes): Adds data section support for pattern-associated data
/// - v3 (104 bytes): Adds AC literal mapping for O(1) zero-copy loading
/// - v4 (104 bytes): Uses ACNodeHot (20-byte) instead of ACNode (32-byte) - BREAKING
/// - v5 (112 bytes): Adds serialized glob segments for zero-copy loading
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct ParaglobHeader {
    /// Magic bytes: "PARAGLOB"
    pub magic: [u8; 8],

    /// Format version (currently 5)
    pub version: u32,

    /// Match mode: 0=CaseSensitive, 1=CaseInsensitive
    pub match_mode: u32,

    // ===== AC automaton section =====
    /// Number of nodes in the AC trie
    pub ac_node_count: u32,

    /// Offset to first AC node
    pub ac_nodes_offset: u32,

    /// Total size of AC edges data (bytes)
    pub ac_edges_size: u32,

    /// Total size of AC pattern ID arrays (bytes)
    pub ac_patterns_size: u32,

    // ===== Pattern section =====
    /// Total number of original glob patterns
    pub pattern_count: u32,

    /// Offset to pattern entry array
    pub patterns_offset: u32,

    /// Offset to pattern strings area
    pub pattern_strings_offset: u32,

    /// Total size of pattern strings (bytes)
    pub pattern_strings_size: u32,

    // ===== Meta-word mapping section =====
    /// Number of meta-word to pattern mappings
    pub meta_word_mapping_count: u32,

    /// Offset to meta-word mapping array
    pub meta_word_mappings_offset: u32,

    /// Total size of pattern reference arrays (bytes)
    pub pattern_refs_size: u32,

    /// Number of pure wildcard patterns (no literals)
    pub wildcard_count: u32,

    /// Total size of the entire serialized buffer (bytes)
    pub total_buffer_size: u32,

    /// Endianness marker: 0x01=little-endian, 0x02=big-endian, 0x00=legacy (assume little-endian)
    /// Database is always stored in little-endian format.
    /// This field indicates the endianness of the system that created the file.
    /// On big-endian systems, all multi-byte values are byte-swapped on read.
    ///
    /// `ParaglobHeader::new` always writes 0x01 here.
    // NOTE(review): no byte-swapping code is visible in this module; confirm
    // where (or whether) the big-endian read path is implemented.
    pub endianness: u8,

    /// Reserved for future use (written as zeros by `ParaglobHeader::new`)
    pub reserved: [u8; 3],

    // ===== v2 ADDITIONS (24 bytes) =====
    /// Offset to data section (0 = no data section)
    /// Points to MMDB-encoded data or other serialized data
    pub data_section_offset: u32,

    /// Size of data section in bytes (0 = no data)
    ///
    /// `has_data_section` is driven by this field alone.
    pub data_section_size: u32,

    /// Offset to pattern→data mapping table (0 = no mappings)
    /// Each mapping is a PatternDataMapping struct
    pub mapping_table_offset: u32,

    /// Number of pattern→data mappings
    /// Should equal pattern_count if all patterns have data
    pub mapping_count: u32,

    /// Data type flags:
    /// - Bit 0: inline data (1) vs external references (0)
    /// - Bit 1-31: reserved
    pub data_flags: u32,

    /// Reserved for future v2+ features
    pub reserved_v2: u32,

    // ===== v3 ADDITIONS (8 bytes) =====
    /// Offset to AC literal→pattern mapping table (0 = no mapping, requires reconstruction)
    /// Points to serialized `HashMap<u32, Vec<u32>>` for instant loading
    /// Format: `[entry_count: u32]` followed by entries of:
    /// `[literal_id: u32][pattern_count: u32][pattern_id: u32, ...]`
    pub ac_literal_map_offset: u32,

    /// Number of entries in AC literal mapping table
    /// 0 = v1/v2 file, requires reconstruct_literal_mapping()
    pub ac_literal_map_count: u32,

    // ===== v5 ADDITIONS (8 bytes) =====
    /// Offset to glob segment index (0 = no segments, use lazy parsing)
    /// Points to array of GlobSegmentIndex structs (one per pattern)
    pub glob_segments_offset: u32,

    /// Total size of glob segment data (index + segment structures + string data)
    pub glob_segments_size: u32,
}
179
/// How an AC automaton node stores its outgoing transitions.
///
/// The discriminant is the byte written to the `state_kind` field of a
/// serialized node; the encoding picked for a node depends on how many
/// transitions it has.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StateKind {
    /// No transitions (terminal state only)
    Empty = 0,
    /// Single transition - stored inline in node (75-80% of states)
    One = 1,
    /// 2-8 transitions - sparse edge array (10-15% of states)
    Sparse = 2,
    /// 9+ transitions - dense lookup table (2-5% of states)
    Dense = 3,
}

impl StateKind {
    /// Decode a serialized `state_kind` byte.
    ///
    /// Returns `None` for any byte that is not one of the four defined
    /// discriminants (0-3).
    #[inline(always)]
    #[must_use]
    pub const fn from_u8(value: u8) -> Option<Self> {
        match value {
            0 => Some(Self::Empty),
            1 => Some(Self::One),
            2 => Some(Self::Sparse),
            3 => Some(Self::Dense),
            _ => None,
        }
    }
}
214
215// Re-export ACNodeHot from matchy-ac
216pub use matchy_ac::ACNodeHot;
217
/// AC Automaton node (32 bytes, 4-byte aligned) - DEPRECATED
///
/// Legacy 32-byte node structure. Kept for backward compatibility with old file formats.
/// New code should use ACNodeHot (20 bytes) for better cache performance.
///
/// Represents a single node in the Aho-Corasick trie with state-specific encoding.
/// All child references are stored as offsets to allow zero-copy loading.
///
/// # State Encoding
///
/// The node uses different encodings based on transition count:
/// - **Empty** (0 transitions): No additional data needed
/// - **One** (1 transition): Character and target stored inline (no indirection!)
/// - **Sparse** (2-8 transitions): Offset to edge array, linear search
/// - **Dense** (9+ transitions): Offset to 256-entry lookup table, O(1) access
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct ACNode {
    /// Unique node ID
    pub node_id: u32,

    /// Offset to failure link node (0 = root)
    pub failure_offset: u32,

    /// State encoding type (raw `StateKind` discriminant; decode with `StateKind::from_u8`)
    pub state_kind: u8,

    /// Depth from root node
    pub depth: u8,

    /// Is this a terminal/final state? (1=yes, 0=no)
    pub is_final: u8,

    /// Reserved for future flags
    pub reserved_flags: u8,

    /// ONE encoding: character for single transition
    pub one_char: u8,

    /// Reserved for alignment
    pub reserved_one: [u8; 3],

    /// Transition target, interpreted according to `state_kind` (4 bytes)
    /// - SPARSE: offset to ACEdge array
    /// - DENSE: offset to DenseLookup table
    /// - ONE: target offset for single transition
    pub edges_offset: u32,

    /// Number of edges (SPARSE/DENSE states only)
    pub edge_count: u16,

    /// Reserved for alignment
    pub reserved_edge: u16,

    /// Offset to pattern ID array
    pub patterns_offset: u32,

    /// Number of pattern IDs at this node
    pub pattern_count: u16,

    /// Reserved for alignment
    pub reserved_pattern: u16,
}
// Total: node_id(4) + failure_offset(4) + state_kind/depth/is_final/reserved(4)
//      + one_char/reserved_one(4) + edges_offset(4) + edge_count/reserved(4)
//      + patterns_offset(4) + pattern_count/reserved(4)
//      = 4+4+4+4+4+4+4+4 = 32 bytes ✓
// The widest field is u32, so the struct's alignment is 4 bytes.
285
/// AC Automaton edge (8 bytes, 4-byte aligned)
///
/// Represents a transition from one node to another on a specific character.
/// Used by SPARSE state encoding.
///
/// The explicit `reserved` bytes occupy what would otherwise be compiler
/// padding between `character` and `target_offset`, so every byte of the
/// struct is a named field.
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct ACEdge {
    /// Input character (0-255)
    pub character: u8,

    /// Reserved for alignment (written as zeros by `ACEdge::new`)
    pub reserved: [u8; 3],

    /// Offset to target node
    pub target_offset: u32,
}
302
/// Dense lookup table for states with many transitions (1024 bytes, 64-byte aligned)
///
/// Used by DENSE state encoding for O(1) transition lookup.
/// Each entry is a target node offset (0 = no transition).
///
/// **Cache-line alignment**: The 64-byte alignment ensures this structure starts on a
/// cache line boundary, preventing cache line splits and improving memory access performance
/// by 5-15% for dense state lookups. The structure size remains 1024 bytes; only the
/// placement in memory changes (average 32 bytes padding per instance).
///
/// Both the size and the alignment are pinned by compile-time assertions in
/// this module. Unlike the other format structs here, this one does not
/// derive the zerocopy traits.
#[repr(C, align(64))]
#[derive(Debug, Clone, Copy)]
pub struct DenseLookup {
    /// Target offsets indexed by character (0-255)
    /// 0 means no transition for that character
    pub targets: [u32; 256],
}
319
/// Pattern entry (16 bytes, 4-byte aligned)
///
/// Metadata about a single glob pattern in the database.
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct PatternEntry {
    /// Pattern ID (matches IDs used in AC automaton)
    pub pattern_id: u32,

    /// Pattern type: 0=Literal, 1=Glob
    pub pattern_type: u8,

    /// Reserved for alignment (written as zeros by `PatternEntry::new`)
    pub reserved: [u8; 3],

    /// Offset to pattern string (null-terminated UTF-8)
    pub pattern_string_offset: u32,

    /// Length of pattern string in bytes (not including null)
    pub pattern_string_length: u32,
}
341
/// Meta-word to pattern mapping (12 bytes, 4-byte aligned)
///
/// Maps a meta-word (literal segment from AC automaton) to all patterns
/// that contain it. Used for hybrid AC + glob matching.
///
/// The header's `meta_word_mapping_count` / `meta_word_mappings_offset`
/// fields locate the array of these entries.
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct MetaWordMapping {
    /// Meta-word string offset
    pub meta_word_offset: u32,

    /// Offset to array of pattern IDs (u32[])
    pub pattern_ids_offset: u32,

    /// Number of patterns containing this meta-word
    /// (i.e. the length of the array at `pattern_ids_offset`)
    pub pattern_count: u32,
}
358
/// Single wildcard entry (8 bytes, 4-byte aligned)
///
/// Represents a pattern with only wildcards (*, ?) and no literals.
/// These must be checked separately since they don't have AC matches.
///
/// The header's `wildcard_count` field gives the number of such patterns.
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct SingleWildcard {
    /// Pattern ID
    pub pattern_id: u32,

    /// Offset to pattern string
    pub pattern_string_offset: u32,
}
372
/// Pattern-to-data mapping entry (12 bytes, 4-byte aligned)
///
/// Maps a pattern ID to associated data. Introduced with the v2 format.
/// The data can be inline (stored in data section) or external
/// (reference to MMDB data section).
///
/// The header's `mapping_table_offset` / `mapping_count` fields locate the
/// table of these entries.
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct PatternDataMapping {
    /// Pattern ID this mapping applies to
    pub pattern_id: u32,

    /// Offset to data in data section (or external offset)
    /// Interpretation depends on data_flags in header
    pub data_offset: u32,

    /// Size of data in bytes (0 = use data section's size encoding)
    pub data_size: u32,
}
391
/// Glob segment index entry (8 bytes, 4-byte aligned)
///
/// Points to the glob segment data for a specific pattern.
/// One entry exists for each pattern in the database.
///
/// The header's `glob_segments_offset` field points at the array of these
/// entries (v5+).
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct GlobSegmentIndex {
    /// Offset to first GlobSegmentHeader for this pattern
    /// Relative to start of buffer
    pub first_segment_offset: u32,

    /// Number of segments in this pattern
    pub segment_count: u16,

    /// Reserved for alignment
    pub reserved: u16,
}
409
/// Glob segment header (12 bytes, 4-byte aligned)
///
/// Describes a single segment of a glob pattern (Literal, Star, Question, or CharClass).
/// Followed immediately by segment-specific data (string bytes or CharClassItem array).
// NOTE(review): the sentence above says the data follows the header directly,
// while `data_offset` below is an explicit buffer-relative offset. Confirm
// against the serializer which of the two readers should rely on.
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct GlobSegmentHeader {
    /// Segment type:
    /// - 0: Literal(String)
    /// - 1: Star
    /// - 2: Question
    /// - 3: CharClass
    pub segment_type: u8,

    /// Flags (for CharClass: bit 0 = negated)
    pub flags: u8,

    /// Reserved for alignment
    pub reserved: u16,

    /// Length of associated data in bytes
    /// - Literal: string byte length
    /// - Star/Question: 0
    /// - CharClass: number of CharClassItem entries * 12
    ///   (12 is the size of one CharClassItemEncoded)
    pub data_len: u32,

    /// Offset to associated data (relative to start of buffer)
    /// - Literal: offset to UTF-8 string bytes
    /// - Star/Question: unused (0)
    /// - CharClass: offset to CharClassItemEncoded array
    pub data_offset: u32,
}
442
/// Encoded character class item (12 bytes, 4-byte aligned)
///
/// Represents either a single character or a character range in a glob character class.
/// Arrays of these are referenced by a CharClass `GlobSegmentHeader`.
#[repr(C)]
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)]
pub struct CharClassItemEncoded {
    /// Item type:
    /// - 0: Char(char1)
    /// - 1: Range(char1, char2)
    pub item_type: u8,

    /// Reserved for alignment
    pub reserved: [u8; 3],

    /// First character (or only character for Char variant)
    // NOTE(review): presumably a Unicode scalar value stored as u32 — confirm
    // against the encoder before converting with `char::from_u32`.
    pub char1: u32,

    /// Second character (for Range variant only, 0 for Char)
    pub char2: u32,
}
463
// Compile-time size assertions to ensure struct layout.
//
// These structs are cast directly from serialized bytes, so any accidental
// change to a field (type, order, added padding) must fail the build rather
// than silently change the on-disk format.
const _: () = assert!(mem::size_of::<ParaglobHeader>() == 112); // v5: 8-byte magic + 26 * u32 fields
const _: () = assert!(mem::size_of::<ACNodeHot>() == 20); // With one_target field: 4 + 4*4 = 20 bytes
const _: () = assert!(mem::size_of::<ACNode>() == 32); // Legacy: 2 per cache line
const _: () = assert!(mem::size_of::<ACEdge>() == 8);
const _: () = assert!(mem::size_of::<DenseLookup>() == 1024); // 256 * 4 bytes
const _: () = assert!(mem::align_of::<DenseLookup>() == 64); // Cache-line alignment for performance
const _: () = assert!(mem::size_of::<PatternEntry>() == 16);
const _: () = assert!(mem::size_of::<MetaWordMapping>() == 12);
const _: () = assert!(mem::size_of::<SingleWildcard>() == 8);
const _: () = assert!(mem::size_of::<PatternDataMapping>() == 12);
const _: () = assert!(mem::size_of::<GlobSegmentIndex>() == 8);
const _: () = assert!(mem::size_of::<GlobSegmentHeader>() == 12);
const _: () = assert!(mem::size_of::<CharClassItemEncoded>() == 12); // data_len for CharClass segments is a multiple of this
478
479impl Default for ParaglobHeader {
480 fn default() -> Self {
481 Self::new()
482 }
483}
484
485impl PatternDataMapping {
486 /// Create a new pattern-to-data mapping
487 #[must_use]
488 pub fn new(pattern_id: u32, data_offset: u32, data_size: u32) -> Self {
489 Self {
490 pattern_id,
491 data_offset,
492 data_size,
493 }
494 }
495}
496
497impl ParaglobHeader {
498 /// Create a new v3 header with magic and version
499 #[must_use]
500 pub fn new() -> Self {
501 Self {
502 magic: *MAGIC,
503 version: MATCHY_FORMAT_VERSION,
504 match_mode: 0,
505 ac_node_count: 0,
506 ac_nodes_offset: 0,
507 ac_edges_size: 0,
508 ac_patterns_size: 0,
509 pattern_count: 0,
510 patterns_offset: 0,
511 pattern_strings_offset: 0,
512 pattern_strings_size: 0,
513 meta_word_mapping_count: 0,
514 meta_word_mappings_offset: 0,
515 pattern_refs_size: 0,
516 wildcard_count: 0,
517 total_buffer_size: 0,
518 endianness: 0x01, // Little-endian marker (reserved for future use)
519 reserved: [0; 3],
520 // v2 fields
521 data_section_offset: 0,
522 data_section_size: 0,
523 mapping_table_offset: 0,
524 mapping_count: 0,
525 data_flags: 0,
526 reserved_v2: 0,
527 // v3 fields
528 ac_literal_map_offset: 0,
529 ac_literal_map_count: 0,
530 // v5 fields
531 glob_segments_offset: 0,
532 glob_segments_size: 0,
533 }
534 }
535
536 /// Validate header magic and version
537 pub fn validate(&self) -> Result<(), &'static str> {
538 if &self.magic != MAGIC {
539 return Err("Invalid magic bytes");
540 }
541 if self.version != MATCHY_FORMAT_VERSION {
542 return Err("Unsupported version - only v5 format supported");
543 }
544 Ok(())
545 }
546
547 /// Validate that all header offsets are within buffer bounds
548 #[allow(dead_code)] // Reserved for validation feature
549 pub fn validate_offsets(&self, buffer_len: usize) -> Result<(), &'static str> {
550 // Validate AC literal mapping offset if present
551 if self.has_ac_literal_mapping() {
552 let offset = self.ac_literal_map_offset as usize;
553 if offset >= buffer_len {
554 return Err("AC literal map offset out of bounds");
555 }
556 }
557
558 // Validate data section if present
559 if self.has_data_section() {
560 let start = self.data_section_offset as usize;
561 let size = self.data_section_size as usize;
562 if start.checked_add(size).is_none_or(|end| end > buffer_len) {
563 return Err("Data section out of bounds");
564 }
565 }
566
567 // Validate mapping table if present
568 if self.mapping_count > 0 {
569 let offset = self.mapping_table_offset as usize;
570 if offset >= buffer_len {
571 return Err("Mapping table offset out of bounds");
572 }
573 }
574
575 // Validate AC nodes section
576 if self.ac_node_count > 0 {
577 let offset = self.ac_nodes_offset as usize;
578 let size = (self.ac_node_count as usize) * mem::size_of::<ACNode>();
579 if offset.checked_add(size).is_none_or(|end| end > buffer_len) {
580 return Err("AC nodes section out of bounds");
581 }
582 }
583
584 // Validate patterns section
585 if self.pattern_count > 0 {
586 let offset = self.patterns_offset as usize;
587 let size = (self.pattern_count as usize) * mem::size_of::<PatternEntry>();
588 if offset.checked_add(size).is_none_or(|end| end > buffer_len) {
589 return Err("Patterns section out of bounds");
590 }
591 }
592
593 // Validate pattern strings section
594 if self.pattern_strings_size > 0 {
595 let start = self.pattern_strings_offset as usize;
596 let size = self.pattern_strings_size as usize;
597 if start.checked_add(size).is_none_or(|end| end > buffer_len) {
598 return Err("Pattern strings section out of bounds");
599 }
600 }
601
602 // Validate meta-word mappings
603 if self.meta_word_mapping_count > 0 {
604 let offset = self.meta_word_mappings_offset as usize;
605 let size = (self.meta_word_mapping_count as usize) * mem::size_of::<MetaWordMapping>();
606 if offset.checked_add(size).is_none_or(|end| end > buffer_len) {
607 return Err("Meta-word mappings section out of bounds");
608 }
609 }
610
611 Ok(())
612 }
613
614 /// Check if this file has a data section
615 #[must_use]
616 pub fn has_data_section(&self) -> bool {
617 self.data_section_size > 0
618 }
619
620 /// Check if this file has a pre-built AC literal mapping (v3+)
621 #[must_use]
622 pub fn has_ac_literal_mapping(&self) -> bool {
623 self.ac_literal_map_count > 0 && self.ac_literal_map_offset > 0
624 }
625
626 /// Check if data is inline (true) or external references (false)
627 #[allow(dead_code)] // Reserved for future use
628 #[must_use]
629 pub fn has_inline_data(&self) -> bool {
630 (self.data_flags & 0x1) != 0
631 }
632
633 /// Check if this file has pre-built glob segments (v5+)
634 #[allow(dead_code)] // Reserved for v5 format implementation
635 #[must_use]
636 pub fn has_glob_segments(&self) -> bool {
637 self.glob_segments_size > 0 && self.glob_segments_offset > 0
638 }
639}
640
641impl ACNode {
642 /// Create a new node with default EMPTY encoding
643 #[allow(dead_code)]
644 #[must_use]
645 pub fn new(node_id: u32, depth: u8) -> Self {
646 Self {
647 node_id,
648 failure_offset: 0,
649 state_kind: StateKind::Empty as u8,
650 depth,
651 is_final: 0,
652 reserved_flags: 0,
653 one_char: 0,
654 reserved_one: [0; 3],
655 edges_offset: 0,
656 edge_count: 0,
657 reserved_edge: 0,
658 patterns_offset: 0,
659 pattern_count: 0,
660 reserved_pattern: 0,
661 }
662 }
663}
664
665impl ACEdge {
666 /// Create a new edge
667 #[allow(dead_code)] // Used by builder code in other crates
668 #[must_use]
669 pub fn new(character: u8, target_offset: u32) -> Self {
670 Self {
671 character,
672 reserved: [0; 3],
673 target_offset,
674 }
675 }
676}
677
678impl PatternEntry {
679 /// Create a new pattern entry
680 #[must_use]
681 pub fn new(pattern_id: u32, pattern_type: u8) -> Self {
682 Self {
683 pattern_id,
684 pattern_type,
685 reserved: [0; 3],
686 pattern_string_offset: 0,
687 pattern_string_length: 0,
688 }
689 }
690}
691
/// Read a `T` by value from `buffer` at byte `offset`.
///
/// The value is copied out with an unaligned read, so `offset` does not need
/// to be a multiple of `align_of::<T>()`.
///
/// # Safety
///
/// Caller must ensure:
/// - offset + `size_of::<T>`() <= buffer.len() (only checked in debug builds)
/// - Bytes in that range represent a valid T
#[allow(dead_code)]
#[must_use]
pub unsafe fn read_struct<T: Copy>(buffer: &[u8], offset: usize) -> T {
    // Checked addition so a huge `offset` cannot wrap past the bound.
    debug_assert!(
        matches!(offset.checked_add(mem::size_of::<T>()), Some(end) if end <= buffer.len()),
        "read_struct: range out of bounds"
    );
    // SAFETY: the caller guarantees the range `offset..offset + size_of::<T>()`
    // is inside `buffer` and holds a valid `T`; `read_unaligned` imposes no
    // alignment requirement on the source pointer.
    unsafe { buffer.as_ptr().add(offset).cast::<T>().read_unaligned() }
}
707
/// Reinterpret part of a byte buffer as a slice of `count` values of type `T`.
///
/// No data is copied: the returned slice borrows from `buffer`.
///
/// # Safety
///
/// Caller must ensure:
/// - offset + `size_of::<T>`() * count <= buffer.len()
/// - `buffer.as_ptr() + offset` is aligned to `align_of::<T>()`; unlike
///   `read_struct`, a reference-producing cast cannot tolerate misalignment
/// - Buffer contains valid T values
///
/// The bounds and alignment conditions are verified in debug builds only.
#[allow(dead_code)]
#[must_use]
pub unsafe fn read_struct_slice<T: Copy>(buffer: &[u8], offset: usize, count: usize) -> &[T] {
    // Checked arithmetic so oversized inputs fail the assertion instead of wrapping.
    debug_assert!(
        matches!(
            mem::size_of::<T>()
                .checked_mul(count)
                .and_then(|bytes| offset.checked_add(bytes)),
            Some(end) if end <= buffer.len()
        ),
        "read_struct_slice: range out of bounds"
    );
    // SAFETY: the caller guarantees `offset` is within `buffer`.
    let ptr = unsafe { buffer.as_ptr().add(offset) }.cast::<T>();
    debug_assert!(
        (ptr as usize) % mem::align_of::<T>() == 0,
        "read_struct_slice: pointer is not aligned for T"
    );
    // SAFETY: the caller guarantees the range is in bounds, aligned for `T`,
    // and holds `count` valid `T` values; the result borrows from `buffer`,
    // so the memory stays alive and unmodified for the slice's lifetime.
    unsafe { std::slice::from_raw_parts(ptr, count) }
}
722
/// Read the null-terminated UTF-8 string that starts at `offset` in `buffer`.
///
/// The returned slice excludes the terminating null byte.
///
/// # Errors
///
/// - `"Offset out of bounds"` if `offset` is at or past the end of the buffer
/// - `"String not null-terminated"` if no null byte follows `offset`
/// - `"Invalid UTF-8"` if the bytes before the terminator are not valid UTF-8
pub fn read_cstring(buffer: &[u8], offset: usize) -> Result<&str, &'static str> {
    if offset >= buffer.len() {
        return Err("Offset out of bounds");
    }

    // Everything from `offset` onwards; the terminator must be somewhere in here.
    let tail = &buffer[offset..];
    match tail.iter().position(|&byte| byte == 0) {
        Some(terminator) => std::str::from_utf8(&tail[..terminator]).map_err(|_| "Invalid UTF-8"),
        None => Err("String not null-terminated"),
    }
}
745
/// Helper to read a UTF-8 string from buffer with known length (FAST PATH)
///
/// This is much faster than `read_cstring` because it doesn't scan for the null terminator.
/// Use this when you have the string length from PatternEntry.pattern_string_length.
///
/// # Errors
///
/// - `"Offset + length out of bounds"` if the range does not fit in `buffer`
///   (including when `offset + length` overflows `usize`)
/// - `"Invalid UTF-8"` if the bytes in the range are not valid UTF-8
///
/// # Safety
///
/// This function performs its own bounds and UTF-8 checks and contains no
/// unsafe operations; it remains `unsafe` only so existing callers keep
/// compiling. Callers should still pass the length recorded for the string,
/// otherwise the returned text will be truncated or include trailing bytes.
#[inline]
#[allow(dead_code)]
pub unsafe fn read_cstring_with_len(
    buffer: &[u8],
    offset: usize,
    length: usize,
) -> Result<&str, &'static str> {
    // Checked addition + `get` turn every out-of-range request, including an
    // overflowing one, into an error instead of a panic.
    let bytes = offset
        .checked_add(length)
        .and_then(|end| buffer.get(offset..end))
        .ok_or("Offset + length out of bounds")?;

    // Direct slice without scanning for null terminator
    std::str::from_utf8(bytes).map_err(|_| "Invalid UTF-8")
}
771
/// Borrow `length` bytes at `offset` as a `&str` without validating UTF-8
/// (ULTRA-FAST PATH).
///
/// Skips both the null-terminator scan and UTF-8 validation. Intended for hot
/// query paths where the strings are already known to be valid UTF-8 (from
/// build time).
///
/// # Safety
///
/// Caller must ensure:
/// - offset + length <= buffer.len()
/// - Bytes are DEFINITELY valid UTF-8 (undefined behavior if not!)
/// - Length is correct
#[inline]
#[allow(dead_code)]
#[must_use]
pub unsafe fn read_str_unchecked(buffer: &[u8], offset: usize, length: usize) -> &str {
    let end = offset + length;
    debug_assert!(end <= buffer.len());
    let bytes = &buffer[offset..end];
    // SAFETY: Caller guarantees valid UTF-8
    unsafe { std::str::from_utf8_unchecked(bytes) }
}
791
/// Helper to read a UTF-8 string from buffer with known length (SAFE PATH - validates UTF-8)
///
/// This validates UTF-8 on every read. Use for untrusted databases.
/// Slower than `read_str_unchecked` but prevents undefined behavior.
///
/// # Errors
///
/// - `"Offset + length out of bounds"` if the range does not fit in `buffer`
///   (including when `offset + length` overflows `usize`)
/// - `"Invalid UTF-8"` if the bytes in the range are not valid UTF-8
///
/// # Safety
///
/// Bounds and UTF-8 validity are both checked here and the body contains no
/// unsafe operations; the `unsafe` marker is retained only for compatibility
/// with existing callers. Callers should still pass the correct length.
#[inline]
#[allow(dead_code)]
pub unsafe fn read_str_checked(
    buffer: &[u8],
    offset: usize,
    length: usize,
) -> Result<&str, &'static str> {
    // An untrusted database may supply offsets near usize::MAX; checked
    // addition keeps that an error rather than an overflow panic/wrap.
    let bytes = offset
        .checked_add(length)
        .and_then(|end| buffer.get(offset..end))
        .ok_or("Offset + length out of bounds")?;
    std::str::from_utf8(bytes).map_err(|_| "Invalid UTF-8")
}
816
817#[cfg(test)]
818mod tests {
819 use super::*;
820
821 #[test]
822 fn test_header_size() {
823 assert_eq!(mem::size_of::<ParaglobHeader>(), 112); // v5: 8-byte magic + 26 * u32
824 assert_eq!(mem::align_of::<ParaglobHeader>(), 4);
825 }
826
827 #[test]
828 fn test_node_size() {
829 assert_eq!(mem::size_of::<ACNode>(), 32);
830 assert_eq!(mem::align_of::<ACNode>(), 4);
831 }
832
833 #[test]
834 fn test_edge_size() {
835 assert_eq!(mem::size_of::<ACEdge>(), 8);
836 assert_eq!(mem::align_of::<ACEdge>(), 4);
837 }
838
839 #[test]
840 fn test_pattern_entry_size() {
841 assert_eq!(mem::size_of::<PatternEntry>(), 16);
842 assert_eq!(mem::align_of::<PatternEntry>(), 4);
843 }
844
845 #[test]
846 fn test_header_validation() {
847 let mut header = ParaglobHeader::new();
848 assert!(header.validate().is_ok());
849 assert_eq!(header.version, MATCHY_FORMAT_VERSION);
850
851 header.magic = *b"INVALID!";
852 assert!(header.validate().is_err());
853
854 header.magic = *MAGIC;
855 header.version = 999;
856 assert!(header.validate().is_err());
857
858 // Only v4 is valid
859 header.version = MATCHY_FORMAT_VERSION_V1;
860 assert!(header.validate().is_err());
861
862 header.version = MATCHY_FORMAT_VERSION_V2;
863 assert!(header.validate().is_err());
864
865 header.version = MATCHY_FORMAT_VERSION_V3;
866 assert!(header.validate().is_err());
867
868 header.version = MATCHY_FORMAT_VERSION;
869 assert!(header.validate().is_ok());
870 }
871
872 #[test]
873 fn test_v3_features() {
874 let mut header = ParaglobHeader::new();
875 assert_eq!(header.version, MATCHY_FORMAT_VERSION);
876 assert!(!header.has_data_section());
877 assert!(!header.has_inline_data());
878 assert!(!header.has_ac_literal_mapping());
879
880 // Add data section
881 header.data_section_size = 1024;
882 assert!(header.has_data_section());
883
884 // Set inline data flag
885 header.data_flags = 0x1;
886 assert!(header.has_inline_data());
887
888 // Add AC literal mapping
889 header.ac_literal_map_offset = 1000;
890 header.ac_literal_map_count = 50;
891 assert!(header.has_ac_literal_mapping());
892 }
893
894 #[test]
895 fn test_read_struct() {
896 let mut buffer = vec![0u8; 112]; // v5 header size
897 let header = ParaglobHeader::new();
898
899 // Write header to buffer
900 // SAFETY: buffer is exactly 112 bytes (v5 header size), properly allocated,
901 // and ParaglobHeader is #[repr(C)] with size 112.
902 unsafe {
903 let ptr = buffer.as_mut_ptr().cast::<ParaglobHeader>();
904 ptr.write(header);
905 }
906
907 // Read it back
908 // SAFETY: buffer contains valid ParaglobHeader bytes written above, offset 0 is aligned.
909 let read_header: ParaglobHeader = unsafe { read_struct(&buffer, 0) };
910 assert_eq!(read_header.magic, *MAGIC);
911 assert_eq!(read_header.version, MATCHY_FORMAT_VERSION);
912 assert_eq!(read_header.version, 5);
913 }
914
915 #[test]
916 fn test_read_cstring() {
917 let buffer = b"hello\0world\0\0";
918
919 let s1 = read_cstring(buffer, 0).unwrap();
920 assert_eq!(s1, "hello");
921
922 let s2 = read_cstring(buffer, 6).unwrap();
923 assert_eq!(s2, "world");
924
925 let s3 = read_cstring(buffer, 12).unwrap();
926 assert_eq!(s3, "");
927 }
928}