plotnik_bytecode/bytecode/
module.rs

1//! Bytecode module with unified storage.
2//!
3//! The [`Module`] struct holds compiled bytecode, decoding instructions lazily
4//! when the VM steps into them.
5
6use std::io;
7use std::ops::Deref;
8use std::path::Path;
9
10use super::aligned_vec::AlignedVec;
11use super::header::{Header, SectionOffsets};
12use super::ids::{StringId, TypeId};
13use super::instructions::{Call, Match, Opcode, Return, Trampoline};
14use super::sections::{FieldSymbol, NodeSymbol, TriviaEntry};
15use super::type_meta::{TypeData, TypeDef, TypeKind, TypeMember, TypeName};
16use super::{Entrypoint, STEP_SIZE, VERSION};
17
/// Decode the two bytes starting at `offset` as a little-endian `u16`.
///
/// # Panics
/// Panics if `bytes` does not contain both `offset` and `offset + 1`.
#[inline]
fn read_u16_le(bytes: &[u8], offset: usize) -> u16 {
    // Low byte comes first in little-endian order.
    let lo = u16::from(bytes[offset]);
    let hi = u16::from(bytes[offset + 1]);
    lo | (hi << 8)
}
23
/// Decode the four bytes starting at `offset` as a little-endian `u32`.
///
/// # Panics
/// Panics if `bytes` does not contain all of `offset..offset + 4`.
#[inline]
fn read_u32_le(bytes: &[u8], offset: usize) -> u32 {
    let mut word = [0u8; 4];
    // Copy byte by byte, lowest address first, so an out-of-range access
    // is reported for the first missing index.
    for (i, slot) in word.iter_mut().enumerate() {
        *slot = bytes[offset + i];
    }
    u32::from_le_bytes(word)
}
34
35/// Storage for bytecode bytes with guaranteed 64-byte alignment.
36///
37/// All bytecode must be 64-byte aligned for DFA deserialization and cache
38/// efficiency. This enum ensures alignment through two paths:
39/// - `Static`: Pre-aligned via `include_query_aligned!` macro
40/// - `Aligned`: Allocated with 64-byte alignment via `AlignedVec`
41pub enum ByteStorage {
42    /// Static bytes from `include_query_aligned!` (zero-copy, pre-aligned).
43    Static(&'static [u8]),
44    /// Owned bytes with guaranteed 64-byte alignment.
45    Aligned(AlignedVec),
46}
47
48impl Deref for ByteStorage {
49    type Target = [u8];
50
51    fn deref(&self) -> &Self::Target {
52        match self {
53            ByteStorage::Static(s) => s,
54            ByteStorage::Aligned(v) => v,
55        }
56    }
57}
58
59impl std::fmt::Debug for ByteStorage {
60    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
61        match self {
62            ByteStorage::Static(s) => f.debug_tuple("Static").field(&s.len()).finish(),
63            ByteStorage::Aligned(v) => f.debug_tuple("Aligned").field(&v.len()).finish(),
64        }
65    }
66}
67
68impl ByteStorage {
69    /// Create from static bytes (zero-copy).
70    ///
71    /// The bytes must be 64-byte aligned. Use `include_query_aligned!` macro.
72    ///
73    /// # Panics
74    /// Panics if bytes are not 64-byte aligned.
75    pub fn from_static(bytes: &'static [u8]) -> Self {
76        assert!(
77            (bytes.as_ptr() as usize).is_multiple_of(64),
78            "static bytes must be 64-byte aligned; use include_query_aligned! macro"
79        );
80        Self::Static(bytes)
81    }
82
83    /// Create from an aligned vector (from compiler or file read).
84    pub fn from_aligned(vec: AlignedVec) -> Self {
85        Self::Aligned(vec)
86    }
87
88    /// Create by copying bytes into aligned storage.
89    ///
90    /// Use this when receiving bytes from unknown sources (e.g., network).
91    pub fn copy_from_slice(bytes: &[u8]) -> Self {
92        Self::Aligned(AlignedVec::copy_from_slice(bytes))
93    }
94
95    /// Read a file into aligned storage.
96    pub fn from_file(path: impl AsRef<Path>) -> io::Result<Self> {
97        Ok(Self::Aligned(AlignedVec::from_file(path)?))
98    }
99}
100
101/// Decoded instruction from bytecode.
102#[derive(Clone, Copy, Debug)]
103pub enum Instruction<'a> {
104    Match(Match<'a>),
105    Call(Call),
106    Return(Return),
107    Trampoline(Trampoline),
108}
109
110impl<'a> Instruction<'a> {
111    /// Decode an instruction from bytecode bytes.
112    #[inline]
113    pub fn from_bytes(bytes: &'a [u8]) -> Self {
114        debug_assert!(bytes.len() >= 8, "instruction too short");
115
116        let opcode = Opcode::from_u8(bytes[0] & 0xF);
117        match opcode {
118            Opcode::Call => {
119                let arr: [u8; 8] = bytes[..8].try_into().unwrap();
120                Self::Call(Call::from_bytes(arr))
121            }
122            Opcode::Return => {
123                let arr: [u8; 8] = bytes[..8].try_into().unwrap();
124                Self::Return(Return::from_bytes(arr))
125            }
126            Opcode::Trampoline => {
127                let arr: [u8; 8] = bytes[..8].try_into().unwrap();
128                Self::Trampoline(Trampoline::from_bytes(arr))
129            }
130            _ => Self::Match(Match::from_bytes(bytes)),
131        }
132    }
133}
134
135/// Module load error.
136#[derive(Debug, thiserror::Error)]
137pub enum ModuleError {
138    #[error("invalid magic: expected PTKQ")]
139    InvalidMagic,
140    #[error("unsupported version: {0} (expected {VERSION})")]
141    UnsupportedVersion(u32),
142    #[error("file too small: {0} bytes (minimum 64)")]
143    FileTooSmall(usize),
144    #[error("size mismatch: header says {header} bytes, got {actual}")]
145    SizeMismatch { header: u32, actual: usize },
146    #[error("io error: {0}")]
147    Io(#[from] io::Error),
148}
149
150/// A compiled bytecode module.
151///
152/// Instructions are decoded lazily via [`decode_step`](Self::decode_step).
153/// Cold data (strings, symbols, types) is accessed through view methods.
154#[derive(Debug)]
155pub struct Module {
156    storage: ByteStorage,
157    header: Header,
158    /// Cached section offsets (computed from header counts).
159    offsets: SectionOffsets,
160}
161
162impl Module {
163    /// Load a module from an aligned vector (compiler output).
164    ///
165    /// This is the primary constructor for bytecode produced by the compiler.
166    pub fn from_aligned(vec: AlignedVec) -> Result<Self, ModuleError> {
167        Self::from_storage(ByteStorage::from_aligned(vec))
168    }
169
170    /// Load a module from static bytes (zero-copy).
171    ///
172    /// Use with `include_query_aligned!` to embed aligned bytecode:
173    /// ```ignore
174    /// use plotnik_lib::include_query_aligned;
175    ///
176    /// let module = Module::from_static(include_query_aligned!("query.ptk.bin"))?;
177    /// ```
178    ///
179    /// # Panics
180    /// Panics if bytes are not 64-byte aligned.
181    pub fn from_static(bytes: &'static [u8]) -> Result<Self, ModuleError> {
182        Self::from_storage(ByteStorage::from_static(bytes))
183    }
184
185    /// Load a module from a file path.
186    ///
187    /// Reads the file into 64-byte aligned storage.
188    pub fn from_path(path: impl AsRef<Path>) -> Result<Self, ModuleError> {
189        Self::from_storage(ByteStorage::from_file(&path)?)
190    }
191
192    /// Load a module from arbitrary bytes (copies into aligned storage).
193    ///
194    /// Use this for bytes from unknown sources (network, etc.). Always copies.
195    pub fn load(bytes: &[u8]) -> Result<Self, ModuleError> {
196        Self::from_storage(ByteStorage::copy_from_slice(bytes))
197    }
198
199    /// Load a module from owned bytes (copies into aligned storage).
200    #[deprecated(
201        since = "0.1.0",
202        note = "use `Module::from_aligned` for AlignedVec or `Module::load` for copying"
203    )]
204    pub fn from_bytes(bytes: Vec<u8>) -> Result<Self, ModuleError> {
205        Self::load(&bytes)
206    }
207
208    /// Load a module from storage.
209    fn from_storage(storage: ByteStorage) -> Result<Self, ModuleError> {
210        if storage.len() < 64 {
211            return Err(ModuleError::FileTooSmall(storage.len()));
212        }
213
214        let header = Header::from_bytes(&storage[..64]);
215
216        if !header.validate_magic() {
217            return Err(ModuleError::InvalidMagic);
218        }
219        if !header.validate_version() {
220            return Err(ModuleError::UnsupportedVersion(header.version));
221        }
222        if header.total_size as usize != storage.len() {
223            return Err(ModuleError::SizeMismatch {
224                header: header.total_size,
225                actual: storage.len(),
226            });
227        }
228
229        // Compute all section offsets from header counts and blob sizes
230        let offsets = header.compute_offsets();
231
232        Ok(Self {
233            storage,
234            header,
235            offsets,
236        })
237    }
238
239    /// Get the parsed header.
240    pub fn header(&self) -> &Header {
241        &self.header
242    }
243
244    /// Get the computed section offsets.
245    pub fn offsets(&self) -> &SectionOffsets {
246        &self.offsets
247    }
248
249    /// Get the raw bytes.
250    pub fn bytes(&self) -> &[u8] {
251        &self.storage
252    }
253
254    /// Decode an instruction at the given step index.
255    #[inline]
256    pub fn decode_step(&self, step: u16) -> Instruction<'_> {
257        let offset = self.offsets.transitions as usize + (step as usize) * STEP_SIZE;
258        Instruction::from_bytes(&self.storage[offset..])
259    }
260
261    /// Get a view into the string table.
262    pub fn strings(&self) -> StringsView<'_> {
263        StringsView {
264            blob: &self.storage[self.offsets.str_blob as usize..],
265            table: self.string_table_slice(),
266        }
267    }
268
269    /// Get a view into the node type symbols.
270    pub fn node_types(&self) -> SymbolsView<'_, NodeSymbol> {
271        let offset = self.offsets.node_types as usize;
272        let count = self.header.node_types_count as usize;
273        SymbolsView {
274            bytes: &self.storage[offset..offset + count * 4],
275            count,
276            _marker: std::marker::PhantomData,
277        }
278    }
279
280    /// Get a view into the node field symbols.
281    pub fn node_fields(&self) -> SymbolsView<'_, FieldSymbol> {
282        let offset = self.offsets.node_fields as usize;
283        let count = self.header.node_fields_count as usize;
284        SymbolsView {
285            bytes: &self.storage[offset..offset + count * 4],
286            count,
287            _marker: std::marker::PhantomData,
288        }
289    }
290
291    /// Get a view into the trivia entries.
292    pub fn trivia(&self) -> TriviaView<'_> {
293        let offset = self.offsets.trivia as usize;
294        let count = self.header.trivia_count as usize;
295        TriviaView {
296            bytes: &self.storage[offset..offset + count * 2],
297            count,
298        }
299    }
300
301    /// Get a view into the regex table.
302    pub fn regexes(&self) -> RegexView<'_> {
303        RegexView {
304            blob: &self.storage[self.offsets.regex_blob as usize..],
305            table: self.regex_table_slice(),
306        }
307    }
308
309    /// Get a view into the type metadata.
310    pub fn types(&self) -> TypesView<'_> {
311        let defs_offset = self.offsets.type_defs as usize;
312        let defs_count = self.header.type_defs_count as usize;
313        let members_offset = self.offsets.type_members as usize;
314        let members_count = self.header.type_members_count as usize;
315        let names_offset = self.offsets.type_names as usize;
316        let names_count = self.header.type_names_count as usize;
317
318        TypesView {
319            defs_bytes: &self.storage[defs_offset..defs_offset + defs_count * 4],
320            members_bytes: &self.storage[members_offset..members_offset + members_count * 4],
321            names_bytes: &self.storage[names_offset..names_offset + names_count * 4],
322            defs_count,
323            members_count,
324            names_count,
325        }
326    }
327
328    /// Get a view into the entrypoints.
329    pub fn entrypoints(&self) -> EntrypointsView<'_> {
330        let offset = self.offsets.entrypoints as usize;
331        let count = self.header.entrypoints_count as usize;
332        EntrypointsView {
333            bytes: &self.storage[offset..offset + count * 8],
334            count,
335        }
336    }
337
338    /// Helper to get string table as bytes.
339    /// The table has count+1 entries (includes sentinel for length calculation).
340    fn string_table_slice(&self) -> &[u8] {
341        let offset = self.offsets.str_table as usize;
342        let count = self.header.str_table_count as usize;
343        &self.storage[offset..offset + (count + 1) * 4]
344    }
345
346    /// Helper to get regex table as bytes.
347    /// The table has count+1 entries (includes sentinel for length calculation).
348    fn regex_table_slice(&self) -> &[u8] {
349        let offset = self.offsets.regex_table as usize;
350        let count = self.header.regex_table_count as usize;
351        &self.storage[offset..offset + (count + 1) * 8]
352    }
353}
354
355/// View into the string table for lazy string lookup.
356pub struct StringsView<'a> {
357    blob: &'a [u8],
358    table: &'a [u8],
359}
360
361impl<'a> StringsView<'a> {
362    /// Get a string by its ID (type-safe access for bytecode references).
363    pub fn get(&self, id: StringId) -> &'a str {
364        self.get_by_index(id.get() as usize)
365    }
366
367    /// Get a string by raw index (for iteration/dumps, including easter egg at 0).
368    ///
369    /// The string table contains sequential u32 offsets. To get string i:
370    /// `start = table[i]`, `end = table[i+1]`, `length = end - start`.
371    pub fn get_by_index(&self, idx: usize) -> &'a str {
372        let start = read_u32_le(self.table, idx * 4) as usize;
373        let end = read_u32_le(self.table, (idx + 1) * 4) as usize;
374        std::str::from_utf8(&self.blob[start..end]).expect("invalid UTF-8 in string table")
375    }
376}
377
378/// View into symbol tables (node types or field names).
379pub struct SymbolsView<'a, T> {
380    bytes: &'a [u8],
381    count: usize,
382    _marker: std::marker::PhantomData<T>,
383}
384
385impl<'a> SymbolsView<'a, NodeSymbol> {
386    /// Get a node symbol by index.
387    pub fn get(&self, idx: usize) -> NodeSymbol {
388        assert!(idx < self.count, "node symbol index out of bounds");
389        let offset = idx * 4;
390        NodeSymbol::new(
391            read_u16_le(self.bytes, offset),
392            StringId::new(read_u16_le(self.bytes, offset + 2)),
393        )
394    }
395
396    /// Number of entries.
397    pub fn len(&self) -> usize {
398        self.count
399    }
400
401    /// Check if empty.
402    pub fn is_empty(&self) -> bool {
403        self.count == 0
404    }
405}
406
407impl<'a> SymbolsView<'a, FieldSymbol> {
408    /// Get a field symbol by index.
409    pub fn get(&self, idx: usize) -> FieldSymbol {
410        assert!(idx < self.count, "field symbol index out of bounds");
411        let offset = idx * 4;
412        FieldSymbol::new(
413            read_u16_le(self.bytes, offset),
414            StringId::new(read_u16_le(self.bytes, offset + 2)),
415        )
416    }
417
418    /// Number of entries.
419    pub fn len(&self) -> usize {
420        self.count
421    }
422
423    /// Check if empty.
424    pub fn is_empty(&self) -> bool {
425        self.count == 0
426    }
427}
428
429/// View into trivia entries.
430pub struct TriviaView<'a> {
431    bytes: &'a [u8],
432    count: usize,
433}
434
435impl<'a> TriviaView<'a> {
436    /// Get a trivia entry by index.
437    pub fn get(&self, idx: usize) -> TriviaEntry {
438        assert!(idx < self.count, "trivia index out of bounds");
439        TriviaEntry::new(read_u16_le(self.bytes, idx * 2))
440    }
441
442    /// Number of entries.
443    pub fn len(&self) -> usize {
444        self.count
445    }
446
447    /// Check if empty.
448    pub fn is_empty(&self) -> bool {
449        self.count == 0
450    }
451
452    /// Check if a node type is trivia.
453    pub fn contains(&self, node_type: u16) -> bool {
454        (0..self.count).any(|i| self.get(i).node_type == node_type)
455    }
456}
457
458/// View into the regex table for lazy DFA lookup.
459///
460/// Table format per entry: `string_id (u16) | reserved (u16) | offset (u32)` = 8 bytes.
461/// This allows access to both the pattern string (via StringTable) and DFA bytes.
462pub struct RegexView<'a> {
463    blob: &'a [u8],
464    table: &'a [u8],
465}
466
467impl<'a> RegexView<'a> {
468    /// Entry size in bytes: string_id (u16) + reserved (u16) + offset (u32).
469    const ENTRY_SIZE: usize = 8;
470
471    /// Get regex DFA bytes by index.
472    ///
473    /// Returns the raw DFA bytes for the regex at the given index.
474    /// Use `regex-automata` to deserialize: `DFA::from_bytes(&bytes)`.
475    pub fn get_by_index(&self, idx: usize) -> &'a [u8] {
476        let entry_offset = idx * Self::ENTRY_SIZE;
477        let next_entry_offset = (idx + 1) * Self::ENTRY_SIZE;
478
479        let start = read_u32_le(self.table, entry_offset + 4) as usize;
480        let end = read_u32_le(self.table, next_entry_offset + 4) as usize;
481        &self.blob[start..end]
482    }
483
484    /// Get the StringId of the pattern for a regex by index.
485    ///
486    /// This allows looking up the pattern text from StringTable for display.
487    pub fn get_string_id(&self, idx: usize) -> super::StringId {
488        let entry_offset = idx * Self::ENTRY_SIZE;
489        let string_id = read_u16_le(self.table, entry_offset);
490        super::StringId::new(string_id)
491    }
492}
493
494/// View into type metadata.
495///
496/// Types are stored in three sub-sections:
497/// - TypeDefs: structural topology (4 bytes each)
498/// - TypeMembers: fields and variants (4 bytes each)
499/// - TypeNames: name → TypeId mapping (4 bytes each)
500pub struct TypesView<'a> {
501    defs_bytes: &'a [u8],
502    members_bytes: &'a [u8],
503    names_bytes: &'a [u8],
504    defs_count: usize,
505    members_count: usize,
506    names_count: usize,
507}
508
509impl<'a> TypesView<'a> {
510    /// Get a type definition by index.
511    pub fn get_def(&self, idx: usize) -> TypeDef {
512        assert!(idx < self.defs_count, "type def index out of bounds");
513        let offset = idx * 4;
514        TypeDef::from_bytes(&self.defs_bytes[offset..])
515    }
516
517    /// Get a type definition by TypeId.
518    pub fn get(&self, id: TypeId) -> Option<TypeDef> {
519        let idx = id.0 as usize;
520        if idx < self.defs_count {
521            Some(self.get_def(idx))
522        } else {
523            None
524        }
525    }
526
527    /// Get a type member by index.
528    pub fn get_member(&self, idx: usize) -> TypeMember {
529        assert!(idx < self.members_count, "type member index out of bounds");
530        let offset = idx * 4;
531        TypeMember::new(
532            StringId::new(read_u16_le(self.members_bytes, offset)),
533            TypeId(read_u16_le(self.members_bytes, offset + 2)),
534        )
535    }
536
537    /// Get a type name entry by index.
538    pub fn get_name(&self, idx: usize) -> TypeName {
539        assert!(idx < self.names_count, "type name index out of bounds");
540        let offset = idx * 4;
541        TypeName::new(
542            StringId::new(read_u16_le(self.names_bytes, offset)),
543            TypeId(read_u16_le(self.names_bytes, offset + 2)),
544        )
545    }
546
547    /// Number of type definitions.
548    pub fn defs_count(&self) -> usize {
549        self.defs_count
550    }
551
552    /// Number of type members.
553    pub fn members_count(&self) -> usize {
554        self.members_count
555    }
556
557    /// Number of type names.
558    pub fn names_count(&self) -> usize {
559        self.names_count
560    }
561
562    /// Iterate over members of a struct or enum type.
563    pub fn members_of(&self, def: &TypeDef) -> impl Iterator<Item = TypeMember> + '_ {
564        let (start, count) = match def.classify() {
565            TypeData::Composite {
566                member_start,
567                member_count,
568                ..
569            } => (member_start as usize, member_count as usize),
570            _ => (0, 0),
571        };
572        (0..count).map(move |i| self.get_member(start + i))
573    }
574
575    /// Unwrap Optional wrapper and return (inner_type, is_optional).
576    /// If not Optional, returns (type_id, false).
577    pub fn unwrap_optional(&self, type_id: TypeId) -> (TypeId, bool) {
578        let Some(type_def) = self.get(type_id) else {
579            return (type_id, false);
580        };
581        match type_def.classify() {
582            TypeData::Wrapper {
583                kind: TypeKind::Optional,
584                inner,
585            } => (inner, true),
586            _ => (type_id, false),
587        }
588    }
589}
590
591/// View into entrypoints.
592pub struct EntrypointsView<'a> {
593    bytes: &'a [u8],
594    count: usize,
595}
596
597impl<'a> EntrypointsView<'a> {
598    /// Get an entrypoint by index.
599    pub fn get(&self, idx: usize) -> Entrypoint {
600        assert!(idx < self.count, "entrypoint index out of bounds");
601        let offset = idx * 8;
602        Entrypoint::from_bytes(&self.bytes[offset..])
603    }
604
605    /// Number of entrypoints.
606    pub fn len(&self) -> usize {
607        self.count
608    }
609
610    /// Check if empty.
611    pub fn is_empty(&self) -> bool {
612        self.count == 0
613    }
614
615    /// Find an entrypoint by name (requires StringsView for comparison).
616    pub fn find_by_name(&self, name: &str, strings: &StringsView<'_>) -> Option<Entrypoint> {
617        (0..self.count)
618            .map(|i| self.get(i))
619            .find(|e| strings.get(e.name()) == name)
620    }
621}