plotnik_bytecode/bytecode/
module.rs

1//! Bytecode module with unified storage.
2//!
3//! The [`Module`] struct holds compiled bytecode, decoding instructions lazily
4//! when the VM steps into them.
5
6use std::io;
7use std::ops::Deref;
8use std::path::Path;
9
10use super::aligned_vec::AlignedVec;
11use super::header::{Header, SectionOffsets};
12use super::ids::{StringId, TypeId};
13use super::instructions::{Call, Match, Opcode, Return, Trampoline};
14use super::sections::{FieldSymbol, NodeSymbol, TriviaEntry};
15use super::type_meta::{TypeData, TypeDef, TypeKind, TypeMember, TypeName};
16use super::{Entrypoint, STEP_SIZE, VERSION};
17
/// Decode the little-endian `u16` stored at `bytes[offset..offset + 2]`.
///
/// # Panics
/// Panics if either of the two bytes lies outside `bytes`.
#[inline]
fn read_u16_le(bytes: &[u8], offset: usize) -> u16 {
    // Little-endian: the byte at the lower address is the least significant.
    let low = bytes[offset];
    let high = bytes[offset + 1];
    u16::from(low) | (u16::from(high) << 8)
}
23
/// Decode the little-endian `u32` stored at `bytes[offset..offset + 4]`.
///
/// # Panics
/// Panics if any of the four bytes lies outside `bytes`.
#[inline]
fn read_u32_le(bytes: &[u8], offset: usize) -> u32 {
    let mut raw = [0u8; 4];
    // Slicing in two steps keeps the bounds check free of `offset + 4` arithmetic.
    raw.copy_from_slice(&bytes[offset..][..4]);
    u32::from_le_bytes(raw)
}
34
/// Storage for bytecode bytes with guaranteed 64-byte alignment.
///
/// All bytecode must be 64-byte aligned for DFA deserialization and cache
/// efficiency. This enum ensures alignment through two paths:
/// - `Static`: Pre-aligned via `include_query_aligned!` macro
/// - `Aligned`: Allocated with 64-byte alignment via `AlignedVec`
///
/// The enum dereferences to `[u8]`, so either variant can be used wherever a
/// byte slice is expected.
pub enum ByteStorage {
    /// Static bytes from `include_query_aligned!` (zero-copy, pre-aligned).
    ///
    /// Prefer constructing this through [`ByteStorage::from_static`], which
    /// asserts the alignment; building the variant directly skips that check.
    Static(&'static [u8]),
    /// Owned bytes with guaranteed 64-byte alignment.
    Aligned(AlignedVec),
}
47
48impl Deref for ByteStorage {
49    type Target = [u8];
50
51    fn deref(&self) -> &Self::Target {
52        match self {
53            ByteStorage::Static(s) => s,
54            ByteStorage::Aligned(v) => v,
55        }
56    }
57}
58
59impl std::fmt::Debug for ByteStorage {
60    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
61        match self {
62            ByteStorage::Static(s) => f.debug_tuple("Static").field(&s.len()).finish(),
63            ByteStorage::Aligned(v) => f.debug_tuple("Aligned").field(&v.len()).finish(),
64        }
65    }
66}
67
68impl ByteStorage {
69    /// Create from static bytes (zero-copy).
70    ///
71    /// The bytes must be 64-byte aligned. Use `include_query_aligned!` macro.
72    ///
73    /// # Panics
74    /// Panics if bytes are not 64-byte aligned.
75    pub fn from_static(bytes: &'static [u8]) -> Self {
76        assert!(
77            (bytes.as_ptr() as usize).is_multiple_of(64),
78            "static bytes must be 64-byte aligned; use include_query_aligned! macro"
79        );
80        Self::Static(bytes)
81    }
82
83    /// Create from an aligned vector (from compiler or file read).
84    pub fn from_aligned(vec: AlignedVec) -> Self {
85        Self::Aligned(vec)
86    }
87
88    /// Create by copying bytes into aligned storage.
89    ///
90    /// Use this when receiving bytes from unknown sources (e.g., network).
91    pub fn copy_from_slice(bytes: &[u8]) -> Self {
92        Self::Aligned(AlignedVec::copy_from_slice(bytes))
93    }
94
95    /// Read a file into aligned storage.
96    pub fn from_file(path: impl AsRef<Path>) -> io::Result<Self> {
97        Ok(Self::Aligned(AlignedVec::from_file(path)?))
98    }
99}
100
/// Decoded instruction from bytecode.
///
/// Produced by [`Instruction::from_bytes`], which selects the variant from the
/// low four bits of the first instruction byte.
#[derive(Clone, Copy, Debug)]
pub enum Instruction<'a> {
    /// Match instruction; borrows from the bytecode it was decoded from.
    /// Also the fallback for every opcode that is not call/return/trampoline.
    Match(Match<'a>),
    /// Call instruction, decoded from a fixed 8-byte encoding.
    Call(Call),
    /// Return instruction, decoded from a fixed 8-byte encoding.
    Return(Return),
    /// Trampoline instruction, decoded from a fixed 8-byte encoding.
    Trampoline(Trampoline),
}
109
110impl<'a> Instruction<'a> {
111    /// Decode an instruction from bytecode bytes.
112    #[inline]
113    pub fn from_bytes(bytes: &'a [u8]) -> Self {
114        debug_assert!(bytes.len() >= 8, "instruction too short");
115
116        let opcode = Opcode::from_u8(bytes[0] & 0xF);
117        match opcode {
118            Opcode::Call => {
119                let arr: [u8; 8] = bytes[..8].try_into().unwrap();
120                Self::Call(Call::from_bytes(arr))
121            }
122            Opcode::Return => {
123                let arr: [u8; 8] = bytes[..8].try_into().unwrap();
124                Self::Return(Return::from_bytes(arr))
125            }
126            Opcode::Trampoline => {
127                let arr: [u8; 8] = bytes[..8].try_into().unwrap();
128                Self::Trampoline(Trampoline::from_bytes(arr))
129            }
130            _ => Self::Match(Match::from_bytes(bytes)),
131        }
132    }
133}
134
/// Module load error.
///
/// Every variant except `Io` is produced by the header checks that run when a
/// [`Module`] is constructed from its backing storage.
#[derive(Debug, thiserror::Error)]
pub enum ModuleError {
    /// The header's magic check (`Header::validate_magic`) failed.
    #[error("invalid magic: expected PTKQ")]
    InvalidMagic,
    /// The header's version check (`Header::validate_version`) failed; carries
    /// the version found in the header.
    #[error("unsupported version: {0} (expected {VERSION})")]
    UnsupportedVersion(u32),
    /// The input is shorter than the 64 bytes needed to hold a header; carries
    /// the actual input length.
    #[error("file too small: {0} bytes (minimum 64)")]
    FileTooSmall(usize),
    /// The header's `total_size` does not equal the length of the input.
    #[error("size mismatch: header says {header} bytes, got {actual}")]
    SizeMismatch { header: u32, actual: usize },
    /// An I/O error, converted from `io::Error` (e.g. while reading a file in
    /// `Module::from_path`).
    #[error("io error: {0}")]
    Io(#[from] io::Error),
}
149
/// A compiled bytecode module.
///
/// Holds the raw module bytes together with the parsed header and the section
/// offsets derived from it.
///
/// Instructions are decoded lazily via [`decode_step`](Self::decode_step).
/// Cold data (strings, symbols, types) is accessed through view methods.
#[derive(Debug)]
pub struct Module {
    /// Backing bytes of the whole module, header included.
    storage: ByteStorage,
    /// Header parsed from the first 64 bytes of `storage`.
    header: Header,
    /// Cached section offsets (computed from header counts).
    offsets: SectionOffsets,
}
161
162impl Module {
163    /// Load a module from an aligned vector (compiler output).
164    ///
165    /// This is the primary constructor for bytecode produced by the compiler.
166    pub fn from_aligned(vec: AlignedVec) -> Result<Self, ModuleError> {
167        Self::from_storage(ByteStorage::from_aligned(vec))
168    }
169
170    /// Load a module from static bytes (zero-copy).
171    ///
172    /// Use with `include_query_aligned!` to embed aligned bytecode:
173    /// ```ignore
174    /// use plotnik_lib::include_query_aligned;
175    ///
176    /// let module = Module::from_static(include_query_aligned!("query.ptk.bin"))?;
177    /// ```
178    ///
179    /// # Panics
180    /// Panics if bytes are not 64-byte aligned.
181    pub fn from_static(bytes: &'static [u8]) -> Result<Self, ModuleError> {
182        Self::from_storage(ByteStorage::from_static(bytes))
183    }
184
185    /// Load a module from a file path.
186    ///
187    /// Reads the file into 64-byte aligned storage.
188    pub fn from_path(path: impl AsRef<Path>) -> Result<Self, ModuleError> {
189        Self::from_storage(ByteStorage::from_file(&path)?)
190    }
191
192    /// Load a module from arbitrary bytes (copies into aligned storage).
193    ///
194    /// Use this for bytes from unknown sources (network, etc.). Always copies.
195    pub fn load(bytes: &[u8]) -> Result<Self, ModuleError> {
196        Self::from_storage(ByteStorage::copy_from_slice(bytes))
197    }
198
199    /// Load a module from owned bytes (copies into aligned storage).
200    #[deprecated(since = "0.1.0", note = "use `Module::from_aligned` for AlignedVec or `Module::load` for copying")]
201    pub fn from_bytes(bytes: Vec<u8>) -> Result<Self, ModuleError> {
202        Self::load(&bytes)
203    }
204
205    /// Load a module from storage.
206    fn from_storage(storage: ByteStorage) -> Result<Self, ModuleError> {
207        if storage.len() < 64 {
208            return Err(ModuleError::FileTooSmall(storage.len()));
209        }
210
211        let header = Header::from_bytes(&storage[..64]);
212
213        if !header.validate_magic() {
214            return Err(ModuleError::InvalidMagic);
215        }
216        if !header.validate_version() {
217            return Err(ModuleError::UnsupportedVersion(header.version));
218        }
219        if header.total_size as usize != storage.len() {
220            return Err(ModuleError::SizeMismatch {
221                header: header.total_size,
222                actual: storage.len(),
223            });
224        }
225
226        // Compute all section offsets from header counts and blob sizes
227        let offsets = header.compute_offsets();
228
229        Ok(Self {
230            storage,
231            header,
232            offsets,
233        })
234    }
235
236    /// Get the parsed header.
237    pub fn header(&self) -> &Header {
238        &self.header
239    }
240
241    /// Get the computed section offsets.
242    pub fn offsets(&self) -> &SectionOffsets {
243        &self.offsets
244    }
245
246    /// Get the raw bytes.
247    pub fn bytes(&self) -> &[u8] {
248        &self.storage
249    }
250
251    /// Decode an instruction at the given step index.
252    #[inline]
253    pub fn decode_step(&self, step: u16) -> Instruction<'_> {
254        let offset = self.offsets.transitions as usize + (step as usize) * STEP_SIZE;
255        Instruction::from_bytes(&self.storage[offset..])
256    }
257
258    /// Get a view into the string table.
259    pub fn strings(&self) -> StringsView<'_> {
260        StringsView {
261            blob: &self.storage[self.offsets.str_blob as usize..],
262            table: self.string_table_slice(),
263        }
264    }
265
266    /// Get a view into the node type symbols.
267    pub fn node_types(&self) -> SymbolsView<'_, NodeSymbol> {
268        let offset = self.offsets.node_types as usize;
269        let count = self.header.node_types_count as usize;
270        SymbolsView {
271            bytes: &self.storage[offset..offset + count * 4],
272            count,
273            _marker: std::marker::PhantomData,
274        }
275    }
276
277    /// Get a view into the node field symbols.
278    pub fn node_fields(&self) -> SymbolsView<'_, FieldSymbol> {
279        let offset = self.offsets.node_fields as usize;
280        let count = self.header.node_fields_count as usize;
281        SymbolsView {
282            bytes: &self.storage[offset..offset + count * 4],
283            count,
284            _marker: std::marker::PhantomData,
285        }
286    }
287
288    /// Get a view into the trivia entries.
289    pub fn trivia(&self) -> TriviaView<'_> {
290        let offset = self.offsets.trivia as usize;
291        let count = self.header.trivia_count as usize;
292        TriviaView {
293            bytes: &self.storage[offset..offset + count * 2],
294            count,
295        }
296    }
297
298    /// Get a view into the regex table.
299    pub fn regexes(&self) -> RegexView<'_> {
300        RegexView {
301            blob: &self.storage[self.offsets.regex_blob as usize..],
302            table: self.regex_table_slice(),
303        }
304    }
305
306    /// Get a view into the type metadata.
307    pub fn types(&self) -> TypesView<'_> {
308        let defs_offset = self.offsets.type_defs as usize;
309        let defs_count = self.header.type_defs_count as usize;
310        let members_offset = self.offsets.type_members as usize;
311        let members_count = self.header.type_members_count as usize;
312        let names_offset = self.offsets.type_names as usize;
313        let names_count = self.header.type_names_count as usize;
314
315        TypesView {
316            defs_bytes: &self.storage[defs_offset..defs_offset + defs_count * 4],
317            members_bytes: &self.storage[members_offset..members_offset + members_count * 4],
318            names_bytes: &self.storage[names_offset..names_offset + names_count * 4],
319            defs_count,
320            members_count,
321            names_count,
322        }
323    }
324
325    /// Get a view into the entrypoints.
326    pub fn entrypoints(&self) -> EntrypointsView<'_> {
327        let offset = self.offsets.entrypoints as usize;
328        let count = self.header.entrypoints_count as usize;
329        EntrypointsView {
330            bytes: &self.storage[offset..offset + count * 8],
331            count,
332        }
333    }
334
335    /// Helper to get string table as bytes.
336    /// The table has count+1 entries (includes sentinel for length calculation).
337    fn string_table_slice(&self) -> &[u8] {
338        let offset = self.offsets.str_table as usize;
339        let count = self.header.str_table_count as usize;
340        &self.storage[offset..offset + (count + 1) * 4]
341    }
342
343    /// Helper to get regex table as bytes.
344    /// The table has count+1 entries (includes sentinel for length calculation).
345    fn regex_table_slice(&self) -> &[u8] {
346        let offset = self.offsets.regex_table as usize;
347        let count = self.header.regex_table_count as usize;
348        &self.storage[offset..offset + (count + 1) * 8]
349    }
350}
351
352/// View into the string table for lazy string lookup.
353pub struct StringsView<'a> {
354    blob: &'a [u8],
355    table: &'a [u8],
356}
357
358impl<'a> StringsView<'a> {
359    /// Get a string by its ID (type-safe access for bytecode references).
360    pub fn get(&self, id: StringId) -> &'a str {
361        self.get_by_index(id.get() as usize)
362    }
363
364    /// Get a string by raw index (for iteration/dumps, including easter egg at 0).
365    ///
366    /// The string table contains sequential u32 offsets. To get string i:
367    /// `start = table[i]`, `end = table[i+1]`, `length = end - start`.
368    pub fn get_by_index(&self, idx: usize) -> &'a str {
369        let start = read_u32_le(self.table, idx * 4) as usize;
370        let end = read_u32_le(self.table, (idx + 1) * 4) as usize;
371        std::str::from_utf8(&self.blob[start..end]).expect("invalid UTF-8 in string table")
372    }
373}
374
375/// View into symbol tables (node types or field names).
376pub struct SymbolsView<'a, T> {
377    bytes: &'a [u8],
378    count: usize,
379    _marker: std::marker::PhantomData<T>,
380}
381
382impl<'a> SymbolsView<'a, NodeSymbol> {
383    /// Get a node symbol by index.
384    pub fn get(&self, idx: usize) -> NodeSymbol {
385        assert!(idx < self.count, "node symbol index out of bounds");
386        let offset = idx * 4;
387        NodeSymbol::new(
388            read_u16_le(self.bytes, offset),
389            StringId::new(read_u16_le(self.bytes, offset + 2)),
390        )
391    }
392
393    /// Number of entries.
394    pub fn len(&self) -> usize {
395        self.count
396    }
397
398    /// Check if empty.
399    pub fn is_empty(&self) -> bool {
400        self.count == 0
401    }
402}
403
404impl<'a> SymbolsView<'a, FieldSymbol> {
405    /// Get a field symbol by index.
406    pub fn get(&self, idx: usize) -> FieldSymbol {
407        assert!(idx < self.count, "field symbol index out of bounds");
408        let offset = idx * 4;
409        FieldSymbol::new(
410            read_u16_le(self.bytes, offset),
411            StringId::new(read_u16_le(self.bytes, offset + 2)),
412        )
413    }
414
415    /// Number of entries.
416    pub fn len(&self) -> usize {
417        self.count
418    }
419
420    /// Check if empty.
421    pub fn is_empty(&self) -> bool {
422        self.count == 0
423    }
424}
425
426/// View into trivia entries.
427pub struct TriviaView<'a> {
428    bytes: &'a [u8],
429    count: usize,
430}
431
432impl<'a> TriviaView<'a> {
433    /// Get a trivia entry by index.
434    pub fn get(&self, idx: usize) -> TriviaEntry {
435        assert!(idx < self.count, "trivia index out of bounds");
436        TriviaEntry::new(read_u16_le(self.bytes, idx * 2))
437    }
438
439    /// Number of entries.
440    pub fn len(&self) -> usize {
441        self.count
442    }
443
444    /// Check if empty.
445    pub fn is_empty(&self) -> bool {
446        self.count == 0
447    }
448
449    /// Check if a node type is trivia.
450    pub fn contains(&self, node_type: u16) -> bool {
451        (0..self.count).any(|i| self.get(i).node_type == node_type)
452    }
453}
454
455/// View into the regex table for lazy DFA lookup.
456///
457/// Table format per entry: `string_id (u16) | reserved (u16) | offset (u32)` = 8 bytes.
458/// This allows access to both the pattern string (via StringTable) and DFA bytes.
459pub struct RegexView<'a> {
460    blob: &'a [u8],
461    table: &'a [u8],
462}
463
464impl<'a> RegexView<'a> {
465    /// Entry size in bytes: string_id (u16) + reserved (u16) + offset (u32).
466    const ENTRY_SIZE: usize = 8;
467
468    /// Get regex DFA bytes by index.
469    ///
470    /// Returns the raw DFA bytes for the regex at the given index.
471    /// Use `regex-automata` to deserialize: `DFA::from_bytes(&bytes)`.
472    pub fn get_by_index(&self, idx: usize) -> &'a [u8] {
473        let entry_offset = idx * Self::ENTRY_SIZE;
474        let next_entry_offset = (idx + 1) * Self::ENTRY_SIZE;
475
476        let start = read_u32_le(self.table, entry_offset + 4) as usize;
477        let end = read_u32_le(self.table, next_entry_offset + 4) as usize;
478        &self.blob[start..end]
479    }
480
481    /// Get the StringId of the pattern for a regex by index.
482    ///
483    /// This allows looking up the pattern text from StringTable for display.
484    pub fn get_string_id(&self, idx: usize) -> super::StringId {
485        let entry_offset = idx * Self::ENTRY_SIZE;
486        let string_id = read_u16_le(self.table, entry_offset);
487        super::StringId::new(string_id)
488    }
489}
490
491/// View into type metadata.
492///
493/// Types are stored in three sub-sections:
494/// - TypeDefs: structural topology (4 bytes each)
495/// - TypeMembers: fields and variants (4 bytes each)
496/// - TypeNames: name → TypeId mapping (4 bytes each)
497pub struct TypesView<'a> {
498    defs_bytes: &'a [u8],
499    members_bytes: &'a [u8],
500    names_bytes: &'a [u8],
501    defs_count: usize,
502    members_count: usize,
503    names_count: usize,
504}
505
506impl<'a> TypesView<'a> {
507    /// Get a type definition by index.
508    pub fn get_def(&self, idx: usize) -> TypeDef {
509        assert!(idx < self.defs_count, "type def index out of bounds");
510        let offset = idx * 4;
511        TypeDef::from_bytes(&self.defs_bytes[offset..])
512    }
513
514    /// Get a type definition by TypeId.
515    pub fn get(&self, id: TypeId) -> Option<TypeDef> {
516        let idx = id.0 as usize;
517        if idx < self.defs_count {
518            Some(self.get_def(idx))
519        } else {
520            None
521        }
522    }
523
524    /// Get a type member by index.
525    pub fn get_member(&self, idx: usize) -> TypeMember {
526        assert!(idx < self.members_count, "type member index out of bounds");
527        let offset = idx * 4;
528        TypeMember::new(
529            StringId::new(read_u16_le(self.members_bytes, offset)),
530            TypeId(read_u16_le(self.members_bytes, offset + 2)),
531        )
532    }
533
534    /// Get a type name entry by index.
535    pub fn get_name(&self, idx: usize) -> TypeName {
536        assert!(idx < self.names_count, "type name index out of bounds");
537        let offset = idx * 4;
538        TypeName::new(
539            StringId::new(read_u16_le(self.names_bytes, offset)),
540            TypeId(read_u16_le(self.names_bytes, offset + 2)),
541        )
542    }
543
544    /// Number of type definitions.
545    pub fn defs_count(&self) -> usize {
546        self.defs_count
547    }
548
549    /// Number of type members.
550    pub fn members_count(&self) -> usize {
551        self.members_count
552    }
553
554    /// Number of type names.
555    pub fn names_count(&self) -> usize {
556        self.names_count
557    }
558
559    /// Iterate over members of a struct or enum type.
560    pub fn members_of(&self, def: &TypeDef) -> impl Iterator<Item = TypeMember> + '_ {
561        let (start, count) = match def.classify() {
562            TypeData::Composite {
563                member_start,
564                member_count,
565                ..
566            } => (member_start as usize, member_count as usize),
567            _ => (0, 0),
568        };
569        (0..count).map(move |i| self.get_member(start + i))
570    }
571
572    /// Unwrap Optional wrapper and return (inner_type, is_optional).
573    /// If not Optional, returns (type_id, false).
574    pub fn unwrap_optional(&self, type_id: TypeId) -> (TypeId, bool) {
575        let Some(type_def) = self.get(type_id) else {
576            return (type_id, false);
577        };
578        match type_def.classify() {
579            TypeData::Wrapper {
580                kind: TypeKind::Optional,
581                inner,
582            } => (inner, true),
583            _ => (type_id, false),
584        }
585    }
586}
587
588/// View into entrypoints.
589pub struct EntrypointsView<'a> {
590    bytes: &'a [u8],
591    count: usize,
592}
593
594impl<'a> EntrypointsView<'a> {
595    /// Get an entrypoint by index.
596    pub fn get(&self, idx: usize) -> Entrypoint {
597        assert!(idx < self.count, "entrypoint index out of bounds");
598        let offset = idx * 8;
599        Entrypoint::from_bytes(&self.bytes[offset..])
600    }
601
602    /// Number of entrypoints.
603    pub fn len(&self) -> usize {
604        self.count
605    }
606
607    /// Check if empty.
608    pub fn is_empty(&self) -> bool {
609        self.count == 0
610    }
611
612    /// Find an entrypoint by name (requires StringsView for comparison).
613    pub fn find_by_name(&self, name: &str, strings: &StringsView<'_>) -> Option<Entrypoint> {
614        (0..self.count)
615            .map(|i| self.get(i))
616            .find(|e| strings.get(e.name()) == name)
617    }
618}