Skip to main content

ud_format/
wasm.rs

1//! WebAssembly binary container — parse + byte-identical write.
2//!
3//! WASM modules are a flat sequence of *sections*:
4//!
5//! ```text
6//!   magic    : '\0' 'a' 's' 'm'         (4 bytes)
7//!   version  : u32 little-endian        (4 bytes; 1 for current MVP)
8//!   sections : (section_id:u8, size:LEB128, body:[u8; size])*
9//! ```
10//!
11//! For round-trip byte identity we preserve every byte of the
12//! header and every section. LEB128 size encodings vary in
13//! width — LLVM pads them to 5 bytes so post-link patching
14//! can adjust section sizes without shifting the rest of the
15//! module — and `WasmFile` keeps the encoded form verbatim so
16//! a parse → write_to_vec cycle reproduces the input exactly.
17//!
18//! Section interpretation (decoding function bodies, names,
19//! types, …) lives in higher layers. The format crate only
20//! offers: where each section starts, what its id is, and a
21//! handle to its raw bytes.
22
23use std::ops::Range;
24
/// 4-byte WASM magic: a NUL byte followed by ASCII `asm`.
///
/// [`WasmFile::parse`] rejects any buffer whose first four bytes differ,
/// and [`is_wasm`] tests for exactly this prefix.
pub const MAGIC: [u8; 4] = [0x00, b'a', b's', b'm'];
27
28/// Errors raised by [`WasmFile::parse`].
29#[derive(Debug, thiserror::Error, Clone, PartialEq, Eq)]
30pub enum Error {
31    #[error("buffer too short ({len} bytes); needs at least 8 for magic + version")]
32    TooShort { len: usize },
33    #[error("bad magic {magic:?}; expected {:?}", MAGIC)]
34    BadMagic { magic: [u8; 4] },
35    #[error("section header at offset {at:#x} truncated")]
36    SectionTruncated { at: usize },
37    #[error("section size LEB128 at offset {at:#x} doesn't terminate")]
38    LebOverflow { at: usize },
39    #[error("section size LEB128 at offset {at:#x} encodes value > usize::MAX")]
40    LebTooLarge { at: usize },
41    #[error("section at offset {at:#x} declares size {size} that runs past end of buffer")]
42    SectionRunsPastEnd { at: usize, size: u64 },
43}
44
/// Result alias for this module; the error type defaults to [`Error`]
/// but can still be overridden through the second parameter.
pub type Result<T, E = Error> = std::result::Result<T, E>;
46
// Standard section IDs, i.e. the values of the one-byte `section_id`
// field that opens every section.

/// Custom section id (0). Groups everything non-standard — debug info,
/// name maps, producer metadata, …
pub const SECTION_CUSTOM: u8 = 0;
/// Type section id.
pub const SECTION_TYPE: u8 = 1;
/// Import section id.
pub const SECTION_IMPORT: u8 = 2;
/// Function section id.
pub const SECTION_FUNCTION: u8 = 3;
/// Table section id.
pub const SECTION_TABLE: u8 = 4;
/// Memory section id.
pub const SECTION_MEMORY: u8 = 5;
/// Global section id.
pub const SECTION_GLOBAL: u8 = 6;
/// Export section id.
pub const SECTION_EXPORT: u8 = 7;
/// Start section id.
pub const SECTION_START: u8 = 8;
/// Element section id.
pub const SECTION_ELEMENT: u8 = 9;
/// Code section id.
pub const SECTION_CODE: u8 = 10;
/// Data section id.
pub const SECTION_DATA: u8 = 11;
/// Data-count section id.
pub const SECTION_DATA_COUNT: u8 = 12;
62
/// Metadata for one section, located by byte range into the
/// parent [`WasmFile`]'s `bytes`. The `header_range` covers
/// the section id byte plus the size-LEB; `body_range` covers
/// just the section's payload.
///
/// The struct stores offsets only; the bytes themselves stay in the
/// owning [`WasmFile`] (see [`WasmFile::section_body`]).
#[derive(Debug, Clone)]
pub struct Section {
    /// Raw section id byte, stored as read (compare against the
    /// `SECTION_*` constants).
    pub id: u8,
    /// Offsets of the id byte and the size LEB128 exactly as encoded;
    /// ends where `body_range` starts.
    pub header_range: Range<usize>,
    /// Offsets of the payload; its length is the size decoded from the header.
    pub body_range: Range<usize>,
}
73
/// A parsed WASM module. `bytes` is the verbatim input; the
/// `sections` list records where each section starts and ends
/// so callers can read or rewrite them without re-parsing.
///
/// All fields are public. [`WasmFile::section_body`] indexes `bytes`
/// with the ranges stored in `sections`, so the two must be kept
/// consistent if either is modified after parsing.
#[derive(Debug, Clone)]
pub struct WasmFile {
    /// Copy of the buffer passed to [`WasmFile::parse`], header included.
    pub bytes: Vec<u8>,
    /// Version field, decoded as a little-endian `u32` from bytes 4..8.
    /// `parse` records it without checking its value.
    pub version: u32,
    /// Sections in the order they appear in `bytes`.
    pub sections: Vec<Section>,
}
83
84impl WasmFile {
85    /// Parse `bytes` into a [`WasmFile`]. The bytes are
86    /// retained verbatim so [`Self::write_to_vec`] can
87    /// reproduce them.
88    #[allow(clippy::missing_errors_doc)]
89    pub fn parse(bytes: &[u8]) -> Result<Self> {
90        if bytes.len() < 8 {
91            return Err(Error::TooShort { len: bytes.len() });
92        }
93        let magic: [u8; 4] = bytes[0..4].try_into().expect("4-byte prefix");
94        if magic != MAGIC {
95            return Err(Error::BadMagic { magic });
96        }
97        let version = u32::from_le_bytes(bytes[4..8].try_into().expect("4-byte prefix"));
98
99        let mut sections = Vec::new();
100        let mut cursor = 8usize;
101        while cursor < bytes.len() {
102            if cursor + 1 > bytes.len() {
103                return Err(Error::SectionTruncated { at: cursor });
104            }
105            let id = bytes[cursor];
106            let header_start = cursor;
107            cursor += 1;
108            let (size, size_len) = read_leb128_u32(bytes, cursor)?;
109            cursor += size_len;
110            let body_start = cursor;
111            let size_us = size as usize;
112            let body_end = body_start
113                .checked_add(size_us)
114                .ok_or(Error::SectionRunsPastEnd {
115                    at: header_start,
116                    size: u64::from(size),
117                })?;
118            if body_end > bytes.len() {
119                return Err(Error::SectionRunsPastEnd {
120                    at: header_start,
121                    size: u64::from(size),
122                });
123            }
124            sections.push(Section {
125                id,
126                header_range: header_start..body_start,
127                body_range: body_start..body_end,
128            });
129            cursor = body_end;
130        }
131
132        Ok(Self {
133            bytes: bytes.to_vec(),
134            version,
135            sections,
136        })
137    }
138
139    /// Reproduce the original bytes. Currently a clone — when
140    /// section editing lands the layout-rebuild logic will
141    /// live here.
142    #[must_use]
143    pub fn write_to_vec(&self) -> Vec<u8> {
144        self.bytes.clone()
145    }
146
147    /// Body bytes of `section` (excluding the id + size header).
148    #[must_use]
149    pub fn section_body(&self, section: &Section) -> &[u8] {
150        &self.bytes[section.body_range.clone()]
151    }
152
153    /// For a custom (id = 0) section, return its decoded name
154    /// (UTF-8). The name is a LEB128-length-prefixed string at
155    /// the start of the body. Returns `None` for non-custom
156    /// sections or malformed names.
157    #[must_use]
158    pub fn custom_section_name(&self, section: &Section) -> Option<String> {
159        if section.id != SECTION_CUSTOM {
160            return None;
161        }
162        let body = self.section_body(section);
163        let (len, len_len) = read_leb128_u32(body, 0).ok()?;
164        let name_start = len_len;
165        let name_end = name_start.checked_add(len as usize)?;
166        if name_end > body.len() {
167            return None;
168        }
169        std::str::from_utf8(&body[name_start..name_end])
170            .ok()
171            .map(str::to_string)
172    }
173}
174
175/// Is `bytes` plausibly a WASM module? Just checks the first
176/// 4 bytes against [`MAGIC`].
177#[must_use]
178pub fn is_wasm(bytes: &[u8]) -> bool {
179    bytes.len() >= 4 && bytes[0..4] == MAGIC
180}
181
182/// Decode an unsigned LEB128 starting at `bytes[at]`. Returns
183/// `(value, byte_count)`. Caps at 32-bit values per the WASM
184/// section-size encoding rules (5-byte max LEB128).
185fn read_leb128_u32(bytes: &[u8], at: usize) -> Result<(u32, usize)> {
186    let mut result: u64 = 0;
187    let mut shift: u32 = 0;
188    let mut i = 0usize;
189    loop {
190        let pos = at.checked_add(i).ok_or(Error::LebOverflow { at })?;
191        let byte = *bytes.get(pos).ok_or(Error::LebOverflow { at })?;
192        let chunk = u64::from(byte & 0x7f);
193        result |= chunk << shift;
194        i += 1;
195        if byte & 0x80 == 0 {
196            break;
197        }
198        shift += 7;
199        if i > 5 {
200            return Err(Error::LebOverflow { at });
201        }
202    }
203    if result > u64::from(u32::MAX) {
204        return Err(Error::LebTooLarge { at });
205    }
206    #[allow(clippy::cast_possible_truncation)]
207    Ok((result as u32, i))
208}
209
#[cfg(test)]
mod tests {
    use super::*;

    /// A header-only module (magic + version 1) parses to zero sections
    /// and is written back unchanged.
    #[test]
    fn empty_module_round_trips() {
        let header_only = [0x00, b'a', b's', b'm', 0x01, 0x00, 0x00, 0x00];
        let parsed = WasmFile::parse(&header_only).unwrap();
        assert_eq!(parsed.version, 1);
        assert!(parsed.sections.is_empty());
        assert_eq!(parsed.write_to_vec(), header_only);
    }

    /// A single wrong magic byte ('n' instead of 'm') is rejected.
    #[test]
    fn rejects_bad_magic() {
        let corrupted = [0x00, b'a', b's', b'n', 0x01, 0x00, 0x00, 0x00];
        let outcome = WasmFile::parse(&corrupted);
        assert!(matches!(outcome, Err(Error::BadMagic { .. })));
    }

    /// The same value decodes from both its 5-byte padded form and its
    /// 1-byte minimal form, with the encoded width reported each time.
    #[test]
    fn leb128_handles_padded_encoding() {
        // 0x0b padded out to five bytes: 8b 80 80 80 00.
        let padded = [0x8b, 0x80, 0x80, 0x80, 0x00];
        // 0x0b in its shortest encoding.
        let minimal = [0x0b];
        assert_eq!(read_leb128_u32(&padded, 0), Ok((0x0b, 5)));
        assert_eq!(read_leb128_u32(&minimal, 0), Ok((0x0b, 1)));
    }
}
241}