styx_embed/
lib.rs

1#![doc = include_str!("../README.md")]
2//! Embed Styx schemas in binaries for zero-execution discovery.
3//!
4//! This crate provides macros to embed schemas in your binary, and functions to
5//! extract them without executing the binary. This enables tooling (LSP, CLI) to
6//! discover schemas safely.
7//!
8//! # Embedding schemas
9//!
10//! Each schema must have a `meta { id ... }` block. The ID is used to generate
11//! a unique static name, allowing multiple schemas to coexist in the same binary.
12//!
13//! ## Inline strings
14//!
15//! ```rust,ignore
16//! styx_embed::embed_inline!(r#"
17//! meta { id my-config, version 1.0.0 }
18//! schema { @ @object{ host @string, port @int } }
19//! "#);
20//! ```
21//!
22//! ## From files
23//!
24//! ```rust,ignore
25//! // Single file (path relative to crate root)
26//! styx_embed::embed_file!("schema.styx");
27//!
28//! // Multiple files (each becomes its own embedded schema)
29//! styx_embed::embed_files!("config.styx", "plugin.styx");
30//! ```
31//!
32//! ## Generated from types (build script pattern)
33//!
34//! For schemas derived from Rust types using facet-styx, use a build script:
35//!
36//! ```rust,ignore
37//! // build.rs
38//! fn main() {
39//!     facet_styx::GenerateSchema::<MyConfig>::new()
40//!         .crate_name("myapp-config")
41//!         .version("1")
42//!         .cli("myapp")
43//!         .write("schema.styx");
44//! }
45//!
46//! // src/main.rs
47//! styx_embed::embed_outdir_file!("schema.styx");
48//! ```
49//!
50//! This keeps the schema in sync with your types automatically.
51//!
52//! # Binary format (V2)
53//!
54//! Each embedded schema is stored as its own blob:
55//!
56//! ```text
57//! STYX_SCHEMA_V2\0\0           // 16 bytes magic
58//! <decompressed_len:u32le>
59//! <compressed_len:u32le>
60//! <blake3:32bytes>             // hash of decompressed content
//! <lz4 compressed schema>      // compressed_len bytes: size-prepended LZ4 block
62//! ```
63//!
64//! Multiple schemas in a binary means multiple blobs, each with its own magic header.
65//! The schema's `meta { id ... }` is used to identify which schema is which.
66//!
67//! # Extracting schemas
68//!
69//! ```rust,ignore
70//! use styx_embed::extract_schemas;
71//!
72//! let schemas = extract_schemas(binary_bytes)?;
73//! for schema in schemas {
74//!     println!("{}", schema);
75//! }
76//! ```
77
78// Re-export the proc macros
79pub use styx_embed_macros::{
80    embed_file, embed_files, embed_inline, embed_outdir_file, embed_schema, embed_schemas,
81};
82
/// Magic bytes that identify an embedded Styx schema (V2 format).
/// 16 bytes: "STYX_SCHEMA_V2\0\0"
///
/// In V2 every schema is its own blob: this magic is followed directly by
/// `decompressed_len` (u32 LE), `compressed_len` (u32 LE), a 32-byte BLAKE3
/// hash and the compressed payload.
pub const MAGIC_V2: &[u8; 16] = b"STYX_SCHEMA_V2\0\0";

/// Magic bytes for legacy V1 format (multiple schemas per blob).
/// 16 bytes: "STYX_SCHEMAS_V1\0"
///
/// In V1 the magic is followed by a schema count (u16 LE) and then that many
/// records, each using the same header + payload layout as a V2 blob body.
pub const MAGIC_V1: &[u8; 16] = b"STYX_SCHEMAS_V1\0";
90
/// Reasons why extracting embedded schemas can fail.
#[derive(Debug)]
pub enum ExtractError {
    /// No valid embedded schema was located in the input.
    NotFound,
    /// The input ends before a header or payload is complete.
    Truncated,
    /// The LZ4 payload could not be decompressed, or its decompressed size
    /// disagrees with the length recorded in the header.
    DecompressFailed,
    /// The BLAKE3 hash of the decompressed data differs from the stored hash
    /// (data corruption or a false-positive magic match).
    HashMismatch,
    /// The decompressed payload is not valid UTF-8.
    InvalidUtf8,
}

impl std::fmt::Display for ExtractError {
    /// Writes the fixed, human-readable message for this error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::NotFound => "no embedded styx schemas found",
            Self::Truncated => "embedded schema data is truncated",
            Self::DecompressFailed => "LZ4 decompression failed",
            Self::HashMismatch => "BLAKE3 hash mismatch",
            Self::InvalidUtf8 => "schema is not valid UTF-8",
        };
        f.write_str(message)
    }
}

impl std::error::Error for ExtractError {}
119
120/// Compress a schema and return the blob (for testing).
121pub fn compress_schema(schema: &str) -> Vec<u8> {
122    let decompressed = schema.as_bytes();
123    let hash = blake3::hash(decompressed);
124    let compressed = lz4_flex::compress_prepend_size(decompressed);
125
126    let mut blob = Vec::with_capacity(16 + 4 + 4 + 32 + compressed.len());
127    blob.extend_from_slice(MAGIC_V2);
128    blob.extend_from_slice(&(decompressed.len() as u32).to_le_bytes());
129    blob.extend_from_slice(&(compressed.len() as u32).to_le_bytes());
130    blob.extend_from_slice(hash.as_bytes());
131    blob.extend_from_slice(&compressed);
132    blob
133}
134
/// Build the complete embedded blob for a single schema (V2 format).
///
/// This is a thin wrapper that forwards to [`compress_schema`]; the returned
/// bytes start with [`MAGIC_V2`] and can be read back with
/// [`extract_schemas`].
pub fn build_embedded_blob(schema: &str) -> Vec<u8> {
    compress_schema(schema)
}
139
140/// Extract all schemas from binary data.
141///
142/// Scans for magic bytes and extracts all embedded schemas found.
143/// In V2 format, each schema has its own blob with its own magic header.
144///
145/// Returns an error only if no schemas are found at all.
146pub fn extract_schemas(data: &[u8]) -> Result<Vec<String>, ExtractError> {
147    let mut schemas = Vec::new();
148    let mut search_start = 0;
149
150    // Find all V2 blobs
151    while let Some(magic_pos) = find_magic_from(data, search_start, MAGIC_V2) {
152        match try_extract_v2_at(data, magic_pos) {
153            Ok(schema) => {
154                schemas.push(schema);
155                // Continue searching after this blob
156                search_start = magic_pos + MAGIC_V2.len();
157            }
158            Err(_) => {
159                // False positive (e.g., magic in debug symbols), try next
160                search_start = magic_pos + 1;
161            }
162        }
163    }
164
165    // Also try legacy V1 format for backwards compatibility
166    search_start = 0;
167    while let Some(magic_pos) = find_magic_from(data, search_start, MAGIC_V1) {
168        match try_extract_v1_at(data, magic_pos) {
169            Ok(mut v1_schemas) => {
170                schemas.append(&mut v1_schemas);
171                search_start = magic_pos + MAGIC_V1.len();
172            }
173            Err(_) => {
174                search_start = magic_pos + 1;
175            }
176        }
177    }
178
179    if schemas.is_empty() {
180        Err(ExtractError::NotFound)
181    } else {
182        Ok(schemas)
183    }
184}
185
186/// Try to extract a single schema from V2 format at a specific position.
187fn try_extract_v2_at(data: &[u8], magic_pos: usize) -> Result<String, ExtractError> {
188    let mut pos = magic_pos + MAGIC_V2.len();
189
190    // Read header: decompressed_len (4) + compressed_len (4) + hash (32) = 40 bytes
191    if pos + 40 > data.len() {
192        return Err(ExtractError::Truncated);
193    }
194
195    let decompressed_len =
196        u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]]) as usize;
197    pos += 4;
198
199    let compressed_len =
200        u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]]) as usize;
201    pos += 4;
202
203    let expected_hash: [u8; 32] = data[pos..pos + 32]
204        .try_into()
205        .map_err(|_| ExtractError::Truncated)?;
206    pos += 32;
207
208    // Read compressed data
209    if pos + compressed_len > data.len() {
210        return Err(ExtractError::Truncated);
211    }
212    let compressed = &data[pos..pos + compressed_len];
213
214    // Decompress
215    let decompressed = lz4_flex::decompress_size_prepended(compressed)
216        .map_err(|_| ExtractError::DecompressFailed)?;
217
218    // Verify length
219    if decompressed.len() != decompressed_len {
220        return Err(ExtractError::DecompressFailed);
221    }
222
223    // Verify hash
224    let actual_hash = blake3::hash(&decompressed);
225    if actual_hash.as_bytes() != &expected_hash {
226        return Err(ExtractError::HashMismatch);
227    }
228
229    // Convert to string
230    String::from_utf8(decompressed).map_err(|_| ExtractError::InvalidUtf8)
231}
232
233/// Try to extract schemas from legacy V1 format at a specific position.
234fn try_extract_v1_at(data: &[u8], magic_pos: usize) -> Result<Vec<String>, ExtractError> {
235    let mut pos = magic_pos + MAGIC_V1.len();
236
237    // Read count
238    if pos + 2 > data.len() {
239        return Err(ExtractError::Truncated);
240    }
241    let count = u16::from_le_bytes([data[pos], data[pos + 1]]) as usize;
242    pos += 2;
243
244    let mut schemas = Vec::with_capacity(count);
245
246    for _ in 0..count {
247        // Read header: decompressed_len (4) + compressed_len (4) + hash (32) = 40 bytes
248        if pos + 40 > data.len() {
249            return Err(ExtractError::Truncated);
250        }
251
252        let decompressed_len =
253            u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]]) as usize;
254        pos += 4;
255
256        let compressed_len =
257            u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]]) as usize;
258        pos += 4;
259
260        let expected_hash: [u8; 32] = data[pos..pos + 32]
261            .try_into()
262            .map_err(|_| ExtractError::Truncated)?;
263        pos += 32;
264
265        // Read compressed data
266        if pos + compressed_len > data.len() {
267            return Err(ExtractError::Truncated);
268        }
269        let compressed = &data[pos..pos + compressed_len];
270        pos += compressed_len;
271
272        // Decompress
273        let decompressed = lz4_flex::decompress_size_prepended(compressed)
274            .map_err(|_| ExtractError::DecompressFailed)?;
275
276        // Verify length
277        if decompressed.len() != decompressed_len {
278            return Err(ExtractError::DecompressFailed);
279        }
280
281        // Verify hash
282        let actual_hash = blake3::hash(&decompressed);
283        if actual_hash.as_bytes() != &expected_hash {
284            return Err(ExtractError::HashMismatch);
285        }
286
287        // Convert to string
288        let schema = String::from_utf8(decompressed).map_err(|_| ExtractError::InvalidUtf8)?;
289        schemas.push(schema);
290    }
291
292    Ok(schemas)
293}
294
/// Locate the first occurrence of `magic` in `data` at or after `start`.
///
/// Returns the absolute offset of the match within `data`, or `None` when
/// `start` lies at or beyond the end of `data` or no match exists.
fn find_magic_from(data: &[u8], start: usize, magic: &[u8; 16]) -> Option<usize> {
    // `get` yields `None` for an out-of-range start; an empty tail has no windows.
    let tail = data.get(start..)?;
    tail.windows(magic.len())
        .enumerate()
        .find_map(|(offset, window)| (window == magic).then_some(start + offset))
}
305
/// Section names used for embedding schemas in different object formats.
///
/// These are the names the `find_*_section` helpers in this file look for
/// when locating the schema data inside a parsed object file.
// NOTE(review): presumably these must match the section names emitted by the
// `styx_embed_macros` proc macros — confirm against that crate.
mod section_names {
    /// ELF section name (Linux)
    pub const ELF: &str = ".styx_schemas";
    /// Mach-O segment name (macOS)
    pub const MACHO_SEGMENT: &str = "__DATA";
    /// Mach-O section name (macOS)
    pub const MACHO_SECTION: &str = "__styx_schemas";
    /// PE/COFF section name (Windows)
    pub const PE: &str = ".styx";
}
317
318/// Extract schemas from binary data using object format parsing.
319///
320/// Parses ELF, Mach-O, or PE headers to locate the embedded schema section
321/// directly, avoiding a full binary scan. Falls back to magic byte scanning
322/// if the object format is unknown or section not found.
323pub fn extract_schemas_from_object(data: &[u8]) -> Result<Vec<String>, ExtractError> {
324    use goblin::Object;
325
326    // Try to parse as a known object format
327    if let Ok(object) = Object::parse(data)
328        && let Some(section_data) = find_schema_section(&object, data)
329    {
330        // Found the section - extract directly from it
331        return extract_schemas(section_data);
332    }
333
334    // Fall back to magic byte scanning for unknown formats or missing section
335    extract_schemas(data)
336}
337
/// Find the schema section in a parsed object file.
///
/// Dispatches to the format-specific lookup for ELF, Mach-O and PE objects.
/// `data` must be the same bytes `object` was parsed from, because the
/// returned slice is cut out of `data`. Any other object kind yields `None`.
fn find_schema_section<'a>(object: &goblin::Object, data: &'a [u8]) -> Option<&'a [u8]> {
    use goblin::Object;

    match object {
        Object::Elf(elf) => find_elf_section(elf, data),
        Object::Mach(mach) => find_macho_section(mach, data),
        Object::PE(pe) => find_pe_section(pe, data),
        // Remaining object kinds have no known schema section.
        _ => None,
    }
}
349
350/// Find the .styx_schemas section in an ELF binary.
351fn find_elf_section<'a>(elf: &goblin::elf::Elf, data: &'a [u8]) -> Option<&'a [u8]> {
352    for section in &elf.section_headers {
353        if let Some(name) = elf.shdr_strtab.get_at(section.sh_name)
354            && name == section_names::ELF
355        {
356            let start = section.sh_offset as usize;
357            let size = section.sh_size as usize;
358            if start + size <= data.len() {
359                return Some(&data[start..start + size]);
360            }
361        }
362    }
363    None
364}
365
366/// Find the __DATA,__styx_schemas section in a Mach-O binary.
367fn find_macho_section<'a>(mach: &goblin::mach::Mach, data: &'a [u8]) -> Option<&'a [u8]> {
368    use goblin::mach::Mach;
369
370    match mach {
371        Mach::Binary(macho) => find_macho_section_in_binary(macho, data),
372        Mach::Fat(fat) => {
373            // For fat binaries, try each architecture
374            for arch in fat.iter_arches().flatten() {
375                let start = arch.offset as usize;
376                let size = arch.size as usize;
377                if start + size <= data.len() {
378                    let arch_data = &data[start..start + size];
379                    if let Ok(goblin::Object::Mach(Mach::Binary(macho))) =
380                        goblin::Object::parse(arch_data)
381                        && let Some(section) = find_macho_section_in_binary(&macho, arch_data)
382                    {
383                        return Some(section);
384                    }
385                }
386            }
387            None
388        }
389    }
390}
391
392/// Find the section in a single Mach-O binary (not fat).
393fn find_macho_section_in_binary<'a>(
394    macho: &goblin::mach::MachO,
395    data: &'a [u8],
396) -> Option<&'a [u8]> {
397    for segment in &macho.segments {
398        if let Ok(name) = segment.name()
399            && name == section_names::MACHO_SEGMENT
400        {
401            for (section, _section_data) in segment.sections().ok()? {
402                if let Ok(sect_name) = section.name()
403                    && sect_name == section_names::MACHO_SECTION
404                {
405                    let start = section.offset as usize;
406                    let size = section.size as usize;
407                    if start + size <= data.len() {
408                        return Some(&data[start..start + size]);
409                    }
410                }
411            }
412        }
413    }
414    None
415}
416
417/// Find the .styx section in a PE binary.
418fn find_pe_section<'a>(pe: &goblin::pe::PE, data: &'a [u8]) -> Option<&'a [u8]> {
419    for section in &pe.sections {
420        if let Ok(name) = section.name()
421            && name == section_names::PE
422        {
423            let start = section.pointer_to_raw_data as usize;
424            let size = section.size_of_raw_data as usize;
425            if start + size <= data.len() {
426                return Some(&data[start..start + size]);
427            }
428        }
429    }
430    None
431}
432
/// Extract schemas from a file by memory-mapping it.
///
/// Uses object format parsing to locate the schema section directly.
/// Falls back to magic byte scanning if the format is unknown.
///
/// # Errors
///
/// Returns the underlying I/O error if the file cannot be opened or mapped,
/// or the [`ExtractError`] produced by [`extract_schemas_from_object`]; both
/// are boxed into `Box<dyn std::error::Error>`.
pub fn extract_schemas_from_file(
    path: &std::path::Path,
) -> Result<Vec<String>, Box<dyn std::error::Error>> {
    use std::fs::File;
    let file = File::open(path)?;
    // NOTE(review): the mapping is created in an `unsafe` block with no stated
    // invariant. Presumably the requirement is that the file is not modified
    // or truncated while mapped — confirm against the memmap2 docs and record
    // it here as a `SAFETY:` comment.
    let mmap = unsafe { memmap2::Mmap::map(&file) }?;
    // The map is only read within this call; the returned schemas are owned.
    Ok(extract_schemas_from_object(&mmap)?)
}
445
#[cfg(test)]
mod tests {
    use super::*;

    /// A schema survives a build -> extract round trip byte-for-byte.
    #[test]
    fn roundtrip_single_schema_v2() {
        let schema = r#"meta {
  id test-schema
  version 1.0.0
}

schema {
  @ @object{
    name @string
    port @int
  }
}
"#;

        let blob = build_embedded_blob(schema);
        let extracted = extract_schemas(&blob).unwrap();

        assert_eq!(extracted.len(), 1);
        assert_eq!(extracted[0], schema);
    }

    /// Two back-to-back V2 blobs are both found, in input order.
    #[test]
    fn multiple_v2_blobs() {
        let schema1 = "meta { id s1, version 1.0.0 }\nschema { @ @string }";
        let schema2 = "meta { id s2, version 2.0.0 }\nschema { @ @int }";

        // Concatenate two V2 blobs (simulating multiple embedded schemas)
        let mut data = build_embedded_blob(schema1);
        data.extend(build_embedded_blob(schema2));

        let extracted = extract_schemas(&data).unwrap();

        assert_eq!(extracted.len(), 2);
        assert_eq!(extracted[0], schema1);
        assert_eq!(extracted[1], schema2);
    }

    /// Input without any magic bytes reports `NotFound`.
    #[test]
    fn not_found_in_random_data() {
        let data = vec![0u8; 1000];
        assert!(matches!(
            extract_schemas(&data),
            Err(ExtractError::NotFound)
        ));
    }

    /// A blob surrounded by unrelated bytes is still located by the scan.
    #[test]
    fn embedded_in_larger_binary() {
        let schema = "meta { id test, version 1.0.0 }\nschema { @ @bool }";

        // Simulate a binary with stuff before and after
        let mut binary = vec![0xDE, 0xAD, 0xBE, 0xEF]; // header
        binary.extend_from_slice(&[0u8; 1000]); // padding
        binary.extend_from_slice(&build_embedded_blob(schema));
        binary.extend_from_slice(&[0u8; 500]); // trailing data

        let extracted = extract_schemas(&binary).unwrap();
        assert_eq!(extracted.len(), 1);
        assert_eq!(extracted[0], schema);
    }

    /// A blob with a corrupted hash is rejected. `extract_schemas` treats the
    /// failed blob as a false positive, so the visible error is `NotFound`
    /// rather than `HashMismatch`.
    #[test]
    fn hash_mismatch_detected() {
        let schema = "meta { id test, version 1.0.0 }\nschema { @ @unit }";
        let mut blob = build_embedded_blob(schema);

        // Corrupt the hash (bytes 16+8 = 24 onwards is the hash)
        let hash_start = MAGIC_V2.len() + 4 + 4;
        blob[hash_start] ^= 0xFF;

        assert!(matches!(
            extract_schemas(&blob),
            Err(ExtractError::NotFound) // No valid schemas found
        ));
    }
}
526}