styx_embed/
lib.rs

1//! Embed Styx schemas in binaries for zero-execution discovery.
2//!
3//! This crate provides macros to embed schemas in your binary, and functions to
4//! extract them without executing the binary. This enables tooling (LSP, CLI) to
5//! discover schemas safely.
6//!
7//! # Embedding schemas
8//!
9//! Each schema must have a `meta { id ... }` block. The ID is used to generate
10//! a unique static name, allowing multiple schemas to coexist in the same binary.
11//!
12//! ## Inline strings
13//!
14//! ```rust,ignore
15//! styx_embed::embed_inline!(r#"
16//! meta { id my-config, version 1.0.0 }
17//! schema { @ @object{ host @string, port @int } }
18//! "#);
19//! ```
20//!
21//! ## From files
22//!
23//! ```rust,ignore
24//! // Single file (path relative to crate root)
25//! styx_embed::embed_file!("schema.styx");
26//!
27//! // Multiple files (each becomes its own embedded schema)
28//! styx_embed::embed_files!("config.styx", "plugin.styx");
29//! ```
30//!
31//! ## Generated from types (build script pattern)
32//!
33//! For schemas derived from Rust types using facet-styx, use a build script:
34//!
35//! ```rust,ignore
36//! // build.rs
37//! fn main() {
38//!     facet_styx::GenerateSchema::<MyConfig>::new()
39//!         .crate_name("myapp-config")
40//!         .version("1")
41//!         .cli("myapp")
42//!         .write("schema.styx");
43//! }
44//!
45//! // src/main.rs
46//! styx_embed::embed_outdir_file!("schema.styx");
47//! ```
48//!
49//! This keeps the schema in sync with your types automatically.
50//!
51//! # Binary format (V2)
52//!
53//! Each embedded schema is stored as its own blob:
54//!
55//! ```text
56//! STYX_SCHEMA_V2\0\0           // 16 bytes magic
57//! <decompressed_len:u32le>
58//! <compressed_len:u32le>
59//! <blake3:32bytes>             // hash of decompressed content
//! <lz4 compressed schema>      // lz4_flex block, prefixed with its own u32le decompressed size
61//! ```
62//!
63//! Multiple schemas in a binary means multiple blobs, each with its own magic header.
64//! The schema's `meta { id ... }` is used to identify which schema is which.
65//!
66//! # Extracting schemas
67//!
68//! ```rust,ignore
69//! use styx_embed::extract_schemas;
70//!
71//! let schemas = extract_schemas(binary_bytes)?;
72//! for schema in schemas {
73//!     println!("{}", schema);
74//! }
75//! ```
76
77// Re-export the proc macros
78pub use styx_embed_macros::{
79    embed_file, embed_files, embed_inline, embed_outdir_file, embed_schema, embed_schemas,
80};
81
/// Magic bytes that identify an embedded Styx schema (V2 format).
///
/// 16 bytes: the ASCII text `STYX_SCHEMA_V2` followed by two NUL bytes.
/// `compress_schema` writes this at the start of every blob and
/// `extract_schemas` scans for it.
pub const MAGIC_V2: &[u8; 16] = b"STYX_SCHEMA_V2\0\0";

/// Magic bytes for legacy V1 format (multiple schemas per blob).
///
/// 16 bytes: the ASCII text `STYX_SCHEMAS_V1` followed by one NUL byte.
/// This file only reads V1 blobs (for backwards compatibility); it never
/// writes them.
pub const MAGIC_V1: &[u8; 16] = b"STYX_SCHEMAS_V1\0";
89
/// Error type for schema extraction.
///
/// The variants carry no payload, so the type is cheap to clone and can be
/// compared directly with `==` (useful in tests and when callers want to
/// distinguish "nothing embedded" from "embedded but corrupt").
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExtractError {
    /// Magic bytes not found in binary.
    NotFound,
    /// Binary is truncated or malformed.
    Truncated,
    /// LZ4 decompression failed.
    DecompressFailed,
    /// BLAKE3 hash mismatch (data corruption or false positive match).
    HashMismatch,
    /// Decompressed data is not valid UTF-8.
    InvalidUtf8,
}

impl std::fmt::Display for ExtractError {
    /// Writes a short, lowercase, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            ExtractError::NotFound => "no embedded styx schemas found",
            ExtractError::Truncated => "embedded schema data is truncated",
            ExtractError::DecompressFailed => "LZ4 decompression failed",
            ExtractError::HashMismatch => "BLAKE3 hash mismatch",
            ExtractError::InvalidUtf8 => "schema is not valid UTF-8",
        };
        f.write_str(message)
    }
}

impl std::error::Error for ExtractError {}
118
119/// Compress a schema and return the blob (for testing).
120pub fn compress_schema(schema: &str) -> Vec<u8> {
121    let decompressed = schema.as_bytes();
122    let hash = blake3::hash(decompressed);
123    let compressed = lz4_flex::compress_prepend_size(decompressed);
124
125    let mut blob = Vec::with_capacity(16 + 4 + 4 + 32 + compressed.len());
126    blob.extend_from_slice(MAGIC_V2);
127    blob.extend_from_slice(&(decompressed.len() as u32).to_le_bytes());
128    blob.extend_from_slice(&(compressed.len() as u32).to_le_bytes());
129    blob.extend_from_slice(hash.as_bytes());
130    blob.extend_from_slice(&compressed);
131    blob
132}
133
/// Build the complete embedded blob for a single schema (V2 format).
///
/// Thin wrapper that forwards to `compress_schema` and returns its result
/// unchanged: in V2 the compressed blob (magic header included) is the
/// complete embedded representation of one schema.
pub fn build_embedded_blob(schema: &str) -> Vec<u8> {
    compress_schema(schema)
}
138
139/// Extract all schemas from binary data.
140///
141/// Scans for magic bytes and extracts all embedded schemas found.
142/// In V2 format, each schema has its own blob with its own magic header.
143///
144/// Returns an error only if no schemas are found at all.
145pub fn extract_schemas(data: &[u8]) -> Result<Vec<String>, ExtractError> {
146    let mut schemas = Vec::new();
147    let mut search_start = 0;
148
149    // Find all V2 blobs
150    while let Some(magic_pos) = find_magic_from(data, search_start, MAGIC_V2) {
151        match try_extract_v2_at(data, magic_pos) {
152            Ok(schema) => {
153                schemas.push(schema);
154                // Continue searching after this blob
155                search_start = magic_pos + MAGIC_V2.len();
156            }
157            Err(_) => {
158                // False positive (e.g., magic in debug symbols), try next
159                search_start = magic_pos + 1;
160            }
161        }
162    }
163
164    // Also try legacy V1 format for backwards compatibility
165    search_start = 0;
166    while let Some(magic_pos) = find_magic_from(data, search_start, MAGIC_V1) {
167        match try_extract_v1_at(data, magic_pos) {
168            Ok(mut v1_schemas) => {
169                schemas.append(&mut v1_schemas);
170                search_start = magic_pos + MAGIC_V1.len();
171            }
172            Err(_) => {
173                search_start = magic_pos + 1;
174            }
175        }
176    }
177
178    if schemas.is_empty() {
179        Err(ExtractError::NotFound)
180    } else {
181        Ok(schemas)
182    }
183}
184
185/// Try to extract a single schema from V2 format at a specific position.
186fn try_extract_v2_at(data: &[u8], magic_pos: usize) -> Result<String, ExtractError> {
187    let mut pos = magic_pos + MAGIC_V2.len();
188
189    // Read header: decompressed_len (4) + compressed_len (4) + hash (32) = 40 bytes
190    if pos + 40 > data.len() {
191        return Err(ExtractError::Truncated);
192    }
193
194    let decompressed_len =
195        u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]]) as usize;
196    pos += 4;
197
198    let compressed_len =
199        u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]]) as usize;
200    pos += 4;
201
202    let expected_hash: [u8; 32] = data[pos..pos + 32]
203        .try_into()
204        .map_err(|_| ExtractError::Truncated)?;
205    pos += 32;
206
207    // Read compressed data
208    if pos + compressed_len > data.len() {
209        return Err(ExtractError::Truncated);
210    }
211    let compressed = &data[pos..pos + compressed_len];
212
213    // Decompress
214    let decompressed = lz4_flex::decompress_size_prepended(compressed)
215        .map_err(|_| ExtractError::DecompressFailed)?;
216
217    // Verify length
218    if decompressed.len() != decompressed_len {
219        return Err(ExtractError::DecompressFailed);
220    }
221
222    // Verify hash
223    let actual_hash = blake3::hash(&decompressed);
224    if actual_hash.as_bytes() != &expected_hash {
225        return Err(ExtractError::HashMismatch);
226    }
227
228    // Convert to string
229    String::from_utf8(decompressed).map_err(|_| ExtractError::InvalidUtf8)
230}
231
232/// Try to extract schemas from legacy V1 format at a specific position.
233fn try_extract_v1_at(data: &[u8], magic_pos: usize) -> Result<Vec<String>, ExtractError> {
234    let mut pos = magic_pos + MAGIC_V1.len();
235
236    // Read count
237    if pos + 2 > data.len() {
238        return Err(ExtractError::Truncated);
239    }
240    let count = u16::from_le_bytes([data[pos], data[pos + 1]]) as usize;
241    pos += 2;
242
243    let mut schemas = Vec::with_capacity(count);
244
245    for _ in 0..count {
246        // Read header: decompressed_len (4) + compressed_len (4) + hash (32) = 40 bytes
247        if pos + 40 > data.len() {
248            return Err(ExtractError::Truncated);
249        }
250
251        let decompressed_len =
252            u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]]) as usize;
253        pos += 4;
254
255        let compressed_len =
256            u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]]) as usize;
257        pos += 4;
258
259        let expected_hash: [u8; 32] = data[pos..pos + 32]
260            .try_into()
261            .map_err(|_| ExtractError::Truncated)?;
262        pos += 32;
263
264        // Read compressed data
265        if pos + compressed_len > data.len() {
266            return Err(ExtractError::Truncated);
267        }
268        let compressed = &data[pos..pos + compressed_len];
269        pos += compressed_len;
270
271        // Decompress
272        let decompressed = lz4_flex::decompress_size_prepended(compressed)
273            .map_err(|_| ExtractError::DecompressFailed)?;
274
275        // Verify length
276        if decompressed.len() != decompressed_len {
277            return Err(ExtractError::DecompressFailed);
278        }
279
280        // Verify hash
281        let actual_hash = blake3::hash(&decompressed);
282        if actual_hash.as_bytes() != &expected_hash {
283            return Err(ExtractError::HashMismatch);
284        }
285
286        // Convert to string
287        let schema = String::from_utf8(decompressed).map_err(|_| ExtractError::InvalidUtf8)?;
288        schemas.push(schema);
289    }
290
291    Ok(schemas)
292}
293
/// Locate the first occurrence of `magic` in `data` at or after `start`.
///
/// Returns the absolute offset (relative to the beginning of `data`) of the
/// first matching byte, or `None` when there is no match or `start` lies at
/// or beyond the end of `data`.
fn find_magic_from(data: &[u8], start: usize, magic: &[u8; 16]) -> Option<usize> {
    // `get` yields `None` past the end and an empty tail exactly at the end;
    // an empty tail has no windows, so both cases end up as `None`.
    let haystack = data.get(start..)?;
    haystack
        .windows(magic.len())
        .enumerate()
        .find_map(|(offset, window)| (window == magic.as_slice()).then_some(start + offset))
}
304
/// Section names used for embedding schemas in different object formats.
///
/// The `find_*_section` helpers in this file compare the names found in a
/// parsed object's section table against these constants.
mod section_names {
    /// ELF section name (Linux)
    pub const ELF: &str = ".styx_schemas";
    /// Mach-O segment name (macOS); the schema section is looked up inside
    /// segments with this name.
    pub const MACHO_SEGMENT: &str = "__DATA";
    /// Mach-O section name (macOS)
    pub const MACHO_SECTION: &str = "__styx_schemas";
    /// PE/COFF section name (Windows)
    pub const PE: &str = ".styx";
}
316
317/// Extract schemas from binary data using object format parsing.
318///
319/// Parses ELF, Mach-O, or PE headers to locate the embedded schema section
320/// directly, avoiding a full binary scan. Falls back to magic byte scanning
321/// if the object format is unknown or section not found.
322pub fn extract_schemas_from_object(data: &[u8]) -> Result<Vec<String>, ExtractError> {
323    use goblin::Object;
324
325    // Try to parse as a known object format
326    if let Ok(object) = Object::parse(data)
327        && let Some(section_data) = find_schema_section(&object, data)
328    {
329        // Found the section - extract directly from it
330        return extract_schemas(section_data);
331    }
332
333    // Fall back to magic byte scanning for unknown formats or missing section
334    extract_schemas(data)
335}
336
/// Find the schema section in a parsed object file.
///
/// Dispatches on the object format to the matching `find_*_section` helper
/// and returns the section's bytes as a sub-slice of `data`. Returns `None`
/// for any other object kind, or when the helper does not find the section.
fn find_schema_section<'a>(object: &goblin::Object, data: &'a [u8]) -> Option<&'a [u8]> {
    use goblin::Object;

    match object {
        Object::Elf(elf) => find_elf_section(elf, data),
        Object::Mach(mach) => find_macho_section(mach, data),
        Object::PE(pe) => find_pe_section(pe, data),
        // Every other object kind goblin recognises has no schema section lookup.
        _ => None,
    }
}
348
349/// Find the .styx_schemas section in an ELF binary.
350fn find_elf_section<'a>(elf: &goblin::elf::Elf, data: &'a [u8]) -> Option<&'a [u8]> {
351    for section in &elf.section_headers {
352        if let Some(name) = elf.shdr_strtab.get_at(section.sh_name)
353            && name == section_names::ELF
354        {
355            let start = section.sh_offset as usize;
356            let size = section.sh_size as usize;
357            if start + size <= data.len() {
358                return Some(&data[start..start + size]);
359            }
360        }
361    }
362    None
363}
364
365/// Find the __DATA,__styx_schemas section in a Mach-O binary.
366fn find_macho_section<'a>(mach: &goblin::mach::Mach, data: &'a [u8]) -> Option<&'a [u8]> {
367    use goblin::mach::Mach;
368
369    match mach {
370        Mach::Binary(macho) => find_macho_section_in_binary(macho, data),
371        Mach::Fat(fat) => {
372            // For fat binaries, try each architecture
373            for arch in fat.iter_arches().flatten() {
374                let start = arch.offset as usize;
375                let size = arch.size as usize;
376                if start + size <= data.len() {
377                    let arch_data = &data[start..start + size];
378                    if let Ok(goblin::Object::Mach(Mach::Binary(macho))) =
379                        goblin::Object::parse(arch_data)
380                        && let Some(section) = find_macho_section_in_binary(&macho, arch_data)
381                    {
382                        return Some(section);
383                    }
384                }
385            }
386            None
387        }
388    }
389}
390
391/// Find the section in a single Mach-O binary (not fat).
392fn find_macho_section_in_binary<'a>(
393    macho: &goblin::mach::MachO,
394    data: &'a [u8],
395) -> Option<&'a [u8]> {
396    for segment in &macho.segments {
397        if let Ok(name) = segment.name()
398            && name == section_names::MACHO_SEGMENT
399        {
400            for (section, _section_data) in segment.sections().ok()? {
401                if let Ok(sect_name) = section.name()
402                    && sect_name == section_names::MACHO_SECTION
403                {
404                    let start = section.offset as usize;
405                    let size = section.size as usize;
406                    if start + size <= data.len() {
407                        return Some(&data[start..start + size]);
408                    }
409                }
410            }
411        }
412    }
413    None
414}
415
416/// Find the .styx section in a PE binary.
417fn find_pe_section<'a>(pe: &goblin::pe::PE, data: &'a [u8]) -> Option<&'a [u8]> {
418    for section in &pe.sections {
419        if let Ok(name) = section.name()
420            && name == section_names::PE
421        {
422            let start = section.pointer_to_raw_data as usize;
423            let size = section.size_of_raw_data as usize;
424            if start + size <= data.len() {
425                return Some(&data[start..start + size]);
426            }
427        }
428    }
429    None
430}
431
432/// Extract schemas from a file by memory-mapping it.
433///
434/// Uses object format parsing to locate the schema section directly.
435/// Falls back to magic byte scanning if the format is unknown.
436pub fn extract_schemas_from_file(
437    path: &std::path::Path,
438) -> Result<Vec<String>, Box<dyn std::error::Error>> {
439    use std::fs::File;
440    let file = File::open(path)?;
441    let mmap = unsafe { memmap2::Mmap::map(&file) }?;
442    Ok(extract_schemas_from_object(&mmap)?)
443}
444
/// Round-trip and failure-mode tests for blob building and extraction.
#[cfg(test)]
mod tests {
    use super::*;

    /// A single schema survives build -> extract unchanged.
    #[test]
    fn roundtrip_single_schema_v2() {
        let schema = r#"meta {
  id test-schema
  version 1.0.0
}

schema {
  @ @object{
    name @string
    port @int
  }
}
"#;

        let blob = build_embedded_blob(schema);
        let extracted = extract_schemas(&blob).unwrap();

        assert_eq!(extracted.len(), 1);
        assert_eq!(extracted[0], schema);
    }

    /// Two back-to-back V2 blobs are both found, in order of appearance.
    #[test]
    fn multiple_v2_blobs() {
        let schema1 = "meta { id s1, version 1.0.0 }\nschema { @ @string }";
        let schema2 = "meta { id s2, version 2.0.0 }\nschema { @ @int }";

        // Concatenate two V2 blobs (simulating multiple embedded schemas)
        let mut data = build_embedded_blob(schema1);
        data.extend(build_embedded_blob(schema2));

        let extracted = extract_schemas(&data).unwrap();

        assert_eq!(extracted.len(), 2);
        assert_eq!(extracted[0], schema1);
        assert_eq!(extracted[1], schema2);
    }

    /// Input without any magic bytes (here: all zeros) reports `NotFound`.
    #[test]
    fn not_found_in_random_data() {
        let data = vec![0u8; 1000];
        assert!(matches!(
            extract_schemas(&data),
            Err(ExtractError::NotFound)
        ));
    }

    /// A blob surrounded by unrelated bytes is still located by the scan.
    #[test]
    fn embedded_in_larger_binary() {
        let schema = "meta { id test, version 1.0.0 }\nschema { @ @bool }";

        // Simulate a binary with stuff before and after
        let mut binary = vec![0xDE, 0xAD, 0xBE, 0xEF]; // header
        binary.extend_from_slice(&[0u8; 1000]); // padding
        binary.extend_from_slice(&build_embedded_blob(schema));
        binary.extend_from_slice(&[0u8; 500]); // trailing data

        let extracted = extract_schemas(&binary).unwrap();
        assert_eq!(extracted.len(), 1);
        assert_eq!(extracted[0], schema);
    }

    /// A blob whose stored hash is corrupted is rejected as a false positive,
    /// so the overall result is `NotFound` rather than `HashMismatch`.
    #[test]
    fn hash_mismatch_detected() {
        let schema = "meta { id test, version 1.0.0 }\nschema { @ @unit }";
        let mut blob = build_embedded_blob(schema);

        // Corrupt the hash (bytes 16+8 = 24 onwards is the hash)
        let hash_start = MAGIC_V2.len() + 4 + 4;
        blob[hash_start] ^= 0xFF;

        assert!(matches!(
            extract_schemas(&blob),
            Err(ExtractError::NotFound) // No valid schemas found
        ));
    }
}