zarja_core/scanner/
mod.rs

1//! Binary scanning module for finding embedded protobuf descriptors.
2//!
3//! This module provides functionality to scan binary files for embedded
4//! `FileDescriptorProto` data and extract it for reconstruction.
5//!
6//! ## Algorithm Overview
7//!
8//! 1. Search for the `.proto` byte sequence in the binary
9//! 2. Backtrack to find the magic byte `0x0A` (field 1, wire type LEN)
10//! 3. Parse forward using protobuf wire format to find record boundaries
11//! 4. Extract the complete `FileDescriptorProto` bytes
12//!
13//! ## Extensibility
14//!
15//! The [`ScanStrategy`] trait allows custom scanning algorithms:
16//!
17//! ```no_run
18//! use zarja_core::scanner::{ScanStrategy, ScanResult};
19//! use zarja_core::Result;
20//!
21//! struct CustomScanner;
22//!
23//! impl ScanStrategy for CustomScanner {
24//!     fn scan(&self, data: &[u8]) -> Result<Vec<ScanResult>> {
25//!         // Custom scanning logic
26//!         Ok(vec![])
27//!     }
28//! }
29//! ```
30
31mod wire;
32
33use crate::error::{Error, Result};
34use std::ops::Range;
35use tracing::{debug, trace};
36
37pub use wire::{WireType, decode_varint, consume_field, consume_fields, MAX_VALID_NUMBER};
38
/// Byte pattern searched for in binaries: the `.proto` filename suffix
const PROTO_SUFFIX: &[u8] = b".proto";

/// Magic byte marking the start of a `FileDescriptorProto`.
/// It is the tag of field 1 (name) with wire type 2 (LEN), which evaluates to 0x0A.
const MAGIC_BYTE: u8 = (1 << 3) | 2;
45
/// One descriptor located by a scan
#[derive(Clone, Debug)]
pub struct ScanResult {
    /// Raw bytes of the `FileDescriptorProto`
    pub data: Vec<u8>,
    /// Byte range within the scanned input where the descriptor was found
    pub range: Range<usize>,
}

impl ScanResult {
    /// Builds a scan result from the descriptor bytes and their source range
    pub fn new(data: Vec<u8>, range: Range<usize>) -> Self {
        ScanResult { range, data }
    }

    /// Borrows the descriptor bytes as a slice
    pub fn as_bytes(&self) -> &[u8] {
        self.data.as_slice()
    }
}
66
/// Tunable limits applied by the scanner
#[derive(Clone, Debug)]
pub struct ScannerConfig {
    /// Upper bound on the number of descriptors returned (0 = unlimited)
    pub max_results: usize,
    /// Smallest record size accepted as a descriptor (filters noise)
    pub min_descriptor_size: usize,
    /// Largest record size accepted as a descriptor (filters garbage)
    pub max_descriptor_size: usize,
}

impl Default for ScannerConfig {
    /// Unlimited results, a 10-byte minimum and a 10 MB maximum record size
    fn default() -> Self {
        const TEN_MB: usize = 10 * 1024 * 1024;

        ScannerConfig {
            max_results: 0,
            min_descriptor_size: 10,
            max_descriptor_size: TEN_MB,
        }
    }
}

impl ScannerConfig {
    /// Starts a config from the default values; chain the setters below to adjust it
    pub fn new() -> Self {
        <Self as Default>::default()
    }

    /// Returns the config with the maximum number of results replaced
    pub fn max_results(self, max: usize) -> Self {
        Self {
            max_results: max,
            ..self
        }
    }

    /// Returns the config with the minimum descriptor size filter replaced
    pub fn min_descriptor_size(self, size: usize) -> Self {
        Self {
            min_descriptor_size: size,
            ..self
        }
    }

    /// Returns the config with the maximum descriptor size filter replaced
    pub fn max_descriptor_size(self, size: usize) -> Self {
        Self {
            max_descriptor_size: size,
            ..self
        }
    }
}
112
113/// Trait for implementing custom scanning strategies
114///
115/// This trait allows you to plug in different algorithms for finding
116/// protobuf descriptors in binary data.
117pub trait ScanStrategy: Send + Sync {
118    /// Scan the provided data for protobuf descriptors
119    fn scan(&self, data: &[u8]) -> Result<Vec<ScanResult>>;
120
121    /// Scan the data and return an iterator (for streaming large files)
122    fn scan_iter<'a>(&'a self, data: &'a [u8]) -> Box<dyn Iterator<Item = Result<ScanResult>> + 'a> {
123        // Default implementation: collect all results into a vec and iterate
124        match self.scan(data) {
125            Ok(results) => Box::new(results.into_iter().map(Ok)),
126            Err(e) => Box::new(std::iter::once(Err(e))),
127        }
128    }
129}
130
131/// Primary scanner for finding embedded protobuf descriptors
132#[derive(Debug, Clone)]
133pub struct Scanner {
134    config: ScannerConfig,
135}
136
137impl Default for Scanner {
138    fn default() -> Self {
139        Self::new()
140    }
141}
142
143impl Scanner {
144    /// Creates a new scanner with default configuration
145    pub fn new() -> Self {
146        Self {
147            config: ScannerConfig::default(),
148        }
149    }
150
151    /// Creates a new scanner with custom configuration
152    pub fn with_config(config: ScannerConfig) -> Self {
153        Self { config }
154    }
155
156    /// Consumes protobuf fields starting from the given position
157    /// Returns the number of bytes consumed for the complete record
158    fn consume_record(&self, data: &[u8], start: usize) -> Result<usize> {
159        let mut position = start;
160        let mut consumed_field_one = false;
161
162        loop {
163            if position >= data.len() {
164                // Reached end of data, return what we have
165                return Ok(position - start);
166            }
167
168            match consume_field(&data[position..]) {
169                Ok((field_number, length)) => {
170                    // If we see field 1 again, we've hit the next descriptor
171                    // (adjacent descriptors in binary)
172                    if field_number == 1 {
173                        if consumed_field_one {
174                            trace!(
175                                "Found adjacent descriptor at position {}",
176                                position
177                            );
178                            return Ok(position - start);
179                        }
180                        consumed_field_one = true;
181                    }
182
183                    position += length;
184
185                    // Safety check: don't exceed data bounds
186                    if position > data.len() {
187                        return Ok(data.len() - start);
188                    }
189                }
190                Err(_) => {
191                    // Hit invalid data, return what we have so far
192                    return Ok(position - start);
193                }
194            }
195        }
196    }
197
198    /// Find the start of a FileDescriptorProto by backtracking from a `.proto` match
199    fn find_record_start(&self, data: &[u8], proto_suffix_pos: usize) -> Option<usize> {
200        // We need to backtrack to find the 0x0A byte that starts the record
201        // The structure is: 0x0A [varint length] [filename bytes ending in .proto]
202
203        // The .proto suffix is at proto_suffix_pos, so the filename ends at proto_suffix_pos + 6
204        // We need to find where the filename starts
205
206        // Search backwards for the magic byte
207        let search_start = proto_suffix_pos.saturating_sub(256); // Filenames shouldn't be longer than 256 bytes
208
209        for i in (search_start..proto_suffix_pos).rev() {
210            if data[i] == MAGIC_BYTE {
211                // Verify this is a valid length-prefixed string
212                if i + 1 < data.len() {
213                    // Try to decode the length varint
214                    if let Ok((length, varint_len)) = decode_varint(&data[i + 1..]) {
215                        let expected_end = i + 1 + varint_len + length as usize;
216                        let actual_end = proto_suffix_pos + PROTO_SUFFIX.len();
217
218                        // Check if this length matches our .proto position
219                        if expected_end == actual_end {
220                            return Some(i);
221                        }
222
223                        // Edge case: filename is exactly 10 chars, 0x0A might be the length
224                        if length == 10 && i > 0 && data[i - 1] == MAGIC_BYTE {
225                            return Some(i - 1);
226                        }
227                    }
228                }
229            }
230        }
231
232        None
233    }
234}
235
236impl ScanStrategy for Scanner {
237    fn scan(&self, data: &[u8]) -> Result<Vec<ScanResult>> {
238        let mut results = Vec::new();
239        let mut position = 0;
240
241        debug!("Starting scan of {} bytes", data.len());
242
243        while position < data.len() {
244            // Find next occurrence of ".proto"
245            let remaining = &data[position..];
246            let proto_pos = find_subsequence(remaining, PROTO_SUFFIX);
247
248            let Some(relative_pos) = proto_pos else {
249                break;
250            };
251
252            let absolute_pos = position + relative_pos;
253            trace!("Found .proto suffix at position {}", absolute_pos);
254
255            // Try to find the record start
256            if let Some(record_start) = self.find_record_start(data, absolute_pos) {
257                trace!("Found record start at position {}", record_start);
258
259                // Consume the complete record
260                match self.consume_record(data, record_start) {
261                    Ok(record_len) => {
262                        // Apply size filters
263                        if record_len >= self.config.min_descriptor_size
264                            && record_len <= self.config.max_descriptor_size
265                        {
266                            let record_data = data[record_start..record_start + record_len].to_vec();
267                            let range = record_start..record_start + record_len;
268
269                            debug!(
270                                "Found descriptor at {}..{} ({} bytes)",
271                                range.start, range.end, record_len
272                            );
273
274                            results.push(ScanResult::new(record_data, range));
275
276                            // Check if we've hit the limit
277                            if self.config.max_results > 0
278                                && results.len() >= self.config.max_results
279                            {
280                                break;
281                            }
282
283                            // Skip past this record
284                            position = record_start + record_len;
285                            continue;
286                        }
287                    }
288                    Err(e) => {
289                        trace!("Failed to consume record: {}", e);
290                    }
291                }
292            }
293
294            // Move past this .proto occurrence and continue searching
295            position = absolute_pos + PROTO_SUFFIX.len();
296        }
297
298        debug!("Scan complete: found {} descriptors", results.len());
299        Ok(results)
300    }
301}
302
/// Returns the index of the first occurrence of `needle` within `haystack`.
///
/// Returns `None` when `needle` does not occur, including when it is longer
/// than `haystack`. An empty `needle` matches at index 0.
fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    // `slice::windows` panics when asked for zero-sized windows
    if needle.is_empty() {
        return Some(0);
    }

    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
309
310/// Scan a file for embedded protobuf descriptors
311///
312/// This is a convenience function that reads the file and scans it.
313pub fn scan_file(path: impl AsRef<std::path::Path>) -> Result<Vec<ScanResult>> {
314    let path = path.as_ref();
315    let data = std::fs::read(path).map_err(|e| Error::file_read(path, e))?;
316    Scanner::new().scan(&data)
317}
318
319/// Scan a file with custom configuration
320pub fn scan_file_with_config(
321    path: impl AsRef<std::path::Path>,
322    config: ScannerConfig,
323) -> Result<Vec<ScanResult>> {
324    let path = path.as_ref();
325    let data = std::fs::read(path).map_err(|e| Error::file_read(path, e))?;
326    Scanner::with_config(config).scan(&data)
327}
328
#[cfg(test)]
mod tests {
    use super::*;

    /// Present, trailing and absent needles in the same haystack
    #[test]
    fn test_find_subsequence() {
        let haystack = b"hello.proto.world";
        let cases: [(&[u8], Option<usize>); 3] = [
            (b".proto".as_slice(), Some(5)),
            (b"world".as_slice(), Some(12)),
            (b"missing".as_slice(), None),
        ];

        for (needle, expected) in cases {
            assert_eq!(find_subsequence(haystack, needle), expected);
        }
    }

    /// Each builder setter stores its argument in the matching field
    #[test]
    fn test_scanner_config_builder() {
        let ScannerConfig {
            max_results,
            min_descriptor_size,
            max_descriptor_size,
        } = ScannerConfig::new()
            .max_results(10)
            .min_descriptor_size(20)
            .max_descriptor_size(1000);

        assert_eq!(max_results, 10);
        assert_eq!(min_descriptor_size, 20);
        assert_eq!(max_descriptor_size, 1000);
    }

    /// Scanning zero bytes succeeds and finds nothing
    #[test]
    fn test_empty_input() {
        let found = Scanner::new().scan(&[]).unwrap();
        assert!(found.is_empty());
    }

    /// Input without any `.proto` occurrence yields no descriptors
    #[test]
    fn test_no_proto_suffix() {
        let input = b"this is just some random data without any protobuf content";
        let found = Scanner::new().scan(input).unwrap();
        assert!(found.is_empty());
    }
}