zarja_core/scanner/
mod.rs1mod wire;
32
33use crate::error::{Error, Result};
34use std::ops::Range;
35use tracing::{debug, trace};
36
37pub use wire::{WireType, decode_varint, consume_field, consume_fields, MAX_VALID_NUMBER};
38
/// Byte sequence the scanner searches for: the extension that ends a
/// `.proto` file name embedded in the input.
const PROTO_SUFFIX: &[u8] = b".proto";

/// Byte a candidate record must start with.
///
/// This is the protobuf tag for field 1 with the length-delimited wire type:
/// `(field_number << 3) | wire_type == (1 << 3) | 2 == 0x0A`.
const MAGIC_BYTE: u8 = (1 << 3) | 2;
45
/// One record located by a scan.
#[derive(Debug, Clone)]
pub struct ScanResult {
    /// Owned copy of the record's bytes.
    pub data: Vec<u8>,
    /// Byte offsets (`start..end`) of the record inside the scanned input.
    pub range: Range<usize>,
}

impl ScanResult {
    /// Bundles a record's bytes with the span they were taken from.
    pub fn new(data: Vec<u8>, range: Range<usize>) -> Self {
        ScanResult { data, range }
    }

    /// Borrows the record's bytes as a slice.
    pub fn as_bytes(&self) -> &[u8] {
        self.data.as_slice()
    }
}
66
/// Tunable limits for a [`Scanner`].
#[derive(Debug, Clone)]
pub struct ScannerConfig {
    /// Stop after this many results; `0` means no limit.
    pub max_results: usize,
    /// Smallest record length (in bytes, inclusive) that is reported.
    pub min_descriptor_size: usize,
    /// Largest record length (in bytes, inclusive) that is reported.
    pub max_descriptor_size: usize,
}

impl Default for ScannerConfig {
    /// No result limit, records between 10 bytes and 10 MiB.
    fn default() -> Self {
        const MIB: usize = 1024 * 1024;

        ScannerConfig {
            max_results: 0,
            min_descriptor_size: 10,
            max_descriptor_size: 10 * MIB,
        }
    }
}

impl ScannerConfig {
    /// Creates a configuration holding the default limits.
    pub fn new() -> Self {
        <Self as Default>::default()
    }

    /// Returns the configuration with `max_results` replaced by `max`.
    pub fn max_results(self, max: usize) -> Self {
        Self {
            max_results: max,
            ..self
        }
    }

    /// Returns the configuration with `min_descriptor_size` replaced by `size`.
    pub fn min_descriptor_size(self, size: usize) -> Self {
        Self {
            min_descriptor_size: size,
            ..self
        }
    }

    /// Returns the configuration with `max_descriptor_size` replaced by `size`.
    pub fn max_descriptor_size(self, size: usize) -> Self {
        Self {
            max_descriptor_size: size,
            ..self
        }
    }
}
112
113pub trait ScanStrategy: Send + Sync {
118 fn scan(&self, data: &[u8]) -> Result<Vec<ScanResult>>;
120
121 fn scan_iter<'a>(&'a self, data: &'a [u8]) -> Box<dyn Iterator<Item = Result<ScanResult>> + 'a> {
123 match self.scan(data) {
125 Ok(results) => Box::new(results.into_iter().map(Ok)),
126 Err(e) => Box::new(std::iter::once(Err(e))),
127 }
128 }
129}
130
131#[derive(Debug, Clone)]
133pub struct Scanner {
134 config: ScannerConfig,
135}
136
137impl Default for Scanner {
138 fn default() -> Self {
139 Self::new()
140 }
141}
142
143impl Scanner {
144 pub fn new() -> Self {
146 Self {
147 config: ScannerConfig::default(),
148 }
149 }
150
151 pub fn with_config(config: ScannerConfig) -> Self {
153 Self { config }
154 }
155
156 fn consume_record(&self, data: &[u8], start: usize) -> Result<usize> {
159 let mut position = start;
160 let mut consumed_field_one = false;
161
162 loop {
163 if position >= data.len() {
164 return Ok(position - start);
166 }
167
168 match consume_field(&data[position..]) {
169 Ok((field_number, length)) => {
170 if field_number == 1 {
173 if consumed_field_one {
174 trace!(
175 "Found adjacent descriptor at position {}",
176 position
177 );
178 return Ok(position - start);
179 }
180 consumed_field_one = true;
181 }
182
183 position += length;
184
185 if position > data.len() {
187 return Ok(data.len() - start);
188 }
189 }
190 Err(_) => {
191 return Ok(position - start);
193 }
194 }
195 }
196 }
197
198 fn find_record_start(&self, data: &[u8], proto_suffix_pos: usize) -> Option<usize> {
200 let search_start = proto_suffix_pos.saturating_sub(256); for i in (search_start..proto_suffix_pos).rev() {
210 if data[i] == MAGIC_BYTE {
211 if i + 1 < data.len() {
213 if let Ok((length, varint_len)) = decode_varint(&data[i + 1..]) {
215 let expected_end = i + 1 + varint_len + length as usize;
216 let actual_end = proto_suffix_pos + PROTO_SUFFIX.len();
217
218 if expected_end == actual_end {
220 return Some(i);
221 }
222
223 if length == 10 && i > 0 && data[i - 1] == MAGIC_BYTE {
225 return Some(i - 1);
226 }
227 }
228 }
229 }
230 }
231
232 None
233 }
234}
235
236impl ScanStrategy for Scanner {
237 fn scan(&self, data: &[u8]) -> Result<Vec<ScanResult>> {
238 let mut results = Vec::new();
239 let mut position = 0;
240
241 debug!("Starting scan of {} bytes", data.len());
242
243 while position < data.len() {
244 let remaining = &data[position..];
246 let proto_pos = find_subsequence(remaining, PROTO_SUFFIX);
247
248 let Some(relative_pos) = proto_pos else {
249 break;
250 };
251
252 let absolute_pos = position + relative_pos;
253 trace!("Found .proto suffix at position {}", absolute_pos);
254
255 if let Some(record_start) = self.find_record_start(data, absolute_pos) {
257 trace!("Found record start at position {}", record_start);
258
259 match self.consume_record(data, record_start) {
261 Ok(record_len) => {
262 if record_len >= self.config.min_descriptor_size
264 && record_len <= self.config.max_descriptor_size
265 {
266 let record_data = data[record_start..record_start + record_len].to_vec();
267 let range = record_start..record_start + record_len;
268
269 debug!(
270 "Found descriptor at {}..{} ({} bytes)",
271 range.start, range.end, record_len
272 );
273
274 results.push(ScanResult::new(record_data, range));
275
276 if self.config.max_results > 0
278 && results.len() >= self.config.max_results
279 {
280 break;
281 }
282
283 position = record_start + record_len;
285 continue;
286 }
287 }
288 Err(e) => {
289 trace!("Failed to consume record: {}", e);
290 }
291 }
292 }
293
294 position = absolute_pos + PROTO_SUFFIX.len();
296 }
297
298 debug!("Scan complete: found {} descriptors", results.len());
299 Ok(results)
300 }
301}
302
/// Returns the offset of the first occurrence of `needle` in `haystack`.
///
/// Returns `None` when `needle` does not occur. An empty `needle` matches at
/// offset `0`, as `str::find("")` does.
fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    // `windows(0)` panics, so the empty needle is answered up front.
    if needle.is_empty() {
        return Some(0);
    }

    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
309
310pub fn scan_file(path: impl AsRef<std::path::Path>) -> Result<Vec<ScanResult>> {
314 let path = path.as_ref();
315 let data = std::fs::read(path).map_err(|e| Error::file_read(path, e))?;
316 Scanner::new().scan(&data)
317}
318
319pub fn scan_file_with_config(
321 path: impl AsRef<std::path::Path>,
322 config: ScannerConfig,
323) -> Result<Vec<ScanResult>> {
324 let path = path.as_ref();
325 let data = std::fs::read(path).map_err(|e| Error::file_read(path, e))?;
326 Scanner::with_config(config).scan(&data)
327}
328
#[cfg(test)]
mod tests {
    use super::*;

    /// Needles at the middle and end of the haystack are located; an absent
    /// needle yields `None`.
    #[test]
    fn test_find_subsequence() {
        let haystack: &[u8] = b"hello.proto.world";
        let cases: [(&[u8], Option<usize>); 3] = [
            (b".proto", Some(5)),
            (b"world", Some(12)),
            (b"missing", None),
        ];

        for (needle, expected) in cases {
            assert_eq!(find_subsequence(haystack, needle), expected);
        }
    }

    /// Each builder method overrides exactly its own field.
    #[test]
    fn test_scanner_config_builder() {
        let ScannerConfig {
            max_results,
            min_descriptor_size,
            max_descriptor_size,
        } = ScannerConfig::new()
            .max_results(10)
            .min_descriptor_size(20)
            .max_descriptor_size(1000);

        assert_eq!(max_results, 10);
        assert_eq!(min_descriptor_size, 20);
        assert_eq!(max_descriptor_size, 1000);
    }

    /// Scanning an empty buffer succeeds and finds nothing.
    #[test]
    fn test_empty_input() {
        let found = Scanner::new().scan(&[]).unwrap();
        assert!(found.is_empty());
    }

    /// Input without a `.proto` suffix produces no results.
    #[test]
    fn test_no_proto_suffix() {
        let input = b"this is just some random data without any protobuf content";
        let found = Scanner::new().scan(input).unwrap();
        assert!(found.is_empty());
    }
}