Skip to main content

bytecode_filter/
split.rs

1//! Zero-copy payload splitting.
2//!
3//! Splits a payload by a delimiter into parts, returning slices into the original data.
4
5use bytes::Bytes;
6use memchr::memmem::Finder;
7
/// Maximum number of parts a payload is split into.
///
/// Splitting stops searching for delimiters once `MAX_PARTS - 1` parts have
/// been recorded; whatever is left of the payload (including any further
/// delimiters) is stored as the final part.
pub const MAX_PARTS: usize = 32;
10
/// Zero-copy payload splitter.
///
/// Splits a `Bytes` payload by a delimiter and provides access to individual parts
/// as slices into the original payload. No allocations occur during splitting:
/// part boundaries live in a fixed-size array of `MAX_PARTS` offset pairs.
///
/// Supports both eager splitting (via `split` / `split_with_finder`) and
/// demand-driven lazy splitting (via `new_lazy` + `ensure`).
#[derive(Debug)]
pub struct PayloadParts {
    /// The original payload (keeps Bytes alive for zero-copy access).
    payload: Bytes,

    /// Offsets for each part: (start, end) pairs, `end` exclusive.
    /// Only the first `count` entries are meaningful.
    /// Uses u32 to save space - payloads > 4GB are not supported.
    offsets: [(u32, u32); MAX_PARTS],

    /// Number of parts found so far (never exceeds `MAX_PARTS`).
    count: usize,

    /// Current scan position for lazy splitting: the byte offset at which
    /// the next delimiter search starts.
    scan_cursor: usize,

    /// Whether the entire payload has been scanned. Eager constructors set
    /// this immediately; `ensure` sets it once the last part is recorded.
    finished: bool,
}
36
37impl PayloadParts {
38    /// Split a payload by the given delimiter.
39    ///
40    /// # Arguments
41    /// * `payload` - The payload to split
42    /// * `delimiter` - The delimiter bytes (e.g., `b";;;"`)
43    ///
44    /// # Returns
45    /// A `PayloadParts` instance with zero-copy access to each part.
46    ///
47    /// # Performance
48    /// - O(n) scan with SIMD-accelerated delimiter search
49    /// - Zero heap allocations
50    /// - Parts are slices into the original Bytes
51    #[inline]
52    pub fn split(payload: Bytes, delimiter: &[u8]) -> Self {
53        let mut offsets = [(0u32, 0u32); MAX_PARTS];
54        let mut count = 0;
55        let mut start = 0usize;
56
57        if delimiter.is_empty() {
58            offsets[0] = (0, payload.len() as u32);
59            return Self {
60                payload,
61                offsets,
62                count: 1,
63                scan_cursor: 0,
64                finished: true,
65            };
66        }
67
68        let finder = Finder::new(delimiter);
69        let payload_len = payload.len();
70        let data = payload.as_ref();
71
72        while count < MAX_PARTS - 1 {
73            if let Some(pos) = finder.find(&data[start..]) {
74                offsets[count] = (start as u32, (start + pos) as u32);
75                count += 1;
76                start += pos + delimiter.len();
77            } else {
78                break;
79            }
80        }
81
82        if start <= payload_len && count < MAX_PARTS {
83            offsets[count] = (start as u32, payload_len as u32);
84            count += 1;
85        }
86
87        Self {
88            payload,
89            offsets,
90            count,
91            scan_cursor: payload_len,
92            finished: true,
93        }
94    }
95
96    /// Split a payload using a pre-built `Finder`.
97    ///
98    /// This avoids rebuilding the SIMD searcher on every call.
99    ///
100    /// # Arguments
101    /// * `payload` - The payload to split
102    /// * `finder` - Pre-built delimiter finder
103    /// * `delim_len` - Length of the delimiter in bytes
104    #[inline]
105    pub fn split_with_finder(payload: Bytes, finder: &Finder<'_>, delim_len: usize) -> Self {
106        let mut offsets = [(0u32, 0u32); MAX_PARTS];
107        let mut count = 0;
108        let mut start = 0usize;
109
110        if delim_len == 0 {
111            offsets[0] = (0, payload.len() as u32);
112            return Self {
113                payload,
114                offsets,
115                count: 1,
116                scan_cursor: 0,
117                finished: true,
118            };
119        }
120
121        let payload_len = payload.len();
122        let data = payload.as_ref();
123
124        while count < MAX_PARTS - 1 {
125            if let Some(pos) = finder.find(&data[start..]) {
126                offsets[count] = (start as u32, (start + pos) as u32);
127                count += 1;
128                start += pos + delim_len;
129            } else {
130                break;
131            }
132        }
133
134        if start <= payload_len && count < MAX_PARTS {
135            offsets[count] = (start as u32, payload_len as u32);
136            count += 1;
137        }
138
139        Self {
140            payload,
141            offsets,
142            count,
143            scan_cursor: payload_len,
144            finished: true,
145        }
146    }
147
148    /// Create a lazy payload splitter that scans delimiters on demand.
149    ///
150    /// No scanning happens until `ensure()` is called.
151    #[inline]
152    pub fn new_lazy(payload: Bytes) -> Self {
153        Self {
154            payload,
155            offsets: [(0u32, 0u32); MAX_PARTS],
156            count: 0,
157            scan_cursor: 0,
158            finished: false,
159        }
160    }
161
162    /// Ensure that part `index` is available by scanning delimiters incrementally.
163    ///
164    /// After this call, `self.get(index)` returns the correct slice if the part
165    /// exists, or an empty slice if the payload has fewer parts.
166    #[inline]
167    pub fn ensure(&mut self, index: usize, finder: &Finder<'_>, delim_len: usize) {
168        // Already have enough parts, or payload fully scanned
169        if index < self.count || self.finished {
170            return;
171        }
172
173        let data = self.payload.as_ref();
174
175        while self.count <= index && !self.finished {
176            if self.count >= MAX_PARTS - 1 {
177                // Last slot — take remainder
178                if self.scan_cursor <= data.len() {
179                    self.offsets[self.count] =
180                        (self.scan_cursor as u32, data.len() as u32);
181                    self.count += 1;
182                }
183                self.finished = true;
184                return;
185            }
186
187            if let Some(pos) = finder.find(&data[self.scan_cursor..]) {
188                self.offsets[self.count] =
189                    (self.scan_cursor as u32, (self.scan_cursor + pos) as u32);
190                self.count += 1;
191                self.scan_cursor += pos + delim_len;
192            } else {
193                // No more delimiters — remainder is last part
194                if self.scan_cursor <= data.len() && self.count < MAX_PARTS {
195                    self.offsets[self.count] =
196                        (self.scan_cursor as u32, data.len() as u32);
197                    self.count += 1;
198                }
199                self.finished = true;
200                return;
201            }
202        }
203    }
204
    /// Get the number of parts found so far.
    ///
    /// For an eagerly split payload this is the total number of parts. For a
    /// lazy splitter it only counts parts already discovered by `ensure`, so
    /// it can grow on later `ensure` calls.
    #[inline]
    pub fn len(&self) -> usize {
        self.count
    }
210
211    /// Check if there are no parts.
212    #[inline]
213    pub fn is_empty(&self) -> bool {
214        self.count == 0
215    }
216
217    /// Get a part by index as a byte slice.
218    ///
219    /// Returns an empty slice if the index is out of bounds.
220    #[inline]
221    pub fn get(&self, index: usize) -> &[u8] {
222        if index < self.count {
223            let (start, end) = self.offsets[index];
224            &self.payload[start as usize..end as usize]
225        } else {
226            &[]
227        }
228    }
229
230    /// Get a part by index as a `Bytes` (zero-copy slice).
231    ///
232    /// Returns an empty `Bytes` if the index is out of bounds.
233    #[inline]
234    pub fn get_bytes(&self, index: usize) -> Bytes {
235        if index < self.count {
236            let (start, end) = self.offsets[index];
237            self.payload.slice(start as usize..end as usize)
238        } else {
239            Bytes::new()
240        }
241    }
242
    /// Get the original, unsplit payload that every part is a slice of.
    #[inline]
    pub fn payload(&self) -> &Bytes {
        &self.payload
    }
248
249    /// Iterate over all parts as byte slices.
250    #[inline]
251    pub fn iter(&self) -> impl Iterator<Item = &[u8]> {
252        (0..self.count).map(move |i| self.get(i))
253    }
254}
255
256/// Extract an HTTP header value from a headers blob.
257///
258/// Headers format: `Header-Name: value\r\nOther-Header: value2\r\n`
259///
260/// # Arguments
261/// * `headers` - The raw headers blob
262/// * `header_name` - The header name to search for (case-insensitive)
263///
264/// # Returns
265/// The header value if found, with leading/trailing whitespace trimmed.
266#[inline]
267pub fn extract_header_value<'a>(headers: &'a [u8], header_name: &[u8]) -> Option<&'a [u8]> {
268    if headers.is_empty() || header_name.is_empty() {
269        return None;
270    }
271
272    let mut line_start = 0;
273
274    while line_start < headers.len() {
275        // Find end of current line using SIMD-accelerated search
276        let line_end = match memchr::memchr2(b'\r', b'\n', &headers[line_start..]) {
277            Some(pos) => line_start + pos,
278            None => headers.len(),
279        };
280
281        let line = &headers[line_start..line_end];
282
283        // Check if line starts with header name followed by ':'
284        if line.len() > header_name.len() {
285            let potential_name = &line[..header_name.len()];
286            if potential_name.eq_ignore_ascii_case(header_name) && line[header_name.len()] == b':' {
287                // Found header, extract value
288                let mut val_start = header_name.len() + 1;
289
290                // Skip leading whitespace
291                while val_start < line.len()
292                    && (line[val_start] == b' ' || line[val_start] == b'\t')
293                {
294                    val_start += 1;
295                }
296
297                return Some(&line[val_start..]);
298            }
299        }
300
301        // Move to next line - skip \r\n or \n
302        line_start = line_end;
303        if line_start < headers.len() && headers[line_start] == b'\r' {
304            line_start += 1;
305        }
306        if line_start < headers.len() && headers[line_start] == b'\n' {
307            line_start += 1;
308        }
309
310        // If we didn't move, we're stuck - break to avoid infinite loop
311        if line_start == line_end {
312            break;
313        }
314    }
315
316    None
317}
318
#[cfg(test)]
mod tests {
    use super::*;

    /// Eagerly split `text` on `delim`.
    fn split_str(text: &'static str, delim: &[u8]) -> PayloadParts {
        PayloadParts::split(Bytes::from(text), delim)
    }

    /// Assert that `parts` holds exactly `expected`, in order.
    fn assert_parts(parts: &PayloadParts, expected: &[&str]) {
        assert_eq!(parts.len(), expected.len());
        for (i, want) in expected.iter().enumerate() {
            assert_eq!(parts.get(i), want.as_bytes());
        }
    }

    /// Assert that looking up `name` in `headers` yields `expected`.
    fn assert_header(headers: &[u8], name: &[u8], expected: Option<&str>) {
        assert_eq!(
            extract_header_value(headers, name),
            expected.map(str::as_bytes)
        );
    }

    #[test]
    fn test_split_basic() {
        let parts = split_str("a;;;b;;;c", b";;;");
        assert_parts(&parts, &["a", "b", "c"]);
    }

    #[test]
    fn test_split_no_delimiter() {
        let parts = split_str("hello world", b";;;");
        assert_parts(&parts, &["hello world"]);
    }

    #[test]
    fn test_split_empty_parts() {
        // Two adjacent delimiters produce an empty part between them.
        let parts = split_str("a;;;;;;b", b";;;");
        assert_parts(&parts, &["a", "", "b"]);
    }

    #[test]
    fn test_split_empty_payload() {
        let parts = split_str("", b";;;");
        assert_parts(&parts, &[""]);
    }

    #[test]
    fn test_split_trailing_delimiter() {
        // A trailing delimiter yields a final empty part.
        let parts = split_str("a;;;b;;;", b";;;");
        assert_parts(&parts, &["a", "b", ""]);
    }

    #[test]
    fn test_split_single_char_delimiter() {
        let parts = split_str("a|b|c|d", b"|");
        assert_parts(&parts, &["a", "b", "c", "d"]);
    }

    #[test]
    fn test_get_out_of_bounds() {
        let parts = split_str("a;;;b", b";;;");

        assert_eq!(parts.get(0), b"a");
        assert_eq!(parts.get(1), b"b");
        // Out of bounds returns empty
        for missing in [2usize, 100] {
            assert_eq!(parts.get(missing), b"");
        }
    }

    #[test]
    fn test_get_bytes_zero_copy() {
        let payload = Bytes::from("hello;;;world");
        let parts = PayloadParts::split(payload.clone(), b";;;");

        let first = parts.get_bytes(0);
        let second = parts.get_bytes(1);

        assert_eq!(&first[..], b"hello");
        assert_eq!(&second[..], b"world");

        // Verify it's truly zero-copy by checking pointer
        assert_eq!(first.as_ptr(), payload.as_ptr());
    }

    #[test]
    fn test_extract_header_basic() {
        let headers = b"Content-Type: application/json\r\nX-Custom: value\r\n";

        assert_header(headers, b"Content-Type", Some("application/json"));
        assert_header(headers, b"X-Custom", Some("value"));
    }

    #[test]
    fn test_extract_header_case_insensitive() {
        let headers = b"Content-Type: application/json\r\n";

        assert_header(headers, b"content-type", Some("application/json"));
        assert_header(headers, b"CONTENT-TYPE", Some("application/json"));
    }

    #[test]
    fn test_extract_header_with_whitespace() {
        // Leading whitespace is stripped, trailing whitespace is preserved.
        let headers = b"X-Custom:   value with spaces  \r\n";

        assert_header(headers, b"X-Custom", Some("value with spaces  "));
    }

    #[test]
    fn test_extract_header_not_found() {
        let headers = b"Content-Type: application/json\r\n";

        assert_header(headers, b"X-Missing", None);
    }

    #[test]
    fn test_extract_header_empty() {
        assert_header(b"", b"Content-Type", None);
        assert_header(b"Content-Type: value", b"", None);
    }

    #[test]
    fn test_extract_header_no_crlf() {
        // Headers without \r\n (just \n)
        let headers = b"Content-Type: value\nX-Other: other\n";

        assert_header(headers, b"Content-Type", Some("value"));
        assert_header(headers, b"X-Other", Some("other"));
    }
}