Skip to main content

hitbox_http/extractors/
body.rs

//! Body content extraction for cache keys.
//!
//! Provides the [`Body`] extractor with support for hashing, jq (JSON) queries,
//! and regular expression matching.
//!
//! # Extraction Modes
//!
//! - **Hash**: SHA256 hash of the entire body (truncated to 16 hex chars)
//! - **Jq**: Extract values from JSON bodies using jq expressions
//! - **Regex**: Extract values using regular expression capture groups
//!
//! # Performance
//!
//! All modes buffer the body into memory. For large bodies, consider
//! using hash mode to minimize cache key size.
16
17use std::collections::HashMap;
18use std::fmt::Debug;
19use std::rc::Rc;
20
21use async_trait::async_trait;
22use hitbox::{Extractor, KeyPart, KeyParts};
23use hyper::body::Body as HttpBody;
24use jaq_core::box_iter::box_once;
25use jaq_core::load::{Arena, File, Loader};
26use jaq_core::{Bind, Ctx, Exn, Filter, Native, RcIter};
27use jaq_json::Val;
28use regex::Regex;
29use serde_json::Value;
30
31pub use super::transform::Transform;
32use super::transform::{apply_hash, apply_transform_chain};
33use crate::CacheableHttpRequest;
34
35/// Body extraction mode for generating cache key parts.
36///
37/// # Variants
38///
39/// - [`Hash`](Self::Hash): SHA256 hash of entire body
40/// - [`Jq`](Self::Jq): Extract from JSON using jq expressions
41/// - [`Regex`](Self::Regex): Extract using regular expression captures
42#[derive(Debug, Clone)]
43pub enum BodyExtraction {
44    /// Hash the entire body using SHA256 (truncated to 16 hex chars).
45    Hash,
46    /// Extract values from JSON body using a jq expression.
47    Jq(JqExtraction),
48    /// Extract values using regular expression captures.
49    Regex(RegexExtraction),
50}
51
52/// A compiled jq expression for extracting values from JSON bodies.
53///
54/// Includes a custom `hash` function for hashing extracted values.
55///
56/// # Examples
57///
58/// ```
59/// use hitbox_http::extractors::body::JqExtraction;
60///
61/// // Extract user ID from JSON body
62/// let extraction = JqExtraction::compile(".user.id").unwrap();
63///
64/// // Extract and hash a sensitive field
65/// let extraction = JqExtraction::compile(".password | hash").unwrap();
66/// ```
67#[derive(Clone)]
68pub struct JqExtraction {
69    filter: Filter<Native<Val>>,
70}
71
72impl Debug for JqExtraction {
73    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
74        f.debug_struct("JqExtraction").finish_non_exhaustive()
75    }
76}
77
78/// Result type for jq functions.
79type JqResult = Result<Val, jaq_core::Error<Val>>;
80
81/// Custom jq functions for hitbox.
82fn custom_jq_funs() -> impl Iterator<Item = (&'static str, Box<[Bind]>, Native<Val>)> {
83    let v0: Box<[Bind]> = Box::new([]);
84
85    [
86        // hash: SHA256 hash of the string value (truncated to 16 hex chars)
87        (
88            "hash",
89            v0,
90            Native::new(|_, cv| {
91                let val = cv.1;
92                let result: JqResult = match &val {
93                    Val::Str(s) => {
94                        let hash = apply_hash(s);
95                        Ok(Val::Str(Rc::new(hash)))
96                    }
97                    Val::Int(n) => {
98                        let hash = apply_hash(&n.to_string());
99                        Ok(Val::Str(Rc::new(hash)))
100                    }
101                    Val::Float(f) => {
102                        let hash = apply_hash(&f.to_string());
103                        Ok(Val::Str(Rc::new(hash)))
104                    }
105                    Val::Bool(b) => {
106                        let hash = apply_hash(&b.to_string());
107                        Ok(Val::Str(Rc::new(hash)))
108                    }
109                    Val::Null => {
110                        let hash = apply_hash("null");
111                        Ok(Val::Str(Rc::new(hash)))
112                    }
113                    Val::Num(n) => {
114                        let hash = apply_hash(n);
115                        Ok(Val::Str(Rc::new(hash)))
116                    }
117                    Val::Arr(_) | Val::Obj(_) => {
118                        // For arrays and objects, serialize to JSON string first
119                        let json: Value = val.clone().into();
120                        let hash = apply_hash(&json.to_string());
121                        Ok(Val::Str(Rc::new(hash)))
122                    }
123                };
124                box_once(result.map_err(Exn::from))
125            }),
126        ),
127    ]
128    .into_iter()
129}
130
131impl JqExtraction {
132    /// Compiles a jq expression for extracting values from JSON bodies.
133    ///
134    /// The compiled filter can be reused across multiple requests.
135    ///
136    /// # Errors
137    ///
138    /// Returns `Err(String)` if the expression is invalid:
139    /// - Parse errors (syntax errors in the jq expression)
140    /// - Compile errors (undefined functions, type mismatches)
141    ///
142    /// The error message includes details about the parsing or compilation failure.
143    pub fn compile(expression: &str) -> Result<Self, String> {
144        let program = File {
145            code: expression,
146            path: (),
147        };
148        let loader = Loader::new(jaq_std::defs().chain(jaq_json::defs()));
149        let arena = Arena::default();
150        let modules = loader
151            .load(&arena, program)
152            .map_err(|e| format!("jq parse error: {:?}", e))?;
153        let filter = jaq_core::Compiler::default()
154            .with_funs(
155                jaq_std::funs()
156                    .chain(jaq_json::funs())
157                    .chain(custom_jq_funs()),
158            )
159            .compile(modules)
160            .map_err(|e| format!("jq compile error: {:?}", e))?;
161        Ok(Self { filter })
162    }
163
164    fn apply(&self, input: Value) -> Vec<Value> {
165        let inputs = RcIter::new(core::iter::empty());
166        let out = self.filter.run((Ctx::new([], &inputs), Val::from(input)));
167        out.filter_map(|r| r.ok()).map(|v| v.into()).collect()
168    }
169}
170
171/// Configuration for regex-based body extraction.
172///
173/// Extracts values using regular expression captures. Supports both named
174/// and unnamed capture groups, with optional transformations.
175///
176/// # Examples
177///
178/// ```
179/// use hitbox_http::extractors::body::{RegexExtraction, Transforms};
180/// use regex::Regex;
181///
182/// // Extract order ID from body
183/// let extraction = RegexExtraction {
184///     regex: Regex::new(r#""order_id":\s*"(\w+)""#).unwrap(),
185///     key: Some("order_id".to_string()),
186///     global: false,
187///     transforms: Transforms::None,
188/// };
189/// ```
190#[derive(Debug, Clone)]
191pub struct RegexExtraction {
192    /// The regular expression pattern.
193    pub regex: Regex,
194    /// Key name for unnamed captures. Defaults to `"body"` if `None`.
195    pub key: Option<String>,
196    /// If `true`, extract all matches; if `false`, extract first match only.
197    pub global: bool,
198    /// Transformations to apply to captured values.
199    pub transforms: Transforms,
200}
201
202/// Transformations to apply to extracted values.
203///
204/// Apply hash, lowercase, or other transforms to captured values
205/// before using them in cache keys.
206#[derive(Debug, Clone, Default)]
207pub enum Transforms {
208    /// No transformations applied.
209    #[default]
210    None,
211    /// Apply transforms to all captured values.
212    FullBody(Vec<Transform>),
213    /// Apply different transforms per capture group name.
214    PerKey(HashMap<String, Vec<Transform>>),
215}
216
217/// Extracts cache key parts from request bodies.
218///
219/// Supports hash, jq (JSON), and regex extraction modes.
220/// Chain with other extractors using the builder pattern.
221///
222/// # Caveats
223///
224/// The entire body is buffered into memory during extraction.
225/// The body is returned as [`BufferedBody::Complete`](crate::BufferedBody::Complete)
226/// after extraction.
227#[derive(Debug)]
228pub struct Body<E> {
229    inner: E,
230    extraction: BodyExtraction,
231}
232
233impl<S> Body<super::NeutralExtractor<S>> {
234    /// Creates a body extractor with the given extraction mode.
235    pub fn new(extraction: BodyExtraction) -> Self {
236        Self {
237            inner: super::NeutralExtractor::new(),
238            extraction,
239        }
240    }
241}
242
243/// Extension trait for adding body extraction to an extractor chain.
244///
245/// # For Callers
246///
247/// Chain this to extract cache key parts from request bodies. Choose an
248/// extraction mode based on your needs:
249/// - [`BodyExtraction::Hash`] for opaque body identification
250/// - [`BodyExtraction::Jq`] for JSON content extraction
251/// - [`BodyExtraction::Regex`] for pattern-based extraction
252///
253/// **Important**: Body extraction buffers the entire body into memory.
254/// The body is returned as [`BufferedBody::Complete`](crate::BufferedBody::Complete) after extraction.
255///
256/// # For Implementors
257///
258/// This trait is automatically implemented for all [`Extractor`]
259/// types. You don't need to implement it manually.
260pub trait BodyExtractor: Sized {
261    /// Adds body extraction with the specified mode.
262    fn body(self, extraction: BodyExtraction) -> Body<Self>;
263}
264
265impl<E> BodyExtractor for E
266where
267    E: hitbox::Extractor,
268{
269    fn body(self, extraction: BodyExtraction) -> Body<Self> {
270        Body {
271            inner: self,
272            extraction,
273        }
274    }
275}
276
277/// Extract key parts from jq result.
278fn extract_jq_parts(values: Vec<Value>) -> Vec<KeyPart> {
279    let mut parts = Vec::new();
280
281    for value in values {
282        match value {
283            Value::Object(map) => {
284                for (key, val) in map {
285                    let value_str = value_to_string(&val);
286                    parts.push(KeyPart::new(key, value_str));
287                }
288            }
289            Value::Array(arr) => {
290                for item in arr {
291                    match item {
292                        Value::Object(map) => {
293                            for (key, val) in map {
294                                let value_str = value_to_string(&val);
295                                parts.push(KeyPart::new(key, value_str));
296                            }
297                        }
298                        other => {
299                            let value_str = value_to_string(&other);
300                            parts.push(KeyPart::new("body", value_str));
301                        }
302                    }
303                }
304            }
305            other => {
306                let value_str = value_to_string(&other);
307                parts.push(KeyPart::new("body", value_str));
308            }
309        }
310    }
311
312    parts
313}
314
315/// Convert JSON value to string for cache key.
316fn value_to_string(value: &Value) -> Option<String> {
317    match value {
318        Value::Null => None,
319        Value::String(s) => Some(s.clone()),
320        Value::Number(n) => Some(n.to_string()),
321        Value::Bool(b) => Some(b.to_string()),
322        other => Some(other.to_string()),
323    }
324}
325
326/// Extract key parts from regex matches.
327fn extract_regex_parts(
328    body: &str,
329    regex: &Regex,
330    key: &Option<String>,
331    global: bool,
332    transforms: &Transforms,
333) -> Vec<KeyPart> {
334    let mut parts = Vec::new();
335    let capture_names: Vec<_> = regex.capture_names().flatten().collect();
336    let has_named_groups = !capture_names.is_empty();
337
338    let apply_transforms = |key_name: &str, value: String| -> String {
339        match transforms {
340            Transforms::None => value,
341            Transforms::FullBody(chain) => apply_transform_chain(value, chain),
342            Transforms::PerKey(map) => {
343                if let Some(chain) = map.get(key_name) {
344                    apply_transform_chain(value, chain)
345                } else {
346                    value
347                }
348            }
349        }
350    };
351
352    if global {
353        for caps in regex.captures_iter(body) {
354            if has_named_groups {
355                for name in &capture_names {
356                    if let Some(m) = caps.name(name) {
357                        let value = apply_transforms(name, m.as_str().to_string());
358                        parts.push(KeyPart::new(*name, Some(value)));
359                    }
360                }
361            } else if let Some(m) = caps.get(1).or_else(|| caps.get(0)) {
362                let key_name = key.as_deref().unwrap_or("body");
363                let value = apply_transforms(key_name, m.as_str().to_string());
364                parts.push(KeyPart::new(key_name, Some(value)));
365            }
366        }
367    } else if let Some(caps) = regex.captures(body) {
368        if has_named_groups {
369            for name in &capture_names {
370                if let Some(m) = caps.name(name) {
371                    let value = apply_transforms(name, m.as_str().to_string());
372                    parts.push(KeyPart::new(*name, Some(value)));
373                }
374            }
375        } else if let Some(m) = caps.get(1).or_else(|| caps.get(0)) {
376            let key_name = key.as_deref().unwrap_or("body");
377            let value = apply_transforms(key_name, m.as_str().to_string());
378            parts.push(KeyPart::new(key_name, Some(value)));
379        }
380    }
381
382    parts
383}
384
385#[async_trait]
386impl<ReqBody, E> Extractor for Body<E>
387where
388    ReqBody: HttpBody + Send + 'static,
389    ReqBody::Error: Send,
390    ReqBody::Data: Send,
391    E: Extractor<Subject = CacheableHttpRequest<ReqBody>> + Send + Sync,
392{
393    type Subject = E::Subject;
394
395    async fn get(&self, subject: Self::Subject) -> KeyParts<Self::Subject> {
396        let (parts, body) = subject.into_parts();
397
398        // Collect body
399        let payload = match body.collect().await {
400            Ok(bytes) => bytes,
401            Err(error_body) => {
402                let request = CacheableHttpRequest::from_request(http::Request::from_parts(
403                    parts, error_body,
404                ));
405                let mut key_parts = self.inner.get(request).await;
406                key_parts.push(KeyPart::new("body", None::<String>));
407                return key_parts;
408            }
409        };
410
411        let body_bytes = payload.to_vec();
412        let body_str = String::from_utf8_lossy(&body_bytes);
413
414        let extracted_parts = match &self.extraction {
415            BodyExtraction::Hash => {
416                let hash = apply_hash(&body_str);
417                vec![KeyPart::new("body", Some(hash))]
418            }
419            BodyExtraction::Jq(jq) => {
420                let json_value = serde_json::from_str(&body_str).unwrap_or(Value::Null);
421                let results = jq.apply(json_value);
422                extract_jq_parts(results)
423            }
424            BodyExtraction::Regex(regex_ext) => extract_regex_parts(
425                &body_str,
426                &regex_ext.regex,
427                &regex_ext.key,
428                regex_ext.global,
429                &regex_ext.transforms,
430            ),
431        };
432
433        let body = crate::BufferedBody::Complete(Some(payload));
434        let request = CacheableHttpRequest::from_request(http::Request::from_parts(parts, body));
435
436        let mut key_parts = self.inner.get(request).await;
437        for part in extracted_parts {
438            key_parts.push(part);
439        }
440        key_parts
441    }
442}