Skip to main content

hitbox_http/extractors/
body.rs

1//! Body content extraction for cache keys.
2//!
3//! Provides [`Body`] extractor with support for hashing, jq (JSON) queries,
4//! and regular expression matching.
5//!
6//! # Extraction Modes
7//!
8//! - **Hash**: SHA256 hash of the entire body (truncated to 16 hex chars)
9//! - **Jq**: Extract values from JSON bodies using jq expressions
10//! - **Regex**: Extract values using regular expression capture groups
11//!
12//! # Performance
13//!
14//! All modes buffer the body into memory. For large bodies, consider
15//! using hash mode to minimize cache key size.
16
17use std::collections::HashMap;
18use std::fmt::Debug;
19use std::rc::Rc;
20
21use async_trait::async_trait;
22use hitbox::{Extractor, KeyPart, KeyParts};
23use hyper::body::Body as HttpBody;
24use jaq_core::box_iter::box_once;
25use jaq_core::load::{Arena, File, Loader};
26use jaq_core::{Bind, Ctx, Exn, Filter, Native, RcIter};
27use jaq_json::Val;
28use regex::Regex;
29use serde_json::Value;
30use tracing::warn;
31
32pub use super::transform::Transform;
33use super::transform::{apply_hash, apply_transform_chain};
34use crate::CacheableHttpRequest;
35
36/// Body extraction mode for generating cache key parts.
37///
38/// # Variants
39///
40/// - [`Hash`](Self::Hash): SHA256 hash of entire body
41/// - [`Jq`](Self::Jq): Extract from JSON using jq expressions
42/// - [`Regex`](Self::Regex): Extract using regular expression captures
43#[derive(Debug, Clone)]
44pub enum BodyExtraction {
45    /// Hash the entire body using SHA256 (truncated to 16 hex chars).
46    Hash,
47    /// Extract values from JSON body using a jq expression.
48    Jq(JqExtraction),
49    /// Extract values using regular expression captures.
50    Regex(RegexExtraction),
51}
52
53/// A compiled jq expression for extracting values from JSON bodies.
54///
55/// Includes a custom `hash` function for hashing extracted values.
56///
57/// # Examples
58///
59/// ```
60/// use hitbox_http::extractors::body::JqExtraction;
61///
62/// // Extract user ID from JSON body
63/// let extraction = JqExtraction::compile(".user.id").unwrap();
64///
65/// // Extract and hash a sensitive field
66/// let extraction = JqExtraction::compile(".password | hash").unwrap();
67/// ```
68#[derive(Clone)]
69pub struct JqExtraction {
70    filter: Filter<Native<Val>>,
71}
72
73impl Debug for JqExtraction {
74    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
75        f.debug_struct("JqExtraction").finish_non_exhaustive()
76    }
77}
78
79/// Result type for jq functions.
80type JqResult = Result<Val, jaq_core::Error<Val>>;
81
82/// Custom jq functions for hitbox.
83fn custom_jq_funs() -> impl Iterator<Item = (&'static str, Box<[Bind]>, Native<Val>)> {
84    let v0: Box<[Bind]> = Box::new([]);
85
86    [
87        // hash: SHA256 hash of the string value (truncated to 16 hex chars)
88        (
89            "hash",
90            v0,
91            Native::new(|_, cv| {
92                let val = cv.1;
93                let result: JqResult = match &val {
94                    Val::Str(s) => {
95                        let hash = apply_hash(s);
96                        Ok(Val::Str(Rc::new(hash)))
97                    }
98                    Val::Int(n) => {
99                        let hash = apply_hash(&n.to_string());
100                        Ok(Val::Str(Rc::new(hash)))
101                    }
102                    Val::Float(f) => {
103                        let hash = apply_hash(&f.to_string());
104                        Ok(Val::Str(Rc::new(hash)))
105                    }
106                    Val::Bool(b) => {
107                        let hash = apply_hash(&b.to_string());
108                        Ok(Val::Str(Rc::new(hash)))
109                    }
110                    Val::Null => {
111                        let hash = apply_hash("null");
112                        Ok(Val::Str(Rc::new(hash)))
113                    }
114                    Val::Num(n) => {
115                        let hash = apply_hash(n);
116                        Ok(Val::Str(Rc::new(hash)))
117                    }
118                    Val::Arr(_) | Val::Obj(_) => {
119                        // For arrays and objects, serialize to JSON string first
120                        let json: Value = val.clone().into();
121                        let hash = apply_hash(&json.to_string());
122                        Ok(Val::Str(Rc::new(hash)))
123                    }
124                };
125                box_once(result.map_err(Exn::from))
126            }),
127        ),
128    ]
129    .into_iter()
130}
131
132impl JqExtraction {
133    /// Compiles a jq expression for extracting values from JSON bodies.
134    ///
135    /// The compiled filter can be reused across multiple requests.
136    ///
137    /// # Errors
138    ///
139    /// Returns `Err(String)` if the expression is invalid:
140    /// - Parse errors (syntax errors in the jq expression)
141    /// - Compile errors (undefined functions, type mismatches)
142    ///
143    /// The error message includes details about the parsing or compilation failure.
144    pub fn compile(expression: &str) -> Result<Self, String> {
145        let program = File {
146            code: expression,
147            path: (),
148        };
149        let loader = Loader::new(jaq_std::defs().chain(jaq_json::defs()));
150        let arena = Arena::default();
151        let modules = loader
152            .load(&arena, program)
153            .map_err(|e| format!("jq parse error: {:?}", e))?;
154        let filter = jaq_core::Compiler::default()
155            .with_funs(
156                jaq_std::funs()
157                    .chain(jaq_json::funs())
158                    .chain(custom_jq_funs()),
159            )
160            .compile(modules)
161            .map_err(|e| format!("jq compile error: {:?}", e))?;
162        Ok(Self { filter })
163    }
164
165    fn apply(&self, input: Value) -> Vec<Value> {
166        let inputs = RcIter::new(core::iter::empty());
167        let out = self.filter.run((Ctx::new([], &inputs), Val::from(input)));
168        out.filter_map(|r| r.ok()).map(|v| v.into()).collect()
169    }
170}
171
172/// Configuration for regex-based body extraction.
173///
174/// Extracts values using regular expression captures. Supports both named
175/// and unnamed capture groups, with optional transformations.
176///
177/// # Examples
178///
179/// ```
180/// use hitbox_http::extractors::body::{RegexExtraction, Transforms};
181/// use regex::Regex;
182///
183/// // Extract order ID from body
184/// let extraction = RegexExtraction {
185///     regex: Regex::new(r#""order_id":\s*"(\w+)""#).unwrap(),
186///     key: Some("order_id".to_string()),
187///     global: false,
188///     transforms: Transforms::None,
189/// };
190/// ```
191#[derive(Debug, Clone)]
192pub struct RegexExtraction {
193    /// The regular expression pattern.
194    pub regex: Regex,
195    /// Key name for unnamed captures. Defaults to `"body"` if `None`.
196    pub key: Option<String>,
197    /// If `true`, extract all matches; if `false`, extract first match only.
198    pub global: bool,
199    /// Transformations to apply to captured values.
200    pub transforms: Transforms,
201}
202
203/// Transformations to apply to extracted values.
204///
205/// Apply hash, lowercase, or other transforms to captured values
206/// before using them in cache keys.
207#[derive(Debug, Clone, Default)]
208pub enum Transforms {
209    /// No transformations applied.
210    #[default]
211    None,
212    /// Apply transforms to all captured values.
213    FullBody(Vec<Transform>),
214    /// Apply different transforms per capture group name.
215    PerKey(HashMap<String, Vec<Transform>>),
216}
217
218/// Extracts cache key parts from request bodies.
219///
220/// Supports hash, jq (JSON), and regex extraction modes.
221/// Chain with other extractors using the builder pattern.
222///
223/// # Caveats
224///
225/// The entire body is buffered into memory during extraction.
226/// The body is returned as [`BufferedBody::Complete`](crate::BufferedBody::Complete)
227/// after extraction.
228#[derive(Debug)]
229pub struct Body<E> {
230    inner: E,
231    extraction: BodyExtraction,
232}
233
234impl<S> Body<super::NeutralExtractor<S>> {
235    /// Creates a body extractor with the given extraction mode.
236    pub fn new(extraction: BodyExtraction) -> Self {
237        Self {
238            inner: super::NeutralExtractor::new(),
239            extraction,
240        }
241    }
242}
243
244/// Extension trait for adding body extraction to an extractor chain.
245///
246/// # For Callers
247///
248/// Chain this to extract cache key parts from request bodies. Choose an
249/// extraction mode based on your needs:
250/// - [`BodyExtraction::Hash`] for opaque body identification
251/// - [`BodyExtraction::Jq`] for JSON content extraction
252/// - [`BodyExtraction::Regex`] for pattern-based extraction
253///
254/// **Important**: Body extraction buffers the entire body into memory.
255/// The body is returned as [`BufferedBody::Complete`](crate::BufferedBody::Complete) after extraction.
256///
257/// # For Implementors
258///
259/// This trait is automatically implemented for all [`Extractor`]
260/// types. You don't need to implement it manually.
261pub trait BodyExtractor: Sized {
262    /// Adds body extraction with the specified mode.
263    fn body(self, extraction: BodyExtraction) -> Body<Self>;
264}
265
266impl<E> BodyExtractor for E
267where
268    E: hitbox::Extractor,
269{
270    fn body(self, extraction: BodyExtraction) -> Body<Self> {
271        Body {
272            inner: self,
273            extraction,
274        }
275    }
276}
277
278/// Extract key parts from jq result.
279fn extract_jq_parts(values: Vec<Value>) -> Vec<KeyPart> {
280    let mut parts = Vec::new();
281
282    for value in values {
283        match value {
284            Value::Object(map) => {
285                for (key, val) in map {
286                    let value_str = value_to_string(&val);
287                    parts.push(KeyPart::new(key, value_str));
288                }
289            }
290            Value::Array(arr) => {
291                for item in arr {
292                    match item {
293                        Value::Object(map) => {
294                            for (key, val) in map {
295                                let value_str = value_to_string(&val);
296                                parts.push(KeyPart::new(key, value_str));
297                            }
298                        }
299                        other => {
300                            let value_str = value_to_string(&other);
301                            parts.push(KeyPart::new("body", value_str));
302                        }
303                    }
304                }
305            }
306            other => {
307                let value_str = value_to_string(&other);
308                parts.push(KeyPart::new("body", value_str));
309            }
310        }
311    }
312
313    parts
314}
315
316/// Convert JSON value to string for cache key.
317fn value_to_string(value: &Value) -> Option<String> {
318    match value {
319        Value::Null => None,
320        Value::String(s) => Some(s.clone()),
321        Value::Number(n) => Some(n.to_string()),
322        Value::Bool(b) => Some(b.to_string()),
323        other => Some(other.to_string()),
324    }
325}
326
327/// Extract key parts from regex matches.
328fn extract_regex_parts(
329    body: &str,
330    regex: &Regex,
331    key: &Option<String>,
332    global: bool,
333    transforms: &Transforms,
334) -> Vec<KeyPart> {
335    let mut parts = Vec::new();
336    let capture_names: Vec<_> = regex.capture_names().flatten().collect();
337    let has_named_groups = !capture_names.is_empty();
338
339    let apply_transforms = |key_name: &str, value: String| -> String {
340        match transforms {
341            Transforms::None => value,
342            Transforms::FullBody(chain) => apply_transform_chain(value, chain),
343            Transforms::PerKey(map) => {
344                if let Some(chain) = map.get(key_name) {
345                    apply_transform_chain(value, chain)
346                } else {
347                    value
348                }
349            }
350        }
351    };
352
353    if global {
354        for caps in regex.captures_iter(body) {
355            if has_named_groups {
356                for name in &capture_names {
357                    if let Some(m) = caps.name(name) {
358                        let value = apply_transforms(name, m.as_str().to_string());
359                        parts.push(KeyPart::new(*name, Some(value)));
360                    }
361                }
362            } else if let Some(m) = caps.get(1).or_else(|| caps.get(0)) {
363                let key_name = key.as_deref().unwrap_or("body");
364                let value = apply_transforms(key_name, m.as_str().to_string());
365                parts.push(KeyPart::new(key_name, Some(value)));
366            }
367        }
368    } else if let Some(caps) = regex.captures(body) {
369        if has_named_groups {
370            for name in &capture_names {
371                if let Some(m) = caps.name(name) {
372                    let value = apply_transforms(name, m.as_str().to_string());
373                    parts.push(KeyPart::new(*name, Some(value)));
374                }
375            }
376        } else if let Some(m) = caps.get(1).or_else(|| caps.get(0)) {
377            let key_name = key.as_deref().unwrap_or("body");
378            let value = apply_transforms(key_name, m.as_str().to_string());
379            parts.push(KeyPart::new(key_name, Some(value)));
380        }
381    }
382
383    parts
384}
385
386#[async_trait]
387impl<ReqBody, E> Extractor for Body<E>
388where
389    ReqBody: HttpBody + Send + 'static,
390    ReqBody::Error: Send,
391    ReqBody::Data: Send,
392    E: Extractor<Subject = CacheableHttpRequest<ReqBody>> + Send + Sync,
393{
394    type Subject = E::Subject;
395
396    async fn get(&self, subject: Self::Subject) -> KeyParts<Self::Subject> {
397        let (parts, body) = subject.into_parts();
398
399        // Collect body
400        let payload = match body.collect().await {
401            Ok(bytes) => bytes,
402            Err(error_body) => {
403                let request = CacheableHttpRequest::from_request(http::Request::from_parts(
404                    parts, error_body,
405                ));
406                let mut key_parts = self.inner.get(request).await;
407                key_parts.push(KeyPart::new("body", None::<String>));
408                return key_parts;
409            }
410        };
411
412        let body_bytes = payload.to_vec();
413        let body_str = String::from_utf8_lossy(&body_bytes);
414
415        let extracted_parts = match &self.extraction {
416            BodyExtraction::Hash => {
417                let hash = apply_hash(&body_str);
418                vec![KeyPart::new("body", Some(hash))]
419            }
420            BodyExtraction::Jq(jq) => match serde_json::from_str(&body_str) {
421                Ok(json_value) => {
422                    let results = jq.apply(json_value);
423                    extract_jq_parts(results)
424                }
425                Err(err) => {
426                    warn!(%err, "Jq body extraction failed: invalid JSON, falling back to body hash");
427                    let hash = apply_hash(&body_str);
428                    vec![KeyPart::new("body", Some(hash))]
429                }
430            },
431            BodyExtraction::Regex(regex_ext) => extract_regex_parts(
432                &body_str,
433                &regex_ext.regex,
434                &regex_ext.key,
435                regex_ext.global,
436                &regex_ext.transforms,
437            ),
438        };
439
440        let body = crate::BufferedBody::Complete(Some(payload));
441        let request = CacheableHttpRequest::from_request(http::Request::from_parts(parts, body));
442
443        let mut key_parts = self.inner.get(request).await;
444        for part in extracted_parts {
445            key_parts.push(part);
446        }
447        key_parts
448    }
449}