hitbox-http 0.2.1

Cacheable HTTP Request and Response
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
//! Body content extraction for cache keys.
//!
//! Provides [`Body`] extractor with support for hashing, jq (JSON) queries,
//! and regular expression matching.
//!
//! # Extraction Modes
//!
//! - **Hash**: SHA256 hash of the entire body (truncated to 16 hex chars)
//! - **Jq**: Extract values from JSON bodies using jq expressions
//! - **Regex**: Extract values using regular expression capture groups
//!
//! # Performance
//!
//! All modes buffer the body into memory. For large bodies, consider
//! using hash mode to minimize cache key size.

use std::collections::HashMap;
use std::fmt::Debug;
use std::rc::Rc;

use async_trait::async_trait;
use hitbox::{Extractor, KeyPart, KeyParts};
use hyper::body::Body as HttpBody;
use jaq_core::box_iter::box_once;
use jaq_core::load::{Arena, File, Loader};
use jaq_core::{Bind, Ctx, Exn, Filter, Native, RcIter};
use jaq_json::Val;
use regex::Regex;
use serde_json::Value;
use tracing::warn;

pub use super::transform::Transform;
use super::transform::{apply_hash, apply_transform_chain};
use crate::CacheableHttpRequest;

/// Body extraction mode for generating cache key parts.
///
/// The mode decides how the buffered request body is turned into key parts.
/// Before extraction the body bytes are decoded as UTF-8 lossily, so every
/// mode operates on text.
///
/// # Variants
///
/// - [`Hash`](Self::Hash): SHA256 hash of entire body
/// - [`Jq`](Self::Jq): Extract from JSON using jq expressions
/// - [`Regex`](Self::Regex): Extract using regular expression captures
#[derive(Debug, Clone)]
pub enum BodyExtraction {
    /// Hash the entire body using SHA256 (truncated to 16 hex chars).
    ///
    /// Produces a single key part named `"body"`.
    Hash,
    /// Extract values from JSON body using a jq expression.
    ///
    /// If the body is not valid JSON, a warning is logged and extraction
    /// falls back to the same `"body"` hash part that [`Hash`](Self::Hash)
    /// produces.
    Jq(JqExtraction),
    /// Extract values using regular expression captures.
    ///
    /// When the pattern does not match, no body key parts are produced.
    Regex(RegexExtraction),
}

/// A compiled jq expression for extracting values from JSON bodies.
///
/// Includes a custom `hash` function for hashing extracted values. `hash`
/// accepts any JSON value: scalars are hashed from their textual form and
/// arrays/objects from their serialized JSON text.
///
/// Outputs of the expression that fail at runtime are dropped; only
/// successfully produced values contribute to the cache key.
///
/// # Examples
///
/// ```
/// use hitbox_http::extractors::body::JqExtraction;
///
/// // Extract user ID from JSON body
/// let extraction = JqExtraction::compile(".user.id").unwrap();
///
/// // Extract and hash a sensitive field
/// let extraction = JqExtraction::compile(".password | hash").unwrap();
/// ```
#[derive(Clone)]
pub struct JqExtraction {
    // Compiled filter, built once by `compile` and run for every request body.
    filter: Filter<Native<Val>>,
}

/// Opaque `Debug` output: only the type name is printed, the compiled filter
/// is elided.
impl Debug for JqExtraction {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut builder = f.debug_struct("JqExtraction");
        // No fields are listed on purpose; the output is marked as incomplete.
        builder.finish_non_exhaustive()
    }
}

/// Outcome of a native jq function: the produced value, or a jaq runtime error.
type JqResult = Result<Val, jaq_core::Error<Val>>;

/// Native jq functions that hitbox registers in addition to the standard ones.
///
/// Yields a single zero-argument filter, `hash`, which replaces its input
/// with the `apply_hash` digest of that input's textual form.
fn custom_jq_funs() -> impl Iterator<Item = (&'static str, Box<[Bind]>, Native<Val>)> {
    // `hash` takes no arguments, hence the empty binding list.
    let no_args: Box<[Bind]> = Box::new([]);

    // hash: SHA256 hash of the string value (truncated to 16 hex chars)
    let hash: Native<Val> = Native::new(|_, cv| {
        let input = cv.1;
        let digest = match &input {
            // `null` is hashed as the literal text "null".
            Val::Null => apply_hash("null"),
            Val::Bool(flag) => apply_hash(&flag.to_string()),
            Val::Int(int) => apply_hash(&int.to_string()),
            Val::Float(float) => apply_hash(&float.to_string()),
            Val::Num(literal) => apply_hash(literal),
            Val::Str(text) => apply_hash(text),
            Val::Arr(_) | Val::Obj(_) => {
                // Composite values are serialized to JSON text before hashing.
                let json: Value = input.clone().into();
                apply_hash(&json.to_string())
            }
        };
        let result: JqResult = Ok(Val::Str(Rc::new(digest)));
        box_once(result.map_err(Exn::from))
    });

    [("hash", no_args, hash)].into_iter()
}

impl JqExtraction {
    /// Compiles a jq expression for extracting values from JSON bodies.
    ///
    /// The expression is parsed with the jaq standard and JSON definitions,
    /// then compiled against the standard, JSON and hitbox-specific native
    /// functions. The compiled filter can be reused across multiple requests.
    ///
    /// # Errors
    ///
    /// Returns `Err(String)` if the expression is invalid:
    /// - Parse errors (message prefixed with `jq parse error:`)
    /// - Compile errors (message prefixed with `jq compile error:`)
    ///
    /// The error message includes the debug rendering of the underlying failure.
    pub fn compile(expression: &str) -> Result<Self, String> {
        let source = File {
            code: expression,
            path: (),
        };
        let loader = Loader::new(jaq_std::defs().chain(jaq_json::defs()));
        let arena = Arena::default();

        let modules = match loader.load(&arena, source) {
            Ok(modules) => modules,
            Err(e) => return Err(format!("jq parse error: {:?}", e)),
        };

        let natives = jaq_std::funs()
            .chain(jaq_json::funs())
            .chain(custom_jq_funs());
        let compiled = jaq_core::Compiler::default()
            .with_funs(natives)
            .compile(modules);

        match compiled {
            Ok(filter) => Ok(Self { filter }),
            Err(e) => Err(format!("jq compile error: {:?}", e)),
        }
    }

    /// Runs the compiled filter on `input` and returns every value it yields.
    ///
    /// Outputs that evaluate to an error are skipped rather than reported.
    fn apply(&self, input: Value) -> Vec<Value> {
        // The filter gets no additional inputs beyond `input` itself.
        let inputs = RcIter::new(core::iter::empty());
        let mut collected: Vec<Value> = Vec::new();
        for outcome in self.filter.run((Ctx::new([], &inputs), Val::from(input))) {
            if let Ok(val) = outcome {
                collected.push(val.into());
            }
        }
        collected
    }
}

/// Configuration for regex-based body extraction.
///
/// Extracts values using regular expression captures. Supports both named
/// and unnamed capture groups, with optional transformations.
///
/// If the pattern contains at least one named group, only the named groups
/// that participated in a match are emitted, each under its own name, and
/// `key` is not used. Otherwise a single value is emitted per match: the
/// first capture group when it participated, else the whole match.
///
/// # Examples
///
/// ```
/// use hitbox_http::extractors::body::{RegexExtraction, Transforms};
/// use regex::Regex;
///
/// // Extract order ID from body
/// let extraction = RegexExtraction {
///     regex: Regex::new(r#""order_id":\s*"(\w+)""#).unwrap(),
///     key: Some("order_id".to_string()),
///     global: false,
///     transforms: Transforms::None,
/// };
/// ```
#[derive(Debug, Clone)]
pub struct RegexExtraction {
    /// The regular expression pattern.
    pub regex: Regex,
    /// Key name for unnamed captures. Defaults to `"body"` if `None`.
    ///
    /// Only consulted when the pattern has no named capture groups.
    pub key: Option<String>,
    /// If `true`, extract all matches; if `false`, extract first match only.
    pub global: bool,
    /// Transformations to apply to captured values.
    pub transforms: Transforms,
}

/// Transformations to apply to extracted values.
///
/// Apply hash, lowercase, or other transforms to captured values
/// before using them in cache keys.
#[derive(Debug, Clone, Default)]
pub enum Transforms {
    /// No transformations applied.
    #[default]
    None,
    /// Apply transforms to all captured values.
    ///
    /// The same chain is used for every captured value, regardless of its key.
    FullBody(Vec<Transform>),
    /// Apply different transforms per capture group name.
    ///
    /// The map is looked up by the key the value is emitted under: the group
    /// name for named captures, or the configured key (`"body"` by default)
    /// for unnamed captures. Values whose key has no entry are left unchanged.
    PerKey(HashMap<String, Vec<Transform>>),
}

/// Extracts cache key parts from request bodies.
///
/// Supports hash, jq (JSON), and regex extraction modes.
/// Chain with other extractors using the builder pattern.
///
/// Key parts produced by the wrapped extractor come first; the parts derived
/// from the body are appended after them. If the body cannot be collected,
/// a single `"body"` part without a value is appended instead.
///
/// # Caveats
///
/// The entire body is buffered into memory during extraction.
/// The body is returned as [`BufferedBody::Complete`](crate::BufferedBody::Complete)
/// after extraction.
#[derive(Debug)]
pub struct Body<E> {
    // Wrapped extractor that runs on the request once the body has been read.
    inner: E,
    // How the buffered body is turned into key parts.
    extraction: BodyExtraction,
}

impl<S> Body<super::NeutralExtractor<S>> {
    /// Creates a stand-alone body extractor using the given extraction mode.
    ///
    /// The extractor wraps a fresh `NeutralExtractor`, making it a suitable
    /// starting point for an extractor chain.
    pub fn new(extraction: BodyExtraction) -> Self {
        let inner = super::NeutralExtractor::new();
        Self { inner, extraction }
    }
}

/// Extension trait for adding body extraction to an extractor chain.
///
/// # For Callers
///
/// Chain this to extract cache key parts from request bodies. Choose an
/// extraction mode based on your needs:
/// - [`BodyExtraction::Hash`] for opaque body identification
/// - [`BodyExtraction::Jq`] for JSON content extraction
/// - [`BodyExtraction::Regex`] for pattern-based extraction
///
/// **Important**: Body extraction buffers the entire body into memory.
/// The body is returned as [`BufferedBody::Complete`](crate::BufferedBody::Complete) after extraction.
///
/// # For Implementors
///
/// This trait is automatically implemented for all [`Extractor`]
/// types. You don't need to implement it manually.
pub trait BodyExtractor: Sized {
    /// Adds body extraction with the specified mode.
    ///
    /// Consumes `self` and returns a [`Body`] extractor that wraps it.
    fn body(self, extraction: BodyExtraction) -> Body<Self>;
}

/// Blanket implementation: any extractor can be extended with body extraction.
impl<E: hitbox::Extractor> BodyExtractor for E {
    fn body(self, extraction: BodyExtraction) -> Body<Self> {
        // The current extractor becomes the inner link of the new chain.
        let inner = self;
        Body { inner, extraction }
    }
}

/// Turns the values produced by a jq filter into cache key parts.
///
/// Each result is handled as a list of items: an array contributes its
/// elements, any other value contributes itself. An object item yields one
/// part per field, named after the field; every other item (including a
/// nested array) yields a part named `"body"`. Arrays are only unpacked one
/// level deep.
fn extract_jq_parts(values: Vec<Value>) -> Vec<KeyPart> {
    let mut parts: Vec<KeyPart> = Vec::new();

    for value in values {
        // Normalize to a list so arrays and single values share one code path.
        let items = match value {
            Value::Array(items) => items,
            single => vec![single],
        };

        for item in items {
            match item {
                Value::Object(fields) => {
                    parts.extend(
                        fields
                            .into_iter()
                            .map(|(name, field)| KeyPart::new(name, value_to_string(&field))),
                    );
                }
                plain => parts.push(KeyPart::new("body", value_to_string(&plain))),
            }
        }
    }

    parts
}

/// Renders a JSON value as the text used in a cache key.
///
/// `null` maps to `None`. Strings are used verbatim (without JSON quoting);
/// numbers, booleans, arrays and objects use their `to_string` rendering.
fn value_to_string(value: &Value) -> Option<String> {
    let rendered = match value {
        Value::Null => return None,
        Value::String(text) => text.clone(),
        Value::Number(number) => number.to_string(),
        Value::Bool(flag) => flag.to_string(),
        composite => composite.to_string(),
    };
    Some(rendered)
}

/// Builds cache key parts from the regex matches found in `body`.
///
/// - `global` selects between every non-overlapping match and the first
///   match only.
/// - If the pattern has named groups, each named group that participated in
///   a match yields a part under the group's name.
/// - Otherwise each match yields one part under `key` (or `"body"` when
///   `key` is `None`), taken from capture group 1 if it participated and
///   from the whole match if not.
/// - `transforms` is applied to every captured value before it is stored.
///
/// Returns an empty vector when the pattern does not match.
fn extract_regex_parts(
    body: &str,
    regex: &Regex,
    key: &Option<String>,
    global: bool,
    transforms: &Transforms,
) -> Vec<KeyPart> {
    let group_names: Vec<&str> = regex.capture_names().flatten().collect();
    let fallback_key = key.as_deref().unwrap_or("body");

    // Picks the transform chain that applies to `key_name`, if any, and runs it.
    let transformed = |key_name: &str, raw: &str| -> String {
        let value = raw.to_string();
        let chain = match transforms {
            Transforms::None => None,
            Transforms::FullBody(chain) => Some(chain),
            Transforms::PerKey(by_key) => by_key.get(key_name),
        };
        match chain {
            Some(chain) => apply_transform_chain(value, chain),
            None => value,
        }
    };

    // Non-global extraction only looks at the first match.
    let limit = if global { usize::MAX } else { 1 };
    let mut parts: Vec<KeyPart> = Vec::new();

    for caps in regex.captures_iter(body).take(limit) {
        if group_names.is_empty() {
            // Unnamed mode: prefer group 1, fall back to the whole match.
            if let Some(m) = caps.get(1).or_else(|| caps.get(0)) {
                let value = transformed(fallback_key, m.as_str());
                parts.push(KeyPart::new(fallback_key, Some(value)));
            }
        } else {
            for &name in &group_names {
                if let Some(m) = caps.name(name) {
                    let value = transformed(name, m.as_str());
                    parts.push(KeyPart::new(name, Some(value)));
                }
            }
        }
    }

    parts
}

#[async_trait]
impl<ReqBody, E> Extractor for Body<E>
where
    ReqBody: HttpBody + Send + 'static,
    ReqBody::Error: Send,
    ReqBody::Data: Send,
    E: Extractor<Subject = CacheableHttpRequest<ReqBody>> + Send + Sync,
{
    type Subject = E::Subject;

    async fn get(&self, subject: Self::Subject) -> KeyParts<Self::Subject> {
        let (parts, body) = subject.into_parts();

        // Collect body
        let payload = match body.collect().await {
            Ok(bytes) => bytes,
            Err(error_body) => {
                let request = CacheableHttpRequest::from_request(http::Request::from_parts(
                    parts, error_body,
                ));
                let mut key_parts = self.inner.get(request).await;
                key_parts.push(KeyPart::new("body", None::<String>));
                return key_parts;
            }
        };

        let body_bytes = payload.to_vec();
        let body_str = String::from_utf8_lossy(&body_bytes);

        let extracted_parts = match &self.extraction {
            BodyExtraction::Hash => {
                let hash = apply_hash(&body_str);
                vec![KeyPart::new("body", Some(hash))]
            }
            BodyExtraction::Jq(jq) => match serde_json::from_str(&body_str) {
                Ok(json_value) => {
                    let results = jq.apply(json_value);
                    extract_jq_parts(results)
                }
                Err(err) => {
                    warn!(%err, "Jq body extraction failed: invalid JSON, falling back to body hash");
                    let hash = apply_hash(&body_str);
                    vec![KeyPart::new("body", Some(hash))]
                }
            },
            BodyExtraction::Regex(regex_ext) => extract_regex_parts(
                &body_str,
                &regex_ext.regex,
                &regex_ext.key,
                regex_ext.global,
                &regex_ext.transforms,
            ),
        };

        let body = crate::BufferedBody::Complete(Some(payload));
        let request = CacheableHttpRequest::from_request(http::Request::from_parts(parts, body));

        let mut key_parts = self.inner.get(request).await;
        for part in extracted_parts {
            key_parts.push(part);
        }
        key_parts
    }
}