Skip to main content

sanitize_engine/processor/
log_line.rs

//! JSON-in-log-line processor (NDJSON / structured log output).
//!
//! Processes files where each line may contain an embedded JSON object
//! (e.g. structured logging output from `slog`, `tracing-json`, `bunyan`,
//! `logrus`, Datadog, etc.).
//!
//! # Behaviour
//!
//! Each line is processed individually:
//!
//! 1. Scan for the first `{` on the line.
//! 2. If found, attempt to locate the matching `}` using brace-counting.
//! 3. Parse the extracted `{...}` span as JSON.
//! 4. If parsing succeeds, pass the JSON object through the full JSON
//!    processor with all field rules from the profile.
//! 5. Reconstruct: `line_prefix` + sanitised JSON + `line_suffix`.
//! 6. If parsing fails or no JSON span is found, the line is emitted
//!    unchanged. The outer double-pass streaming scan will still catch
//!    plain-text secrets on those lines.
//!
//! # Format Detection
//!
//! This processor is **not** auto-detected from the `.log` extension.  
//! It must be requested explicitly with `--format log`.  
//! This avoids misprocessing plain-text log files that happen to contain
//! individual `{` characters.
//!
//! # Field Rules
//!
//! Use `"*"` to sanitize every string field inside the JSON payloads,
//! or specific dot-separated paths (e.g. `"user.token"`) to be selective.
32
33use crate::error::Result;
34use crate::processor::{FileTypeProfile, Processor};
35use crate::processor::json_proc::JsonProcessor;
36use crate::store::MappingStore;
37
/// Upper bound, in bytes, on the input accepted for log-line processing
/// (256 MiB).
const MAX_LOG_INPUT_SIZE: usize = 256 << 20;
40
41/// Structured processor for NDJSON / structured-log files.
42pub struct LogLineProcessor {
43    json_proc: JsonProcessor,
44}
45
46impl LogLineProcessor {
47    pub fn new() -> Self {
48        Self {
49            json_proc: JsonProcessor,
50        }
51    }
52}
53
54impl Default for LogLineProcessor {
55    fn default() -> Self {
56        Self::new()
57    }
58}
59
60impl Processor for LogLineProcessor {
61    fn name(&self) -> &'static str {
62        "log"
63    }
64
65    fn can_handle(&self, _content: &[u8], profile: &FileTypeProfile) -> bool {
66        profile.processor == "log"
67    }
68
69    fn process(
70        &self,
71        content: &[u8],
72        profile: &FileTypeProfile,
73        store: &MappingStore,
74    ) -> Result<Vec<u8>> {
75        if content.len() > MAX_LOG_INPUT_SIZE {
76            use crate::error::SanitizeError;
77            return Err(SanitizeError::InputTooLarge {
78                size: content.len(),
79                limit: MAX_LOG_INPUT_SIZE,
80            });
81        }
82
83        let text = String::from_utf8_lossy(content);
84        let mut output = String::with_capacity(text.len());
85
86        // Split on '\n'. `split('\n')` on a '\n'-terminated string produces a
87        // trailing empty element — skip it so we don't emit an extra blank line.
88        let raw_lines: Vec<&str> = text.split('\n').collect();
89        let lines = if raw_lines.last().map_or(false, |l| l.is_empty()) {
90            &raw_lines[..raw_lines.len() - 1]
91        } else {
92            &raw_lines[..]
93        };
94
95        for line in lines {
96            let processed_line = process_log_line(line, profile, store, &self.json_proc);
97            output.push_str(&processed_line);
98            output.push('\n');
99        }
100
101        // Restore the absence of a trailing newline if the original had none.
102        if !text.ends_with('\n') && output.ends_with('\n') {
103            output.pop();
104        }
105
106        Ok(output.into_bytes())
107    }
108}
109
110/// Process a single log line: find embedded JSON, sanitise it, recombine.
111/// Falls back to returning the line unchanged on any error.
112fn process_log_line(
113    line: &str,
114    profile: &FileTypeProfile,
115    store: &MappingStore,
116    json_proc: &JsonProcessor,
117) -> String {
118    // Locate the first `{` in the line.
119    let Some(json_start) = line.find('{') else {
120        return line.to_string();
121    };
122
123    // Find the matching closing `}` by counting brace depth.
124    let json_end = match find_matching_brace(&line[json_start..]) {
125        Some(relative_end) => json_start + relative_end,
126        None => return line.to_string(),
127    };
128
129    let json_span = &line[json_start..=json_end];
130    let prefix = &line[..json_start];
131    let suffix = &line[json_end + 1..];
132
133    // Build a compact-JSON profile so the output stays on one line.
134    let compact_profile = FileTypeProfile::new(
135        "json",
136        profile.fields.clone(),
137    )
138    .with_option("compact", "true");
139
140    // Try to sanitise the JSON span.
141    match json_proc.process(json_span.as_bytes(), &compact_profile, store) {
142        Ok(sanitised_bytes) => {
143            let sanitised = String::from_utf8_lossy(&sanitised_bytes);
144            format!("{}{}{}", prefix, sanitised, suffix)
145        }
146        // If JSON parsing fails (e.g. the `{` is part of a template string),
147        // emit the line unchanged. The streaming scanner pass handles the rest.
148        Err(_) => line.to_string(),
149    }
150}
151
/// Returns the byte index of the `}` that closes the `{` at the start of `s`.
///
/// Braces inside double-quoted strings are ignored, and a backslash inside a
/// string causes the following byte to be skipped, so `\"` does not end the
/// string. Yields `None` when `s` does not begin with `{` or when the opening
/// brace is never closed.
fn find_matching_brace(s: &str) -> Option<usize> {
    // The opening brace is consumed here, so the scan starts at depth 1 and
    // every offset below is relative to the byte after it.
    let rest = s.strip_prefix('{')?;

    let mut open_braces: usize = 1;
    let mut inside_string = false;
    let mut bytes = rest.bytes().enumerate();

    while let Some((offset, byte)) = bytes.next() {
        if inside_string {
            match byte {
                // Skip whatever byte follows the backslash.
                b'\\' => {
                    bytes.next();
                }
                b'"' => inside_string = false,
                _ => {}
            }
            continue;
        }

        match byte {
            b'"' => inside_string = true,
            b'{' => open_braces += 1,
            b'}' => {
                open_braces -= 1;
                if open_braces == 0 {
                    return Some(offset + 1);
                }
            }
            _ => {}
        }
    }

    None
}
183
#[cfg(test)]
mod tests {
    use super::*;
    use crate::generator::HmacGenerator;
    use crate::processor::profile::FieldRule;
    use std::sync::Arc;

    /// Builds a mapping store around an HMAC generator created from a fixed
    /// 32-byte value.
    fn make_store() -> MappingStore {
        let generator = Arc::new(HmacGenerator::new([42u8; 32]));
        MappingStore::new(generator, None)
    }

    /// Profile for the `log` processor with a single `"*"` field rule.
    fn wildcard_profile() -> FileTypeProfile {
        FileTypeProfile::new("log", vec![FieldRule::new("*")])
    }

    /// Runs `input` through a fresh processor and store using the wildcard
    /// profile, returning the raw output bytes.
    fn sanitise(input: &[u8]) -> Vec<u8> {
        let store = make_store();
        LogLineProcessor::new()
            .process(input, &wildcard_profile(), &store)
            .unwrap()
    }

    /// Same as `sanitise`, but decodes the output as UTF-8 text.
    fn sanitise_to_string(input: &[u8]) -> String {
        String::from_utf8(sanitise(input)).unwrap()
    }

    #[test]
    fn pure_ndjson_line() {
        let text =
            sanitise_to_string(b"{\"level\":\"info\",\"token\":\"abc123\",\"msg\":\"ok\"}\n");
        assert!(!text.contains("abc123"));
        // The JSON key must survive sanitisation.
        assert!(text.contains("\"level\""));
    }

    #[test]
    fn log_prefix_before_json() {
        let text = sanitise_to_string(
            b"2024-01-01T00:00:00Z INFO {\"token\":\"secret\",\"user\":\"bob\"}\n",
        );
        // Text ahead of the JSON span is kept verbatim.
        assert!(text.contains("2024-01-01T00:00:00Z INFO "));
        // Values inside the JSON span are replaced.
        assert!(!text.contains("secret"));
        assert!(!text.contains("bob"));
    }

    #[test]
    fn non_json_line_preserved() {
        let input: &[u8] = b"plain text log line with no json\n";
        assert_eq!(sanitise(input), input);
    }

    #[test]
    fn malformed_json_line_preserved() {
        // `{name}` has balanced braces but is not JSON, so the line must be
        // passed through untouched.
        let input: &[u8] = b"ERROR: template {name} not found\n";
        assert_eq!(sanitise(input), input);
    }

    #[test]
    fn multi_line_ndjson() {
        let text = sanitise_to_string(b"{\"token\":\"abc\"}\n{\"key\":\"xyz\"}\n");
        assert!(!text.contains("abc"));
        assert!(!text.contains("xyz"));
        assert_eq!(text.lines().count(), 2);
    }

    #[test]
    fn find_matching_brace_simple() {
        assert_eq!(find_matching_brace(r#"{"a":"b"}"#), Some(8));
    }

    #[test]
    fn find_matching_brace_nested() {
        assert_eq!(find_matching_brace(r#"{"a":{"b":"c"}}"#), Some(14));
    }

    #[test]
    fn find_matching_brace_brace_in_string() {
        assert_eq!(find_matching_brace(r#"{"a":"{not_nested}"}"#), Some(19));
    }
}
271}