Skip to main content

sanitize_engine/processor/
log_line.rs

1//! JSON-in-log-line processor (NDJSON / structured log output).
2//!
3//! Processes files where each line may contain an embedded JSON object
4//! (e.g. structured logging output from `slog`, `tracing-json`, `bunyan`,
5//! `logrus`, Datadog, etc.).
6//!
7//! # Behaviour
8//!
9//! Each line is processed individually:
10//!
11//! 1. Scan for the first `{` on the line.
12//! 2. If found, attempt to locate the matching `}` using brace-counting.
13//! 3. Parse the extracted `{...}` span as JSON.
14//! 4. If parsing succeeds, pass the JSON object through the full JSON
15//!    processor with all field rules from the profile.
16//! 5. Reconstruct: `line_prefix` + sanitised JSON + `line_suffix`.
17//! 6. If parsing fails or no JSON span is found, the line is emitted
18//!    unchanged. The outer double-pass streaming scan will still catch
19//!    plain-text secrets on those lines.
20//!
21//! # Format Detection
22//!
23//! This processor is **not** auto-detected from `.log` extension.
24//! It must be requested explicitly with `--format log`.
25//! This avoids misprocessing plain-text log files that happen to contain
26//! individual `{` characters.
27//!
28//! # Field Rules
29//!
30//! Use `"*"` to sanitize every string field inside the JSON payloads,
31//! or specific dot-separated paths (e.g. `"user.token"`) to be selective.
32
33use crate::error::Result;
34use crate::processor::json_proc::JsonProcessor;
35use crate::processor::limits::DEFAULT_INPUT_SIZE;
36use crate::processor::{FileTypeProfile, Processor};
37use crate::store::MappingStore;
38
/// Structured processor for NDJSON / structured-log files.
///
/// Each line's embedded JSON object (if any) is sanitised by delegating to
/// the wrapped [`JsonProcessor`]; the text before and after that object is
/// carried over to the output as-is.
pub struct LogLineProcessor {
    /// JSON processor used to sanitise the `{...}` span found on a line.
    json_proc: JsonProcessor,
}
43
44impl LogLineProcessor {
45    pub fn new() -> Self {
46        Self {
47            json_proc: JsonProcessor,
48        }
49    }
50}
51
52impl Default for LogLineProcessor {
53    fn default() -> Self {
54        Self::new()
55    }
56}
57
58impl Processor for LogLineProcessor {
59    fn name(&self) -> &'static str {
60        "log"
61    }
62
63    fn can_handle(&self, _content: &[u8], profile: &FileTypeProfile) -> bool {
64        profile.processor == "log"
65    }
66
67    fn process(
68        &self,
69        content: &[u8],
70        profile: &FileTypeProfile,
71        store: &MappingStore,
72    ) -> Result<Vec<u8>> {
73        if content.len() > DEFAULT_INPUT_SIZE {
74            use crate::error::SanitizeError;
75            return Err(SanitizeError::InputTooLarge {
76                size: content.len(),
77                limit: DEFAULT_INPUT_SIZE,
78            });
79        }
80
81        let text = String::from_utf8_lossy(content);
82        let mut output = String::with_capacity(text.len());
83
84        // Split on '\n'. `split('\n')` on a '\n'-terminated string produces a
85        // trailing empty element — skip it so we don't emit an extra blank line.
86        let raw_lines: Vec<&str> = text.split('\n').collect();
87        let lines = if raw_lines.last().is_some_and(|l| l.is_empty()) {
88            &raw_lines[..raw_lines.len() - 1]
89        } else {
90            &raw_lines[..]
91        };
92
93        for line in lines {
94            let processed_line = process_log_line(line, profile, store, &self.json_proc);
95            output.push_str(&processed_line);
96            output.push('\n');
97        }
98
99        // Restore the absence of a trailing newline if the original had none.
100        if !text.ends_with('\n') && output.ends_with('\n') {
101            output.pop();
102        }
103
104        Ok(output.into_bytes())
105    }
106}
107
108/// Process a single log line: find embedded JSON, sanitise it, recombine.
109/// Falls back to returning the line unchanged on any error.
110fn process_log_line(
111    line: &str,
112    profile: &FileTypeProfile,
113    store: &MappingStore,
114    json_proc: &JsonProcessor,
115) -> String {
116    // Locate the first `{` in the line.
117    let Some(json_start) = line.find('{') else {
118        return line.to_string();
119    };
120
121    // Find the matching closing `}` by counting brace depth.
122    let json_end = match find_matching_brace(&line[json_start..]) {
123        Some(relative_end) => json_start + relative_end,
124        None => return line.to_string(),
125    };
126
127    let json_span = &line[json_start..=json_end];
128    let prefix = &line[..json_start];
129    let suffix = &line[json_end + 1..];
130
131    // Build a compact-JSON profile so the output stays on one line.
132    let compact_profile =
133        FileTypeProfile::new("json", profile.fields.clone()).with_option("compact", "true");
134
135    // Try to sanitise the JSON span.
136    match json_proc.process(json_span.as_bytes(), &compact_profile, store) {
137        Ok(sanitised_bytes) => {
138            let sanitised = String::from_utf8_lossy(&sanitised_bytes);
139            format!("{}{}{}", prefix, sanitised, suffix)
140        }
141        // If JSON parsing fails (e.g. the `{` is part of a template string),
142        // emit the line unchanged. The streaming scanner pass handles the rest.
143        Err(_) => line.to_string(),
144    }
145}
146
/// Locate the `}` that closes the `{` at the very start of `s`.
///
/// Braces inside double-quoted strings are ignored, and a backslash inside a
/// string escapes the byte that follows it. The returned value is a byte
/// offset into `s`.
///
/// Returns `None` when `s` does not begin with `{` or the opening brace is
/// never closed.
fn find_matching_brace(s: &str) -> Option<usize> {
    let bytes = s.as_bytes();
    if bytes.first() != Some(&b'{') {
        return None;
    }

    let mut open_braces: usize = 0;
    let mut inside_string = false;
    let mut idx = 0;

    while idx < bytes.len() {
        let byte = bytes[idx];

        if inside_string {
            if byte == b'\\' {
                // Skip the backslash and the byte it escapes in one step.
                idx += 2;
                continue;
            }
            if byte == b'"' {
                inside_string = false;
            }
        } else if byte == b'"' {
            inside_string = true;
        } else if byte == b'{' {
            open_braces += 1;
        } else if byte == b'}' {
            // Cannot underflow: index 0 is `{`, so the count is at least 1 here.
            open_braces -= 1;
            if open_braces == 0 {
                return Some(idx);
            }
        }

        idx += 1;
    }

    None
}
178
#[cfg(test)]
mod tests {
    use super::*;
    use crate::generator::HmacGenerator;
    use crate::processor::profile::FieldRule;
    use std::sync::Arc;

    /// Mapping store with a fixed HMAC key so replacements are deterministic.
    fn make_store() -> MappingStore {
        let gen = Arc::new(HmacGenerator::new([42u8; 32]));
        MappingStore::new(gen, None)
    }

    /// Profile that sanitises every string field of the embedded JSON.
    fn wildcard_profile() -> FileTypeProfile {
        FileTypeProfile::new("log", vec![FieldRule::new("*")])
    }

    #[test]
    fn pure_ndjson_line() {
        let store = make_store();
        let proc = LogLineProcessor::new();
        let content = b"{\"level\":\"info\",\"token\":\"abc123\",\"msg\":\"ok\"}\n";
        let output = proc.process(content, &wildcard_profile(), &store).unwrap();
        let text = String::from_utf8(output).unwrap();
        assert!(!text.contains("abc123"));
        // JSON structure preserved.
        assert!(text.contains("\"level\""));
    }

    #[test]
    fn log_prefix_before_json() {
        let store = make_store();
        let proc = LogLineProcessor::new();
        let content = b"2024-01-01T00:00:00Z INFO {\"token\":\"secret\",\"user\":\"bob\"}\n";
        let output = proc.process(content, &wildcard_profile(), &store).unwrap();
        let text = String::from_utf8(output).unwrap();
        // Prefix preserved.
        assert!(text.contains("2024-01-01T00:00:00Z INFO "));
        // Secrets sanitised.
        assert!(!text.contains("secret"));
        assert!(!text.contains("bob"));
    }

    #[test]
    fn suffix_after_json_preserved() {
        let store = make_store();
        let proc = LogLineProcessor::new();
        let content = b"INFO {\"token\":\"secret\"} request_id=42\n";
        let output = proc.process(content, &wildcard_profile(), &store).unwrap();
        let text = String::from_utf8(output).unwrap();
        // Text on both sides of the JSON span is carried over verbatim.
        assert!(text.starts_with("INFO "));
        assert!(text.ends_with(" request_id=42\n"));
        assert!(!text.contains("secret"));
    }

    #[test]
    fn non_json_line_preserved() {
        let store = make_store();
        let proc = LogLineProcessor::new();
        let content = b"plain text log line with no json\n";
        let output = proc.process(content, &wildcard_profile(), &store).unwrap();
        assert_eq!(output, content);
    }

    #[test]
    fn malformed_json_line_preserved() {
        let store = make_store();
        let proc = LogLineProcessor::new();
        // Contains `{` but is not valid JSON — should pass through unchanged.
        let content = b"ERROR: template {name} not found\n";
        let output = proc.process(content, &wildcard_profile(), &store).unwrap();
        assert_eq!(output, content);
    }

    #[test]
    fn missing_trailing_newline_preserved() {
        let store = make_store();
        let proc = LogLineProcessor::new();
        // No final '\n' in the input, so none may appear in the output.
        let content = b"first line\nlast line without newline";
        let output = proc.process(content, &wildcard_profile(), &store).unwrap();
        assert_eq!(output, content);
    }

    #[test]
    fn blank_lines_preserved() {
        let store = make_store();
        let proc = LogLineProcessor::new();
        let content = b"a\n\nb\n\n";
        let output = proc.process(content, &wildcard_profile(), &store).unwrap();
        assert_eq!(output, content);
    }

    #[test]
    fn empty_input_yields_empty_output() {
        let store = make_store();
        let proc = LogLineProcessor::new();
        let output = proc.process(b"", &wildcard_profile(), &store).unwrap();
        assert!(output.is_empty());
    }

    #[test]
    fn multi_line_ndjson() {
        let store = make_store();
        let proc = LogLineProcessor::new();
        let content = b"{\"token\":\"abc\"}\n{\"key\":\"xyz\"}\n";
        let output = proc.process(content, &wildcard_profile(), &store).unwrap();
        let text = String::from_utf8(output).unwrap();
        assert!(!text.contains("abc"));
        assert!(!text.contains("xyz"));
        assert_eq!(text.lines().count(), 2);
    }

    #[test]
    fn find_matching_brace_simple() {
        assert_eq!(find_matching_brace("{\"a\":\"b\"}"), Some(8));
    }

    #[test]
    fn find_matching_brace_nested() {
        assert_eq!(find_matching_brace("{\"a\":{\"b\":\"c\"}}"), Some(14));
    }

    #[test]
    fn find_matching_brace_brace_in_string() {
        assert_eq!(find_matching_brace("{\"a\":\"{not_nested}\"}"), Some(19));
    }

    #[test]
    fn find_matching_brace_escaped_quote_in_string() {
        // The `\"` does not end the string, so the `}` right after it is ignored.
        assert_eq!(find_matching_brace(r#"{"a":"\"}"}"#), Some(10));
    }

    #[test]
    fn find_matching_brace_unbalanced() {
        assert_eq!(find_matching_brace("{\"a\":1"), None);
        assert_eq!(find_matching_brace("{\"a\":\"unterminated}"), None);
    }

    #[test]
    fn find_matching_brace_requires_leading_brace() {
        assert_eq!(find_matching_brace(""), None);
        assert_eq!(find_matching_brace("x{}"), None);
    }
}