Skip to main content

qubit_json/
lenient_json_normalizer.rs

1/*******************************************************************************
2 *
3 *    Copyright (c) 2026.
4 *    Haixing Hu, Qubit Co. Ltd.
5 *
6 *    All rights reserved.
7 *
8 ******************************************************************************/
9//! Internal normalization utilities used by the lenient JSON decoder.
10//!
11//! Author: Haixing Hu
12
13use std::borrow::Cow;
14
15use crate::{JsonDecodeError, JsonDecodeOptions};
16
17/// Normalizes one raw JSON text input before JSON parsing.
18///
19/// The object holds normalization options and applies all supported preprocessing
20/// rules in the same order for every `normalize` call.
21#[derive(Debug, Clone, Copy)]
22pub struct LenientJsonNormalizer {
23    /// Stores the option set used by the normalizer.
24    options: JsonDecodeOptions,
25}
26
27impl Default for LenientJsonNormalizer {
28    fn default() -> Self {
29        Self::new(JsonDecodeOptions::default())
30    }
31}
32
33impl LenientJsonNormalizer {
34    /// Creates a normalizer with the provided lenient decoding options.
35    ///
36    /// The options are copied into the object so each `normalize` call uses a
37    /// consistent policy without external mutation.
38    #[must_use]
39    pub const fn new(options: JsonDecodeOptions) -> Self {
40        Self { options }
41    }
42
43    /// Returns the configuration used by this normalizer.
44    #[must_use]
45    pub const fn options(&self) -> &JsonDecodeOptions {
46        &self.options
47    }
48
49    /// Normalizes one raw JSON text input and returns a parsed-ready text slice.
50    ///
51    /// The pipeline is intentionally narrow: it trims whitespace, strips an
52    /// optional BOM, optionally removes a Markdown code fence, escapes control
53    /// characters in strings, and finally validates non-emptiness again.
54    pub fn normalize<'a>(&self, input: &'a str) -> Result<Cow<'a, str>, JsonDecodeError> {
55        let input = self.require_non_empty(input)?;
56        let input = self.trim_if_enabled(input);
57        let input = self.strip_utf8_bom(input);
58        let input = self.trim_if_enabled(input);
59        let input = self.strip_markdown_code_fence(input);
60        let input = self.trim_if_enabled(input);
61        let input = self.escape_control_chars_in_json_strings(input);
62        let input = self.trim_cow_if_enabled(input);
63
64        if input.is_empty() {
65            Err(JsonDecodeError::empty_input())
66        } else {
67            Ok(input)
68        }
69    }
70
71    /// Verifies that the input is not empty under the configured policy.
72    ///
73    /// If `trim_whitespace` is enabled, whitespace-only input is rejected as
74    /// empty; otherwise only zero-length input is rejected.
75    fn require_non_empty<'a>(&self, input: &'a str) -> Result<&'a str, JsonDecodeError> {
76        if self.options.trim_whitespace {
77            if input.trim().is_empty() {
78                Err(JsonDecodeError::empty_input())
79            } else {
80                Ok(input)
81            }
82        } else if input.is_empty() {
83            Err(JsonDecodeError::empty_input())
84        } else {
85            Ok(input)
86        }
87    }
88
89    /// Trims a borrowed input slice if trimming is enabled.
90    ///
91    /// This helper borrows and never allocates when trimming is disabled.
92    fn trim_if_enabled<'a>(&self, input: &'a str) -> &'a str {
93        if self.options.trim_whitespace {
94            input.trim()
95        } else {
96            input
97        }
98    }
99
100    /// Trims the normalized text when trimming is enabled.
101    ///
102    /// Borrowed values remain borrowed, and owned values are copied only when
103    /// trimming removes characters.
104    fn trim_cow_if_enabled<'a>(&self, input: Cow<'a, str>) -> Cow<'a, str> {
105        if !self.options.trim_whitespace {
106            return input;
107        }
108        match input {
109            Cow::Borrowed(text) => Cow::Borrowed(text.trim()),
110            Cow::Owned(text) => {
111                let trimmed = text.trim();
112                if trimmed.len() == text.len() {
113                    Cow::Owned(text)
114                } else {
115                    Cow::Owned(trimmed.to_string())
116                }
117            }
118        }
119    }
120
121    /// Removes an optional UTF-8 BOM before parsing.
122    ///
123    /// If no BOM exists, the input is returned unchanged.
124    fn strip_utf8_bom<'a>(&self, input: &'a str) -> &'a str {
125        if self.options.strip_utf8_bom {
126            input.strip_prefix('\u{feff}').unwrap_or(input)
127        } else {
128            input
129        }
130    }
131
132    /// Removes one outer Markdown code fence when enabled.
133    ///
134    /// The helper only strips a fence that starts at the beginning of input.
135    /// If a closing fence is present after trimming the trailing side, it is
136    /// also removed.
137    fn strip_markdown_code_fence<'a>(&self, input: &'a str) -> &'a str {
138        if !self.options.strip_markdown_code_fence || !input.starts_with("```") {
139            return input;
140        }
141
142        let Some(line_end) = input.find('\n') else {
143            return input;
144        };
145        let content = &input[line_end + 1..];
146        let trimmed_end = content.trim_end_matches(char::is_whitespace);
147
148        if let Some(without_close) = trimmed_end.strip_suffix("```") {
149            without_close
150        } else {
151            content
152        }
153    }
154
155    /// Escapes raw ASCII control chars inside JSON string literals.
156    ///
157    /// Characters outside strings remain unchanged. Existing escape sequences are
158    /// preserved so valid escapes are not double-escaped.
159    fn escape_control_chars_in_json_strings<'a>(&self, input: &'a str) -> Cow<'a, str> {
160        if !self.options.escape_control_chars_in_strings {
161            return Cow::Borrowed(input);
162        }
163
164        let mut in_string = false;
165        let mut in_escape = false;
166        let mut output: Option<String> = None;
167
168        for (index, ch) in input.char_indices() {
169            let mut replacement = None;
170
171            if in_string {
172                if in_escape {
173                    in_escape = false;
174                } else if ch == '\\' {
175                    in_escape = true;
176                } else if ch == '"' {
177                    in_string = false;
178                } else if ('\u{0000}'..='\u{001f}').contains(&ch) {
179                    replacement = Some(self.escaped_control_char(ch));
180                }
181            } else if ch == '"' {
182                in_string = true;
183            }
184
185            if let Some(escaped) = replacement {
186                let text = output.get_or_insert_with(|| {
187                    let mut text = String::with_capacity(input.len() + 8);
188                    text.push_str(&input[..index]);
189                    text
190                });
191                text.push_str(escaped);
192                continue;
193            }
194
195            if let Some(text) = output.as_mut() {
196                text.push(ch);
197            }
198        }
199
200        match output {
201            Some(text) => Cow::Owned(text),
202            None => Cow::Borrowed(input),
203        }
204    }
205
206    /// Maps one supported ASCII control character to its JSON escape.
207    ///
208    /// This helper only handles characters in `U+0000..=U+001F`.
209    fn escaped_control_char(&self, ch: char) -> &'static str {
210        match ch {
211            '\u{0008}' => "\\b",
212            '\u{0009}' => "\\t",
213            '\u{000a}' => "\\n",
214            '\u{000c}' => "\\f",
215            '\u{000d}' => "\\r",
216            '\u{0000}' => "\\u0000",
217            '\u{0001}' => "\\u0001",
218            '\u{0002}' => "\\u0002",
219            '\u{0003}' => "\\u0003",
220            '\u{0004}' => "\\u0004",
221            '\u{0005}' => "\\u0005",
222            '\u{0006}' => "\\u0006",
223            '\u{0007}' => "\\u0007",
224            '\u{000b}' => "\\u000b",
225            '\u{000e}' => "\\u000e",
226            '\u{000f}' => "\\u000f",
227            '\u{0010}' => "\\u0010",
228            '\u{0011}' => "\\u0011",
229            '\u{0012}' => "\\u0012",
230            '\u{0013}' => "\\u0013",
231            '\u{0014}' => "\\u0014",
232            '\u{0015}' => "\\u0015",
233            '\u{0016}' => "\\u0016",
234            '\u{0017}' => "\\u0017",
235            '\u{0018}' => "\\u0018",
236            '\u{0019}' => "\\u0019",
237            '\u{001a}' => "\\u001a",
238            '\u{001b}' => "\\u001b",
239            '\u{001c}' => "\\u001c",
240            '\u{001d}' => "\\u001d",
241            '\u{001e}' => "\\u001e",
242            '\u{001f}' => "\\u001f",
243            _ => unreachable!("escaped_control_char only supports ASCII control chars"),
244        }
245    }
246}