Skip to main content

qubit_json/
lenient_json_normalizer.rs

/*******************************************************************************
 *
 *    Copyright (c) 2026.
 *    Haixing Hu, Qubit Co. Ltd.
 *
 *    All rights reserved.
 *
 ******************************************************************************/
//! Internal normalization utilities used by the lenient JSON decoder.
//!
//! Author: Haixing Hu
12
13use std::borrow::Cow;
14
15use crate::{JsonDecodeError, JsonDecodeOptions};
16
17/// Normalizes one raw JSON text input before JSON parsing.
18///
19/// The object holds normalization options and applies all supported preprocessing
20/// rules in the same order for every `normalize` call.
21#[derive(Debug, Clone, Copy)]
22pub struct LenientJsonNormalizer {
23    /// Stores the option set used by the normalizer.
24    options: JsonDecodeOptions,
25}
26
27impl Default for LenientJsonNormalizer {
28    fn default() -> Self {
29        Self::new(JsonDecodeOptions::default())
30    }
31}
32
33impl LenientJsonNormalizer {
34    /// Creates a normalizer with the provided lenient decoding options.
35    ///
36    /// The options are copied into the object so each `normalize` call uses a
37    /// consistent policy without external mutation.
38    #[must_use]
39    pub const fn new(options: JsonDecodeOptions) -> Self {
40        Self { options }
41    }
42
43    /// Returns the configuration used by this normalizer.
44    #[must_use]
45    pub const fn options(&self) -> &JsonDecodeOptions {
46        &self.options
47    }
48
49    /// Normalizes one raw JSON text input and returns a parsed-ready text slice.
50    ///
51    /// The pipeline is intentionally narrow: it trims whitespace, strips an
52    /// optional BOM, optionally removes a Markdown code fence, escapes control
53    /// characters in strings, and finally validates non-emptiness again.
54    pub fn normalize<'a>(&self, input: &'a str) -> Result<Cow<'a, str>, JsonDecodeError> {
55        self.require_within_size_limit(input)?;
56        let input = self.require_non_empty(input)?;
57        let input = self.trim_if_enabled(input);
58        let input = self.strip_utf8_bom(input);
59        let input = self.trim_if_enabled(input);
60        let input = self.strip_markdown_code_fence(input);
61        let input = self.trim_if_enabled(input);
62        let input = self.escape_control_chars_in_json_strings(input);
63        let input = self.trim_cow_if_enabled(input);
64
65        if input.is_empty() {
66            Err(JsonDecodeError::empty_input())
67        } else {
68            Ok(input)
69        }
70    }
71
72    /// Verifies that the input is not empty under the configured policy.
73    ///
74    /// If `trim_whitespace` is enabled, whitespace-only input is rejected as
75    /// empty; otherwise only zero-length input is rejected.
76    fn require_non_empty<'a>(&self, input: &'a str) -> Result<&'a str, JsonDecodeError> {
77        if self.options.trim_whitespace {
78            if input.trim().is_empty() {
79                Err(JsonDecodeError::empty_input())
80            } else {
81                Ok(input)
82            }
83        } else if input.is_empty() {
84            Err(JsonDecodeError::empty_input())
85        } else {
86            Ok(input)
87        }
88    }
89
90    /// Verifies that the raw input length does not exceed the configured
91    /// maximum, when one is configured.
92    fn require_within_size_limit(&self, input: &str) -> Result<(), JsonDecodeError> {
93        if let Some(limit) = self.options.max_input_bytes {
94            let size = input.len();
95            if size > limit {
96                return Err(JsonDecodeError::input_too_large(size, limit));
97            }
98        }
99        Ok(())
100    }
101
102    /// Trims a borrowed input slice if trimming is enabled.
103    ///
104    /// This helper borrows and never allocates when trimming is disabled.
105    fn trim_if_enabled<'a>(&self, input: &'a str) -> &'a str {
106        if self.options.trim_whitespace {
107            input.trim()
108        } else {
109            input
110        }
111    }
112
113    /// Trims the normalized text when trimming is enabled.
114    ///
115    /// Borrowed values remain borrowed, and owned values are copied only when
116    /// trimming removes characters.
117    fn trim_cow_if_enabled<'a>(&self, input: Cow<'a, str>) -> Cow<'a, str> {
118        if !self.options.trim_whitespace {
119            return input;
120        }
121        match input {
122            Cow::Borrowed(text) => Cow::Borrowed(text.trim()),
123            Cow::Owned(text) => {
124                let trimmed = text.trim();
125                if trimmed.len() == text.len() {
126                    Cow::Owned(text)
127                } else {
128                    Cow::Owned(trimmed.to_string())
129                }
130            }
131        }
132    }
133
134    /// Removes an optional UTF-8 BOM before parsing.
135    ///
136    /// If no BOM exists, the input is returned unchanged.
137    fn strip_utf8_bom<'a>(&self, input: &'a str) -> &'a str {
138        if self.options.strip_utf8_bom {
139            input.strip_prefix('\u{feff}').unwrap_or(input)
140        } else {
141            input
142        }
143    }
144
145    /// Removes one outer Markdown code fence when enabled.
146    ///
147    /// The helper only strips a fence that starts at the beginning of input.
148    /// If a closing fence is present after trimming the trailing side, it is
149    /// also removed.
150    fn strip_markdown_code_fence<'a>(&self, input: &'a str) -> &'a str {
151        if !self.options.strip_markdown_code_fence || !input.starts_with("```") {
152            return input;
153        }
154
155        let Some(line_end) = input.find('\n') else {
156            return input;
157        };
158        let opening_tag = input[3..line_end].trim();
159        if self.options.strip_markdown_code_fence_json_only
160            && !Self::is_json_code_fence_tag(opening_tag)
161        {
162            return input;
163        }
164
165        let content = &input[line_end + 1..];
166
167        if let Some(without_close) = Self::strip_markdown_closing_fence(content) {
168            return without_close;
169        }
170        if self.options.strip_markdown_code_fence_requires_closing {
171            input
172        } else {
173            content
174        }
175    }
176
177    /// Returns whether a fenced language tag should be treated as JSON.
178    fn is_json_code_fence_tag(tag: &str) -> bool {
179        tag.is_empty() || tag.eq_ignore_ascii_case("json") || tag.eq_ignore_ascii_case("jsonc")
180    }
181
182    /// Removes a valid closing Markdown code fence from `content` when present.
183    ///
184    /// A closing fence is considered valid only when the last non-whitespace
185    /// token is exactly ````` and appears on its own line.
186    fn strip_markdown_closing_fence(content: &str) -> Option<&str> {
187        let trimmed_end = content.trim_end_matches(char::is_whitespace);
188        let without_close = trimmed_end.strip_suffix("```")?;
189        if without_close.is_empty()
190            || without_close.ends_with('\n')
191            || without_close.ends_with('\r')
192        {
193            Some(without_close)
194        } else {
195            None
196        }
197    }
198
199    /// Escapes raw ASCII control chars inside JSON string literals.
200    ///
201    /// Characters outside strings remain unchanged. Existing escape sequences are
202    /// preserved so valid escapes are not double-escaped.
203    fn escape_control_chars_in_json_strings<'a>(&self, input: &'a str) -> Cow<'a, str> {
204        if !self.options.escape_control_chars_in_strings {
205            return Cow::Borrowed(input);
206        }
207
208        let mut in_string = false;
209        let mut in_escape = false;
210        let mut output: Option<String> = None;
211
212        for (index, ch) in input.char_indices() {
213            let mut replacement = None;
214
215            if in_string {
216                if in_escape {
217                    in_escape = false;
218                } else if ch == '\\' {
219                    in_escape = true;
220                } else if ch == '"' {
221                    in_string = false;
222                } else if ('\u{0000}'..='\u{001f}').contains(&ch) {
223                    replacement = Some(self.escaped_control_char(ch));
224                }
225            } else if ch == '"' {
226                in_string = true;
227            }
228
229            if let Some(escaped) = replacement {
230                let text = output.get_or_insert_with(|| {
231                    let mut text = String::with_capacity(input.len() + 8);
232                    text.push_str(&input[..index]);
233                    text
234                });
235                text.push_str(escaped);
236                continue;
237            }
238
239            if let Some(text) = output.as_mut() {
240                text.push(ch);
241            }
242        }
243
244        match output {
245            Some(text) => Cow::Owned(text),
246            None => Cow::Borrowed(input),
247        }
248    }
249
250    /// Maps one supported ASCII control character to its JSON escape.
251    ///
252    /// This helper only handles characters in `U+0000..=U+001F`.
253    fn escaped_control_char(&self, ch: char) -> &'static str {
254        match ch {
255            '\u{0008}' => "\\b",
256            '\u{0009}' => "\\t",
257            '\u{000a}' => "\\n",
258            '\u{000c}' => "\\f",
259            '\u{000d}' => "\\r",
260            '\u{0000}' => "\\u0000",
261            '\u{0001}' => "\\u0001",
262            '\u{0002}' => "\\u0002",
263            '\u{0003}' => "\\u0003",
264            '\u{0004}' => "\\u0004",
265            '\u{0005}' => "\\u0005",
266            '\u{0006}' => "\\u0006",
267            '\u{0007}' => "\\u0007",
268            '\u{000b}' => "\\u000b",
269            '\u{000e}' => "\\u000e",
270            '\u{000f}' => "\\u000f",
271            '\u{0010}' => "\\u0010",
272            '\u{0011}' => "\\u0011",
273            '\u{0012}' => "\\u0012",
274            '\u{0013}' => "\\u0013",
275            '\u{0014}' => "\\u0014",
276            '\u{0015}' => "\\u0015",
277            '\u{0016}' => "\\u0016",
278            '\u{0017}' => "\\u0017",
279            '\u{0018}' => "\\u0018",
280            '\u{0019}' => "\\u0019",
281            '\u{001a}' => "\\u001a",
282            '\u{001b}' => "\\u001b",
283            '\u{001c}' => "\\u001c",
284            '\u{001d}' => "\\u001d",
285            '\u{001e}' => "\\u001e",
286            '\u{001f}' => "\\u001f",
287            _ => unreachable!("escaped_control_char only supports ASCII control chars"),
288        }
289    }
290}