Skip to main content

lnmp_sanitize/
sanitize.rs

1use std::borrow::Cow;
2
3use crate::mode::SanitizationLevel;
4
5/// Configuration options for sanitization.
6#[derive(Debug, Clone)]
7pub struct SanitizationConfig {
8    /// Overall repair level for heuristics
9    pub level: SanitizationLevel,
10    /// Automatically wrap string-like segments with quotes when needed
11    pub auto_quote_strings: bool,
12    /// Escape stray quotes inside text sections
13    pub auto_escape_quotes: bool,
14    /// Normalize boolean text representations to 1/0 outside quotes
15    pub normalize_booleans: bool,
16    /// Normalize simple numeric forms (e.g., remove leading zeros)
17    pub normalize_numbers: bool,
18}
19
20impl Default for SanitizationConfig {
21    fn default() -> Self {
22        Self {
23            level: SanitizationLevel::Normal,
24            auto_quote_strings: true,
25            auto_escape_quotes: true,
26            normalize_booleans: true,
27            normalize_numbers: false,
28        }
29    }
30}
31
32/// Leniently sanitizes LNMP-like text. When no changes are required the input is returned
33/// by reference to avoid allocations.
34pub fn sanitize_lnmp_text<'a>(input: &'a str, config: &SanitizationConfig) -> Cow<'a, str> {
35    let mut changed = false;
36
37    // Pass 1: whitespace/structural cleanup
38    let pass1 = structural_cleanup(input, config, &mut changed);
39
40    // Pass 2: quote/escape repair + optional auto-quoting
41    let pass2 = if config.level == SanitizationLevel::Minimal {
42        pass1
43    } else {
44        let quote_fixed = quote_and_escape_repair(&pass1, config, &mut changed);
45        if config.auto_quote_strings {
46            Cow::Owned(auto_quote_unquoted_values(
47                quote_fixed.as_ref(),
48                &mut changed,
49            ))
50        } else {
51            quote_fixed
52        }
53    };
54
55    // Pass 3: semantic normalization (Aggressive only)
56    let pass3 = if config.level == SanitizationLevel::Aggressive
57        && (config.normalize_booleans || config.normalize_numbers)
58    {
59        Cow::Owned(normalize_tokens(&pass2, config, &mut changed))
60    } else {
61        pass2
62    };
63
64    if changed {
65        Cow::Owned(pass3.into_owned())
66    } else {
67        Cow::Borrowed(input)
68    }
69}
70
71fn structural_cleanup<'a>(
72    input: &'a str,
73    config: &SanitizationConfig,
74    changed: &mut bool,
75) -> Cow<'a, str> {
76    // Minimal mode: only newline normalization and trailing space trim.
77    if config.level == SanitizationLevel::Minimal {
78        let mut output = String::with_capacity(input.len());
79        for line in input.lines() {
80            let trimmed = line.trim_end_matches([' ', '\t']);
81            if trimmed.len() != line.len() {
82                *changed = true;
83            }
84            output.push_str(trimmed);
85            output.push('\n');
86        }
87        if !input.ends_with('\n') && !input.is_empty() {
88            output.pop();
89        }
90
91        if *changed {
92            return Cow::Owned(output);
93        }
94        return Cow::Borrowed(input);
95    }
96
97    let mut output = String::with_capacity(input.len());
98    let mut in_quotes = false;
99    let mut escape_next = false;
100    let mut last_emitted: Option<char> = None;
101
102    let mut chars = input.chars().peekable();
103    while let Some(ch) = chars.next() {
104        if escape_next {
105            output.push(ch);
106            last_emitted = Some(ch);
107            escape_next = false;
108            continue;
109        }
110
111        match ch {
112            '\\' => {
113                output.push('\\');
114                match chars.peek() {
115                    Some('"' | '\\' | 'n' | 'r' | 't') => {
116                        escape_next = true;
117                    }
118                    Some(_) if in_quotes && config.auto_escape_quotes => {
119                        escape_next = true;
120                        *changed = true;
121                    }
122                    None => {
123                        output.push('\\');
124                        *changed = true;
125                    }
126                    _ => {}
127                }
128                last_emitted = Some('\\');
129            }
130            '"' => {
131                in_quotes = !in_quotes;
132                output.push('"');
133                last_emitted = Some('"');
134            }
135            ';' if !in_quotes => {
136                output.push(';');
137                last_emitted = Some(';');
138                while matches!(chars.peek(), Some(c) if c.is_whitespace()) {
139                    chars.next();
140                    *changed = true;
141                }
142            }
143            ',' if !in_quotes => {
144                output.push(',');
145                last_emitted = Some(',');
146                while matches!(chars.peek(), Some(c) if c.is_whitespace()) {
147                    chars.next();
148                    *changed = true;
149                }
150            }
151            '\n' => {
152                while output.ends_with(' ') || output.ends_with('\t') {
153                    output.pop();
154                    *changed = true;
155                }
156                output.push('\n');
157                last_emitted = Some('\n');
158            }
159            '\r' => {
160                *changed = true;
161                output.push('\n');
162                last_emitted = Some('\n');
163            }
164            ' ' | '\t' if !in_quotes => {
165                let next_non_space = {
166                    let mut clone = chars.clone();
167                    clone.find(|c| *c != ' ' && *c != '\t')
168                };
169
170                let prev_is_boundary = matches!(
171                    last_emitted,
172                    None | Some('\n' | ';' | ',' | '=' | '[' | '{')
173                );
174                let next_is_boundary = matches!(
175                    next_non_space,
176                    None | Some('\n' | ';' | ',' | '=' | ']' | '}')
177                );
178
179                if prev_is_boundary || next_is_boundary {
180                    *changed = true;
181                    continue;
182                }
183
184                if last_emitted == Some(' ') {
185                    *changed = true;
186                    continue;
187                }
188
189                output.push(' ');
190                last_emitted = Some(' ');
191            }
192            other => {
193                output.push(other);
194                last_emitted = Some(other);
195            }
196        }
197    }
198
199    if in_quotes && config.auto_escape_quotes {
200        output.push('"');
201        *changed = true;
202    }
203
204    if *changed {
205        Cow::Owned(output)
206    } else {
207        Cow::Borrowed(input)
208    }
209}
210
211fn quote_and_escape_repair<'a>(
212    input: &'a str,
213    config: &SanitizationConfig,
214    changed: &mut bool,
215) -> Cow<'a, str> {
216    let mut output = String::with_capacity(input.len());
217    let mut in_quotes = false;
218    let mut escape_next = false;
219
220    for ch in input.chars() {
221        if escape_next {
222            output.push(ch);
223            escape_next = false;
224            continue;
225        }
226
227        match ch {
228            '\\' => {
229                output.push('\\');
230                escape_next = true;
231            }
232            '"' => {
233                in_quotes = !in_quotes;
234                output.push('"');
235            }
236            _ => {
237                output.push(ch);
238            }
239        }
240    }
241
242    if in_quotes && config.auto_escape_quotes {
243        output.push('"');
244        *changed = true;
245    }
246
247    if *changed {
248        Cow::Owned(output)
249    } else {
250        Cow::Borrowed(input)
251    }
252}
253
/// Pass 2b of sanitization: wraps values in double quotes where that is needed.
///
/// Every `=` starts a value that runs up to the next `;` or newline which is neither inside
/// a quoted section nor preceded by a backslash. A value is quoted when, after trimming, it
/// is non-empty, does not already start with `"`, `[` or `{`, and contains a double quote
/// or any whitespace. Inside the added quotes `"` and `\` are backslash-escaped and the
/// surrounding whitespace of the value is dropped. All other text is copied verbatim.
///
/// Sets `changed` to `true` when at least one value was quoted; always returns a new
/// `String`.
fn auto_quote_unquoted_values(input: &str, changed: &mut bool) -> String {
    /// Byte length of the value found at the start of `tail`, i.e. the offset of the first
    /// unquoted, unescaped `;` or newline (or the whole length if there is none).
    fn value_len(tail: &str) -> usize {
        let mut quoted = false;
        let mut escaped = false;
        for (offset, c) in tail.char_indices() {
            if escaped {
                escaped = false;
                continue;
            }
            match c {
                '\\' => escaped = true,
                '"' => quoted = !quoted,
                ';' | '\n' if !quoted => return offset,
                _ => {}
            }
        }
        tail.len()
    }

    let mut result = String::with_capacity(input.len());
    let mut rest = input;

    while let Some(eq_pos) = rest.find('=') {
        // Copy everything up to and including the `=` (one byte wide) unchanged.
        result.push_str(&rest[..=eq_pos]);

        let tail = &rest[eq_pos + 1..];
        let (raw, remainder) = tail.split_at(value_len(tail));
        let core = raw.trim();

        let quotable = match core.chars().next() {
            // Empty, already quoted, or a list/record literal: leave as-is.
            None | Some('"' | '[' | '{') => false,
            Some(_) => core.chars().any(|c| c == '"' || c.is_whitespace()),
        };

        if quotable {
            result.push('"');
            for c in core.chars() {
                if matches!(c, '"' | '\\') {
                    result.push('\\');
                }
                result.push(c);
            }
            result.push('"');
            *changed = true;
        } else {
            result.push_str(raw);
        }

        // The delimiter (if any) is the first character of `remainder` and is copied on
        // the next round or by the final `push_str`.
        rest = remainder;
    }

    result.push_str(rest);
    result
}
334
335fn normalize_tokens(input: &str, config: &SanitizationConfig, changed: &mut bool) -> String {
336    let mut out = String::with_capacity(input.len());
337    let mut token = String::new();
338    let mut in_quotes = false;
339    let mut escape_next = false;
340
341    for ch in input.chars() {
342        if escape_next {
343            out.push(ch);
344            escape_next = false;
345            continue;
346        }
347
348        if ch == '\\' && in_quotes {
349            out.push('\\');
350            escape_next = true;
351            continue;
352        }
353
354        if ch == '"' {
355            flush_token(&mut token, &mut out, config, changed);
356            in_quotes = !in_quotes;
357            out.push('"');
358            continue;
359        }
360
361        if in_quotes {
362            out.push(ch);
363            continue;
364        }
365
366        if ch.is_ascii_alphanumeric() || ch == '-' {
367            token.push(ch);
368        } else {
369            flush_token(&mut token, &mut out, config, changed);
370            out.push(ch);
371        }
372    }
373
374    flush_token(&mut token, &mut out, config, changed);
375    out
376}
377
378fn flush_token(
379    token: &mut String,
380    out: &mut String,
381    config: &SanitizationConfig,
382    changed: &mut bool,
383) {
384    if token.is_empty() {
385        return;
386    }
387
388    let mut replacement: Option<String> = None;
389
390    if config.normalize_booleans {
391        match token.to_ascii_lowercase().as_str() {
392            "true" | "yes" => replacement = Some("1".to_string()),
393            "false" | "no" => replacement = Some("0".to_string()),
394            _ => {}
395        }
396    }
397
398    if replacement.is_none()
399        && config.normalize_numbers
400        && token.len() > 1
401        && token.chars().all(|c| c.is_ascii_digit())
402        && token.starts_with('0')
403    {
404        let trimmed = token.trim_start_matches('0');
405        let normalized = if trimmed.is_empty() { "0" } else { trimmed };
406        replacement = Some(normalized.to_string());
407    }
408
409    if let Some(ref value) = replacement {
410        *changed |= value != token;
411        out.push_str(value);
412    } else {
413        out.push_str(token);
414    }
415
416    token.clear();
417}