Skip to main content

sanitize_engine/processor/
key_value.rs

1//! Key-value processor for `gitlab.rb`-style configuration files.
2//!
3//! Handles files with lines of the form:
4//!
5//! ```text
6//! key = "value"
7//! key = 'value'
8//! key = value
9//! # comment lines are preserved
10//! ```
11//!
12//! The delimiter, comment prefix, and quoting style are configurable
13//! via the profile's `options` map.
14//!
15//! # Profile Options
16//!
17//! | Key              | Default | Description                                  |
18//! |------------------|---------|----------------------------------------------|
19//! | `delimiter`           | `"="`   | The key-value separator.                     |
20//! | `secondary_delimiter` | *(none)*| Optional additional delimiter(s) tried when  |
21//! |                       |         | the primary delimiter's key does not match   |
22//! |                       |         | any field rule. Comma-separate multiple      |
23//! |                       |         | values (e.g. `"=>,:"`). Surrounding quotes  |
24//! |                       |         | are stripped from the key before matching,   |
25//! |                       |         | and any suffix after the value (e.g. a      |
26//! |                       |         | trailing `,`) is preserved in the output.    |
27//! |                       |         | Useful for Ruby hash literals that use `=>`  |
28//! |                       |         | or `:` alongside a `=`-delimited file.       |
29//! | `comment_prefix`      | `"#"`   | Lines starting with this (after whitespace)  |
30//! |                  |         | are treated as comments.                     |
31//! | `ignore_comments`     | `false` | When `true`, comment lines are preserved     |
32//! |                  |         | verbatim and never sanitized. By default,    |
33//! |                  |         | field rules are applied to commented-out     |
34//! |                  |         | key-value lines so that secrets left behind  |
35//! |                  |         | in comments are still replaced.              |
36//! | `value_strip_suffix`  | *(none)*| Strip this suffix from value before          |
37//! |                       |         | sanitizing and re-append it afterwards.      |
38//! |                       |         | Use `";"` for nginx-style `key value;` files.|
39//!
40//! # Heredoc / Sub-processor Support
41//!
42//! When a matched field rule has `sub_processor` set and the value is a
43//! Ruby-style heredoc (`<<-'EOS'`, `<<~EOS`, etc.), the processor switches
44//! into collection mode: it accumulates heredoc lines until the end marker,
45//! then delegates the collected content to the named sub-processor using the
46//! rule's `sub_fields`. This allows structured content embedded inside
47//! key-value files (e.g. YAML inside `gitlab.rb`) to be sanitized at the
48//! field level rather than relying solely on the streaming scanner.
49//!
50//! For non-heredoc values with `sub_processor`, the value (after quote
51//! stripping) is passed directly to the sub-processor.
52//!
53//! # Formatting Preservation
54//!
55//! - Blank lines and indentation are preserved verbatim.
56//! - Comment lines are preserved verbatim when no field rule matches their body,
57//!   or when `ignore_comments: true` is set in the profile options.
58//! - The original quoting style (single, double, or unquoted) is kept.
59//! - Whitespace around the delimiter is preserved where possible.
60//! - Heredoc opening and closing marker lines are preserved verbatim.
61
62use crate::error::{Result, SanitizeError};
63use crate::processor::limits::DEFAULT_INPUT_SIZE;
64use crate::processor::profile::FieldRule;
65use crate::processor::{
66    find_field_signal, find_matching_rule, replace_by_signal, replace_value, FileTypeProfile,
67    Processor,
68};
69use crate::store::MappingStore;
70use std::collections::HashMap;
71
72// ---------------------------------------------------------------------------
73// Per-file configuration (constant across all lines in one processing call)
74// ---------------------------------------------------------------------------
75
76/// Bundles the per-file options that are constant for every line in a
77/// `process_line` invocation, so the call site stays readable and adding
78/// new options doesn't widen the function signature.
79struct KvConfig<'a> {
80    delimiter: &'a str,
81    comment_prefix: &'a str,
82    secondary_delimiters: &'a [&'a str],
83    value_strip_suffix: Option<&'a str>,
84    ignore_comments: bool,
85    profile: &'a FileTypeProfile,
86    store: &'a MappingStore,
87}
88
89// ---------------------------------------------------------------------------
90// Internal state machine
91// ---------------------------------------------------------------------------
92
93/// Processing state for the line-by-line loop.
94enum LineState {
95    Normal,
96    /// Collecting lines of a heredoc until `end_marker` is seen.
97    Heredoc {
98        end_marker: String,
99        rule: FieldRule,
100        lines: Vec<String>,
101        /// `true` for `<<~` squiggly heredocs: the minimum leading indentation
102        /// is stripped from the body before passing to the sub-processor, then
103        /// re-added to every output line so the file structure is preserved.
104        strip_indent: bool,
105    },
106}
107
108// ---------------------------------------------------------------------------
109// Processor implementation
110// ---------------------------------------------------------------------------
111
112/// Structured processor for key = value configuration files.
113pub struct KeyValueProcessor;
114
115impl Processor for KeyValueProcessor {
116    fn name(&self) -> &'static str {
117        "key_value"
118    }
119
120    fn can_handle(&self, _content: &[u8], profile: &FileTypeProfile) -> bool {
121        matches!(profile.processor.as_str(), "key_value" | "key-value")
122    }
123
124    fn process(
125        &self,
126        content: &[u8],
127        profile: &FileTypeProfile,
128        store: &MappingStore,
129    ) -> Result<Vec<u8>> {
130        if content.len() > DEFAULT_INPUT_SIZE {
131            return Err(SanitizeError::InputTooLarge {
132                size: content.len(),
133                limit: DEFAULT_INPUT_SIZE,
134            });
135        }
136
137        let text = String::from_utf8_lossy(content);
138        let delimiter = profile.options.get("delimiter").map_or("=", |s| s.as_str());
139        let comment_prefix = profile
140            .options
141            .get("comment_prefix")
142            .map_or("#", |s| s.as_str());
143        let secondary_delimiter_raw = profile
144            .options
145            .get("secondary_delimiter")
146            .map_or("", |s| s.as_str());
147        let secondary_delimiters: Vec<&str> = if secondary_delimiter_raw.is_empty() {
148            vec![]
149        } else {
150            secondary_delimiter_raw.split(',').collect()
151        };
152        let value_strip_suffix = profile
153            .options
154            .get("value_strip_suffix")
155            .map(|s| s.as_str());
156        let ignore_comments = profile
157            .options
158            .get("ignore_comments")
159            .is_some_and(|s| s == "true");
160
161        let cfg = KvConfig {
162            delimiter,
163            comment_prefix,
164            secondary_delimiters: &secondary_delimiters,
165            value_strip_suffix,
166            ignore_comments,
167            profile,
168            store,
169        };
170
171        let mut output = String::with_capacity(text.len());
172        let mut state = LineState::Normal;
173
174        for line in text.split('\n') {
175            process_line(line, &mut state, &mut output, &cfg)?;
176        }
177
178        // Normalise trailing newline: strip all, then re-add exactly one
179        // iff the original ended with one. This corrects the extra '\n'
180        // that split('\n') produces for a trailing-newline input.
181        while output.ends_with('\n') {
182            output.pop();
183        }
184        if text.ends_with('\n') {
185            output.push('\n');
186        }
187
188        Ok(output.into_bytes())
189    }
190}
191
192// ---------------------------------------------------------------------------
193// Per-line processing (extracted to stay within clippy line limit)
194// ---------------------------------------------------------------------------
195
196#[allow(clippy::too_many_lines)]
197fn process_line(
198    line: &str,
199    state: &mut LineState,
200    output: &mut String,
201    cfg: &KvConfig<'_>,
202) -> Result<()> {
203    match state {
204        LineState::Heredoc {
205            ref end_marker,
206            ref rule,
207            ref mut lines,
208            strip_indent,
209        } => {
210            if line.trim() == end_marker.as_str() {
211                // For `<<~` squiggly heredocs, strip the minimum common
212                // indentation before sub-processing (matching Ruby semantics),
213                // then re-add that indentation to every output line so the
214                // file structure is preserved verbatim.
215                let (content, stripped_indent) = if *strip_indent {
216                    strip_min_indent(lines)
217                } else {
218                    (lines.join("\n"), 0)
219                };
220                let processed = process_sub_content(&content, rule, cfg.store)?;
221                let final_content = if *strip_indent && stripped_indent > 0 {
222                    reindent_content(&processed, stripped_indent)
223                } else {
224                    processed
225                };
226                for processed_line in final_content.split('\n') {
227                    output.push_str(processed_line);
228                    output.push('\n');
229                }
230                output.push_str(line);
231                output.push('\n');
232                *state = LineState::Normal;
233            } else {
234                lines.push(line.to_owned());
235            }
236        }
237        LineState::Normal => {
238            let trimmed = line.trim();
239            if trimmed.is_empty() {
240                output.push_str(line);
241                output.push('\n');
242                return Ok(());
243            }
244            if trimmed.starts_with(cfg.comment_prefix) {
245                if !cfg.ignore_comments {
246                    // Find where the comment prefix starts in the original line
247                    // and split into header (everything up to and including the
248                    // prefix) and body (the rest, which may be a key-value pair).
249                    if let Some(prefix_pos) = line.find(cfg.comment_prefix) {
250                        let prefix_end = prefix_pos + cfg.comment_prefix.len();
251                        let comment_header = &line[..prefix_end];
252                        let body = &line[prefix_end..];
253                        if let Some(sanitized_body) = try_sanitize_kv_body(body, cfg)? {
254                            output.push_str(comment_header);
255                            output.push_str(&sanitized_body);
256                            output.push('\n');
257                            return Ok(());
258                        }
259                    }
260                }
261                output.push_str(line);
262                output.push('\n');
263                return Ok(());
264            }
265            // Search for the delimiter in the indent-stripped line so that
266            // indented directives (e.g. nginx `    proxy_pass URL;`) are found
267            // even when the delimiter is a space character.
268            let line_body = line.trim_start();
269            let indent_len = line.len() - line_body.len();
270            if let Some(delim_pos) = line_body.find(cfg.delimiter) {
271                // raw_key preserves leading indent for faithful output reconstruction.
272                let raw_key = &line[..indent_len + delim_pos];
273                let after_delim = &line_body[delim_pos + cfg.delimiter.len()..];
274                let key = line_body[..delim_pos].trim();
275                if let Some(rule) = find_matching_rule(key, cfg.profile) {
276                    if rule.sub_processor.is_some() {
277                        if let Some((marker, strip_indent)) = detect_heredoc(after_delim) {
278                            output.push_str(line);
279                            output.push('\n');
280                            *state = LineState::Heredoc {
281                                end_marker: marker,
282                                rule: rule.clone(),
283                                lines: Vec::new(),
284                                strip_indent,
285                            };
286                            return Ok(());
287                        }
288                        let raw_value = after_delim.trim();
289                        let (quote_char, inner) = detect_quotes(raw_value);
290                        let processed = process_sub_content(inner, rule, cfg.store)?;
291                        emit_replaced(
292                            raw_key,
293                            cfg.delimiter,
294                            after_delim,
295                            quote_char,
296                            &processed,
297                            output,
298                        );
299                        output.push('\n');
300                        return Ok(());
301                    }
302                    let raw_value = after_delim.trim();
303                    let (quote_char, inner) = detect_quotes(raw_value);
304                    let (sanitize_inner, suffix) = match cfg.value_strip_suffix {
305                        Some(sfx) if inner.ends_with(sfx) => {
306                            (&inner[..inner.len() - sfx.len()], sfx)
307                        }
308                        _ => (inner, ""),
309                    };
310                    let replaced = replace_value(sanitize_inner, rule, cfg.store)?;
311                    if suffix.is_empty() {
312                        emit_replaced(
313                            raw_key,
314                            cfg.delimiter,
315                            after_delim,
316                            quote_char,
317                            &replaced,
318                            output,
319                        );
320                    } else {
321                        emit_replaced_with_suffix(
322                            raw_key,
323                            cfg.delimiter,
324                            after_delim,
325                            quote_char,
326                            &replaced,
327                            suffix,
328                            output,
329                        );
330                    }
331                    output.push('\n');
332                    return Ok(());
333                } else if let Some(sig) = find_field_signal(key, &cfg.profile.field_name_signals) {
334                    let raw_value = after_delim.trim();
335                    let (quote_char, inner) = detect_quotes(raw_value);
336                    let (sanitize_inner, suffix) = match cfg.value_strip_suffix {
337                        Some(sfx) if inner.ends_with(sfx) => {
338                            (&inner[..inner.len() - sfx.len()], sfx)
339                        }
340                        _ => (inner, ""),
341                    };
342                    if let Some(replaced) = replace_by_signal(sanitize_inner, sig, cfg.store)? {
343                        if suffix.is_empty() {
344                            emit_replaced(
345                                raw_key,
346                                cfg.delimiter,
347                                after_delim,
348                                quote_char,
349                                &replaced,
350                                output,
351                            );
352                        } else {
353                            emit_replaced_with_suffix(
354                                raw_key,
355                                cfg.delimiter,
356                                after_delim,
357                                quote_char,
358                                &replaced,
359                                suffix,
360                                output,
361                            );
362                        }
363                        output.push('\n');
364                        return Ok(());
365                    }
366                }
367            }
368            // Try secondary delimiters in order (e.g. `=>` and `:` for Ruby
369            // hash lines like `'aws_access_key_id' => 'KEY',` or
370            // `'client_secret': 'VALUE',`).
371            for &sec_delim in cfg.secondary_delimiters {
372                if let Some(delim_pos) = line.find(sec_delim) {
373                    let raw_key = &line[..delim_pos];
374                    let after_delim = &line[delim_pos + sec_delim.len()..];
375                    // Strip surrounding quotes from the key before matching
376                    // (e.g. `'aws_access_key_id'` → `aws_access_key_id`).
377                    let trimmed_key = raw_key.trim();
378                    let (_, unquoted_key) = detect_quotes(trimmed_key);
379                    if let Some(rule) = find_matching_rule(unquoted_key, cfg.profile) {
380                        let (quote_char, inner, suffix) =
381                            detect_quoted_value_with_suffix(after_delim);
382                        let replaced = replace_value(inner, rule, cfg.store)?;
383                        emit_replaced_with_suffix(
384                            raw_key,
385                            sec_delim,
386                            after_delim,
387                            quote_char,
388                            &replaced,
389                            suffix,
390                            output,
391                        );
392                        output.push('\n');
393                        return Ok(());
394                    } else if let Some(sig) =
395                        find_field_signal(unquoted_key, &cfg.profile.field_name_signals)
396                    {
397                        let (quote_char, inner, suffix) =
398                            detect_quoted_value_with_suffix(after_delim);
399                        if let Some(replaced) = replace_by_signal(inner, sig, cfg.store)? {
400                            emit_replaced_with_suffix(
401                                raw_key,
402                                sec_delim,
403                                after_delim,
404                                quote_char,
405                                &replaced,
406                                suffix,
407                                output,
408                            );
409                            output.push('\n');
410                            return Ok(());
411                        }
412                    }
413                }
414            }
415            output.push_str(line);
416            output.push('\n');
417        }
418    }
419    Ok(())
420}
421
422// ---------------------------------------------------------------------------
423// Comment-body sanitization
424// ---------------------------------------------------------------------------
425
426/// Try to parse and sanitize `body` (the text after the comment prefix on a
427/// commented-out line) as a key-value pair using the same field rules as normal
428/// lines. Returns `Some(sanitized_body)` — without a trailing newline — when a
429/// field rule matched and the value was replaced; `None` when nothing matched
430/// and the line should be preserved verbatim.
431#[allow(clippy::too_many_lines)]
432fn try_sanitize_kv_body(body: &str, cfg: &KvConfig<'_>) -> Result<Option<String>> {
433    let body_trimmed = body.trim_start();
434    let indent_len = body.len() - body_trimmed.len();
435
436    // Try primary delimiter.
437    if let Some(delim_pos) = body_trimmed.find(cfg.delimiter) {
438        let raw_key = &body[..indent_len + delim_pos];
439        let after_delim = &body_trimmed[delim_pos + cfg.delimiter.len()..];
440        let key = body_trimmed[..delim_pos].trim();
441        if let Some(rule) = find_matching_rule(key, cfg.profile) {
442            let raw_value = after_delim.trim();
443            let (quote_char, inner) = detect_quotes(raw_value);
444            let (sanitize_inner, suffix) = match cfg.value_strip_suffix {
445                Some(sfx) if inner.ends_with(sfx) => (&inner[..inner.len() - sfx.len()], sfx),
446                _ => (inner, ""),
447            };
448            let replaced = replace_value(sanitize_inner, rule, cfg.store)?;
449            let mut out = String::new();
450            if suffix.is_empty() {
451                emit_replaced(
452                    raw_key,
453                    cfg.delimiter,
454                    after_delim,
455                    quote_char,
456                    &replaced,
457                    &mut out,
458                );
459            } else {
460                emit_replaced_with_suffix(
461                    raw_key,
462                    cfg.delimiter,
463                    after_delim,
464                    quote_char,
465                    &replaced,
466                    suffix,
467                    &mut out,
468                );
469            }
470            return Ok(Some(out));
471        } else if let Some(sig) = find_field_signal(key, &cfg.profile.field_name_signals) {
472            let raw_value = after_delim.trim();
473            let (quote_char, inner) = detect_quotes(raw_value);
474            let (sanitize_inner, suffix) = match cfg.value_strip_suffix {
475                Some(sfx) if inner.ends_with(sfx) => (&inner[..inner.len() - sfx.len()], sfx),
476                _ => (inner, ""),
477            };
478            if let Some(replaced) = replace_by_signal(sanitize_inner, sig, cfg.store)? {
479                let mut out = String::new();
480                if suffix.is_empty() {
481                    emit_replaced(
482                        raw_key,
483                        cfg.delimiter,
484                        after_delim,
485                        quote_char,
486                        &replaced,
487                        &mut out,
488                    );
489                } else {
490                    emit_replaced_with_suffix(
491                        raw_key,
492                        cfg.delimiter,
493                        after_delim,
494                        quote_char,
495                        &replaced,
496                        suffix,
497                        &mut out,
498                    );
499                }
500                return Ok(Some(out));
501            }
502        }
503    }
504
505    // Try secondary delimiters in order.
506    for &sec_delim in cfg.secondary_delimiters {
507        if let Some(delim_pos) = body.find(sec_delim) {
508            let raw_key = &body[..delim_pos];
509            let after_delim = &body[delim_pos + sec_delim.len()..];
510            let trimmed_key = raw_key.trim();
511            let (_, unquoted_key) = detect_quotes(trimmed_key);
512            if let Some(rule) = find_matching_rule(unquoted_key, cfg.profile) {
513                let (quote_char, inner, suffix) = detect_quoted_value_with_suffix(after_delim);
514                let replaced = replace_value(inner, rule, cfg.store)?;
515                let mut out = String::new();
516                emit_replaced_with_suffix(
517                    raw_key,
518                    sec_delim,
519                    after_delim,
520                    quote_char,
521                    &replaced,
522                    suffix,
523                    &mut out,
524                );
525                return Ok(Some(out));
526            } else if let Some(sig) =
527                find_field_signal(unquoted_key, &cfg.profile.field_name_signals)
528            {
529                let (quote_char, inner, suffix) = detect_quoted_value_with_suffix(after_delim);
530                if let Some(replaced) = replace_by_signal(inner, sig, cfg.store)? {
531                    let mut out = String::new();
532                    emit_replaced_with_suffix(
533                        raw_key,
534                        sec_delim,
535                        after_delim,
536                        quote_char,
537                        &replaced,
538                        suffix,
539                        &mut out,
540                    );
541                    return Ok(Some(out));
542                }
543            }
544        }
545    }
546
547    Ok(None)
548}
549
550// ---------------------------------------------------------------------------
551// Sub-processor dispatch
552// ---------------------------------------------------------------------------
553
554/// Delegate `content` to the processor named in `rule.sub_processor`.
555///
556/// Builds a synthetic [`FileTypeProfile`] from the rule's `sub_fields` and
557/// calls the appropriate built-in processor directly. Returns the processed
558/// content as a `String`.
559fn process_sub_content(content: &str, rule: &FieldRule, store: &MappingStore) -> Result<String> {
560    use super::env_proc::EnvProcessor;
561    use super::ini_proc::IniProcessor;
562    use super::json_proc::JsonProcessor;
563    use super::log_line::LogLineProcessor;
564    use super::toml_proc::TomlProcessor;
565    use super::yaml_proc::YamlProcessor;
566
567    let name = rule
568        .sub_processor
569        .as_deref()
570        .ok_or_else(|| SanitizeError::InvalidConfig("sub_processor not set".into()))?;
571
572    let sub_profile = FileTypeProfile {
573        processor: name.to_owned(),
574        extensions: Vec::new(),
575        include: Vec::new(),
576        exclude: Vec::new(),
577        fields: rule.sub_fields.clone(),
578        options: HashMap::new(),
579        field_name_signals: Vec::new(),
580    };
581
582    let bytes = content.as_bytes();
583    let out = match name {
584        "yaml" => YamlProcessor.process(bytes, &sub_profile, store)?,
585        "json" => JsonProcessor.process(bytes, &sub_profile, store)?,
586        "toml" => TomlProcessor.process(bytes, &sub_profile, store)?,
587        "ini" => IniProcessor.process(bytes, &sub_profile, store)?,
588        "env" => EnvProcessor.process(bytes, &sub_profile, store)?,
589        "log_line" => LogLineProcessor::new().process(bytes, &sub_profile, store)?,
590        other => {
591            return Err(SanitizeError::InvalidConfig(format!(
592                "unknown sub_processor '{other}' — supported: yaml, json, toml, ini, env, log_line"
593            )))
594        }
595    };
596
597    String::from_utf8(out).map_err(|e| {
598        SanitizeError::IoError(std::io::Error::other(format!(
599            "sub-processor output is not UTF-8: {e}"
600        )))
601    })
602}
603
604// ---------------------------------------------------------------------------
605// Heredoc indent helpers
606// ---------------------------------------------------------------------------
607
/// Strip the minimum common leading indentation from a set of heredoc body lines.
///
/// Implements Ruby's `<<~` squiggly-heredoc semantics: empty or whitespace-only
/// lines are ignored when computing the minimum indentation so they don't force
/// the minimum to zero, and they are passed through untouched.
///
/// Indentation is measured and removed in whitespace *characters*, not bytes,
/// so a line indented with a multi-byte whitespace character can never be cut
/// in the middle of a character (which would panic). For ASCII indentation the
/// two measures are identical.
///
/// Returns the lines joined with `'\n'` and the number of leading whitespace
/// characters removed from each non-blank line (needed to re-indent the
/// processed output). When nothing is removed the count is `0`.
fn strip_min_indent(lines: &[String]) -> (String, usize) {
    let min_indent = lines
        .iter()
        .filter(|l| !l.trim().is_empty())
        .map(|l| l.chars().take_while(|c| c.is_whitespace()).count())
        .min()
        .unwrap_or(0);

    if min_indent == 0 {
        return (lines.join("\n"), 0);
    }

    let stripped = lines
        .iter()
        .map(|l| {
            if l.trim().is_empty() {
                l.as_str()
            } else {
                // Byte offset of the first character after `min_indent`
                // leading characters; always a valid char boundary.
                let cut = l
                    .char_indices()
                    .nth(min_indent)
                    .map_or(l.len(), |(offset, _)| offset);
                &l[cut..]
            }
        })
        .collect::<Vec<_>>()
        .join("\n");

    (stripped, min_indent)
}
640
/// Prefix every non-blank line of `content` with `indent` spaces.
///
/// This is the inverse step of [`strip_min_indent`]: it puts back the
/// indentation that was removed before sub-processing. Blank and
/// whitespace-only lines are copied unchanged. Lines are taken from
/// `str::lines`, so a trailing newline is not reproduced and the result is
/// joined with `'\n'`.
fn reindent_content(content: &str, indent: usize) -> String {
    let pad = " ".repeat(indent);
    let mut rebuilt = String::with_capacity(content.len());
    for (idx, row) in content.lines().enumerate() {
        if idx > 0 {
            rebuilt.push('\n');
        }
        if !row.trim().is_empty() {
            rebuilt.push_str(&pad);
        }
        rebuilt.push_str(row);
    }
    rebuilt
}
659
660// ---------------------------------------------------------------------------
661// Helpers
662// ---------------------------------------------------------------------------
663
664/// Reconstruct and append a replaced key-value line to `output`.
665///
666/// Does **not** append a trailing newline; the caller is responsible for that.
667fn emit_replaced(
668    raw_key: &str,
669    delimiter: &str,
670    after_delim: &str,
671    quote_char: Option<char>,
672    value: &str,
673    output: &mut String,
674) {
675    let ws = leading_whitespace(after_delim);
676    output.push_str(raw_key);
677    output.push_str(delimiter);
678    output.push_str(ws);
679    if let Some(q) = quote_char {
680        output.push(q);
681        output.push_str(value);
682        output.push(q);
683    } else {
684        output.push_str(value);
685    }
686}
687
688/// Like [`emit_replaced`] but appends a `suffix` after the closing quote.
689///
690/// Used for secondary-delimiter lines (e.g. Ruby hash `'key' => 'value',`)
691/// where a trailing comma or closing brace must be preserved.
692///
693/// Does **not** append a trailing newline; the caller is responsible for that.
694fn emit_replaced_with_suffix(
695    raw_key: &str,
696    delimiter: &str,
697    after_delim: &str,
698    quote_char: Option<char>,
699    value: &str,
700    suffix: &str,
701    output: &mut String,
702) {
703    let ws = leading_whitespace(after_delim);
704    output.push_str(raw_key);
705    output.push_str(delimiter);
706    output.push_str(ws);
707    if let Some(q) = quote_char {
708        output.push(q);
709        output.push_str(value);
710        output.push(q);
711    } else {
712        output.push_str(value);
713    }
714    output.push_str(suffix);
715}
716
/// Detect a quoted value in `after_delim` and return `(quote_char, inner, suffix)`.
///
/// Unlike [`detect_quotes`], this finds the *first* quoted span after any
/// leading whitespace and captures everything after the closing quote as the
/// suffix (e.g. the comma in a Ruby hash line `=> 'VALUE',`).
///
/// For unquoted values (including a value whose opening quote is never
/// closed) `quote_char` is `None`, `inner` is the text with surrounding
/// whitespace removed, and `suffix` holds the trailing whitespace. Keeping the
/// trailing whitespace (such as the `\r` of a CRLF line) out of `inner` means
/// it is not treated as part of the value being replaced, and it is written
/// back unchanged by the caller.
fn detect_quoted_value_with_suffix(after_delim: &str) -> (Option<char>, &str, &str) {
    let trimmed = after_delim.trim_start();

    if let Some(quote) = trimmed.chars().next().filter(|c| matches!(c, '\'' | '"')) {
        // `close_pos` is relative to the text after the opening quote, so the
        // closing quote sits at byte `close_pos + 1` of `trimmed`.
        if let Some(close_pos) = trimmed[1..].find(quote) {
            let inner = &trimmed[1..=close_pos];
            let suffix = &trimmed[close_pos + 2..];
            return (Some(quote), inner, suffix);
        }
    }

    let value = trimmed.trim_end();
    (None, value, &trimmed[value.len()..])
}
739
/// Look for a Ruby-style heredoc opener (`<<`, `<<-`, `<<~`) inside `value`.
///
/// On success returns `Some((end_marker, strip_indent))`:
/// - `end_marker` is the identifier that terminates the heredoc. It may be
///   written bare (`<<EOS`), single-quoted (`<<'EOS'`) or double-quoted
///   (`<<"EOS"`); the quotes are not part of the returned marker.
/// - `strip_indent` is `true` only for the squiggly form `<<~`, signalling
///   that the caller must strip the minimum leading indentation from the body
///   before sub-processing and re-add it afterward.
///
/// `<<-` (indented end marker allowed) and plain `<<` both yield
/// `strip_indent = false`.
///
/// Returns `None` when `value` contains no `<<`, or when the first `<<` is
/// not followed by a quoted marker or at least one identifier character.
fn detect_heredoc(value: &str) -> Option<(String, bool)> {
    // Only the first `<<` in the value is considered.
    let (_, after_op) = value.split_once("<<")?;

    let (strip_indent, spec) = match after_op.strip_prefix('~') {
        Some(tail) => (true, tail),
        None => (false, after_op.strip_prefix('-').unwrap_or(after_op)),
    };

    // Quoted marker: everything up to the next matching quote, or the rest
    // of the string when the quote is never closed.
    let quoted = ['\'', '"'].iter().find_map(|&q| {
        spec.strip_prefix(q)
            .map(|tail| tail.split(q).next().unwrap_or(tail))
    });

    let marker = match quoted {
        Some(inner) => inner,
        None => {
            // Bare marker: the leading run of alphanumeric / `_` characters.
            let end = spec
                .find(|c: char| !(c.is_alphanumeric() || c == '_'))
                .unwrap_or(spec.len());
            if end == 0 {
                return None;
            }
            &spec[..end]
        }
    };

    Some((marker.to_owned(), strip_indent))
}
780
/// Return the whitespace prefix of `s`, i.e. everything before its first
/// non-whitespace character (the whole string when it is all whitespace).
fn leading_whitespace(s: &str) -> &str {
    let end = s.find(|c: char| !c.is_whitespace()).unwrap_or(s.len());
    &s[..end]
}
787
/// Split a value into its surrounding quote character and inner text.
///
/// Returns `(Some(quote), inner)` when `value` both starts and ends with the
/// same quote character (`"` or `'`) and is at least two bytes long;
/// otherwise returns `(None, value)` unchanged.
fn detect_quotes(value: &str) -> (Option<char>, &str) {
    for &quote in &['"', '\''] {
        // A lone quote character fails here: after stripping the prefix the
        // remainder is empty and has no suffix to strip.
        let inner = value
            .strip_prefix(quote)
            .and_then(|rest| rest.strip_suffix(quote));
        if let Some(inner) = inner {
            return (Some(quote), inner);
        }
    }
    (None, value)
}
799
800// ---------------------------------------------------------------------------
801// Tests
802// ---------------------------------------------------------------------------
803
#[cfg(test)]
mod tests {
    use super::*;
    use crate::category::Category;
    use crate::generator::HmacGenerator;
    use crate::processor::profile::FieldRule;
    use crate::store::MappingStore;
    use std::sync::Arc;

    // ---- shared fixtures ----

    /// Mapping store backed by an HMAC generator with a fixed key.
    fn make_store() -> Arc<MappingStore> {
        let generator = Arc::new(HmacGenerator::new([1u8; 32]));
        Arc::new(MappingStore::new(generator, None))
    }

    /// `key_value` profile carrying the given field rules.
    fn make_profile(fields: Vec<FieldRule>) -> FileTypeProfile {
        FileTypeProfile::new("key_value", fields)
    }

    /// Run the processor over `content` and return the output as a `String`.
    fn process(content: &str, profile: &FileTypeProfile, store: &MappingStore) -> String {
        let sanitized = KeyValueProcessor
            .process(content.as_bytes(), profile, store)
            .unwrap();
        String::from_utf8(sanitized).unwrap()
    }

    /// Field rule for `pattern` tagged with a custom category.
    fn categorized(pattern: &'static str, category: &'static str) -> FieldRule {
        FieldRule::new(pattern).with_category(Category::Custom(category.into()))
    }

    /// Field rule for `pattern` that delegates its value to a sub-processor.
    fn delegating(
        pattern: &'static str,
        sub_processor: &'static str,
        sub_fields: Vec<FieldRule>,
    ) -> FieldRule {
        FieldRule::new(pattern)
            .with_sub_processor(sub_processor)
            .with_sub_fields(sub_fields)
    }

    /// Set a single profile option.
    fn set_option(profile: &mut FileTypeProfile, key: &'static str, value: &'static str) {
        profile.options.insert(key.into(), value.into());
    }

    // ---- basic key = value ----

    #[test]
    fn replaces_matched_key() {
        let store = make_store();
        let profile = make_profile(vec![categorized("password", "password")]);

        let out = process("password = secret123\n", &profile, &store);

        assert!(out.starts_with("password = "));
        assert!(!out.contains("secret123"));
    }

    #[test]
    fn preserves_unmatched_key() {
        let store = make_store();
        let profile = make_profile(vec![FieldRule::new("password")]);
        let input = "host = db.internal\n";

        assert_eq!(process(input, &profile, &store), input);
    }

    #[test]
    fn preserves_quotes() {
        let store = make_store();
        let profile = make_profile(vec![FieldRule::new("password")]);

        let out = process("password = \"secret\"\n", &profile, &store);

        assert!(out.contains('"'));
        assert!(!out.contains("secret"));
    }

    #[test]
    fn preserves_single_quotes() {
        let store = make_store();
        let profile = make_profile(vec![FieldRule::new("key")]);

        let out = process("key = 'value'\n", &profile, &store);

        assert!(out.contains('\''));
        assert!(!out.contains("value"));
    }

    #[test]
    fn preserves_comments_when_no_field_matches() {
        let store = make_store();
        let profile = make_profile(Vec::new());

        let out = process("# this is a comment\nkey = val\n", &profile, &store);

        assert!(out.contains("# this is a comment"));
    }

    #[test]
    fn sanitizes_commented_out_field_by_default() {
        let store = make_store();
        let profile = make_profile(vec![categorized("*password*", "password")]);

        let out = process("# smtp_password = \"hunter2\"\n", &profile, &store);

        assert!(
            out.starts_with("# smtp_password = "),
            "comment prefix preserved: {out}"
        );
        assert!(!out.contains("hunter2"), "secret should be replaced: {out}");
    }

    #[test]
    fn sanitizes_commented_field_secondary_delimiter_arrow() {
        let store = make_store();
        let mut profile = make_profile(vec![categorized("*secret*", "auth_token")]);
        set_option(&mut profile, "secondary_delimiter", "=>,:");

        let out = process(
            "#   'client_secret' => 'THIS-IS-SECRET',\n",
            &profile,
            &store,
        );

        assert!(out.starts_with('#'), "comment prefix preserved: {out}");
        assert!(
            !out.contains("THIS-IS-SECRET"),
            "secret should be replaced: {out}"
        );
    }

    #[test]
    fn sanitizes_commented_field_secondary_delimiter_colon() {
        let store = make_store();
        let mut profile = make_profile(vec![categorized("*secret*", "auth_token")]);
        set_option(&mut profile, "secondary_delimiter", "=>,:");

        let out = process(
            "#   'client_secret': 'THIS-IS-SECRET',\n",
            &profile,
            &store,
        );

        assert!(out.starts_with('#'), "comment prefix preserved: {out}");
        assert!(
            !out.contains("THIS-IS-SECRET"),
            "secret should be replaced: {out}"
        );
    }

    #[test]
    fn ignore_comments_option_preserves_verbatim() {
        let store = make_store();
        let mut profile = make_profile(vec![categorized("*password*", "password")]);
        set_option(&mut profile, "ignore_comments", "true");
        let input = "# smtp_password = \"hunter2\"\n";

        let out = process(input, &profile, &store);

        assert_eq!(
            out, input,
            "with ignore_comments:true the line should be verbatim"
        );
    }

    #[test]
    fn preserves_blank_lines() {
        let store = make_store();
        let profile = make_profile(Vec::new());
        let input = "a = 1\n\nb = 2\n";

        assert_eq!(process(input, &profile, &store), input);
    }

    #[test]
    fn glob_pattern_matches_ruby_bracket_key() {
        let store = make_store();
        let profile = make_profile(vec![categorized("*['smtp_password']", "password")]);

        let out = process(
            "gitlab_rails['smtp_password'] = \"secret\"\n",
            &profile,
            &store,
        );

        assert!(!out.contains("secret"));
        assert!(out.contains('"'));
    }

    // ---- heredoc detection ----

    #[test]
    fn detects_heredoc_single_quoted() {
        let (marker, strip) = detect_heredoc("YAML.load <<-'EOS'").unwrap();
        assert_eq!(marker, "EOS");
        assert!(!strip, "<<- does not strip indent");
    }

    #[test]
    fn detects_heredoc_double_quoted() {
        let (marker, strip) = detect_heredoc("JSON.parse <<-\"END\"").unwrap();
        assert_eq!(marker, "END");
        assert!(!strip);
    }

    #[test]
    fn detects_heredoc_squiggly() {
        let (marker, strip) = detect_heredoc("<<~YAML").unwrap();
        assert_eq!(marker, "YAML");
        assert!(strip, "<<~ must signal strip_indent");
    }

    #[test]
    fn detects_heredoc_no_modifier() {
        let (marker, strip) = detect_heredoc("<<EOS").unwrap();
        assert_eq!(marker, "EOS");
        assert!(!strip);
    }

    #[test]
    fn no_heredoc_for_plain_value() {
        assert!(detect_heredoc("\"smtp.server\"").is_none());
        assert!(detect_heredoc("nil").is_none());
    }

    // ---- sub-processor: yaml heredoc ----

    #[test]
    fn sub_processor_yaml_heredoc() {
        let store = make_store();
        let sub_fields = vec![
            categorized("*.password", "password"),
            categorized("*.bind_dn", "dn"),
        ];
        let profile = make_profile(vec![delegating("*['ldap_servers']", "yaml", sub_fields)]);

        let input = concat!(
            "gitlab_rails['ldap_servers'] = YAML.load <<-'EOS'\n",
            "  main:\n",
            "    bind_dn: 'cn=admin,dc=example,dc=com'\n",
            "    password: 'real-ldap-password'\n",
            "EOS\n",
            "other_key = 'untouched'\n",
        );
        let out = process(input, &profile, &store);

        // The heredoc opener and terminator survive verbatim.
        assert!(out.contains("gitlab_rails['ldap_servers'] = YAML.load <<-'EOS'"));
        assert!(out.contains("EOS"));

        // Both sensitive values are gone.
        assert!(!out.contains("real-ldap-password"));
        assert!(!out.contains("cn=admin,dc=example,dc=com"));

        // The unrelated key is left alone.
        assert!(out.contains("other_key = 'untouched'"));
    }

    #[test]
    fn sub_processor_yaml_heredoc_end_marker_indented() {
        let store = make_store();
        let sub_fields = vec![categorized("*.secret", "s")];
        let profile = make_profile(vec![delegating("config", "yaml", sub_fields)]);

        let input = concat!(
            "config = <<-'EOS'\n",
            "  app:\n",
            "    secret: 'mysecret'\n",
            "  EOS\n",
        );
        let out = process(input, &profile, &store);

        assert!(!out.contains("mysecret"));
        assert!(out.contains("EOS"));
    }

    // ---- sub-processor: <<~ squiggly heredoc strips and restores indent ----

    #[test]
    fn squiggly_heredoc_strips_and_restores_indent() {
        // For `<<~` the minimum indentation is removed before the body is
        // handed to the sub-processor and put back afterwards, so the output
        // keeps the original whitespace structure.
        let store = make_store();
        let sub_fields = vec![categorized("*.password", "password")];
        let profile = make_profile(vec![delegating("*['ldap_servers']", "yaml", sub_fields)]);

        // The body carries a 2-space indent (typical gitlab.rb <<~ usage).
        let input = concat!(
            "gitlab_rails['ldap_servers'] = YAML.load <<~'EOS'\n",
            "  main:\n",
            "    password: 'real-ldap-password'\n",
            "EOS\n",
            "other_key = 'untouched'\n",
        );
        let out = process(input, &profile, &store);

        // The secret is replaced.
        assert!(
            !out.contains("real-ldap-password"),
            "secret must be replaced: {out}"
        );

        // The YAML lines keep their indentation: `main:` must still begin
        // with two spaces.
        let main_line = out
            .lines()
            .find(|line| line.trim_start().starts_with("main:"))
            .expect("main: line must exist in output");
        assert!(
            main_line.starts_with("  "),
            "indentation must be preserved for <<~ heredoc: {out}"
        );

        // Opener and end marker survive verbatim.
        assert!(
            out.contains("<<~'EOS'"),
            "heredoc opener must be preserved: {out}"
        );
        assert!(
            out.contains("\nEOS\n"),
            "end marker must be preserved: {out}"
        );

        // The unrelated key is left alone.
        assert!(out.contains("other_key = 'untouched'"));
    }

    #[test]
    fn squiggly_heredoc_strip_min_indent_ignores_blank_lines() {
        // A blank line between YAML blocks must not drag min_indent down to 0.
        let lines = vec![
            "  key1: val1".to_owned(),
            String::new(), // blank: excluded from the minimum
            "  key2: val2".to_owned(),
        ];

        let (content, indent) = strip_min_indent(&lines);

        assert_eq!(indent, 2);
        assert_eq!(content, "key1: val1\n\nkey2: val2");
    }

    #[test]
    fn reindent_content_roundtrips_strip() {
        let original_lines = vec!["  main:".to_owned(), "    password: replaced".to_owned()];

        let (stripped, indent) = strip_min_indent(&original_lines);
        let restored = reindent_content(&stripped, indent);

        // Every line gets its original indentation back.
        assert!(restored.starts_with("  main:"), "first line: {restored}");
        assert!(
            restored.contains("\n    password:"),
            "second line: {restored}"
        );
    }

    // ---- sub-processor: non-heredoc inline value ----

    #[test]
    fn sub_processor_inline_json_value() {
        let store = make_store();
        let sub_fields = vec![categorized("password", "p")];
        let profile = make_profile(vec![delegating("config", "json", sub_fields)]);

        let out = process("config = {\"password\": \"topsecret\"}\n", &profile, &store);

        assert!(!out.contains("topsecret"));
        assert!(out.starts_with("config = "));
    }

    // ---- sub-processor: unknown name ----

    #[test]
    fn sub_processor_unknown_returns_error() {
        let store = make_store();
        let profile = make_profile(vec![delegating("key", "hcl", vec![])]);
        let input = "key = \"value\"\n";

        let result = KeyValueProcessor.process(input.as_bytes(), &profile, &store);

        assert!(result.is_err());
    }

    // ---- field rule builder ----

    #[test]
    fn field_rule_with_sub_processor() {
        let rule = FieldRule::new("*.data")
            .with_sub_processor("yaml")
            .with_sub_fields(vec![FieldRule::new("*.password")]);

        assert_eq!(rule.sub_processor.as_deref(), Some("yaml"));
        assert_eq!(rule.sub_fields.len(), 1);
    }
}