Skip to main content

sanitize_engine/processor/
key_value.rs

1//! Key-value processor for `gitlab.rb`-style configuration files.
2//!
3//! Handles files with lines of the form:
4//!
5//! ```text
6//! key = "value"
7//! key = 'value'
8//! key = value
9//! # comment lines are preserved
10//! ```
11//!
12//! The delimiter, comment prefix, and quoting style are configurable
13//! via the profile's `options` map.
14//!
15//! # Profile Options
16//!
17//! | Key              | Default | Description                                  |
18//! |------------------|---------|----------------------------------------------|
19//! | `delimiter`           | `"="`   | The key-value separator.                     |
20//! | `secondary_delimiter` | *(none)*| Optional additional delimiter(s) tried when  |
21//! |                       |         | the primary delimiter's key does not match   |
22//! |                       |         | any field rule. Comma-separate multiple      |
23//! |                       |         | values (e.g. `"=>,:"`). Surrounding quotes  |
24//! |                       |         | are stripped from the key before matching,   |
25//! |                       |         | and any suffix after the value (e.g. a      |
26//! |                       |         | trailing `,`) is preserved in the output.    |
27//! |                       |         | Useful for Ruby hash literals that use `=>`  |
28//! |                       |         | or `:` alongside a `=`-delimited file.       |
29//! | `comment_prefix`      | `"#"`   | Lines starting with this (after whitespace)  |
30//! |                  |         | are treated as comments.                     |
31//! | `ignore_comments`     | `false` | When `true`, comment lines are preserved     |
32//! |                  |         | verbatim and never sanitized. By default,    |
33//! |                  |         | field rules are applied to commented-out     |
34//! |                  |         | key-value lines so that secrets left behind  |
35//! |                  |         | in comments are still replaced.              |
36//! | `value_strip_suffix`  | *(none)*| Strip this suffix from value before          |
37//! |                       |         | sanitizing and re-append it afterwards.      |
38//! |                       |         | Use `";"` for nginx-style `key value;` files.|
39//!
40//! # Heredoc / Sub-processor Support
41//!
42//! When a matched field rule has `sub_processor` set and the value is a
43//! Ruby-style heredoc (`<<-'EOS'`, `<<~EOS`, etc.), the processor switches
44//! into collection mode: it accumulates heredoc lines until the end marker,
45//! then delegates the collected content to the named sub-processor using the
46//! rule's `sub_fields`. This allows structured content embedded inside
47//! key-value files (e.g. YAML inside `gitlab.rb`) to be sanitized at the
48//! field level rather than relying solely on the streaming scanner.
49//!
50//! For non-heredoc values with `sub_processor`, the value (after quote
51//! stripping) is passed directly to the sub-processor.
52//!
53//! # Formatting Preservation
54//!
55//! - Blank lines and indentation are preserved verbatim.
56//! - Comment lines are preserved verbatim when no field rule matches their body,
57//!   or when `ignore_comments: true` is set in the profile options.
58//! - The original quoting style (single, double, or unquoted) is kept.
59//! - Whitespace around the delimiter is preserved where possible.
60//! - Heredoc opening and closing marker lines are preserved verbatim.
61
62use crate::error::{Result, SanitizeError};
63use crate::processor::limits::DEFAULT_INPUT_SIZE;
64use crate::processor::profile::FieldRule;
65use crate::processor::{
66    find_field_signal, find_matching_rule, replace_by_signal, replace_value, FileTypeProfile,
67    Processor,
68};
69use crate::store::MappingStore;
70use std::collections::HashMap;
71
72// ---------------------------------------------------------------------------
73// Per-file configuration (constant across all lines in one processing call)
74// ---------------------------------------------------------------------------
75
76/// Bundles the per-file options that are constant for every line in a
77/// `process_line` invocation, so the call site stays readable and adding
78/// new options doesn't widen the function signature.
79struct KvConfig<'a> {
80    delimiter: &'a str,
81    comment_prefix: &'a str,
82    secondary_delimiters: &'a [&'a str],
83    value_strip_suffix: Option<&'a str>,
84    ignore_comments: bool,
85    profile: &'a FileTypeProfile,
86    store: &'a MappingStore,
87}
88
89// ---------------------------------------------------------------------------
90// Internal state machine
91// ---------------------------------------------------------------------------
92
93/// Processing state for the line-by-line loop.
94enum LineState {
95    Normal,
96    /// Collecting lines of a heredoc until `end_marker` is seen.
97    Heredoc {
98        end_marker: String,
99        rule: FieldRule,
100        lines: Vec<String>,
101        /// `true` for `<<~` squiggly heredocs: the minimum leading indentation
102        /// is stripped from the body before passing to the sub-processor, then
103        /// re-added to every output line so the file structure is preserved.
104        strip_indent: bool,
105    },
106}
107
108// ---------------------------------------------------------------------------
109// Processor implementation
110// ---------------------------------------------------------------------------
111
112/// Structured processor for key = value configuration files.
113pub struct KeyValueProcessor;
114
115impl Processor for KeyValueProcessor {
116    fn name(&self) -> &'static str {
117        "key_value"
118    }
119
120    fn can_handle(&self, _content: &[u8], profile: &FileTypeProfile) -> bool {
121        matches!(profile.processor.as_str(), "key_value" | "key-value")
122    }
123
124    fn process(
125        &self,
126        content: &[u8],
127        profile: &FileTypeProfile,
128        store: &MappingStore,
129    ) -> Result<Vec<u8>> {
130        if content.len() > DEFAULT_INPUT_SIZE {
131            return Err(SanitizeError::InputTooLarge {
132                size: content.len(),
133                limit: DEFAULT_INPUT_SIZE,
134            });
135        }
136
137        let text = String::from_utf8_lossy(content);
138        let delimiter = profile.options.get("delimiter").map_or("=", |s| s.as_str());
139        let comment_prefix = profile
140            .options
141            .get("comment_prefix")
142            .map_or("#", |s| s.as_str());
143        let secondary_delimiter_raw = profile
144            .options
145            .get("secondary_delimiter")
146            .map_or("", |s| s.as_str());
147        let secondary_delimiters: Vec<&str> = if secondary_delimiter_raw.is_empty() {
148            vec![]
149        } else {
150            secondary_delimiter_raw.split(',').collect()
151        };
152        let value_strip_suffix = profile
153            .options
154            .get("value_strip_suffix")
155            .map(|s| s.as_str());
156        let ignore_comments = profile
157            .options
158            .get("ignore_comments")
159            .is_some_and(|s| s == "true");
160
161        let cfg = KvConfig {
162            delimiter,
163            comment_prefix,
164            secondary_delimiters: &secondary_delimiters,
165            value_strip_suffix,
166            ignore_comments,
167            profile,
168            store,
169        };
170
171        let mut output = String::with_capacity(text.len());
172        let mut state = LineState::Normal;
173
174        for line in text.split('\n') {
175            process_line(line, &mut state, &mut output, &cfg)?;
176        }
177
178        // Normalise trailing newline: strip all, then re-add exactly one
179        // iff the original ended with one. This corrects the extra '\n'
180        // that split('\n') produces for a trailing-newline input.
181        while output.ends_with('\n') {
182            output.pop();
183        }
184        if text.ends_with('\n') {
185            output.push('\n');
186        }
187
188        Ok(output.into_bytes())
189    }
190}
191
192// ---------------------------------------------------------------------------
193// Per-line processing (extracted to stay within clippy line limit)
194// ---------------------------------------------------------------------------
195
196#[allow(clippy::too_many_lines)]
197fn process_line(
198    line: &str,
199    state: &mut LineState,
200    output: &mut String,
201    cfg: &KvConfig<'_>,
202) -> Result<()> {
203    match state {
204        LineState::Heredoc {
205            ref end_marker,
206            ref rule,
207            ref mut lines,
208            strip_indent,
209        } => {
210            if line.trim() == end_marker.as_str() {
211                // For `<<~` squiggly heredocs, strip the minimum common
212                // indentation before sub-processing (matching Ruby semantics),
213                // then re-add that indentation to every output line so the
214                // file structure is preserved verbatim.
215                let (content, stripped_indent) = if *strip_indent {
216                    strip_min_indent(lines)
217                } else {
218                    (lines.join("\n"), 0)
219                };
220                let processed = process_sub_content(&content, rule, cfg.store)?;
221                let final_content = if *strip_indent && stripped_indent > 0 {
222                    reindent_content(&processed, stripped_indent)
223                } else {
224                    processed
225                };
226                for processed_line in final_content.split('\n') {
227                    output.push_str(processed_line);
228                    output.push('\n');
229                }
230                output.push_str(line);
231                output.push('\n');
232                *state = LineState::Normal;
233            } else {
234                lines.push(line.to_owned());
235            }
236        }
237        LineState::Normal => {
238            let trimmed = line.trim();
239            if trimmed.is_empty() {
240                output.push_str(line);
241                output.push('\n');
242                return Ok(());
243            }
244            if trimmed.starts_with(cfg.comment_prefix) {
245                if !cfg.ignore_comments {
246                    // Find where the comment prefix starts in the original line
247                    // and split into header (everything up to and including the
248                    // prefix) and body (the rest, which may be a key-value pair).
249                    if let Some(prefix_pos) = line.find(cfg.comment_prefix) {
250                        let prefix_end = prefix_pos + cfg.comment_prefix.len();
251                        let comment_header = &line[..prefix_end];
252                        let body = &line[prefix_end..];
253                        if let Some(sanitized_body) = try_sanitize_kv_body(body, cfg)? {
254                            output.push_str(comment_header);
255                            output.push_str(&sanitized_body);
256                            output.push('\n');
257                            return Ok(());
258                        }
259                    }
260                }
261                output.push_str(line);
262                output.push('\n');
263                return Ok(());
264            }
265            // Search for the delimiter in the indent-stripped line so that
266            // indented directives (e.g. nginx `    proxy_pass URL;`) are found
267            // even when the delimiter is a space character.
268            let line_body = line.trim_start();
269            let indent_len = line.len() - line_body.len();
270            if let Some(delim_pos) = line_body.find(cfg.delimiter) {
271                // raw_key preserves leading indent for faithful output reconstruction.
272                let raw_key = &line[..indent_len + delim_pos];
273                let after_delim = &line_body[delim_pos + cfg.delimiter.len()..];
274                let key = line_body[..delim_pos].trim();
275                if let Some(rule) = find_matching_rule(key, cfg.profile) {
276                    if rule.sub_processor.is_some() {
277                        if let Some((marker, strip_indent)) = detect_heredoc(after_delim) {
278                            output.push_str(line);
279                            output.push('\n');
280                            *state = LineState::Heredoc {
281                                end_marker: marker,
282                                rule: rule.clone(),
283                                lines: Vec::new(),
284                                strip_indent,
285                            };
286                            return Ok(());
287                        }
288                        let raw_value = after_delim.trim();
289                        let (quote_char, inner) = detect_quotes(raw_value);
290                        let processed = process_sub_content(inner, rule, cfg.store)?;
291                        emit_replaced(
292                            raw_key,
293                            cfg.delimiter,
294                            after_delim,
295                            quote_char,
296                            &processed,
297                            output,
298                        );
299                        output.push('\n');
300                        return Ok(());
301                    }
302                    let raw_value = after_delim.trim();
303                    let (quote_char, inner) = detect_quotes(raw_value);
304                    let (sanitize_inner, suffix) = match cfg.value_strip_suffix {
305                        Some(sfx) if inner.ends_with(sfx) => {
306                            (&inner[..inner.len() - sfx.len()], sfx)
307                        }
308                        _ => (inner, ""),
309                    };
310                    let replaced = replace_value(sanitize_inner, rule, cfg.store)?;
311                    if suffix.is_empty() {
312                        emit_replaced(
313                            raw_key,
314                            cfg.delimiter,
315                            after_delim,
316                            quote_char,
317                            &replaced,
318                            output,
319                        );
320                    } else {
321                        emit_replaced_with_suffix(
322                            raw_key,
323                            cfg.delimiter,
324                            after_delim,
325                            quote_char,
326                            &replaced,
327                            suffix,
328                            output,
329                        );
330                    }
331                    output.push('\n');
332                    return Ok(());
333                } else if let Some(sig) = find_field_signal(key, &cfg.profile.field_name_signals) {
334                    let raw_value = after_delim.trim();
335                    let (quote_char, inner) = detect_quotes(raw_value);
336                    let (sanitize_inner, suffix) = match cfg.value_strip_suffix {
337                        Some(sfx) if inner.ends_with(sfx) => {
338                            (&inner[..inner.len() - sfx.len()], sfx)
339                        }
340                        _ => (inner, ""),
341                    };
342                    if let Some(replaced) = replace_by_signal(sanitize_inner, sig, cfg.store)? {
343                        if suffix.is_empty() {
344                            emit_replaced(
345                                raw_key,
346                                cfg.delimiter,
347                                after_delim,
348                                quote_char,
349                                &replaced,
350                                output,
351                            );
352                        } else {
353                            emit_replaced_with_suffix(
354                                raw_key,
355                                cfg.delimiter,
356                                after_delim,
357                                quote_char,
358                                &replaced,
359                                suffix,
360                                output,
361                            );
362                        }
363                        output.push('\n');
364                        return Ok(());
365                    }
366                }
367            }
368            // Try secondary delimiters in order (e.g. `=>` and `:` for Ruby
369            // hash lines like `'aws_access_key_id' => 'KEY',` or
370            // `'client_secret': 'VALUE',`).
371            for &sec_delim in cfg.secondary_delimiters {
372                if let Some(delim_pos) = line.find(sec_delim) {
373                    let raw_key = &line[..delim_pos];
374                    let after_delim = &line[delim_pos + sec_delim.len()..];
375                    // Strip surrounding quotes from the key before matching
376                    // (e.g. `'aws_access_key_id'` → `aws_access_key_id`).
377                    let trimmed_key = raw_key.trim();
378                    let (_, unquoted_key) = detect_quotes(trimmed_key);
379                    if let Some(rule) = find_matching_rule(unquoted_key, cfg.profile) {
380                        let (quote_char, inner, suffix) =
381                            detect_quoted_value_with_suffix(after_delim);
382                        let replaced = replace_value(inner, rule, cfg.store)?;
383                        emit_replaced_with_suffix(
384                            raw_key,
385                            sec_delim,
386                            after_delim,
387                            quote_char,
388                            &replaced,
389                            suffix,
390                            output,
391                        );
392                        output.push('\n');
393                        return Ok(());
394                    } else if let Some(sig) =
395                        find_field_signal(unquoted_key, &cfg.profile.field_name_signals)
396                    {
397                        let (quote_char, inner, suffix) =
398                            detect_quoted_value_with_suffix(after_delim);
399                        if let Some(replaced) = replace_by_signal(inner, sig, cfg.store)? {
400                            emit_replaced_with_suffix(
401                                raw_key,
402                                sec_delim,
403                                after_delim,
404                                quote_char,
405                                &replaced,
406                                suffix,
407                                output,
408                            );
409                            output.push('\n');
410                            return Ok(());
411                        }
412                    }
413                }
414            }
415            output.push_str(line);
416            output.push('\n');
417        }
418    }
419    Ok(())
420}
421
422// ---------------------------------------------------------------------------
423// Comment-body sanitization
424// ---------------------------------------------------------------------------
425
426/// Try to parse and sanitize `body` (the text after the comment prefix on a
427/// commented-out line) as a key-value pair using the same field rules as normal
428/// lines. Returns `Some(sanitized_body)` — without a trailing newline — when a
429/// field rule matched and the value was replaced; `None` when nothing matched
430/// and the line should be preserved verbatim.
431#[allow(clippy::too_many_lines)]
432fn try_sanitize_kv_body(body: &str, cfg: &KvConfig<'_>) -> Result<Option<String>> {
433    let body_trimmed = body.trim_start();
434    let indent_len = body.len() - body_trimmed.len();
435
436    // Try primary delimiter.
437    if let Some(delim_pos) = body_trimmed.find(cfg.delimiter) {
438        let raw_key = &body[..indent_len + delim_pos];
439        let after_delim = &body_trimmed[delim_pos + cfg.delimiter.len()..];
440        let key = body_trimmed[..delim_pos].trim();
441        if let Some(rule) = find_matching_rule(key, cfg.profile) {
442            let raw_value = after_delim.trim();
443            let (quote_char, inner) = detect_quotes(raw_value);
444            let (sanitize_inner, suffix) = match cfg.value_strip_suffix {
445                Some(sfx) if inner.ends_with(sfx) => (&inner[..inner.len() - sfx.len()], sfx),
446                _ => (inner, ""),
447            };
448            let replaced = replace_value(sanitize_inner, rule, cfg.store)?;
449            let mut out = String::new();
450            if suffix.is_empty() {
451                emit_replaced(
452                    raw_key,
453                    cfg.delimiter,
454                    after_delim,
455                    quote_char,
456                    &replaced,
457                    &mut out,
458                );
459            } else {
460                emit_replaced_with_suffix(
461                    raw_key,
462                    cfg.delimiter,
463                    after_delim,
464                    quote_char,
465                    &replaced,
466                    suffix,
467                    &mut out,
468                );
469            }
470            return Ok(Some(out));
471        } else if let Some(sig) = find_field_signal(key, &cfg.profile.field_name_signals) {
472            let raw_value = after_delim.trim();
473            let (quote_char, inner) = detect_quotes(raw_value);
474            let (sanitize_inner, suffix) = match cfg.value_strip_suffix {
475                Some(sfx) if inner.ends_with(sfx) => (&inner[..inner.len() - sfx.len()], sfx),
476                _ => (inner, ""),
477            };
478            if let Some(replaced) = replace_by_signal(sanitize_inner, sig, cfg.store)? {
479                let mut out = String::new();
480                if suffix.is_empty() {
481                    emit_replaced(
482                        raw_key,
483                        cfg.delimiter,
484                        after_delim,
485                        quote_char,
486                        &replaced,
487                        &mut out,
488                    );
489                } else {
490                    emit_replaced_with_suffix(
491                        raw_key,
492                        cfg.delimiter,
493                        after_delim,
494                        quote_char,
495                        &replaced,
496                        suffix,
497                        &mut out,
498                    );
499                }
500                return Ok(Some(out));
501            }
502        }
503    }
504
505    // Try secondary delimiters in order.
506    for &sec_delim in cfg.secondary_delimiters {
507        if let Some(delim_pos) = body.find(sec_delim) {
508            let raw_key = &body[..delim_pos];
509            let after_delim = &body[delim_pos + sec_delim.len()..];
510            let trimmed_key = raw_key.trim();
511            let (_, unquoted_key) = detect_quotes(trimmed_key);
512            if let Some(rule) = find_matching_rule(unquoted_key, cfg.profile) {
513                let (quote_char, inner, suffix) = detect_quoted_value_with_suffix(after_delim);
514                let replaced = replace_value(inner, rule, cfg.store)?;
515                let mut out = String::new();
516                emit_replaced_with_suffix(
517                    raw_key,
518                    sec_delim,
519                    after_delim,
520                    quote_char,
521                    &replaced,
522                    suffix,
523                    &mut out,
524                );
525                return Ok(Some(out));
526            } else if let Some(sig) =
527                find_field_signal(unquoted_key, &cfg.profile.field_name_signals)
528            {
529                let (quote_char, inner, suffix) = detect_quoted_value_with_suffix(after_delim);
530                if let Some(replaced) = replace_by_signal(inner, sig, cfg.store)? {
531                    let mut out = String::new();
532                    emit_replaced_with_suffix(
533                        raw_key,
534                        sec_delim,
535                        after_delim,
536                        quote_char,
537                        &replaced,
538                        suffix,
539                        &mut out,
540                    );
541                    return Ok(Some(out));
542                }
543            }
544        }
545    }
546
547    Ok(None)
548}
549
550// ---------------------------------------------------------------------------
551// Sub-processor dispatch
552// ---------------------------------------------------------------------------
553
554/// Delegate `content` to the processor named in `rule.sub_processor`.
555///
556/// Builds a synthetic [`FileTypeProfile`] from the rule's `sub_fields` and
557/// calls the appropriate built-in processor directly. Returns the processed
558/// content as a `String`.
559fn process_sub_content(content: &str, rule: &FieldRule, store: &MappingStore) -> Result<String> {
560    use super::env_proc::EnvProcessor;
561    use super::ini_proc::IniProcessor;
562    use super::json_proc::JsonProcessor;
563    use super::log_line::LogLineProcessor;
564    use super::toml_proc::TomlProcessor;
565    use super::yaml_proc::YamlProcessor;
566
567    let name = rule
568        .sub_processor
569        .as_deref()
570        .ok_or_else(|| SanitizeError::InvalidConfig("sub_processor not set".into()))?;
571
572    let sub_profile = FileTypeProfile {
573        processor: name.to_owned(),
574        extensions: Vec::new(),
575        include: Vec::new(),
576        exclude: Vec::new(),
577        fields: rule.sub_fields.clone(),
578        options: HashMap::new(),
579        field_name_signals: Vec::new(),
580    };
581
582    let bytes = content.as_bytes();
583    let out = match name {
584        "yaml" => YamlProcessor.process(bytes, &sub_profile, store)?,
585        "json" => JsonProcessor.process(bytes, &sub_profile, store)?,
586        "toml" => TomlProcessor.process(bytes, &sub_profile, store)?,
587        "ini" => IniProcessor.process(bytes, &sub_profile, store)?,
588        "env" => EnvProcessor.process(bytes, &sub_profile, store)?,
589        "log_line" => LogLineProcessor::new().process(bytes, &sub_profile, store)?,
590        other => {
591            return Err(SanitizeError::InvalidConfig(format!(
592                "unknown sub_processor '{other}' — supported: yaml, json, toml, ini, env, log_line"
593            )))
594        }
595    };
596
597    String::from_utf8(out)
598        .map_err(|e| SanitizeError::IoError(format!("sub-processor output is not UTF-8: {e}")))
599}
600
601// ---------------------------------------------------------------------------
602// Heredoc indent helpers
603// ---------------------------------------------------------------------------
604
/// Strip the minimum common leading indentation from heredoc body lines.
///
/// Follows the `<<~` squiggly-heredoc idea: empty or whitespace-only lines
/// are ignored when computing the minimum so they do not force it to zero,
/// and they are passed through untouched.
///
/// Only ASCII spaces and tabs count as indentation. Each counts as one
/// byte, so the returned width is always a valid slice offset, even when a
/// line begins with multi-byte Unicode whitespace.
///
/// Returns the lines joined with `'\n'` and the number of leading bytes
/// removed from every non-blank line (needed to re-indent the processed
/// output).
fn strip_min_indent(lines: &[String]) -> (String, usize) {
    /// Byte length of the leading run of ASCII spaces and tabs.
    fn indent_width(line: &str) -> usize {
        line.len() - line.trim_start_matches(|c: char| c == ' ' || c == '\t').len()
    }

    let min_indent = lines
        .iter()
        .filter(|l| !l.trim().is_empty())
        .map(|l| indent_width(l))
        .min()
        .unwrap_or(0);

    if min_indent == 0 {
        return (lines.join("\n"), 0);
    }

    let stripped = lines
        .iter()
        .map(|l| {
            if l.trim().is_empty() {
                l.as_str()
            } else {
                // Safe: every non-blank line starts with at least
                // `min_indent` single-byte indentation characters.
                &l[min_indent..]
            }
        })
        .collect::<Vec<_>>()
        .join("\n");

    (stripped, min_indent)
}
637
/// Re-indent every non-blank line of `content` by prepending `indent` spaces.
///
/// Used to restore the indentation removed by [`strip_min_indent`] before
/// the content is written back into the heredoc body.
///
/// The content is split on `'\n'` only, so the line structure is preserved
/// exactly: a trailing newline (i.e. a final empty line) is kept, and `'\r'`
/// characters of CRLF input stay in place. Empty and whitespace-only lines
/// are left untouched.
fn reindent_content(content: &str, indent: usize) -> String {
    let prefix = " ".repeat(indent);
    let mut out = String::with_capacity(content.len() + prefix.len());
    for (idx, line) in content.split('\n').enumerate() {
        if idx > 0 {
            out.push('\n');
        }
        if !line.trim().is_empty() {
            out.push_str(&prefix);
        }
        out.push_str(line);
    }
    out
}
656
657// ---------------------------------------------------------------------------
658// Helpers
659// ---------------------------------------------------------------------------
660
661/// Reconstruct and append a replaced key-value line to `output`.
662///
663/// Does **not** append a trailing newline; the caller is responsible for that.
664fn emit_replaced(
665    raw_key: &str,
666    delimiter: &str,
667    after_delim: &str,
668    quote_char: Option<char>,
669    value: &str,
670    output: &mut String,
671) {
672    let ws = leading_whitespace(after_delim);
673    output.push_str(raw_key);
674    output.push_str(delimiter);
675    output.push_str(ws);
676    if let Some(q) = quote_char {
677        output.push(q);
678        output.push_str(value);
679        output.push(q);
680    } else {
681        output.push_str(value);
682    }
683}
684
685/// Like [`emit_replaced`] but appends a `suffix` after the closing quote.
686///
687/// Used for secondary-delimiter lines (e.g. Ruby hash `'key' => 'value',`)
688/// where a trailing comma or closing brace must be preserved.
689///
690/// Does **not** append a trailing newline; the caller is responsible for that.
691fn emit_replaced_with_suffix(
692    raw_key: &str,
693    delimiter: &str,
694    after_delim: &str,
695    quote_char: Option<char>,
696    value: &str,
697    suffix: &str,
698    output: &mut String,
699) {
700    let ws = leading_whitespace(after_delim);
701    output.push_str(raw_key);
702    output.push_str(delimiter);
703    output.push_str(ws);
704    if let Some(q) = quote_char {
705        output.push(q);
706        output.push_str(value);
707        output.push(q);
708    } else {
709        output.push_str(value);
710    }
711    output.push_str(suffix);
712}
713
/// Detect a quoted value in `after_delim` and return `(quote_char, inner, suffix)`.
///
/// Unlike [`detect_quotes`], this finds the *first* quoted span after any
/// leading whitespace and captures whatever follows the closing quote as
/// `suffix` (e.g. the comma in a Ruby hash line `=> 'VALUE',`).
///
/// A quote character preceded by a backslash (`\'`, `\"`) does not close the
/// span. Without this, a value such as `"a\"b"` would be cut at the escaped
/// quote and the remainder of the secret would be emitted as "suffix".
/// The escape sequences themselves are returned unmodified inside `inner`.
///
/// For unquoted values, or when no closing quote is found, the whole
/// left-trimmed string is returned as `inner` with `None` as the quote
/// character and an empty suffix.
fn detect_quoted_value_with_suffix(after_delim: &str) -> (Option<char>, &str, &str) {
    let trimmed = after_delim.trim_start();
    let bytes = trimmed.as_bytes();
    let quote = match bytes.first() {
        Some(&b @ (b'\'' | b'"')) => b,
        _ => return (None, trimmed, ""),
    };

    // Byte-wise scan is UTF-8 safe: the quote and backslash are ASCII and
    // never occur inside a multi-byte sequence.
    let mut idx = 1;
    while idx < bytes.len() {
        match bytes[idx] {
            // Skip the escaped byte so `\"` / `\'` / `\\` cannot close the span.
            b'\\' => idx += 2,
            b if b == quote => {
                return (Some(quote as char), &trimmed[1..idx], &trimmed[idx + 1..]);
            }
            _ => idx += 1,
        }
    }
    (None, trimmed, "")
}
736
/// Look for a Ruby-style heredoc opener (`<<`, `<<-` or `<<~`) in `value`.
///
/// On success returns `Some((end_marker, strip_indent))`:
/// - `end_marker` — the identifier that terminates the heredoc.
/// - `strip_indent` — `true` only for the squiggly form `<<~`, signalling
///   that the caller should remove the body's common leading indentation
///   before sub-processing and put it back afterwards.
///
/// `<<-` (indented end marker allowed) and plain `<<` both yield
/// `strip_indent = false`.
///
/// Only the first `<<` in `value` is examined. The marker may be wrapped in
/// single or double quotes, in which case it runs up to the matching quote
/// (or to the end of the string if the quote is never closed). An unquoted
/// marker consists of alphanumeric characters and `_`; if none follow the
/// opener, `None` is returned.
fn detect_heredoc(value: &str) -> Option<(String, bool)> {
    let (_, after_op) = value.split_once("<<")?;

    // Optional modifier: `~` requests indentation stripping, `-` does not.
    let (strip_indent, spec) = match after_op.strip_prefix('~') {
        Some(tail) => (true, tail),
        None => (false, after_op.strip_prefix('-').unwrap_or(after_op)),
    };

    let mut spec_chars = spec.chars();
    let marker: String = match spec_chars.next() {
        // Quoted marker: everything up to the matching quote.
        Some(quote @ ('\'' | '"')) => spec_chars.take_while(|&c| c != quote).collect(),
        // Bare marker: stop at the first character that is not part of an
        // identifier.
        _ => {
            let bare: String = spec
                .chars()
                .take_while(|&c| c == '_' || c.is_alphanumeric())
                .collect();
            if bare.is_empty() {
                return None;
            }
            bare
        }
    };

    Some((marker, strip_indent))
}
777
/// Return the run of whitespace at the start of `s`.
///
/// The result is a prefix slice of `s`: empty when `s` begins with a
/// non-whitespace character, and all of `s` when it contains nothing but
/// whitespace.
fn leading_whitespace(s: &str) -> &str {
    match s.find(|c: char| !c.is_whitespace()) {
        Some(first_visible) => &s[..first_visible],
        None => s,
    }
}
784
/// Split a value into its surrounding quote character and inner text.
///
/// When `value` both starts and ends with the same quote character (`"` or
/// `'`) and is at least two bytes long, returns `(Some(quote), inner)` with
/// the quotes removed. Otherwise returns `(None, value)` unchanged.
fn detect_quotes(value: &str) -> (Option<char>, &str) {
    for quote in ['"', '\''] {
        // Stripping the prefix first means a lone quote character can never
        // be counted as both the opening and the closing quote.
        let unwrapped = value
            .strip_prefix(quote)
            .and_then(|rest| rest.strip_suffix(quote));
        if let Some(inner) = unwrapped {
            return (Some(quote), inner);
        }
    }
    (None, value)
}
796
797// ---------------------------------------------------------------------------
798// Tests
799// ---------------------------------------------------------------------------
800
#[cfg(test)]
mod tests {
    use super::*;
    use crate::category::Category;
    use crate::generator::HmacGenerator;
    use crate::processor::profile::FieldRule;
    use crate::store::MappingStore;
    use std::sync::Arc;

    /// Mapping store backed by an `HmacGenerator` keyed with a fixed
    /// 32-byte key.
    fn make_store() -> Arc<MappingStore> {
        let hmac = Arc::new(HmacGenerator::new([1u8; 32]));
        Arc::new(MappingStore::new(hmac, None))
    }

    /// Profile for the `key_value` processor with the given field rules.
    fn make_profile(fields: Vec<FieldRule>) -> FileTypeProfile {
        FileTypeProfile::new("key_value", fields)
    }

    /// Shorthand for `Category::Custom` built from a literal name.
    fn custom_category(name: &'static str) -> Category {
        Category::Custom(name.into())
    }

    /// Runs `KeyValueProcessor` over `content` and returns the result as a
    /// `String`, panicking on a processing error or non-UTF-8 output.
    fn process(content: &str, profile: &FileTypeProfile, store: &MappingStore) -> String {
        let sanitized_bytes = KeyValueProcessor
            .process(content.as_bytes(), profile, store)
            .unwrap();
        String::from_utf8(sanitized_bytes).unwrap()
    }

    // ---- basic key = value ----

    /// A key matching a field rule has its value replaced; key and
    /// delimiter stay in place.
    #[test]
    fn replaces_matched_key() {
        let mappings = make_store();
        let rule = FieldRule::new("password").with_category(custom_category("password"));
        let cfg = make_profile(vec![rule]);
        let out = process("password = secret123\n", &cfg, &mappings);
        assert!(out.starts_with("password = "));
        assert!(!out.contains("secret123"));
    }

    /// A key with no matching rule passes through untouched.
    #[test]
    fn preserves_unmatched_key() {
        let mappings = make_store();
        let cfg = make_profile(vec![FieldRule::new("password")]);
        let text = "host = db.internal\n";
        let out = process(text, &cfg, &mappings);
        assert_eq!(out, text);
    }

    /// Double quotes around a replaced value are kept.
    #[test]
    fn preserves_quotes() {
        let mappings = make_store();
        let cfg = make_profile(vec![FieldRule::new("password")]);
        let out = process("password = \"secret\"\n", &cfg, &mappings);
        assert!(out.contains('"'));
        assert!(!out.contains("secret"));
    }

    /// Single quotes around a replaced value are kept.
    #[test]
    fn preserves_single_quotes() {
        let mappings = make_store();
        let cfg = make_profile(vec![FieldRule::new("key")]);
        let out = process("key = 'value'\n", &cfg, &mappings);
        assert!(out.contains('\''));
        assert!(!out.contains("value"));
    }

    /// With no field rules, a comment line is still present in the output.
    #[test]
    fn preserves_comments_when_no_field_matches() {
        let mappings = make_store();
        let cfg = make_profile(vec![]);
        let out = process("# this is a comment\nkey = val\n", &cfg, &mappings);
        assert!(out.contains("# this is a comment"));
    }

    /// By default a commented-out key-value line is sanitized too.
    #[test]
    fn sanitizes_commented_out_field_by_default() {
        let mappings = make_store();
        let rule = FieldRule::new("*password*").with_category(custom_category("password"));
        let cfg = make_profile(vec![rule]);
        let out = process("# smtp_password = \"hunter2\"\n", &cfg, &mappings);
        assert!(
            out.starts_with("# smtp_password = "),
            "comment prefix preserved: {out}"
        );
        assert!(!out.contains("hunter2"), "secret should be replaced: {out}");
    }

    /// Commented-out Ruby hash entry using the `=>` secondary delimiter.
    #[test]
    fn sanitizes_commented_field_secondary_delimiter_arrow() {
        let mappings = make_store();
        let rule = FieldRule::new("*secret*").with_category(custom_category("auth_token"));
        let mut cfg = make_profile(vec![rule]);
        cfg.options
            .insert("secondary_delimiter".into(), "=>,:".into());
        let out = process("#   'client_secret' => 'THIS-IS-SECRET',\n", &cfg, &mappings);
        assert!(out.starts_with('#'), "comment prefix preserved: {out}");
        assert!(
            !out.contains("THIS-IS-SECRET"),
            "secret should be replaced: {out}"
        );
    }

    /// Commented-out hash entry using the `:` secondary delimiter.
    #[test]
    fn sanitizes_commented_field_secondary_delimiter_colon() {
        let mappings = make_store();
        let rule = FieldRule::new("*secret*").with_category(custom_category("auth_token"));
        let mut cfg = make_profile(vec![rule]);
        cfg.options
            .insert("secondary_delimiter".into(), "=>,:".into());
        let out = process("#   'client_secret': 'THIS-IS-SECRET',\n", &cfg, &mappings);
        assert!(out.starts_with('#'), "comment prefix preserved: {out}");
        assert!(
            !out.contains("THIS-IS-SECRET"),
            "secret should be replaced: {out}"
        );
    }

    /// `ignore_comments = true` leaves comment lines byte-for-byte intact.
    #[test]
    fn ignore_comments_option_preserves_verbatim() {
        let mappings = make_store();
        let rule = FieldRule::new("*password*").with_category(custom_category("password"));
        let mut cfg = make_profile(vec![rule]);
        cfg.options
            .insert("ignore_comments".into(), "true".into());
        let text = "# smtp_password = \"hunter2\"\n";
        let out = process(text, &cfg, &mappings);
        assert_eq!(
            out, text,
            "with ignore_comments:true the line should be verbatim"
        );
    }

    /// Empty lines between entries survive processing.
    #[test]
    fn preserves_blank_lines() {
        let mappings = make_store();
        let cfg = make_profile(vec![]);
        let text = "a = 1\n\nb = 2\n";
        let out = process(text, &cfg, &mappings);
        assert_eq!(out, text);
    }

    /// A glob rule matches a Ruby bracket-style key such as
    /// `gitlab_rails['smtp_password']`.
    #[test]
    fn glob_pattern_matches_ruby_bracket_key() {
        let mappings = make_store();
        let rule =
            FieldRule::new("*['smtp_password']").with_category(custom_category("password"));
        let cfg = make_profile(vec![rule]);
        let out = process(
            "gitlab_rails['smtp_password'] = \"secret\"\n",
            &cfg,
            &mappings,
        );
        assert!(!out.contains("secret"));
        assert!(out.contains('"'));
    }

    // ---- heredoc detection ----

    /// `<<-'EOS'`: single-quoted marker, no indent stripping.
    #[test]
    fn detects_heredoc_single_quoted() {
        let (end_marker, strips_indent) = detect_heredoc("YAML.load <<-'EOS'").unwrap();
        assert_eq!(end_marker, "EOS");
        assert!(!strips_indent, "<<- does not strip indent");
    }

    /// `<<-"END"`: double-quoted marker, no indent stripping.
    #[test]
    fn detects_heredoc_double_quoted() {
        let (end_marker, strips_indent) = detect_heredoc("JSON.parse <<-\"END\"").unwrap();
        assert_eq!(end_marker, "END");
        assert!(!strips_indent);
    }

    /// `<<~YAML`: squiggly form requests indent stripping.
    #[test]
    fn detects_heredoc_squiggly() {
        let (end_marker, strips_indent) = detect_heredoc("<<~YAML").unwrap();
        assert_eq!(end_marker, "YAML");
        assert!(strips_indent, "<<~ must signal strip_indent");
    }

    /// `<<EOS`: bare opener, no indent stripping.
    #[test]
    fn detects_heredoc_no_modifier() {
        let (end_marker, strips_indent) = detect_heredoc("<<EOS").unwrap();
        assert_eq!(end_marker, "EOS");
        assert!(!strips_indent);
    }

    /// Values without `<<` are not heredocs.
    #[test]
    fn no_heredoc_for_plain_value() {
        assert!(detect_heredoc("\"smtp.server\"").is_none());
        assert!(detect_heredoc("nil").is_none());
    }

    // ---- sub-processor: yaml heredoc ----

    /// A YAML heredoc body is handed to the `yaml` sub-processor with the
    /// rule's sub-fields.
    #[test]
    fn sub_processor_yaml_heredoc() {
        let mappings = make_store();
        let nested_rules = vec![
            FieldRule::new("*.password").with_category(custom_category("password")),
            FieldRule::new("*.bind_dn").with_category(custom_category("dn")),
        ];
        let ldap_rule = FieldRule::new("*['ldap_servers']")
            .with_sub_processor("yaml")
            .with_sub_fields(nested_rules);
        let cfg = make_profile(vec![ldap_rule]);

        let text = "\
gitlab_rails['ldap_servers'] = YAML.load <<-'EOS'
  main:
    bind_dn: 'cn=admin,dc=example,dc=com'
    password: 'real-ldap-password'
EOS
other_key = 'untouched'
";
        let out = process(text, &cfg, &mappings);

        // Opening and closing lines preserved verbatim.
        assert!(out.contains("gitlab_rails['ldap_servers'] = YAML.load <<-'EOS'"));
        assert!(out.contains("EOS"));

        // Sensitive values replaced.
        assert!(!out.contains("real-ldap-password"));
        assert!(!out.contains("cn=admin,dc=example,dc=com"));

        // Unrelated key untouched.
        assert!(out.contains("other_key = 'untouched'"));
    }

    /// A `<<-` heredoc whose end marker is indented is still recognised.
    #[test]
    fn sub_processor_yaml_heredoc_end_marker_indented() {
        let mappings = make_store();
        let nested_rules = vec![FieldRule::new("*.secret").with_category(custom_category("s"))];
        let config_rule = FieldRule::new("config")
            .with_sub_processor("yaml")
            .with_sub_fields(nested_rules);
        let cfg = make_profile(vec![config_rule]);

        let text = "\
config = <<-'EOS'
  app:
    secret: 'mysecret'
  EOS
";
        let out = process(text, &cfg, &mappings);
        assert!(!out.contains("mysecret"));
        assert!(out.contains("EOS"));
    }

    // ---- sub-processor: <<~ squiggly heredoc strips and restores indent ----

    /// `<<~` strips the minimum indentation before sub-processing and
    /// re-adds it afterward, so the output keeps the original whitespace
    /// structure.
    #[test]
    fn squiggly_heredoc_strips_and_restores_indent() {
        let mappings = make_store();
        let nested_rules =
            vec![FieldRule::new("*.password").with_category(custom_category("password"))];
        let ldap_rule = FieldRule::new("*['ldap_servers']")
            .with_sub_processor("yaml")
            .with_sub_fields(nested_rules);
        let cfg = make_profile(vec![ldap_rule]);

        // Body is indented by 2 spaces (typical gitlab.rb <<~ usage).
        let text = "\
gitlab_rails['ldap_servers'] = YAML.load <<~'EOS'
  main:
    password: 'real-ldap-password'
EOS
other_key = 'untouched'
";
        let out = process(text, &cfg, &mappings);

        // Secret is replaced.
        assert!(
            !out.contains("real-ldap-password"),
            "secret must be replaced: {out}"
        );

        // The `main:` line must still carry its two-space indentation.
        let main_line = out
            .lines()
            .find(|line| line.trim_start().starts_with("main:"))
            .expect("main: line must exist in output");
        assert!(
            main_line.starts_with("  "),
            "indentation must be preserved for <<~ heredoc: {out}"
        );

        // Opener and end marker preserved verbatim.
        assert!(
            out.contains("<<~'EOS'"),
            "heredoc opener must be preserved: {out}"
        );
        assert!(
            out.contains("\nEOS\n"),
            "end marker must be preserved: {out}"
        );

        // Unrelated key untouched.
        assert!(out.contains("other_key = 'untouched'"));
    }

    /// Blank lines between YAML blocks must not force the minimum indent
    /// down to zero.
    #[test]
    fn squiggly_heredoc_strip_min_indent_ignores_blank_lines() {
        // The middle entry is blank and is ignored when computing the minimum.
        let body_lines: Vec<String> = ["  key1: val1", "", "  key2: val2"]
            .iter()
            .map(|line| (*line).to_owned())
            .collect();
        let (content, indent) = strip_min_indent(&body_lines);
        assert_eq!(indent, 2);
        assert_eq!(content, "key1: val1\n\nkey2: val2");
    }

    /// `reindent_content` undoes `strip_min_indent`.
    #[test]
    fn reindent_content_roundtrips_strip() {
        let original_lines = vec!["  main:".to_owned(), "    password: replaced".to_owned()];
        let (stripped, indent) = strip_min_indent(&original_lines);
        let restored = reindent_content(&stripped, indent);
        // Each line should start with the original indentation again.
        assert!(restored.starts_with("  main:"), "first line: {restored}");
        assert!(
            restored.contains("\n    password:"),
            "second line: {restored}"
        );
    }

    // ---- sub-processor: non-heredoc inline value ----

    /// An inline (single-line) JSON value is routed through the `json`
    /// sub-processor.
    #[test]
    fn sub_processor_inline_json_value() {
        let mappings = make_store();
        let nested_rules = vec![FieldRule::new("password").with_category(custom_category("p"))];
        let config_rule = FieldRule::new("config")
            .with_sub_processor("json")
            .with_sub_fields(nested_rules);
        let cfg = make_profile(vec![config_rule]);

        let out = process("config = {\"password\": \"topsecret\"}\n", &cfg, &mappings);
        assert!(!out.contains("topsecret"));
        assert!(out.starts_with("config = "));
    }

    // ---- sub-processor: unknown name ----

    /// Naming a sub-processor that does not exist is reported as an error.
    #[test]
    fn sub_processor_unknown_returns_error() {
        let mappings = make_store();
        let unknown_rule = FieldRule::new("key")
            .with_sub_processor("hcl")
            .with_sub_fields(vec![]);
        let cfg = make_profile(vec![unknown_rule]);
        let text = "key = \"value\"\n";
        let result = KeyValueProcessor.process(text.as_bytes(), &cfg, &mappings);
        assert!(result.is_err());
    }

    // ---- field rule builder ----

    /// The builder records both the sub-processor name and the sub-fields.
    #[test]
    fn field_rule_with_sub_processor() {
        let rule = FieldRule::new("*.data")
            .with_sub_processor("yaml")
            .with_sub_fields(vec![FieldRule::new("*.password")]);
        assert_eq!(rule.sub_processor.as_deref(), Some("yaml"));
        assert_eq!(rule.sub_fields.len(), 1);
    }
}