Skip to main content

diffguard_domain/
preprocess.rs

1use std::fmt;
2use std::str::FromStr;
3
/// Source languages the preprocessor can be configured for.
///
/// The variant decides which comment and string conventions are applied when
/// a line is masked. [`Language::Unknown`] is the `Default` value.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum Language {
    Rust,
    Python,
    JavaScript,
    TypeScript,
    Go,
    Ruby,
    C,
    Cpp,
    CSharp,
    Java,
    Kotlin,
    /// Shell scripting dialects (the parser also maps `bash`, `sh`, `zsh`, ... here).
    Shell,
    Swift,
    Scala,
    Sql,
    /// XML-like markup (the parser also maps `html`, `svg`, `xslt`, ... here).
    Xml,
    Php,
    Yaml,
    Toml,
    /// JSON and its comment-permitting dialects (`jsonc`, `json5`).
    Json,
    /// Fallback for anything unrecognised; this is the `Default` variant.
    #[default]
    Unknown,
}
33
34impl FromStr for Language {
35    type Err = std::convert::Infallible;
36
37    /// Parse a language identifier string into a Language enum.
38    ///
39    /// The matching is case-insensitive. Unknown languages return `Language::Unknown`.
40    fn from_str(s: &str) -> Result<Self, Self::Err> {
41        Ok(match s.to_ascii_lowercase().as_str() {
42            "rust" => Language::Rust,
43            "python" => Language::Python,
44            "javascript" => Language::JavaScript,
45            "typescript" => Language::TypeScript,
46            "go" => Language::Go,
47            "ruby" => Language::Ruby,
48            "c" => Language::C,
49            "cpp" => Language::Cpp,
50            "csharp" => Language::CSharp,
51            "java" => Language::Java,
52            "kotlin" => Language::Kotlin,
53            "shell" | "bash" | "sh" | "zsh" | "ksh" | "fish" => Language::Shell,
54            "swift" => Language::Swift,
55            "scala" => Language::Scala,
56            "sql" => Language::Sql,
57            "xml" | "html" | "xhtml" | "svg" | "xsl" | "xslt" => Language::Xml,
58            "php" => Language::Php,
59            "yaml" | "yml" => Language::Yaml,
60            "toml" => Language::Toml,
61            "json" | "jsonc" | "json5" => Language::Json,
62            _ => Language::Unknown,
63        })
64    }
65}
66
67impl Language {
68    /// Returns the comment syntax for this language.
69    pub fn comment_syntax(self) -> CommentSyntax {
70        match self {
71            Language::Python | Language::Ruby | Language::Shell => CommentSyntax::Hash,
72            // Rust, Swift, and Scala support nested block comments
73            Language::Rust | Language::Swift | Language::Scala => CommentSyntax::CStyleNested,
74            // SQL uses -- for line comments
75            Language::Sql => CommentSyntax::Sql,
76            // XML/HTML uses <!-- --> block comments only
77            Language::Xml => CommentSyntax::Xml,
78            // PHP uses //, #, and /* */
79            Language::Php => CommentSyntax::Php,
80            // YAML/TOML use # comments
81            Language::Yaml | Language::Toml => CommentSyntax::Hash,
82            // JSON supports comments in jsonc/json5 dialects
83            Language::Json => CommentSyntax::CStyle,
84            _ => CommentSyntax::CStyle,
85        }
86    }
87
88    /// Returns the string syntax for this language.
89    pub fn string_syntax(self) -> StringSyntax {
90        match self {
91            Language::Rust => StringSyntax::Rust,
92            Language::Python => StringSyntax::Python,
93            // Ruby uses single quotes for strings (not char literals like C)
94            Language::JavaScript | Language::TypeScript | Language::Ruby => {
95                StringSyntax::JavaScript
96            }
97            Language::Go => StringSyntax::Go,
98            Language::Shell => StringSyntax::Shell,
99            // Swift and Scala support triple-quoted strings like Python
100            Language::Swift | Language::Scala => StringSyntax::SwiftScala,
101            // SQL uses single quotes for strings
102            Language::Sql => StringSyntax::Sql,
103            // XML uses both single and double quotes for attribute values
104            Language::Xml => StringSyntax::Xml,
105            // PHP uses both single and double quotes
106            Language::Php => StringSyntax::Php,
107            // YAML/TOML/JSON strings are C-style-like in this best-effort model
108            Language::Yaml | Language::Toml | Language::Json => StringSyntax::CStyle,
109            _ => StringSyntax::CStyle,
110        }
111    }
112}
113
/// The families of comment notation the preprocessor distinguishes.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum CommentSyntax {
    /// `//` to end of line plus non-nesting `/* */` blocks.
    CStyle,
    /// `//` to end of line plus `/* */` blocks that may nest (Rust, Swift, Scala).
    CStyleNested,
    /// `#` to end of line and nothing else (Python, Ruby, Shell, YAML, TOML).
    Hash,
    /// `--` to end of line plus `/* */` blocks.
    Sql,
    /// `<!-- -->` blocks only (XML/HTML).
    Xml,
    /// `//` or `#` to end of line plus `/* */` blocks.
    Php,
}
130
/// The families of string-literal notation the preprocessor distinguishes.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum StringSyntax {
    /// `"..."` with backslash escapes (C-style).
    CStyle,
    /// Rust: `"..."`, raw `r#"..."#`, and byte `b"..."` strings.
    Rust,
    /// Python: `"..."`, `'...'`, and the triple-quoted `"""..."""` / `'''...'''`.
    Python,
    /// JavaScript: `"..."`, `'...'`, and `` `...` `` template literals.
    JavaScript,
    /// Go: `"..."` and `` `...` `` raw strings.
    Go,
    /// Shell: literal `'...'` (no escapes), `"..."` (with escapes), ANSI-C `$'...'`.
    Shell,
    /// Swift/Scala: `"..."` and multi-line `"""..."""`.
    SwiftScala,
    /// SQL: `'...'` single quotes only.
    Sql,
    /// XML/HTML attribute values: `"..."` and `'...'`.
    Xml,
    /// PHP: literal `'...'` and `"..."` with escapes.
    Php,
}
155
/// Selects which token classes the preprocessor blanks out.
///
/// A `mask_*` flag set to `true` means that token class is overwritten with
/// spaces in the sanitized output.
///
/// Strings may still be *tracked* while only comments are masked, so that a
/// comment marker appearing inside a string is not mistaken for a comment.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct PreprocessOptions {
    /// Replace comment text with spaces.
    pub mask_comments: bool,
    /// Replace string-literal text with spaces.
    pub mask_strings: bool,
}
167
168impl PreprocessOptions {
169    pub fn none() -> Self {
170        Self {
171            mask_comments: false,
172            mask_strings: false,
173        }
174    }
175
176    pub fn comments_only() -> Self {
177        Self {
178            mask_comments: true,
179            mask_strings: false,
180        }
181    }
182
183    pub fn strings_only() -> Self {
184        Self {
185            mask_comments: false,
186            mask_strings: true,
187        }
188    }
189
190    pub fn comments_and_strings() -> Self {
191        Self {
192            mask_comments: true,
193            mask_strings: true,
194        }
195    }
196
197    fn track_strings(self) -> bool {
198        self.mask_strings || self.mask_comments
199    }
200}
201
/// Lexical state of the preprocessor, carried from one line to the next.
#[derive(Clone, Copy, PartialEq, Eq)]
enum Mode {
    /// Outside of any comment or string.
    Normal,
    /// A line comment was found; the remainder of that line has been masked.
    LineComment,
    /// Inside a `/* ... */` comment. `depth` starts at 1 and only grows when
    /// the comment syntax supports nesting.
    BlockComment { depth: u32 },
    /// Inside a string closed by the byte `quote`; `escaped` is set right
    /// after a backslash.
    NormalString { escaped: bool, quote: u8 },
    /// Inside a raw string. For Rust, `hashes` is the number of `#` that must
    /// follow the closing `"`; Go backtick strings use `hashes == 0`.
    RawString { hashes: usize },
    /// Inside a `'...'` character literal; `escaped` is set right after a backslash.
    Char { escaped: bool },
    /// Inside a `` `...` `` template literal; `escaped` is set right after a backslash.
    TemplateLiteral { escaped: bool },
    /// Inside a triple-quoted string delimited by three `quote` bytes.
    TripleQuotedString { escaped: bool, quote: u8 },
    /// Shell literal string: '...' - no escapes at all
    ShellLiteralString,
    /// Shell ANSI-C string: $'...' - with escape sequences
    ShellAnsiCString { escaped: bool },
    /// XML/HTML block comment: <!-- ... -->
    XmlComment,
}
235
236impl fmt::Debug for Mode {
237    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
238        match self {
239            Mode::Normal => write!(f, "Normal"),
240            Mode::LineComment => write!(f, "LineComment"),
241            Mode::BlockComment { depth } => write!(f, "BlockComment(depth={depth})"),
242            Mode::NormalString { escaped, quote } => {
243                write!(f, "NormalString(escaped={escaped}, quote={quote})")
244            }
245            Mode::RawString { hashes } => write!(f, "RawString(hashes={hashes})"),
246            Mode::Char { escaped } => write!(f, "Char(escaped={escaped})"),
247            Mode::TemplateLiteral { escaped } => write!(f, "TemplateLiteral(escaped={escaped})"),
248            Mode::TripleQuotedString { escaped, quote } => {
249                write!(f, "TripleQuotedString(escaped={escaped}, quote={quote})")
250            }
251            Mode::ShellLiteralString => write!(f, "ShellLiteralString"),
252            Mode::ShellAnsiCString { escaped } => {
253                write!(f, "ShellAnsiCString(escaped={escaped})")
254            }
255            Mode::XmlComment => write!(f, "XmlComment"),
256        }
257    }
258}
259
260/// A stateful preprocessor, intended to be run on sequential lines of the *same file*.
261///
262/// The state tracks multi-line comments/strings best-effort. If the diff begins inside an
263/// existing comment/string, the preprocessor cannot infer that.
264#[derive(Debug, Clone)]
265pub struct Preprocessor {
266    opts: PreprocessOptions,
267    mode: Mode,
268    lang: Language,
269}
270
271impl Preprocessor {
272    pub fn new(opts: PreprocessOptions) -> Self {
273        Self {
274            opts,
275            mode: Mode::Normal,
276            lang: Language::Unknown,
277        }
278    }
279
280    /// Create a new preprocessor with language-specific syntax support.
281    pub fn with_language(opts: PreprocessOptions, lang: Language) -> Self {
282        Self {
283            opts,
284            mode: Mode::Normal,
285            lang,
286        }
287    }
288
289    /// Set the language for this preprocessor and reset state.
290    pub fn set_language(&mut self, lang: Language) {
291        self.lang = lang;
292        self.reset();
293    }
294
295    pub fn reset(&mut self) {
296        self.mode = Mode::Normal;
297    }
298
299    /// Returns a sanitized line where masked segments are replaced with spaces.
300    ///
301    /// The output is the same length in bytes as the input.
302    #[cfg_attr(mutants, mutants::skip)]
303    #[allow(clippy::collapsible_if)]
304    pub fn sanitize_line(&mut self, line: &str) -> String {
305        let mut out: Vec<u8> = line.as_bytes().to_vec();
306        let bytes = line.as_bytes();
307        let len = bytes.len();
308
309        let comment_syntax = self.lang.comment_syntax();
310        let string_syntax = self.lang.string_syntax();
311
312        let mut i = 0;
313
314        while i < len {
315            match self.mode {
316                Mode::Normal => {
317                    // String detection (language-specific)
318                    if self.opts.track_strings() {
319                        // Rust raw string start detection: r#"..."# or br#"..."#
320                        if string_syntax == StringSyntax::Rust {
321                            if let Some((_, end_quote_i, hashes)) =
322                                detect_raw_string_start(bytes, i)
323                            {
324                                if self.opts.mask_strings {
325                                    mask_range(&mut out, i, end_quote_i + 1);
326                                }
327                                self.mode = Mode::RawString { hashes };
328                                i = end_quote_i + 1;
329                                continue;
330                            }
331
332                            // Byte string: b"..."
333                            if bytes[i] == b'b' && i + 1 < len && bytes[i + 1] == b'"' {
334                                if self.opts.mask_strings {
335                                    mask_range(&mut out, i, i + 2);
336                                }
337                                self.mode = Mode::NormalString {
338                                    escaped: false,
339                                    quote: b'"',
340                                };
341                                i += 2;
342                                continue;
343                            }
344                        }
345
346                        // Triple-quoted strings: """...""" or '''...''' (Python)
347                        // Swift/Scala only use """...""" (not single-quote triple)
348                        if string_syntax == StringSyntax::Python
349                            || string_syntax == StringSyntax::SwiftScala
350                        {
351                            if let Some((quote, end_i)) = detect_triple_quote_start(bytes, i) {
352                                // Swift/Scala only support double-quote triple strings
353                                if string_syntax == StringSyntax::SwiftScala && quote != b'"' {
354                                    // Fall through to normal string handling
355                                } else {
356                                    if self.opts.mask_strings {
357                                        mask_range(&mut out, i, end_i);
358                                    }
359                                    self.mode = Mode::TripleQuotedString {
360                                        escaped: false,
361                                        quote,
362                                    };
363                                    i = end_i;
364                                    continue;
365                                }
366                            }
367                        }
368
369                        // JavaScript/TypeScript template literals: `...`
370                        if string_syntax == StringSyntax::JavaScript && bytes[i] == b'`' {
371                            if self.opts.mask_strings {
372                                out[i] = b' ';
373                            }
374                            self.mode = Mode::TemplateLiteral { escaped: false };
375                            i += 1;
376                            continue;
377                        }
378
379                        // Go raw strings: `...`
380                        if string_syntax == StringSyntax::Go && bytes[i] == b'`' {
381                            if self.opts.mask_strings {
382                                out[i] = b' ';
383                            }
384                            // Go raw strings don't support escapes, use RawString with 0 hashes
385                            self.mode = Mode::RawString { hashes: 0 };
386                            i += 1;
387                            continue;
388                        }
389
390                        // Shell ANSI-C quoting: $'...'
391                        if string_syntax == StringSyntax::Shell
392                            && bytes[i] == b'$'
393                            && i + 1 < len
394                            && bytes[i + 1] == b'\''
395                        {
396                            if self.opts.mask_strings {
397                                mask_range(&mut out, i, i + 2);
398                            }
399                            self.mode = Mode::ShellAnsiCString { escaped: false };
400                            i += 2;
401                            continue;
402                        }
403
404                        // Shell single-quoted literal strings: '...' (no escapes!)
405                        if string_syntax == StringSyntax::Shell && bytes[i] == b'\'' {
406                            if self.opts.mask_strings {
407                                out[i] = b' ';
408                            }
409                            self.mode = Mode::ShellLiteralString;
410                            i += 1;
411                            continue;
412                        }
413
414                        // SQL strings: '...' (single quotes with '' escape)
415                        if string_syntax == StringSyntax::Sql && bytes[i] == b'\'' {
416                            if self.opts.mask_strings {
417                                out[i] = b' ';
418                            }
419                            self.mode = Mode::NormalString {
420                                escaped: false,
421                                quote: b'\'',
422                            };
423                            i += 1;
424                            continue;
425                        }
426
427                        // XML/HTML attribute strings: both "..." and '...'
428                        if string_syntax == StringSyntax::Xml
429                            && (bytes[i] == b'"' || bytes[i] == b'\'')
430                        {
431                            let quote = bytes[i];
432                            if self.opts.mask_strings {
433                                out[i] = b' ';
434                            }
435                            // XML strings don't have escape sequences
436                            self.mode = Mode::NormalString {
437                                escaped: false,
438                                quote,
439                            };
440                            i += 1;
441                            continue;
442                        }
443
444                        // PHP strings: '...' (literal, minimal escapes) and "..." (with escapes)
445                        if string_syntax == StringSyntax::Php
446                            && (bytes[i] == b'"' || bytes[i] == b'\'')
447                        {
448                            let quote = bytes[i];
449                            if self.opts.mask_strings {
450                                out[i] = b' ';
451                            }
452                            self.mode = Mode::NormalString {
453                                escaped: false,
454                                quote,
455                            };
456                            i += 1;
457                            continue;
458                        }
459
460                        // Normal double-quoted string: "..."
461                        // Note: SQL only uses single quotes for strings, so skip this for SQL
462                        if bytes[i] == b'"' && string_syntax != StringSyntax::Sql {
463                            if self.opts.mask_strings {
464                                out[i] = b' ';
465                            }
466                            self.mode = Mode::NormalString {
467                                escaped: false,
468                                quote: b'"',
469                            };
470                            i += 1;
471                            continue;
472                        }
473
474                        // Single-quoted strings for Python, JavaScript, Ruby
475                        if (string_syntax == StringSyntax::Python
476                            || string_syntax == StringSyntax::JavaScript
477                            || string_syntax == StringSyntax::CStyle)
478                            && bytes[i] == b'\''
479                        {
480                            // For C-style languages, single quote is a char literal
481                            if string_syntax == StringSyntax::CStyle {
482                                if self.opts.mask_strings {
483                                    out[i] = b' ';
484                                }
485                                self.mode = Mode::Char { escaped: false };
486                                i += 1;
487                                continue;
488                            }
489                            // For Python/JavaScript, single quote is a string
490                            if self.opts.mask_strings {
491                                out[i] = b' ';
492                            }
493                            self.mode = Mode::NormalString {
494                                escaped: false,
495                                quote: b'\'',
496                            };
497                            i += 1;
498                            continue;
499                        }
500
501                        // Rust char literal: '...'
502                        if string_syntax == StringSyntax::Rust && bytes[i] == b'\'' {
503                            if self.opts.mask_strings {
504                                out[i] = b' ';
505                            }
506                            self.mode = Mode::Char { escaped: false };
507                            i += 1;
508                            continue;
509                        }
510                    }
511
512                    // Comment detection (language-specific)
513                    if self.opts.mask_comments {
514                        // Hash comments for Python/Ruby/Shell
515                        if comment_syntax == CommentSyntax::Hash && bytes[i] == b'#' {
516                            mask_range(&mut out, i, len);
517                            self.mode = Mode::LineComment;
518                            break;
519                        }
520
521                        // PHP comments: // and # for line comments, /* */ for block
522                        if comment_syntax == CommentSyntax::Php {
523                            if bytes[i] == b'#' {
524                                mask_range(&mut out, i, len);
525                                self.mode = Mode::LineComment;
526                                break;
527                            }
528                            if bytes[i] == b'/' && i + 1 < len {
529                                let n = bytes[i + 1];
530                                if n == b'/' {
531                                    mask_range(&mut out, i, len);
532                                    self.mode = Mode::LineComment;
533                                    break;
534                                }
535                                if n == b'*' {
536                                    mask_range(&mut out, i, i + 2);
537                                    self.mode = Mode::BlockComment { depth: 1 };
538                                    i += 2;
539                                    continue;
540                                }
541                            }
542                        }
543
544                        // SQL comments: -- for line comments, /* */ for block
545                        if comment_syntax == CommentSyntax::Sql {
546                            // -- line comment
547                            if bytes[i] == b'-' && i + 1 < len && bytes[i + 1] == b'-' {
548                                mask_range(&mut out, i, len);
549                                self.mode = Mode::LineComment;
550                                break;
551                            }
552                            // /* */ block comment
553                            if bytes[i] == b'/' && i + 1 < len && bytes[i + 1] == b'*' {
554                                mask_range(&mut out, i, i + 2);
555                                self.mode = Mode::BlockComment { depth: 1 };
556                                i += 2;
557                                continue;
558                            }
559                        }
560
561                        // XML/HTML comments: <!-- -->
562                        if comment_syntax == CommentSyntax::Xml
563                            && bytes[i] == b'<'
564                            && i + 3 < len
565                            && bytes[i + 1] == b'!'
566                            && bytes[i + 2] == b'-'
567                            && bytes[i + 3] == b'-'
568                        {
569                            mask_range(&mut out, i, i + 4);
570                            self.mode = Mode::XmlComment;
571                            i += 4;
572                            continue;
573                        }
574
575                        // C-style comments: // and /* */
576                        if (comment_syntax == CommentSyntax::CStyle
577                            || comment_syntax == CommentSyntax::CStyleNested)
578                            && bytes[i] == b'/'
579                            && i + 1 < len
580                        {
581                            let n = bytes[i + 1];
582                            if n == b'/' {
583                                // line comment until EOL
584                                mask_range(&mut out, i, len);
585                                self.mode = Mode::LineComment;
586                                break;
587                            }
588                            if n == b'*' {
589                                // block comment
590                                mask_range(&mut out, i, i + 2);
591                                self.mode = Mode::BlockComment { depth: 1 };
592                                i += 2;
593                                continue;
594                            }
595                        }
596                    }
597
598                    i += 1;
599                }
600
601                Mode::LineComment => {
602                    // End-of-line resets line comments.
603                    self.mode = Mode::Normal;
604                    break;
605                }
606
607                Mode::BlockComment { depth } => {
608                    // Everything is masked in a block comment.
609                    if self.opts.mask_comments {
610                        out[i] = b' ';
611                    }
612
613                    // Nested block comments are possible in Rust.
614                    let supports_nesting = comment_syntax == CommentSyntax::CStyleNested;
615                    if supports_nesting && bytes[i] == b'/' && i + 1 < len && bytes[i + 1] == b'*' {
616                        if self.opts.mask_comments {
617                            out[i + 1] = b' ';
618                        }
619                        self.mode = Mode::BlockComment { depth: depth + 1 };
620                        i += 2;
621                        continue;
622                    }
623
624                    if bytes[i] == b'*' && i + 1 < len && bytes[i + 1] == b'/' {
625                        if self.opts.mask_comments {
626                            out[i + 1] = b' ';
627                        }
628                        if depth == 1 {
629                            self.mode = Mode::Normal;
630                        } else {
631                            self.mode = Mode::BlockComment { depth: depth - 1 };
632                        }
633                        i += 2;
634                        continue;
635                    }
636
637                    i += 1;
638                }
639
640                Mode::NormalString { escaped, quote } => {
641                    if self.opts.mask_strings {
642                        out[i] = b' ';
643                    }
644
645                    if escaped {
646                        self.mode = Mode::NormalString {
647                            escaped: false,
648                            quote,
649                        };
650                        i += 1;
651                        continue;
652                    }
653
654                    if bytes[i] == b'\\' {
655                        self.mode = Mode::NormalString {
656                            escaped: true,
657                            quote,
658                        };
659                        i += 1;
660                        continue;
661                    }
662
663                    if bytes[i] == quote {
664                        // End of string
665                        self.mode = Mode::Normal;
666                        i += 1;
667                        continue;
668                    }
669
670                    i += 1;
671                }
672
673                Mode::Char { escaped } => {
674                    if self.opts.mask_strings {
675                        out[i] = b' ';
676                    }
677
678                    if escaped {
679                        self.mode = Mode::Char { escaped: false };
680                        i += 1;
681                        continue;
682                    }
683
684                    if bytes[i] == b'\\' {
685                        self.mode = Mode::Char { escaped: true };
686                        i += 1;
687                        continue;
688                    }
689
690                    if bytes[i] == b'\'' {
691                        self.mode = Mode::Normal;
692                        i += 1;
693                        continue;
694                    }
695
696                    i += 1;
697                }
698
699                Mode::RawString { hashes } => {
700                    if self.opts.mask_strings {
701                        out[i] = b' ';
702                    }
703
704                    // For Go raw strings (hashes == 0), look for closing backtick
705                    if hashes == 0 && string_syntax == StringSyntax::Go {
706                        if bytes[i] == b'`' {
707                            self.mode = Mode::Normal;
708                            i += 1;
709                            continue;
710                        }
711                        i += 1;
712                        continue;
713                    }
714
715                    // For Rust raw strings, look for end delimiter: "###
716                    if bytes[i] == b'"' {
717                        let mut ok = true;
718                        for j in 0..hashes {
719                            if i + 1 + j >= len || bytes[i + 1 + j] != b'#' {
720                                ok = false;
721                                break;
722                            }
723                        }
724
725                        if ok {
726                            if self.opts.mask_strings {
727                                mask_range(&mut out, i, (i + 1 + hashes).min(len));
728                            }
729                            self.mode = Mode::Normal;
730                            i = (i + 1 + hashes).min(len);
731                            continue;
732                        }
733                    }
734
735                    i += 1;
736                }
737
738                Mode::TemplateLiteral { escaped } => {
739                    if self.opts.mask_strings {
740                        out[i] = b' ';
741                    }
742
743                    if escaped {
744                        self.mode = Mode::TemplateLiteral { escaped: false };
745                        i += 1;
746                        continue;
747                    }
748
749                    if bytes[i] == b'\\' {
750                        self.mode = Mode::TemplateLiteral { escaped: true };
751                        i += 1;
752                        continue;
753                    }
754
755                    if bytes[i] == b'`' {
756                        // End of template literal
757                        self.mode = Mode::Normal;
758                        i += 1;
759                        continue;
760                    }
761
762                    i += 1;
763                }
764
765                Mode::TripleQuotedString { escaped, quote } => {
766                    if self.opts.mask_strings {
767                        out[i] = b' ';
768                    }
769
770                    if escaped {
771                        self.mode = Mode::TripleQuotedString {
772                            escaped: false,
773                            quote,
774                        };
775                        i += 1;
776                        continue;
777                    }
778
779                    if bytes[i] == b'\\' {
780                        self.mode = Mode::TripleQuotedString {
781                            escaped: true,
782                            quote,
783                        };
784                        i += 1;
785                        continue;
786                    }
787
788                    // Check for closing triple quote
789                    if bytes[i] == quote
790                        && i + 2 < len
791                        && bytes[i + 1] == quote
792                        && bytes[i + 2] == quote
793                    {
794                        if self.opts.mask_strings {
795                            mask_range(&mut out, i, i + 3);
796                        }
797                        self.mode = Mode::Normal;
798                        i += 3;
799                        continue;
800                    }
801
802                    i += 1;
803                }
804
805                Mode::ShellLiteralString => {
806                    // Shell single-quoted strings: NO escapes at all!
807                    // The only way out is a closing single quote.
808                    if self.opts.mask_strings {
809                        out[i] = b' ';
810                    }
811
812                    if bytes[i] == b'\'' {
813                        // End of literal string
814                        self.mode = Mode::Normal;
815                        i += 1;
816                        continue;
817                    }
818
819                    i += 1;
820                }
821
822                Mode::ShellAnsiCString { escaped } => {
823                    // Shell ANSI-C strings: $'...' with escape sequences
824                    if self.opts.mask_strings {
825                        out[i] = b' ';
826                    }
827
828                    if escaped {
829                        self.mode = Mode::ShellAnsiCString { escaped: false };
830                        i += 1;
831                        continue;
832                    }
833
834                    if bytes[i] == b'\\' {
835                        self.mode = Mode::ShellAnsiCString { escaped: true };
836                        i += 1;
837                        continue;
838                    }
839
840                    if bytes[i] == b'\'' {
841                        // End of ANSI-C string
842                        self.mode = Mode::Normal;
843                        i += 1;
844                        continue;
845                    }
846
847                    i += 1;
848                }
849
850                Mode::XmlComment => {
851                    // XML/HTML comments: <!-- ... -->
852                    // Everything inside is masked until we see -->
853                    if self.opts.mask_comments {
854                        out[i] = b' ';
855                    }
856
857                    // Check for closing -->
858                    if bytes[i] == b'-'
859                        && i + 2 < len
860                        && bytes[i + 1] == b'-'
861                        && bytes[i + 2] == b'>'
862                    {
863                        if self.opts.mask_comments {
864                            out[i + 1] = b' ';
865                            out[i + 2] = b' ';
866                        }
867                        self.mode = Mode::Normal;
868                        i += 3;
869                        continue;
870                    }
871
872                    i += 1;
873                }
874            }
875        }
876
877        // Line comments end at EOL.
878        if matches!(self.mode, Mode::LineComment) {
879            self.mode = Mode::Normal;
880        }
881
882        String::from_utf8_lossy(&out).into_owned()
883    }
884}
885
/// Overwrites `out[start..end]` with ASCII spaces.
///
/// Both bounds are clamped to the buffer: an `end` past the buffer masks up to the last
/// byte, and a `start` at or beyond the clamped `end` masks nothing rather than panicking
/// on an inverted slice range.
fn mask_range(out: &mut [u8], start: usize, end: usize) {
    let end = end.min(out.len());
    // Clamp `start` as well so `start..end` is always a valid (possibly empty) range.
    let start = start.min(end);
    out[start..end].fill(b' ');
}
892
/// Detect a triple-quoted string start (Python): `"""..."""` or `'''...'''`.
///
/// `i` is the byte offset in `bytes` at which to look.
///
/// Returns `(quote_char, end_index)` where `end_index` is the position after the opening
/// triple quote, or `None` when fewer than three bytes remain at `i` or those bytes are
/// not three identical quote characters.
fn detect_triple_quote_start(bytes: &[u8], i: usize) -> Option<(u8, usize)> {
    // Use checked slicing instead of computing `i + 2` up front, so an out-of-range
    // offset (even `usize::MAX`) yields `None` rather than overflowing.
    let opener = bytes.get(i..)?.get(..3)?;

    match *opener {
        [first, second, third]
            if matches!(first, b'"' | b'\'') && second == first && third == first =>
        {
            // The three-byte window exists, so `i + 3 <= bytes.len()` and cannot overflow.
            Some((first, i + 3))
        }
        _ => None,
    }
}
909
/// Detect the opening of a Rust raw string (`r"..."`, `r#"..."#`) or raw byte string
/// (`br"..."`, `br#"..."#`) at byte offset `i`.
///
/// Returns `(start_index, quote_index, hash_count)`: `start_index` is `i` itself,
/// `quote_index` is the position of the opening `"`, and `hash_count` is the number of
/// `#` characters between the prefix and that quote. Returns `None` when the bytes at `i`
/// do not form such an opener.
fn detect_raw_string_start(bytes: &[u8], i: usize) -> Option<(usize, usize, usize)> {
    let tail = bytes.get(i..)?;

    // Length of the literal prefix: `r` or `br`.
    let prefix_len = match tail {
        [b'r', ..] => 1,
        [b'b', b'r', ..] => 2,
        _ => return None,
    };

    // Count the run of `#` immediately after the prefix.
    let hash_count = tail[prefix_len..]
        .iter()
        .take_while(|&&byte| byte == b'#')
        .count();

    // The byte right after the hashes must be the opening quote.
    let quote_index = i + prefix_len + hash_count;
    match bytes.get(quote_index).copied() {
        Some(b'"') => Some((i, quote_index, hash_count)),
        _ => None,
    }
}
940
941#[cfg(test)]
942mod tests {
943    use super::*;
944
    // ==================== Options, Mode, helper, and Language enum tests ====================
946
947    #[test]
948    fn preprocess_options_track_strings_reflects_masks() {
949        assert!(!PreprocessOptions::none().track_strings());
950        assert!(PreprocessOptions::comments_only().track_strings());
951        assert!(PreprocessOptions::strings_only().track_strings());
952        assert!(PreprocessOptions::comments_and_strings().track_strings());
953    }
954
955    #[test]
956    fn mode_debug_format_includes_variant() {
957        assert_eq!(format!("{:?}", Mode::Normal), "Normal");
958        assert_eq!(format!("{:?}", Mode::LineComment), "LineComment");
959        assert_eq!(
960            format!("{:?}", Mode::BlockComment { depth: 2 }),
961            "BlockComment(depth=2)"
962        );
963        assert_eq!(
964            format!(
965                "{:?}",
966                Mode::NormalString {
967                    escaped: true,
968                    quote: b'\"'
969                }
970            ),
971            "NormalString(escaped=true, quote=34)"
972        );
973    }
974
975    #[test]
976    fn detect_triple_quote_start_detects_quotes() {
977        assert_eq!(detect_triple_quote_start(b"\"\"\"rest", 0), Some((b'"', 3)));
978        assert_eq!(detect_triple_quote_start(b"'''abc", 0), Some((b'\'', 3)));
979        assert_eq!(detect_triple_quote_start(b"x\"\"y", 1), None);
980        assert_eq!(detect_triple_quote_start(b"\"x\"", 0), None);
981        assert_eq!(detect_triple_quote_start(b"''", 0), None);
982        assert_eq!(detect_triple_quote_start(b"x'''y", 0), None);
983    }
984
985    #[test]
986    fn detect_raw_string_start_detects_rust_raw_strings() {
987        assert_eq!(detect_raw_string_start(b"r\"rest", 0), Some((0, 1, 0)));
988        assert_eq!(detect_raw_string_start(b"br\"rest", 0), Some((0, 2, 0)));
989        assert_eq!(detect_raw_string_start(b"r#\"rest", 0), Some((0, 2, 1)));
990        assert_eq!(detect_raw_string_start(b"br##\"rest", 0), Some((0, 4, 2)));
991        assert_eq!(detect_raw_string_start(b"b\"\"rest", 0), None);
992        assert_eq!(detect_raw_string_start(b"b\"rest", 0), None);
993        assert_eq!(detect_raw_string_start(b"x\"rest", 0), None);
994        assert_eq!(detect_raw_string_start(b"r###", 0), None);
995    }
996
997    #[test]
998    fn language_from_str_known_languages() {
999        assert_eq!("rust".parse::<Language>().unwrap(), Language::Rust);
1000        assert_eq!("python".parse::<Language>().unwrap(), Language::Python);
1001        assert_eq!(
1002            "javascript".parse::<Language>().unwrap(),
1003            Language::JavaScript
1004        );
1005        assert_eq!(
1006            "typescript".parse::<Language>().unwrap(),
1007            Language::TypeScript
1008        );
1009        assert_eq!("go".parse::<Language>().unwrap(), Language::Go);
1010        assert_eq!("ruby".parse::<Language>().unwrap(), Language::Ruby);
1011        assert_eq!("c".parse::<Language>().unwrap(), Language::C);
1012        assert_eq!("cpp".parse::<Language>().unwrap(), Language::Cpp);
1013        assert_eq!("csharp".parse::<Language>().unwrap(), Language::CSharp);
1014        assert_eq!("java".parse::<Language>().unwrap(), Language::Java);
1015        assert_eq!("kotlin".parse::<Language>().unwrap(), Language::Kotlin);
1016        assert_eq!("yaml".parse::<Language>().unwrap(), Language::Yaml);
1017        assert_eq!("toml".parse::<Language>().unwrap(), Language::Toml);
1018        assert_eq!("json".parse::<Language>().unwrap(), Language::Json);
1019    }
1020
1021    #[test]
1022    fn language_from_str_case_insensitive() {
1023        assert_eq!("RUST".parse::<Language>().unwrap(), Language::Rust);
1024        assert_eq!("Python".parse::<Language>().unwrap(), Language::Python);
1025        assert_eq!(
1026            "JavaScript".parse::<Language>().unwrap(),
1027            Language::JavaScript
1028        );
1029        assert_eq!(
1030            "TypeScript".parse::<Language>().unwrap(),
1031            Language::TypeScript
1032        );
1033        assert_eq!("GO".parse::<Language>().unwrap(), Language::Go);
1034        assert_eq!("RUBY".parse::<Language>().unwrap(), Language::Ruby);
1035        assert_eq!("C".parse::<Language>().unwrap(), Language::C);
1036        assert_eq!("CPP".parse::<Language>().unwrap(), Language::Cpp);
1037        assert_eq!("CSharp".parse::<Language>().unwrap(), Language::CSharp);
1038        assert_eq!("JAVA".parse::<Language>().unwrap(), Language::Java);
1039        assert_eq!("KOTLIN".parse::<Language>().unwrap(), Language::Kotlin);
1040        assert_eq!("YAML".parse::<Language>().unwrap(), Language::Yaml);
1041        assert_eq!("TOML".parse::<Language>().unwrap(), Language::Toml);
1042        assert_eq!("JSON".parse::<Language>().unwrap(), Language::Json);
1043    }
1044
1045    #[test]
1046    fn language_from_str_unknown() {
1047        assert_eq!("unknown".parse::<Language>().unwrap(), Language::Unknown);
1048        assert_eq!("".parse::<Language>().unwrap(), Language::Unknown);
1049        assert_eq!("fortran".parse::<Language>().unwrap(), Language::Unknown);
1050        assert_eq!("cobol".parse::<Language>().unwrap(), Language::Unknown);
1051    }
1052
1053    #[test]
1054    fn language_default_is_unknown() {
1055        assert_eq!(Language::default(), Language::Unknown);
1056    }
1057
1058    // ==================== CommentSyntax tests ====================
1059
1060    #[test]
1061    fn comment_syntax_hash_languages() {
1062        assert_eq!(Language::Python.comment_syntax(), CommentSyntax::Hash);
1063        assert_eq!(Language::Ruby.comment_syntax(), CommentSyntax::Hash);
1064        assert_eq!(Language::Yaml.comment_syntax(), CommentSyntax::Hash);
1065        assert_eq!(Language::Toml.comment_syntax(), CommentSyntax::Hash);
1066    }
1067
1068    #[test]
1069    fn comment_syntax_cstyle_nested_languages() {
1070        assert_eq!(Language::Rust.comment_syntax(), CommentSyntax::CStyleNested);
1071    }
1072
1073    #[test]
1074    fn comment_syntax_cstyle_languages() {
1075        assert_eq!(Language::JavaScript.comment_syntax(), CommentSyntax::CStyle);
1076        assert_eq!(Language::TypeScript.comment_syntax(), CommentSyntax::CStyle);
1077        assert_eq!(Language::Go.comment_syntax(), CommentSyntax::CStyle);
1078        assert_eq!(Language::C.comment_syntax(), CommentSyntax::CStyle);
1079        assert_eq!(Language::Cpp.comment_syntax(), CommentSyntax::CStyle);
1080        assert_eq!(Language::CSharp.comment_syntax(), CommentSyntax::CStyle);
1081        assert_eq!(Language::Java.comment_syntax(), CommentSyntax::CStyle);
1082        assert_eq!(Language::Kotlin.comment_syntax(), CommentSyntax::CStyle);
1083        assert_eq!(Language::Json.comment_syntax(), CommentSyntax::CStyle);
1084        assert_eq!(Language::Unknown.comment_syntax(), CommentSyntax::CStyle);
1085    }
1086
1087    // ==================== StringSyntax tests ====================
1088
1089    #[test]
1090    fn string_syntax_rust() {
1091        assert_eq!(Language::Rust.string_syntax(), StringSyntax::Rust);
1092    }
1093
1094    #[test]
1095    fn string_syntax_python() {
1096        assert_eq!(Language::Python.string_syntax(), StringSyntax::Python);
1097    }
1098
1099    #[test]
1100    fn string_syntax_javascript() {
1101        assert_eq!(
1102            Language::JavaScript.string_syntax(),
1103            StringSyntax::JavaScript
1104        );
1105        assert_eq!(
1106            Language::TypeScript.string_syntax(),
1107            StringSyntax::JavaScript
1108        );
1109    }
1110
1111    #[test]
1112    fn string_syntax_go() {
1113        assert_eq!(Language::Go.string_syntax(), StringSyntax::Go);
1114    }
1115
1116    #[test]
1117    fn string_syntax_cstyle_languages() {
1118        assert_eq!(Language::C.string_syntax(), StringSyntax::CStyle);
1119        assert_eq!(Language::Cpp.string_syntax(), StringSyntax::CStyle);
1120        assert_eq!(Language::CSharp.string_syntax(), StringSyntax::CStyle);
1121        assert_eq!(Language::Java.string_syntax(), StringSyntax::CStyle);
1122        assert_eq!(Language::Kotlin.string_syntax(), StringSyntax::CStyle);
1123        assert_eq!(Language::Unknown.string_syntax(), StringSyntax::CStyle);
1124    }
1125
1126    #[test]
1127    fn cstyle_masks_double_quoted_strings() {
1128        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::C);
1129        let s = p.sanitize_line("printf(\"hello\");");
1130        assert!(s.contains("printf("));
1131        assert!(!s.contains("hello"));
1132    }
1133
1134    #[test]
1135    fn cstyle_masks_char_literals() {
1136        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::C);
1137        let line = r#"char c = 'x'; char nl = '\n';"#;
1138        let s = p.sanitize_line(line);
1139        assert!(!s.contains("'x'"));
1140        assert!(!s.contains("'\\n'"));
1141        assert_eq!(s.len(), line.len());
1142    }
1143
1144    #[test]
1145    fn rust_masks_char_literals() {
1146        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Rust);
1147        let line = r#"let c = 'z'; let escaped = '\'';"#;
1148        let s = p.sanitize_line(line);
1149        assert!(!s.contains("'z'"));
1150        assert!(!s.contains("'\\''"));
1151        assert_eq!(s.len(), line.len());
1152    }
1153
1154    #[test]
1155    fn string_syntax_ruby() {
1156        // Ruby uses JavaScript-style string syntax (single quotes are strings, not char literals)
1157        assert_eq!(Language::Ruby.string_syntax(), StringSyntax::JavaScript);
1158    }
1159
1160    // ==================== Preprocessor constructor tests ====================
1161
1162    #[test]
1163    fn preprocessor_with_language() {
1164        let p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Python);
1165        assert_eq!(p.lang, Language::Python);
1166    }
1167
1168    #[test]
1169    fn preprocessor_set_language() {
1170        let mut p = Preprocessor::new(PreprocessOptions::comments_only());
1171        assert_eq!(p.lang, Language::Unknown);
1172        p.set_language(Language::Python);
1173        assert_eq!(p.lang, Language::Python);
1174    }
1175
1176    // ==================== Preprocessor tests (default/unknown language) ====================
1177
1178    #[test]
1179    fn masks_line_comments_when_enabled() {
1180        let mut p = Preprocessor::new(PreprocessOptions::comments_only());
1181        let s = p.sanitize_line("let x = 1; // .unwrap() should be ignored");
1182        assert!(s.contains("let x = 1;"));
1183        assert!(!s.contains("unwrap"));
1184    }
1185
1186    #[test]
1187    fn does_not_mask_line_comments_when_disabled() {
1188        let mut p = Preprocessor::new(PreprocessOptions::none());
1189        let s = p.sanitize_line("// .unwrap() should be visible");
1190        assert!(s.contains("unwrap"));
1191    }
1192
1193    #[test]
1194    fn masks_strings_when_enabled() {
1195        let mut p = Preprocessor::new(PreprocessOptions::strings_only());
1196        let s = p.sanitize_line("let s = \".unwrap()\";");
1197        assert!(!s.contains("unwrap"));
1198        assert!(s.contains("let s ="));
1199    }
1200
1201    #[test]
1202    fn does_not_start_comment_inside_string() {
1203        let mut p = Preprocessor::new(PreprocessOptions::comments_only());
1204        let s = p.sanitize_line("let s = \"// not a comment\"; // real comment");
1205        assert!(s.contains("// not a comment"));
1206        assert!(!s.contains("real comment"));
1207    }
1208
1209    #[test]
1210    fn masks_raw_string() {
1211        let mut p =
1212            Preprocessor::with_language(PreprocessOptions::comments_and_strings(), Language::Rust);
1213        let s = p.sanitize_line("let s = r#\".unwrap()\"#;");
1214        assert!(!s.contains("unwrap"));
1215    }
1216
1217    // ==================== Python-specific tests ====================
1218
1219    #[test]
1220    fn python_masks_hash_comments() {
1221        let mut p =
1222            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Python);
1223        let s = p.sanitize_line("x = 1  # this is a comment with print()");
1224        assert!(s.contains("x = 1"));
1225        assert!(!s.contains("print"));
1226        assert!(!s.contains("comment"));
1227    }
1228
1229    #[test]
1230    fn python_does_not_mask_hash_in_string() {
1231        let mut p =
1232            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Python);
1233        let s = p.sanitize_line("x = \"# not a comment\"  # real comment");
1234        assert!(s.contains("# not a comment"));
1235        assert!(!s.contains("real comment"));
1236    }
1237
1238    #[test]
1239    fn python_masks_single_quoted_strings() {
1240        let mut p =
1241            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Python);
1242        let s = p.sanitize_line("x = 'print() inside string'");
1243        assert!(s.contains("x ="));
1244        assert!(!s.contains("print"));
1245    }
1246
1247    #[test]
1248    fn python_masks_double_quoted_strings() {
1249        let mut p =
1250            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Python);
1251        let s = p.sanitize_line("x = \"print() inside string\"");
1252        assert!(s.contains("x ="));
1253        assert!(!s.contains("print"));
1254    }
1255
1256    #[test]
1257    fn python_masks_triple_double_quoted_strings() {
1258        let mut p =
1259            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Python);
1260        let s = p.sanitize_line("x = \"\"\"print() inside triple string\"\"\"");
1261        assert!(s.contains("x ="));
1262        assert!(!s.contains("print"));
1263    }
1264
1265    #[test]
1266    fn python_masks_triple_single_quoted_strings() {
1267        let mut p =
1268            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Python);
1269        let s = p.sanitize_line("x = '''print() inside triple string'''");
1270        assert!(s.contains("x ="));
1271        assert!(!s.contains("print"));
1272    }
1273
1274    #[test]
1275    fn python_triple_quoted_string_multiline() {
1276        let mut p =
1277            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Python);
1278        // First line starts the triple-quoted string
1279        let s1 = p.sanitize_line("x = \"\"\"start of");
1280        assert!(s1.contains("x ="));
1281        assert!(!s1.contains("start"));
1282
1283        // Second line is inside the string
1284        let s2 = p.sanitize_line("print() in middle");
1285        assert!(!s2.contains("print"));
1286
1287        // Third line ends the string
1288        let s3 = p.sanitize_line("end of string\"\"\" + y");
1289        assert!(!s3.contains("end of string"));
1290        assert!(s3.contains("+ y"));
1291    }
1292
1293    // ==================== JavaScript/TypeScript-specific tests ====================
1294
1295    #[test]
1296    fn javascript_masks_line_comments() {
1297        let mut p =
1298            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::JavaScript);
1299        let s = p.sanitize_line("let x = 1; // console.log here");
1300        assert!(s.contains("let x = 1;"));
1301        assert!(!s.contains("console"));
1302    }
1303
1304    #[test]
1305    fn javascript_masks_block_comments() {
1306        let mut p =
1307            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::JavaScript);
1308        let s = p.sanitize_line("let x = /* console.log */ 1;");
1309        assert!(s.contains("let x ="));
1310        assert!(s.contains("1;"));
1311        assert!(!s.contains("console"));
1312    }
1313
1314    #[test]
1315    fn javascript_masks_single_quoted_strings() {
1316        let mut p =
1317            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::JavaScript);
1318        let s = p.sanitize_line("let x = 'console.log inside';");
1319        assert!(s.contains("let x ="));
1320        assert!(!s.contains("console"));
1321    }
1322
1323    #[test]
1324    fn javascript_masks_double_quoted_strings() {
1325        let mut p =
1326            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::JavaScript);
1327        let s = p.sanitize_line("let x = \"console.log inside\";");
1328        assert!(s.contains("let x ="));
1329        assert!(!s.contains("console"));
1330    }
1331
1332    #[test]
1333    fn javascript_masks_template_literals() {
1334        let mut p =
1335            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::JavaScript);
1336        let s = p.sanitize_line("let x = `console.log inside template`;");
1337        assert!(s.contains("let x ="));
1338        assert!(!s.contains("console"));
1339    }
1340
1341    #[test]
1342    fn javascript_template_literal_multiline() {
1343        let mut p =
1344            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::JavaScript);
1345        // First line starts the template literal
1346        let s1 = p.sanitize_line("let x = `start of");
1347        assert!(s1.contains("let x ="));
1348        assert!(!s1.contains("start"));
1349
1350        // Second line is inside the template literal
1351        let s2 = p.sanitize_line("console.log in middle");
1352        assert!(!s2.contains("console"));
1353
1354        // Third line ends the template literal
1355        let s3 = p.sanitize_line("end of template` + y;");
1356        assert!(!s3.contains("end of template"));
1357        assert!(s3.contains("+ y;"));
1358    }
1359
1360    #[test]
1361    fn typescript_masks_template_literals() {
1362        let mut p =
1363            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::TypeScript);
1364        let s = p.sanitize_line("let x = `console.log inside template`;");
1365        assert!(s.contains("let x ="));
1366        assert!(!s.contains("console"));
1367    }
1368
1369    // ==================== Go-specific tests ====================
1370
1371    #[test]
1372    fn go_masks_line_comments() {
1373        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Go);
1374        let s = p.sanitize_line("x := 1 // fmt.Println here");
1375        assert!(s.contains("x := 1"));
1376        assert!(!s.contains("fmt"));
1377    }
1378
1379    #[test]
1380    fn go_masks_block_comments() {
1381        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Go);
1382        let s = p.sanitize_line("x := /* fmt.Println */ 1");
1383        assert!(s.contains("x :="));
1384        assert!(s.contains("1"));
1385        assert!(!s.contains("fmt"));
1386    }
1387
1388    #[test]
1389    fn go_masks_double_quoted_strings() {
1390        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Go);
1391        let s = p.sanitize_line("x := \"fmt.Println inside\"");
1392        assert!(s.contains("x :="));
1393        assert!(!s.contains("fmt"));
1394    }
1395
1396    #[test]
1397    fn go_masks_backtick_raw_strings() {
1398        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Go);
1399        let s = p.sanitize_line("x := `fmt.Println inside raw string`");
1400        assert!(s.contains("x :="));
1401        assert!(!s.contains("fmt"));
1402    }
1403
1404    #[test]
1405    fn go_backtick_raw_string_multiline() {
1406        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Go);
1407        // First line starts the raw string
1408        let s1 = p.sanitize_line("x := `start of");
1409        assert!(s1.contains("x :="));
1410        assert!(!s1.contains("start"));
1411
1412        // Second line is inside the raw string
1413        let s2 = p.sanitize_line("fmt.Println in middle");
1414        assert!(!s2.contains("fmt"));
1415
1416        // Third line ends the raw string
1417        let s3 = p.sanitize_line("end of raw` + y");
1418        assert!(!s3.contains("end of raw"));
1419        assert!(s3.contains("+ y"));
1420    }
1421
1422    // ==================== Ruby-specific tests ====================
1423
1424    #[test]
1425    fn ruby_masks_hash_comments() {
1426        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Ruby);
1427        let s = p.sanitize_line("x = 1  # this is a comment with puts");
1428        assert!(s.contains("x = 1"));
1429        assert!(!s.contains("puts"));
1430        assert!(!s.contains("comment"));
1431    }
1432
1433    #[test]
1434    fn ruby_does_not_mask_hash_in_string() {
1435        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Ruby);
1436        let s = p.sanitize_line("x = \"# not a comment\"  # real comment");
1437        assert!(s.contains("# not a comment"));
1438        assert!(!s.contains("real comment"));
1439    }
1440
1441    #[test]
1442    fn ruby_masks_single_quoted_strings() {
1443        // Ruby uses single quotes for strings (not char literals like C/Rust)
1444        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Ruby);
1445        let s = p.sanitize_line("puts 'hello world'");
1446        assert!(s.contains("puts"));
1447        // The content inside the single quotes should be masked
1448        assert!(!s.contains("hello"));
1449        assert!(!s.contains("world"));
1450    }
1451
1452    #[test]
1453    fn ruby_masks_double_quoted_strings() {
1454        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Ruby);
1455        let s = p.sanitize_line("puts \"hello world\"");
1456        assert!(s.contains("puts"));
1457        assert!(!s.contains("hello"));
1458        assert!(!s.contains("world"));
1459    }
1460
1461    // ==================== Unknown/fallback language tests ====================
1462
1463    #[test]
1464    fn unknown_language_uses_cstyle_comments() {
1465        let mut p =
1466            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Unknown);
1467        let s = p.sanitize_line("x = 1; // this is a comment");
1468        assert!(s.contains("x = 1;"));
1469        assert!(!s.contains("comment"));
1470    }
1471
1472    #[test]
1473    fn unknown_language_uses_cstyle_block_comments() {
1474        let mut p =
1475            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Unknown);
1476        let s = p.sanitize_line("x = /* comment */ 1;");
1477        assert!(s.contains("x ="));
1478        assert!(s.contains("1;"));
1479        assert!(!s.contains("comment"));
1480    }
1481
1482    #[test]
1483    fn unknown_language_does_not_mask_hash_as_comment() {
1484        let mut p =
1485            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Unknown);
1486        let s = p.sanitize_line("x = 1  # this is NOT a comment");
1487        // Hash should NOT be treated as a comment for unknown languages
1488        assert!(s.contains("# this is NOT a comment"));
1489    }
1490
1491    // ==================== Line length preservation tests ====================
1492
1493    #[test]
1494    fn preserves_line_length_python_hash_comment() {
1495        let mut p =
1496            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Python);
1497        let line = "x = 1  # comment";
1498        let s = p.sanitize_line(line);
1499        assert_eq!(s.len(), line.len());
1500    }
1501
1502    #[test]
1503    fn preserves_line_length_javascript_template_literal() {
1504        let mut p =
1505            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::JavaScript);
1506        let line = "let x = `template`;";
1507        let s = p.sanitize_line(line);
1508        assert_eq!(s.len(), line.len());
1509    }
1510
1511    #[test]
1512    fn preserves_line_length_go_raw_string() {
1513        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Go);
1514        let line = "x := `raw string`";
1515        let s = p.sanitize_line(line);
1516        assert_eq!(s.len(), line.len());
1517    }
1518
1519    #[test]
1520    fn preserves_line_length_python_triple_quoted() {
1521        let mut p =
1522            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Python);
1523        let line = "x = \"\"\"triple\"\"\"";
1524        let s = p.sanitize_line(line);
1525        assert_eq!(s.len(), line.len());
1526    }
1527
1528    // ==================== Multi-line block comment tests (Requirement 9.3) ====================
1529
1530    #[test]
1531    fn multiline_block_comment_cstyle() {
1532        let mut p =
1533            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::JavaScript);
1534
1535        // First line starts the block comment
1536        let s1 = p.sanitize_line("let x = 1; /* start of comment");
1537        assert!(s1.contains("let x = 1;"));
1538        assert!(!s1.contains("start of comment"));
1539
1540        // Second line is entirely inside the block comment
1541        let s2 = p.sanitize_line("console.log('hidden') in middle");
1542        assert!(!s2.contains("console"));
1543        assert!(!s2.contains("hidden"));
1544
1545        // Third line ends the block comment
1546        let s3 = p.sanitize_line("end of comment */ let y = 2;");
1547        assert!(!s3.contains("end of comment"));
1548        assert!(s3.contains("let y = 2;"));
1549    }
1550
1551    #[test]
1552    fn multiline_block_comment_rust_nested() {
1553        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Rust);
1554
1555        // First line starts a nested block comment
1556        let s1 = p.sanitize_line("let x = 1; /* outer /* inner");
1557        assert!(s1.contains("let x = 1;"));
1558        assert!(!s1.contains("outer"));
1559        assert!(!s1.contains("inner"));
1560
1561        // Second line is inside nested comment
1562        let s2 = p.sanitize_line("still in comment");
1563        assert!(!s2.contains("still"));
1564
1565        // Third line closes inner comment but still in outer
1566        let s3 = p.sanitize_line("inner closed */ still outer");
1567        assert!(!s3.contains("inner closed"));
1568        assert!(!s3.contains("still outer"));
1569
1570        // Fourth line closes outer comment
1571        let s4 = p.sanitize_line("outer closed */ let y = 2;");
1572        assert!(!s4.contains("outer closed"));
1573        assert!(s4.contains("let y = 2;"));
1574    }
1575
1576    #[test]
1577    fn multiline_block_comment_go() {
1578        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Go);
1579
1580        // First line starts the block comment
1581        let s1 = p.sanitize_line("x := 1 /* start");
1582        assert!(s1.contains("x := 1"));
1583        assert!(!s1.contains("start"));
1584
1585        // Second line is inside the block comment
1586        let s2 = p.sanitize_line("fmt.Println hidden");
1587        assert!(!s2.contains("fmt"));
1588        assert!(!s2.contains("hidden"));
1589
1590        // Third line ends the block comment
1591        let s3 = p.sanitize_line("end */ y := 2");
1592        assert!(!s3.contains("end"));
1593        assert!(s3.contains("y := 2"));
1594    }
1595
1596    #[test]
1597    fn multiline_block_comment_java() {
1598        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Java);
1599
1600        // First line starts the block comment
1601        let s1 = p.sanitize_line("int x = 1; /* javadoc style");
1602        assert!(s1.contains("int x = 1;"));
1603        assert!(!s1.contains("javadoc"));
1604
1605        // Second line is inside the block comment
1606        let s2 = p.sanitize_line(" * System.out.println hidden");
1607        assert!(!s2.contains("System"));
1608        assert!(!s2.contains("hidden"));
1609
1610        // Third line ends the block comment
1611        let s3 = p.sanitize_line(" */ int y = 2;");
1612        assert!(s3.contains("int y = 2;"));
1613    }
1614
1615    #[test]
1616    fn multiline_block_comment_preserves_line_length() {
1617        let mut p =
1618            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::JavaScript);
1619
1620        let line1 = "let x = 1; /* start";
1621        let s1 = p.sanitize_line(line1);
1622        assert_eq!(s1.len(), line1.len());
1623
1624        let line2 = "middle of comment";
1625        let s2 = p.sanitize_line(line2);
1626        assert_eq!(s2.len(), line2.len());
1627
1628        let line3 = "end */ let y = 2;";
1629        let s3 = p.sanitize_line(line3);
1630        assert_eq!(s3.len(), line3.len());
1631    }
1632
1633    // ==================== Multi-line string tests (Requirement 9.3) ====================
1634
1635    #[test]
1636    fn multiline_string_with_escaped_newline() {
1637        let mut p =
1638            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::JavaScript);
1639
1640        // First line starts a string with escaped newline at end
1641        let s1 = p.sanitize_line("let x = \"start\\");
1642        assert!(s1.contains("let x ="));
1643        // The string content should be masked
1644        assert!(!s1.contains("start"));
1645
1646        // Second line continues the string (escaped newline means string continues)
1647        let s2 = p.sanitize_line("console.log hidden\"");
1648        // After the escaped backslash, we're still in the string
1649        // The string ends with the closing quote
1650        assert!(!s2.contains("console"));
1651    }
1652
1653    #[test]
1654    fn multiline_rust_raw_string() {
1655        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Rust);
1656
1657        // First line starts a raw string
1658        let s1 = p.sanitize_line("let x = r#\"start of raw");
1659        assert!(s1.contains("let x ="));
1660        assert!(!s1.contains("start"));
1661
1662        // Second line is inside the raw string
1663        let s2 = p.sanitize_line("unwrap() hidden in raw string");
1664        assert!(!s2.contains("unwrap"));
1665
1666        // Third line ends the raw string
1667        let s3 = p.sanitize_line("end of raw\"# + y;");
1668        assert!(!s3.contains("end of raw"));
1669        assert!(s3.contains("+ y;"));
1670    }
1671
1672    #[test]
1673    fn multiline_rust_raw_string_with_hashes() {
1674        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Rust);
1675
1676        // First line starts a raw string with multiple hashes
1677        let s1 = p.sanitize_line("let x = r##\"start");
1678        assert!(s1.contains("let x ="));
1679        assert!(!s1.contains("start"));
1680
1681        // Second line has a fake ending that shouldn't close the string
1682        let s2 = p.sanitize_line("fake end\"# still inside");
1683        assert!(!s2.contains("fake"));
1684        assert!(!s2.contains("still inside"));
1685
1686        // Third line has the real ending with correct number of hashes
1687        let s3 = p.sanitize_line("real end\"## + y;");
1688        assert!(!s3.contains("real end"));
1689        assert!(s3.contains("+ y;"));
1690    }
1691
1692    #[test]
1693    fn multiline_python_triple_quoted_with_embedded_quotes() {
1694        let mut p =
1695            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Python);
1696
1697        // First line starts a triple-quoted string
1698        let s1 = p.sanitize_line("x = \"\"\"start with \"embedded\" quote");
1699        assert!(s1.contains("x ="));
1700        assert!(!s1.contains("start"));
1701        assert!(!s1.contains("embedded"));
1702
1703        // Second line has more embedded quotes
1704        let s2 = p.sanitize_line("more \"quotes\" and 'single' too");
1705        assert!(!s2.contains("quotes"));
1706        assert!(!s2.contains("single"));
1707
1708        // Third line ends the triple-quoted string
1709        let s3 = p.sanitize_line("end\"\"\" + y");
1710        assert!(!s3.contains("end"));
1711        assert!(s3.contains("+ y"));
1712    }
1713
1714    #[test]
1715    fn multiline_javascript_template_literal_with_expressions() {
1716        let mut p =
1717            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::JavaScript);
1718
1719        // First line starts a template literal
1720        let s1 = p.sanitize_line("let x = `start ${expr}");
1721        assert!(s1.contains("let x ="));
1722        assert!(!s1.contains("start"));
1723
1724        // Second line is inside the template literal
1725        let s2 = p.sanitize_line("console.log in template");
1726        assert!(!s2.contains("console"));
1727
1728        // Third line ends the template literal
1729        let s3 = p.sanitize_line("end` + y;");
1730        assert!(!s3.contains("end"));
1731        assert!(s3.contains("+ y;"));
1732    }
1733
1734    // ==================== State reset tests (Requirement 9.3) ====================
1735
1736    #[test]
1737    fn reset_clears_block_comment_state() {
1738        let mut p =
1739            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::JavaScript);
1740
1741        // Start a block comment
1742        let s1 = p.sanitize_line("let x = 1; /* start comment");
1743        assert!(!s1.contains("start comment"));
1744
1745        // Verify we're in block comment mode
1746        let s2 = p.sanitize_line("still in comment");
1747        assert!(!s2.contains("still"));
1748
1749        // Reset the preprocessor
1750        p.reset();
1751
1752        // After reset, the same line should NOT be treated as inside a comment
1753        let s3 = p.sanitize_line("not in comment anymore");
1754        assert!(s3.contains("not in comment anymore"));
1755    }
1756
1757    #[test]
1758    fn reset_clears_string_state() {
1759        let mut p =
1760            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Python);
1761
1762        // Start a triple-quoted string
1763        let s1 = p.sanitize_line("x = \"\"\"start of string");
1764        assert!(!s1.contains("start"));
1765
1766        // Verify we're in string mode
1767        let s2 = p.sanitize_line("still in string");
1768        assert!(!s2.contains("still"));
1769
1770        // Reset the preprocessor
1771        p.reset();
1772
1773        // After reset, the same line should NOT be treated as inside a string
1774        let s3 = p.sanitize_line("not in string anymore");
1775        assert!(s3.contains("not in string anymore"));
1776    }
1777
1778    #[test]
1779    fn reset_clears_template_literal_state() {
1780        let mut p =
1781            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::JavaScript);
1782
1783        // Start a template literal
1784        let s1 = p.sanitize_line("let x = `start of template");
1785        assert!(!s1.contains("start"));
1786
1787        // Verify we're in template literal mode
1788        let s2 = p.sanitize_line("still in template");
1789        assert!(!s2.contains("still"));
1790
1791        // Reset the preprocessor
1792        p.reset();
1793
1794        // After reset, the same line should NOT be treated as inside a template literal
1795        let s3 = p.sanitize_line("not in template anymore");
1796        assert!(s3.contains("not in template anymore"));
1797    }
1798
1799    #[test]
1800    fn reset_clears_raw_string_state() {
1801        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Go);
1802
1803        // Start a raw string
1804        let s1 = p.sanitize_line("x := `start of raw");
1805        assert!(!s1.contains("start"));
1806
1807        // Verify we're in raw string mode
1808        let s2 = p.sanitize_line("still in raw");
1809        assert!(!s2.contains("still"));
1810
1811        // Reset the preprocessor
1812        p.reset();
1813
1814        // After reset, the same line should NOT be treated as inside a raw string
1815        let s3 = p.sanitize_line("not in raw anymore");
1816        assert!(s3.contains("not in raw anymore"));
1817    }
1818
1819    #[test]
1820    fn set_language_resets_state() {
1821        let mut p =
1822            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::JavaScript);
1823
1824        // Start a block comment
1825        let s1 = p.sanitize_line("let x = 1; /* start comment");
1826        assert!(!s1.contains("start comment"));
1827
1828        // Verify we're in block comment mode
1829        let s2 = p.sanitize_line("still in comment");
1830        assert!(!s2.contains("still"));
1831
1832        // Change language (which should reset state)
1833        p.set_language(Language::Python);
1834
1835        // After set_language, the state should be reset
1836        let s3 = p.sanitize_line("not in comment anymore");
1837        assert!(s3.contains("not in comment anymore"));
1838    }
1839
1840    #[test]
1841    fn set_language_changes_syntax_and_resets() {
1842        let mut p =
1843            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::JavaScript);
1844
1845        // Start a block comment in JavaScript
1846        let s1 = p.sanitize_line("let x = 1; /* start");
1847        assert!(!s1.contains("start"));
1848
1849        // Change to Python (which uses hash comments)
1850        p.set_language(Language::Python);
1851
1852        // Now hash should be treated as comment, not /* */
1853        let s2 = p.sanitize_line("x = 1  # python comment");
1854        assert!(s2.contains("x = 1"));
1855        assert!(!s2.contains("python comment"));
1856
1857        // And /* should NOT be treated as comment start in Python
1858        let s3 = p.sanitize_line("x = 1 /* not a comment */");
1859        assert!(s3.contains("/* not a comment */"));
1860    }
1861
1862    #[test]
1863    fn state_reset_between_files_simulation() {
1864        let mut p =
1865            Preprocessor::with_language(PreprocessOptions::comments_and_strings(), Language::Rust);
1866
1867        // Process "file 1" - start a block comment
1868        let f1_l1 = p.sanitize_line("// File 1");
1869        assert!(!f1_l1.contains("File 1"));
1870
1871        let f1_l2 = p.sanitize_line("let x = 1; /* unclosed comment");
1872        assert!(!f1_l2.contains("unclosed"));
1873
1874        // Simulate switching to "file 2" by resetting
1875        p.reset();
1876
1877        // Process "file 2" - should start fresh
1878        let f2_l1 = p.sanitize_line("// File 2");
1879        assert!(!f2_l1.contains("File 2"));
1880
1881        let f2_l2 = p.sanitize_line("let y = 2; // normal code");
1882        assert!(f2_l2.contains("let y = 2;"));
1883        assert!(!f2_l2.contains("normal code"));
1884    }
1885
1886    #[test]
1887    fn state_reset_between_files_with_language_change() {
1888        let mut p = Preprocessor::with_language(
1889            PreprocessOptions::comments_and_strings(),
1890            Language::Python,
1891        );
1892
1893        // Process Python file - start a triple-quoted string
1894        let py_l1 = p.sanitize_line("x = \"\"\"unclosed");
1895        assert!(!py_l1.contains("unclosed"));
1896
1897        // Simulate switching to JavaScript file
1898        p.set_language(Language::JavaScript);
1899
1900        // Process JavaScript file - should start fresh with JS syntax
1901        let js_l1 = p.sanitize_line("let x = `template`;");
1902        assert!(js_l1.contains("let x ="));
1903        assert!(!js_l1.contains("template"));
1904
1905        // Verify template literal works correctly
1906        let js_l2 = p.sanitize_line("let y = 2; // comment");
1907        assert!(js_l2.contains("let y = 2;"));
1908        assert!(!js_l2.contains("comment"));
1909    }
1910
1911    #[test]
1912    fn nested_rust_block_comment_state_tracking() {
1913        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Rust);
1914
1915        // Start nested block comments
1916        let s1 = p.sanitize_line("/* level 1 /* level 2");
1917        assert!(!s1.contains("level 1"));
1918        assert!(!s1.contains("level 2"));
1919
1920        // Close one level
1921        let s2 = p.sanitize_line("close level 2 */ still level 1");
1922        assert!(!s2.contains("close level 2"));
1923        assert!(!s2.contains("still level 1"));
1924
1925        // Close final level
1926        let s3 = p.sanitize_line("close level 1 */ visible code");
1927        assert!(!s3.contains("close level 1"));
1928        assert!(s3.contains("visible code"));
1929    }
1930
1931    // ==================== Shell/Bash-specific tests ====================
1932
1933    #[test]
1934    fn shell_language_from_str() {
1935        assert_eq!("shell".parse::<Language>().unwrap(), Language::Shell);
1936        assert_eq!("bash".parse::<Language>().unwrap(), Language::Shell);
1937        assert_eq!("sh".parse::<Language>().unwrap(), Language::Shell);
1938        assert_eq!("zsh".parse::<Language>().unwrap(), Language::Shell);
1939        assert_eq!("ksh".parse::<Language>().unwrap(), Language::Shell);
1940        assert_eq!("fish".parse::<Language>().unwrap(), Language::Shell);
1941    }
1942
1943    #[test]
1944    fn shell_comment_syntax() {
1945        assert_eq!(Language::Shell.comment_syntax(), CommentSyntax::Hash);
1946    }
1947
1948    #[test]
1949    fn shell_string_syntax() {
1950        assert_eq!(Language::Shell.string_syntax(), StringSyntax::Shell);
1951    }
1952
1953    #[test]
1954    fn shell_masks_hash_comments() {
1955        let mut p =
1956            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Shell);
1957        let s = p.sanitize_line("echo hello  # this is a comment");
1958        assert!(s.contains("echo hello"));
1959        assert!(!s.contains("this is a comment"));
1960    }
1961
1962    #[test]
1963    fn shell_does_not_mask_hash_in_string() {
1964        let mut p =
1965            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Shell);
1966        let s = p.sanitize_line("echo \"# not a comment\"  # real comment");
1967        assert!(s.contains("# not a comment"));
1968        assert!(!s.contains("real comment"));
1969    }
1970
1971    #[test]
1972    fn shell_single_quoted_string_no_escapes() {
1973        // Shell single quotes are literal - backslash has NO special meaning
1974        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Shell);
1975        let s = p.sanitize_line("echo 'hello\\nworld'");
1976        assert!(s.contains("echo"));
1977        assert!(!s.contains("hello"));
1978        assert!(!s.contains("world"));
1979        // Verify the backslash is masked too
1980        assert!(!s.contains("\\n"));
1981    }
1982
1983    #[test]
1984    fn shell_single_quoted_cannot_contain_single_quote() {
1985        // In shell, you cannot escape a single quote inside single quotes
1986        // 'hello' is a complete string, then world' is the next token
1987        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Shell);
1988        let s = p.sanitize_line("echo 'hello' world");
1989        assert!(s.contains("echo"));
1990        assert!(!s.contains("hello"));
1991        assert!(s.contains("world")); // world is outside the string
1992    }
1993
1994    #[test]
1995    fn shell_double_quoted_strings() {
1996        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Shell);
1997        let s = p.sanitize_line("echo \"hello world\"");
1998        assert!(s.contains("echo"));
1999        assert!(!s.contains("hello"));
2000        assert!(!s.contains("world"));
2001    }
2002
2003    #[test]
2004    fn shell_double_quoted_with_escapes() {
2005        // Shell double quotes support backslash escapes
2006        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Shell);
2007        let s = p.sanitize_line("echo \"say \\\"hello\\\"\"");
2008        assert!(s.contains("echo"));
2009        assert!(!s.contains("say"));
2010        assert!(!s.contains("hello"));
2011    }
2012
2013    #[test]
2014    fn shell_ansi_c_quoting() {
2015        // Shell ANSI-C quoting: $'...'
2016        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Shell);
2017        let s = p.sanitize_line("echo $'hello\\nworld'");
2018        assert!(s.contains("echo"));
2019        assert!(!s.contains("hello"));
2020        assert!(!s.contains("world"));
2021    }
2022
2023    #[test]
2024    fn shell_ansi_c_quoting_with_escapes() {
2025        // ANSI-C quoting supports escapes like \t, \n, \'
2026        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Shell);
2027        let s = p.sanitize_line("echo $'tab\\there'");
2028        assert!(s.contains("echo"));
2029        assert!(!s.contains("tab"));
2030        assert!(!s.contains("here"));
2031    }
2032
2033    #[test]
2034    fn shell_ansi_c_escaped_single_quote() {
2035        // Unlike regular single quotes, $' allows \' to escape a quote
2036        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Shell);
2037        let s = p.sanitize_line("echo $'it\\'s ok'");
2038        assert!(s.contains("echo"));
2039        assert!(!s.contains("it"));
2040        assert!(!s.contains("ok"));
2041    }
2042
2043    #[test]
2044    fn shell_preserves_line_length() {
2045        let mut p =
2046            Preprocessor::with_language(PreprocessOptions::comments_and_strings(), Language::Shell);
2047        let line = "echo 'hello' # comment";
2048        let s = p.sanitize_line(line);
2049        assert_eq!(s.len(), line.len());
2050    }
2051
2052    #[test]
2053    fn shell_multiline_double_quoted_string() {
2054        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Shell);
2055
2056        // First line starts a double-quoted string with escaped newline
2057        let s1 = p.sanitize_line("echo \"start\\");
2058        assert!(s1.contains("echo"));
2059        assert!(!s1.contains("start"));
2060
2061        // Second line continues the string
2062        let s2 = p.sanitize_line("middle\" end");
2063        // After escaped backslash, we're still in the string until closing quote
2064        assert!(s2.contains("end"));
2065    }
2066
2067    #[test]
2068    fn shell_hash_not_comment_in_string() {
2069        let mut p =
2070            Preprocessor::with_language(PreprocessOptions::comments_and_strings(), Language::Shell);
2071        let s = p.sanitize_line("grep '#include' file.c  # search for includes");
2072        assert!(!s.contains("#include")); // masked (in string)
2073        assert!(!s.contains("search")); // masked (in comment)
2074        assert!(s.contains("grep"));
2075        assert!(s.contains("file.c"));
2076    }
2077
2078    #[test]
2079    fn shell_complex_mixed_quotes() {
2080        let mut p =
2081            Preprocessor::with_language(PreprocessOptions::comments_and_strings(), Language::Shell);
2082        // Mix of single, double, and $' quoting
2083        let s = p.sanitize_line("echo 'single' \"double\" $'ansi' # comment");
2084        assert!(s.contains("echo"));
2085        assert!(!s.contains("single"));
2086        assert!(!s.contains("double"));
2087        assert!(!s.contains("ansi"));
2088        assert!(!s.contains("comment"));
2089    }
2090
2091    // ==================== Swift-specific tests ====================
2092
2093    #[test]
2094    fn swift_language_from_str() {
2095        assert_eq!("swift".parse::<Language>().unwrap(), Language::Swift);
2096        assert_eq!("Swift".parse::<Language>().unwrap(), Language::Swift);
2097        assert_eq!("SWIFT".parse::<Language>().unwrap(), Language::Swift);
2098    }
2099
2100    #[test]
2101    fn swift_comment_syntax() {
2102        assert_eq!(
2103            Language::Swift.comment_syntax(),
2104            CommentSyntax::CStyleNested
2105        );
2106    }
2107
2108    #[test]
2109    fn swift_string_syntax() {
2110        assert_eq!(Language::Swift.string_syntax(), StringSyntax::SwiftScala);
2111    }
2112
2113    #[test]
2114    fn swift_masks_line_comments() {
2115        let mut p =
2116            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Swift);
2117        let s = p.sanitize_line("let x = 1 // print() here");
2118        assert!(s.contains("let x = 1"));
2119        assert!(!s.contains("print"));
2120    }
2121
2122    #[test]
2123    fn swift_masks_block_comments() {
2124        let mut p =
2125            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Swift);
2126        let s = p.sanitize_line("let x = /* print() */ 1");
2127        assert!(s.contains("let x ="));
2128        assert!(s.contains("1"));
2129        assert!(!s.contains("print"));
2130    }
2131
2132    #[test]
2133    fn swift_nested_block_comments() {
2134        let mut p =
2135            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Swift);
2136        // Swift supports nested block comments like Rust
2137        let s1 = p.sanitize_line("let x = 1 /* outer /* inner");
2138        assert!(s1.contains("let x = 1"));
2139        assert!(!s1.contains("outer"));
2140        assert!(!s1.contains("inner"));
2141
2142        // Still in nested comment
2143        let s2 = p.sanitize_line("still inside");
2144        assert!(!s2.contains("still"));
2145
2146        // Close inner
2147        let s3 = p.sanitize_line("close inner */ still outer");
2148        assert!(!s3.contains("close inner"));
2149        assert!(!s3.contains("still outer"));
2150
2151        // Close outer
2152        let s4 = p.sanitize_line("close outer */ let y = 2");
2153        assert!(!s4.contains("close outer"));
2154        assert!(s4.contains("let y = 2"));
2155    }
2156
2157    #[test]
2158    fn swift_masks_double_quoted_strings() {
2159        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Swift);
2160        let s = p.sanitize_line("let x = \"print() inside\"");
2161        assert!(s.contains("let x ="));
2162        assert!(!s.contains("print"));
2163    }
2164
2165    #[test]
2166    fn swift_masks_triple_quoted_strings() {
2167        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Swift);
2168        let s = p.sanitize_line("let x = \"\"\"print() inside\"\"\"");
2169        assert!(s.contains("let x ="));
2170        assert!(!s.contains("print"));
2171    }
2172
2173    #[test]
2174    fn swift_triple_quoted_string_multiline() {
2175        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Swift);
2176        // First line starts the triple-quoted string
2177        let s1 = p.sanitize_line("let x = \"\"\"start of");
2178        assert!(s1.contains("let x ="));
2179        assert!(!s1.contains("start"));
2180
2181        // Second line is inside the string
2182        let s2 = p.sanitize_line("print() in middle");
2183        assert!(!s2.contains("print"));
2184
2185        // Third line ends the string
2186        let s3 = p.sanitize_line("end of string\"\"\" + y");
2187        assert!(!s3.contains("end of string"));
2188        assert!(s3.contains("+ y"));
2189    }
2190
2191    #[test]
2192    fn swift_preserves_line_length() {
2193        let mut p =
2194            Preprocessor::with_language(PreprocessOptions::comments_and_strings(), Language::Swift);
2195        let line = "let x = \"hello\" // comment";
2196        let s = p.sanitize_line(line);
2197        assert_eq!(s.len(), line.len());
2198    }
2199
2200    // ==================== Scala-specific tests ====================
2201
2202    #[test]
2203    fn scala_language_from_str() {
2204        assert_eq!("scala".parse::<Language>().unwrap(), Language::Scala);
2205        assert_eq!("Scala".parse::<Language>().unwrap(), Language::Scala);
2206        assert_eq!("SCALA".parse::<Language>().unwrap(), Language::Scala);
2207    }
2208
2209    #[test]
2210    fn scala_comment_syntax() {
2211        assert_eq!(
2212            Language::Scala.comment_syntax(),
2213            CommentSyntax::CStyleNested
2214        );
2215    }
2216
2217    #[test]
2218    fn scala_string_syntax() {
2219        assert_eq!(Language::Scala.string_syntax(), StringSyntax::SwiftScala);
2220    }
2221
2222    #[test]
2223    fn scala_masks_line_comments() {
2224        let mut p =
2225            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Scala);
2226        let s = p.sanitize_line("val x = 1 // println() here");
2227        assert!(s.contains("val x = 1"));
2228        assert!(!s.contains("println"));
2229    }
2230
2231    #[test]
2232    fn scala_masks_block_comments() {
2233        let mut p =
2234            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Scala);
2235        let s = p.sanitize_line("val x = /* println() */ 1");
2236        assert!(s.contains("val x ="));
2237        assert!(s.contains("1"));
2238        assert!(!s.contains("println"));
2239    }
2240
2241    #[test]
2242    fn scala_nested_block_comments() {
2243        let mut p =
2244            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Scala);
2245        // Scala supports nested block comments
2246        let s1 = p.sanitize_line("val x = 1 /* outer /* inner");
2247        assert!(s1.contains("val x = 1"));
2248        assert!(!s1.contains("outer"));
2249
2250        let s2 = p.sanitize_line("still inside");
2251        assert!(!s2.contains("still"));
2252
2253        let s3 = p.sanitize_line("inner */ still outer");
2254        assert!(!s3.contains("inner"));
2255
2256        let s4 = p.sanitize_line("outer */ val y = 2");
2257        assert!(s4.contains("val y = 2"));
2258    }
2259
2260    #[test]
2261    fn scala_masks_double_quoted_strings() {
2262        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Scala);
2263        let s = p.sanitize_line("val x = \"println() inside\"");
2264        assert!(s.contains("val x ="));
2265        assert!(!s.contains("println"));
2266    }
2267
2268    #[test]
2269    fn scala_masks_triple_quoted_strings() {
2270        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Scala);
2271        let s = p.sanitize_line("val x = \"\"\"println() inside\"\"\"");
2272        assert!(s.contains("val x ="));
2273        assert!(!s.contains("println"));
2274    }
2275
2276    #[test]
2277    fn scala_triple_single_quotes_do_not_start_triple_string() {
2278        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Scala);
2279        let line = "val x = '''println() inside'''";
2280        let s = p.sanitize_line(line);
2281        assert_eq!(s, line);
2282        assert!(p.mode == Mode::Normal);
2283    }
2284
2285    #[test]
2286    fn scala_triple_quoted_string_multiline() {
2287        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Scala);
2288        let s1 = p.sanitize_line("val x = \"\"\"start of");
2289        assert!(s1.contains("val x ="));
2290        assert!(!s1.contains("start"));
2291
2292        let s2 = p.sanitize_line("println() in middle");
2293        assert!(!s2.contains("println"));
2294
2295        let s3 = p.sanitize_line("end of string\"\"\" + y");
2296        assert!(!s3.contains("end of string"));
2297        assert!(s3.contains("+ y"));
2298    }
2299
2300    #[test]
2301    fn scala_preserves_line_length() {
2302        let mut p =
2303            Preprocessor::with_language(PreprocessOptions::comments_and_strings(), Language::Scala);
2304        let line = "val x = \"hello\" // comment";
2305        let s = p.sanitize_line(line);
2306        assert_eq!(s.len(), line.len());
2307    }
2308
2309    // ==================== SQL-specific tests ====================
2310
2311    #[test]
2312    fn sql_language_from_str() {
2313        assert_eq!("sql".parse::<Language>().unwrap(), Language::Sql);
2314        assert_eq!("SQL".parse::<Language>().unwrap(), Language::Sql);
2315        assert_eq!("Sql".parse::<Language>().unwrap(), Language::Sql);
2316    }
2317
2318    #[test]
2319    fn sql_comment_syntax() {
2320        assert_eq!(Language::Sql.comment_syntax(), CommentSyntax::Sql);
2321    }
2322
2323    #[test]
2324    fn sql_string_syntax() {
2325        assert_eq!(Language::Sql.string_syntax(), StringSyntax::Sql);
2326    }
2327
2328    #[test]
2329    fn sql_masks_double_dash_comments() {
2330        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Sql);
2331        let s = p.sanitize_line("SELECT * FROM users -- secret query");
2332        assert!(s.contains("SELECT * FROM users"));
2333        assert!(!s.contains("secret"));
2334    }
2335
2336    #[test]
2337    fn sql_masks_block_comments() {
2338        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Sql);
2339        let s = p.sanitize_line("SELECT /* hidden */ * FROM users");
2340        assert!(s.contains("SELECT"));
2341        assert!(s.contains("* FROM users"));
2342        assert!(!s.contains("hidden"));
2343    }
2344
2345    #[test]
2346    fn sql_multiline_block_comment() {
2347        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Sql);
2348        let s1 = p.sanitize_line("SELECT * /* start comment");
2349        assert!(s1.contains("SELECT *"));
2350        assert!(!s1.contains("start"));
2351
2352        let s2 = p.sanitize_line("hidden query");
2353        assert!(!s2.contains("hidden"));
2354
2355        let s3 = p.sanitize_line("end comment */ FROM users");
2356        assert!(!s3.contains("end comment"));
2357        assert!(s3.contains("FROM users"));
2358    }
2359
2360    #[test]
2361    fn sql_masks_single_quoted_strings() {
2362        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Sql);
2363        let s = p.sanitize_line("SELECT * FROM users WHERE name = 'secret_password'");
2364        assert!(s.contains("SELECT * FROM users WHERE name ="));
2365        assert!(!s.contains("secret_password"));
2366    }
2367
2368    #[test]
2369    fn sql_does_not_mask_double_quoted_as_string() {
2370        // In SQL, double quotes are for identifiers, not strings
2371        // But for simplicity, we don't handle them specially
2372        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Sql);
2373        let s = p.sanitize_line("SELECT \"column\" FROM users");
2374        // SQL string syntax only uses single quotes, so double quotes pass through
2375        assert!(s.contains("SELECT"));
2376        assert!(s.contains("column")); // Not masked because SQL uses single quotes
2377    }
2378
2379    #[test]
2380    fn sql_single_dash_not_comment() {
2381        // A single dash should not start a comment in SQL
2382        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Sql);
2383        let s = p.sanitize_line("SELECT a - b FROM table");
2384        assert!(s.contains("SELECT a - b FROM table"));
2385    }
2386
2387    #[test]
2388    fn sql_preserves_line_length() {
2389        let mut p =
2390            Preprocessor::with_language(PreprocessOptions::comments_and_strings(), Language::Sql);
2391        let line = "SELECT 'hello' -- comment";
2392        let s = p.sanitize_line(line);
2393        assert_eq!(s.len(), line.len());
2394    }
2395
2396    #[test]
2397    fn sql_does_not_mask_hash_in_string() {
2398        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Sql);
2399        let s = p.sanitize_line("SELECT * WHERE name = '-- not a comment' -- real comment");
2400        assert!(s.contains("-- not a comment"));
2401        assert!(!s.contains("real comment"));
2402    }
2403
2404    // ==================== XML/HTML-specific tests ====================
2405
2406    #[test]
2407    fn xml_language_from_str() {
2408        assert_eq!("xml".parse::<Language>().unwrap(), Language::Xml);
2409        assert_eq!("html".parse::<Language>().unwrap(), Language::Xml);
2410        assert_eq!("xhtml".parse::<Language>().unwrap(), Language::Xml);
2411        assert_eq!("svg".parse::<Language>().unwrap(), Language::Xml);
2412        assert_eq!("xsl".parse::<Language>().unwrap(), Language::Xml);
2413        assert_eq!("xslt".parse::<Language>().unwrap(), Language::Xml);
2414    }
2415
2416    #[test]
2417    fn xml_comment_syntax() {
2418        assert_eq!(Language::Xml.comment_syntax(), CommentSyntax::Xml);
2419    }
2420
2421    #[test]
2422    fn xml_string_syntax() {
2423        assert_eq!(Language::Xml.string_syntax(), StringSyntax::Xml);
2424    }
2425
2426    #[test]
2427    fn xml_masks_comments() {
2428        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Xml);
2429        let s = p.sanitize_line("<div><!-- secret comment --></div>");
2430        assert!(s.contains("<div>"));
2431        assert!(s.contains("</div>"));
2432        assert!(!s.contains("secret"));
2433    }
2434
2435    #[test]
2436    fn xml_multiline_comment() {
2437        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Xml);
2438        let s1 = p.sanitize_line("<div><!-- start comment");
2439        assert!(s1.contains("<div>"));
2440        assert!(!s1.contains("start"));
2441
2442        let s2 = p.sanitize_line("hidden content");
2443        assert!(!s2.contains("hidden"));
2444
2445        let s3 = p.sanitize_line("end comment --></div>");
2446        assert!(!s3.contains("end comment"));
2447        assert!(s3.contains("</div>"));
2448    }
2449
2450    #[test]
2451    fn xml_masks_double_quoted_attributes() {
2452        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Xml);
2453        let s = p.sanitize_line("<input type=\"password\" value=\"secret\">");
2454        assert!(s.contains("<input type="));
2455        assert!(s.contains("value="));
2456        assert!(!s.contains("password"));
2457        assert!(!s.contains("secret"));
2458    }
2459
2460    #[test]
2461    fn xml_masks_single_quoted_attributes() {
2462        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Xml);
2463        let s = p.sanitize_line("<input type='password' value='secret'>");
2464        assert!(s.contains("<input type="));
2465        assert!(s.contains("value="));
2466        assert!(!s.contains("password"));
2467        assert!(!s.contains("secret"));
2468    }
2469
2470    #[test]
2471    fn xml_mixed_quotes() {
2472        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Xml);
2473        let s = p.sanitize_line("<div class=\"myclass\" id='myid'>");
2474        assert!(s.contains("<div class="));
2475        assert!(s.contains("id="));
2476        assert!(!s.contains("myclass"));
2477        assert!(!s.contains("myid"));
2478    }
2479
2480    #[test]
2481    fn xml_preserves_line_length() {
2482        let mut p =
2483            Preprocessor::with_language(PreprocessOptions::comments_and_strings(), Language::Xml);
2484        let line = "<div class=\"test\"><!-- comment --></div>";
2485        let s = p.sanitize_line(line);
2486        assert_eq!(s.len(), line.len());
2487    }
2488
2489    #[test]
2490    fn xml_comment_delimiter_not_in_string() {
2491        // When masking only comments, strings should preserve their content
2492        // The <!-- inside the string should NOT be treated as a comment start
2493        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Xml);
2494        let s = p.sanitize_line("<div data-comment=\"<!-- not a comment -->\"><!-- real --></div>");
2495        // The string content is preserved because we're only masking comments
2496        assert!(s.contains("<!-- not a comment -->"));
2497        // The real comment is masked
2498        assert!(!s.contains("real"));
2499    }
2500
2501    // ==================== PHP-specific tests ====================
2502
2503    #[test]
2504    fn php_language_from_str() {
2505        assert_eq!("php".parse::<Language>().unwrap(), Language::Php);
2506        assert_eq!("PHP".parse::<Language>().unwrap(), Language::Php);
2507        assert_eq!("Php".parse::<Language>().unwrap(), Language::Php);
2508    }
2509
2510    #[test]
2511    fn php_comment_syntax() {
2512        assert_eq!(Language::Php.comment_syntax(), CommentSyntax::Php);
2513    }
2514
2515    #[test]
2516    fn php_string_syntax() {
2517        assert_eq!(Language::Php.string_syntax(), StringSyntax::Php);
2518    }
2519
2520    #[test]
2521    fn php_masks_double_slash_comments() {
2522        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Php);
2523        let s = p.sanitize_line("$x = 1; // echo secret");
2524        assert!(s.contains("$x = 1;"));
2525        assert!(!s.contains("echo"));
2526        assert!(!s.contains("secret"));
2527    }
2528
2529    #[test]
2530    fn php_masks_hash_comments() {
2531        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Php);
2532        let s = p.sanitize_line("$x = 1; # echo secret");
2533        assert!(s.contains("$x = 1;"));
2534        assert!(!s.contains("echo"));
2535        assert!(!s.contains("secret"));
2536    }
2537
2538    #[test]
2539    fn php_masks_block_comments() {
2540        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Php);
2541        let s = p.sanitize_line("$x = /* echo secret */ 1;");
2542        assert!(s.contains("$x ="));
2543        assert!(s.contains("1;"));
2544        assert!(!s.contains("echo"));
2545        assert!(!s.contains("secret"));
2546    }
2547
2548    #[test]
2549    fn php_multiline_block_comment() {
2550        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Php);
2551        let s1 = p.sanitize_line("$x = 1; /* start comment");
2552        assert!(s1.contains("$x = 1;"));
2553        assert!(!s1.contains("start"));
2554
2555        let s2 = p.sanitize_line("hidden code");
2556        assert!(!s2.contains("hidden"));
2557
2558        let s3 = p.sanitize_line("end comment */ $y = 2;");
2559        assert!(!s3.contains("end comment"));
2560        assert!(s3.contains("$y = 2;"));
2561    }
2562
2563    #[test]
2564    fn php_masks_double_quoted_strings() {
2565        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Php);
2566        let s = p.sanitize_line("$x = \"echo secret\";");
2567        assert!(s.contains("$x ="));
2568        assert!(!s.contains("echo"));
2569        assert!(!s.contains("secret"));
2570    }
2571
2572    #[test]
2573    fn php_masks_single_quoted_strings() {
2574        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Php);
2575        let s = p.sanitize_line("$x = 'echo secret';");
2576        assert!(s.contains("$x ="));
2577        assert!(!s.contains("echo"));
2578        assert!(!s.contains("secret"));
2579    }
2580
2581    #[test]
2582    fn php_string_with_escapes() {
2583        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Php);
2584        let s = p.sanitize_line("$x = \"say \\\"hello\\\"\";");
2585        assert!(s.contains("$x ="));
2586        assert!(!s.contains("say"));
2587        assert!(!s.contains("hello"));
2588    }
2589
2590    #[test]
2591    fn php_hash_not_comment_in_string() {
2592        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Php);
2593        let s = p.sanitize_line("$x = \"# not a comment\"; # real comment");
2594        assert!(s.contains("# not a comment"));
2595        assert!(!s.contains("real comment"));
2596    }
2597
2598    #[test]
2599    fn php_slash_not_comment_in_string() {
2600        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Php);
2601        let s = p.sanitize_line("$x = \"// not a comment\"; // real comment");
2602        assert!(s.contains("// not a comment"));
2603        assert!(!s.contains("real comment"));
2604    }
2605
2606    #[test]
2607    fn php_preserves_line_length() {
2608        let mut p =
2609            Preprocessor::with_language(PreprocessOptions::comments_and_strings(), Language::Php);
2610        let line = "$x = 'hello'; // comment";
2611        let s = p.sanitize_line(line);
2612        assert_eq!(s.len(), line.len());
2613    }
2614
2615    #[test]
2616    fn php_mixed_comments_and_strings() {
2617        let mut p =
2618            Preprocessor::with_language(PreprocessOptions::comments_and_strings(), Language::Php);
2619        let s = p.sanitize_line("echo 'single' . \"double\"; // comment # more");
2620        assert!(s.contains("echo"));
2621        assert!(s.contains("."));
2622        assert!(!s.contains("single"));
2623        assert!(!s.contains("double"));
2624        assert!(!s.contains("comment"));
2625        assert!(!s.contains("more"));
2626    }
2627
2628    // ==================== YAML/TOML/JSON-specific tests ====================
2629
2630    #[test]
2631    fn yaml_hash_comment_ignored() {
2632        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Yaml);
2633        let s = p.sanitize_line("key: value # secret");
2634        assert!(s.contains("key: value"));
2635        assert!(!s.contains("secret"));
2636    }
2637
2638    #[test]
2639    fn toml_hash_comment_ignored() {
2640        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Toml);
2641        let s = p.sanitize_line("name = \"app\" # local");
2642        assert!(s.contains("name = \"app\""));
2643        assert!(!s.contains("local"));
2644    }
2645
2646    #[test]
2647    fn jsonc_double_slash_comment_ignored() {
2648        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Json);
2649        let s = p.sanitize_line("{\"key\": \"value\" // trailing note");
2650        assert!(s.contains("{\"key\": \"value\""));
2651        assert!(!s.contains("trailing note"));
2652    }
2653
2654    #[test]
2655    fn mode_debug_formats_variants() {
2656        let modes = [
2657            Mode::RawString { hashes: 2 },
2658            Mode::Char { escaped: false },
2659            Mode::TemplateLiteral { escaped: true },
2660            Mode::TripleQuotedString {
2661                escaped: false,
2662                quote: b'"',
2663            },
2664            Mode::ShellLiteralString,
2665            Mode::ShellAnsiCString { escaped: false },
2666            Mode::XmlComment,
2667        ];
2668
2669        for mode in modes {
2670            let rendered = format!("{:?}", mode);
2671            assert!(!rendered.is_empty());
2672        }
2673    }
2674
2675    #[test]
2676    fn rust_raw_and_byte_strings_masked() {
2677        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Rust);
2678        let line = "let a = r#\"raw\"#; let b = b\"byte\";";
2679        let s = p.sanitize_line(line);
2680        assert!(s.contains("let a ="));
2681        assert!(s.contains("let b ="));
2682        assert!(!s.contains("raw"));
2683        assert!(!s.contains("byte"));
2684    }
2685
2686    #[test]
2687    fn python_triple_quoted_string_handles_escapes() {
2688        let mut p =
2689            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Python);
2690        let line = "x = \"\"\"a\\\\b\"\"\"";
2691        let s = p.sanitize_line(line);
2692        assert!(s.contains("x ="));
2693        assert!(!s.contains("a\\\\b"));
2694    }
2695
2696    #[test]
2697    fn shell_ansi_c_string_masks_and_escapes() {
2698        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Shell);
2699        let line = "echo $'a\\\\n'";
2700        let s = p.sanitize_line(line);
2701        assert!(s.contains("echo"));
2702        assert!(!s.contains("a"));
2703        assert!(!s.contains("n"));
2704    }
2705
2706    #[test]
2707    fn php_block_comments_masked() {
2708        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Php);
2709        let line = "<?php /* block */ echo $x; ?>";
2710        let s = p.sanitize_line(line);
2711        assert!(s.contains("echo"));
2712        assert!(!s.contains("block"));
2713    }
2714
2715    #[test]
2716    fn line_comment_resets_mode() {
2717        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Rust);
2718        let s1 = p.sanitize_line("// comment");
2719        assert!(!s1.contains("comment"));
2720        let s2 = p.sanitize_line("let x = 1;");
2721        assert!(s2.contains("let x = 1;"));
2722    }
2723
2724    #[test]
2725    fn nested_block_comment_masks_nested() {
2726        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Rust);
2727        let line = "/* outer /* inner */ tail */ let x = 1;";
2728        let s = p.sanitize_line(line);
2729        assert!(s.contains("let x = 1;"));
2730        assert!(!s.contains("outer"));
2731        assert!(!s.contains("inner"));
2732        assert!(!s.contains("tail"));
2733    }
2734
2735    #[test]
2736    fn xml_comment_masks_and_closes() {
2737        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Xml);
2738        let line = "<!-- hi --> <tag>";
2739        let s = p.sanitize_line(line);
2740        assert!(s.contains("<tag>"));
2741        assert!(!s.contains("hi"));
2742    }
2743
2744    #[test]
2745    fn rust_byte_string_masks_when_strings_only() {
2746        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Rust);
2747        let s = p.sanitize_line(r#"let b = b"bytes";"#);
2748        assert!(!s.contains("bytes"));
2749    }
2750
2751    #[test]
2752    fn rust_raw_string_masks_end_delimiter() {
2753        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Rust);
2754        let _ = p.sanitize_line("let s = r#\"raw");
2755        let end = p.sanitize_line("\"#;");
2756        assert!(!end.contains("\"#"));
2757    }
2758
2759    #[test]
2760    fn shell_ansi_c_string_masks_prefix_and_body() {
2761        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Shell);
2762        let s = p.sanitize_line("$'a\\n'");
2763        assert!(s.trim().is_empty());
2764    }
2765
2766    #[test]
2767    fn shell_literal_string_masks_body() {
2768        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Shell);
2769        let s = p.sanitize_line("'abc'");
2770        assert!(s.trim().is_empty());
2771    }
2772
2773    #[test]
2774    fn swift_double_quoted_masks_when_strings_only() {
2775        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Swift);
2776        let s = p.sanitize_line("let s = \"hello\";");
2777        assert!(!s.contains("hello"));
2778    }
2779
2780    #[test]
2781    fn php_block_comment_masks_opening() {
2782        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Php);
2783        let s = p.sanitize_line("/* php block */");
2784        assert!(!s.contains("php"));
2785    }
2786
2787    #[test]
2788    fn line_comment_mode_branch_executes() {
2789        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Rust);
2790        p.mode = Mode::LineComment;
2791        let _ = p.sanitize_line("still comment");
2792        assert!(p.mode == Mode::Normal);
2793    }
2794
2795    #[test]
2796    fn triple_quoted_string_masks_closing() {
2797        let mut p =
2798            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Python);
2799        let _ = p.sanitize_line("s = '''hello");
2800        let end = p.sanitize_line("world'''");
2801        assert!(!end.contains("'''"));
2802    }
2803
2804    #[test]
2805    fn rust_raw_and_byte_strings_preserved_when_strings_not_masked() {
2806        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Rust);
2807        let line = "let a = r#\"raw\"#; let b = b\"byte\";";
2808        let s = p.sanitize_line(line);
2809        assert!(s.contains("raw"));
2810        assert!(s.contains("byte"));
2811    }
2812
2813    #[test]
2814    fn python_triple_quoted_string_preserved_when_strings_not_masked() {
2815        let mut p =
2816            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Python);
2817        let line = "x = \"\"\"triple\"\"\"";
2818        let s = p.sanitize_line(line);
2819        assert!(s.contains("triple"));
2820    }
2821
2822    #[test]
2823    fn shell_strings_preserved_when_strings_not_masked() {
2824        let mut p =
2825            Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Shell);
2826        let line = "echo $'a\\n' 'b'";
2827        let s = p.sanitize_line(line);
2828        assert!(s.contains("a"));
2829        assert!(s.contains("b"));
2830    }
2831
2832    #[test]
2833    fn template_literal_escape_branches_execute() {
2834        let mut p =
2835            Preprocessor::with_language(PreprocessOptions::strings_only(), Language::JavaScript);
2836        let line: String = ['`', '\\', '`', 'x', '`'].iter().collect();
2837        let s = p.sanitize_line(&line);
2838        assert_eq!(s.len(), line.len());
2839    }
2840
2841    #[test]
2842    fn php_slash_not_comment_in_mask_comments_mode() {
2843        let mut p = Preprocessor::with_language(PreprocessOptions::comments_only(), Language::Php);
2844        let line = "/x";
2845        let s = p.sanitize_line(line);
2846        assert_eq!(s, line);
2847    }
2848
2849    #[test]
2850    fn block_comment_mode_without_masking_handles_nested_and_close() {
2851        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Rust);
2852        p.mode = Mode::BlockComment { depth: 1 };
2853        let _ = p.sanitize_line("/*");
2854        assert!(p.mode == Mode::BlockComment { depth: 2 });
2855
2856        p.mode = Mode::BlockComment { depth: 1 };
2857        let _ = p.sanitize_line("*/");
2858        assert!(p.mode == Mode::Normal);
2859    }
2860
2861    #[test]
2862    fn xml_comment_mode_without_masking_handles_close() {
2863        let mut p = Preprocessor::with_language(PreprocessOptions::strings_only(), Language::Xml);
2864        p.mode = Mode::XmlComment;
2865        let _ = p.sanitize_line("-->");
2866        assert!(p.mode == Mode::Normal);
2867    }
2868}