repotoire 0.8.0

//! Shared text-level utilities for detectors that scan raw line content.
//!
//! Distinct from `detectors/security/ast_helpers.rs`, which contains AST/
//! tree-sitter-based helpers. This module hosts pure-text helpers that
//! detectors apply to lines of source code BEFORE running regex / substring
//! matching.
//!
//! The motivating case: regex-based detectors that match patterns on raw
//! line text (e.g. `\bnew Promise\(`, `\.then\(`) will fire false positives
//! when the pattern appears inside a string literal — for example, a
//! generated TypeScript file that embeds commit messages as data, where
//! the message text happens to contain "Promise" or ".then(" verbatim.
//! Stripping string-literal contents to spaces before running pattern
//! matching eliminates this class of FP without changing column offsets.
//!
//! This module depends only on the Rust standard library. It MUST NOT depend
//! on any other module in the crate, to keep the import graph simple.

/// Replace the contents of JS/TS string literals on a line with spaces.
///
/// Returns a string of identical byte length to the input where bytes inside
/// `'...'`, `"..."`, and `` `...` `` literals are replaced with ASCII
/// spaces. The opening/closing quote bytes themselves are preserved so
/// column offsets remain stable for downstream span reporting.
///
/// **Backslash escapes** (e.g. `\"`, `\\`) inside string literals are
/// also blanked (backslash and the byte after it), so an escaped quote
/// cannot terminate the literal early and escape sequences cannot
/// reintroduce pattern bytes.
///
/// **Template-literal interpolations** `${...}` are scanned as code: the
/// expression bytes are left untouched so callers still see them, while
/// string literals *nested inside* the interpolation are blanked like any
/// other literal. Braces inside such nested literals do not take part in
/// finding the `}` that closes the interpolation. Nesting (a template
/// inside an interpolation inside a template, ...) is handled to any depth.
///
/// # Limitations
///
/// * This is a single-line helper. Multi-line string literals (template
///   literals that span lines, strings continued with a trailing `\`) are
///   scanned per line; a line that starts mid-literal is treated as if it
///   started in code, because the helper has no cross-line state.
/// * An unterminated literal blanks everything up to the end of the line.
/// * The scanner has no notion of comments or regex literals: a quote byte
///   appearing in either is treated as opening a string literal.
///
/// Use this on the line text BEFORE applying any regex/substring match
/// that could be confused by data appearing inside string literals.
///
/// # Examples
///
/// ```ignore
/// use repotoire::detectors::text_utils::strip_string_literals;
/// // Pattern "Promise" inside a string literal is blanked:
/// assert_eq!(
///     strip_string_literals(r#"const desc = "fix Promise leak";"#),
///     r#"const desc = "                ";"#,
/// );
/// // Real code is preserved:
/// assert_eq!(
///     strip_string_literals(r#"new Promise((res) => res(1))"#),
///     r#"new Promise((res) => res(1))"#,
/// );
/// // Template-literal interpolations are kept scannable:
/// let stripped = strip_string_literals(r#"`prefix ${new Promise(r)} suffix`"#);
/// assert!(stripped.contains("new Promise"));
/// // ...but strings nested inside an interpolation are still blanked:
/// let stripped = strip_string_literals(r#"`${label("Promise")}`"#);
/// assert!(!stripped.contains("Promise"));
/// ```
pub fn strip_string_literals(line: &str) -> String {
    let bytes = line.as_bytes();
    let mut out = Vec::with_capacity(bytes.len());
    // Innermost lexical context is the last element; an empty stack means
    // plain top-level code. An explicit stack (rather than recursion) keeps
    // deeply nested templates from growing the call stack.
    let mut scopes: Vec<Scope> = Vec::new();
    let mut i = 0;
    while i < bytes.len() {
        let b = bytes[i];
        match scopes.last_mut() {
            Some(Scope::Literal(quote)) => {
                let quote = *quote;
                if b == b'\\' && i + 1 < bytes.len() {
                    // Escape sequence: blank the backslash and the escaped
                    // byte together so `\"` cannot close the literal.
                    out.extend_from_slice(b"  ");
                    i += 2;
                    continue;
                }
                if quote == b'`' && b == b'$' && bytes.get(i + 1) == Some(&b'{') {
                    // Enter an interpolation: its contents are code again.
                    out.extend_from_slice(b"${");
                    scopes.push(Scope::Interpolation(1));
                    i += 2;
                    continue;
                }
                if b == quote {
                    out.push(b);
                    scopes.pop();
                } else {
                    // Literal content. Multi-byte UTF-8 characters are
                    // blanked byte-wise, which keeps the byte length equal.
                    out.push(b' ');
                }
            }
            Some(Scope::Interpolation(depth)) => {
                // Code inside `${...}` is copied through verbatim.
                out.push(b);
                match b {
                    b'\'' | b'"' | b'`' => scopes.push(Scope::Literal(b)),
                    b'{' => *depth += 1,
                    b'}' => {
                        // `depth` is >= 1 while this scope is on the stack,
                        // so the subtraction cannot underflow.
                        *depth -= 1;
                        if *depth == 0 {
                            scopes.pop();
                        }
                    }
                    _ => {}
                }
            }
            None => {
                out.push(b);
                if matches!(b, b'\'' | b'"' | b'`') {
                    scopes.push(Scope::Literal(b));
                }
            }
        }
        i += 1;
    }
    // Every byte is either copied verbatim or replaced by an ASCII space, and
    // all context switches happen on ASCII bytes, so a multi-byte sequence is
    // always copied or blanked as a whole and the buffer stays valid UTF-8.
    // The fallback only exists to avoid a panic path.
    String::from_utf8(out).unwrap_or_else(|_| line.to_string())
}

/// Lexical context tracked by [`strip_string_literals`] while scanning a line.
#[derive(Debug, Clone, Copy)]
enum Scope {
    /// Inside a string literal that was opened by the given quote byte.
    Literal(u8),
    /// Inside a template-literal `${...}` interpolation. The value is the
    /// number of currently unmatched `{`, counting the one from `${`.
    Interpolation(usize),
}

#[cfg(test)]
mod tests {
    use super::strip_string_literals;

    /// Contents of a `"..."` literal become spaces; the quotes stay put.
    #[test]
    fn blanks_double_quoted_contents() {
        let input = r#"const x = "Promise.then";"#;
        let expected = r#"const x = "            ";"#;
        assert_eq!(strip_string_literals(input), expected);
    }

    /// Contents of a `'...'` literal become spaces; the quotes stay put.
    #[test]
    fn blanks_single_quoted_contents() {
        let input = r#"const x = 'Promise.then';"#;
        let expected = r#"const x = '            ';"#;
        assert_eq!(strip_string_literals(input), expected);
    }

    /// A line without any string literal must come back unchanged.
    #[test]
    fn preserves_real_code_outside_strings() {
        let line = r#"new Promise((res) => res(1)).then(handle);"#;
        let stripped = strip_string_literals(line);
        assert_eq!(stripped, line);
    }

    /// The template text is blanked, but the `${...}` expression survives.
    #[test]
    fn keeps_template_interpolations_scannable() {
        let input = r#"`prefix ${new Promise(r)} suffix`"#;
        let stripped = strip_string_literals(input);
        assert!(stripped.contains("new Promise"));
        assert!(stripped.contains("${"));
    }

    /// An escaped quote (`\"`) must not end the literal early.
    #[test]
    fn blanks_escape_sequences() {
        let input = r#"const x = "a\"Promise";"#;
        let stripped = strip_string_literals(input);
        assert!(!stripped.contains("Promise"));
        assert_eq!(stripped.len(), input.len());
    }

    /// Byte length is unchanged, so column offsets computed on the original
    /// line remain valid on the stripped line.
    #[test]
    fn preserves_length_for_column_stability() {
        let line = r#"const desc = "fix Promise leak"; new Promise(r);"#;
        let stripped = strip_string_literals(line);
        assert_eq!(stripped.len(), line.len());

        // Offsets are taken from the ORIGINAL line and applied to the output.
        let inside_idx = line.find("\"fix Promise").unwrap();
        let outside_idx = line.find("new Promise").unwrap();

        // The occurrence inside the literal is gone; the real call remains.
        let literal_region = &stripped[inside_idx..outside_idx];
        let code_region = &stripped[outside_idx..];
        assert!(!literal_region.contains("Promise"));
        assert!(code_region.contains("Promise"));
    }
}