regex_ansi/
lib.rs

//! regex_ansi: ANSI escape sequence matching.
//!
//! High-fidelity Rust port of the JavaScript `ansi-regex` pattern.
4
5use regex::Regex;
6use std::sync::LazyLock;
7
// Pattern derived to align with ansi-regex JS semantics (v6 era).
// Matches:
// 1. OSC: ESC ] ... (terminated by BEL | ESC \ | 0x9C) – non-greedy.
// 2. CSI / control sequences with parameter bytes and a final byte in the valid range.
// 3. Other 2-byte escape sequences used by some terminals.
// This pattern intentionally does not attempt to validate every numeric range; it
// mirrors practical coverage of color/style + link sequences.
// The pattern is kept as a single constant string (a `concat!` of literals) so that it
// is embedded at compile time.
16
// Simplified & Rust-regex-compatible form (negated char class for the OSC body):
//  - OSC: ESC ] then any characters except BEL, ESC, 0x9C, lazily, until a terminator
//    BEL | ESC \ | 0x9C.
//  - CSI/other: same final-byte class as ansi-regex JS.
// Adjustments for Rust `regex` crate limitations:
// - The upstream `[[` prefix is replaced with an explicit, fully escaped character
//   class so that the pattern never opens an unclosed set (Clippy previously flagged it).
// - The intent is equivalent: match ESC [ or the single-byte CSI, then zero or more of
//   the allowed parameter/intermediate bytes, then a final byte from the upstream class.
// Pattern components:
// 1. OSC: ESC ] ... (terminator BEL | ESC \ | 0x9C) with a lazy body, so an incomplete
//    OSC doesn't match.
// 2. CSI (two forms): ESC [ ... final OR single 0x9B ... final.
// 3. VT52 & short escapes: ESC followed by a single char from the allowed set.
// 4. Charset selection: ESC ( or ESC ) then one of A B 0 1 2.
// 5. DEC line/screen alignment etc. with '#'.
// 6. Status-report fallback: ESC, one or more digits, then 'n'.
// These extra explicit branches ensure ESC A etc. match while ESC ] (incomplete OSC) does not.
/// Source text of the ANSI escape-sequence regex, in `regex`-crate syntax.
///
/// The pattern is an alternation of seven non-capturing branches. Each branch is
/// written as its own raw string literal (so every backslash below is exactly what
/// the regex parser sees) and `concat!` joins them into one `&'static str` at
/// compile time.
pub const ANSI_REGEX_PATTERN: &str = concat!(
    // OSC: ESC ] body, closed by BEL, ESC \ or \x9C. The body class excludes BEL,
    // ESC and \x9C, so an OSC without a terminator cannot match.
    r"(?:\x1B\][^\x07\x1B\x9C]*?(?:\x07|\x1B\\|\x9C))",
    "|",
    // CSI introduced by ESC [: prefix characters, optional numeric parameters
    // separated by ';' or ':', then one final byte.
    r"(?:\x1B\[[\[\]()#;?]*(?:[0-9]{1,4}(?:[;:][0-9]{0,4})*)?[0-9A-PR-TZcf-nq-uy=><~])",
    "|",
    // CSI introduced by the single \x9B control; same tail as the ESC [ form.
    r"(?:\x9B[\[\]()#;?]*(?:[0-9]{1,4}(?:[;:][0-9]{0,4})*)?[0-9A-PR-TZcf-nq-uy=><~])",
    "|",
    // VT52 / short escapes: ESC plus a single final character. The set includes
    // E (NEL), M (RI), c (reset) and m next to the cursor and mode keys.
    r"(?:\x1B[ABCDHIKJSTZ=><sum78EMcNO])",
    "|",
    // Charset selection: ESC ( X or ESC ) X, with X one of A B 0 1 2.
    r"(?:\x1B[()][AB012])",
    "|",
    // Hash sequences: ESC # followed by 3, 4, 5, 6 or 8.
    r"(?:\x1B#[34568])",
    "|",
    // Fallback for status reports written without the '[' (e.g. a bare `ESC 5 n`,
    // which appears in fixtures): ESC, one or more digits, then 'n'.
    r"(?:\x1B[0-9]+n)"
);
55
56static ANSI_REGEX_GLOBAL: LazyLock<Regex> =
57    LazyLock::new(|| Regex::new(ANSI_REGEX_PATTERN).expect("valid ANSI regex"));
58
59// For first-match semantics we can reuse same pattern; users just use methods like find.
60static ANSI_REGEX_FIRST: LazyLock<Regex> =
61    LazyLock::new(|| Regex::new(ANSI_REGEX_PATTERN).expect("valid ANSI regex"));
62
63/// Return the compiled global-style ANSI regex (intended for finding all matches).
64pub fn ansi_regex() -> &'static Regex {
65    &ANSI_REGEX_GLOBAL
66}
67
68/// Return the compiled first-match ANSI regex (semantic helper; identical underlying pattern).
69pub fn ansi_regex_first() -> &'static Regex {
70    &ANSI_REGEX_FIRST
71}
72
73/// Return the raw ANSI regex pattern string.
74pub fn pattern() -> &'static str {
75    ANSI_REGEX_PATTERN
76}
77
#[cfg(test)]
mod tests {
    use super::*;

    /// Forces both lazily built regexes; an invalid pattern would panic here.
    /// Also checks that each accessor exposes the documented pattern text.
    #[test]
    fn pattern_compiles() {
        let _ = ansi_regex();
        assert_eq!(ansi_regex().as_str(), pattern());
        assert_eq!(ansi_regex_first().as_str(), pattern());
    }

    /// A colour-on / colour-off pair yields exactly those two sequences.
    #[test]
    fn matches_basic_csi() {
        let found: Vec<&str> = ansi_regex()
            .find_iter("\x1b[31mRed\x1b[0m")
            .map(|m| m.as_str())
            .collect();
        // Compare whole matches: a prefix check would also accept an over-long match.
        assert_eq!(found, ["\x1b[31m", "\x1b[0m"]);
    }

    /// OSC 8 hyperlinks are stripped for each supported terminator.
    #[test]
    fn osc_hyperlink_variants() {
        let link = |st: &str| format!("\x1b]8;;https://example.com{st}label\x1b]8;;{st}");
        // BEL, ESC backslash, and the single-character 0x9C terminator.
        for terminator in ["\x07", "\x1b\\", "\u{9C}"] {
            let text = link(terminator);
            assert_eq!(ansi_regex().replace_all(&text, ""), "label");
        }
    }

    /// Ordinary square brackets without an introducer are not escape sequences.
    #[test]
    fn no_false_positive_plain_brackets() {
        assert!(!ansi_regex().is_match("[not an escape]"));
    }

    /// A lone ESC with nothing after it is not a sequence.
    #[test]
    fn incomplete_escape_left_intact() {
        assert!(!ansi_regex().is_match("\x1b"));
    }

    /// An OSC that never reaches a terminator must not match at all.
    #[test]
    fn incomplete_osc_left_intact() {
        assert!(!ansi_regex().is_match("\x1b]8;;https://example.com"));
    }

    /// Stripping a few thousand coloured lines keeps the text and drops the codes.
    #[test]
    fn large_text_performance_safety() {
        let input = "Line \x1b[32mGREEN\x1b[0m end\n".repeat(2_000);
        let stripped = ansi_regex().replace_all(&input, "");
        assert!(stripped.contains("GREEN"));
        assert!(!stripped.contains("\x1b[32m"));
    }
}