regex_ansi/lib.rs
1//! regex_ansi: ANSI escape sequence matching.
2//!
3//! High‑fidelity Rust port of the JavaScript `ansi-regex` pattern.
4
5use regex::Regex;
6use std::sync::LazyLock;
7
8// Pattern derived to align with ansi-regex JS semantics (v6 era):
9// Matches:
// 1. OSC: ESC ] ... (terminated by BEL | ESC \ | 0x9C) – non-greedy.
11// 2. CSI / control sequences with parameter bytes and a final byte in valid range.
12// 3. Other 2-byte escape sequences used by some terminals.
13// This pattern intentionally does not attempt to validate every numeric range; it
14// mirrors practical coverage of color/style + link sequences.
// The pattern is assembled with `concat!` from string literals into a single `&str` constant at compile time.
16
17// Simplified & Rust-regex-compatible form (negated char class for OSC body):
18// - OSC: ESC ] then any bytes except BEL, ESC, 0x9C lazily until a terminator BEL | ESC \ | 0x9C
19// - CSI/other: same final-byte class as ansi-regex JS.
20// Adjusted pattern for Rust `regex` crate limitations:
21// - Replace nested `[[` consumption with an explicit character class that does
22// not prematurely introduce an unclosed set (Clippy previously flagged it).
23// - Equivalent intent: match ESC or single-byte CSI, then zero+ of the allowed
24// parameter/intermediate bytes, then a final byte from the upstream class.
25// Pattern components:
// 1. OSC: ESC ] ... (terminator BEL | ESC \ | 0x9C) with a lazy body; the terminator is mandatory, so an incomplete OSC doesn't match.
27// 2. CSI (two forms): ESC [ ... final OR single 0x9B ... final.
28// 3. VT52 & short escapes: ESC followed by a single char from allowed set.
29// 4. Charset selection: ESC ( or ) then one of A B 0 1 2.
30// 5. DEC line/screen alignment etc with '#'.
31// These extra explicit branches ensure ESC A etc match while ESC ] (incomplete OSC) does not.
/// Regular-expression source (in `regex` crate syntax) matching ANSI escape sequences.
///
/// The value is one alternation assembled at compile time from the branches
/// annotated below. Every branch after the first carries its own leading `|`.
pub const ANSI_REGEX_PATTERN: &str = concat!(
    // OSC: ESC ] followed by a body that excludes BEL, ESC and 0x9C, closed by
    // one of the terminators BEL, ESC-backslash or 0x9C.
    r"(?:\x1B\][^\x07\x1B\x9C]*?(?:\x07|\x1B\\|\x9C))",
    // CSI opened by the two-character introducer ESC [ : optional prefix
    // characters, optional numeric parameters separated by ; or :, one final.
    r"|(?:\x1B\[[\[\]()#;?]*(?:[0-9]{1,4}(?:[;:][0-9]{0,4})*)?[0-9A-PR-TZcf-nq-uy=><~])",
    // CSI opened by the single-character introducer 0x9B; same tail as above.
    r"|(?:\x9B[\[\]()#;?]*(?:[0-9]{1,4}(?:[;:][0-9]{0,4})*)?[0-9A-PR-TZcf-nq-uy=><~])",
    // VT52 / short escapes: ESC plus exactly one final character. Besides the
    // cursor and mode keys this includes E (NEL), M (RI), c (reset) and m.
    r"|(?:\x1B[ABCDHIKJSTZ=><sum78EMcNO])",
    // Charset selection: ESC ( X or ESC ) X with X one of A, B, 0, 1, 2.
    r"|(?:\x1B[()][AB012])",
    // Hash sequences: ESC # followed by 3, 4, 5, 6 or 8.
    r"|(?:\x1B#[34568])",
    // Fallback for bare `ESC <digits> n` (no `[`), which appears in fixtures;
    // the bracketed form such as ESC [ 5 n is already handled by the CSI branch.
    r"|(?:\x1B[0-9]+n)",
);
55
56static ANSI_REGEX_GLOBAL: LazyLock<Regex> =
57 LazyLock::new(|| Regex::new(ANSI_REGEX_PATTERN).expect("valid ANSI regex"));
58
59// For first-match semantics we can reuse same pattern; users just use methods like find.
60static ANSI_REGEX_FIRST: LazyLock<Regex> =
61 LazyLock::new(|| Regex::new(ANSI_REGEX_PATTERN).expect("valid ANSI regex"));
62
63/// Return the compiled global-style ANSI regex (intended for finding all matches).
64pub fn ansi_regex() -> &'static Regex {
65 &ANSI_REGEX_GLOBAL
66}
67
68/// Return the compiled first-match ANSI regex (semantic helper; identical underlying pattern).
69pub fn ansi_regex_first() -> &'static Regex {
70 &ANSI_REGEX_FIRST
71}
72
73/// Return the raw ANSI regex pattern string.
74pub fn pattern() -> &'static str {
75 ANSI_REGEX_PATTERN
76}
77
#[cfg(test)]
mod tests {
    use super::*;

    /// Forcing the lazy regex panics if the pattern is rejected by `regex`.
    #[test]
    fn pattern_compiles() {
        let _compiled = ansi_regex();
    }

    /// Two SGR sequences around plain text yield exactly two matches.
    #[test]
    fn matches_basic_csi() {
        let found: Vec<&str> = ansi_regex()
            .find_iter("\x1b[31mRed\x1b[0m")
            .map(|hit| hit.as_str())
            .collect();
        assert_eq!(found.len(), 2);
        assert!(found[0].starts_with("\x1b[31m"));
    }

    /// OSC 8 hyperlinks are stripped for each supported string terminator.
    #[test]
    fn osc_hyperlink_variants() {
        // BEL, ESC-backslash and 0x9C, in that order.
        let terminators = ["\x07", "\x1b\\", "\u{9C}"];
        for st in terminators {
            let input = format!("\x1b]8;;https://example.com{st}label\x1b]8;;{st}");
            assert_eq!(ansi_regex().replace_all(&input, ""), "label");
        }
    }

    /// Square brackets without a preceding ESC are not escape sequences.
    #[test]
    fn no_false_positive_plain_brackets() {
        assert!(!ansi_regex().is_match("[not an escape]"));
    }

    /// A lone ESC with nothing after it must not match.
    #[test]
    fn incomplete_escape_left_intact() {
        assert!(!ansi_regex().is_match("\x1b"));
    }

    /// Stripping a few thousand coloured lines keeps the text and drops the codes.
    #[test]
    fn large_text_performance_safety() {
        let input = "Line \x1b[32mGREEN\x1b[0m end\n".repeat(2_000);
        let stripped = ansi_regex().replace_all(&input, "");
        assert!(stripped.contains("GREEN"));
        assert!(!stripped.contains("\x1b[32m"));
    }
}
134}