Skip to main content

intelli_shell/utils/
string.rs

1use std::sync::LazyLock;
2
3use regex::Regex;
4use unidecode::unidecode;
5
6/// Converts all types of newline sequences (`\r`, `\n`, `\r\n`) in a string to a single newline character (`\n`).
7///
8/// This is useful for normalizing text input that might come from different operating systems or sources with
9/// inconsistent line endings.
10///
11/// # Examples
12///
13/// ```rust
14/// # use intelli_shell::utils::unify_newlines;
15/// let text = "Hello\r\nWorld\nAnother\rLine";
16/// let unified = unify_newlines(text);
17/// assert_eq!(unified, "Hello\nWorld\nAnother\nLine");
18/// ```
19pub fn unify_newlines(str: impl AsRef<str>) -> String {
20    /// Regex to match various newline sequences (`\r`, `\n`, `\r\n`)
21    static NEW_LINES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"\r\n|\r|\n"#).unwrap());
22
23    NEW_LINES.replace_all(str.as_ref(), "\n").to_string()
24}
25
26/// Removes newline sequences and any surrounding whitespace, replacing them with a single space.
27///
28/// This function is useful for converting multi-line text into a single line while preserving word separation.
29/// It collapses multiple lines and adjacent whitespace into one space.
30///
31/// # Examples
32///
33/// ```rust
34/// # use intelli_shell::utils::remove_newlines;
35/// let text = "Line 1\n  Line 2 \r\n\tLine 3";
36/// let single_line = remove_newlines(text);
37/// assert_eq!(single_line, "Line 1 Line 2 Line 3");
38///
39/// // Example with potentially escaped newline
40/// let text_escaped = "Line A \\\n Line B";
41/// let single_line_escaped = remove_newlines(text_escaped);
42/// assert_eq!(single_line_escaped, "Line A Line B");
43/// ```
44pub fn remove_newlines(str: impl AsRef<str>) -> String {
45    /// Regex to match newline sequences potentially surrounded by whitespace.
46    ///
47    /// It also handles an optional backslash (`\`) preceding the newline, which might indicate an escaped newline in
48    /// shell contexts.
49    static NEW_LINE_AND_SPACES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"\s*(\\)?(\r\n|\r|\n)\s*"#).unwrap());
50
51    NEW_LINE_AND_SPACES.replace_all(str.as_ref(), " ").to_string()
52}
53
54/// Normalizes a string by performing ASCII transliteration and converting to lowercase.
55///
56/// This uses the [unidecode] crate to approximate non-ASCII characters with their closest ASCII equivalents, and then
57/// converts the entire string to lowercase. Then, remove any non-alphanumeric character and consecutive whitespaces,
58/// returning the trimmed string.
59///
60/// # Examples
61///
62/// ```rust
63/// # use intelli_shell::utils::flatten_str;
64/// let text = "Héllö Wörld! (-123) ";
65/// let flattened = flatten_str(text);
66/// assert_eq!(flattened, "hello world -123");
67/// ```
68pub fn flatten_str(s: impl AsRef<str>) -> String {
69    /// Regex to match any non-allowed character on the flattened version
70    static FLAT_STRING_FORBIDDEN_CHARS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[^a-z0-9\s-]").unwrap());
71
72    flatten(s, &FLAT_STRING_FORBIDDEN_CHARS)
73}
74
75/// Normalizes a variable name string by performing ASCII transliteration and converting to lowercase.
76///
77/// This uses the [unidecode] crate to approximate non-ASCII characters with their closest ASCII equivalents, and then
78/// converts the entire string to lowercase. Then, remove any non-allowed character and consecutive whitespaces,
79/// returning the trimmed string.
80///
81/// # Examples
82///
83/// ```rust
84/// # use intelli_shell::utils::flatten_variable_name;
85/// let variable = "  SÉCOND Part ";
86/// let flattened = flatten_variable_name(variable);
87/// assert_eq!(flattened, "second part");
88/// ```
89pub fn flatten_variable_name(variable_name: impl AsRef<str>) -> String {
90    /// Regex to match any non-allowed character on the flattened version of a variable
91    static VARIABLE_FORBIDDEN_CHARS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[^a-z0-9\s_:-]").unwrap());
92
93    flatten(variable_name, &VARIABLE_FORBIDDEN_CHARS)
94}
95
96fn flatten(s: impl AsRef<str>, forbidden_chars: &Regex) -> String {
97    // Unidecode and lowercase
98    let decoded = unidecode(s.as_ref()).to_lowercase();
99
100    // Keep only allowed characters
101    let flattened = forbidden_chars.replace_all(&decoded, "");
102
103    /// Regex to match consecutive whitespaces
104    static FLATTEN_COLLAPSE_WHITESPACE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s+").unwrap());
105
106    // Remove consecutive whitespaces
107    FLATTEN_COLLAPSE_WHITESPACE_REGEX
108        .replace_all(&flattened, " ")
109        .trim()
110        .to_string()
111}
112
113/// Extracts the root command from a shell command string, skipping environment variables and common prefixes
114/// like `sudo`, `time`, etc., as well as shell operators like `&&` or `;`.
115pub fn extract_root_cmd(command: &str) -> Option<String> {
116    fn is_env_var(s: &str) -> bool {
117        // Handle PowerShell: `$env:VAR=val`, Nushell `$env.VAR=val`
118        let s = s.trim_start_matches("$env:").trim_start_matches("$env.");
119
120        let mut parts = s.splitn(2, '=');
121        let name = parts.next().unwrap_or("");
122
123        if name.is_empty() || parts.next().is_none() {
124            return false;
125        }
126
127        // Allow basic alphanumeric, underscore, and dots/colons from some shells
128        name.chars().all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '.' || c == ':')
129    }
130
131    let parts = match shell_words::split(command) {
132        Ok(p) => p,
133        Err(_) => command.split_whitespace().map(|s| s.to_string()).collect(),
134    };
135
136    let mut skip_next_n = 0;
137
138    for (i, part) in parts.iter().enumerate() {
139        if skip_next_n > 0 {
140            skip_next_n -= 1;
141            continue;
142        }
143
144        let p = part.as_str();
145
146        // Strip trailing semicolons or other separators that might have attached to the word in fallback parsing
147        let p = p.strip_suffix(';').unwrap_or(p);
148
149        if is_env_var(p) {
150            continue;
151        }
152
153        match p {
154            "&&" | "||" | ";" | "|" | "sudo" | "doas" | "time" | "env" | "function" | "def" | "def-env" | "export" => {
155                continue;
156            }
157            // Nushell assignment like: `let-env VAR = "val"` or `$env.VAR = "val"`
158            "let-env" | "let" | "mut" => {
159                // Skips variable name, `=`, and value (e.g. `let-env VAR = val`)
160                // In some cases it's just `let VAR = val`
161                if parts.get(i + 2).map(|s| s.as_str()) == Some("=") {
162                    skip_next_n = 3;
163                } else if parts.get(i + 1).map(|s| s.as_str()) == Some("=") {
164                    // maybe `let-env = val` ?
165                    skip_next_n = 2;
166                } else {
167                    // let VAR val (unlikely in Nushell, but just to be safe)
168                    skip_next_n = 2;
169                }
170                continue;
171            }
172            // Fish assignment like `set -x VAR val` or `set VAR val`
173            "set" => {
174                // Skip everything in `parts` until we hit a delimiter, then let the loop resume from there.
175                let mut skipped = 0;
176                for next_part in parts.iter().skip(i + 1) {
177                    let next_part_stripped = next_part.as_str().strip_suffix(';').unwrap_or(next_part.as_str());
178                    if matches!(next_part_stripped, ";" | "&&" | "||" | "|") {
179                        break;
180                    }
181                    skipped += 1;
182                    if next_part.as_str().ends_with(';') {
183                        // The token has `;` attached to it (e.g. `val;`).
184                        break;
185                    }
186                }
187                skip_next_n = skipped;
188                continue;
189            }
190            _ => {}
191        }
192
193        // if the part itself is `$env.VAR` and the next part is `=`
194        if (p.starts_with("$env.") || p.starts_with("$env:"))
195            && parts.get(i + 1).map(|s| s.as_str()) == Some("=") {
196                skip_next_n = 2; // skip `=` and `val`
197                continue;
198            }
199
200        if p.starts_with('-') {
201            continue;
202        }
203
204        let trimmed = p.strip_suffix("()").unwrap_or(p).to_string();
205        if !trimmed.is_empty() {
206            return Some(trimmed);
207        }
208    }
209    None
210}
211
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `extract_root_cmd` over a table of `(command, expected root)` pairs covering several shells.
    #[test]
    fn test_extract_root_cmd() {
        let cases: &[(&str, Option<&str>)] = &[
            // POSIX-like shells
            ("VAR1=val1 VAR2=\"val2\" root argument", Some("root")),
            ("VAR4='value 4' && root argument", Some("root")),
            ("VAR5=val\\ 5 ; root argument", Some("root")),
            ("sudo root arg1 arg2", Some("root")),
            ("time sudo root arg1 arg2", Some("root")),
            ("env VAR=1 root arg1", Some("root")),
            ("root arg1", Some("root")),
            ("", None),
            ("VAR=val", None),
            ("my_fn() { echo a; }", Some("my_fn")),
            ("function my_fn() { echo a; }", Some("my_fn")),
            ("function my_fn { echo a; }", Some("my_fn")),
            ("ENV={{variable-name:kebab}} function my_fn() { echo a; }", Some("my_fn")),
            // PowerShell
            ("$env:VAR=\"val\"; root argument", Some("root")),
            ("$env:VAR=val; root argument", Some("root")),
            // Nushell
            ("let-env VAR = \"val\"; root argument", Some("root")),
            ("let VAR = \"val\"; root argument", Some("root")),
            ("$env.VAR = \"val\"; root argument", Some("root")),
            ("def my_fn [] { echo a }", Some("my_fn")),
            ("def-env my_fn [] { echo a }", Some("my_fn")),
            // Fish
            ("env VAR=val root argument", Some("root")),
            ("function my_fn; echo a; end", Some("my_fn")),
            ("export VAR=val; root argument", Some("root")),
            ("set -x VAR val; root argument", Some("root")),
        ];

        for (command, expected) in cases {
            assert_eq!(
                extract_root_cmd(command).as_deref(),
                *expected,
                "unexpected root command for {command:?}"
            );
        }
    }
}