Skip to main content

aster/
utils.rs

1use tokio_util::sync::CancellationToken;
2use unicode_normalization::UnicodeNormalization;
3
/// Report whether `c` belongs to the Unicode Tags Block (U+E0000 through U+E007F).
///
/// Characters in this block are invisible, which makes them usable for
/// steganographic attacks.
fn is_in_unicode_tag_range(c: char) -> bool {
    ('\u{E0000}'..='\u{E007F}').contains(&c)
}
9
10pub fn contains_unicode_tags(text: &str) -> bool {
11    text.chars().any(is_in_unicode_tag_range)
12}
13
14/// Sanitize Unicode Tags Block characters from text
15pub fn sanitize_unicode_tags(text: &str) -> String {
16    let normalized: String = text.nfc().collect();
17
18    normalized
19        .chars()
20        .filter(|&c| !is_in_unicode_tag_range(c))
21        .collect()
22}
23
/// Safely truncate a string at character boundaries, not byte boundaries.
///
/// Multi-byte UTF-8 characters (Japanese, emoji, etc.) are never split in the
/// middle, so this cannot panic on a non-boundary byte offset. The returned
/// string never contains more than `max_chars` characters.
///
/// # Arguments
/// * `s` - The string to truncate
/// * `max_chars` - Maximum number of characters in the result, including the
///   `"..."` marker when one is added
///
/// # Returns
/// * `s` unchanged when it already fits within `max_chars`.
/// * Otherwise the first `max_chars - 3` characters followed by `"..."`.
/// * When `max_chars` is below 3 there is no room for the marker, so the
///   first `max_chars` characters are returned without it.
pub fn safe_truncate(s: &str, max_chars: usize) -> String {
    const ELLIPSIS: &str = "...";
    // The marker is ASCII, so its byte length equals its character count.
    const ELLIPSIS_CHARS: usize = ELLIPSIS.len();

    // A character at index `max_chars` exists only if the string is too long;
    // this stops scanning at the limit instead of counting the whole input.
    if s.chars().nth(max_chars).is_none() {
        return s.to_string();
    }

    // Too small a budget for the marker: emitting "..." would exceed the limit.
    if max_chars < ELLIPSIS_CHARS {
        return s.chars().take(max_chars).collect();
    }

    let kept: String = s.chars().take(max_chars - ELLIPSIS_CHARS).collect();
    format!("{}{}", kept, ELLIPSIS)
}
43
44pub fn is_token_cancelled(cancellation_token: &Option<CancellationToken>) -> bool {
45    cancellation_token
46        .as_ref()
47        .is_some_and(|t| t.is_cancelled())
48}
49
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_contains_unicode_tags() {
        // Test detection of Unicode Tags Block characters, including both
        // ends of the range.
        assert!(contains_unicode_tags("Hello\u{E0041}world"));
        assert!(contains_unicode_tags("\u{E0000}"));
        assert!(contains_unicode_tags("\u{E007F}"));
        assert!(!contains_unicode_tags("Hello world"));
        assert!(!contains_unicode_tags("Hello 世界 🌍"));
        assert!(!contains_unicode_tags(""));
    }

    #[test]
    fn test_sanitize_unicode_tags() {
        // Test that Unicode Tags Block characters are removed
        let malicious = "Hello\u{E0041}\u{E0042}\u{E0043}world"; // Invisible "ABC"
        let cleaned = sanitize_unicode_tags(malicious);
        assert_eq!(cleaned, "Helloworld");
    }

    #[test]
    fn test_sanitize_unicode_tags_preserves_legitimate_unicode() {
        // Test that legitimate Unicode characters are preserved
        let clean_text = "Hello world 世界 🌍";
        let cleaned = sanitize_unicode_tags(clean_text);
        assert_eq!(cleaned, clean_text);
    }

    #[test]
    fn test_sanitize_unicode_tags_empty_string() {
        let empty = "";
        let cleaned = sanitize_unicode_tags(empty);
        assert_eq!(cleaned, "");
    }

    #[test]
    fn test_sanitize_unicode_tags_only_malicious() {
        // Test string containing only Unicode Tags characters
        let only_malicious = "\u{E0041}\u{E0042}\u{E0043}";
        let cleaned = sanitize_unicode_tags(only_malicious);
        assert_eq!(cleaned, "");
    }

    #[test]
    fn test_sanitize_unicode_tags_mixed_content() {
        // Test mixed legitimate and malicious Unicode
        let mixed = "Hello\u{E0041} 世界\u{E0042} 🌍\u{E0043}!";
        let cleaned = sanitize_unicode_tags(mixed);
        assert_eq!(cleaned, "Hello 世界 🌍!");
    }

    #[test]
    fn test_safe_truncate_ascii() {
        assert_eq!(safe_truncate("hello world", 20), "hello world");
        assert_eq!(safe_truncate("hello world", 8), "hello...");
        assert_eq!(safe_truncate("hello", 5), "hello");
        assert_eq!(safe_truncate("hello", 3), "...");
    }

    #[test]
    fn test_safe_truncate_japanese() {
        // Japanese characters: "こんにちは世界" (Hello World), 7 characters
        let japanese = "こんにちは世界";
        assert_eq!(safe_truncate(japanese, 10), japanese);
        assert_eq!(safe_truncate(japanese, 5), "こん...");
        assert_eq!(safe_truncate(japanese, 7), japanese);
    }

    #[test]
    fn test_safe_truncate_mixed() {
        // Mixed ASCII and Japanese
        let mixed = "Hello こんにちは";
        assert_eq!(safe_truncate(mixed, 20), mixed);
        assert_eq!(safe_truncate(mixed, 8), "Hello...");
    }
}