Skip to main content

redstr/transformations/
unicode.rs

1use crate::rng::SimpleRng;
2
3/// Replaces characters with random Unicode variations.
4///
5/// Substitutes common letters with accented or modified Unicode variants.
6/// For example, 'a' might become 'à', 'á', 'â', 'ã', or 'ä'. This creates
7/// visually similar text that can bypass simple string matching while testing
8/// Unicode normalization and internationalization handling.
9///
10/// # Use Cases
11///
12/// - **Red Team**: Bypass keyword filters with Unicode lookalikes
13/// - **Blue Team**: Test Unicode normalization in security controls
14/// - **i18n Testing**: Verify internationalized text handling
15/// - **Content Filters**: Test accent-insensitive matching
16///
17/// # Examples
18///
19/// ```
20/// use redstr::unicode_variations;
21///
22/// let result = unicode_variations("admin");
23/// // Example output: "ädmïn" or "àdmîn" or "ádmįn" (varies each run)
24/// assert_eq!(result.chars().count(), 5);
25///
26/// // Bypass keyword filters
27/// let word = unicode_variations("password");
28/// // Example: "pässwörd" or "pàsswôrd"
29///
30/// // Test Unicode normalization
31/// let email = unicode_variations("user@example.com");
32/// // Example: "ûsér@ëxämplè.çöm"
33/// ```
34pub fn unicode_variations(input: &str) -> String {
35    let mut rng = SimpleRng::new();
36
37    input
38        .chars()
39        .map(|c| {
40            let lower = c.to_lowercase().to_string();
41            match lower.as_str() {
42                "a" => ["a", "à", "á", "â", "ã", "ä", "å", "ā", "ă"][rng.next() as usize % 9],
43                "e" => ["e", "è", "é", "ê", "ë", "ē", "ĕ", "ė"][rng.next() as usize % 8],
44                "i" => ["i", "ì", "í", "î", "ï", "ī", "ĭ", "į"][rng.next() as usize % 8],
45                "o" => ["o", "ò", "ó", "ô", "õ", "ö", "ō", "ŏ"][rng.next() as usize % 8],
46                "u" => ["u", "ù", "ú", "û", "ü", "ū", "ŭ", "ů"][rng.next() as usize % 8],
47                "c" => ["c", "ç", "ć", "ĉ", "ċ", "č"][rng.next() as usize % 6],
48                "n" => ["n", "ñ", "ń", "ņ", "ň"][rng.next() as usize % 5],
49                "s" => ["s", "ś", "ŝ", "ş", "š"][rng.next() as usize % 5],
50                _ => return c.to_string(),
51            }
52            .to_string()
53        })
54        .collect()
55}
56
57/// Adds zalgo combining characters to create corrupted-looking text.
58///
59/// Attaches 1-3 random Unicode combining diacritical marks to each alphabetic
60/// character, creating "zalgo text" that appears corrupted or glitchy. These
61/// combining characters stack above and below letters, useful for testing
62/// display rendering and Unicode edge cases.
63///
64/// # Use Cases
65///
66/// - **Display Testing**: Verify how systems render combining characters
67/// - **Blue Team**: Test input sanitization and character filtering
68/// - **DoS Testing**: Check if excessive combining marks cause issues
69/// - **Unicode Handling**: Validate proper Unicode normalization
70///
71/// # Examples
72///
73/// ```
74/// use redstr::zalgo_text;
75///
76/// let result = zalgo_text("test");
77/// // Example output: "t̃̂e̊̋s̈̃t̂̃" (with combining marks)
78/// assert!(result.len() > 4);
79///
80/// // Create glitchy-looking text
81/// let username = zalgo_text("admin");
82/// // Output looks like: "a̅̆d̃m̂ĭn̈" (rendered with marks above/below)
83///
84/// // Test display systems
85/// let message = zalgo_text("hello");
86/// // Creates visually corrupted text for testing
87/// ```
88pub fn zalgo_text(input: &str) -> String {
89    let mut rng = SimpleRng::new();
90    let combining_chars = [
91        '\u{0300}', '\u{0301}', '\u{0302}', '\u{0303}', '\u{0304}', '\u{0305}', '\u{0306}',
92        '\u{0307}', '\u{0308}', '\u{0309}', '\u{030A}', '\u{030B}', '\u{030C}', '\u{030D}',
93        '\u{030E}', '\u{030F}', '\u{0310}', '\u{0311}', '\u{0312}', '\u{0313}', '\u{0314}',
94        '\u{0315}', '\u{0316}', '\u{0317}',
95    ];
96
97    input
98        .chars()
99        .map(|c| {
100            let mut result = c.to_string();
101            if c.is_alphabetic() {
102                let count = (rng.next() % 3) + 1;
103                for _ in 0..count {
104                    let idx = rng.next() as usize % combining_chars.len();
105                    result.push(combining_chars[idx]);
106                }
107            }
108            result
109        })
110        .collect()
111}
112
113/// Substitutes characters with similar-looking homoglyphs.
114///
115/// Randomly replaces Latin letters with visually identical or similar Cyrillic
116/// characters (e.g., Latin 'a' → Cyrillic 'а', Latin 'e' → Cyrillic 'е').
117/// About 33% of eligible characters are substituted. This is the core technique
118/// for homograph/IDN spoofing attacks where `example.com` becomes `еxample.com`.
119///
120/// # Use Cases
121///
122/// - **Phishing Testing**: Create lookalike domains for security training
123/// - **IDN Spoofing**: Test internationalized domain name vulnerabilities
124/// - **Red Team**: Bypass domain/URL whitelists and filters
125/// - **Blue Team**: Validate homograph detection systems
126///
127/// # Examples
128///
129/// ```
130/// use redstr::homoglyph_substitution;
131///
132/// let result = homoglyph_substitution("example");
133/// // Example output: "ехаmple" (Cyrillic е and а, Latin m,p,l)
134/// // Looks identical but uses different Unicode codepoints
135///
136/// // Phishing domain generation
137/// let domain = homoglyph_substitution("paypal.com");
138/// // Example: "pаypаl.com" (Cyrillic а instead of Latin a)
139///
140/// // Email spoofing test
141/// let email = homoglyph_substitution("admin@company.com");
142/// // Example: "аdmin@compаny.com" (Cyrillic characters)
143///
144/// // Number lookalikes
145/// let pin = homoglyph_substitution("2021");
146/// // Example: "2О2l" (Letter O and l instead of 0 and 1)
147/// ```
148pub fn homoglyph_substitution(input: &str) -> String {
149    let mut rng = SimpleRng::new();
150
151    input
152        .chars()
153        .map(|c| {
154            if rng.next() % 3 != 0 {
155                return c.to_string();
156            }
157
158            match c {
159                'a' | 'A' => "а", // Cyrillic а
160                'e' | 'E' => "е", // Cyrillic е
161                'o' | 'O' => "о", // Cyrillic о
162                'p' | 'P' => "р", // Cyrillic р
163                'c' | 'C' => "с", // Cyrillic с
164                'x' | 'X' => "х", // Cyrillic х
165                'i' | 'I' => "і", // Cyrillic і
166                '0' => "О",       // Letter O
167                '1' => "l",       // Letter l
168                _ => return c.to_string(),
169            }
170            .to_string()
171        })
172        .collect()
173}
174
175/// Replaces regular spaces with various Unicode space characters.
176///
177/// Substitutes ASCII space characters (U+0020) with random Unicode space
178/// variants including non-breaking space (U+00A0) and various em/en spaces.
179/// These look identical but have different codepoints, useful for testing
180/// whitespace normalization and parser robustness.
181///
182/// # Use Cases
183///
184/// - **WAF Bypass**: Evade filters that only check for ASCII spaces
185/// - **Blue Team**: Test whitespace normalization in parsers
186/// - **Input Validation**: Verify proper handling of Unicode spaces
187/// - **SQL Injection**: Use non-breaking spaces to bypass filters
188///
189/// # Examples
190///
191/// ```
192/// use redstr::space_variants;
193///
194/// let result = space_variants("hello world");
195/// // Looks identical: "hello world"
196/// // But may contain U+00A0, U+2000, U+2001, etc. instead of U+0020
197/// assert_eq!(result.chars().filter(|c| c.is_whitespace()).count(), 1);
198///
199/// // SQL injection with Unicode spaces
200/// let sql = space_variants("SELECT * FROM users");
201/// // Uses non-breaking spaces to bypass filters
202///
203/// // Test whitespace normalization
204/// let text = space_variants("word1 word2 word3");
205/// // Visually identical but with mixed Unicode spaces
206/// ```
207pub fn space_variants(input: &str) -> String {
208    let mut rng = SimpleRng::new();
209    let spaces = [
210        ' ', '\u{00A0}', '\u{2000}', '\u{2001}', '\u{2002}', '\u{2003}',
211    ];
212
213    input
214        .chars()
215        .map(|c| {
216            if c == ' ' {
217                spaces[rng.next() as usize % spaces.len()].to_string()
218            } else {
219                c.to_string()
220            }
221        })
222        .collect()
223}
224
225/// Generates unicode normalization variations (NFD, NFC, NFKC, NFKD concepts).
226///
227/// Creates variations that represent similar Unicode normalization concepts
228/// by randomly replacing characters with composed or decomposed forms, or
229/// adding combining characters. Tests how systems handle different Unicode
230/// normalization forms (NFC, NFD, NFKC, NFKD).
231///
232/// # Use Cases
233///
234/// - **Security Testing**: Test if systems normalize Unicode properly
235/// - **Bypass Filters**: Exploit inconsistent Unicode handling
236/// - **Blue Team**: Validate Unicode normalization in security controls
237/// - **Data Quality**: Test string comparison and matching
238///
239/// # Examples
240///
241/// ```
242/// use redstr::unicode_normalize_variants;
243///
244/// let result = unicode_normalize_variants("café");
245/// // May produce composed (café) or decomposed (cafe\u{0301}) forms
246/// assert!(result.len() >= 4);
247/// ```
248pub fn unicode_normalize_variants(input: &str) -> String {
249    let mut rng = SimpleRng::new();
250
251    input
252        .chars()
253        .map(|c| {
254            // Use compatibility characters and combining marks
255            match c {
256                'a' | 'A' => {
257                    let variants = ["a", "а", "a", "\u{0061}\u{0301}"]; // Latin, Cyrillic, fullwidth, with combining acute
258                    variants[rng.next() as usize % variants.len()]
259                }
260                'e' | 'E' => {
261                    let variants = ["e", "е", "e", "\u{0065}\u{0301}"];
262                    variants[rng.next() as usize % variants.len()]
263                }
264                'o' | 'O' => {
265                    let variants = ["o", "о", "o", "\u{006F}\u{0301}"];
266                    variants[rng.next() as usize % variants.len()]
267                }
268                _ => return c.to_string(),
269            }
270            .to_string()
271        })
272        .collect()
273}
274
#[cfg(test)]
mod tests {
    //! Unit tests for the Unicode transformations.
    //!
    //! All transformations are randomized, so these tests only assert
    //! properties that hold for every possible random outcome, except where a
    //! retry loop is used explicitly.

    use super::*;

    // ---- homoglyph_substitution ----

    #[test]
    fn test_homoglyph_contains_cyrillic() {
        // Substitution is random, so whether a Cyrillic character shows up in
        // a given run cannot be asserted. What always holds for this input is
        // that every character is either left as ASCII or swapped for exactly
        // one character from the Cyrillic block.
        for _ in 0..10 {
            let result = homoglyph_substitution("aeopcx");
            assert_eq!(result.chars().count(), 6);
            assert!(result
                .chars()
                .all(|c| c.is_ascii() || ('\u{0400}'..='\u{04FF}').contains(&c)));
        }
    }

    #[test]
    fn test_homoglyph_empty() {
        assert_eq!(homoglyph_substitution(""), "");
    }

    #[test]
    fn test_homoglyph_single_char() {
        let result = homoglyph_substitution("a");
        assert!(!result.is_empty());
    }

    #[test]
    fn test_homoglyph_domain() {
        let result = homoglyph_substitution("example.com");
        assert!(result.contains('.'));
    }

    #[test]
    fn test_homoglyph_email() {
        let result = homoglyph_substitution("admin@example.com");
        assert!(result.contains('@'));
    }

    #[test]
    fn test_homoglyph_numbers() {
        let result = homoglyph_substitution("test01");
        // 0 and 1 may be substituted
        assert!(!result.is_empty());
    }

    #[test]
    fn test_homoglyph_preserves_length_approximate() {
        let input = "paypal";
        let result = homoglyph_substitution(input);
        // Byte length may differ due to multi-byte chars, so compare chars.
        assert!(result.chars().count() <= input.chars().count() + 2);
    }

    #[test]
    fn test_homoglyph_special_chars() {
        let result = homoglyph_substitution("test!@#");
        assert!(result.contains("!@#"));
    }

    #[test]
    fn test_homoglyph_uppercase() {
        let result = homoglyph_substitution("AEOPC");
        assert!(!result.is_empty());
    }

    #[test]
    fn test_homoglyph_phishing() {
        let result = homoglyph_substitution("secure");
        assert!(!result.is_empty());
    }

    // ---- unicode_variations ----

    #[test]
    fn test_unicode_variations_changes_chars() {
        // Run multiple times to ensure some variation happens
        let mut changed = false;
        for _ in 0..10 {
            let result = unicode_variations("aeiou");
            if result != "aeiou" {
                changed = true;
                break;
            }
        }
        assert!(changed);
    }

    #[test]
    fn test_unicode_variations_empty() {
        assert_eq!(unicode_variations(""), "");
    }

    #[test]
    fn test_unicode_variations_preserves_count() {
        let result = unicode_variations("admin");
        assert_eq!(result.chars().count(), 5);
    }

    #[test]
    fn test_unicode_variations_consonants() {
        let result = unicode_variations("xyz");
        assert_eq!(result.len(), 3);
    }

    #[test]
    fn test_unicode_variations_numbers() {
        let result = unicode_variations("test123");
        assert!(result.contains("123"));
    }

    #[test]
    fn test_unicode_variations_special_chars() {
        let result = unicode_variations("test!@#");
        assert!(result.contains("!@#"));
    }

    #[test]
    fn test_unicode_variations_uppercase() {
        let result = unicode_variations("AEIOU");
        assert_eq!(result.chars().count(), 5);
    }

    #[test]
    fn test_unicode_variations_mixed_case() {
        let result = unicode_variations("TeSt");
        assert_eq!(result.chars().count(), 4);
    }

    #[test]
    fn test_unicode_variations_long_string() {
        let result = unicode_variations("administrator");
        assert!(result.chars().count() >= 13);
    }

    #[test]
    fn test_unicode_variations_password() {
        let result = unicode_variations("password");
        assert!(!result.is_empty());
    }

    // ---- space_variants ----

    #[test]
    fn test_space_variants_preserves_length() {
        let input = "hello world test";
        let result = space_variants(input);
        assert_eq!(result.chars().count(), input.chars().count());
    }

    #[test]
    fn test_space_variants_empty() {
        assert_eq!(space_variants(""), "");
    }

    #[test]
    fn test_space_variants_no_spaces() {
        let result = space_variants("hello");
        assert_eq!(result, "hello");
    }

    #[test]
    fn test_space_variants_single_space() {
        let result = space_variants("a b");
        assert_eq!(result.chars().count(), 3);
    }

    #[test]
    fn test_space_variants_multiple_spaces() {
        let result = space_variants("a b c d");
        assert_eq!(result.chars().filter(|c| c.is_whitespace()).count(), 3);
    }

    #[test]
    fn test_space_variants_preserves_words() {
        let result = space_variants("hello world");
        assert!(result.contains("hello") && result.contains("world"));
    }

    #[test]
    fn test_space_variants_sql_injection() {
        let result = space_variants("SELECT * FROM users");
        assert!(result.contains("SELECT"));
    }

    #[test]
    fn test_space_variants_numbers() {
        let result = space_variants("test 123");
        assert!(result.contains("123"));
    }

    #[test]
    fn test_space_variants_special_chars() {
        let result = space_variants("test @ test");
        assert!(result.contains('@'));
    }

    #[test]
    fn test_space_variants_waf_bypass() {
        let result = space_variants("admin login");
        assert!(!result.is_empty());
    }

    // ---- unicode_normalize_variants ----

    #[test]
    fn test_unicode_normalize_variants() {
        let result = unicode_normalize_variants("cafe");
        assert!(result.len() >= 4);
    }

    #[test]
    fn test_unicode_normalize_variants_empty() {
        assert_eq!(unicode_normalize_variants(""), "");
    }

    #[test]
    fn test_unicode_normalize_variants_single_char() {
        let result = unicode_normalize_variants("a");
        assert!(!result.is_empty());
    }

    #[test]
    fn test_unicode_normalize_variants_no_variants() {
        let result = unicode_normalize_variants("xyz");
        assert_eq!(result, "xyz");
    }

    #[test]
    fn test_unicode_normalize_variants_mixed() {
        let result = unicode_normalize_variants("test");
        assert!(!result.is_empty());
    }

    #[test]
    fn test_unicode_normalize_variants_numbers() {
        let result = unicode_normalize_variants("test123");
        assert!(result.contains("123"));
    }

    #[test]
    fn test_unicode_normalize_variants_special_chars() {
        let result = unicode_normalize_variants("test!@#");
        assert!(result.contains("!@#"));
    }

    #[test]
    fn test_unicode_normalize_variants_uppercase() {
        let result = unicode_normalize_variants("AOE");
        assert!(!result.is_empty());
    }

    #[test]
    fn test_unicode_normalize_variants_preserves_non_variant() {
        let result = unicode_normalize_variants("xyz123");
        assert!(result.contains("xyz") && result.contains("123"));
    }

    #[test]
    fn test_unicode_normalize_variants_bypass() {
        let result = unicode_normalize_variants("robot");
        assert!(!result.is_empty());
    }

    // ---- zalgo_text ----

    #[test]
    fn test_zalgo_text_empty() {
        assert_eq!(zalgo_text(""), "");
    }

    #[test]
    fn test_zalgo_text_single_char() {
        let result = zalgo_text("a");
        assert!(result.len() > 1);
    }

    #[test]
    fn test_zalgo_text_increases_length() {
        let input = "test";
        let result = zalgo_text(input);
        assert!(result.len() > input.len());
    }

    #[test]
    fn test_zalgo_text_preserves_base_chars() {
        // Marks are only appended, so every base character must survive.
        let result = zalgo_text("abc");
        assert!(result.contains('a') && result.contains('b') && result.contains('c'));
    }

    #[test]
    fn test_zalgo_text_numbers() {
        let result = zalgo_text("test123");
        assert!(result.contains("123"));
    }

    #[test]
    fn test_zalgo_text_special_chars() {
        let result = zalgo_text("test!@#");
        assert!(result.contains("!@#"));
    }

    #[test]
    fn test_zalgo_text_uppercase() {
        let result = zalgo_text("TEST");
        assert!(result.len() > 4);
    }

    #[test]
    fn test_zalgo_text_mixed_case() {
        let result = zalgo_text("TeSt");
        assert!(result.len() > 4);
    }

    #[test]
    fn test_zalgo_text_display_corruption() {
        let result = zalgo_text("hello");
        assert!(result.len() > 5);
    }

    #[test]
    fn test_zalgo_text_unicode_handling() {
        let result = zalgo_text("test");
        assert!(!result.is_empty());
    }
}