// rumdl_lib/utils/anchor_styles/common.rs

//! Common utilities shared across anchor style implementations
//!
//! This module provides shared functionality for anchor generation,
//! including emoji detection, Unicode handling, and regex patterns.
5
use regex::Regex;
use std::sync::LazyLock;
8
// ============================================================================
// Shared Regex Patterns
// ============================================================================
12
13/// Control character and dangerous Unicode filtering pattern
14pub static CONTROL_CHARS: LazyLock<Regex> =
15    LazyLock::new(|| Regex::new(r"[\x00-\x1F\x7F-\x9F\u200B-\u200D\uFEFF]").unwrap());
16
17/// Whitespace normalization pattern (tabs, Unicode spaces)
18pub static WHITESPACE_NORMALIZE: LazyLock<Regex> =
19    LazyLock::new(|| Regex::new(r"[\t\u00A0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]").unwrap());
20
21/// Zero-width character pattern for security filtering
22pub static ZERO_WIDTH_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[\u200B-\u200D\u2060\uFEFF]").unwrap());
23
24/// RTL override and dangerous Unicode control pattern
25pub static DANGEROUS_UNICODE_PATTERN: LazyLock<Regex> =
26    LazyLock::new(|| Regex::new(r"[\u202A-\u202E\u2066-\u2069\u061C\u200E\u200F]").unwrap());
27
// ============================================================================
// Emoji Detection
// ============================================================================
31
/// Check if a character is an emoji or symbol
///
/// This covers the most common emoji ranges used in headings.
/// Shared by all anchor styles.
///
/// Returns `true` when `c` lies in one of six whole Unicode blocks; every code
/// point inside a listed block matches, whether or not it has an emoji
/// presentation.
#[inline]
pub fn is_emoji_or_symbol(c: char) -> bool {
    matches!(
        c,
        '\u{1F600}'..='\u{1F64F}' // Emoticons
            | '\u{1F300}'..='\u{1F5FF}' // Miscellaneous Symbols and Pictographs
            | '\u{1F680}'..='\u{1F6FF}' // Transport and Map Symbols
            | '\u{1F900}'..='\u{1F9FF}' // Supplemental Symbols and Pictographs
            | '\u{2600}'..='\u{26FF}' // Miscellaneous Symbols
            | '\u{2700}'..='\u{27BF}' // Dingbats
    )
}
48
49/// Extended emoji detection including country flags and keycaps
50///
51/// Used by GitHub style which has more comprehensive emoji handling.
52#[inline]
53pub fn is_emoji_or_symbol_extended(c: char) -> bool {
54    let code = c as u32;
55
56    // Start with basic ranges
57    is_emoji_or_symbol(c)
58        // Additional ranges for GitHub compatibility
59        || (0x1F1E0..=0x1F1FF).contains(&code) // Regional indicator symbols (flags)
60        || (0x1FA00..=0x1FA6F).contains(&code) // Chess symbols
61        || (0x1FA70..=0x1FAFF).contains(&code) // Symbols and Pictographs Extended-A
62        || (0x231A..=0x231B).contains(&code)   // Watch, Hourglass
63        || (0x23E9..=0x23F3).contains(&code)   // Media control symbols
64        || (0x23F8..=0x23FA).contains(&code)   // More media symbols
65        || (0x25AA..=0x25AB).contains(&code)   // Small squares
66        || code == 0x25B6                      // Play button
67        || code == 0x25C0                      // Reverse button
68        || (0x25FB..=0x25FE).contains(&code)   // Medium squares
69        || (0x2614..=0x2615).contains(&code)   // Umbrella, Hot beverage
70        || (0x2648..=0x2653).contains(&code)   // Zodiac symbols
71        || code == 0x267F                      // Wheelchair symbol
72        || code == 0x2693                      // Anchor
73        || code == 0x26A1                      // High voltage
74        || (0x26AA..=0x26AB).contains(&code)   // White/black circles
75        || (0x26BD..=0x26BE).contains(&code)   // Sports balls
76        || (0x26C4..=0x26C5).contains(&code)   // Snowman, Sun
77        || code == 0x26CE                      // Ophiuchus
78        || code == 0x26D4                      // No entry
79        || code == 0x26EA                      // Church
80        || (0x26F2..=0x26F3).contains(&code)   // Fountain, Golf
81        || code == 0x26F5                      // Sailboat
82        || code == 0x26FA                      // Tent
83        || code == 0x26FD                      // Fuel pump
84        || code == 0x2702                      // Scissors
85        || code == 0x2705                      // Check mark
86        || (0x2708..=0x270D).contains(&code)   // Airplane to writing hand
87        || code == 0x270F                      // Pencil
88        || code == 0x2712                      // Black nib
89        || code == 0x2714                      // Heavy check
90        || code == 0x2716                      // Heavy multiplication
91        || code == 0x271D                      // Latin cross
92        || code == 0x2721                      // Star of David
93        || code == 0x2728                      // Sparkles
94        || (0x2733..=0x2734).contains(&code)   // Eight spoked asterisk
95        || code == 0x2744                      // Snowflake
96        || code == 0x2747                      // Sparkle
97        || code == 0x274C                      // Cross mark
98        || code == 0x274E                      // Cross mark square
99        || (0x2753..=0x2755).contains(&code)   // Question marks
100        || code == 0x2757                      // Exclamation mark
101        || (0x2763..=0x2764).contains(&code)   // Heart exclamation, heart
102        || (0x2795..=0x2797).contains(&code)   // Plus, minus, divide
103        || code == 0x27A1                      // Right arrow
104        || code == 0x27B0                      // Curly loop
105        || code == 0x27BF                      // Double curly loop
106        || (0x2934..=0x2935).contains(&code)   // Arrows
107        || (0x2B05..=0x2B07).contains(&code)   // Arrows
108        || (0x2B1B..=0x2B1C).contains(&code)   // Squares
109        || code == 0x2B50                      // Star
110        || code == 0x2B55                      // Circle
111        || code == 0x3030                      // Wavy dash
112        || code == 0x303D                      // Part alternation mark
113        || code == 0x3297                      // Circled Ideograph Congratulation
114        || code == 0x3299                      // Circled Ideograph Secret
115        || (0xFE00..=0xFE0F).contains(&code)   // Variation selectors (emoji modifiers)
116        || code == 0x200D // Zero-width joiner (used in emoji sequences)
117}
118
// ============================================================================
// Unicode Letter Detection
// ============================================================================
122
/// Mode for Unicode letter detection
///
/// Selects which set of characters `is_safe_unicode_letter` treats as
/// acceptable. The enum is a plain, field-less tag, so it is `Copy` and can be
/// compared and hashed.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum UnicodeLetterMode {
    /// Conservative: ASCII + common Latin extended only (Jekyll)
    Conservative,
    /// Permissive: All alphabetic except dangerous ranges (KramdownGfm)
    Permissive,
    /// Strict: ASCII only (pure Kramdown)
    AsciiOnly,
    /// GitHub: Explicit list of safe Unicode ranges with security filtering
    GitHub,
}
135
136/// Check if a character is a safe Unicode letter based on the specified mode
137#[inline]
138pub fn is_safe_unicode_letter(c: char, mode: UnicodeLetterMode) -> bool {
139    match mode {
140        UnicodeLetterMode::AsciiOnly => c.is_ascii_alphabetic(),
141
142        UnicodeLetterMode::Conservative => {
143            // ASCII letters
144            if c.is_ascii_alphabetic() {
145                return true;
146            }
147
148            // Common Latin Extended characters (safe subset)
149            match c as u32 {
150                // Latin-1 Supplement letters (excluding symbols)
151                0x00C0..=0x00D6 | 0x00D8..=0x00F6 | 0x00F8..=0x00FF => true,
152                // Latin Extended-A (common European letters)
153                0x0100..=0x017F => true,
154                // Latin Extended Additional (common subset)
155                0x1E00..=0x1EFF => true,
156                _ => false,
157            }
158        }
159
160        UnicodeLetterMode::Permissive => {
161            // ASCII letters always allowed
162            if c.is_ascii_alphabetic() {
163                return true;
164            }
165
166            // Allow all alphabetic except dangerous ranges
167            if c.is_alphabetic() {
168                let code = c as u32;
169                // Exclude dangerous ranges
170                if (0xE000..=0xF8FF).contains(&code)    // Private Use Area
171                    || (0xFE00..=0xFE0F).contains(&code)    // Variation Selectors
172                    || (0x200B..=0x200D).contains(&code)    // Zero-width characters
173                    || (0x202A..=0x202E).contains(&code)
174                // Bidirectional overrides
175                {
176                    return false;
177                }
178                return true;
179            }
180
181            false
182        }
183
184        UnicodeLetterMode::GitHub => {
185            let code = c as u32;
186
187            // Exclude potentially dangerous ranges first
188            if (0xE000..=0xF8FF).contains(&code)       // Private Use Area
189                || (0xF0000..=0xFFFFD).contains(&code) // Supplementary Private Use Area-A
190                || (0x100000..=0x10FFFD).contains(&code) // Supplementary Private Use Area-B
191                || (0xFE00..=0xFE0F).contains(&code)   // Variation Selectors
192                || (0xE0100..=0xE01EF).contains(&code)
193            // Variation Selectors Supplement
194            {
195                return false;
196            }
197
198            // Allow explicit safe Unicode letter ranges
199            (0x0000..=0x007F).contains(&code)    // Basic Latin
200                || (0x0080..=0x00FF).contains(&code)    // Latin-1 Supplement
201                || (0x0100..=0x017F).contains(&code)    // Latin Extended-A
202                || (0x0180..=0x024F).contains(&code)    // Latin Extended-B
203                || (0x0370..=0x03FF).contains(&code)    // Greek and Coptic
204                || (0x0400..=0x04FF).contains(&code)    // Cyrillic
205                || (0x0500..=0x052F).contains(&code)    // Cyrillic Supplement
206                || (0x0590..=0x05FF).contains(&code)    // Hebrew
207                || (0x0600..=0x06FF).contains(&code)    // Arabic
208                || (0x0700..=0x074F).contains(&code)    // Syriac
209                || (0x0750..=0x077F).contains(&code)    // Arabic Supplement
210                || (0x1100..=0x11FF).contains(&code)    // Hangul Jamo
211                || (0x3040..=0x309F).contains(&code)    // Hiragana
212                || (0x30A0..=0x30FF).contains(&code)    // Katakana
213                || (0x3130..=0x318F).contains(&code)    // Hangul Compatibility Jamo
214                || (0x4E00..=0x9FFF).contains(&code)    // CJK Unified Ideographs
215                || (0xAC00..=0xD7AF).contains(&code)    // Hangul Syllables (Korean)
216                || (0xA000..=0xA48F).contains(&code)    // Yi Syllables
217                || (0xA490..=0xA4CF).contains(&code) // Yi Radicals
218        }
219    }
220}
221
// ============================================================================
// Input Validation
// ============================================================================
225
/// Maximum input length for security (10 KiB = 10 240)
///
/// NOTE(review): presumably measured in bytes, matching `str::len` — confirm
/// against the call sites.
pub const MAX_INPUT_LENGTH: usize = 10 * 1024;
228
/// Maximum input size for permissive validation (1 MiB = 1 048 576)
///
/// NOTE(review): presumably measured in bytes, matching `str::len` — confirm
/// against the call sites.
pub const MAX_INPUT_SIZE_LARGE: usize = 1024 * 1024;
231
/// Truncate input at a safe UTF-8 boundary
///
/// Returns the longest prefix of `input` whose byte length does not exceed
/// `max_len` and which ends on a UTF-8 character boundary. If `input` already
/// fits, it is returned unchanged. When `max_len` falls inside a multi-byte
/// character, that character is dropped entirely, so the result may be
/// shorter than `max_len` but is never longer.
///
/// No allocation takes place; the result borrows from `input`.
#[inline]
pub fn truncate_at_char_boundary(input: &str, max_len: usize) -> &str {
    if input.len() <= max_len {
        return input;
    }

    // Walk back from `max_len` to the closest boundary at or before it.
    // `max_len < input.len()` here, so the index is in range, and index 0 is
    // always a boundary, so the loop terminates without underflow.
    let mut end = max_len;
    while !input.is_char_boundary(end) {
        end -= 1;
    }

    &input[..end]
}
248
#[cfg(test)]
mod tests {
    use super::*;

    /// Basic block-level emoji detection, including block edges.
    #[test]
    fn test_is_emoji_or_symbol() {
        // Basic emojis
        assert!(is_emoji_or_symbol('😀'));
        assert!(is_emoji_or_symbol('🎉'));
        assert!(is_emoji_or_symbol('❤'));

        // Block edges
        assert!(is_emoji_or_symbol('\u{2600}'));
        assert!(is_emoji_or_symbol('\u{27BF}'));
        assert!(!is_emoji_or_symbol('\u{25FF}'));
        assert!(!is_emoji_or_symbol('\u{27C0}'));

        // Not emojis
        assert!(!is_emoji_or_symbol('a'));
        assert!(!is_emoji_or_symbol('1'));
        assert!(!is_emoji_or_symbol(' '));
    }

    /// Extended detection accepts the basic set plus the extra code points.
    #[test]
    fn test_is_emoji_or_symbol_extended() {
        // Basic emojis
        assert!(is_emoji_or_symbol_extended('😀'));

        // Extended ranges
        assert!(is_emoji_or_symbol_extended('✅')); // 0x2705
        assert!(is_emoji_or_symbol_extended('⭐')); // 0x2B50
        assert!(is_emoji_or_symbol_extended('\u{1F1E6}')); // Regional indicator A
        assert!(is_emoji_or_symbol_extended('\u{FE0F}')); // Variation selector-16
        assert!(is_emoji_or_symbol_extended('\u{200D}')); // Zero-width joiner

        // Not emojis
        assert!(!is_emoji_or_symbol_extended('a'));
        assert!(!is_emoji_or_symbol_extended('\u{200C}')); // Zero-width non-joiner
    }

    /// Each mode accepts exactly the scripts it advertises.
    #[test]
    fn test_is_safe_unicode_letter_modes() {
        // ASCII works in all modes
        assert!(is_safe_unicode_letter('a', UnicodeLetterMode::AsciiOnly));
        assert!(is_safe_unicode_letter('a', UnicodeLetterMode::Conservative));
        assert!(is_safe_unicode_letter('a', UnicodeLetterMode::Permissive));
        assert!(is_safe_unicode_letter('a', UnicodeLetterMode::GitHub));

        // Accented chars work in conservative, permissive, and github
        assert!(!is_safe_unicode_letter('é', UnicodeLetterMode::AsciiOnly));
        assert!(is_safe_unicode_letter('é', UnicodeLetterMode::Conservative));
        assert!(is_safe_unicode_letter('é', UnicodeLetterMode::Permissive));
        assert!(is_safe_unicode_letter('é', UnicodeLetterMode::GitHub));

        // CJK works in permissive and github modes
        assert!(!is_safe_unicode_letter('日', UnicodeLetterMode::AsciiOnly));
        assert!(!is_safe_unicode_letter('日', UnicodeLetterMode::Conservative));
        assert!(is_safe_unicode_letter('日', UnicodeLetterMode::Permissive));
        assert!(is_safe_unicode_letter('日', UnicodeLetterMode::GitHub));

        // Greek works in permissive and github modes
        assert!(!is_safe_unicode_letter('α', UnicodeLetterMode::AsciiOnly));
        assert!(!is_safe_unicode_letter('α', UnicodeLetterMode::Conservative));
        assert!(is_safe_unicode_letter('α', UnicodeLetterMode::Permissive));
        assert!(is_safe_unicode_letter('α', UnicodeLetterMode::GitHub));

        // Private-use code points are rejected in every mode
        assert!(!is_safe_unicode_letter('\u{E000}', UnicodeLetterMode::AsciiOnly));
        assert!(!is_safe_unicode_letter('\u{E000}', UnicodeLetterMode::Conservative));
        assert!(!is_safe_unicode_letter('\u{E000}', UnicodeLetterMode::Permissive));
        assert!(!is_safe_unicode_letter('\u{E000}', UnicodeLetterMode::GitHub));
    }

    /// Truncation returns a real prefix and never slices through a character.
    #[test]
    fn test_truncate_at_char_boundary() {
        let input = "Hello, 世界!";

        // Within limit
        assert_eq!(truncate_at_char_boundary(input, 100), input);

        // Truncate at ASCII boundary
        assert_eq!(truncate_at_char_boundary(input, 5), "Hello");

        // Byte 8 lies inside '世' (bytes 7..10). Slicing there would panic, so
        // reaching the assertions already proves no character was split; the
        // result must additionally be a strict prefix of the input.
        let truncated = truncate_at_char_boundary(input, 8);
        assert!(input.starts_with(truncated));
        assert!(truncated.len() < input.len());
        assert!(truncated.starts_with("Hello, "));
    }
}