rumdl_lib/utils/anchor_styles/common.rs
1//! Common utilities shared across anchor style implementations
2//!
3//! This module provides shared functionality for anchor generation,
4//! including emoji detection, Unicode handling, and regex patterns.
5
6use regex::Regex;
7use std::sync::LazyLock;
8
9// ============================================================================
10// Shared Regex Patterns
11// ============================================================================
12
13/// Control character and dangerous Unicode filtering pattern
14pub static CONTROL_CHARS: LazyLock<Regex> =
15 LazyLock::new(|| Regex::new(r"[\x00-\x1F\x7F-\x9F\u200B-\u200D\uFEFF]").unwrap());
16
17/// Whitespace normalization pattern (tabs, Unicode spaces)
18pub static WHITESPACE_NORMALIZE: LazyLock<Regex> =
19 LazyLock::new(|| Regex::new(r"[\t\u00A0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]").unwrap());
20
21/// Zero-width character pattern for security filtering
22pub static ZERO_WIDTH_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[\u200B-\u200D\u2060\uFEFF]").unwrap());
23
24/// RTL override and dangerous Unicode control pattern
25pub static DANGEROUS_UNICODE_PATTERN: LazyLock<Regex> =
26 LazyLock::new(|| Regex::new(r"[\u202A-\u202E\u2066-\u2069\u061C\u200E\u200F]").unwrap());
27
28// ============================================================================
29// Emoji Detection
30// ============================================================================
31
/// Returns `true` when `c` falls inside one of the common emoji / symbol blocks.
///
/// The test is purely by code point range: every code point in the listed blocks counts,
/// whether or not it is rendered as an emoji. These are the ranges most commonly seen in
/// headings and are shared by all anchor styles.
#[inline]
pub fn is_emoji_or_symbol(c: char) -> bool {
    matches!(
        u32::from(c),
        0x2600..=0x26FF // Miscellaneous Symbols
            | 0x2700..=0x27BF // Dingbats
            | 0x1F300..=0x1F5FF // Miscellaneous Symbols and Pictographs
            | 0x1F600..=0x1F64F // Emoticons
            | 0x1F680..=0x1F6FF // Transport and Map Symbols
            | 0x1F900..=0x1F9FF // Supplemental Symbols and Pictographs
    )
}
48
49/// Extended emoji detection including country flags and keycaps
50///
51/// Used by GitHub style which has more comprehensive emoji handling.
52#[inline]
53pub fn is_emoji_or_symbol_extended(c: char) -> bool {
54 let code = c as u32;
55
56 // Start with basic ranges
57 is_emoji_or_symbol(c)
58 // Additional ranges for GitHub compatibility
59 || (0x1F1E0..=0x1F1FF).contains(&code) // Regional indicator symbols (flags)
60 || (0x1FA00..=0x1FA6F).contains(&code) // Chess symbols
61 || (0x1FA70..=0x1FAFF).contains(&code) // Symbols and Pictographs Extended-A
62 || (0x231A..=0x231B).contains(&code) // Watch, Hourglass
63 || (0x23E9..=0x23F3).contains(&code) // Media control symbols
64 || (0x23F8..=0x23FA).contains(&code) // More media symbols
65 || (0x25AA..=0x25AB).contains(&code) // Small squares
66 || code == 0x25B6 // Play button
67 || code == 0x25C0 // Reverse button
68 || (0x25FB..=0x25FE).contains(&code) // Medium squares
69 || (0x2614..=0x2615).contains(&code) // Umbrella, Hot beverage
70 || (0x2648..=0x2653).contains(&code) // Zodiac symbols
71 || code == 0x267F // Wheelchair symbol
72 || code == 0x2693 // Anchor
73 || code == 0x26A1 // High voltage
74 || (0x26AA..=0x26AB).contains(&code) // White/black circles
75 || (0x26BD..=0x26BE).contains(&code) // Sports balls
76 || (0x26C4..=0x26C5).contains(&code) // Snowman, Sun
77 || code == 0x26CE // Ophiuchus
78 || code == 0x26D4 // No entry
79 || code == 0x26EA // Church
80 || (0x26F2..=0x26F3).contains(&code) // Fountain, Golf
81 || code == 0x26F5 // Sailboat
82 || code == 0x26FA // Tent
83 || code == 0x26FD // Fuel pump
84 || code == 0x2702 // Scissors
85 || code == 0x2705 // Check mark
86 || (0x2708..=0x270D).contains(&code) // Airplane to writing hand
87 || code == 0x270F // Pencil
88 || code == 0x2712 // Black nib
89 || code == 0x2714 // Heavy check
90 || code == 0x2716 // Heavy multiplication
91 || code == 0x271D // Latin cross
92 || code == 0x2721 // Star of David
93 || code == 0x2728 // Sparkles
94 || (0x2733..=0x2734).contains(&code) // Eight spoked asterisk
95 || code == 0x2744 // Snowflake
96 || code == 0x2747 // Sparkle
97 || code == 0x274C // Cross mark
98 || code == 0x274E // Cross mark square
99 || (0x2753..=0x2755).contains(&code) // Question marks
100 || code == 0x2757 // Exclamation mark
101 || (0x2763..=0x2764).contains(&code) // Heart exclamation, heart
102 || (0x2795..=0x2797).contains(&code) // Plus, minus, divide
103 || code == 0x27A1 // Right arrow
104 || code == 0x27B0 // Curly loop
105 || code == 0x27BF // Double curly loop
106 || (0x2934..=0x2935).contains(&code) // Arrows
107 || (0x2B05..=0x2B07).contains(&code) // Arrows
108 || (0x2B1B..=0x2B1C).contains(&code) // Squares
109 || code == 0x2B50 // Star
110 || code == 0x2B55 // Circle
111 || code == 0x3030 // Wavy dash
112 || code == 0x303D // Part alternation mark
113 || code == 0x3297 // Circled Ideograph Congratulation
114 || code == 0x3299 // Circled Ideograph Secret
115 || (0xFE00..=0xFE0F).contains(&code) // Variation selectors (emoji modifiers)
116 || code == 0x200D // Zero-width joiner (used in emoji sequences)
117}
118
119// ============================================================================
120// Unicode Letter Detection
121// ============================================================================
122
/// Selects how strictly [`is_safe_unicode_letter`] decides whether a character is accepted.
///
/// The variants are plain tags with no payload, so the type is `Copy` and supports full
/// equality (`Eq`) in addition to `PartialEq`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum UnicodeLetterMode {
    /// Conservative: ASCII letters plus common Latin extended letters only (Jekyll)
    Conservative,
    /// Permissive: all alphabetic characters except dangerous ranges (KramdownGfm)
    Permissive,
    /// Strict: ASCII letters only (pure Kramdown)
    AsciiOnly,
    /// GitHub: explicit list of safe Unicode ranges with security filtering
    GitHub,
}
135
136/// Check if a character is a safe Unicode letter based on the specified mode
137#[inline]
138pub fn is_safe_unicode_letter(c: char, mode: UnicodeLetterMode) -> bool {
139 match mode {
140 UnicodeLetterMode::AsciiOnly => c.is_ascii_alphabetic(),
141
142 UnicodeLetterMode::Conservative => {
143 // ASCII letters
144 if c.is_ascii_alphabetic() {
145 return true;
146 }
147
148 // Common Latin Extended characters (safe subset)
149 match c as u32 {
150 // Latin-1 Supplement letters (excluding symbols)
151 0x00C0..=0x00D6 | 0x00D8..=0x00F6 | 0x00F8..=0x00FF => true,
152 // Latin Extended-A (common European letters)
153 0x0100..=0x017F => true,
154 // Latin Extended Additional (common subset)
155 0x1E00..=0x1EFF => true,
156 _ => false,
157 }
158 }
159
160 UnicodeLetterMode::Permissive => {
161 // ASCII letters always allowed
162 if c.is_ascii_alphabetic() {
163 return true;
164 }
165
166 // Allow all alphabetic except dangerous ranges
167 if c.is_alphabetic() {
168 let code = c as u32;
169 // Exclude dangerous ranges
170 if (0xE000..=0xF8FF).contains(&code) // Private Use Area
171 || (0xFE00..=0xFE0F).contains(&code) // Variation Selectors
172 || (0x200B..=0x200D).contains(&code) // Zero-width characters
173 || (0x202A..=0x202E).contains(&code)
174 // Bidirectional overrides
175 {
176 return false;
177 }
178 return true;
179 }
180
181 false
182 }
183
184 UnicodeLetterMode::GitHub => {
185 let code = c as u32;
186
187 // Exclude potentially dangerous ranges first
188 if (0xE000..=0xF8FF).contains(&code) // Private Use Area
189 || (0xF0000..=0xFFFFD).contains(&code) // Supplementary Private Use Area-A
190 || (0x100000..=0x10FFFD).contains(&code) // Supplementary Private Use Area-B
191 || (0xFE00..=0xFE0F).contains(&code) // Variation Selectors
192 || (0xE0100..=0xE01EF).contains(&code)
193 // Variation Selectors Supplement
194 {
195 return false;
196 }
197
198 // Allow explicit safe Unicode letter ranges
199 (0x0000..=0x007F).contains(&code) // Basic Latin
200 || (0x0080..=0x00FF).contains(&code) // Latin-1 Supplement
201 || (0x0100..=0x017F).contains(&code) // Latin Extended-A
202 || (0x0180..=0x024F).contains(&code) // Latin Extended-B
203 || (0x0370..=0x03FF).contains(&code) // Greek and Coptic
204 || (0x0400..=0x04FF).contains(&code) // Cyrillic
205 || (0x0500..=0x052F).contains(&code) // Cyrillic Supplement
206 || (0x0590..=0x05FF).contains(&code) // Hebrew
207 || (0x0600..=0x06FF).contains(&code) // Arabic
208 || (0x0700..=0x074F).contains(&code) // Syriac
209 || (0x0750..=0x077F).contains(&code) // Arabic Supplement
210 || (0x1100..=0x11FF).contains(&code) // Hangul Jamo
211 || (0x3040..=0x309F).contains(&code) // Hiragana
212 || (0x30A0..=0x30FF).contains(&code) // Katakana
213 || (0x3130..=0x318F).contains(&code) // Hangul Compatibility Jamo
214 || (0x4E00..=0x9FFF).contains(&code) // CJK Unified Ideographs
215 || (0xAC00..=0xD7AF).contains(&code) // Hangul Syllables (Korean)
216 || (0xA000..=0xA48F).contains(&code) // Yi Syllables
217 || (0xA490..=0xA4CF).contains(&code) // Yi Radicals
218 }
219 }
220}
221
222// ============================================================================
223// Input Validation
224// ============================================================================
225
/// Input length cap applied for security: 10 KiB (10 240).
pub const MAX_INPUT_LENGTH: usize = 10 * 1024;

/// Larger input size cap used for permissive validation: 1 MiB (1 048 576).
pub const MAX_INPUT_SIZE_LARGE: usize = 1 << 20;
231
/// Truncates `input` to at most `max_len` bytes without splitting a UTF-8 character.
///
/// Returns `input` unchanged when it already fits. Otherwise returns the longest prefix of
/// `input` that ends on a character boundary and whose byte length does not exceed
/// `max_len`. When `max_len` falls inside a multi-byte character, that whole character is
/// dropped, so the result may be shorter than `max_len` but is never longer.
///
/// # Examples
///
/// ```ignore
/// assert_eq!(truncate_at_char_boundary("Hello, 世界!", 5), "Hello");
/// // Byte 8 is in the middle of '世', so the character is left out entirely.
/// assert_eq!(truncate_at_char_boundary("Hello, 世界!", 8), "Hello, ");
/// ```
#[inline]
pub fn truncate_at_char_boundary(input: &str, max_len: usize) -> &str {
    if input.len() <= max_len {
        return input;
    }

    // Walk backwards from `max_len` to the nearest char boundary. `max_len < input.len()`
    // holds here, and index 0 is always a boundary, so the search always succeeds; a UTF-8
    // character is at most 4 bytes long, so it takes at most 3 steps back.
    let end = (0..=max_len)
        .rev()
        .find(|&idx| input.is_char_boundary(idx))
        .unwrap_or(0);

    &input[..end]
}
248
/// Unit tests for the shared anchor-style helpers.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_is_emoji_or_symbol() {
        // Basic emojis
        assert!(is_emoji_or_symbol('😀'));
        assert!(is_emoji_or_symbol('🎉'));
        assert!(is_emoji_or_symbol('❤'));

        // Not emojis
        assert!(!is_emoji_or_symbol('a'));
        assert!(!is_emoji_or_symbol('1'));
        assert!(!is_emoji_or_symbol(' '));
    }

    #[test]
    fn test_is_emoji_or_symbol_extended() {
        // Basic emojis
        assert!(is_emoji_or_symbol_extended('😀'));

        // Extended ranges
        assert!(is_emoji_or_symbol_extended('✅')); // 0x2705
        assert!(is_emoji_or_symbol_extended('⭐')); // 0x2B50

        // Not emojis
        assert!(!is_emoji_or_symbol_extended('a'));
    }

    #[test]
    fn test_is_safe_unicode_letter_modes() {
        // ASCII works in all modes
        assert!(is_safe_unicode_letter('a', UnicodeLetterMode::AsciiOnly));
        assert!(is_safe_unicode_letter('a', UnicodeLetterMode::Conservative));
        assert!(is_safe_unicode_letter('a', UnicodeLetterMode::Permissive));
        assert!(is_safe_unicode_letter('a', UnicodeLetterMode::GitHub));

        // Accented chars work in conservative, permissive, and github
        assert!(!is_safe_unicode_letter('é', UnicodeLetterMode::AsciiOnly));
        assert!(is_safe_unicode_letter('é', UnicodeLetterMode::Conservative));
        assert!(is_safe_unicode_letter('é', UnicodeLetterMode::Permissive));
        assert!(is_safe_unicode_letter('é', UnicodeLetterMode::GitHub));

        // CJK works in permissive and github modes
        assert!(!is_safe_unicode_letter('日', UnicodeLetterMode::AsciiOnly));
        assert!(!is_safe_unicode_letter('日', UnicodeLetterMode::Conservative));
        assert!(is_safe_unicode_letter('日', UnicodeLetterMode::Permissive));
        assert!(is_safe_unicode_letter('日', UnicodeLetterMode::GitHub));

        // Greek works in permissive and github modes
        assert!(!is_safe_unicode_letter('α', UnicodeLetterMode::AsciiOnly));
        assert!(!is_safe_unicode_letter('α', UnicodeLetterMode::Conservative));
        assert!(is_safe_unicode_letter('α', UnicodeLetterMode::Permissive));
        assert!(is_safe_unicode_letter('α', UnicodeLetterMode::GitHub));
    }

    #[test]
    fn test_truncate_at_char_boundary() {
        let input = "Hello, 世界!";

        // Within limit
        assert_eq!(truncate_at_char_boundary(input, 100), input);

        // Truncate at ASCII boundary
        assert_eq!(truncate_at_char_boundary(input, 5), "Hello");

        // Truncate doesn't split multi-byte chars. Checking
        // `is_char_boundary(truncated.len())` would be vacuous (the end of any `&str` is
        // always a boundary), so assert properties that can actually fail: the result is a
        // real prefix of the input, something was cut off, and the text past the cut is gone.
        let truncated = truncate_at_char_boundary(input, 8);
        assert!(input.starts_with(truncated));
        assert!(truncated.starts_with("Hello, "));
        assert!(truncated.len() < input.len());
        assert!(!truncated.contains('界'));
        assert!(!truncated.contains('!'));
    }
}