1use entities;
4use once_cell::sync::Lazy;
5use regex::Regex;
6use std::borrow::Cow;
7use std::collections::HashMap;
8
9const UNESCAPE_MD_RE : &str = r##"\\([!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~])"##;
10const ENTITY_RE : &str = r##"&([A-Za-z#][A-Za-z0-9]{1,31});"##;
11
12static DIGITAL_ENTITY_TEST_RE : Lazy<Regex> = Lazy::new(||
13 Regex::new(r#"(?i)^&#(x[a-f0-9]{1,8}|[0-9]{1,8});$"#).unwrap()
14);
15static UNESCAPE_ALL_RE : Lazy<Regex> = Lazy::new(||
16 Regex::new(&format!("{UNESCAPE_MD_RE}|{ENTITY_RE}")).unwrap()
17);
18
/// Returns `true` if `code`, the number taken from a numeric character
/// reference, is accepted as a character code.
///
/// Rejected values are:
/// - the surrogate range `0xD800..=0xDFFF`;
/// - the range `0xFDD0..=0xFDEF`;
/// - any value whose low 16 bits are `0xFFFE` or `0xFFFF`;
/// - control codes `0x00..=0x08`, `0x0B`, `0x0E..=0x1F` and `0x7F..=0x9F`;
/// - anything above `0x10FFFF`.
///
/// Every accepted value is also accepted by `char::from_u32`, because the
/// surrogate range and everything past `0x10FFFF` are excluded here.
pub fn is_valid_entity_code(code: u32) -> bool {
    // The last two code points of every 64K plane are rejected.
    if matches!(code & 0xFFFF, 0xFFFE | 0xFFFF) {
        return false;
    }

    !matches!(
        code,
        // surrogate halves
        0xD800..=0xDFFF
        // reserved block inside the BMP
        | 0xFDD0..=0xFDEF
        // control codes (tab, LF, FF and CR are deliberately left out)
        | 0x00..=0x08
        | 0x0B
        | 0x0E..=0x1F
        | 0x7F..=0x9F
        // beyond the last code point
        | 0x0011_0000..=u32::MAX
    )
}
45
46pub fn get_entity_from_str(str: &str) -> Option<&'static str> {
53 pub static ENTITIES_HASH : Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
54 let mut mapping = HashMap::new();
55 for e in &entities::ENTITIES {
56 if e.entity.ends_with(';') {
57 mapping.insert(e.entity, e.characters);
58 }
59 }
60 mapping
61 });
62
63 ENTITIES_HASH.get(str).copied()
64}
65
66#[allow(clippy::from_str_radix_10)]
67fn replace_entity_pattern(str: &str) -> Option<String> {
68 if let Some(entity) = get_entity_from_str(str) {
69 Some((*entity).to_owned())
70 } else if let Some(captures) = DIGITAL_ENTITY_TEST_RE.captures(str) {
71 let str = captures.get(1).unwrap().as_str();
72 let code = if str.starts_with('x') || str.starts_with('X') {
73 u32::from_str_radix(&str[1..], 16).unwrap()
74 } else {
75 u32::from_str_radix(str, 10).unwrap()
76 };
77
78 if is_valid_entity_code(code) {
79 Some(char::from_u32(code).unwrap().into())
80 } else {
81 None
82 }
83 } else {
84 None
85 }
86}
87
88pub fn unescape_all(str: &str) -> Cow<'_, str> {
95 if !str.contains('\\') && !str.contains('&') { return Cow::Borrowed(str); }
96
97 UNESCAPE_ALL_RE.replace_all(str, |captures: ®ex::Captures| {
98 let s = captures.get(0).unwrap().as_str();
99 if let Some(m) = captures.get(1) {
100 m.as_str().to_owned()
102 } else if let Some(replacement) = replace_entity_pattern(s) {
103 replacement
105 } else {
106 s.to_owned()
107 }
108 })
109}
110
111pub fn escape_html(str: &str) -> Cow<'_, str> {
117 html_escape::encode_double_quoted_attribute(str)
118}
119
/// Normalizes `str` so that two spellings of the same label compare equal.
///
/// Leading and trailing whitespace is removed, every internal run of
/// whitespace becomes a single space, and the result is lowercased and then
/// uppercased.
pub fn normalize_reference(str: &str) -> String {
    // `split_whitespace` trims both ends and splits on runs of Unicode
    // White_Space, so joining the pieces with one space does the collapsing
    // without needing a compiled regex.
    let mut collapsed = String::with_capacity(str.len());
    for word in str.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }

    // Lowercasing before uppercasing folds letter variants that uppercasing
    // alone would keep distinct (e.g. U+1E9E becomes "SS" only via U+00DF).
    collapsed.to_lowercase().to_uppercase()
}
170
/// Counts the characters of `source` that come after the last occurrence of
/// `char`.
///
/// When `char` does not occur at all, the result is the total number of
/// characters in `source`. The count is in `char`s, not bytes.
pub fn rfind_and_count(source: &str, char: char) -> usize {
    source
        .chars()
        .rev()
        .take_while(|&current| current != char)
        .count()
}
189
190pub fn find_indent_of(line: &str, mut pos: usize) -> (usize, usize) {
199 let mut chars = line[pos..].chars();
200 let mut indent = 0;
201
202 loop {
203 match chars.next() {
204 Some('\t') => {
205 let bs_count = rfind_and_count(&line[..pos], '\t');
206 indent += 4 - bs_count % 4;
207 pos += 1;
208 }
209 Some(' ') => {
210 indent += 1;
211 pos += 1;
212 }
213 _ => return ( indent, pos ),
214 }
215 }
216}
217
218pub fn cut_right_whitespace_with_tabstops(source: &str, indent: i32) -> Cow<'_, str> {
232 let (num_spaces, start) = calc_right_whitespace_with_tabstops(source, indent);
233
234 if num_spaces > 0 {
235 let mut result = " ".repeat(num_spaces);
236 result += &source[start..];
237 Cow::Owned(result)
238 } else {
239 Cow::Borrowed(&source[start..])
240 }
241}
242
243pub fn calc_right_whitespace_with_tabstops(source: &str, mut indent: i32) -> (usize, usize) {
254 let mut start = source.len();
255 let mut chars = source.char_indices().rev();
256
257 while indent > 0 {
258 match chars.next() {
259 Some((pos, '\t')) => {
260 let indent_from_start = rfind_and_count(&source[..pos], '\t');
263 let tab_width = 4 - indent_from_start as i32 % 4;
264
265 if indent < tab_width {
266 return ( indent as usize, start );
267 }
268
269 indent -= tab_width;
270 start = pos;
271 }
272 Some((pos, _)) => {
273 indent -= 1;
274 start = pos;
275 }
276 None => {
277 start = 0;
278 break;
279 }
280 }
281 }
282
283 ( 0, start )
284}
285
286pub fn is_punct_char(ch: char) -> bool {
293 use unicode_general_category::get_general_category;
294 use unicode_general_category::GeneralCategory::*;
295
296 match get_general_category(ch) {
297 ConnectorPunctuation | DashPunctuation | OpenPunctuation | ClosePunctuation |
299 InitialPunctuation | FinalPunctuation | OtherPunctuation => true,
300
301 UppercaseLetter | LowercaseLetter | TitlecaseLetter | ModifierLetter | OtherLetter |
303 NonspacingMark | SpacingMark | EnclosingMark |
305 DecimalNumber | LetterNumber | OtherNumber |
307 MathSymbol | CurrencySymbol | ModifierSymbol | OtherSymbol |
309 SpaceSeparator | LineSeparator | ParagraphSeparator |
311 Control | Format | Surrogate | PrivateUse | Unassigned => false,
313
314 _ => false
315 }
316}
317
#[cfg(test)]
mod tests {
    use super::cut_right_whitespace_with_tabstops as cut_ws;
    use super::find_indent_of;
    use super::replace_entity_pattern;
    use super::rfind_and_count;
    use super::unescape_all;

    #[test]
    fn rfind_and_count_test() {
        assert_eq!(rfind_and_count("", 'b'), 0);
        assert_eq!(rfind_and_count("abcde", 'e'), 0);
        assert_eq!(rfind_and_count("abcde", 'b'), 3);
        assert_eq!(rfind_and_count("abcde", 'z'), 5);
        assert_eq!(rfind_and_count("abcεπ", 'b'), 3);
    }

    // In the whitespace tests below the exact number of spaces inside each
    // literal is significant: it determines the expected column and offset.

    #[test]
    fn find_indent_of_simple_test() {
        assert_eq!(find_indent_of("a", 0), (0, 0));
        assert_eq!(find_indent_of(" a", 0), (1, 1));
        assert_eq!(find_indent_of("   a", 0), (3, 3));
        assert_eq!(find_indent_of("    ", 0), (4, 4));
        assert_eq!(find_indent_of("\ta", 0), (4, 1));
        assert_eq!(find_indent_of(" \ta", 0), (4, 2));
        assert_eq!(find_indent_of("  \ta", 0), (4, 3));
        assert_eq!(find_indent_of("   \ta", 0), (4, 4));
        assert_eq!(find_indent_of("    \ta", 0), (8, 5));
    }

    #[test]
    fn find_indent_of_with_offset() {
        assert_eq!(find_indent_of("   a", 2), (1, 3));
        assert_eq!(find_indent_of("    a", 2), (2, 4));
        assert_eq!(find_indent_of("  \ta", 2), (2, 3));
        assert_eq!(find_indent_of("   \ta", 2), (2, 4));
        assert_eq!(find_indent_of("    \ta", 2), (6, 5));
        assert_eq!(find_indent_of("     \ta", 2), (6, 6));
    }

    #[test]
    fn find_indent_of_tabs_test() {
        assert_eq!(find_indent_of("  \t \ta", 1), (7, 5));
        assert_eq!(find_indent_of("  \t \ta", 2), (6, 5));
        assert_eq!(find_indent_of("  \t \ta", 3), (4, 5));
        assert_eq!(find_indent_of("  \t \ta", 4), (3, 5));
    }

    #[test]
    fn cut_ws_simple() {
        assert_eq!(cut_ws("abc", -1), "");
        assert_eq!(cut_ws("abc", 0), "");
        assert_eq!(cut_ws("abc", 1), "c");
        assert_eq!(cut_ws("abc", 2), "bc");
        assert_eq!(cut_ws("abc", 3), "abc");
        assert_eq!(cut_ws("abc", 4), "abc");
    }

    #[test]
    fn cut_ws_unicode() {
        assert_eq!(cut_ws("αβγδ", 1), "δ");
        assert_eq!(cut_ws("αβγδ ", 3), "γδ ");
    }

    #[test]
    fn cut_ws_expands_partial_tabs() {
        assert_eq!(cut_ws("\t", 1), " ");
        assert_eq!(cut_ws("\t", 2), "  ");
        assert_eq!(cut_ws("\t", 3), "   ");
        assert_eq!(cut_ws("\t\t\t", 5), " \t");
        assert_eq!(cut_ws("\t\t\t", 7), "   \t");
    }

    #[test]
    fn cut_ws_retains_full_tabs() {
        assert_eq!(cut_ws("\t\t\t", 4), "\t");
        assert_eq!(cut_ws("\t\t\t", 8), "\t\t");
    }

    #[test]
    fn cut_ws_proper_tabstops() {
        assert_eq!(cut_ws("a\t", 1), " ");
        assert_eq!(cut_ws("a\t", 2), "  ");
        assert_eq!(cut_ws("a\t", 3), "\t");
        assert_eq!(cut_ws("ab\t", 3), "b\t");
        assert_eq!(cut_ws("abc\t", 3), "bc\t");
    }

    #[test]
    fn cut_ws_proper_tabstops_nested() {
        assert_eq!(cut_ws("a\tb\t", 2), "  ");
        assert_eq!(cut_ws("a\tb\t", 3), "\t");
        assert_eq!(cut_ws("a\tb\t", 4), "b\t");
        assert_eq!(cut_ws("a\tb\t", 5), " b\t");
        assert_eq!(cut_ws("a\tb\t", 6), "  b\t");
        assert_eq!(cut_ws("a\tb\t", 7), "\tb\t");
        assert_eq!(cut_ws("a\tb\t", 8), "a\tb\t");
    }

    #[test]
    fn cut_ws_different_tabstops_nested() {
        assert_eq!(cut_ws("abc\tde\tf\tg", 3), "  g");
        assert_eq!(cut_ws("abc\tde\tf\tg", 4), "\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 5), "f\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 6), " f\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 7), "\tf\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 9), "de\tf\tg");
        assert_eq!(cut_ws("abc\tde\tf\tg", 10), "\tde\tf\tg");
    }

    #[test]
    fn test_replace_entity_pattern() {
        // named references
        assert_eq!(replace_entity_pattern("&amp;"), Some("&".into()));
        assert_eq!(replace_entity_pattern("&euro;"), Some("€".into()));
        assert_eq!(replace_entity_pattern("&mdash;"), Some("—".into()));
        // decimal and hexadecimal references
        assert_eq!(replace_entity_pattern("&#8212;"), Some("—".into()));
        assert_eq!(replace_entity_pattern("&#x20;"), Some(" ".into()));
        assert_eq!(replace_entity_pattern("&#x3F;"), Some("?".into()));
        // unknown name, hex digit in a decimal reference, non-hex digits
        assert_eq!(replace_entity_pattern("&ffff;"), None);
        assert_eq!(replace_entity_pattern("&#3F;"), None);
        assert_eq!(replace_entity_pattern("&#xGG;"), None);
    }

    #[test]
    fn test_unescape_all_simple() {
        assert_eq!(unescape_all("&amp;"), "&");
        assert_eq!(unescape_all("\\&"), "&");
    }

    #[test]
    fn test_unescape_all_xss() {
        assert_eq!(
            unescape_all(r#"javascript&#x3A;alert(1)"#),
            r#"javascript:alert(1)"#);

        assert_eq!(
            unescape_all(r#"&#74;avascript:alert(1)"#),
            r#"Javascript:alert(1)"#);

        // Decoding happens in a single pass: text produced by one
        // replacement must not be decoded a second time.
        assert_eq!(
            unescape_all(r#"&#x26;#74;avascript:alert(1)"#),
            r#"&#74;avascript:alert(1)"#);

        assert_eq!(
            unescape_all(r#"\&#74;avascript:alert(1)"#),
            r#"&#74;avascript:alert(1)"#);

        assert_eq!(
            unescape_all(r#"&#34;&#62;&#60;script&#62;alert&#40;&#34;xss&#34;&#41;&#60;/script&#62;"#),
            r#""><script>alert("xss")</script>"#);
    }
}