//! Tokenization of license texts and rules, including stopword filtering
//! and `{{required phrase}}` parsing.

use crate::license_detection::index::dictionary::{QueryToken, TokenDictionary};
use once_cell::sync::Lazy;
use regex::Regex;
use std::collections::HashSet;
use std::ops::Range;

// Markers that delimit a required phrase ("{{ ... }}") in rule text.
const REQUIRED_PHRASE_OPEN: &str = "{{";
const REQUIRED_PHRASE_CLOSE: &str = "}}";

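/// Words ignored by the tokenizers: XML/HTML entity names, common markup
/// tags and attributes, and similar noise that carries no license signal.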
pub(crate) static STOPWORDS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
    let mut set = HashSet::new();

    // Predefined XML/HTML character entities.
    for &word in &["amp", "apos", "gt", "lt", "nbsp", "quot"] {
        set.insert(word);
    }

    // Common HTML tag and attribute names.
    for &word in &[
        "a",
        "abbr",
        "alt",
        "blockquote",
        "body",
        "br",
        "class",
        "div",
        "em",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "hr",
        "href",
        "img",
        "li",
        "ol",
        "p",
        "pre",
        "rel",
        "script",
        "span",
        "src",
        "td",
        "th",
        "tr",
        "ul",
    ] {
        set.insert(word);
    }

    // Comment markers: "rem" (BASIC/batch files) and "dnl" (m4).
    set.insert("rem");
    set.insert("dnl");

    // DocBook tags.
    set.insert("para");
    set.insert("ulink");

    // HTML entity names for punctuation, quotes, dashes, and spacing.
    for &word in &[
        "bdquo", "bull", "bullet", "colon", "comma", "emdash", "emsp", "ensp", "ge", "hairsp",
        "ldquo", "ldquor", "le", "lpar", "lsaquo", "lsquo", "lsquor", "mdash", "ndash", "numsp",
        "period", "puncsp", "raquo", "rdquo", "rdquor", "rpar", "rsaquo", "rsquo", "rsquor",
        "sbquo", "semi", "thinsp", "tilde",
    ] {
        set.insert(word);
    }

    // Hex character references for "<" (x3c) and ">" (x3e).
    set.insert("x3c");
    set.insert("x3e");

    // Common CSS property names and layout words.
    for &word in &[
        "lists", "side", "nav", "height", "auto", "border", "padding", "width",
    ] {
        set.insert(word);
    }

    // Perl POD heading commands.
    set.insert("head1");
    set.insert("head2");
    set.insert("head3");

    // Common output commands seen in code samples.
    set.insert("printf");
    set.insert("echo");

    set
});

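/// Matches a single query token: a run of word characters (underscores
/// excluded) with an optional embedded or trailing `+`, so that tokens
/// such as `gpl2+` survive tokenization.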
static QUERY_PATTERN: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("Invalid regex pattern"));

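/// Splits `text` into lowercase tokens, dropping [`STOPWORDS`].
///
/// For example, `"Hello, World! This is a test."` yields
/// `["hello", "world", "this", "is", "test"]` (the stopword `a` is dropped).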
pub fn tokenize(text: &str) -> Vec<String> {
    if text.is_empty() {
        return Vec::new();
    }

    let mut tokens = Vec::new();
    let lowercase_text = text.to_lowercase();

    for cap in QUERY_PATTERN.find_iter(&lowercase_text) {
        let token = cap.as_str();

        // Keep only non-empty tokens that are not stopwords.
        if !token.is_empty() && !STOPWORDS.contains(token) {
            tokens.push(token.to_string());
        }
    }

    tokens
}

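/// Splits `text` into lowercase tokens like [`tokenize`], but keeps
/// stopwords.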
pub fn tokenize_without_stopwords(text: &str) -> Vec<String> {
    if text.is_empty() {
        return Vec::new();
    }

    let mut tokens = Vec::new();
    let lowercase_text = text.to_lowercase();

    for cap in QUERY_PATTERN.find_iter(&lowercase_text) {
        let token = cap.as_str();

        // Keep every non-empty token, including stopwords.
        if !token.is_empty() {
            tokens.push(token.to_string());
        }
    }

    tokens
}

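/// Splits `text` into [`QueryToken`]s: stopwords become
/// [`QueryToken::Stopword`], and every other token is classified through
/// the `dictionary`.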
pub fn tokenize_as_ids(text: &str, dictionary: &TokenDictionary) -> Vec<QueryToken> {
    if text.is_empty() {
        return Vec::new();
    }

    let mut tokens = Vec::new();
    let stopwords_set = &*STOPWORDS;

    let lowercase_text = text.to_lowercase();

    for cap in QUERY_PATTERN.find_iter(&lowercase_text) {
        let token = cap.as_str();
        if token.is_empty() {
            continue;
        }

        if stopwords_set.contains(token) {
            tokens.push(QueryToken::Stopword);
        } else {
            tokens.push(dictionary.classify_query_token(token));
        }
    }

    tokens
}

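/// Counts the non-stopword tokens in `text` without collecting them.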
pub fn count_tokens(text: &str) -> usize {
    if text.is_empty() {
        return 0;
    }

    let lowercase_text = text.to_lowercase();
    let stopwords_set = &*STOPWORDS;

    QUERY_PATTERN
        .find_iter(&lowercase_text)
        .filter(|m| !m.as_str().is_empty() && !stopwords_set.contains(m.as_str()))
        .count()
}

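/// Parses the `{{...}}` required-phrase markers in a rule text and returns
/// the half-open ranges of token positions each phrase covers. Positions
/// count non-stopword tokens only, and the markers themselves do not
/// occupy a position.
///
/// For example, `"This is {{a required phrase}} here"` yields `[2..4]`:
/// `this` and `is` take positions 0 and 1, the stopword `a` is skipped,
/// and `required` and `phrase` take positions 2 and 3.
///
/// Returns an empty `Vec` (and logs a warning) for invalid rules: nested,
/// empty, or unbalanced braces.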
pub fn parse_required_phrase_spans(text: &str) -> Vec<Range<usize>> {
    let mut spans = Vec::new();
    let mut in_required_phrase = false;
    let mut current_phrase_positions: Vec<usize> = Vec::new();
    let mut ipos = 0usize;

    for token in required_phrase_tokenizer(text) {
        if token == REQUIRED_PHRASE_OPEN {
            if in_required_phrase {
                log::warn!(
                    "Invalid rule with nested required phrase {{{{ {{{{ braces: {}",
                    text
                );
                return Vec::new();
            }
            in_required_phrase = true;
        } else if token == REQUIRED_PHRASE_CLOSE {
            if in_required_phrase {
                if !current_phrase_positions.is_empty() {
                    let min_pos = *current_phrase_positions.iter().min().unwrap_or(&0);
                    let max_pos = *current_phrase_positions.iter().max().unwrap_or(&0);
                    spans.push(min_pos..max_pos + 1);
                    current_phrase_positions.clear();
                } else {
                    log::warn!(
                        "Invalid rule with empty required phrase {{{{}}}} braces: {}",
                        text
                    );
                    return Vec::new();
                }
                in_required_phrase = false;
            } else {
                log::warn!(
                    "Invalid rule with dangling required phrase missing closing braces: {}",
                    text
                );
                return Vec::new();
            }
        } else {
            if in_required_phrase {
                current_phrase_positions.push(ipos);
            }
            ipos += 1;
        }
    }

    if !current_phrase_positions.is_empty() || in_required_phrase {
        log::warn!(
            "Invalid rule with dangling required phrase missing final closing braces: {}",
            text
        );
        return Vec::new();
    }

    spans
}

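/// Tokenizes rule text into `{{` and `}}` markers and plain words,
/// filtering out stopwords and empty matches.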
fn required_phrase_tokenizer(text: &str) -> RequiredPhraseTokenIter {
    let lowercase_text = text.to_lowercase();
    let tokens: Vec<TokenKind> = REQUIRED_PHRASE_PATTERN
        .find_iter(&lowercase_text)
        .filter_map(|m| {
            let token = m.as_str();
            if token == REQUIRED_PHRASE_OPEN {
                Some(TokenKind::Open)
            } else if token == REQUIRED_PHRASE_CLOSE {
                Some(TokenKind::Close)
            } else if !token.is_empty() && !STOPWORDS.contains(token) {
                Some(TokenKind::Word)
            } else {
                None
            }
        })
        .collect();
    RequiredPhraseTokenIter { tokens, pos: 0 }
}

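/// Kind of token produced by [`required_phrase_tokenizer`].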
#[derive(Clone, Copy, PartialEq)]
enum TokenKind {
    Open,
    Close,
    Word,
}

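/// Iterator over tokenized rule text that yields `"{{"`, `"}}"`, or the
/// placeholder `"word"` for any other token.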
struct RequiredPhraseTokenIter {
    tokens: Vec<TokenKind>,
    pos: usize,
}

impl Iterator for RequiredPhraseTokenIter {
    type Item = &'static str;

    fn next(&mut self) -> Option<Self::Item> {
        if self.pos >= self.tokens.len() {
            return None;
        }
        let token = self.tokens[self.pos];
        self.pos += 1;
        Some(match token {
            TokenKind::Open => REQUIRED_PHRASE_OPEN,
            TokenKind::Close => REQUIRED_PHRASE_CLOSE,
            TokenKind::Word => "word",
        })
    }
}

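/// Like [`QUERY_PATTERN`], but also matches the literal `{{` and `}}`
/// required-phrase markers.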
static REQUIRED_PHRASE_PATTERN: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?:[^_\W]+\+?[^_\W]*|\{\{|\}\})").expect("Invalid required phrase pattern")
});

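/// Splits `text` like [`tokenize`] and additionally returns a map from
/// token position to the number of stopwords that immediately follow that
/// position. Stopwords seen before the first kept token are keyed under
/// `(-1i64) as usize`, i.e. `usize::MAX`.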
pub fn tokenize_with_stopwords(
    text: &str,
) -> (Vec<String>, std::collections::HashMap<usize, usize>) {
    if text.is_empty() {
        return (Vec::new(), std::collections::HashMap::new());
    }

    let mut tokens = Vec::new();
    let mut stopwords_by_pos = std::collections::HashMap::new();

    // Position of the last emitted token; -1 means no token yet, so leading
    // stopwords are keyed under `(-1i64) as usize`, i.e. `usize::MAX`.
    let mut pos: i64 = -1;
    let lowercase_text = text.to_lowercase();

    for cap in QUERY_PATTERN.find_iter(&lowercase_text) {
        let token = cap.as_str();
        if token.is_empty() {
            continue;
        }

        if STOPWORDS.contains(token) {
            *stopwords_by_pos.entry(pos as usize).or_insert(0) += 1;
        } else {
            pos += 1;
            tokens.push(token.to_string());
        }
    }

    (tokens, stopwords_by_pos)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize_empty() {
        let result = tokenize("");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_simple() {
        let result = tokenize("Hello World");
        assert_eq!(result, vec!["hello", "world"]);
    }

    #[test]
    fn test_tokenize_with_punctuation() {
        let result = tokenize("Hello, World! This is a test.");
        assert_eq!(result, vec!["hello", "world", "this", "is", "test"]);
    }

    #[test]
    fn test_tokenize_with_spaces() {
        let result = tokenize("some Text with spAces!");
        assert_eq!(result, vec!["some", "text", "with", "spaces"]);
    }

    #[test]
    fn test_tokenize_with_plus() {
        let result = tokenize("GPL2+ and GPL3");
        assert_eq!(result, vec!["gpl2+", "and", "gpl3"]);
    }

    #[test]
    fn test_tokenize_filters_stopwords() {
        let result = tokenize("Hello div World p");
        assert_eq!(result, vec!["hello", "world"]);
    }

    #[test]
    fn test_tokenize_with_special_chars() {
        let result = tokenize("special+-_!@ chars");
        assert_eq!(result, vec!["special+", "chars"]);
    }

    #[test]
    fn test_tokenize_with_underscores() {
        let result = tokenize("hello_world foo_bar");
        assert_eq!(result, vec!["hello", "world", "foo", "bar"]);
    }

    #[test]
    fn test_tokenize_with_numbers() {
        let result = tokenize("version 2.0 and 3.0");
        assert_eq!(result, vec!["version", "2", "0", "and", "3", "0"]);
    }

    #[test]
    fn test_tokenize_without_stopwords_keeps_html_tags() {
        let result = tokenize_without_stopwords("Hello div World p");
        assert_eq!(result, vec!["hello", "div", "world", "p"]);
    }

    #[test]
    fn test_tokenize_without_stopwords_empty() {
        let result = tokenize_without_stopwords("");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenization_with_plus_in_middle() {
        let result = tokenize("C++ and GPL+");
        assert_eq!(result, vec!["c+", "and", "gpl+"]);
    }

    #[test]
    fn test_tokenization_braces() {
        let result = tokenize("{{Hi}}some {{}}Text with{{noth+-_!@ing}} {{junk}}spAces!");
        assert_eq!(
            result,
            vec![
                "hi", "some", "text", "with", "noth+", "ing", "junk", "spaces"
            ]
        );
    }

    #[test]
    fn test_tokenize_with_ampersand() {
        let result = tokenize("some &quot;&lt; markup &gt;\"");
        assert_eq!(result, vec!["some", "markup"]);
    }

    #[test]
    fn test_query_tokenizer_brace_case() {
        let result = tokenize("{{}some }}Text with spAces! + _ -");
        assert_eq!(result, vec!["some", "text", "with", "spaces"]);
    }

    #[test]
    fn test_tokenize_unicode_characters() {
        let result = tokenize("hello 世界 mir");
        assert_eq!(result, vec!["hello", "世界", "mir"]);
    }

    #[test]
    fn test_tokenize_only_special_chars() {
        let result = tokenize("!@#$%^&*()");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_only_punctuation() {
        let result = tokenize(".,;:!?-_=+[]{}()");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_only_stopwords() {
        let result = tokenize("div p a br");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_mixed_stopwords_and_words() {
        let result = tokenize("div hello p world a test");
        assert_eq!(result, vec!["hello", "world", "test"]);
    }

    #[test]
    fn test_tokenize_very_long_text() {
        let words: Vec<String> = (0..1000).map(|i| format!("word{}", i)).collect();
        let text = words.join(" ");
        let result = tokenize(&text);
        assert_eq!(result.len(), 1000);
        assert_eq!(result[0], "word0");
        assert_eq!(result[999], "word999");
    }

    #[test]
    fn test_tokenize_with_newlines_and_tabs() {
        let result = tokenize("hello\nworld\ttest");
        assert_eq!(result, vec!["hello", "world", "test"]);
    }

    #[test]
    fn test_tokenize_with_carriage_return() {
        let result = tokenize("hello\r\nworld\rtest");
        assert_eq!(result, vec!["hello", "world", "test"]);
    }

    #[test]
    fn test_tokenize_trailing_plus() {
        let result = tokenize("GPL2+ LGPL3+");
        assert_eq!(result, vec!["gpl2+", "lgpl3+"]);
    }

    #[test]
    fn test_tokenize_leading_plus() {
        let result = tokenize("+hello +world");
        assert_eq!(result, vec!["hello", "world"]);
    }

    #[test]
    fn test_tokenize_without_stopwords_preserves_all() {
        let result = tokenize_without_stopwords("div p a br");
        assert_eq!(result, vec!["div", "p", "a", "br"]);
    }

    #[test]
    fn test_tokenize_without_stopwords_unicode() {
        let result = tokenize_without_stopwords("hello 世界");
        assert_eq!(result, vec!["hello", "世界"]);
    }

    #[test]
    fn test_tokenize_without_stopwords_only_special() {
        let result = tokenize_without_stopwords("!@#$%");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_consecutive_plus() {
        let result = tokenize("a++b");
        assert_eq!(result, vec!["a+", "b"]);
    }

    #[test]
    fn test_tokenize_hyphenated_words() {
        let result = tokenize("some-thing foo-bar");
        assert_eq!(result, vec!["some", "thing", "foo", "bar"]);
    }

    #[test]
    fn test_tokenize_email_address() {
        let result = tokenize("test@example.com");
        assert_eq!(result, vec!["test", "example", "com"]);
    }

    #[test]
    fn test_tokenize_url() {
        let result = tokenize("https://example.com/path");
        assert_eq!(result, vec!["https", "example", "com", "path"]);
    }

    #[test]
    fn test_tokenize_version_number() {
        let result = tokenize("version 1.2.3");
        assert_eq!(result, vec!["version", "1", "2", "3"]);
    }

    #[test]
    fn test_tokenize_xml_entities() {
        let result = tokenize("&lt;div&gt;hello&lt;/div&gt;");
        assert_eq!(result, vec!["hello"]);
    }

    #[test]
    fn test_tokenize_whitespace_only() {
        let result = tokenize(" \t\n\r ");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_single_char() {
        let result = tokenize("a");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_single_word() {
        let result = tokenize("hello");
        assert_eq!(result, vec!["hello"]);
    }

    #[test]
    fn test_tokenize_numbers_only() {
        let result = tokenize("123 456 789");
        assert_eq!(result, vec!["123", "456", "789"]);
    }

    #[test]
    fn test_tokenize_alphanumeric_mixed() {
        let result = tokenize("abc123 def456");
        assert_eq!(result, vec!["abc123", "def456"]);
    }

    #[test]
    fn test_tokenize_underscore_separated() {
        let result = tokenize("hello_world foo_bar_baz");
        assert_eq!(result, vec!["hello", "world", "foo", "bar", "baz"]);
    }

    #[test]
    fn test_tokenize_all_stopwords_from_list() {
        let result = tokenize("amp lt gt nbsp quot");
        assert!(result.is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_single() {
        let text = "This is {{enclosed}} in braces";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![2..3]);
    }

    #[test]
    fn test_parse_required_phrase_spans_multiword() {
        let text = "This is {{a required phrase}} here";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![2..4]);
    }

    #[test]
    fn test_parse_required_phrase_spans_multiple() {
        let text = "{{First}} and {{second}} phrase";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![0..1, 2..3]);
    }

    #[test]
    fn test_parse_required_phrase_spans_none() {
        let text = "No required phrases here";
        let spans = parse_required_phrase_spans(text);
        assert!(spans.is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_empty_braces() {
        let text = "Empty {{}} braces";
        let spans = parse_required_phrase_spans(text);
        assert!(spans.is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_nested() {
        let text = "Nested {{ outer {{ inner }} }} braces";
        let spans = parse_required_phrase_spans(text);
        assert!(spans.is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_unclosed() {
        let text = "Unclosed {{ phrase here";
        let spans = parse_required_phrase_spans(text);
        assert!(spans.is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_unopened() {
        let text = "Unopened }} phrase here";
        let spans = parse_required_phrase_spans(text);
        assert!(spans.is_empty());
    }

    #[test]
    fn test_tokenize_with_stopwords_basic() {
        let text = "hello div world p test";
        let (tokens, stopwords) = tokenize_with_stopwords(text);
        assert_eq!(tokens, vec!["hello", "world", "test"]);
        assert_eq!(stopwords.get(&0), Some(&1));
        assert_eq!(stopwords.get(&1), Some(&1));
    }

    #[test]
    fn test_tokenize_with_stopwords_empty() {
        let (tokens, stopwords) = tokenize_with_stopwords("");
        assert!(tokens.is_empty());
        assert!(stopwords.is_empty());
    }

    #[test]
    fn test_tokenize_with_stopwords_no_stopwords() {
        let text = "hello world test";
        let (tokens, stopwords) = tokenize_with_stopwords(text);
        assert_eq!(tokens, vec!["hello", "world", "test"]);
        assert!(stopwords.is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_filters_stopwords_inside() {
        let text = "{{hello a world}}";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![0..2]);
    }

    #[test]
    fn test_parse_required_phrase_spans_filters_stopwords_outside() {
        let text = "{{Hello}} a {{world}}";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![0..1, 1..2]);
    }

    #[test]
    fn test_parse_required_phrase_spans_multiple_stopwords() {
        let text = "{{a p div hello}}";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![0..1]);
    }

    #[test]
    fn test_parse_required_phrase_spans_case_insensitive_stopwords() {
        let text = "{{HELLO A WORLD}}";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![0..2]);
    }
}