1use crate::license_detection::index::dictionary::{QueryToken, TokenDictionary};
10use regex::Regex;
11use std::collections::HashSet;
12use std::ops::Range;
13use std::sync::LazyLock;
14
/// Marker that opens a required phrase inside rule text.
const REQUIRED_PHRASE_OPEN: &str = "{{";
/// Marker that closes a required phrase inside rule text.
const REQUIRED_PHRASE_CLOSE: &str = "}}";
17
/// Lowercase words that the tokenizers in this module treat as stopwords.
///
/// The set is built once, on first access, by merging the word groups declared
/// below. The grouping is only organizational: lookups see one flat set.
pub(crate) static STOPWORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    // Names of common XML/HTML character entities.
    const ENTITY_NAMES: &[&str] = &["amp", "apos", "gt", "lt", "nbsp", "quot"];

    // HTML tag and attribute names.
    const MARKUP_NAMES: &[&str] = &[
        "a",
        "abbr",
        "alt",
        "blockquote",
        "body",
        "br",
        "class",
        "div",
        "em",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "hr",
        "href",
        "img",
        "li",
        "ol",
        "p",
        "pre",
        "rel",
        "script",
        "span",
        "src",
        "td",
        "th",
        "tr",
        "ul",
    ];

    // NOTE(review): presumably comment markers and documentation-markup tags;
    // the rationale for these four is not visible in this file.
    const COMMENT_AND_DOC_MARKERS: &[&str] = &["rem", "dnl", "para", "ulink"];

    // Names of punctuation and spacing entities.
    const PUNCTUATION_ENTITY_NAMES: &[&str] = &[
        "bdquo", "bull", "bullet", "colon", "comma", "emdash", "emsp", "ensp", "ge", "hairsp",
        "ldquo", "ldquor", "le", "lpar", "lsaquo", "lsquo", "lsquor", "mdash", "ndash", "numsp",
        "period", "puncsp", "raquo", "rdquo", "rdquor", "rpar", "rsaquo", "rsquo", "rsquor",
        "sbquo", "semi", "thinsp", "tilde",
    ];

    // Hex-escape style names (0x3c is `<`, 0x3e is `>`).
    const HEX_ESCAPES: &[&str] = &["x3c", "x3e"];

    // NOTE(review): look like stylesheet/layout keywords; confirm provenance.
    const LAYOUT_WORDS: &[&str] = &[
        "lists", "side", "nav", "height", "auto", "border", "padding", "width",
    ];

    // NOTE(review): heading directives of some documentation format; confirm.
    const HEADING_DIRECTIVES: &[&str] = &["head1", "head2", "head3"];

    // NOTE(review): presumably script/build-output noise words; confirm.
    const SCRIPT_WORDS: &[&str] = &["printf", "echo"];

    [
        ENTITY_NAMES,
        MARKUP_NAMES,
        COMMENT_AND_DOC_MARKERS,
        PUNCTUATION_ENTITY_NAMES,
        HEX_ESCAPES,
        LAYOUT_WORDS,
        HEADING_DIRECTIVES,
        SCRIPT_WORDS,
    ]
    .iter()
    .flat_map(|group| group.iter().copied())
    .collect()
});
107
108static QUERY_PATTERN: LazyLock<Regex> =
118 LazyLock::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("Invalid regex pattern"));
119
120pub fn tokenize(text: &str) -> Vec<String> {
128 if text.is_empty() {
129 return Vec::new();
130 }
131
132 let mut tokens = Vec::new();
133 let lowercase_text = text.to_lowercase();
134
135 for cap in QUERY_PATTERN.find_iter(&lowercase_text) {
136 let token = cap.as_str();
137
138 if !token.is_empty() && !STOPWORDS.contains(token) {
140 tokens.push(token.to_string());
141 }
142 }
143
144 tokens
145}
146
147pub fn tokenize_without_stopwords(text: &str) -> Vec<String> {
154 if text.is_empty() {
155 return Vec::new();
156 }
157
158 let mut tokens = Vec::new();
159 let lowercase_text = text.to_lowercase();
160
161 for cap in QUERY_PATTERN.find_iter(&lowercase_text) {
162 let token = cap.as_str();
163
164 if !token.is_empty() {
166 tokens.push(token.to_string());
167 }
168 }
169
170 tokens
171}
172
173pub fn tokenize_as_ids(text: &str, dictionary: &TokenDictionary) -> Vec<QueryToken> {
184 if text.is_empty() {
185 return Vec::new();
186 }
187
188 let mut tokens = Vec::new();
189 let stopwords_set = &*STOPWORDS;
190
191 let lowercase_text = text.to_lowercase();
192
193 for cap in QUERY_PATTERN.find_iter(&lowercase_text) {
194 let token = cap.as_str();
195 if token.is_empty() {
196 continue;
197 }
198
199 if stopwords_set.contains(token) {
200 tokens.push(QueryToken::Stopword);
201 } else {
202 tokens.push(dictionary.classify_query_token(token));
203 }
204 }
205
206 tokens
207}
208
209pub fn count_tokens(text: &str) -> usize {
213 if text.is_empty() {
214 return 0;
215 }
216
217 let lowercase_text = text.to_lowercase();
218 let stopwords_set = &*STOPWORDS;
219
220 QUERY_PATTERN
221 .find_iter(&lowercase_text)
222 .filter(|m| !m.as_str().is_empty() && !stopwords_set.contains(m.as_str()))
223 .count()
224}
225
226pub fn parse_required_phrase_spans(text: &str) -> Vec<Range<usize>> {
250 let mut spans = Vec::new();
251 let mut in_required_phrase = false;
252 let mut current_phrase_positions: Vec<usize> = Vec::new();
253 let mut ipos = 0usize;
254
255 for token in required_phrase_tokenizer(text) {
256 if token == REQUIRED_PHRASE_OPEN {
257 if in_required_phrase {
258 log::warn!(
259 "Invalid rule with nested required phrase {{ {{ braces: {}",
260 text
261 );
262 return Vec::new();
263 }
264 in_required_phrase = true;
265 } else if token == REQUIRED_PHRASE_CLOSE {
266 if in_required_phrase {
267 if !current_phrase_positions.is_empty() {
268 let min_pos = *current_phrase_positions.iter().min().unwrap_or(&0);
269 let max_pos = *current_phrase_positions.iter().max().unwrap_or(&0);
270 spans.push(min_pos..max_pos + 1);
271 current_phrase_positions.clear();
272 } else {
273 log::warn!(
274 "Invalid rule with empty required phrase {{}} braces: {}",
275 text
276 );
277 return Vec::new();
278 }
279 in_required_phrase = false;
280 } else {
281 log::warn!(
282 "Invalid rule with dangling required phrase missing closing braces: {}",
283 text
284 );
285 return Vec::new();
286 }
287 } else {
288 if in_required_phrase {
289 current_phrase_positions.push(ipos);
290 }
291 ipos += 1;
292 }
293 }
294
295 if !current_phrase_positions.is_empty() || in_required_phrase {
296 log::warn!(
297 "Invalid rule with dangling required phrase missing final closing braces: {}",
298 text
299 );
300 return Vec::new();
301 }
302
303 spans
304}
305
306fn required_phrase_tokenizer(text: &str) -> RequiredPhraseTokenIter {
311 let lowercase_text = text.to_lowercase();
312 let tokens: Vec<TokenKind> = REQUIRED_PHRASE_PATTERN
313 .find_iter(&lowercase_text)
314 .filter_map(|m| {
315 let token = m.as_str();
316 if token == REQUIRED_PHRASE_OPEN {
317 Some(TokenKind::Open)
318 } else if token == REQUIRED_PHRASE_CLOSE {
319 Some(TokenKind::Close)
320 } else if !token.is_empty() && !STOPWORDS.contains(token) {
321 Some(TokenKind::Word)
322 } else {
323 None
324 }
325 })
326 .collect();
327 RequiredPhraseTokenIter { tokens, pos: 0 }
328}
329
/// Classification of a token found while scanning rule text for required phrases.
#[derive(Clone, Copy, PartialEq)]
enum TokenKind {
    /// The `{{` marker that opens a required phrase.
    Open,
    /// The `}}` marker that closes a required phrase.
    Close,
    /// Any other token that is not a stopword.
    Word,
}
336
/// Iterator over the already classified tokens of one rule text.
struct RequiredPhraseTokenIter {
    /// Token kinds in the order they were found in the text.
    tokens: Vec<TokenKind>,
    /// Index into `tokens` of the next token to yield.
    pos: usize,
}
341
342impl Iterator for RequiredPhraseTokenIter {
343 type Item = &'static str;
344
345 fn next(&mut self) -> Option<Self::Item> {
346 if self.pos >= self.tokens.len() {
347 return None;
348 }
349 let token = self.tokens[self.pos];
350 self.pos += 1;
351 Some(match token {
352 TokenKind::Open => REQUIRED_PHRASE_OPEN,
353 TokenKind::Close => REQUIRED_PHRASE_CLOSE,
354 TokenKind::Word => "word",
355 })
356 }
357}
358
359static REQUIRED_PHRASE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
362 Regex::new(r"(?:[^_\W]+\+?[^_\W]*|\{\{|\}\})").expect("Invalid required phrase pattern")
363});
364
365pub fn tokenize_with_stopwords(
373 text: &str,
374) -> (Vec<String>, std::collections::HashMap<Option<usize>, usize>) {
375 if text.is_empty() {
376 return (Vec::new(), std::collections::HashMap::new());
377 }
378
379 let mut tokens = Vec::new();
380 let mut stopwords_by_pos = std::collections::HashMap::new();
381
382 let mut pos: Option<usize> = None;
383 let lowercase_text = text.to_lowercase();
384
385 for cap in QUERY_PATTERN.find_iter(&lowercase_text) {
386 let token = cap.as_str();
387 if token.is_empty() {
388 continue;
389 }
390
391 if STOPWORDS.contains(token) {
392 *stopwords_by_pos.entry(pos).or_insert(0) += 1;
393 } else {
394 pos = Some(pos.map_or(0, |p| p + 1));
395 tokens.push(token.to_string());
396 }
397 }
398
399 (tokens, stopwords_by_pos)
400}
401
/// Unit tests for the tokenizers and the required-phrase span parser.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- tokenize: basic behavior ------------------------------------------

    #[test]
    fn test_tokenize_empty() {
        assert!(tokenize("").is_empty());
    }

    #[test]
    fn test_tokenize_simple() {
        assert_eq!(tokenize("Hello World"), vec!["hello", "world"]);
    }

    #[test]
    fn test_tokenize_with_punctuation() {
        // "a" is a stopword and is dropped.
        assert_eq!(
            tokenize("Hello, World! This is a test."),
            vec!["hello", "world", "this", "is", "test"]
        );
    }

    #[test]
    fn test_tokenize_with_spaces() {
        assert_eq!(
            tokenize("some Text with spAces!"),
            vec!["some", "text", "with", "spaces"]
        );
    }

    #[test]
    fn test_tokenize_with_plus() {
        assert_eq!(tokenize("GPL2+ and GPL3"), vec!["gpl2+", "and", "gpl3"]);
    }

    #[test]
    fn test_tokenize_filters_stopwords() {
        assert_eq!(tokenize("Hello div World p"), vec!["hello", "world"]);
    }

    #[test]
    fn test_tokenize_with_special_chars() {
        assert_eq!(tokenize("special+-_!@ chars"), vec!["special+", "chars"]);
    }

    #[test]
    fn test_tokenize_with_underscores() {
        assert_eq!(
            tokenize("hello_world foo_bar"),
            vec!["hello", "world", "foo", "bar"]
        );
    }

    #[test]
    fn test_tokenize_with_numbers() {
        assert_eq!(
            tokenize("version 2.0 and 3.0"),
            vec!["version", "2", "0", "and", "3", "0"]
        );
    }

    #[test]
    fn test_tokenize_without_stopwords_keeps_html_tags() {
        assert_eq!(
            tokenize_without_stopwords("Hello div World p"),
            vec!["hello", "div", "world", "p"]
        );
    }

    #[test]
    fn test_tokenize_without_stopwords_empty() {
        assert!(tokenize_without_stopwords("").is_empty());
    }

    #[test]
    fn test_tokenization_with_plus_in_middle() {
        // Only one `+` is kept per token; the second `+` of "C++" is dropped.
        assert_eq!(tokenize("C++ and GPL+"), vec!["c+", "and", "gpl+"]);
    }

    #[test]
    fn test_tokenization_braces() {
        assert_eq!(
            tokenize("{{Hi}}some {{}}Text with{{noth+-_!@ing}} {{junk}}spAces!"),
            vec!["hi", "some", "text", "with", "noth+", "ing", "junk", "spaces"]
        );
    }

    #[test]
    fn test_tokenize_with_ampersand() {
        // The entity names "quot", "lt" and "gt" are stopwords.
        assert_eq!(
            tokenize("some &quot;&lt; markup &gt;\""),
            vec!["some", "markup"]
        );
    }

    #[test]
    fn test_query_tokenizer_brace_case() {
        assert_eq!(
            tokenize("{{}some }}Text with spAces! + _ -"),
            vec!["some", "text", "with", "spaces"]
        );
    }

    // ---- tokenize: edge cases ----------------------------------------------

    #[test]
    fn test_tokenize_unicode_characters() {
        assert_eq!(tokenize("hello 世界 mir"), vec!["hello", "世界", "mir"]);
    }

    #[test]
    fn test_tokenize_only_special_chars() {
        assert!(tokenize("!@#$%^&*()").is_empty());
    }

    #[test]
    fn test_tokenize_only_punctuation() {
        assert!(tokenize(".,;:!?-_=+[]{}()").is_empty());
    }

    #[test]
    fn test_tokenize_only_stopwords() {
        assert!(tokenize("div p a br").is_empty());
    }

    #[test]
    fn test_tokenize_mixed_stopwords_and_words() {
        assert_eq!(
            tokenize("div hello p world a test"),
            vec!["hello", "world", "test"]
        );
    }

    #[test]
    fn test_tokenize_very_long_text() {
        let words: Vec<String> = (0..1000).map(|i| format!("word{}", i)).collect();
        let text = words.join(" ");
        let result = tokenize(&text);
        assert_eq!(result.len(), 1000);
        assert_eq!(result[0], "word0");
        assert_eq!(result[999], "word999");
    }

    #[test]
    fn test_tokenize_with_newlines_and_tabs() {
        assert_eq!(
            tokenize("hello\nworld\ttest"),
            vec!["hello", "world", "test"]
        );
    }

    #[test]
    fn test_tokenize_with_carriage_return() {
        assert_eq!(
            tokenize("hello\r\nworld\rtest"),
            vec!["hello", "world", "test"]
        );
    }

    #[test]
    fn test_tokenize_trailing_plus() {
        assert_eq!(tokenize("GPL2+ LGPL3+"), vec!["gpl2+", "lgpl3+"]);
    }

    #[test]
    fn test_tokenize_leading_plus() {
        assert_eq!(tokenize("+hello +world"), vec!["hello", "world"]);
    }

    #[test]
    fn test_tokenize_without_stopwords_preserves_all() {
        assert_eq!(
            tokenize_without_stopwords("div p a br"),
            vec!["div", "p", "a", "br"]
        );
    }

    #[test]
    fn test_tokenize_without_stopwords_unicode() {
        assert_eq!(
            tokenize_without_stopwords("hello 世界"),
            vec!["hello", "世界"]
        );
    }

    #[test]
    fn test_tokenize_without_stopwords_only_special() {
        assert!(tokenize_without_stopwords("!@#$%").is_empty());
    }

    #[test]
    fn test_tokenize_consecutive_plus() {
        assert_eq!(tokenize("a++b"), vec!["a+", "b"]);
    }

    #[test]
    fn test_tokenize_hyphenated_words() {
        assert_eq!(
            tokenize("some-thing foo-bar"),
            vec!["some", "thing", "foo", "bar"]
        );
    }

    #[test]
    fn test_tokenize_email_address() {
        assert_eq!(
            tokenize("test@example.com"),
            vec!["test", "example", "com"]
        );
    }

    #[test]
    fn test_tokenize_url() {
        assert_eq!(
            tokenize("https://example.com/path"),
            vec!["https", "example", "com", "path"]
        );
    }

    #[test]
    fn test_tokenize_version_number() {
        assert_eq!(tokenize("version 1.2.3"), vec!["version", "1", "2", "3"]);
    }

    #[test]
    fn test_tokenize_xml_entities() {
        assert_eq!(tokenize("<div>hello</div>"), vec!["hello"]);
    }

    #[test]
    fn test_tokenize_whitespace_only() {
        assert!(tokenize(" \t\n\r ").is_empty());
    }

    #[test]
    fn test_tokenize_single_char() {
        // The single letter "a" is a stopword.
        assert!(tokenize("a").is_empty());
    }

    #[test]
    fn test_tokenize_single_word() {
        assert_eq!(tokenize("hello"), vec!["hello"]);
    }

    #[test]
    fn test_tokenize_numbers_only() {
        assert_eq!(tokenize("123 456 789"), vec!["123", "456", "789"]);
    }

    #[test]
    fn test_tokenize_alphanumeric_mixed() {
        assert_eq!(tokenize("abc123 def456"), vec!["abc123", "def456"]);
    }

    #[test]
    fn test_tokenize_underscore_separated() {
        assert_eq!(
            tokenize("hello_world foo_bar_baz"),
            vec!["hello", "world", "foo", "bar", "baz"]
        );
    }

    #[test]
    fn test_tokenize_all_stopwords_from_list() {
        assert!(tokenize("amp lt gt nbsp quot").is_empty());
    }

    // ---- parse_required_phrase_spans ---------------------------------------

    #[test]
    fn test_parse_required_phrase_spans_single() {
        let spans = parse_required_phrase_spans("This is {{enclosed}} in braces");
        assert_eq!(spans, vec![2..3]);
    }

    #[test]
    fn test_parse_required_phrase_spans_multiword() {
        // "a" is a stopword, so the phrase covers "required phrase" only.
        let spans = parse_required_phrase_spans("This is {{a required phrase}} here");
        assert_eq!(spans, vec![2..4]);
    }

    #[test]
    fn test_parse_required_phrase_spans_multiple() {
        let spans = parse_required_phrase_spans("{{First}} and {{second}} phrase");
        assert_eq!(spans, vec![0..1, 2..3]);
    }

    #[test]
    fn test_parse_required_phrase_spans_none() {
        assert!(parse_required_phrase_spans("No required phrases here").is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_empty_braces() {
        assert!(parse_required_phrase_spans("Empty {{}} braces").is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_nested() {
        assert!(parse_required_phrase_spans("Nested {{ outer {{ inner }} }} braces").is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_unclosed() {
        assert!(parse_required_phrase_spans("Unclosed {{ phrase here").is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_unopened() {
        assert!(parse_required_phrase_spans("Unopened }} phrase here").is_empty());
    }

    // ---- tokenize_with_stopwords -------------------------------------------

    #[test]
    fn test_tokenize_with_stopwords_basic() {
        let (tokens, stopwords) = tokenize_with_stopwords("hello div world p test");
        assert_eq!(tokens, vec!["hello", "world", "test"]);
        // One stopword follows "hello" (index 0) and one follows "world" (index 1).
        assert_eq!(stopwords.get(&Some(0)), Some(&1));
        assert_eq!(stopwords.get(&Some(1)), Some(&1));
    }

    #[test]
    fn test_tokenize_with_stopwords_empty() {
        let (tokens, stopwords) = tokenize_with_stopwords("");
        assert!(tokens.is_empty());
        assert!(stopwords.is_empty());
    }

    #[test]
    fn test_tokenize_with_stopwords_no_stopwords() {
        let (tokens, stopwords) = tokenize_with_stopwords("hello world test");
        assert_eq!(tokens, vec!["hello", "world", "test"]);
        assert!(stopwords.is_empty());
    }

    // ---- parse_required_phrase_spans: stopword handling --------------------

    #[test]
    fn test_parse_required_phrase_spans_filters_stopwords_inside() {
        let spans = parse_required_phrase_spans("{{hello a world}}");
        assert_eq!(spans, vec![0..2]);
    }

    #[test]
    fn test_parse_required_phrase_spans_filters_stopwords_outside() {
        let spans = parse_required_phrase_spans("{{Hello}} a {{world}}");
        assert_eq!(spans, vec![0..1, 1..2]);
    }

    #[test]
    fn test_parse_required_phrase_spans_multiple_stopwords() {
        let spans = parse_required_phrase_spans("{{a p div hello}}");
        assert_eq!(spans, vec![0..1]);
    }

    #[test]
    fn test_parse_required_phrase_spans_case_insensitive_stopwords() {
        let spans = parse_required_phrase_spans("{{HELLO A WORLD}}");
        assert_eq!(spans, vec![0..2]);
    }
}