1use crate::license_detection::index::dictionary::{QueryToken, TokenDictionary};
10use regex::Regex;
11use std::collections::HashSet;
12use std::ops::Range;
13use std::sync::LazyLock;
14
/// Marker that opens a required phrase inside rule text.
15const REQUIRED_PHRASE_OPEN: &str = "{{";
/// Marker that closes a required phrase inside rule text.
16const REQUIRED_PHRASE_CLOSE: &str = "}}";
17
/// Lowercase words that the tokenizers in this file treat as noise.
///
/// The set is built once, on first access. Tokens are compared against it
/// after lowercasing, so every entry here is lowercase.
///
/// NOTE(review): the group labels below are inferred from the words
/// themselves; confirm against the upstream stopword list.
pub(crate) static STOPWORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    // Basic character entity names, e.g. the `amp` in `&amp;`.
    const BASIC_ENTITIES: &[&str] = &["amp", "apos", "gt", "lt", "nbsp", "quot"];

    // Markup tag and attribute names.
    const MARKUP_NAMES: &[&str] = &[
        "a",
        "abbr",
        "alt",
        "blockquote",
        "body",
        "br",
        "class",
        "div",
        "em",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "hr",
        "href",
        "img",
        "li",
        "ol",
        "p",
        "pre",
        "rel",
        "script",
        "span",
        "src",
        "td",
        "th",
        "tr",
        "ul",
    ];

    // Presumably comment markers (`rem`, `dnl`) and documentation tags.
    const COMMENT_AND_DOC_MARKERS: &[&str] = &["rem", "dnl", "para", "ulink"];

    // Punctuation and spacing entity names.
    const PUNCTUATION_ENTITIES: &[&str] = &[
        "bdquo", "bull", "bullet", "colon", "comma", "emdash", "emsp", "ensp", "ge", "hairsp",
        "ldquo", "ldquor", "le", "lpar", "lsaquo", "lsquo", "lsquor", "mdash", "ndash", "numsp",
        "period", "puncsp", "raquo", "rdquo", "rdquor", "rpar", "rsaquo", "rsquo", "rsquor",
        "sbquo", "semi", "thinsp", "tilde",
    ];

    // Hexadecimal character codes for `<` (0x3C) and `>` (0x3E).
    const HEX_CODES: &[&str] = &["x3c", "x3e"];

    // Presumably layout/styling vocabulary.
    const LAYOUT_WORDS: &[&str] = &[
        "lists", "side", "nav", "height", "auto", "border", "padding", "width",
    ];

    // Heading markers.
    const HEADING_MARKERS: &[&str] = &["head1", "head2", "head3"];

    // Output command names.
    const OUTPUT_COMMANDS: &[&str] = &["printf", "echo"];

    let groups: [&[&'static str]; 8] = [
        BASIC_ENTITIES,
        MARKUP_NAMES,
        COMMENT_AND_DOC_MARKERS,
        PUNCTUATION_ENTITIES,
        HEX_CODES,
        LAYOUT_WORDS,
        HEADING_MARKERS,
        OUTPUT_COMMANDS,
    ];

    groups
        .iter()
        .flat_map(|group| group.iter().copied())
        .collect()
});
107
108static QUERY_PATTERN: LazyLock<Regex> =
118 LazyLock::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("Invalid regex pattern"));
119
120pub fn tokenize(text: &str) -> Vec<String> {
128 if text.is_empty() {
129 return Vec::new();
130 }
131
132 let mut tokens = Vec::new();
133 let lowercase_text = text.to_lowercase();
134
135 for cap in QUERY_PATTERN.find_iter(&lowercase_text) {
136 let token = cap.as_str();
137
138 if !token.is_empty() && !STOPWORDS.contains(token) {
140 tokens.push(token.to_string());
141 }
142 }
143
144 tokens
145}
146
147pub fn tokenize_without_stopwords(text: &str) -> Vec<String> {
154 if text.is_empty() {
155 return Vec::new();
156 }
157
158 let mut tokens = Vec::new();
159 let lowercase_text = text.to_lowercase();
160
161 for cap in QUERY_PATTERN.find_iter(&lowercase_text) {
162 let token = cap.as_str();
163
164 if !token.is_empty() {
166 tokens.push(token.to_string());
167 }
168 }
169
170 tokens
171}
172
173pub fn tokenize_as_ids(text: &str, dictionary: &TokenDictionary) -> Vec<QueryToken> {
184 if text.is_empty() {
185 return Vec::new();
186 }
187
188 let mut tokens = Vec::new();
189 let stopwords_set = &*STOPWORDS;
190
191 let lowercase_text = text.to_lowercase();
192
193 for cap in QUERY_PATTERN.find_iter(&lowercase_text) {
194 let token = cap.as_str();
195 if token.is_empty() {
196 continue;
197 }
198
199 if stopwords_set.contains(token) {
200 tokens.push(QueryToken::Stopword);
201 } else {
202 tokens.push(dictionary.classify_query_token(token));
203 }
204 }
205
206 tokens
207}
208
209pub fn count_tokens(text: &str) -> usize {
213 if text.is_empty() {
214 return 0;
215 }
216
217 let lowercase_text = text.to_lowercase();
218 let stopwords_set = &*STOPWORDS;
219
220 QUERY_PATTERN
221 .find_iter(&lowercase_text)
222 .filter(|m| !m.as_str().is_empty() && !stopwords_set.contains(m.as_str()))
223 .count()
224}
225
226pub fn parse_required_phrase_spans(text: &str) -> Vec<Range<usize>> {
243 let mut spans = Vec::new();
244 let mut in_required_phrase = false;
245 let mut current_phrase_positions: Vec<usize> = Vec::new();
246 let mut ipos = 0usize;
247
248 for token in required_phrase_tokenizer(text) {
249 if token == REQUIRED_PHRASE_OPEN {
250 if in_required_phrase {
251 log::warn!(
252 "Invalid rule with nested required phrase {{ {{ braces: {}",
253 text
254 );
255 return Vec::new();
256 }
257 in_required_phrase = true;
258 } else if token == REQUIRED_PHRASE_CLOSE {
259 if in_required_phrase {
260 if !current_phrase_positions.is_empty() {
261 let min_pos = *current_phrase_positions.iter().min().unwrap_or(&0);
262 let max_pos = *current_phrase_positions.iter().max().unwrap_or(&0);
263 spans.push(min_pos..max_pos + 1);
264 current_phrase_positions.clear();
265 } else {
266 log::warn!(
267 "Invalid rule with empty required phrase {{}} braces: {}",
268 text
269 );
270 return Vec::new();
271 }
272 in_required_phrase = false;
273 } else {
274 log::warn!(
275 "Invalid rule with dangling required phrase missing closing braces: {}",
276 text
277 );
278 return Vec::new();
279 }
280 } else {
281 if in_required_phrase {
282 current_phrase_positions.push(ipos);
283 }
284 ipos += 1;
285 }
286 }
287
288 if !current_phrase_positions.is_empty() || in_required_phrase {
289 log::warn!(
290 "Invalid rule with dangling required phrase missing final closing braces: {}",
291 text
292 );
293 return Vec::new();
294 }
295
296 spans
297}
298
299fn required_phrase_tokenizer(text: &str) -> RequiredPhraseTokenIter {
304 let lowercase_text = text.to_lowercase();
305 let tokens: Vec<TokenKind> = REQUIRED_PHRASE_PATTERN
306 .find_iter(&lowercase_text)
307 .filter_map(|m| {
308 let token = m.as_str();
309 if token == REQUIRED_PHRASE_OPEN {
310 Some(TokenKind::Open)
311 } else if token == REQUIRED_PHRASE_CLOSE {
312 Some(TokenKind::Close)
313 } else if !token.is_empty() && !STOPWORDS.contains(token) {
314 Some(TokenKind::Word)
315 } else {
316 None
317 }
318 })
319 .collect();
320 RequiredPhraseTokenIter { tokens, pos: 0 }
321}
322
/// Classification of a token seen while scanning for required phrases.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TokenKind {
    /// The `{{` marker.
    Open,
    /// The `}}` marker.
    Close,
    /// Any non-stopword word token.
    Word,
}
329
330struct RequiredPhraseTokenIter {
331 tokens: Vec<TokenKind>,
332 pos: usize,
333}
334
335impl Iterator for RequiredPhraseTokenIter {
336 type Item = &'static str;
337
338 fn next(&mut self) -> Option<Self::Item> {
339 if self.pos >= self.tokens.len() {
340 return None;
341 }
342 let token = self.tokens[self.pos];
343 self.pos += 1;
344 Some(match token {
345 TokenKind::Open => REQUIRED_PHRASE_OPEN,
346 TokenKind::Close => REQUIRED_PHRASE_CLOSE,
347 TokenKind::Word => "word",
348 })
349 }
350}
351
352static REQUIRED_PHRASE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
355 Regex::new(r"(?:[^_\W]+\+?[^_\W]*|\{\{|\}\})").expect("Invalid required phrase pattern")
356});
357
358pub fn tokenize_with_stopwords(
366 text: &str,
367) -> (Vec<String>, std::collections::HashMap<Option<usize>, usize>) {
368 if text.is_empty() {
369 return (Vec::new(), std::collections::HashMap::new());
370 }
371
372 let mut tokens = Vec::new();
373 let mut stopwords_by_pos = std::collections::HashMap::new();
374
375 let mut pos: Option<usize> = None;
376 let lowercase_text = text.to_lowercase();
377
378 for cap in QUERY_PATTERN.find_iter(&lowercase_text) {
379 let token = cap.as_str();
380 if token.is_empty() {
381 continue;
382 }
383
384 if STOPWORDS.contains(token) {
385 *stopwords_by_pos.entry(pos).or_insert(0) += 1;
386 } else {
387 pos = Some(pos.map_or(0, |p| p + 1));
388 tokens.push(token.to_string());
389 }
390 }
391
392 (tokens, stopwords_by_pos)
393}
394
/// Unit tests for the tokenizers and the required-phrase span parser.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize_empty() {
        let result = tokenize("");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_simple() {
        let result = tokenize("Hello World");
        assert_eq!(result, vec!["hello", "world"]);
    }

    #[test]
    fn test_tokenize_with_punctuation() {
        // "a" is a stopword, so it does not appear in the output.
        let result = tokenize("Hello, World! This is a test.");
        assert_eq!(result, vec!["hello", "world", "this", "is", "test"]);
    }

    #[test]
    fn test_tokenize_with_spaces() {
        let result = tokenize("some Text with spAces!");
        assert_eq!(result, vec!["some", "text", "with", "spaces"]);
    }

    #[test]
    fn test_tokenize_with_plus() {
        let result = tokenize("GPL2+ and GPL3");
        assert_eq!(result, vec!["gpl2+", "and", "gpl3"]);
    }

    #[test]
    fn test_tokenize_filters_stopwords() {
        let result = tokenize("Hello div World p");
        assert_eq!(result, vec!["hello", "world"]);
    }

    #[test]
    fn test_tokenize_with_special_chars() {
        let result = tokenize("special+-_!@ chars");
        assert_eq!(result, vec!["special+", "chars"]);
    }

    #[test]
    fn test_tokenize_with_underscores() {
        let result = tokenize("hello_world foo_bar");
        assert_eq!(result, vec!["hello", "world", "foo", "bar"]);
    }

    #[test]
    fn test_tokenize_with_numbers() {
        let result = tokenize("version 2.0 and 3.0");
        assert_eq!(result, vec!["version", "2", "0", "and", "3", "0"]);
    }

    #[test]
    fn test_tokenize_without_stopwords_keeps_html_tags() {
        let result = tokenize_without_stopwords("Hello div World p");
        assert_eq!(result, vec!["hello", "div", "world", "p"]);
    }

    #[test]
    fn test_tokenize_without_stopwords_empty() {
        let result = tokenize_without_stopwords("");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenization_with_plus_in_middle() {
        let result = tokenize("C++ and GPL+");
        assert_eq!(result, vec!["c+", "and", "gpl+"]);
    }

    #[test]
    fn test_tokenization_braces() {
        let result = tokenize("{{Hi}}some {{}}Text with{{noth+-_!@ing}} {{junk}}spAces!");
        assert_eq!(
            result,
            vec![
                "hi", "some", "text", "with", "noth+", "ing", "junk", "spaces"
            ]
        );
    }

    #[test]
    fn test_tokenize_with_ampersand() {
        // The entity names `quot`, `lt` and `gt` are stopwords; only the two
        // real words survive.
        let result = tokenize("some &quot;&lt; markup &gt;\"");
        assert_eq!(result, vec!["some", "markup"]);
    }

    #[test]
    fn test_query_tokenizer_brace_case() {
        let result = tokenize("{{}some }}Text with spAces! + _ -");
        assert_eq!(result, vec!["some", "text", "with", "spaces"]);
    }

    #[test]
    fn test_tokenize_unicode_characters() {
        let result = tokenize("hello 世界 mir");
        assert_eq!(result, vec!["hello", "世界", "mir"]);
    }

    #[test]
    fn test_tokenize_only_special_chars() {
        let result = tokenize("!@#$%^&*()");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_only_punctuation() {
        let result = tokenize(".,;:!?-_=+[]{}()");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_only_stopwords() {
        let result = tokenize("div p a br");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_mixed_stopwords_and_words() {
        let result = tokenize("div hello p world a test");
        assert_eq!(result, vec!["hello", "world", "test"]);
    }

    #[test]
    fn test_tokenize_very_long_text() {
        let words: Vec<String> = (0..1000).map(|i| format!("word{}", i)).collect();
        let text = words.join(" ");
        let result = tokenize(&text);
        assert_eq!(result.len(), 1000);
        assert_eq!(result[0], "word0");
        assert_eq!(result[999], "word999");
    }

    #[test]
    fn test_tokenize_with_newlines_and_tabs() {
        let result = tokenize("hello\nworld\ttest");
        assert_eq!(result, vec!["hello", "world", "test"]);
    }

    #[test]
    fn test_tokenize_with_carriage_return() {
        let result = tokenize("hello\r\nworld\rtest");
        assert_eq!(result, vec!["hello", "world", "test"]);
    }

    #[test]
    fn test_tokenize_trailing_plus() {
        let result = tokenize("GPL2+ LGPL3+");
        assert_eq!(result, vec!["gpl2+", "lgpl3+"]);
    }

    #[test]
    fn test_tokenize_leading_plus() {
        let result = tokenize("+hello +world");
        assert_eq!(result, vec!["hello", "world"]);
    }

    #[test]
    fn test_tokenize_without_stopwords_preserves_all() {
        let result = tokenize_without_stopwords("div p a br");
        assert_eq!(result, vec!["div", "p", "a", "br"]);
    }

    #[test]
    fn test_tokenize_without_stopwords_unicode() {
        let result = tokenize_without_stopwords("hello 世界");
        assert_eq!(result, vec!["hello", "世界"]);
    }

    #[test]
    fn test_tokenize_without_stopwords_only_special() {
        let result = tokenize_without_stopwords("!@#$%");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_consecutive_plus() {
        let result = tokenize("a++b");
        assert_eq!(result, vec!["a+", "b"]);
    }

    #[test]
    fn test_tokenize_hyphenated_words() {
        let result = tokenize("some-thing foo-bar");
        assert_eq!(result, vec!["some", "thing", "foo", "bar"]);
    }

    #[test]
    fn test_tokenize_email_address() {
        let result = tokenize("test@example.com");
        assert_eq!(result, vec!["test", "example", "com"]);
    }

    #[test]
    fn test_tokenize_url() {
        let result = tokenize("https://example.com/path");
        assert_eq!(result, vec!["https", "example", "com", "path"]);
    }

    #[test]
    fn test_tokenize_version_number() {
        let result = tokenize("version 1.2.3");
        assert_eq!(result, vec!["version", "1", "2", "3"]);
    }

    #[test]
    fn test_tokenize_xml_entities() {
        let result = tokenize("<div>hello</div>");
        assert_eq!(result, vec!["hello"]);
    }

    #[test]
    fn test_tokenize_whitespace_only() {
        let result = tokenize(" \t\n\r ");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_single_char() {
        let result = tokenize("a");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_single_word() {
        let result = tokenize("hello");
        assert_eq!(result, vec!["hello"]);
    }

    #[test]
    fn test_tokenize_numbers_only() {
        let result = tokenize("123 456 789");
        assert_eq!(result, vec!["123", "456", "789"]);
    }

    #[test]
    fn test_tokenize_alphanumeric_mixed() {
        let result = tokenize("abc123 def456");
        assert_eq!(result, vec!["abc123", "def456"]);
    }

    #[test]
    fn test_tokenize_underscore_separated() {
        let result = tokenize("hello_world foo_bar_baz");
        assert_eq!(result, vec!["hello", "world", "foo", "bar", "baz"]);
    }

    #[test]
    fn test_tokenize_all_stopwords_from_list() {
        let result = tokenize("amp lt gt nbsp quot");
        assert!(result.is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_single() {
        let text = "This is {{enclosed}} in braces";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![2..3]);
    }

    #[test]
    fn test_parse_required_phrase_spans_multiword() {
        // "a" is a stopword and takes no position, so the phrase covers 2..4.
        let text = "This is {{a required phrase}} here";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![2..4]);
    }

    #[test]
    fn test_parse_required_phrase_spans_multiple() {
        let text = "{{First}} and {{second}} phrase";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![0..1, 2..3]);
    }

    #[test]
    fn test_parse_required_phrase_spans_none() {
        let text = "No required phrases here";
        let spans = parse_required_phrase_spans(text);
        assert!(spans.is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_empty_braces() {
        let text = "Empty {{}} braces";
        let spans = parse_required_phrase_spans(text);
        assert!(spans.is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_nested() {
        let text = "Nested {{ outer {{ inner }} }} braces";
        let spans = parse_required_phrase_spans(text);
        assert!(spans.is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_unclosed() {
        let text = "Unclosed {{ phrase here";
        let spans = parse_required_phrase_spans(text);
        assert!(spans.is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_unopened() {
        let text = "Unopened }} phrase here";
        let spans = parse_required_phrase_spans(text);
        assert!(spans.is_empty());
    }

    #[test]
    fn test_tokenize_with_stopwords_basic() {
        let text = "hello div world p test";
        let (tokens, stopwords) = tokenize_with_stopwords(text);
        assert_eq!(tokens, vec!["hello", "world", "test"]);
        // One stopword follows token 0 ("div") and one follows token 1 ("p").
        assert_eq!(stopwords.get(&Some(0)), Some(&1));
        assert_eq!(stopwords.get(&Some(1)), Some(&1));
    }

    #[test]
    fn test_tokenize_with_stopwords_empty() {
        let (tokens, stopwords) = tokenize_with_stopwords("");
        assert!(tokens.is_empty());
        assert!(stopwords.is_empty());
    }

    #[test]
    fn test_tokenize_with_stopwords_no_stopwords() {
        let text = "hello world test";
        let (tokens, stopwords) = tokenize_with_stopwords(text);
        assert_eq!(tokens, vec!["hello", "world", "test"]);
        assert!(stopwords.is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_filters_stopwords_inside() {
        let text = "{{hello a world}}";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![0..2]);
    }

    #[test]
    fn test_parse_required_phrase_spans_filters_stopwords_outside() {
        let text = "{{Hello}} a {{world}}";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![0..1, 1..2]);
    }

    #[test]
    fn test_parse_required_phrase_spans_multiple_stopwords() {
        let text = "{{a p div hello}}";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![0..1]);
    }

    #[test]
    fn test_parse_required_phrase_spans_case_insensitive_stopwords() {
        let text = "{{HELLO A WORLD}}";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![0..2]);
    }
}