provenant/license_detection/
tokenize.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Text tokenization and normalization.
5//!
6//! Tokenization converts text into a sequence of tokens that can be matched
7//! against license rules. This module implements ScanCode-compatible tokenization.
8
9use crate::license_detection::index::dictionary::{QueryToken, TokenDictionary};
10use regex::Regex;
11use std::collections::HashSet;
12use std::ops::Range;
13use std::sync::LazyLock;
14
/// Marker that opens a required phrase in rule text.
const REQUIRED_PHRASE_OPEN: &str = "{{";
/// Marker that closes a required phrase in rule text.
const REQUIRED_PHRASE_CLOSE: &str = "}}";
17
/// Words that are dropped from matching because they are markup or tooling
/// noise (HTML tags, XML entities, comment markers, ...) rather than license text.
///
/// This is the Rust equivalent of the Python STOPWORDS frozenset from
/// reference/scancode-toolkit/src/licensedcode/stopwords.py
pub(crate) static STOPWORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    // Common XML character references, as in `&quot;`.
    const XML_CHAR_REFS: &[&str] = &["amp", "apos", "gt", "lt", "nbsp", "quot"];

    // Common HTML tags and attributes, as in `<a href=https://link ...>text</a>`.
    const HTML_TAGS: &[&str] = &[
        "a",
        "abbr",
        "alt",
        "blockquote",
        "body",
        "br",
        "class",
        "div",
        "em",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "hr",
        "href",
        "img",
        "li",
        "ol",
        "p",
        "pre",
        "rel",
        "script",
        "span",
        "src",
        "td",
        "th",
        "tr",
        "ul",
    ];

    // Comment line markers: `rem` (batch files) and `dnl` (autotools).
    const COMMENT_MARKERS: &[&str] = &["rem", "dnl"];

    // DocBook tags, as in `<para>`.
    const DOCBOOK_TAGS: &[&str] = &["para", "ulink"];

    // HTML punctuation entities, as in `&emdash;`.
    const HTML_PUNCTUATION: &[&str] = &[
        "bdquo", "bull", "bullet", "colon", "comma", "emdash", "emsp", "ensp", "ge", "hairsp",
        "ldquo", "ldquor", "le", "lpar", "lsaquo", "lsquo", "lsquor", "mdash", "ndash", "numsp",
        "period", "puncsp", "raquo", "rdquo", "rdquor", "rpar", "rsaquo", "rsquo", "rsquor",
        "sbquo", "semi", "thinsp", "tilde",
    ];

    // Some XML character entities.
    const XML_CHAR_ENTITIES: &[&str] = &["x3c", "x3e"];

    // Words seen in many CSS files.
    const CSS_WORDS: &[&str] = &[
        "lists", "side", "nav", "height", "auto", "border", "padding", "width",
    ];

    // Headings seen in Perl PODs.
    const PERL_POD: &[&str] = &["head1", "head2", "head3"];

    // Common in C literals.
    const C_LITERALS: &[&str] = &["printf"];

    // Common in shell scripts.
    const SHELL_WORDS: &[&str] = &["echo"];

    [
        XML_CHAR_REFS,
        HTML_TAGS,
        COMMENT_MARKERS,
        DOCBOOK_TAGS,
        HTML_PUNCTUATION,
        XML_CHAR_ENTITIES,
        CSS_WORDS,
        PERL_POD,
        C_LITERALS,
        SHELL_WORDS,
    ]
    .iter()
    .flat_map(|group| group.iter().copied())
    .collect()
});
107
108/// Splits on whitespace and punctuation: keep only characters and numbers and + when in the middle or end of a word.
109///
110/// The pattern is equivalent to Python's: `[^_\W]+\+?[^_\W]*`
111/// - `[^_\W]+` - one or more characters that are NOT underscore and NOT non-word (i.e., alphanumeric including Unicode)
112/// - `\+?` - optional plus sign (important for license names like "GPL2+")
113/// - `[^_\W]*` - zero or more alphanumeric characters (including Unicode)
114///
115/// This matches word-like sequences while preserving trailing `+` characters.
116/// Uses Unicode-aware matching to match Python's `re.UNICODE` behavior.
117static QUERY_PATTERN: LazyLock<Regex> =
118    LazyLock::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("Invalid regex pattern"));
119
120/// Tokenizes text to match index rules and queries.
121///
122/// Splits text into tokens using regex pattern, normalizes each token (lowercase),
123/// and filters out empty strings and stopwords.
124///
125/// # Returns
126/// A vector of token strings.
127pub fn tokenize(text: &str) -> Vec<String> {
128    if text.is_empty() {
129        return Vec::new();
130    }
131
132    let mut tokens = Vec::new();
133    let lowercase_text = text.to_lowercase();
134
135    for cap in QUERY_PATTERN.find_iter(&lowercase_text) {
136        let token = cap.as_str();
137
138        // Filter out empty strings and stopwords
139        if !token.is_empty() && !STOPWORDS.contains(token) {
140            tokens.push(token.to_string());
141        }
142    }
143
144    tokens
145}
146
147/// Tokenizes text without filtering stopwords.
148///
149/// This is used for query text where stopwords are handled at a later stage.
150///
151/// # Returns
152/// A vector of token strings.
153pub fn tokenize_without_stopwords(text: &str) -> Vec<String> {
154    if text.is_empty() {
155        return Vec::new();
156    }
157
158    let mut tokens = Vec::new();
159    let lowercase_text = text.to_lowercase();
160
161    for cap in QUERY_PATTERN.find_iter(&lowercase_text) {
162        let token = cap.as_str();
163
164        // Filter out empty strings but keep stopwords
165        if !token.is_empty() {
166            tokens.push(token.to_string());
167        }
168    }
169
170    tokens
171}
172
173/// Tokenizes text and returns QueryTokens directly, avoiding string allocation.
174///
175/// This is the primary tokenization function for query processing.
176/// Tokens are classified against the dictionary immediately:
177/// - Known tokens → QueryToken::Known(KnownToken)
178/// - Unknown tokens → QueryToken::Unknown
179/// - Stopwords → QueryToken::Stopword
180///
181/// # Returns
182/// A vector of QueryTokens (no string allocation).
183pub fn tokenize_as_ids(text: &str, dictionary: &TokenDictionary) -> Vec<QueryToken> {
184    if text.is_empty() {
185        return Vec::new();
186    }
187
188    let mut tokens = Vec::new();
189    let stopwords_set = &*STOPWORDS;
190
191    let lowercase_text = text.to_lowercase();
192
193    for cap in QUERY_PATTERN.find_iter(&lowercase_text) {
194        let token = cap.as_str();
195        if token.is_empty() {
196            continue;
197        }
198
199        if stopwords_set.contains(token) {
200            tokens.push(QueryToken::Stopword);
201        } else {
202            tokens.push(dictionary.classify_query_token(token));
203        }
204    }
205
206    tokens
207}
208
209/// Count tokens in text without allocating strings.
210///
211/// Used for line length detection without memory overhead.
212pub fn count_tokens(text: &str) -> usize {
213    if text.is_empty() {
214        return 0;
215    }
216
217    let lowercase_text = text.to_lowercase();
218    let stopwords_set = &*STOPWORDS;
219
220    QUERY_PATTERN
221        .find_iter(&lowercase_text)
222        .filter(|m| !m.as_str().is_empty() && !stopwords_set.contains(m.as_str()))
223        .count()
224}
225
226/// Parse {{...}} required phrase markers from rule text.
227///
228/// Returns list of token position ranges for required phrases.
229/// The spans represent the positions (after tokenization) of tokens
230/// that MUST be matched for the rule to be considered valid.
231///
232/// # Arguments
233/// * `text` - The rule text containing optional {{...}} markers
234///
235/// # Returns
236/// A vector of Range<usize> representing token positions for each required phrase.
237/// Empty vector if no valid required phrases found.
238///
239/// # Examples
240/// ```
241/// use provenant::license_detection::tokenize::parse_required_phrase_spans;
242///
243/// let text = "This is {{enclosed}} in braces";
244/// let spans = parse_required_phrase_spans(text);
245/// assert_eq!(spans, vec![2..3]);
246/// ```
247///
248/// Based on Python: `get_existing_required_phrase_spans()` in tokenize.py:122-174
249pub fn parse_required_phrase_spans(text: &str) -> Vec<Range<usize>> {
250    let mut spans = Vec::new();
251    let mut in_required_phrase = false;
252    let mut current_phrase_positions: Vec<usize> = Vec::new();
253    let mut ipos = 0usize;
254
255    for token in required_phrase_tokenizer(text) {
256        if token == REQUIRED_PHRASE_OPEN {
257            if in_required_phrase {
258                log::warn!(
259                    "Invalid rule with nested required phrase {{ {{ braces: {}",
260                    text
261                );
262                return Vec::new();
263            }
264            in_required_phrase = true;
265        } else if token == REQUIRED_PHRASE_CLOSE {
266            if in_required_phrase {
267                if !current_phrase_positions.is_empty() {
268                    let min_pos = *current_phrase_positions.iter().min().unwrap_or(&0);
269                    let max_pos = *current_phrase_positions.iter().max().unwrap_or(&0);
270                    spans.push(min_pos..max_pos + 1);
271                    current_phrase_positions.clear();
272                } else {
273                    log::warn!(
274                        "Invalid rule with empty required phrase {{}} braces: {}",
275                        text
276                    );
277                    return Vec::new();
278                }
279                in_required_phrase = false;
280            } else {
281                log::warn!(
282                    "Invalid rule with dangling required phrase missing closing braces: {}",
283                    text
284                );
285                return Vec::new();
286            }
287        } else {
288            if in_required_phrase {
289                current_phrase_positions.push(ipos);
290            }
291            ipos += 1;
292        }
293    }
294
295    if !current_phrase_positions.is_empty() || in_required_phrase {
296        log::warn!(
297            "Invalid rule with dangling required phrase missing final closing braces: {}",
298            text
299        );
300        return Vec::new();
301    }
302
303    spans
304}
305
306/// Tokenizer for parsing required phrase markers.
307///
308/// Yields tokens including "{{" and "}}" markers.
309/// Similar to the required_phrase_tokenizer generator in Python.
310fn required_phrase_tokenizer(text: &str) -> RequiredPhraseTokenIter {
311    let lowercase_text = text.to_lowercase();
312    let tokens: Vec<TokenKind> = REQUIRED_PHRASE_PATTERN
313        .find_iter(&lowercase_text)
314        .filter_map(|m| {
315            let token = m.as_str();
316            if token == REQUIRED_PHRASE_OPEN {
317                Some(TokenKind::Open)
318            } else if token == REQUIRED_PHRASE_CLOSE {
319                Some(TokenKind::Close)
320            } else if !token.is_empty() && !STOPWORDS.contains(token) {
321                Some(TokenKind::Word)
322            } else {
323                None
324            }
325        })
326        .collect();
327    RequiredPhraseTokenIter { tokens, pos: 0 }
328}
329
/// Kind of token produced while scanning rule text for required phrase markers.
///
/// Only the kind is stored; the text of a word token is not retained.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TokenKind {
    /// An opening `{{` marker.
    Open,
    /// A closing `}}` marker.
    Close,
    /// A word token.
    Word,
}
336
337struct RequiredPhraseTokenIter {
338    tokens: Vec<TokenKind>,
339    pos: usize,
340}
341
342impl Iterator for RequiredPhraseTokenIter {
343    type Item = &'static str;
344
345    fn next(&mut self) -> Option<Self::Item> {
346        if self.pos >= self.tokens.len() {
347            return None;
348        }
349        let token = self.tokens[self.pos];
350        self.pos += 1;
351        Some(match token {
352            TokenKind::Open => REQUIRED_PHRASE_OPEN,
353            TokenKind::Close => REQUIRED_PHRASE_CLOSE,
354            TokenKind::Word => "word",
355        })
356    }
357}
358
359/// Pattern for matching words and braces in required phrase tokenizer.
360/// Equivalent to Python's: `(?:[^_\W]+\+?[^_\W]*|\{\{|\}\})`
361static REQUIRED_PHRASE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
362    Regex::new(r"(?:[^_\W]+\+?[^_\W]*|\{\{|\}\})").expect("Invalid required phrase pattern")
363});
364
365/// Tokenize text and track stopwords by position.
366///
367/// Returns (tokens, stopwords_by_pos) where:
368/// - tokens: vector of token strings
369/// - stopwords_by_pos: mapping from token position to count of stopwords after that position
370///
371/// Based on Python: `index_tokenizer_with_stopwords()` in tokenize.py:247-306
372pub fn tokenize_with_stopwords(
373    text: &str,
374) -> (Vec<String>, std::collections::HashMap<Option<usize>, usize>) {
375    if text.is_empty() {
376        return (Vec::new(), std::collections::HashMap::new());
377    }
378
379    let mut tokens = Vec::new();
380    let mut stopwords_by_pos = std::collections::HashMap::new();
381
382    let mut pos: Option<usize> = None;
383    let lowercase_text = text.to_lowercase();
384
385    for cap in QUERY_PATTERN.find_iter(&lowercase_text) {
386        let token = cap.as_str();
387        if token.is_empty() {
388            continue;
389        }
390
391        if STOPWORDS.contains(token) {
392            *stopwords_by_pos.entry(pos).or_insert(0) += 1;
393        } else {
394            pos = Some(pos.map_or(0, |p| p + 1));
395            tokens.push(token.to_string());
396        }
397    }
398
399    (tokens, stopwords_by_pos)
400}
401
/// Unit tests for the tokenizers and the required phrase span parser.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize_empty() {
        let result = tokenize("");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_simple() {
        let result = tokenize("Hello World");
        assert_eq!(result, vec!["hello", "world"]);
    }

    #[test]
    fn test_tokenize_with_punctuation() {
        let result = tokenize("Hello, World! This is a test.");
        // Note: 'a' is filtered because it's in STOPWORDS (it's an HTML tag)
        assert_eq!(result, vec!["hello", "world", "this", "is", "test"]);
    }

    #[test]
    fn test_tokenize_with_spaces() {
        let result = tokenize("some Text with   spAces!");
        assert_eq!(result, vec!["some", "text", "with", "spaces"]);
    }

    #[test]
    fn test_tokenize_with_plus() {
        let result = tokenize("GPL2+ and GPL3");
        assert_eq!(result, vec!["gpl2+", "and", "gpl3"]);
    }

    #[test]
    fn test_tokenize_filters_stopwords() {
        let result = tokenize("Hello div World p");
        assert_eq!(result, vec!["hello", "world"]);
    }

    #[test]
    fn test_tokenize_with_special_chars() {
        let result = tokenize("special+-_!@ chars");
        // Based on Python: ['special+', 'chars']
        assert_eq!(result, vec!["special+", "chars"]);
    }

    #[test]
    fn test_tokenize_with_underscores() {
        // Underscores are separators, not word characters.
        let result = tokenize("hello_world foo_bar");
        assert_eq!(result, vec!["hello", "world", "foo", "bar"]);
    }

    #[test]
    fn test_tokenize_with_numbers() {
        let result = tokenize("version 2.0 and 3.0");
        assert_eq!(result, vec!["version", "2", "0", "and", "3", "0"]);
    }

    #[test]
    fn test_tokenize_without_stopwords_keeps_html_tags() {
        let result = tokenize_without_stopwords("Hello div World p");
        assert_eq!(result, vec!["hello", "div", "world", "p"]);
    }

    #[test]
    fn test_tokenize_without_stopwords_empty() {
        let result = tokenize_without_stopwords("");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenization_with_plus_in_middle() {
        // Only the first `+` after a word is kept as part of the token.
        let result = tokenize("C++ and GPL+");
        assert_eq!(result, vec!["c+", "and", "gpl+"]);
    }

    #[test]
    fn test_tokenization_braces() {
        let result = tokenize("{{Hi}}some {{}}Text with{{noth+-_!@ing}}   {{junk}}spAces!");
        assert_eq!(
            result,
            vec![
                "hi", "some", "text", "with", "noth+", "ing", "junk", "spaces"
            ]
        );
    }

    #[test]
    fn test_tokenize_with_ampersand() {
        // "quot" and "gt" are stopwords, so only the plain words remain.
        let result = tokenize("some &quot< markup &gt\"");
        assert_eq!(result, vec!["some", "markup"]);
    }

    #[test]
    fn test_query_tokenizer_brace_case() {
        let result = tokenize("{{}some }}Text with   spAces! + _ -");
        assert_eq!(result, vec!["some", "text", "with", "spaces"]);
    }

    #[test]
    fn test_tokenize_unicode_characters() {
        // With Unicode pattern [^_\W], we match Unicode letters like Python's re.UNICODE
        let result = tokenize("hello 世界 mir");
        assert_eq!(result, vec!["hello", "世界", "mir"]);
    }

    #[test]
    fn test_tokenize_only_special_chars() {
        let result = tokenize("!@#$%^&*()");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_only_punctuation() {
        let result = tokenize(".,;:!?-_=+[]{}()");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_only_stopwords() {
        let result = tokenize("div p a br");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_mixed_stopwords_and_words() {
        let result = tokenize("div hello p world a test");
        assert_eq!(result, vec!["hello", "world", "test"]);
    }

    #[test]
    fn test_tokenize_very_long_text() {
        let words: Vec<String> = (0..1000).map(|i| format!("word{}", i)).collect();
        let text = words.join(" ");
        let result = tokenize(&text);
        assert_eq!(result.len(), 1000);
        assert_eq!(result[0], "word0");
        assert_eq!(result[999], "word999");
    }

    #[test]
    fn test_tokenize_with_newlines_and_tabs() {
        let result = tokenize("hello\nworld\ttest");
        assert_eq!(result, vec!["hello", "world", "test"]);
    }

    #[test]
    fn test_tokenize_with_carriage_return() {
        let result = tokenize("hello\r\nworld\rtest");
        assert_eq!(result, vec!["hello", "world", "test"]);
    }

    #[test]
    fn test_tokenize_trailing_plus() {
        let result = tokenize("GPL2+ LGPL3+");
        assert_eq!(result, vec!["gpl2+", "lgpl3+"]);
    }

    #[test]
    fn test_tokenize_leading_plus() {
        // A `+` is only kept when it follows a word character.
        let result = tokenize("+hello +world");
        assert_eq!(result, vec!["hello", "world"]);
    }

    #[test]
    fn test_tokenize_without_stopwords_preserves_all() {
        let result = tokenize_without_stopwords("div p a br");
        assert_eq!(result, vec!["div", "p", "a", "br"]);
    }

    #[test]
    fn test_tokenize_without_stopwords_unicode() {
        // With Unicode pattern [^_\W], we match Unicode letters like Python's re.UNICODE
        let result = tokenize_without_stopwords("hello 世界");
        assert_eq!(result, vec!["hello", "世界"]);
    }

    #[test]
    fn test_tokenize_without_stopwords_only_special() {
        let result = tokenize_without_stopwords("!@#$%");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_consecutive_plus() {
        let result = tokenize("a++b");
        assert_eq!(result, vec!["a+", "b"]);
    }

    #[test]
    fn test_tokenize_hyphenated_words() {
        let result = tokenize("some-thing foo-bar");
        assert_eq!(result, vec!["some", "thing", "foo", "bar"]);
    }

    #[test]
    fn test_tokenize_email_address() {
        let result = tokenize("test@example.com");
        assert_eq!(result, vec!["test", "example", "com"]);
    }

    #[test]
    fn test_tokenize_url() {
        let result = tokenize("https://example.com/path");
        assert_eq!(result, vec!["https", "example", "com", "path"]);
    }

    #[test]
    fn test_tokenize_version_number() {
        let result = tokenize("version 1.2.3");
        assert_eq!(result, vec!["version", "1", "2", "3"]);
    }

    #[test]
    fn test_tokenize_xml_entities() {
        // "lt", "gt" and "div" are all stopwords.
        let result = tokenize("&lt;div&gt;hello&lt;/div&gt;");
        assert_eq!(result, vec!["hello"]);
    }

    #[test]
    fn test_tokenize_whitespace_only() {
        let result = tokenize("   \t\n\r   ");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_single_char() {
        // "a" is a stopword.
        let result = tokenize("a");
        assert!(result.is_empty());
    }

    #[test]
    fn test_tokenize_single_word() {
        let result = tokenize("hello");
        assert_eq!(result, vec!["hello"]);
    }

    #[test]
    fn test_tokenize_numbers_only() {
        let result = tokenize("123 456 789");
        assert_eq!(result, vec!["123", "456", "789"]);
    }

    #[test]
    fn test_tokenize_alphanumeric_mixed() {
        let result = tokenize("abc123 def456");
        assert_eq!(result, vec!["abc123", "def456"]);
    }

    #[test]
    fn test_tokenize_underscore_separated() {
        let result = tokenize("hello_world foo_bar_baz");
        assert_eq!(result, vec!["hello", "world", "foo", "bar", "baz"]);
    }

    #[test]
    fn test_tokenize_all_stopwords_from_list() {
        let result = tokenize("amp lt gt nbsp quot");
        assert!(result.is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_single() {
        let text = "This is {{enclosed}} in braces";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![2..3]);
    }

    #[test]
    fn test_parse_required_phrase_spans_multiword() {
        // "a" is a stopword and does not take up a token position.
        let text = "This is {{a required phrase}} here";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![2..4]);
    }

    #[test]
    fn test_parse_required_phrase_spans_multiple() {
        let text = "{{First}} and {{second}} phrase";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![0..1, 2..3]);
    }

    #[test]
    fn test_parse_required_phrase_spans_none() {
        let text = "No required phrases here";
        let spans = parse_required_phrase_spans(text);
        assert!(spans.is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_empty_braces() {
        let text = "Empty {{}} braces";
        let spans = parse_required_phrase_spans(text);
        assert!(spans.is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_nested() {
        let text = "Nested {{ outer {{ inner }} }} braces";
        let spans = parse_required_phrase_spans(text);
        assert!(spans.is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_unclosed() {
        let text = "Unclosed {{ phrase here";
        let spans = parse_required_phrase_spans(text);
        assert!(spans.is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_unopened() {
        let text = "Unopened }} phrase here";
        let spans = parse_required_phrase_spans(text);
        assert!(spans.is_empty());
    }

    #[test]
    fn test_tokenize_with_stopwords_basic() {
        let text = "hello div world p test";
        let (tokens, stopwords) = tokenize_with_stopwords(text);
        assert_eq!(tokens, vec!["hello", "world", "test"]);
        // "div" is stopword after "hello" (pos 0), "p" is stopword after "world" (pos 1)
        assert_eq!(stopwords.get(&Some(0)), Some(&1));
        assert_eq!(stopwords.get(&Some(1)), Some(&1));
    }

    #[test]
    fn test_tokenize_with_stopwords_empty() {
        let (tokens, stopwords) = tokenize_with_stopwords("");
        assert!(tokens.is_empty());
        assert!(stopwords.is_empty());
    }

    #[test]
    fn test_tokenize_with_stopwords_no_stopwords() {
        let text = "hello world test";
        let (tokens, stopwords) = tokenize_with_stopwords(text);
        assert_eq!(tokens, vec!["hello", "world", "test"]);
        assert!(stopwords.is_empty());
    }

    #[test]
    fn test_parse_required_phrase_spans_filters_stopwords_inside() {
        let text = "{{hello a world}}";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![0..2]);
    }

    #[test]
    fn test_parse_required_phrase_spans_filters_stopwords_outside() {
        let text = "{{Hello}} a {{world}}";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![0..1, 1..2]);
    }

    #[test]
    fn test_parse_required_phrase_spans_multiple_stopwords() {
        let text = "{{a p div hello}}";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![0..1]);
    }

    #[test]
    fn test_parse_required_phrase_spans_case_insensitive_stopwords() {
        let text = "{{HELLO A WORLD}}";
        let spans = parse_required_phrase_spans(text);
        assert_eq!(spans, vec![0..2]);
    }
}
provenant/license_detection/tokenize.rs

provenant/license_detection/
tokenize.rs