use decompound::{decompound, DecompositionOptions};
use once_cell::sync::Lazy;
use probe_code::ranking::get_stemmer;
use probe_code::search::term_exceptions::{is_exception_term, EXCEPTION_TERMS};
use std::collections::HashSet;
use std::sync::Mutex;

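/// Special terms registered at runtime via [`add_special_term`]; consulted
/// by [`is_special_case`] in addition to the built-in list. Wrapped in a
/// `Mutex` so registration and lookup are safe across threads.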
static DYNAMIC_SPECIAL_TERMS: Lazy<Mutex<HashSet<String>>> =
    Lazy::new(|| Mutex::new(HashSet::new()));

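/// Registers `term` (lowercased) as a runtime special term so that
/// [`is_special_case`] and the splitters treat it as unsplittable.
///
/// A minimal sketch ("MyProject" is an illustrative term, not a built-in;
/// not compiled as a doctest):
///
/// ```ignore
/// add_special_term("MyProject");
/// assert!(is_special_case("myproject"));
/// ```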
pub fn add_special_term(term: &str) {
    let mut special_terms = DYNAMIC_SPECIAL_TERMS.lock().unwrap();
    special_terms.insert(term.to_lowercase());

    if std::env::var("DEBUG").unwrap_or_default() == "1" {
        println!("DEBUG: Added special term: {term}");
    }
}

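/// Standard English stop words (articles, pronouns, auxiliaries, and their
/// contractions) that carry no search signal and are filtered out during
/// tokenization.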
static ENGLISH_STOP_WORDS: Lazy<HashSet<String>> = Lazy::new(|| {
    vec![
        "a",
        "about",
        "above",
        "after",
        "again",
        "against",
        "all",
        "am",
        "an",
        "and",
        "any",
        "are",
        "aren't",
        "as",
        "at",
        "be",
        "because",
        "been",
        "before",
        "being",
        "below",
        "between",
        "both",
        "but",
        "by",
        "can't",
        "cannot",
        "could",
        "couldn't",
        "did",
        "didn't",
        "do",
        "does",
        "doesn't",
        "doing",
        "don't",
        "down",
        "during",
        "each",
        "few",
        "for",
        "from",
        "further",
        "had",
        "hadn't",
        "has",
        "hasn't",
        "have",
        "haven't",
        "having",
        "he",
        "he'd",
        "he'll",
        "he's",
        "her",
        "here",
        "here's",
        "hers",
        "herself",
        "him",
        "himself",
        "his",
        "how",
        "how's",
        "i",
        "i'd",
        "i'll",
        "i'm",
        "i've",
        "if",
        "in",
        "into",
        "is",
        "isn't",
        "it",
        "it's",
        "its",
        "itself",
        "let's",
        "me",
        "more",
        "most",
        "mustn't",
        "my",
        "myself",
        "no",
        "nor",
        "not",
        "of",
        "off",
        "on",
        "once",
        "only",
        "or",
        "other",
        "ought",
        "our",
        "ours",
        "ourselves",
        "out",
        "over",
        "own",
        "same",
        "shan't",
        "she",
        "she'd",
        "she'll",
        "she's",
        "should",
        "shouldn't",
        "so",
        "some",
        "such",
        "than",
        "that",
        "that's",
        "the",
        "their",
        "theirs",
        "them",
        "themselves",
        "then",
        "there",
        "there's",
        "these",
        "they",
        "they'd",
        "they'll",
        "they're",
        "they've",
        "this",
        "those",
        "through",
        "to",
        "too",
        "under",
        "until",
        "up",
        "very",
        "was",
        "wasn't",
        "we",
        "we'd",
        "we'll",
        "we're",
        "we've",
        "were",
        "weren't",
        "what",
        "what's",
        "when",
        "when's",
        "where",
        "where's",
        "which",
        "while",
        "who",
        "who's",
        "whom",
        "why",
        "why's",
        "with",
        "won't",
        "would",
        "wouldn't",
        "you",
        "you'd",
        "you'll",
        "you're",
        "you've",
        "your",
        "yours",
        "yourself",
        "yourselves",
        // a bare "ing" fragment, presumably left behind by aggressive
        // splitting; treated as noise
        "ing",
    ]
    .into_iter()
    .map(String::from)
    .collect()
});

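/// Language keywords and other tokens so common in source code that they
/// would match nearly everything; they are stripped from queries alongside
/// the English stop words.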
static PROGRAMMING_STOP_WORDS: Lazy<HashSet<String>> = Lazy::new(|| {
    vec![
        // Go keywords
        "func",
        "type",
        "struct",
        "interface",
        "chan",
        "map",
        "go",
        "defer",
        "var",
        // JavaScript / Java keywords
        "let",
        "const",
        "return",
        "if",
        "else",
        "for",
        "while",
        "switch",
        "case",
        "break",
        "continue",
        "default",
        "try",
        "catch",
        "finally",
        "throw",
        "new",
        "super",
        "extends",
        "implements",
        "function",
        "class",
        "method",
        "this",
        // access modifiers and qualifiers
        "public",
        "private",
        "protected",
        "static",
        "final",
        "async",
        "await",
        // primitive types and null-like values
        "string",
        "int",
        "bool",
        "float",
        "void",
        "null",
        "nil",
        // Rust keywords
        "enum",
        "impl",
        "fn",
        "mod",
    ]
    .into_iter()
    .map(String::from)
    .collect()
});

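/// Compound technology terms that must never be split: "graphql" should not
/// become "graph" + "ql", and "whitelist" is more useful intact than as
/// "white" + "list".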
static SPECIAL_CASE_WORDS: Lazy<HashSet<String>> = Lazy::new(|| {
    vec![
        "oauth",
        "oauth2",
        "ipv4",
        "ipv6",
        "ipv",
        "graphql",
        "postgresql",
        "mysql",
        "mongodb",
        "javascript",
        "typescript",
        "nodejs",
        "reactjs",
        "vuejs",
        "angularjs",
        "github",
        "gitlab",
        "bitbucket",
        "kubernetes",
        "docker",
        "webpack",
        "rollup",
        "vite",
        "eslint",
        "prettier",
        "axios",
        "fetch",
        "grpc",
        "http2",
        "whitelist",
        "blacklist",
        "allowlist",
        "blocklist",
        "denylist",
    ]
    .into_iter()
    .map(String::from)
    .collect()
});

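// ASCII-only character classes for the camel-case splitter; non-ASCII
// characters count as neither upper, lower, nor digit.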
#[inline]
fn is_uppercase(c: char) -> bool {
    c.is_ascii_uppercase()
}

#[inline]
fn is_lowercase(c: char) -> bool {
    c.is_ascii_lowercase()
}

#[inline]
fn is_number(c: char) -> bool {
    c.is_ascii_digit()
}

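/// Case-insensitive check against both the built-in special-case words and
/// any terms registered at runtime via [`add_special_term`].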
pub fn is_special_case(word: &str) -> bool {
    let lowercase = word.to_lowercase();

    if SPECIAL_CASE_WORDS.contains(&lowercase) {
        return true;
    }

    let special_terms = DYNAMIC_SPECIAL_TERMS.lock().unwrap();
    if special_terms.contains(&lowercase) {
        if std::env::var("DEBUG").unwrap_or_default() == "1" {
            println!("DEBUG: Found dynamic special term: {lowercase}");
        }
        return true;
    }

    false
}

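/// Splits an identifier on camel-case, acronym, and letter/digit boundaries
/// and returns the parts lowercased. Special-case words are kept whole, and
/// all-lowercase identifiers are probed for known exception terms first.
///
/// A behavioral sketch, mirroring the unit tests below (not compiled as a
/// doctest because the module path is crate-specific):
///
/// ```ignore
/// assert_eq!(
///     split_camel_case("parseJSONToHTML5"),
///     vec!["parse", "json", "to", "html", "5"]
/// );
/// assert_eq!(split_camel_case("OAuth2Provider"), vec!["oauth2", "provider"]);
/// ```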
pub fn split_camel_case(input: &str) -> Vec<String> {
    if input.is_empty() {
        return vec![];
    }

    // Special-case words (built-in or registered at runtime) are never split.
    if is_special_case(input) {
        return vec![input.to_lowercase()];
    }

    let lowercase = input.to_lowercase();

    // Handle "oauth2" explicitly so identifiers like "OAuth2Provider"
    // split into ["oauth2", "provider"].
    if lowercase.starts_with("oauth2") {
        let remaining = &input[6..];
        if !remaining.is_empty() {
            let mut result = vec!["oauth2".to_string()];
            result.extend(split_camel_case(remaining));
            return result;
        }
    }

    // Peel off any special-case word used as a prefix, longest match first.
    let mut special_cases: Vec<&String> = SPECIAL_CASE_WORDS.iter().collect();
    special_cases.sort_by_key(|b| std::cmp::Reverse(b.len()));

    for special_case in special_cases {
        if lowercase.starts_with(special_case) {
            let remaining = &input[special_case.len()..];

            if !remaining.is_empty() {
                let mut result = vec![special_case.clone()];
                result.extend(split_camel_case(remaining));

                return result;
            }
        }
    }

    // For all-lowercase identifiers with no underscores, try to carve out
    // known exception terms as standalone parts.
    if input == lowercase && !input.contains('_') && input.len() > 3 {
        let common_terms = EXCEPTION_TERMS
            .iter()
            .map(|s| s.as_str())
            .collect::<Vec<_>>();

        for term in common_terms {
            if input.contains(term) && term != input {
                let parts: Vec<&str> = input.split(term).collect();
                if parts.len() > 1 {
                    let mut result = Vec::new();
                    for (i, part) in parts.iter().enumerate() {
                        if !part.is_empty() {
                            result.push(part.to_string());
                        }
                        if i < parts.len() - 1 {
                            result.push(term.to_string());
                        }
                    }
                    if !result.is_empty() {
                        return result;
                    }
                }
            }
        }
    }

    let chars: Vec<char> = input.chars().collect();
    let mut result = Vec::new();
    let mut current_word = String::new();

    let mut prev_is_lower = false;
    let mut prev_is_upper = false;
    let mut prev_is_digit = false;

    for (i, &c) in chars.iter().enumerate() {
        let is_upper = is_uppercase(c);
        let is_lower = is_lowercase(c);
        let is_digit = is_number(c);

        // A new word starts at a lower->upper boundary ("camelCase"), at a
        // letter/digit boundary ("html5"), or at the last capital of an
        // acronym run ("APIDefinition" -> "api" + "definition").
        let start_new_word = !current_word.is_empty()
            && ((prev_is_lower && is_upper)
                || (prev_is_digit != is_digit)
                || (prev_is_upper
                    && is_upper
                    && i + 1 < chars.len()
                    && is_lowercase(chars[i + 1])));

        if start_new_word {
            result.push(current_word);
            current_word = String::new();
        }

        current_word.push(c);

        prev_is_lower = is_lower;
        prev_is_upper = is_upper;
        prev_is_digit = is_digit;
    }

    if !current_word.is_empty() {
        result.push(current_word);
    }

    result.into_iter().map(|word| word.to_lowercase()).collect()
}

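/// English stop-word check; standalone integers from 0 to 10 are also
/// treated as stop words.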
pub fn is_english_stop_word(word: &str) -> bool {
    if let Ok(num) = word.parse::<u32>() {
        if num <= 10 {
            return true;
        }
    }

    ENGLISH_STOP_WORDS.contains(word)
}

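/// Returns `true` for language keywords and similar programming noise words.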
pub fn is_programming_stop_word(word: &str) -> bool {
    PROGRAMMING_STOP_WORDS.contains(word)
}

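/// Combined stop-word check: English or programming.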
pub fn is_stop_word(word: &str) -> bool {
    is_english_stop_word(word) || is_programming_stop_word(word)
}

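/// Splits a compound word into known vocabulary parts using the
/// `decompound` crate. Special-case words, exception terms, and words that
/// are themselves in the vocabulary are returned unchanged.
///
/// Mirroring the unit tests below (not compiled as a doctest):
///
/// ```ignore
/// let vocab = HashSet::from(["black".to_string(), "mail".to_string()]);
/// assert_eq!(
///     split_compound_word("blackmail", &vocab),
///     vec!["black".to_string(), "mail".to_string()]
/// );
/// ```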
pub fn split_compound_word(word: &str, vocab: &HashSet<String>) -> Vec<String> {
    if is_special_case(word) {
        return vec![word.to_lowercase()];
    }

    let common_terms = EXCEPTION_TERMS
        .iter()
        .map(|s| s.as_str())
        .collect::<Vec<_>>();

    if common_terms.contains(&word.to_lowercase().as_str()) {
        return vec![word.to_string()];
    }

    // Words that are themselves in the vocabulary are never split;
    // "whitelist" stays intact even if "white" and "list" are both known.
    if vocab.contains(&word.to_lowercase()) {
        return vec![word.to_string()];
    }

    let is_valid_word = |w: &str| vocab.contains(&w.to_lowercase());

    match decompound(word, &is_valid_word, DecompositionOptions::empty()) {
        Ok(parts) if !parts.is_empty() => parts,
        _ => vec![word.to_string()],
    }
}

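/// Returns the static vocabulary used as the dictionary for
/// [`split_compound_word`]. The backing collection is a `HashSet`, so the
/// repeated entries in the literal below (word pairs listed per compound)
/// collapse to a single copy each.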
pub fn load_vocabulary() -> &'static HashSet<String> {
    static VOCABULARY: Lazy<HashSet<String>> = Lazy::new(|| {
        vec![
            "white",
            "black",
            "list",
            "mail",
            "back",
            "ground",
            "book",
            "mark",
            "key",
            "word",
            "pass",
            "fire",
            "wall",
            "firewall",
            "water",
            "fall",
            "data",
            "base",
            "time",
            "stamp",
            "air",
            "port",
            "blue",
            "tooth",
            "green",
            "house",
            "red",
            "hat",
            "yellow",
            "pages",
            "blue",
            "print",
            "type",
            "script",
            "java",
            "script",
            "note",
            "pad",
            "web",
            "site",
            "page",
            "view",
            "code",
            "base",
            "name",
            "space",
            "class",
            "room",
            "work",
            "flow",
            "life",
            "cycle",
            "end",
            "point",
            "check",
            "box",
            "drop",
            "down",
            "pop",
            "up",
            "side",
            "bar",
            "tool",
            "tip",
            "drag",
            "drop",
            "click",
            "stream",
            "line",
            "dead",
            "lock",
            "race",
            "condition",
            "thread",
            "safe",
            "memory",
            "leak",
            "stack",
            "trace",
            "heap",
            "dump",
            "core",
            "file",
            "system",
            "disk",
            "drive",
            "hard",
            "soft",
            "ware",
            "firm",
            "middle",
            "front",
            "back",
            "end",
            "full",
            "stack",
            "dev",
            "ops",
            "micro",
            "service",
            "mono",
            "lith",
            "container",
            "docker",
            "pod",
            "cloud",
            "native",
            "server",
            "less",
            "function",
            "as",
            "service",
            "infra",
            "structure",
            "platform",
            "test",
            "driven",
            "behavior",
            "continuous",
            "integration",
            "deployment",
            "delivery",
            "pipeline",
            "git",
            "hub",
            "lab",
            "version",
            "control",
            "branch",
            "merge",
            "pull",
            "request",
            "commit",
            "push",
            "clone",
            "fork",
            "repository",
            "issue",
            "bug",
            "feature",
            "release",
            "tag",
            "semantic",
            "versioning",
            "major",
            "minor",
            "patch",
            "alpha",
            "beta",
            "stable",
            "unstable",
            "deprecated",
            "legacy",
            "modern",
            "framework",
            "library",
            "package",
            "module",
            "component",
            "prop",
            "state",
            "hook",
            "effect",
            "context",
            "provider",
            "consumer",
            "reducer",
            "action",
            "store",
            "dispatch",
            "subscribe",
            "publish",
            "event",
            "handler",
            "listener",
            "callback",
            "promise",
            "async",
            "await",
            "future",
            "stream",
            "observable",
            "reactive",
            "functional",
            "object",
            "oriented",
            "procedural",
            "declarative",
            "imperative",
            "mutable",
            "immutable",
            "pure",
            "side",
            "effect",
            "higher",
            "order",
            "first",
            "class",
            "citizen",
            "closure",
            "scope",
            "lexical",
            "dynamic",
            "static",
            "type",
            "inference",
            "checking",
            "compile",
            "time",
            "run",
            "error",
            "exception",
            "try",
            "catch",
            "finally",
            "throw",
            "raise",
            "handle",
            "logging",
            "debug",
            "info",
            "warn",
            "error",
            "fatal",
            "trace",
            "metric",
            "monitor",
            "alert",
            "notification",
            "dashboard",
            "report",
            "analytics",
            "insight",
            "data",
            "science",
            "machine",
            "learning",
            "artificial",
            "intelligence",
            "neural",
            "network",
            "deep",
            "reinforcement",
            "supervised",
            "unsupervised",
            "classification",
            "regression",
            "clustering",
            "recommendation",
            "prediction",
            "inference",
            "training",
            "validation",
            "test",
            "accuracy",
            "precision",
            "recall",
            "f1",
            "score",
            "loss",
            "function",
            "gradient",
            "descent",
            "back",
            "propagation",
            "forward",
            "pass",
            "epoch",
            "batch",
            "mini",
            "over",
            "fitting",
            "under",
            "regularization",
            "dropout",
            "batch",
            "normalization",
            "activation",
            "sigmoid",
            "tanh",
            "relu",
            "leaky",
            "softmax",
            "convolution",
            "pooling",
            "recurrent",
            "lstm",
            "gru",
            "transformer",
            "attention",
            "encoder",
            "decoder",
            "embedding",
            "token",
            "tokenization",
            "stemming",
            "lemmatization",
            "stop",
            "word",
            "n",
            "gram",
            "tf",
            "idf",
            "cosine",
            "similarity",
            "euclidean",
            "distance",
            "manhattan",
            "jaccard",
            "index",
            "precision",
            "recall",
            "relevance",
            "ranking",
            "page",
            "rank",
            "search",
            "engine",
            "crawler",
            "indexer",
            "query",
            "result",
            "snippet",
            "cache",
            "hit",
            "miss",
            "eviction",
            "policy",
            "lru",
            "fifo",
            "lifo",
            "priority",
            "queue",
            "stack",
            "heap",
            "tree",
            "binary",
            "balanced",
            "avl",
            "red",
            "black",
            "b",
            "trie",
            "hash",
            "map",
            "set",
            "list",
            "linked",
            "doubly",
            "circular",
            "array",
            "vector",
            "matrix",
            "tensor",
            "graph",
            "directed",
            "undirected",
            "weighted",
            "unweighted",
            "adjacency",
            "matrix",
            "list",
            "edge",
            "vertex",
            "node",
            "path",
            "cycle",
            "traversal",
            "breadth",
            "first",
            "depth",
            "topological",
            "sort",
            "minimum",
            "spanning",
            "tree",
            "shortest",
            "path",
            "dijkstra",
            "bellman",
            "ford",
            "floyd",
            "warshall",
            "kruskal",
            "prim",
            "greedy",
            "dynamic",
            "programming",
            "divide",
            "conquer",
            "backtracking",
            "branch",
            "bound",
            "heuristic",
            "approximation",
            "randomized",
            "parallel",
            "concurrent",
            "distributed",
            "synchronous",
            "asynchronous",
            "blocking",
            "non",
            "mutex",
            "semaphore",
            "lock",
            "atomic",
            "volatile",
            "transaction",
            "acid",
            "consistency",
            "isolation",
            "durability",
            "serializable",
            "repeatable",
            "read",
            "committed",
            "uncommitted",
            "phantom",
            "dirty",
            "read",
            "write",
            "skew",
            "conflict",
            "resolution",
            "optimistic",
            "pessimistic",
            "two",
            "phase",
            "commit",
            "rollback",
            "savepoint",
            "checkpoint",
            "recovery",
            "backup",
            "restore",
            "archive",
            "log",
            "journal",
            "redo",
            "undo",
            "write",
            "ahead",
            "logging",
            "snapshot",
            "isolation",
            "level",
1011 "serializable",
1012 "repeatable",
1013 "read",
1014 "committed",
1015 "uncommitted",
1016 "phantom",
1017 "dirty",
1018 "read",
1019 "write",
1020 "skew",
1021 "conflict",
1022 "resolution",
1023 "optimistic",
1024 "pessimistic",
1025 "two",
1026 "phase",
1027 "commit",
1028 "rollback",
1029 "savepoint",
1030 "checkpoint",
1031 "recovery",
1032 "backup",
1033 "restore",
1034 "archive",
1035 "log",
1036 "journal",
1037 "redo",
1038 "undo",
1039 "write",
1040 "ahead",
1041 "logging",
1042 "snapshot",
1043 "isolation",
1044 "level",
        ]
        .into_iter()
        .map(String::from)
        .collect()
    });

    &VOCABULARY
}

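/// Tokenizes a single keyword and stems the parts: a camel-case split wins
/// if it produces multiple parts, then a compound-word split, and otherwise
/// the stemmed keyword itself is returned. Stop words are dropped from
/// multi-part results.
///
/// A sketch of the expected behavior (not compiled as a doctest):
///
/// ```ignore
/// // "parseJSON" splits to ["parse", "json"], which stem to:
/// assert_eq!(tokenize_and_stem("parseJSON"), vec!["pars", "json"]);
/// ```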
#[allow(dead_code)]
pub fn tokenize_and_stem(keyword: &str) -> Vec<String> {
    let stemmer = get_stemmer();
    let vocabulary = load_vocabulary();

    let camel_parts = split_camel_case(keyword);

    if camel_parts.len() > 1 {
        camel_parts
            .into_iter()
            .filter(|part| !is_stop_word(part))
            .map(|part| stemmer.stem(&part).to_string())
            .collect()
    } else {
        let compound_parts = split_compound_word(keyword, vocabulary);

        if compound_parts.len() > 1 {
            compound_parts
                .into_iter()
                .filter(|part| !is_stop_word(part))
                .map(|part| stemmer.stem(&part).to_string())
                .collect()
        } else {
            vec![stemmer.stem(keyword).to_string()]
        }
    }
}

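/// Full tokenization pipeline: whitespace and punctuation splitting,
/// `-term` negation, camel-case and compound-word splitting, stop-word
/// filtering, stemming, and order-preserving de-duplication. Exception
/// terms are additionally kept in unstemmed form.
///
/// Mirroring the unit tests below (not compiled as a doctest):
///
/// ```ignore
/// let tokens = tokenize("func ParseJSONToHTML5()");
/// // "func" is a programming stop word; the rest is split and stemmed.
/// assert!(tokens.contains(&"pars".to_string()));
/// assert!(tokens.contains(&"json".to_string()));
/// assert!(tokens.contains(&"html".to_string()));
/// ```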
pub fn tokenize(text: &str) -> Vec<String> {
    let stemmer = get_stemmer();
    let vocabulary = load_vocabulary();

    // Terms prefixed with '-' are negated: they are collected here and
    // excluded from the final token list.
    let mut negated_terms = HashSet::new();

    let mut tokens = Vec::new();
    for word in text.split_whitespace() {
        let is_negated = word.starts_with('-');

        let mut current_token = String::new();

        let mut chars = word.chars();
        if is_negated {
            chars.next(); // skip the leading '-'
        }

        for c in chars {
            if c.is_alphanumeric() {
                current_token.push(c);
            } else if !current_token.is_empty() {
                if is_negated {
                    negated_terms.insert(current_token.to_lowercase());
                }
                tokens.push(current_token);
                current_token = String::new();
            }
        }

        if !current_token.is_empty() {
            if is_negated {
                negated_terms.insert(current_token.to_lowercase());
            }
            tokens.push(current_token);
        }
    }

    // Deduplicate while preserving first-seen order.
    let mut processed_tokens = HashSet::new();
    let mut result = Vec::new();

    for token in tokens {
        let parts = split_camel_case(&token);

        for part in parts {
            let lowercase_part = part.to_lowercase();

            if is_stop_word(&lowercase_part) {
                continue;
            }

            if negated_terms.contains(&lowercase_part) {
                continue;
            }

            let compound_parts = split_compound_word(&lowercase_part, vocabulary);

            for compound_part in compound_parts {
                if is_stop_word(&compound_part) {
                    continue;
                }

                if negated_terms.contains(&compound_part) {
                    continue;
                }

                // Exception terms are kept in unstemmed form in addition to
                // the stemmed form pushed below.
                if is_exception_term(&compound_part)
                    && processed_tokens.insert(compound_part.clone())
                {
                    result.push(compound_part.clone());
                }

                let stemmed_part = stemmer.stem(&compound_part).to_string();
                if negated_terms.contains(&stemmed_part) {
                    continue;
                }

                if processed_tokens.insert(stemmed_part.clone()) {
                    result.push(stemmed_part);
                }
            }
        }
    }

    result
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_split_camel_case() {
        assert_eq!(split_camel_case("camelCase"), vec!["camel", "case"]);

        assert_eq!(split_camel_case("PascalCase"), vec!["pascal", "case"]);

        assert_eq!(
            split_camel_case("parseJSONToHTML5"),
            vec!["parse", "json", "to", "html", "5"]
        );

        assert_eq!(split_camel_case("APIDefinition"), vec!["api", "definition"]);

        assert_eq!(
            split_camel_case("OAuth2Provider"),
            vec!["oauth2", "provider"]
        );

        assert_eq!(split_camel_case("typeIgnore"), vec!["type", "ignore"]);

        assert_eq!(
            split_camel_case("migrateEndpointMetaByType"),
            vec!["migrate", "endpoint", "meta", "by", "type"]
        );
    }

    #[test]
    fn test_stop_words() {
        assert!(is_programming_stop_word("func"));
        assert!(is_programming_stop_word("type"));
        assert!(is_programming_stop_word("struct"));
        assert!(!is_programming_stop_word("migrate"));
        assert!(!is_programming_stop_word("endpoint"));
    }

    #[test]
    fn test_tokenize() {
        let tokens = tokenize("func (a *APIDefinition) MigrateEndpointMeta()");
        assert!(tokens.contains(&"api".to_string()));
        assert!(tokens.contains(&"definit".to_string()));
        assert!(tokens.contains(&"migrat".to_string()));
        assert!(
            tokens.contains(&"endpoint".to_string())
                || (tokens.contains(&"end".to_string()) && tokens.contains(&"point".to_string()))
        );
        assert!(tokens.contains(&"meta".to_string()));

        let tokens = tokenize("func ParseJSONToHTML5()");
        assert!(tokens.contains(&"pars".to_string()));
        assert!(tokens.contains(&"json".to_string()));
        assert!(tokens.contains(&"html".to_string()));

        let tokens = tokenize("typeIgnore typeWhitelist");
        assert!(tokens.contains(&"ignor".to_string()));

        let tokens = tokenize("whitelist blackmail firewall");
        assert!(tokens.contains(&"whitelist".to_string()));
        assert!(tokens.contains(&"black".to_string()));
        assert!(tokens.contains(&"mail".to_string()));
        assert!(tokens.contains(&"firewall".to_string()));

        let tokens = tokenize("enableFirewallWhitelist");
        assert!(tokens.contains(&"enabl".to_string()));
        assert!(tokens.contains(&"firewall".to_string()));
        assert!(tokens.contains(&"whitelist".to_string()));
    }

    #[test]
    fn test_compound_word_splitting() {
        let vocab = HashSet::from([
            "white".to_string(),
            "list".to_string(),
            "black".to_string(),
            "mail".to_string(),
        ]);

        let parts = split_compound_word("whitelist", &vocab);
        assert_eq!(parts, vec!["whitelist".to_string()]);

        let parts = split_compound_word("blackmail", &vocab);
        assert_eq!(parts, vec!["black".to_string(), "mail".to_string()]);

        let parts = split_compound_word("computer", &vocab);
        assert_eq!(parts, vec!["computer".to_string()]);
    }

    #[test]
    fn test_tokenize_with_compound_words() {
        let tokens = tokenize("whitelist blackmail firewall");

        assert!(tokens.contains(&"whitelist".to_string()));
        assert!(tokens.contains(&"black".to_string()));
        assert!(tokens.contains(&"mail".to_string()));
        assert!(tokens.contains(&"firewall".to_string()));
    }
}