probe_code/search/tokenization.rs

use decompound::{decompound, DecompositionOptions};
use once_cell::sync::Lazy;
use probe_code::ranking::get_stemmer;
use probe_code::search::term_exceptions::{is_exception_term, EXCEPTION_TERMS};
use std::collections::HashSet;
use std::sync::Mutex;

// Dynamic set of special terms that should not be tokenized
// This includes terms from queries with exact=true or excluded=true flags
static DYNAMIC_SPECIAL_TERMS: Lazy<Mutex<HashSet<String>>> =
    Lazy::new(|| Mutex::new(HashSet::new()));

/// Add a term to the dynamic special terms list
pub fn add_special_term(term: &str) {
    let mut special_terms = DYNAMIC_SPECIAL_TERMS.lock().unwrap();
    special_terms.insert(term.to_lowercase());

    // Debug output
    if std::env::var("DEBUG").unwrap_or_default() == "1" {
        println!("DEBUG: Added special term: {term}");
    }
}

/// Static set of common English stop words
static ENGLISH_STOP_WORDS: Lazy<HashSet<String>> = Lazy::new(|| {
    vec![
        "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any",
        "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below",
        "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't",
        "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for",
        "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he",
        "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
        "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is",
        "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my",
        "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought",
        "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd",
        "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's",
        "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these",
        "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to",
        "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're",
        "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's",
        "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would",
        "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself",
        "yourselves", "ing",
    ]
    .into_iter()
    .map(String::from)
    .collect()
});

/// Static set of programming language stop words
static PROGRAMMING_STOP_WORDS: Lazy<HashSet<String>> = Lazy::new(|| {
    vec![
        // Go-specific keywords
        "func", "type", "struct", "interface", "chan", "map", "go", "defer",
        // Common programming keywords
        "var", "let", "const", "return", "if", "else", "for", "while", "switch", "case",
        "break", "continue", "default", "try", "catch", "finally", "throw", "new", "super",
        "extends", "implements", "function", "class", "method", "this",
        // Common modifiers
        "public", "private", "protected", "static", "final", "async", "await",
        // Common types and declarations
        "string", "int", "bool", "float", "void", "null", "nil", "class", "enum", "impl", "fn",
        "mod",
    ]
    .into_iter()
    .map(String::from)
    .collect()
});

/// Static set of special case words that should be treated as single tokens
/// These are typically technology names, protocols, or common programming terms
/// with non-standard capitalization patterns
static SPECIAL_CASE_WORDS: Lazy<HashSet<String>> = Lazy::new(|| {
    vec![
        // Common technology terms with specific capitalization
        "oauth", "oauth2", "ipv4", "ipv6", "ipv", "graphql", "postgresql", "mysql", "mongodb",
        "javascript", "typescript", "nodejs", "reactjs", "vuejs", "angularjs", "github",
        "gitlab", "bitbucket", "kubernetes", "docker", "webpack", "rollup", "vite", "eslint",
        "prettier", "axios", "fetch", "grpc", "http2", "whitelist", "blacklist", "allowlist",
        "blocklist", "denylist",
    ]
    .into_iter()
    .map(String::from)
    .collect()
});

/// Returns true if the character is an ASCII uppercase letter
#[inline]
fn is_uppercase(c: char) -> bool {
    c.is_ascii_uppercase()
}

/// Returns true if the character is an ASCII lowercase letter
#[inline]
fn is_lowercase(c: char) -> bool {
    c.is_ascii_lowercase()
}

/// Returns true if the character is an ASCII digit
#[inline]
fn is_number(c: char) -> bool {
    c.is_ascii_digit()
}

/// Checks if a word is a special case that should be treated as a single token
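///
/// Illustrative sketch (not compiled as a doctest; `"mycorp"` is a made-up term used only to
/// show the dynamic list):
///
/// ```ignore
/// assert!(is_special_case("GraphQL")); // in the static special-case list
/// assert!(!is_special_case("mycorp")); // unknown term
/// add_special_term("mycorp");
/// assert!(is_special_case("MyCorp")); // now matched case-insensitively
/// ```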
pub fn is_special_case(word: &str) -> bool {
    // Convert to lowercase for case-insensitive comparison
    let lowercase = word.to_lowercase();

    // Check if the word is in the static special case list
    if SPECIAL_CASE_WORDS.contains(&lowercase) {
        return true;
    }

    // Check if the word is in the dynamic special terms list
    let special_terms = DYNAMIC_SPECIAL_TERMS.lock().unwrap();
    if special_terms.contains(&lowercase) {
        // Debug output
        if std::env::var("DEBUG").unwrap_or_default() == "1" {
            println!("DEBUG: Found dynamic special term: {lowercase}");
        }
        return true;
    }

    false
}

/// Splits a string on camel case boundaries
/// This function handles:
/// - camelCase -> ["camel", "case"]
/// - PascalCase -> ["pascal", "case"]
/// - acronyms and numbers -> ["parse", "json", "to", "html", "5"]
/// - special cases like OAuth2 -> ["oauth2"]
/// - also attempts to split lowercase identifiers that might have been camelCase originally
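///
/// Illustrative sketch, mirroring the unit tests below (not compiled as a doctest):
///
/// ```ignore
/// assert_eq!(split_camel_case("parseJSONToHTML5"), vec!["parse", "json", "to", "html", "5"]);
/// assert_eq!(split_camel_case("APIDefinition"), vec!["api", "definition"]);
/// assert_eq!(split_camel_case("OAuth2Provider"), vec!["oauth2", "provider"]);
/// ```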
pub fn split_camel_case(input: &str) -> Vec<String> {
    let _debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";

    if input.is_empty() {
        return vec![];
    }

    // Check if the input is a special case word
    if is_special_case(input) {
        return vec![input.to_lowercase()];
    }

    let lowercase = input.to_lowercase();

    // Special case for OAuth2Provider -> ["oauth2", "provider"]
    if lowercase.starts_with("oauth2") {
        let remaining = &input[6..]; // "oauth2".len() = 6
        if !remaining.is_empty() {
            let mut result = vec!["oauth2".to_string()];
            result.extend(split_camel_case(remaining));
            return result;
        }
    }

    // Get all special case words and sort by length (longest first)
    // This ensures that longer matches like "ipv4" are checked before shorter ones like "ipv"
    let mut special_cases: Vec<&String> = SPECIAL_CASE_WORDS.iter().collect();
    special_cases.sort_by_key(|b| std::cmp::Reverse(b.len()));

    // General special case handling: if the input starts with a known special case word,
    // keep that prefix whole and recurse on the rest
    for special_case in special_cases {
        if lowercase.starts_with(special_case) {
            // Find the corresponding part in the original input
            let _original_part = &input[0..special_case.len()];
            let remaining = &input[special_case.len()..];

            if !remaining.is_empty() {
                let mut result = vec![special_case.clone()];
                result.extend(split_camel_case(remaining));
                return result;
            }
        }
    }

    // If input is all lowercase, try to identify potential camelCase boundaries
    // This is for handling cases where the input was already lowercased
    if input == lowercase && !input.contains('_') && input.len() > 3 {
        // Use the exception terms from our centralized list
        let common_terms = EXCEPTION_TERMS
            .iter()
            .map(|s| s.as_str())
            .collect::<Vec<_>>();

        for term in common_terms {
            if input.contains(term) && term != input {
                let parts: Vec<&str> = input.split(term).collect();
                if parts.len() > 1 {
                    let mut result = Vec::new();
                    for (i, part) in parts.iter().enumerate() {
                        if !part.is_empty() {
                            result.push(part.to_string());
                        }
                        if i < parts.len() - 1 {
                            result.push(term.to_string());
                        }
                    }
                    if !result.is_empty() {
                        return result;
                    }
                }
            }
        }
    }

    let chars: Vec<char> = input.chars().collect();
    let mut result = Vec::new();
    let mut current_word = String::new();

    // State tracking
    let mut prev_is_lower = false;
    let mut prev_is_upper = false;
    let mut prev_is_digit = false;

    for (i, &c) in chars.iter().enumerate() {
        let is_upper = is_uppercase(c);
        let is_lower = is_lowercase(c);
        let is_digit = is_number(c);

        // Start a new word on:
        // 1. a lowercase-to-uppercase transition (camelCase)
        // 2. a digit boundary (entering or leaving a run of digits)
        // 3. the last uppercase letter of an acronym when it is followed by a lowercase
        //    letter (APIClient -> API, Client)
        let start_new_word =
            // Never split before the first character of the current word
            !current_word.is_empty() && (
                // Case 1: camelCase boundary
                (prev_is_lower && is_upper) ||
                // Case 2: digit boundary
                (prev_is_digit != is_digit) ||
                // Case 3: uppercase run followed by a lowercase letter
                (prev_is_upper && is_upper && i + 1 < chars.len() && is_lowercase(chars[i + 1]))
            );

        if start_new_word {
            result.push(current_word);
            current_word = String::new();
        }

        current_word.push(c);

        // Update state for next iteration
        prev_is_lower = is_lower;
        prev_is_upper = is_upper;
        prev_is_digit = is_digit;
    }

    // Add the last word
    if !current_word.is_empty() {
        result.push(current_word);
    }

    // Convert all parts to lowercase for consistency
    result.into_iter().map(|word| word.to_lowercase()).collect()
}

/// Checks if a word is a common English stop word or a simple number (0-10)
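///
/// Illustrative sketch (not compiled as a doctest):
///
/// ```ignore
/// assert!(is_english_stop_word("the"));
/// assert!(is_english_stop_word("7")); // small numbers 0-10 are treated as stop words
/// assert!(!is_english_stop_word("endpoint"));
/// ```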
pub fn is_english_stop_word(word: &str) -> bool {
    // Check if the word is a simple number (0-10)
    if let Ok(num) = word.parse::<u32>() {
        if num <= 10 {
            return true;
        }
    }

    ENGLISH_STOP_WORDS.contains(word)
}

/// Checks if a word is a programming language stop word
pub fn is_programming_stop_word(word: &str) -> bool {
    PROGRAMMING_STOP_WORDS.contains(word)
}

/// Checks if a word is either an English or programming stop word
pub fn is_stop_word(word: &str) -> bool {
    is_english_stop_word(word) || is_programming_stop_word(word)
}

/// Attempts to split a compound word into its constituent parts using a vocabulary
/// Returns the original word if it cannot be split
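///
/// Illustrative sketch, mirroring the unit tests below (not compiled as a doctest):
///
/// ```ignore
/// let vocab = HashSet::from(["black".to_string(), "mail".to_string()]);
/// // not a special case word, and both parts are in the vocabulary, so it is split
/// assert_eq!(split_compound_word("blackmail", &vocab), vec!["black", "mail"]);
/// // special case words are never split
/// assert_eq!(split_compound_word("whitelist", &vocab), vec!["whitelist"]);
/// ```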
pub fn split_compound_word(word: &str, vocab: &HashSet<String>) -> Vec<String> {
    // First check if this is a special case word that should never be split
    if is_special_case(word) {
        return vec![word.to_lowercase()];
    }

    // Use the exception terms from our centralized list
    let common_terms = EXCEPTION_TERMS
        .iter()
        .map(|s| s.as_str())
        .collect::<Vec<_>>();

    // If the word is in common_terms, don't split it
    if common_terms.contains(&word.to_lowercase().as_str()) {
        return vec![word.to_string()];
    }

    // Check if the word is in the vocabulary as a whole
    // This handles cases where the vocabulary contains both the compound word
    // and its constituent parts
    if vocab.contains(&word.to_lowercase()) {
        return vec![word.to_string()];
    }

    let is_valid_word = |w: &str| vocab.contains(&w.to_lowercase());

    match decompound(word, &is_valid_word, DecompositionOptions::empty()) {
        Ok(parts) if !parts.is_empty() => parts,
        _ => vec![word.to_string()],
    }
}

/// Loads a vocabulary for compound word splitting
/// This is a simplified version that could be expanded with a real dictionary
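///
/// Illustrative sketch (not compiled as a doctest):
///
/// ```ignore
/// let vocab = load_vocabulary();
/// assert!(vocab.contains("firewall"));
/// assert_eq!(split_compound_word("blackmail", vocab), vec!["black", "mail"]);
/// ```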
pub fn load_vocabulary() -> &'static HashSet<String> {
    static VOCABULARY: Lazy<HashSet<String>> = Lazy::new(|| {
        // This is a simplified vocabulary for demonstration
        // In a real application, this would be loaded from a file or database
        vec![
            // Common English words that might appear in compound words
            "white", "black", "list", "mail", "back", "ground", "book", "mark", "key", "word",
            "pass", "fire", "wall", "firewall", "water", "fall", "data", "base", "time", "stamp",
            "air", "port", "blue", "tooth", "green", "house", "red", "hat", "yellow", "pages",
            "blue", "print", "type", "script", "java", "script", "note", "pad", "web", "site",
            "page", "view", "code", "base", "name", "space", "class", "room", "work", "flow",
            "life", "cycle", "end", "point", "check", "box", "drop", "down", "pop", "up",
            "side", "bar", "tool", "tip", "drag", "drop", "click", "stream", "line", "dead",
            "lock", "race", "condition", "thread", "safe", "memory", "leak", "stack", "trace",
            "heap", "dump", "core", "file", "system", "disk", "drive", "hard", "soft", "ware",
            "firm", "middle", "front", "back", "end", "full", "stack", "dev", "ops", "micro",
            "service", "mono", "lith", "container", "docker", "pod", "cloud", "native", "server",
            "less", "function", "as", "service", "infra", "structure", "platform", "test",
            "driven", "behavior", "continuous", "integration", "deployment", "delivery",
            "pipeline", "git", "hub", "lab", "version", "control", "branch", "merge", "pull",
            "request", "commit", "push", "clone", "fork", "repository", "issue", "bug",
            "feature", "release", "tag", "semantic", "versioning", "major", "minor", "patch",
            "alpha", "beta", "stable", "unstable", "deprecated", "legacy", "modern", "framework",
            "library", "package", "module", "component", "prop", "state", "hook", "effect",
            "context", "provider", "consumer", "reducer", "action", "store", "dispatch",
            "subscribe", "publish", "event", "handler", "listener", "callback", "promise",
            "async", "await", "future", "stream", "observable", "reactive", "functional",
            "object", "oriented", "procedural", "declarative", "imperative", "mutable",
            "immutable", "pure", "side", "effect", "higher", "order", "first", "class",
            "citizen", "closure", "scope", "lexical", "dynamic", "static", "type", "inference",
            "checking", "compile", "time", "run", "error", "exception", "try", "catch",
            "finally", "throw", "raise", "handle", "logging", "debug", "info", "warn", "error",
            "fatal", "trace", "metric", "monitor", "alert", "notification", "dashboard",
            "report", "analytics", "insight", "data", "science", "machine", "learning",
            "artificial", "intelligence", "neural", "network", "deep", "reinforcement",
            "supervised", "unsupervised", "classification", "regression", "clustering",
            "recommendation", "prediction", "inference", "training", "validation", "test",
            "accuracy", "precision", "recall", "f1", "score", "loss", "function", "gradient",
            "descent", "back", "propagation", "forward", "pass", "epoch", "batch", "mini",
            "over", "fitting", "under", "regularization", "dropout", "batch", "normalization",
            "activation", "sigmoid", "tanh", "relu", "leaky", "softmax", "convolution",
            "pooling", "recurrent", "lstm", "gru", "transformer", "attention", "encoder",
            "decoder", "embedding", "token", "tokenization", "stemming", "lemmatization",
            "stop", "word", "n", "gram", "tf", "idf", "cosine", "similarity", "euclidean",
            "distance", "manhattan", "jaccard", "index", "precision", "recall", "relevance",
            "ranking", "page", "rank", "search", "engine", "crawler", "indexer", "query",
            "result", "snippet", "cache", "hit", "miss", "eviction", "policy", "lru", "fifo",
            "lifo", "priority", "queue", "stack", "heap", "tree", "binary", "balanced", "avl",
            "red", "black", "b", "trie", "hash", "map", "set", "list", "linked", "doubly",
            "circular", "array", "vector", "matrix", "tensor", "graph", "directed",
            "undirected", "weighted", "unweighted", "adjacency", "matrix", "list", "edge",
            "vertex", "node", "path", "cycle", "traversal", "breadth", "first", "depth",
            "topological", "sort", "minimum", "spanning", "tree", "shortest", "path",
            "dijkstra", "bellman", "ford", "floyd", "warshall", "kruskal", "prim", "greedy",
            "dynamic", "programming", "divide", "conquer", "backtracking", "branch", "bound",
            "heuristic", "approximation", "randomized", "parallel", "concurrent", "distributed",
            "synchronous", "asynchronous", "blocking", "non", "mutex", "semaphore", "lock",
            "atomic", "volatile", "transaction", "acid", "consistency", "isolation",
            "durability", "serializable", "repeatable", "read", "committed", "uncommitted",
            "phantom", "dirty", "read", "write", "skew", "conflict", "resolution", "optimistic",
            "pessimistic", "two", "phase", "commit", "rollback", "savepoint", "checkpoint",
            "recovery", "backup", "restore", "archive", "log", "journal", "redo", "undo",
            "write", "ahead", "logging", "snapshot", "isolation", "level",
        ]
        .into_iter()
        .map(String::from)
        .collect()
    });

    &VOCABULARY
}

/// Tokenize and stem a keyword, handling camel case and compound word splitting
/// This function is used by the elastic query parser to process terms in the AST
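///
/// Illustrative sketch (not compiled as a doctest; exact stems depend on the stemmer):
///
/// ```ignore
/// // camelCase input is split, stop words are dropped, and each part is stemmed
/// assert_eq!(tokenize_and_stem("parseJSON"), vec!["pars", "json"]);
/// ```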
#[allow(dead_code)]
pub fn tokenize_and_stem(keyword: &str) -> Vec<String> {
    let stemmer = get_stemmer();
    let vocabulary = load_vocabulary();

    // First try camel case splitting
    let camel_parts = split_camel_case(keyword);

    if camel_parts.len() > 1 {
        // Return stemmed camel case parts, filtering out stop words
        camel_parts
            .into_iter()
            .filter(|part| !is_stop_word(part))
            .map(|part| stemmer.stem(&part).to_string())
            .collect()
    } else {
        // Try compound word splitting
        let compound_parts = split_compound_word(keyword, vocabulary);

        if compound_parts.len() > 1 {
            // Return stemmed compound parts, filtering out stop words
            compound_parts
                .into_iter()
                .filter(|part| !is_stop_word(part))
                .map(|part| stemmer.stem(&part).to_string())
                .collect()
        } else {
            // Just stem the original keyword
            vec![stemmer.stem(keyword).to_string()]
        }
    }
}

/// Tokenizes text into words by splitting on whitespace and non-alphanumeric characters,
/// removes stop words, and applies stemming. Also splits camelCase/PascalCase identifiers
/// and compound words.
///
/// The tokenization flow follows these steps:
/// 1. Split the input text on whitespace
/// 2. Further split each token on non-alphanumeric characters (keeping track of a leading "-")
/// 3. Split each resulting token using camel case rules (applied even to all-lowercase tokens)
/// 4. Attempt to split each part as a compound word
/// 5. Drop stop words and apply stemming to each part
/// 6. Collect unique tokens
/// 7. Exclude terms that were negated with a "-" prefix
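///
/// Illustrative sketch, mirroring the unit tests below (not compiled as a doctest):
///
/// ```ignore
/// let tokens = tokenize("func ParseJSONToHTML5()");
/// // "func" is a programming stop word and "5" is a small number, so both are dropped;
/// // "Parse" is stemmed to "pars"
/// assert!(tokens.contains(&"pars".to_string()));
/// assert!(tokens.contains(&"json".to_string()));
/// assert!(tokens.contains(&"html".to_string()));
/// ```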
pub fn tokenize(text: &str) -> Vec<String> {
    let stemmer = get_stemmer();
    let vocabulary = load_vocabulary();

    // Track negated terms to exclude them from the final result
    let mut negated_terms = HashSet::new();

    // Split by whitespace and collect words
    let mut tokens = Vec::new();
    for word in text.split_whitespace() {
        // Check if this is a negated term
        let is_negated = word.starts_with('-');

        // Further split by non-alphanumeric characters
        let mut current_token = String::new();

        // Process the characters, skipping the leading "-" if this is a negated term
        let mut chars = word.chars();
        if is_negated {
            // Skip the leading "-"
            chars.next();
        }

        for c in chars {
            if c.is_alphanumeric() {
                current_token.push(c);
            } else if !current_token.is_empty() {
                // We found a non-alphanumeric character, so close out the current token
                if is_negated {
                    // Track this as a negated term
                    negated_terms.insert(current_token.to_lowercase());
                }
                tokens.push(current_token);
                current_token = String::new();
            }
        }

        // Add the last token if not empty
        if !current_token.is_empty() {
            if is_negated {
                // Track this as a negated term
                negated_terms.insert(current_token.to_lowercase());
            }
            tokens.push(current_token);
        }
    }

    // Create a set to track unique tokens after processing
    let mut processed_tokens = HashSet::new();
    let mut result = Vec::new();

    // Process each token: filter stop words, apply stemming, and add to result if unique
    for token in tokens {
        // Always try to split using camel case rules, even for lowercase tokens
        // This allows us to handle tokens that were already lowercased
        let parts = split_camel_case(&token);

        // Process each part
        for part in parts {
            let lowercase_part = part.to_lowercase();

            // Skip both English and programming stop words
            if is_stop_word(&lowercase_part) {
                continue;
            }

            // Skip if this is a negated term
            if negated_terms.contains(&lowercase_part) {
                continue;
            }

            // Try to split compound words
            let compound_parts = split_compound_word(&lowercase_part, vocabulary);

            for compound_part in compound_parts {
                // Skip stop words in compound parts
                if is_stop_word(&compound_part) {
                    continue;
                }

                // Skip if this is a negated term
                if negated_terms.contains(&compound_part) {
                    continue;
                }

                // Preserve the original form for all exception terms
                if is_exception_term(&compound_part)
                    && processed_tokens.insert(compound_part.clone())
                {
                    result.push(compound_part.clone());
                }

                // Also add the stemmed part if it's unique
                let stemmed_part = stemmer.stem(&compound_part).to_string();
                // Skip if the stemmed version is a negated term
                if negated_terms.contains(&stemmed_part) {
                    continue;
                }

                if processed_tokens.insert(stemmed_part.clone()) {
                    result.push(stemmed_part);
                }
            }
        }
    }

    result
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_split_camel_case() {
        // Test basic camel case
        assert_eq!(split_camel_case("camelCase"), vec!["camel", "case"]);

        // Test pascal case
        assert_eq!(split_camel_case("PascalCase"), vec!["pascal", "case"]);

        // Test acronyms
        assert_eq!(
            split_camel_case("parseJSONToHTML5"),
            vec!["parse", "json", "to", "html", "5"]
        );

        // Test consecutive uppercase letters
        assert_eq!(split_camel_case("APIDefinition"), vec!["api", "definition"]);

        // Test special case with OAuth2
        assert_eq!(
            split_camel_case("OAuth2Provider"),
            vec!["oauth2", "provider"]
        );

        // Test mixed case with type prefix
        assert_eq!(split_camel_case("typeIgnore"), vec!["type", "ignore"]);

        // Test complex identifiers
        assert_eq!(
            split_camel_case("migrateEndpointMetaByType"),
            vec!["migrate", "endpoint", "meta", "by", "type"]
        );
    }

    #[test]
    fn test_stop_words() {
        assert!(is_programming_stop_word("func"));
        assert!(is_programming_stop_word("type"));
        assert!(is_programming_stop_word("struct"));
        assert!(!is_programming_stop_word("migrate"));
        assert!(!is_programming_stop_word("endpoint"));
    }

    #[test]
    fn test_tokenize() {
        // Test method with API acronym
        let tokens = tokenize("func (a *APIDefinition) MigrateEndpointMeta()");
        assert!(tokens.contains(&"api".to_string()));
        assert!(tokens.contains(&"definit".to_string())); // stemmed "definition"
        assert!(tokens.contains(&"migrat".to_string())); // stemmed "migrate"
        // With compound word splitting, "endpoint" might be split into "end" and "point",
        // so we check for both possibilities
        assert!(
            tokens.contains(&"endpoint".to_string())
                || (tokens.contains(&"end".to_string()) && tokens.contains(&"point".to_string()))
        );
        assert!(tokens.contains(&"meta".to_string()));

        // Test complex identifier with acronyms and numbers
        let tokens = tokenize("func ParseJSONToHTML5()");
        assert!(tokens.contains(&"pars".to_string())); // stemmed "parse"
        assert!(tokens.contains(&"json".to_string()));
        assert!(tokens.contains(&"html".to_string()));
        // Numbers 0-10 are now treated as stop words, so we don't expect "5" to be included
        // assert!(tokens.contains(&"5".to_string()));

        // Test mixed case with type prefix
        let tokens = tokenize("typeIgnore typeWhitelist");
        assert!(tokens.contains(&"ignor".to_string())); // stemmed "ignore"

        // Test compound word splitting
        let tokens = tokenize("whitelist blackmail firewall");
        // "whitelist" is now a special case word that should not be split
        assert!(tokens.contains(&"whitelist".to_string()));
        assert!(tokens.contains(&"black".to_string()));
        assert!(tokens.contains(&"mail".to_string()));
        assert!(tokens.contains(&"firewall".to_string()));

        // Test compound word in camelCase
        let tokens = tokenize("enableFirewallWhitelist");
        assert!(tokens.contains(&"enabl".to_string())); // stemmed "enable"
        assert!(tokens.contains(&"firewall".to_string())); // firewall is kept as a whole
        // "whitelist" is now a special case word that should not be split
        assert!(tokens.contains(&"whitelist".to_string()));
    }

    #[test]
    fn test_compound_word_splitting() {
        // Test basic compound word splitting
        let vocab = HashSet::from([
            "white".to_string(),
            "list".to_string(),
            "black".to_string(),
            "mail".to_string(),
        ]);

        // "whitelist" is now a special case word that should not be split
        let parts = split_compound_word("whitelist", &vocab);
        assert_eq!(parts, vec!["whitelist".to_string()]);

        // "blackmail" is not in the special case list, so it should still be split
        let parts = split_compound_word("blackmail", &vocab);
        assert_eq!(parts, vec!["black".to_string(), "mail".to_string()]);

        // Test word that can't be split
        let parts = split_compound_word("computer", &vocab);
        assert_eq!(parts, vec!["computer".to_string()]);
    }

    #[test]
    fn test_tokenize_with_compound_words() {
        // Test tokenization with compound word splitting
        let tokens = tokenize("whitelist blackmail firewall");

        // "whitelist" is now a special case word that should not be split
        assert!(tokens.contains(&"whitelist".to_string()));
        // "blackmail" is not in the special case list, so it should still be split
        assert!(tokens.contains(&"black".to_string()));
        assert!(tokens.contains(&"mail".to_string()));
        assert!(tokens.contains(&"firewall".to_string()));
    }
}