1use crate::constants::{IC_ADJECTIVE, IC_NOUN, IC_VERB};
29use chrono::{DateTime, Datelike, NaiveDate, Utc};
30use rust_stemmers::{Algorithm, Stemmer};
31use serde::{Deserialize, Serialize};
32use std::collections::HashSet;
33
/// Coarse part-of-speech bucket assigned to each token during chunking.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PosTag {
    /// Common noun.
    Noun,
    /// Verb.
    Verb,
    /// Adjective.
    Adjective,
    /// Capitalized mid-sentence word, treated as a proper noun.
    ProperNoun,
    /// High-frequency function word; excluded from tagged output.
    StopWord,
    /// Matched none of the other heuristics.
    Other,
}
51
/// A single token with its stem and heuristic part-of-speech tag.
#[derive(Debug, Clone)]
pub struct TaggedWord {
    /// Original token text (casing preserved).
    pub text: String,
    /// Stemmed, lowercased form of the token.
    pub stem: String,
    /// Heuristic part-of-speech tag.
    pub pos: PosTag,
    /// Index of the token within its sentence's token list
    /// (tokens skipped for being too short still occupy positions).
    pub position: usize,
}
61
/// One sentence of input text together with its tagged content words.
#[derive(Debug, Clone)]
pub struct SentenceChunk {
    /// The full sentence text.
    pub text: String,
    /// Zero-based index of the sentence within the source text.
    pub sentence_idx: usize,
    /// Tagged non-stop-word tokens of the sentence.
    pub words: Vec<TaggedWord>,
}
72
73impl SentenceChunk {
74 pub fn nouns(&self) -> Vec<&TaggedWord> {
76 self.words
77 .iter()
78 .filter(|w| matches!(w.pos, PosTag::Noun | PosTag::ProperNoun))
79 .collect()
80 }
81
82 pub fn verbs(&self) -> Vec<&TaggedWord> {
84 self.words
85 .iter()
86 .filter(|w| w.pos == PosTag::Verb)
87 .collect()
88 }
89
90 pub fn adjectives(&self) -> Vec<&TaggedWord> {
92 self.words
93 .iter()
94 .filter(|w| w.pos == PosTag::Adjective)
95 .collect()
96 }
97
98 pub fn content_words(&self) -> Vec<&TaggedWord> {
100 self.words
101 .iter()
102 .filter(|w| {
103 matches!(
104 w.pos,
105 PosTag::Noun | PosTag::ProperNoun | PosTag::Verb | PosTag::Adjective
106 )
107 })
108 .collect()
109 }
110
111 pub fn cooccurrence_pairs(&self) -> Vec<(&str, &str)> {
114 let content = self.content_words();
115 let mut pairs = Vec::new();
116
117 for i in 0..content.len() {
118 for j in (i + 1)..content.len() {
119 pairs.push((content[i].stem.as_str(), content[j].stem.as_str()));
120 }
121 }
122
123 pairs
124 }
125}
126
/// Aggregate result of `extract_chunks`: per-sentence chunks plus
/// de-duplicated vocabulary sets.
#[derive(Debug, Clone)]
pub struct ChunkExtraction {
    /// One chunk per sentence that produced at least one tagged word.
    pub chunks: Vec<SentenceChunk>,
    /// Stems tagged as nouns (proper-noun stems are also added here).
    pub unique_nouns: HashSet<String>,
    /// Stems tagged as verbs.
    pub unique_verbs: HashSet<String>,
    /// Stems tagged as adjectives.
    pub unique_adjectives: HashSet<String>,
    /// Proper nouns kept in their original (capitalized) surface form.
    pub proper_nouns: HashSet<String>,
}
141
142impl ChunkExtraction {
143 pub fn all_content_stems(&self) -> HashSet<String> {
145 let mut all = self.unique_nouns.clone();
146 all.extend(self.unique_verbs.clone());
147 all.extend(self.unique_adjectives.clone());
148 all.extend(self.proper_nouns.clone());
149 all
150 }
151
152 pub fn all_cooccurrence_pairs(&self) -> Vec<(String, String)> {
154 let mut all_pairs = Vec::new();
155 for chunk in &self.chunks {
156 for (w1, w2) in chunk.cooccurrence_pairs() {
157 all_pairs.push((w1.to_string(), w2.to_string()));
158 }
159 }
160 all_pairs
161 }
162}
163
/// A single date reference recovered from free text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TemporalRef {
    /// The resolved calendar date.
    pub date: NaiveDate,
    /// The text span the date was parsed from.
    pub original_text: String,
    /// Extraction confidence: 0.9 for explicit regex matches, 0.8 for a
    /// whole-text parse, 0.7 for per-phrase parses.
    pub confidence: f32,
    /// Position of the reference. NOTE: mixed units — a byte offset for
    /// regex matches, a phrase index for phrase parses, 0 for whole-text.
    pub position: usize,
    /// How the reference was phrased (absolute, relative, …).
    pub ref_type: TemporalRefType,
}
185
/// Classification of how a temporal reference was phrased.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum TemporalRefType {
    /// Explicit calendar date ("March 5, 2021", "2021-03-05").
    Absolute,
    /// Phrased relative to now ("yesterday", "two weeks ago", "last …").
    Relative,
    /// Named weekday ("on Friday").
    DayOfWeek,
    /// Month mentioned without any digit ("in June").
    Month,
    /// Bare four-digit year ("2019").
    Year,
}
200
/// All temporal references found in a piece of text, with the overall span.
#[derive(Debug, Clone, Default)]
pub struct TemporalExtraction {
    /// Extracted references, sorted by their `position` field.
    pub refs: Vec<TemporalRef>,
    /// Earliest referenced date, if any.
    pub earliest: Option<NaiveDate>,
    /// Latest referenced date, if any.
    pub latest: Option<NaiveDate>,
}
211
212impl TemporalExtraction {
213 pub fn has_temporal_refs(&self) -> bool {
215 !self.refs.is_empty()
216 }
217
218 pub fn date_range(&self) -> Option<(NaiveDate, NaiveDate)> {
220 match (self.earliest, self.latest) {
221 (Some(e), Some(l)) => Some((e, l)),
222 (Some(e), None) => Some((e, e)),
223 (None, Some(l)) => Some((l, l)),
224 (None, None) => None,
225 }
226 }
227}
228
229pub fn extract_temporal_refs(text: &str) -> TemporalExtraction {
238 let now = Utc::now();
239 let mut refs = Vec::new();
240 let mut earliest: Option<NaiveDate> = None;
241 let mut latest: Option<NaiveDate> = None;
242
243 let is_valid_date = |date: &NaiveDate| -> bool {
245 let year = date.year();
246 year >= 1900 && year <= 2100
247 };
248
249 if let Ok(parsed) = dateparser::parse(text) {
251 let date = parsed.date_naive();
252 if is_valid_date(&date) {
253 refs.push(TemporalRef {
254 date,
255 original_text: text.to_string(),
256 confidence: 0.8,
257 position: 0,
258 ref_type: classify_temporal_ref(text, &date, &now),
259 });
260 update_bounds(&mut earliest, &mut latest, date);
261 }
262 }
263
264 for (pos, sentence) in split_temporal_phrases(text).iter().enumerate() {
266 if let Ok(parsed) = dateparser::parse(sentence) {
267 let date = parsed.date_naive();
268 if !is_valid_date(&date) {
269 continue;
270 }
271 if refs.iter().any(|r| r.date == date) {
272 continue;
273 }
274 refs.push(TemporalRef {
275 date,
276 original_text: sentence.to_string(),
277 confidence: 0.7,
278 position: pos,
279 ref_type: classify_temporal_ref(sentence, &date, &now),
280 });
281 update_bounds(&mut earliest, &mut latest, date);
282 }
283 }
284
285 let explicit_dates = extract_explicit_dates(text);
287 for (date, original, pos) in explicit_dates {
288 if !is_valid_date(&date) {
289 continue;
290 }
291 if refs.iter().any(|r| r.date == date) {
292 continue;
293 }
294 refs.push(TemporalRef {
295 date,
296 original_text: original,
297 confidence: 0.9,
298 position: pos,
299 ref_type: TemporalRefType::Absolute,
300 });
301 update_bounds(&mut earliest, &mut latest, date);
302 }
303
304 refs.sort_by_key(|r| r.position);
306
307 TemporalExtraction {
308 refs,
309 earliest,
310 latest,
311 }
312}
313
314fn classify_temporal_ref(text: &str, date: &NaiveDate, now: &DateTime<Utc>) -> TemporalRefType {
316 let text_lower = text.to_lowercase();
317 let today = now.date_naive();
318
319 if text_lower.contains("yesterday")
321 || text_lower.contains("ago")
322 || text_lower.contains("last")
323 || text_lower.contains("previous")
324 || text_lower.contains("before")
325 || text_lower.contains("earlier")
326 {
327 return TemporalRefType::Relative;
328 }
329
330 let days = [
332 "monday",
333 "tuesday",
334 "wednesday",
335 "thursday",
336 "friday",
337 "saturday",
338 "sunday",
339 ];
340 if days.iter().any(|d| text_lower.contains(d)) {
341 return TemporalRefType::DayOfWeek;
342 }
343
344 let months = [
346 "january",
347 "february",
348 "march",
349 "april",
350 "may",
351 "june",
352 "july",
353 "august",
354 "september",
355 "october",
356 "november",
357 "december",
358 ];
359 let has_month = months.iter().any(|m| text_lower.contains(m));
360 let has_day = text.chars().any(|c| c.is_ascii_digit());
361
362 if has_month && !has_day {
363 return TemporalRefType::Month;
364 }
365
366 if text.len() == 4 && text.chars().all(|c| c.is_ascii_digit()) {
368 return TemporalRefType::Year;
369 }
370
371 let diff = (today - *date).num_days().abs();
373 if diff <= 7 && text_lower.contains("this") {
374 return TemporalRefType::Relative;
375 }
376
377 TemporalRefType::Absolute
378}
379
/// Split `text` into candidate phrases that might each contain a date.
///
/// The first marker (in `MARKERS` order) that actually splits the text
/// wins; if no marker splits it, or every fragment is too short, the text
/// is re-split on periods instead. Fragments of 3 bytes or fewer are
/// dropped — too short to hold a date.
///
/// Fixes over the previous version: the input was copied into a `String`
/// (`text.to_string()`) that was never mutated — splitting now borrows
/// `text` directly — and the redundant `!is_empty()` check (implied by
/// `len() > 3`) is gone. Behavior is unchanged, including the quirk that
/// a successful split whose fragments are all filtered out still falls
/// back to the period split.
fn split_temporal_phrases(text: &str) -> Vec<String> {
    const MARKERS: [&str; 13] = [
        " on ", " in ", " at ", " during ", " since ", " until ", " before ", " after ",
        " around ", ", ", ". ", "! ", "? ",
    ];

    // Keep only fragments long enough to plausibly contain a date.
    let keep = |part: &str| {
        let trimmed = part.trim();
        (trimmed.len() > 3).then(|| trimmed.to_string())
    };

    let mut phrases: Vec<String> = Vec::new();
    for marker in MARKERS {
        let parts: Vec<&str> = text.split(marker).collect();
        if parts.len() > 1 {
            phrases = parts.into_iter().filter_map(keep).collect();
            break;
        }
    }

    // Fallback: naive sentence split on periods.
    if phrases.is_empty() {
        phrases = text.split('.').filter_map(keep).collect();
    }

    phrases
}
416
417fn extract_explicit_dates(text: &str) -> Vec<(NaiveDate, String, usize)> {
419 use regex::Regex;
420
421 let mut results = Vec::new();
422
423 let month_day_year =
425 Regex::new(r"(?i)(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),?\s+(\d{4})")
426 .unwrap();
427
428 for cap in month_day_year.captures_iter(text) {
429 let month_str = &cap[1];
430 let day: u32 = cap[2].parse().unwrap_or(1);
431 let year: i32 = cap[3].parse().unwrap_or(2000);
432 let month = month_to_num(month_str);
433
434 if let Some(date) = NaiveDate::from_ymd_opt(year, month, day) {
435 let pos = cap.get(0).map(|m| m.start()).unwrap_or(0);
436 results.push((date, cap[0].to_string(), pos));
437 }
438 }
439
440 let iso_date = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
442 for cap in iso_date.captures_iter(text) {
443 let year: i32 = cap[1].parse().unwrap_or(2000);
444 let month: u32 = cap[2].parse().unwrap_or(1);
445 let day: u32 = cap[3].parse().unwrap_or(1);
446
447 if let Some(date) = NaiveDate::from_ymd_opt(year, month, day) {
448 let pos = cap.get(0).map(|m| m.start()).unwrap_or(0);
449 results.push((date, cap[0].to_string(), pos));
450 }
451 }
452
453 let slash_date = Regex::new(r"(\d{1,2})/(\d{1,2})/(\d{4})").unwrap();
455 for cap in slash_date.captures_iter(text) {
456 let month: u32 = cap[1].parse().unwrap_or(1);
457 let day: u32 = cap[2].parse().unwrap_or(1);
458 let year: i32 = cap[3].parse().unwrap_or(2000);
459
460 if let Some(date) = NaiveDate::from_ymd_opt(year, month, day) {
461 let pos = cap.get(0).map(|m| m.start()).unwrap_or(0);
462 results.push((date, cap[0].to_string(), pos));
463 }
464 }
465
466 results
467}
468
/// Map an English month name or common abbreviation (any casing) to its
/// 1-based month number. Unrecognized input falls back to 1 (January).
fn month_to_num(month: &str) -> u32 {
    const TABLE: &[(&str, u32)] = &[
        ("january", 1), ("jan", 1),
        ("february", 2), ("feb", 2),
        ("march", 3), ("mar", 3),
        ("april", 4), ("apr", 4),
        ("may", 5),
        ("june", 6), ("jun", 6),
        ("july", 7), ("jul", 7),
        ("august", 8), ("aug", 8),
        ("september", 9), ("sep", 9), ("sept", 9),
        ("october", 10), ("oct", 10),
        ("november", 11), ("nov", 11),
        ("december", 12), ("dec", 12),
    ];

    let needle = month.to_lowercase();
    TABLE
        .iter()
        .find(|(name, _)| *name == needle)
        .map(|&(_, num)| num)
        .unwrap_or(1)
}
487
488fn update_bounds(
490 earliest: &mut Option<NaiveDate>,
491 latest: &mut Option<NaiveDate>,
492 date: NaiveDate,
493) {
494 match earliest {
495 Some(e) if date < *e => *earliest = Some(date),
496 None => *earliest = Some(date),
497 _ => {}
498 }
499 match latest {
500 Some(l) if date > *l => *latest = Some(date),
501 None => *latest = Some(date),
502 _ => {}
503 }
504}
505
/// Temporal flavor detected in a user query.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TemporalIntent {
    /// Asks *when* something happened ("when did…", "what date…").
    WhenQuestion,
    /// Mentions a specific time ("yesterday", "last week", "in June").
    SpecificTime,
    /// Asks about the order of events ("which came first…").
    Ordering,
    /// Asks about a span of time ("how long…", "how many days…").
    Duration,
    /// No temporal signal detected.
    None,
}
525
526pub fn detect_temporal_intent(query: &str) -> TemporalIntent {
528 let query_lower = query.to_lowercase();
529
530 if query_lower.starts_with("when")
532 || query_lower.contains(" when ")
533 || query_lower.contains("what date")
534 || query_lower.contains("what day")
535 || query_lower.contains("what time")
536 {
537 return TemporalIntent::WhenQuestion;
538 }
539
540 if query_lower.contains("how long")
542 || query_lower.contains("how many days")
543 || query_lower.contains("how many weeks")
544 || query_lower.contains("how many months")
545 || query_lower.contains("how many years")
546 {
547 return TemporalIntent::Duration;
548 }
549
550 if query_lower.contains("before or after")
552 || query_lower.contains("first or")
553 || query_lower.contains("earlier or later")
554 || query_lower.contains("which came first")
555 || query_lower.contains("in what order")
556 {
557 return TemporalIntent::Ordering;
558 }
559
560 let time_indicators = [
562 "yesterday",
563 "today",
564 "last week",
565 "last month",
566 "last year",
567 "this week",
568 "this month",
569 "this year",
570 "in january",
571 "in february",
572 "in march",
573 "in april",
574 "in may",
575 "in june",
576 "in july",
577 "in august",
578 "in september",
579 "in october",
580 "in november",
581 "in december",
582 "on monday",
583 "on tuesday",
584 "on wednesday",
585 "on thursday",
586 "on friday",
587 "on saturday",
588 "on sunday",
589 " ago",
590 " days ago",
591 " weeks ago",
592 " months ago",
593 " years ago",
594 ];
595
596 if time_indicators.iter().any(|t| query_lower.contains(t)) {
597 return TemporalIntent::SpecificTime;
598 }
599
600 let extraction = extract_temporal_refs(query);
602 if extraction.has_temporal_refs() {
603 return TemporalIntent::SpecificTime;
604 }
605
606 TemporalIntent::None
607}
608
609pub fn requires_temporal_filtering(query: &str) -> bool {
621 let intent = detect_temporal_intent(query);
622 matches!(
623 intent,
624 TemporalIntent::SpecificTime | TemporalIntent::Duration | TemporalIntent::Ordering
626 )
627}
628
629pub fn asks_for_temporal_answer(query: &str) -> bool {
634 matches!(detect_temporal_intent(query), TemporalIntent::WhenQuestion)
635}
636
/// High-level classification of a user query, used to pick a retrieval
/// strategy.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum QueryType {
    /// Lookup of a specific attribute of a specific entity.
    Attribute(AttributeQuery),
    /// Asks for a date/time answer.
    Temporal,
    /// Open-ended query with no specific structure detected.
    Exploratory,
}
658
/// Parsed form of an attribute-lookup query such as "what is X's Y?".
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AttributeQuery {
    /// The entity being asked about, first letter capitalized.
    pub entity: String,
    /// Normalized attribute name (lowercase, '_'-separated).
    pub attribute: String,
    /// Alternative words likely to appear near the attribute in source text.
    pub attribute_synonyms: Vec<String>,
    /// The query text the detecting pattern matched against.
    pub original_query: String,
}
671
672pub fn classify_query(query: &str) -> QueryType {
674 if let Some(attr_query) = detect_attribute_query(query) {
676 return QueryType::Attribute(attr_query);
677 }
678
679 if asks_for_temporal_answer(query) {
681 return QueryType::Temporal;
682 }
683
684 QueryType::Exploratory
686}
687
/// Try to interpret `query` as an attribute lookup ("what is X's Y?").
///
/// Patterns are tried in order: possessive phrasing, "the Y of X"
/// phrasing, "where does/is X" location questions, "how old" age
/// questions, and "is X <status>" relationship questions. Returns `None`
/// when nothing matches. Matching runs against a lowercased copy with
/// surrounding whitespace and a trailing '?' stripped.
pub fn detect_attribute_query(query: &str) -> Option<AttributeQuery> {
    let query_lower = query.to_lowercase();
    // Normalized form used for pattern matching.
    let query_trimmed = query_lower.trim().trim_end_matches('?');

    // "what is X's Y" / "what's X' Y"
    if let Some(result) = extract_possessive_pattern(query_trimmed) {
        return Some(result);
    }

    // "what is the Y of X"
    if let Some(result) = extract_of_pattern(query_trimmed) {
        return Some(result);
    }

    // "where does/is X …" → location attribute with canned synonyms.
    if query_lower.starts_with("where does") || query_lower.starts_with("where is") {
        if let Some(entity) = extract_entity_after_verb(query_trimmed) {
            return Some(AttributeQuery {
                entity,
                attribute: "location".to_string(),
                attribute_synonyms: vec![
                    "live".to_string(),
                    "lives".to_string(),
                    "living".to_string(),
                    "resides".to_string(),
                    "located".to_string(),
                    "address".to_string(),
                    "home".to_string(),
                    "place".to_string(),
                ],
                original_query: query.to_string(),
            });
        }
    }

    // "how old is X" → age attribute.
    if query_lower.starts_with("how old") {
        if let Some(entity) = extract_entity_after_verb(query_trimmed) {
            return Some(AttributeQuery {
                entity,
                attribute: "age".to_string(),
                attribute_synonyms: vec![
                    "age".to_string(),
                    "years old".to_string(),
                    "born".to_string(),
                    "birthday".to_string(),
                ],
                original_query: query.to_string(),
            });
        }
    }

    // "is X married/single/…" → relationship-status attribute.
    if query_lower.starts_with("is ") {
        let status_words = [
            "married",
            "single",
            "divorced",
            "engaged",
            "dating",
            "in a relationship",
        ];
        for status in &status_words {
            if query_lower.contains(status) {
                // Skip the leading "is " (3 bytes; safe here since the
                // prefix is ASCII) — the entity is whatever sits between
                // "is " and the status word.
                let after_is = &query_trimmed[3..];
                if let Some(pos) = after_is.find(status) {
                    let entity = after_is[..pos].trim().to_string();
                    // Require a non-empty entity that starts with a letter
                    // to avoid matching stray punctuation or digits.
                    if !entity.is_empty()
                        && entity.chars().next().map_or(false, |c| c.is_alphabetic())
                    {
                        return Some(AttributeQuery {
                            entity: capitalize_first(&entity),
                            attribute: "relationship_status".to_string(),
                            attribute_synonyms: vec![
                                "single".to_string(),
                                "married".to_string(),
                                "divorced".to_string(),
                                "engaged".to_string(),
                                "dating".to_string(),
                                "relationship".to_string(),
                                "partner".to_string(),
                                "spouse".to_string(),
                                "status".to_string(),
                            ],
                            original_query: query.to_string(),
                        });
                    }
                }
            }
        }
    }

    None
}
792
793fn extract_possessive_pattern(query: &str) -> Option<AttributeQuery> {
795 let possessive_patterns = [
797 ("what is ", "'s "),
798 ("what's ", "'s "),
799 ("what is ", "' "),
800 ("what's ", "' "),
801 ];
802
803 for (prefix, possessive) in possessive_patterns {
804 if let Some(start) = query.find(prefix) {
805 let after_prefix = &query[start + prefix.len()..];
806 if let Some(pos_pos) = after_prefix.find(possessive) {
807 let entity = after_prefix[..pos_pos].trim();
808 let attribute = after_prefix[pos_pos + possessive.len()..].trim();
809
810 if !entity.is_empty() && !attribute.is_empty() {
811 return Some(AttributeQuery {
812 entity: capitalize_first(entity),
813 attribute: normalize_attribute(attribute),
814 attribute_synonyms: get_attribute_synonyms(attribute),
815 original_query: query.to_string(),
816 });
817 }
818 }
819 }
820 }
821
822 None
823}
824
825fn extract_of_pattern(query: &str) -> Option<AttributeQuery> {
827 let prefixes = ["what is the ", "what's the "];
829
830 for prefix in prefixes {
831 if let Some(start) = query.find(prefix) {
832 let after_prefix = &query[start + prefix.len()..];
833 if let Some(of_pos) = after_prefix.find(" of ") {
834 let attribute = after_prefix[..of_pos].trim();
835 let entity = after_prefix[of_pos + 4..].trim();
836
837 if !entity.is_empty() && !attribute.is_empty() {
838 return Some(AttributeQuery {
839 entity: capitalize_first(entity),
840 attribute: normalize_attribute(attribute),
841 attribute_synonyms: get_attribute_synonyms(attribute),
842 original_query: query.to_string(),
843 });
844 }
845 }
846 }
847 }
848
849 None
850}
851
852fn extract_entity_after_verb(query: &str) -> Option<String> {
854 let verbs = [" is ", " does "];
855 for verb in verbs {
856 if let Some(pos) = query.find(verb) {
857 let after_verb = query[pos + verb.len()..].trim();
858 let stop_words = ["live", "work", "do", "have", "go", "stay", "come"];
860 let words: Vec<&str> = after_verb.split_whitespace().collect();
861 let mut entity_words = Vec::new();
862 for word in words {
863 if stop_words.contains(&word) {
864 break;
865 }
866 entity_words.push(word);
867 }
868 if !entity_words.is_empty() {
869 return Some(capitalize_first(&entity_words.join(" ")));
870 }
871 }
872 }
873 None
874}
875
/// Canonicalize an attribute name: trim, lowercase, and turn spaces and
/// hyphens into underscores ("Home Town" → "home_town").
fn normalize_attribute(attr: &str) -> String {
    attr.trim()
        .to_lowercase()
        .chars()
        .map(|c| if c == ' ' || c == '-' { '_' } else { c })
        .collect()
}
883
884fn get_attribute_synonyms(attribute: &str) -> Vec<String> {
886 let attr_lower = attribute.to_lowercase();
887
888 if attr_lower.contains("relationship")
890 || attr_lower.contains("status")
891 || attr_lower.contains("marital")
892 {
893 return vec![
894 "single".to_string(),
895 "married".to_string(),
896 "divorced".to_string(),
897 "engaged".to_string(),
898 "dating".to_string(),
899 "relationship".to_string(),
900 "partner".to_string(),
901 "spouse".to_string(),
902 "single parent".to_string(),
903 "status".to_string(),
904 "marital".to_string(),
905 ];
906 }
907
908 if attr_lower.contains("job")
910 || attr_lower.contains("occupation")
911 || attr_lower.contains("work")
912 {
913 return vec![
914 "job".to_string(),
915 "work".to_string(),
916 "occupation".to_string(),
917 "profession".to_string(),
918 "career".to_string(),
919 "employed".to_string(),
920 "works as".to_string(),
921 ];
922 }
923
924 if attr_lower.contains("name") {
926 return vec![
927 "name".to_string(),
928 "called".to_string(),
929 "named".to_string(),
930 ];
931 }
932
933 if attr_lower.contains("age") {
935 return vec![
936 "age".to_string(),
937 "old".to_string(),
938 "years".to_string(),
939 "born".to_string(),
940 "birthday".to_string(),
941 ];
942 }
943
944 vec![attr_lower.clone(), attr_lower.replace('_', " ")]
946}
947
/// Uppercase the first character of `s`, leaving the rest untouched.
/// Returns an empty string for empty input.
fn capitalize_first(s: &str) -> String {
    let mut chars = s.chars();
    let mut result = String::with_capacity(s.len());
    if let Some(first) = chars.next() {
        result.extend(first.to_uppercase());
        result.push_str(chars.as_str());
    }
    result
}
956
957pub fn extract_chunks(text: &str) -> ChunkExtraction {
968 let stemmer = Stemmer::create(Algorithm::English);
969 let sentences = split_sentences(text);
970
971 let mut chunks = Vec::with_capacity(sentences.len());
972 let mut unique_nouns = HashSet::new();
973 let mut unique_verbs = HashSet::new();
974 let mut unique_adjectives = HashSet::new();
975 let mut proper_nouns = HashSet::new();
976
977 for (sentence_idx, sentence) in sentences.iter().enumerate() {
978 let words = tokenize_with_case(sentence);
979 let mut tagged_words = Vec::with_capacity(words.len());
980
981 for (position, (word, is_capitalized)) in words.iter().enumerate() {
982 let word_lower = word.to_lowercase();
983
984 if word_lower.len() < 2 {
986 continue;
987 }
988
989 let stem = stemmer.stem(&word_lower).to_string();
990 let pos = classify_pos_for_chunking(&word_lower, *is_capitalized, position, &words);
991
992 match pos {
993 PosTag::Noun => {
994 unique_nouns.insert(stem.clone());
995 }
996 PosTag::Verb => {
997 unique_verbs.insert(stem.clone());
998 }
999 PosTag::Adjective => {
1000 unique_adjectives.insert(stem.clone());
1001 }
1002 PosTag::ProperNoun => {
1003 proper_nouns.insert(word.clone());
1004 unique_nouns.insert(stem.clone()); }
1006 _ => {}
1007 }
1008
1009 if pos != PosTag::StopWord {
1010 tagged_words.push(TaggedWord {
1011 text: word.clone(),
1012 stem,
1013 pos,
1014 position,
1015 });
1016 }
1017 }
1018
1019 if !tagged_words.is_empty() {
1020 chunks.push(SentenceChunk {
1021 text: sentence.clone(),
1022 sentence_idx,
1023 words: tagged_words,
1024 });
1025 }
1026 }
1027
1028 ChunkExtraction {
1029 chunks,
1030 unique_nouns,
1031 unique_verbs,
1032 unique_adjectives,
1033 proper_nouns,
1034 }
1035}
1036
/// Split `text` into sentences on '.', '!', '?', and newline, while
/// refusing to break after common abbreviations ("Dr.", "St.", …) except
/// at a newline. Fragments of 3 bytes or fewer are not emitted at a
/// terminator — they keep accumulating into the next sentence. Any
/// non-empty trailing text (no length minimum) becomes a final sentence.
fn split_sentences(text: &str) -> Vec<String> {
    const ABBREVIATIONS: &[&str] = &[
        "mr", "mrs", "ms", "dr", "prof", "sr", "jr", "vs", "etc", "eg", "ie",
        "st", "ave", "rd", "blvd",
    ];

    let mut sentences = Vec::new();
    let mut buf = String::new();

    for ch in text.chars() {
        buf.push(ch);

        if !matches!(ch, '.' | '!' | '?' | '\n') {
            continue;
        }

        let candidate = buf.trim();
        // Too short to be a sentence — keep accumulating.
        if candidate.len() <= 3 {
            continue;
        }

        // Alphabetic core of the last word, for the abbreviation check.
        let tail: String = candidate
            .split_whitespace()
            .last()
            .unwrap_or("")
            .chars()
            .filter(|c| c.is_alphabetic())
            .collect();
        let ends_in_abbrev = ABBREVIATIONS.contains(&tail.to_lowercase().as_str());

        // A newline always terminates, even after an abbreviation.
        if !ends_in_abbrev || ch == '\n' {
            sentences.push(candidate.to_string());
            buf.clear();
        }
    }

    let rest = buf.trim();
    if !rest.is_empty() {
        sentences.push(rest.to_string());
    }

    sentences
}
1094
/// Whitespace-tokenize `text`, stripping surrounding punctuation
/// (apostrophes are kept, so contractions survive) and recording whether
/// each surviving token starts with an uppercase character.
fn tokenize_with_case(text: &str) -> Vec<(String, bool)> {
    text.split_whitespace()
        .filter_map(|raw| {
            let token = raw.trim_matches(|c: char| !c.is_alphanumeric() && c != '\'');
            if token.is_empty() {
                return None;
            }
            let capitalized = token
                .chars()
                .next()
                .map(|c| c.is_uppercase())
                .unwrap_or(false);
            Some((token.to_string(), capitalized))
        })
        .collect()
}
1112
1113fn classify_pos_for_chunking(
1115 word: &str,
1116 is_capitalized: bool,
1117 position: usize,
1118 _context: &[(String, bool)],
1119) -> PosTag {
1120 if is_stop_word(word) {
1122 return PosTag::StopWord;
1123 }
1124
1125 if is_capitalized && position > 0 {
1127 return PosTag::ProperNoun;
1128 }
1129
1130 if is_verb(word) {
1132 return PosTag::Verb;
1133 }
1134
1135 if is_adjective(word) {
1137 return PosTag::Adjective;
1138 }
1139
1140 if is_noun_for_chunking(word) {
1142 return PosTag::Noun;
1143 }
1144
1145 if word.len() >= 4 {
1148 return PosTag::Noun;
1149 }
1150
1151 PosTag::Other
1152}
1153
/// Heuristic noun detector used by `classify_pos_for_chunking`.
///
/// A word counts as a noun when it (1) appears in a hard-coded indicator
/// list, (2) carries a typical noun-forming suffix, or (3) looks like an
/// "-er"/"-or" agent noun. Purely lexical — no sentence context is used.
fn is_noun_for_chunking(word: &str) -> bool {
    // Hard-coded vocabulary of words treated as nouns, mixing robotics,
    // software, and everyday-life domains. Scanned linearly on every call.
    // NOTE(review): the list contains duplicates (e.g. "opportunity",
    // "strategy", "method", "class") — harmless for `contains`, but worth
    // deduplicating.
    const NOUN_INDICATORS: &[&str] = &[
        "memory", "graph", "node", "edge", "entity", "embedding", "vector", "index", "query", "retrieval",
        "activation", "potentiation", "consolidation", "decay", "strength", "weight", "threshold", "importance", "robot", "drone",
        "sensor", "lidar", "camera", "motor", "actuator", "obstacle", "path", "waypoint", "location", "coordinates",
        "position", "battery", "power", "energy", "voltage", "current", "system", "module", "component", "unit",
        "device", "temperature", "pressure", "humidity", "speed", "velocity", "signal", "communication", "network", "link",
        "connection", "navigation", "guidance", "control", "steering", "data", "information", "message", "command", "response",
        "function", "method", "class", "struct", "interface", "package", "library", "framework", "api", "endpoint",
        "request", "error", "exception", "bug", "fix", "feature", "test", "benchmark", "performance", "latency",
        "throughput", "cache", "buffer", "queue", "stack", "heap", "thread", "process", "server", "client",
        "database", "table", "column", "row", "schema", "migration", "deployment", "container", "cluster", "replica",
        "person", "people", "user", "agent", "operator", "time", "date", "day", "hour", "minute",
        "second", "area", "zone", "region", "sector", "space", "task", "mission", "goal", "objective",
        "target", "warning", "alert", "notification", "level", "status", "state", "condition", "mode", "type",
        "kind", "version", "release", "update", "change", "result", "output", "input", "value", "key",
        "name", "id", "identifier", "sunrise", "sunset", "lake", "mountain", "beach", "forest", "garden",
        "park", "city", "town", "village", "country", "house", "home", "room", "building", "street",
        "road", "car", "bus", "train", "plane", "boat", "bicycle", "food", "drink", "water",
        "coffee", "tea", "breakfast", "lunch", "dinner", "meal", "book", "movie", "music", "song",
        "art", "painting", "photo", "picture", "video", "game", "sport", "team", "player", "match",
        "race", "trip", "vacation", "holiday", "weekend", "morning", "evening", "night", "week", "month",
        "year", "birthday", "wedding", "party", "event", "meeting", "class", "lesson", "course", "school",
        "college", "university", "job", "work", "office", "company", "business", "project", "plan", "idea",
        "thought", "feeling", "emotion", "love", "friend", "family", "parent", "child", "kid", "baby",
        "mother", "father", "sister", "brother", "wife", "husband", "partner", "group", "community", "society",
        "culture", "tradition", "story", "history", "news", "article", "blog", "post", "comment", "email",
        "letter", "phone", "call", "text", "chat", "conversation", "discussion", "talk", "speech", "presentation",
        "question", "answer", "problem", "solution", "issue", "challenge", "opportunity", "success", "failure", "experience",
        "skill", "knowledge", "wisdom", "truth", "fact", "opinion", "belief", "value", "principle", "rule",
        "law", "policy", "decision", "choice", "option", "alternative", "reason", "cause", "effect", "impact",
        "influence", "power", "authority", "responsibility", "duty", "right", "freedom", "justice", "peace", "war",
        "conflict", "agreement", "contract", "deal", "price", "cost", "money", "dollar", "euro", "pound",
        "budget", "investment", "profit", "loss", "risk", "reward", "benefit", "advantage", "disadvantage", "strength",
        "weakness", "opportunity", "threat", "strategy", "tactic", "method", "approach", "technique", "tool", "resource",
        "material", "product", "service", "quality", "quantity", "size", "shape", "color", "sound", "smell",
        "taste", "touch", "sight", "sense", "mind", "body", "heart", "soul", "spirit", "health",
        "illness", "disease", "medicine", "doctor", "nurse", "hospital", "clinic", "therapy", "treatment", "care",
        "support", "help", "advice", "guidance", "counseling", "coaching", "mentoring", "training", "education", "learning",
        "teaching", "research", "study", "experiment", "discovery", "invention", "innovation", "technology", "science", "math",
        "physics", "chemistry", "biology", "psychology", "sociology", "philosophy", "religion", "spirituality", "meditation", "yoga",
        "exercise", "fitness", "diet", "nutrition", "sleep", "rest", "relaxation", "stress", "anxiety", "depression",
        "happiness", "joy", "sadness", "anger", "fear", "surprise", "disgust", "trust", "hope", "faith",
        "courage", "confidence", "pride", "shame", "guilt", "regret", "gratitude", "empathy", "compassion", "kindness",
        "generosity", "honesty", "integrity", "loyalty", "respect", "tolerance", "patience", "persistence", "determination", "motivation",
        "inspiration", "creativity", "imagination", "curiosity", "wonder", "beauty", "art", "music", "dance", "theater",
        "film", "literature", "poetry", "writing", "reading", "speaking", "listening", "communication", "expression", "interpretation",
        "understanding", "meaning", "purpose", "goal", "dream", "vision", "mission", "passion", "interest", "hobby",
        "activity", "adventure", "journey", "path", "way", "direction", "destination", "origin", "beginning", "end",
        "start", "finish", "progress", "growth", "development", "evolution", "transformation", "change", "transition", "shift",
        "movement", "action", "reaction", "response", "behavior", "habit", "pattern", "routine", "schedule", "plan",
        "strategy", "tactic", "approach", "method", "process", "procedure", "step", "stage", "phase", "cycle",
        "circle", "loop", "sequence", "order", "arrangement", "organization", "structure", "system", "network", "connection",
        "relationship", "bond", "link", "tie", "association", "affiliation", "membership", "participation", "involvement", "engagement",
        "commitment", "dedication", "devotion", "loyalty", "allegiance", "support", "backing", "endorsement", "approval", "acceptance",
        "recognition", "acknowledgment", "appreciation", "gratitude", "thanks", "praise", "compliment", "criticism", "feedback", "evaluation",
        "assessment", "judgment", "opinion", "view", "perspective", "angle", "aspect", "dimension", "element", "component",
        "part", "piece", "section", "segment", "portion", "share", "fraction", "percentage", "ratio", "proportion",
        "balance", "equilibrium", "harmony", "unity", "diversity", "variety", "difference", "similarity", "comparison", "contrast",
        "distinction", "separation", "division", "classification", "category", "class", "type", "kind", "sort", "species",
        "variety", "version", "edition", "model", "design", "style", "format", "layout", "arrangement", "configuration",
        "setup", "installation", "deployment",
    ];

    if NOUN_INDICATORS.contains(&word) {
        return true;
    }

    // Derivational suffixes that almost always mark English nouns
    // ("creation", "happiness", "friendship", …).
    if word.ends_with("tion")
        || word.ends_with("sion")
        || word.ends_with("ment")
        || word.ends_with("ness")
        || word.ends_with("ity")
        || word.ends_with("ance")
        || word.ends_with("ence")
        || word.ends_with("age")
        || word.ends_with("ure")
        || word.ends_with("dom")
        || word.ends_with("ship")
        || word.ends_with("hood")
        || word.ends_with("ism")
        || word.ends_with("ist")
    {
        return true;
    }

    // "-er"/"-or" agent nouns ("teacher", "operator"), but skip stems
    // ending in t/g/d, which are more often comparatives or verb forms
    // ("bigger", "faster", "harder").
    if (word.ends_with("er") || word.ends_with("or")) && word.len() > 4 {
        let without_suffix = &word[..word.len() - 2];
        if !without_suffix.ends_with("t")
            && !without_suffix.ends_with("g")
            && !without_suffix.ends_with("d")
        {
            return true;
        }
    }

    false
}
1795
/// A noun (or unknown-POS word treated as a noun) extracted from a query,
/// carrying the information-content weight used for downstream ranking.
#[derive(Debug, Clone)]
pub struct FocalEntity {
    /// Surface form of the word as it appeared in the (lowercased) query.
    pub text: String,
    /// Stemmed form used for matching against indexed content.
    pub stem: String,
    /// Information-content weight (base IC for nouns, adjusted by length/suffix).
    pub ic_weight: f32,
    /// True when this entity is part of a detected compound noun.
    pub is_compound: bool,
    /// True when the token fell inside a negation scope (e.g. after "not").
    pub negated: bool,
}
1808
/// An adjective extracted from a query that narrows down which entities
/// are meant.
#[derive(Debug, Clone)]
pub struct Modifier {
    /// Surface form of the adjective.
    pub text: String,
    /// Stemmed form used for matching.
    pub stem: String,
    /// Information-content weight (base IC for adjectives, adjusted).
    pub ic_weight: f32,
    /// True when the token fell inside a negation scope.
    pub negated: bool,
}
1820
/// A verb extracted from a query, describing how the focal entities relate
/// or what action is asked about.
#[derive(Debug, Clone)]
pub struct Relation {
    /// Surface form of the verb.
    pub text: String,
    /// Stemmed form used for matching.
    pub stem: String,
    /// Information-content weight (base IC for verbs, adjusted).
    pub ic_weight: f32,
    /// True when the token fell inside a negation scope.
    pub negated: bool,
}
1832
/// Coarse classification of what kind of retrieval a query is asking for.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QueryIntent {
    /// A precise fact lookup (e.g. "what is X's email") — favor exact matches.
    Needle,

    /// A broad, open-ended request (e.g. "tell me about X") — favor recall.
    Exploratory,

    /// Unclear or mixed signals — balance precision and recall.
    Hybrid,
}
1856
impl Default for QueryIntent {
    /// Queries default to `Hybrid` when no intent signal is available.
    fn default() -> Self {
        QueryIntent::Hybrid
    }
}
1862
/// The result of analyzing a search query: terms bucketed by grammatical
/// role, detected compound nouns, negation information, and inferred intent.
#[derive(Debug, Clone)]
pub struct QueryAnalysis {
    /// Nouns (and unknown-POS words treated as nouns), including one boosted
    /// entry per detected compound noun.
    pub focal_entities: Vec<FocalEntity>,

    /// Adjectives that narrow down which entities are meant.
    pub discriminative_modifiers: Vec<Modifier>,

    /// Verbs describing how the entities relate or what action is asked about.
    pub relational_context: Vec<Relation>,

    /// Detected multi-word noun phrases (e.g. "machine learning").
    pub compound_nouns: Vec<String>,

    /// The query text exactly as passed in.
    pub original_query: String,

    /// True if any negation word was seen anywhere in the query.
    pub has_negation: bool,

    /// Coarse classification of what kind of retrieval the query wants.
    pub intent: QueryIntent,
}
1887
1888impl QueryAnalysis {
1889 pub fn total_weight(&self) -> f32 {
1891 let entity_weight: f32 = self.focal_entities.iter().map(|e| e.ic_weight).sum();
1892
1893 let modifier_weight: f32 = self
1894 .discriminative_modifiers
1895 .iter()
1896 .map(|m| m.ic_weight)
1897 .sum();
1898
1899 let relation_weight: f32 = self.relational_context.iter().map(|r| r.ic_weight).sum();
1900
1901 let compound_bonus = self.compound_nouns.len() as f32 * 0.5;
1903
1904 entity_weight + modifier_weight + relation_weight + compound_bonus
1905 }
1906
1907 pub fn all_stems(&self) -> HashSet<String> {
1909 let mut stems = HashSet::new();
1910 for e in &self.focal_entities {
1911 stems.insert(e.stem.clone());
1912 }
1913 for m in &self.discriminative_modifiers {
1914 stems.insert(m.stem.clone());
1915 }
1916 for r in &self.relational_context {
1917 stems.insert(r.stem.clone());
1918 }
1919 stems
1920 }
1921
1922 pub fn positive_entity_stems(&self) -> Vec<&str> {
1924 self.focal_entities
1925 .iter()
1926 .filter(|e| !e.negated)
1927 .map(|e| e.stem.as_str())
1928 .collect()
1929 }
1930
1931 pub fn negated_entity_stems(&self) -> Vec<&str> {
1933 self.focal_entities
1934 .iter()
1935 .filter(|e| e.negated)
1936 .map(|e| e.stem.as_str())
1937 .collect()
1938 }
1939
1940 pub fn to_ic_weights(&self) -> std::collections::HashMap<String, f32> {
1949 self.to_ic_weights_with_yake(true)
1950 }
1951
1952 pub fn to_ic_weights_with_yake(
1958 &self,
1959 use_yake: bool,
1960 ) -> std::collections::HashMap<String, f32> {
1961 use crate::embeddings::keywords::{KeywordConfig, KeywordExtractor};
1962
1963 let mut weights = std::collections::HashMap::new();
1964
1965 if use_yake {
1968 let config = KeywordConfig {
1969 max_keywords: 5,
1970 ngrams: 2,
1971 min_length: 3,
1972 ..Default::default()
1973 };
1974 let extractor = KeywordExtractor::with_config(config);
1975 let keywords = extractor.extract(&self.original_query);
1976
1977 for kw in keywords {
1982 let term = kw.text.to_lowercase();
1983
1984 if term.contains(' ') {
1987 continue;
1988 }
1989
1990 let yake_boost = 1.0 + (kw.importance * 5.0);
1994 weights
1995 .entry(term)
1996 .and_modify(|w: &mut f32| *w = w.max(yake_boost))
1997 .or_insert(yake_boost);
1998 }
1999 }
2000
2001 for entity in &self.focal_entities {
2003 let term = entity.text.to_lowercase();
2004 weights
2005 .entry(term)
2006 .and_modify(|w: &mut f32| *w = w.max(entity.ic_weight))
2007 .or_insert(entity.ic_weight);
2008 if entity.stem != entity.text.to_lowercase() {
2010 weights
2011 .entry(entity.stem.clone())
2012 .and_modify(|w: &mut f32| *w = w.max(entity.ic_weight))
2013 .or_insert(entity.ic_weight);
2014 }
2015 }
2016
2017 for modifier in &self.discriminative_modifiers {
2019 let term = modifier.text.to_lowercase();
2020 weights
2021 .entry(term)
2022 .and_modify(|w: &mut f32| *w = w.max(modifier.ic_weight))
2023 .or_insert(modifier.ic_weight);
2024 if modifier.stem != modifier.text.to_lowercase() {
2025 weights
2026 .entry(modifier.stem.clone())
2027 .and_modify(|w: &mut f32| *w = w.max(modifier.ic_weight))
2028 .or_insert(modifier.ic_weight);
2029 }
2030 }
2031
2032 for relation in &self.relational_context {
2034 let term = relation.text.to_lowercase();
2035 weights
2036 .entry(term)
2037 .and_modify(|w: &mut f32| *w = w.max(relation.ic_weight))
2038 .or_insert(relation.ic_weight);
2039 if relation.stem != relation.text.to_lowercase() {
2040 weights
2041 .entry(relation.stem.clone())
2042 .and_modify(|w: &mut f32| *w = w.max(relation.ic_weight))
2043 .or_insert(relation.ic_weight);
2044 }
2045 }
2046
2047 for compound in &self.compound_nouns {
2049 for word in compound.split_whitespace() {
2051 let term = word.to_lowercase();
2052 weights.entry(term).and_modify(|w: &mut f32| *w *= 1.2);
2053 }
2054 }
2055
2056 weights
2057 }
2058
2059 pub fn keyword_discriminativeness(&self) -> (f32, Vec<String>) {
2069 use crate::embeddings::keywords::{KeywordConfig, KeywordExtractor};
2070
2071 let config = KeywordConfig {
2072 max_keywords: 5,
2073 ngrams: 2,
2074 min_length: 2, ..Default::default()
2076 };
2077 let extractor = KeywordExtractor::with_config(config);
2078 let keywords = extractor.extract(&self.original_query);
2079
2080 let mut max_importance = 0.0f32;
2081 let mut discriminative = Vec::new();
2082
2083 for kw in keywords {
2084 if kw.importance > max_importance {
2085 max_importance = kw.importance;
2086 }
2087 if kw.importance > 0.5 {
2089 discriminative.push(kw.text.to_lowercase());
2090 }
2091 }
2092
2093 (max_importance, discriminative)
2094 }
2095
2096 pub fn to_phrase_boosts(&self) -> Vec<(String, f32)> {
2102 let mut phrases = Vec::new();
2103
2104 for compound in &self.compound_nouns {
2106 phrases.push((compound.to_lowercase(), 2.0));
2108 }
2109
2110 if self.focal_entities.len() >= 2 {
2113 for i in 0..self.focal_entities.len() - 1 {
2114 let e1 = &self.focal_entities[i];
2115 let e2 = &self.focal_entities[i + 1];
2116 if !e1.negated && !e2.negated {
2118 let phrase = format!("{} {}", e1.text.to_lowercase(), e2.text.to_lowercase());
2119 if !self
2120 .compound_nouns
2121 .iter()
2122 .any(|c| c.to_lowercase() == phrase)
2123 {
2124 phrases.push((phrase, 1.5));
2126 }
2127 }
2128 }
2129 }
2130
2131 phrases
2132 }
2133}
2134
/// Internal per-token annotation produced while analyzing a query.
#[derive(Debug)]
struct AnnotatedToken {
    /// Lowercased surface form of the token.
    text: String,
    /// Stemmed form of the token.
    stem: String,
    /// Heuristic part-of-speech classification.
    pos: PartOfSpeech,
    /// True when the token falls inside a negation scope.
    negated: bool,
    /// Zero-based index of the token in the tokenized query.
    position: usize,
}
2144
/// Coarse part-of-speech buckets used by the query analyzer's heuristics.
#[derive(Debug, Clone, Copy, PartialEq)]
enum PartOfSpeech {
    Noun,
    Adjective,
    Verb,
    /// Function words carrying little retrieval signal.
    StopWord,
    /// Words like "not" / "without" that open a negation scope.
    Negation,
    /// Not matched by any heuristic; treated like a noun downstream.
    Unknown,
}
2154
2155pub fn analyze_query(query_text: &str) -> QueryAnalysis {
2157 let stemmer = Stemmer::create(Algorithm::English);
2158 let words = tokenize(query_text);
2159
2160 if words.is_empty() {
2161 return QueryAnalysis {
2162 focal_entities: Vec::new(),
2163 discriminative_modifiers: Vec::new(),
2164 relational_context: Vec::new(),
2165 compound_nouns: Vec::new(),
2166 original_query: query_text.to_string(),
2167 has_negation: false,
2168 intent: QueryIntent::Hybrid,
2169 };
2170 }
2171
2172 let annotated = annotate_tokens(&words, &stemmer);
2174
2175 let compound_nouns = detect_compound_nouns(&annotated);
2177
2178 let mut focal_entities = Vec::new();
2180 let mut discriminative_modifiers = Vec::new();
2181 let mut relational_context = Vec::new();
2182 let mut has_negation = false;
2183
2184 let compound_positions: HashSet<usize> = compound_positions(&annotated, &compound_nouns);
2186
2187 for token in &annotated {
2188 if token.pos == PartOfSpeech::Negation {
2189 has_negation = true;
2190 continue;
2191 }
2192 if token.pos == PartOfSpeech::StopWord {
2193 continue;
2194 }
2195
2196 let is_compound = compound_positions.contains(&token.position);
2197
2198 match token.pos {
2199 PartOfSpeech::Noun | PartOfSpeech::Unknown => {
2200 let weight = calculate_term_weight(&token.text, IC_NOUN);
2202 focal_entities.push(FocalEntity {
2203 text: token.text.clone(),
2204 stem: token.stem.clone(),
2205 ic_weight: weight,
2206 is_compound,
2207 negated: token.negated,
2208 });
2209 }
2210 PartOfSpeech::Adjective => {
2211 let weight = calculate_term_weight(&token.text, IC_ADJECTIVE);
2212 discriminative_modifiers.push(Modifier {
2213 text: token.text.clone(),
2214 stem: token.stem.clone(),
2215 ic_weight: weight,
2216 negated: token.negated,
2217 });
2218 }
2219 PartOfSpeech::Verb => {
2220 let weight = calculate_term_weight(&token.text, IC_VERB);
2221 relational_context.push(Relation {
2222 text: token.text.clone(),
2223 stem: token.stem.clone(),
2224 ic_weight: weight,
2225 negated: token.negated,
2226 });
2227 }
2228 _ => {}
2229 }
2230 }
2231
2232 for compound in &compound_nouns {
2234 let stem = stemmer.stem(compound).to_string();
2235 focal_entities.push(FocalEntity {
2236 text: compound.clone(),
2237 stem,
2238 ic_weight: IC_NOUN * 1.5, is_compound: true,
2240 negated: false,
2241 });
2242 }
2243
2244 let intent = detect_query_intent(query_text, &focal_entities, &relational_context);
2246
2247 QueryAnalysis {
2248 focal_entities,
2249 discriminative_modifiers,
2250 relational_context,
2251 compound_nouns,
2252 original_query: query_text.to_string(),
2253 has_negation,
2254 intent,
2255 }
2256}
2257
2258fn detect_query_intent(
2265 query_text: &str,
2266 focal_entities: &[FocalEntity],
2267 relational_context: &[Relation],
2268) -> QueryIntent {
2269 let lower = query_text.to_lowercase();
2270
2271 let needle_starters = [
2273 "what is", "what's", "who is", "who's", "where is", "where's", "when did", "when was",
2274 "which", "how much", "how many", "find", "get me", "show me", "list", "give me",
2275 ];
2276
2277 let needle_patterns = [
2278 "'s email",
2279 "'s phone",
2280 "'s address",
2281 "'s name",
2282 "email of",
2283 "phone of",
2284 "address of",
2285 "name of",
2286 "id of",
2287 "password",
2288 "api key",
2289 "token",
2290 ];
2291
2292 let exploratory_starters = [
2294 "tell me about",
2295 "explain",
2296 "describe",
2297 "what do we know about",
2298 "summarize",
2299 "overview",
2300 "recap",
2301 "context",
2302 "related to",
2303 "associated with",
2304 "connected to",
2305 "how does",
2306 "how do",
2307 "why does",
2308 "why do",
2309 ];
2310
2311 let exploratory_patterns = [
2312 "all about",
2313 "everything about",
2314 "more about",
2315 "history of",
2316 "background",
2317 "related",
2318 ];
2319
2320 for starter in needle_starters.iter() {
2322 if lower.starts_with(starter) {
2323 return QueryIntent::Needle;
2324 }
2325 }
2326
2327 for pattern in needle_patterns.iter() {
2328 if lower.contains(pattern) {
2329 return QueryIntent::Needle;
2330 }
2331 }
2332
2333 for starter in exploratory_starters.iter() {
2335 if lower.starts_with(starter) || lower.contains(starter) {
2336 return QueryIntent::Exploratory;
2337 }
2338 }
2339
2340 for pattern in exploratory_patterns.iter() {
2341 if lower.contains(pattern) {
2342 return QueryIntent::Exploratory;
2343 }
2344 }
2345
2346 let entity_count = focal_entities.len();
2349 let relation_count = relational_context.len();
2350
2351 if entity_count > 0 && relation_count == 0 {
2352 QueryIntent::Needle
2354 } else if relation_count > entity_count {
2355 QueryIntent::Exploratory
2357 } else {
2358 QueryIntent::Hybrid
2360 }
2361}
2362
/// Split text on whitespace, strip leading/trailing punctuation from each
/// token, and lowercase. Interior punctuation (e.g. the apostrophe in
/// "don't") is preserved; tokens that become empty are dropped.
fn tokenize(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    for raw in text.split_whitespace() {
        let cleaned = raw
            .trim_matches(|c: char| !c.is_alphanumeric())
            .to_lowercase();
        if !cleaned.is_empty() {
            tokens.push(cleaned);
        }
    }
    tokens
}
2373
2374fn annotate_tokens(words: &[String], stemmer: &Stemmer) -> Vec<AnnotatedToken> {
2376 let mut annotated = Vec::with_capacity(words.len());
2377 let mut in_negation_scope = false;
2378 let mut negation_distance = 0;
2379
2380 for (i, word) in words.iter().enumerate() {
2381 let stem = stemmer.stem(word).to_string();
2382 let pos = classify_pos(word, i, words);
2383
2384 if pos == PartOfSpeech::Negation {
2386 in_negation_scope = true;
2387 negation_distance = 0;
2388 } else if in_negation_scope {
2389 negation_distance += 1;
2390 if negation_distance > 3 {
2391 in_negation_scope = false;
2392 }
2393 }
2394
2395 let negated = in_negation_scope && pos != PartOfSpeech::Negation;
2396
2397 annotated.push(AnnotatedToken {
2398 text: word.clone(),
2399 stem,
2400 pos,
2401 negated,
2402 position: i,
2403 });
2404 }
2405
2406 annotated
2407}
2408
2409fn classify_pos(word: &str, position: usize, context: &[String]) -> PartOfSpeech {
2411 if is_negation(word) {
2413 return PartOfSpeech::Negation;
2414 }
2415
2416 if is_stop_word(word) {
2418 return PartOfSpeech::StopWord;
2419 }
2420
2421 if is_verb(word) {
2423 return PartOfSpeech::Verb;
2424 }
2425
2426 if is_adjective(word) {
2427 return PartOfSpeech::Adjective;
2428 }
2429
2430 if is_noun(word, position, context) {
2431 return PartOfSpeech::Noun;
2432 }
2433
2434 PartOfSpeech::Unknown
2436}
2437
2438fn detect_compound_nouns(tokens: &[AnnotatedToken]) -> Vec<String> {
2440 let mut compounds = Vec::new();
2441
2442 const COMPOUND_PATTERNS: &[(&str, &str)] = &[
2444 ("machine", "learning"),
2446 ("deep", "learning"),
2447 ("neural", "network"),
2448 ("natural", "language"),
2449 ("language", "model"),
2450 ("artificial", "intelligence"),
2451 ("knowledge", "graph"),
2452 ("vector", "database"),
2453 ("memory", "system"),
2454 ("data", "structure"),
2455 ("source", "code"),
2456 ("error", "handling"),
2457 ("unit", "test"),
2458 ("integration", "test"),
2459 ("api", "endpoint"),
2460 ("web", "server"),
2461 ("file", "system"),
2462 ("operating", "system"),
2463 ("database", "schema"),
2464 ("user", "interface"),
2465 ("command", "line"),
2466 ("version", "control"),
2467 ("pull", "request"),
2468 ("code", "review"),
2469 ("bug", "fix"),
2470 ("feature", "request"),
2471 ("spreading", "activation"),
2473 ("hebbian", "learning"),
2474 ("long", "term"),
2475 ("short", "term"),
2476 ("working", "memory"),
2477 ("semantic", "search"),
2478 ("graph", "traversal"),
2479 ("edge", "device"),
2480 ("air", "gapped"),
2481 ("support", "group"),
2483 ("pride", "parade"),
2484 ("poetry", "reading"),
2485 ("civil", "rights"),
2486 ("human", "rights"),
2487 ("social", "media"),
2488 ("community", "center"),
2489 ("discussion", "group"),
2490 ("therapy", "session"),
2491 ("art", "therapy"),
2492 ("group", "therapy"),
2493 ];
2494
2495 for i in 0..tokens.len().saturating_sub(1) {
2497 let t1 = &tokens[i];
2498 let t2 = &tokens[i + 1];
2499
2500 if t1.pos == PartOfSpeech::StopWord || t2.pos == PartOfSpeech::StopWord {
2502 continue;
2503 }
2504
2505 for (w1, w2) in COMPOUND_PATTERNS {
2506 if (t1.stem == *w1 || t1.text == *w1) && (t2.stem == *w2 || t2.text == *w2) {
2507 compounds.push(format!("{} {}", t1.text, t2.text));
2508 break;
2509 }
2510 }
2511
2512 if (t1.pos == PartOfSpeech::Noun || t1.pos == PartOfSpeech::Unknown)
2514 && (t2.pos == PartOfSpeech::Noun || t2.pos == PartOfSpeech::Unknown)
2515 {
2516 if has_compound_suffix(&t1.text) || has_compound_suffix(&t2.text) {
2518 let compound = format!("{} {}", t1.text, t2.text);
2519 if !compounds.contains(&compound) {
2520 compounds.push(compound);
2521 }
2522 }
2523 }
2524 }
2525
2526 compounds
2527}
2528
/// True when the word ends with a suffix that commonly marks the nominal
/// half of a noun-noun compound (e.g. "learning", "activation").
fn has_compound_suffix(word: &str) -> bool {
    const SUFFIXES: [&str; 11] = [
        "tion", "ment", "ing", "ness", "ity", "ance", "ence", "er", "or", "ist", "ism",
    ];
    SUFFIXES.iter().any(|suffix| word.ends_with(suffix))
}
2543
2544fn compound_positions(tokens: &[AnnotatedToken], compounds: &[String]) -> HashSet<usize> {
2546 let mut positions = HashSet::new();
2547
2548 for compound in compounds {
2549 let parts: Vec<&str> = compound.split_whitespace().collect();
2550 if parts.len() < 2 {
2551 continue;
2552 }
2553
2554 for i in 0..tokens.len().saturating_sub(parts.len() - 1) {
2555 let mut matches = true;
2556 for (j, part) in parts.iter().enumerate() {
2557 if tokens[i + j].text != *part {
2558 matches = false;
2559 break;
2560 }
2561 }
2562 if matches {
2563 for j in 0..parts.len() {
2564 positions.insert(i + j);
2565 }
2566 }
2567 }
2568 }
2569
2570 positions
2571}
2572
/// Scale a base IC weight by word length (longer words tend to carry more
/// information) and by the presence of an abstract-noun suffix.
fn calculate_term_weight(word: &str, base_weight: f32) -> f32 {
    let length_factor = match word.len() {
        0..=5 => 1.0,
        6..=8 => 1.1,
        _ => 1.2,
    };

    let has_nominal_suffix = ["tion", "ment", "ness", "ity"]
        .iter()
        .any(|s| word.ends_with(s));
    let suffix_factor = if has_nominal_suffix { 1.1 } else { 1.0 };

    base_weight * length_factor * suffix_factor
}
2597
/// True for explicit negators and common negative contractions. Tokenization
/// preserves interior apostrophes, so contractions arrive intact.
fn is_negation(word: &str) -> bool {
    matches!(
        word,
        "not" | "no" | "never" | "none" | "nothing" | "neither" | "nobody" | "nowhere"
            | "without" | "cannot" | "can't" | "won't" | "don't" | "doesn't" | "didn't"
            | "isn't" | "aren't" | "wasn't" | "weren't" | "hasn't" | "haven't"
            | "hadn't" | "shouldn't" | "wouldn't" | "couldn't" | "mustn't"
    )
}
2630
/// Heuristic noun test: curated vocabulary, derivational suffixes, and the
/// preceding-word context (determiners and possessives). Fix over the
/// previous version: the duplicate list entries "module" and "response" were
/// removed (membership is unchanged).
fn is_noun(word: &str, position: usize, context: &[String]) -> bool {
    // Curated domain vocabulary.
    const NOUN_INDICATORS: &[&str] = &[
        // Memory / graph domain.
        "memory", "graph", "node", "edge", "entity", "embedding", "vector", "index",
        "query", "retrieval", "activation", "potentiation", "consolidation", "decay",
        "strength", "weight", "threshold", "importance",
        // Robotics / sensing.
        "robot", "drone", "sensor", "lidar", "camera", "motor", "actuator", "obstacle",
        "path", "waypoint", "location", "coordinates", "position", "battery", "power",
        "energy", "voltage", "current", "system", "module", "component", "unit",
        "device", "temperature", "pressure", "humidity", "speed", "velocity", "signal",
        "communication", "network", "link", "connection", "navigation", "guidance",
        "control", "steering", "data", "information", "message", "command", "response",
        // Software engineering.
        "function", "method", "class", "struct", "interface", "package", "library",
        "framework", "api", "endpoint", "request", "error", "exception", "bug", "fix",
        "feature", "test", "benchmark", "performance", "latency", "throughput",
        "cache", "buffer", "queue", "stack", "heap", "thread", "process", "server",
        "client", "database", "table", "column", "row", "schema", "migration",
        "deployment", "container", "cluster", "replica",
        // People / time / space / misc.
        "person", "people", "user", "agent", "operator", "time", "date", "day", "hour",
        "minute", "second", "area", "zone", "region", "sector", "space", "task",
        "mission", "goal", "objective", "target", "warning", "alert", "notification",
        "level", "status", "state", "condition", "mode", "type", "kind", "version",
        "release", "update", "change", "result", "output", "input", "value", "key",
        "name", "id", "identifier",
    ];

    if NOUN_INDICATORS.contains(&word) {
        return true;
    }

    // Derivational suffixes that usually mark nouns. "-er" on very short
    // words (e.g. "her") is too ambiguous and is excluded.
    const NOUN_SUFFIXES: &[&str] = &[
        "tion", "sion", "ment", "ness", "ity", "ance", "ence", "er", "or", "ist",
        "ism", "age", "ure", "dom",
    ];
    let has_noun_suffix = NOUN_SUFFIXES.iter().any(|s| word.ends_with(s));
    if has_noun_suffix && !(word.ends_with("er") && word.len() < 5) {
        return true;
    }

    // A word right after a determiner or a possessive is very likely a noun.
    if position > 0 {
        if let Some(prev) = context.get(position - 1) {
            let prev_lower = prev.to_lowercase();
            if matches!(prev_lower.as_str(), "a" | "an" | "the" | "this" | "that") {
                return true;
            }
            if prev.ends_with("'s") || prev.ends_with("s'") {
                return true;
            }
        }
    }

    false
}
2833
/// Heuristic adjective test: a curated vocabulary plus adjectival suffixes,
/// with a short exception list of common nouns that share those suffixes.
fn is_adjective(word: &str) -> bool {
    // Colors, sizes, temperatures/speeds, states, quality judgements,
    // recency/ordering, and technical descriptors.
    const ADJECTIVE_INDICATORS: &[&str] = &[
        "red", "blue", "green", "yellow", "orange", "purple", "black", "white",
        "gray", "grey", "pink", "brown",
        "big", "small", "large", "tiny", "huge", "massive", "mini", "micro", "high",
        "low", "tall", "short", "long", "wide", "narrow",
        "hot", "cold", "warm", "cool", "frozen", "heated", "fast", "slow", "quick",
        "rapid", "gradual", "active", "inactive", "enabled", "disabled", "open",
        "closed", "locked", "unlocked", "full", "empty", "partial", "complete",
        "valid", "invalid", "correct", "incorrect", "true", "false",
        "good", "bad", "excellent", "poor", "optimal", "suboptimal", "normal",
        "abnormal", "stable", "unstable", "safe", "unsafe", "dangerous", "hazardous",
        "new", "old", "recent", "ancient", "current", "latest", "first", "last",
        "next", "previous", "primary", "secondary", "main", "important", "critical",
        "minor", "major",
        "autonomous", "manual", "automatic", "remote", "digital", "analog",
        "electronic", "mechanical", "wireless", "wired", "connected", "disconnected",
        "local", "global", "private", "public", "static", "dynamic", "mutable",
        "immutable", "sync", "async", "concurrent", "parallel", "serial",
        "sequential", "optional", "required", "default", "custom",
    ];

    if ADJECTIVE_INDICATORS.contains(&word) {
        return true;
    }

    // Adjectival suffixes, minus a few common nouns that share them.
    const ADJECTIVE_SUFFIXES: &[&str] = &[
        "ful", "less", "ous", "ive", "able", "ible", "al", "ic", "ary", "ory",
    ];
    const EXCEPTIONS: &[&str] = &["animal", "interval", "arrival", "approval"];

    ADJECTIVE_SUFFIXES.iter().any(|s| word.ends_with(s)) && !EXCEPTIONS.contains(&word)
}
2986
/// Lexicon-based verb test: auxiliaries/modals, common general verbs,
/// sensing/navigation verbs, and software-action verbs, each listed with
/// their usual inflected forms.
fn is_verb(word: &str) -> bool {
    const VERB_INDICATORS: &[&str] = &[
        // Auxiliaries and modals.
        "is", "are", "was", "were", "be", "been", "being", "has", "have", "had",
        "do", "does", "did", "can", "could", "will", "would", "shall", "should",
        "may", "might", "must",
        // Common general verbs.
        "go", "goes", "went", "gone", "going", "get", "gets", "got", "gotten",
        "getting", "make", "makes", "made", "making", "take", "takes", "took",
        "taken", "taking", "see", "sees", "saw", "seen", "seeing", "give", "gives",
        "gave", "given", "giving", "use", "uses", "used", "using", "find", "finds",
        "found", "finding", "know", "knows", "knew", "known", "knowing", "think",
        "thinks", "thought", "thinking", "want", "wants", "wanted", "wanting",
        "need", "needs", "needed", "needing", "try", "tries", "tried", "trying",
        // Sensing and navigation.
        "detect", "detects", "detected", "detecting", "observe", "observes",
        "observed", "observing", "measure", "measures", "measured", "measuring",
        "sense", "senses", "sensed", "sensing", "scan", "scans", "scanned",
        "scanning", "navigate", "navigates", "navigated", "navigating", "move",
        "moves", "moved", "moving", "stop", "stops", "stopped", "stopping",
        "start", "starts", "started", "starting", "reach", "reaches", "reached",
        "reaching", "avoid", "avoids", "avoided", "avoiding", "block", "blocks",
        "blocked", "blocking",
        // Software actions.
        "create", "creates", "created", "creating", "delete", "deletes", "deleted",
        "deleting", "update", "updates", "updated", "updating", "read", "reads",
        "reading", "write", "writes", "wrote", "written", "writing", "run", "runs",
        "ran", "running", "execute", "executes", "executed", "executing", "call",
        "calls", "called", "calling", "return", "returns", "returned", "returning",
        "store", "stores", "stored", "storing", "load", "loads", "loaded",
        "loading", "save", "saves", "saved", "saving", "fetch", "fetches",
        "fetched", "fetching", "send", "sends", "sent", "sending", "receive",
        "receives", "received", "receiving", "connect", "connects", "connected",
        "connecting", "disconnect", "disconnects", "disconnected", "disconnecting",
        "process", "processes", "processed", "processing", "handle", "handles",
        "handled", "handling", "parse", "parses", "parsed", "parsing", "compile",
        "compiles", "compiled", "compiling", "build", "builds", "built",
        "building", "test", "tests", "tested", "testing", "deploy", "deploys",
        "deployed", "deploying", "install", "installs", "installed", "installing",
        "configure", "configures", "configured", "configuring", "initialize",
        "initializes", "initialized", "initializing", "shutdown", "shutdowns",
        "terminate", "terminates", "terminated", "terminating",
    ];

    VERB_INDICATORS.contains(&word)
}
3239
/// Lexicon-based stop-word test: articles, prepositions, conjunctions,
/// pronouns, question words, common adverbs, subordinators, and quantifiers.
/// Fix over the previous version: the duplicate entries "that", "which",
/// "who", "whom" and "whose" were removed (membership is unchanged).
fn is_stop_word(word: &str) -> bool {
    const STOP_WORDS: &[&str] = &[
        // Articles and demonstratives.
        "a", "an", "the", "this", "that", "these", "those",
        // Prepositions.
        "at", "in", "on", "to", "for", "of", "from", "by", "with", "about", "into",
        "through", "during", "before", "after", "above", "below", "between",
        "under", "over",
        // Conjunctions.
        "and", "or", "but", "nor", "so", "yet", "both", "either", "neither",
        // Pronouns, possessives, and relatives.
        "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us",
        "them", "my", "your", "his", "its", "our", "their", "mine", "yours",
        "hers", "ours", "theirs", "who", "whom", "whose", "which", "what",
        "whoever", "whatever", "whichever",
        // Question words.
        "how", "when", "where", "why",
        // Common adverbs and hedges.
        "just", "only", "even", "also", "too", "very", "really", "quite",
        "rather", "almost", "already", "still", "always", "never", "ever",
        "often", "sometimes", "usually", "perhaps", "maybe", "probably",
        "possibly", "certainly", "definitely", "actually", "basically",
        "essentially", "simply", "merely",
        // Subordinators and comparatives.
        "as", "if", "then", "than", "because", "although", "though", "unless",
        "until", "while", "whereas", "whether", "since",
        // Quantifiers and determiners.
        "some", "any", "all", "each", "every", "many", "much", "more", "most",
        "few", "less", "least", "other", "another", "such", "same", "different",
        "own", "several",
    ];

    STOP_WORDS.contains(&word)
}
3394
#[cfg(test)]
mod tests {
    // Unit tests for query analysis: POS bucketing (focal entities /
    // modifiers / relations), stemming, negation scoping, IC weighting,
    // and compound-noun / phrase-boost extraction.
    use super::*;

    #[test]
    fn test_noun_detection() {
        let analysis = analyze_query("robot detected obstacle at coordinates");

        // Nouns should land in `focal_entities` with their surface text intact.
        let has_entity = |t: &str| analysis.focal_entities.iter().any(|e| e.text == t);
        assert!(has_entity("robot"));
        assert!(has_entity("obstacle"));
        assert!(has_entity("coordinates"));
    }

    #[test]
    fn test_adjective_detection() {
        let analysis = analyze_query("red large obstacle in path");

        // Adjectives should land in `discriminative_modifiers`.
        let has_modifier = |t: &str| {
            analysis
                .discriminative_modifiers
                .iter()
                .any(|m| m.text == t)
        };
        assert!(has_modifier("red"));
        assert!(has_modifier("large"));
    }

    #[test]
    fn test_verb_detection() {
        let analysis = analyze_query("robot detected obstacle");

        // Verbs should land in `relational_context`.
        assert!(analysis
            .relational_context
            .iter()
            .any(|r| r.text == "detected"));
    }

    #[test]
    fn test_information_content_weights() {
        // Every bucket's members must carry at least ~90% of that POS class's
        // baseline information-content weight.
        let analysis = analyze_query("sensor detected red obstacle");

        assert!(analysis
            .focal_entities
            .iter()
            .all(|e| e.ic_weight >= IC_NOUN * 0.9));
        assert!(analysis
            .discriminative_modifiers
            .iter()
            .all(|m| m.ic_weight >= IC_ADJECTIVE * 0.9));
        assert!(analysis
            .relational_context
            .iter()
            .all(|r| r.ic_weight >= IC_VERB * 0.9));
    }

    #[test]
    fn test_stemming() {
        let analysis = analyze_query("running detection algorithms");

        // Entity stems are Porter-style reductions of the surface forms.
        let has_stem = |s: &str| analysis.focal_entities.iter().any(|e| e.stem == s);
        assert!(has_stem("detect"));
        assert!(has_stem("algorithm"));
    }

    #[test]
    fn test_compound_noun_detection() {
        let analysis = analyze_query("machine learning neural network");

        // Adjacent noun pairs should be recognized as compound nouns.
        let has_compound = |c: &str| analysis.compound_nouns.iter().any(|p| p == c);
        assert!(has_compound("machine learning"));
        assert!(has_compound("neural network"));
    }

    #[test]
    fn test_negation_detection() {
        let analysis = analyze_query("not working correctly");

        assert!(analysis.has_negation);
        // At least one focal entity must be flagged as inside the negation scope.
        assert!(analysis.focal_entities.iter().any(|e| e.negated));
    }

    #[test]
    fn test_negation_scope() {
        let analysis = analyze_query("the sensor is not detecting obstacles properly");

        assert!(analysis.has_negation);
        // The verb immediately governed by "not" must carry the negated flag.
        assert!(analysis
            .relational_context
            .iter()
            .any(|r| r.negated && r.text == "detecting"));
    }

    #[test]
    fn test_all_stems_helper() {
        let analysis = analyze_query("fast robot detecting obstacles");
        let stems = analysis.all_stems();

        // `all_stems` unions stems across every POS bucket.
        for expected in ["robot", "fast", "detect", "obstacl"] {
            assert!(stems.contains(expected));
        }
    }

    #[test]
    fn test_positive_and_negated_stems() {
        let analysis = analyze_query("working memory not failed");

        let positive = analysis.positive_entity_stems();
        // Negated side is computed but not asserted here; its contents depend on
        // scope resolution — presumably "failed" lands there. TODO confirm.
        let _negated = analysis.negated_entity_stems();

        assert!(positive.iter().any(|s| s.contains("memori")));
    }

    #[test]
    fn test_empty_query() {
        let analysis = analyze_query("");

        // An empty query produces no buckets and no negation.
        assert!(analysis.focal_entities.is_empty());
        assert!(analysis.discriminative_modifiers.is_empty());
        assert!(analysis.relational_context.is_empty());
        assert!(!analysis.has_negation);
    }

    #[test]
    fn test_stop_words_filtered() {
        let analysis = analyze_query("the a an is are was were");

        // Stop words never become entities or modifiers; the copulas
        // (is/are/was/were) are still surfaced as relational context.
        assert!(analysis.focal_entities.is_empty());
        assert!(analysis.discriminative_modifiers.is_empty());
        assert!(!analysis.relational_context.is_empty());
    }

    #[test]
    fn test_total_weight_calculation() {
        let analysis = analyze_query("fast robot detecting red obstacles");

        // A query with content words must accumulate strictly positive weight.
        assert!(analysis.total_weight() > 0.0);
    }

    #[test]
    fn test_to_ic_weights() {
        use crate::constants::{IC_ADJECTIVE, IC_NOUN, IC_VERB};

        let analysis = analyze_query("fast robot detecting obstacles");
        let weights = analysis.to_ic_weights();

        assert!(!weights.is_empty(), "Weights should not be empty");

        // Compare with a small tolerance to absorb floating-point noise.
        let close_to = |target| weights.values().any(|&w| (w - target).abs() < 0.01);
        assert!(
            close_to(IC_NOUN) || close_to(IC_ADJECTIVE) || close_to(IC_VERB),
            "Should have at least one IC weight type. Weights: {:?}",
            weights
        );
    }

    #[test]
    fn test_to_phrase_boosts() {
        let analysis = analyze_query("machine learning model for semantic search");
        let phrases = analysis.to_phrase_boosts();

        let found = |name: &str| phrases.iter().any(|(p, _)| p == name);
        assert!(
            found("machine learning") || found("semantic search"),
            "Should detect 'machine learning' or 'semantic search' as phrase. Found: {:?}",
            phrases
        );

        // Boosts are multipliers, so anything below 1.0 would penalize a match.
        for (phrase, boost) in &phrases {
            assert!(
                *boost >= 1.0,
                "Phrase '{}' should have boost >= 1.0, got {}",
                phrase,
                boost
            );
        }
    }

    #[test]
    fn test_to_phrase_boosts_support_group() {
        // Regression case: the noun pair inside a longer question should still
        // be picked up as a boostable phrase.
        let analysis = analyze_query("when did she go to the support group");
        let phrases = analysis.to_phrase_boosts();

        assert!(
            phrases.iter().any(|(p, _)| p == "support group"),
            "Should detect 'support group' as phrase. Found: {:?}",
            phrases
        );
    }
}