// shodh_memory/memory/query_parser.rs
1//! Linguistic Query Parser
2//!
3//! Based on:
4//! - Lioma & Ounis (2006): "Content Load of Part of Speech Blocks"
5//! - Bendersky & Croft (2008): "Discovering Key Concepts in Verbose Queries"
6//! - Porter (1980): Stemming algorithm for term normalization
7//!
8//! Extracts focal entities (nouns), discriminative modifiers (adjectives),
9//! and relational context (verbs) from natural language queries.
10//!
11//! # Polished Features (v2)
12//! - Porter2 stemming for term normalization
13//! - Compound noun detection (bigrams/trigrams)
14//! - Context-aware POS disambiguation
15//! - Negation scope tracking
16//! - IDF-inspired term rarity weighting
17//!
18//! # Shallow Parsing / Chunking (v3)
19//! - Sentence-level chunking for co-occurrence detection
20//! - POS-based entity extraction (all nouns, verbs, adjectives - not just top-N)
21//! - Designed for both query analysis AND memory storage
22//!
23//! # Temporal Extraction (v4)
24//! - Extract dates from natural language text ("May 7, 2023", "yesterday", "last week")
25//! - Detect temporal queries ("when did", "what date", "how long ago")
26//! - Based on TEMPR approach (Hindsight paper achieving 89.6% on LoCoMo)
27
28use crate::constants::{IC_ADJECTIVE, IC_NOUN, IC_VERB};
29use chrono::{DateTime, Datelike, NaiveDate, Utc};
30use rust_stemmers::{Algorithm, Stemmer};
31use serde::{Deserialize, Serialize};
32use std::collections::HashSet;
33
34// ============================================================================
35// SHALLOW PARSING / CHUNKING MODULE
36// ============================================================================
37// This section provides sentence-level chunking and POS-based entity extraction.
38// Unlike YAKE (which ranks by frequency and misses rare discriminative terms),
39// this extracts ALL content words (nouns, verbs, adjectives) for graph building.
40
/// Part-of-speech tag assigned to a word during shallow parsing.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PosTag {
    /// Common noun (content word).
    Noun,
    /// Verb (content word).
    Verb,
    /// Adjective (content word).
    Adjective,
    /// Proper noun — treated as a likely named entity (collected into
    /// `ChunkExtraction::proper_nouns`).
    ProperNoun,
    /// Stop word; excluded from content-word extraction.
    StopWord,
    /// Anything that does not fit the categories above; excluded from
    /// content-word extraction.
    Other,
}
51
/// A word with its POS annotation
#[derive(Debug, Clone)]
pub struct TaggedWord {
    /// Surface form as it appeared in the sentence.
    pub text: String,
    /// Stemmed form, used for co-occurrence pairs and dedup across
    /// inflections (presumably produced by the `rust_stemmers` Porter
    /// stemmer imported above — stemming happens outside this view).
    pub stem: String,
    /// Part-of-speech tag assigned by the shallow parser.
    pub pos: PosTag,
    /// Position within the sentence (0-indexed)
    pub position: usize,
}
61
/// A sentence chunk containing tagged words
///
/// Co-occurrence edges are generated within a single chunk (see
/// `cooccurrence_pairs`), so the sentence is the co-occurrence window.
#[derive(Debug, Clone)]
pub struct SentenceChunk {
    /// Original sentence text
    pub text: String,
    /// Sentence index in document (0-indexed)
    pub sentence_idx: usize,
    /// All tagged words in this sentence
    pub words: Vec<TaggedWord>,
}
72
73impl SentenceChunk {
74    /// Get all nouns in this chunk
75    pub fn nouns(&self) -> Vec<&TaggedWord> {
76        self.words
77            .iter()
78            .filter(|w| matches!(w.pos, PosTag::Noun | PosTag::ProperNoun))
79            .collect()
80    }
81
82    /// Get all verbs in this chunk
83    pub fn verbs(&self) -> Vec<&TaggedWord> {
84        self.words
85            .iter()
86            .filter(|w| w.pos == PosTag::Verb)
87            .collect()
88    }
89
90    /// Get all adjectives in this chunk
91    pub fn adjectives(&self) -> Vec<&TaggedWord> {
92        self.words
93            .iter()
94            .filter(|w| w.pos == PosTag::Adjective)
95            .collect()
96    }
97
98    /// Get all content words (nouns, verbs, adjectives)
99    pub fn content_words(&self) -> Vec<&TaggedWord> {
100        self.words
101            .iter()
102            .filter(|w| {
103                matches!(
104                    w.pos,
105                    PosTag::Noun | PosTag::ProperNoun | PosTag::Verb | PosTag::Adjective
106                )
107            })
108            .collect()
109    }
110
111    /// Generate co-occurrence pairs (words in same sentence get edges)
112    /// Returns pairs of (word1_stem, word2_stem) for graph edge creation
113    pub fn cooccurrence_pairs(&self) -> Vec<(&str, &str)> {
114        let content = self.content_words();
115        let mut pairs = Vec::new();
116
117        for i in 0..content.len() {
118            for j in (i + 1)..content.len() {
119                pairs.push((content[i].stem.as_str(), content[j].stem.as_str()));
120            }
121        }
122
123        pairs
124    }
125}
126
/// Result of chunking a document
///
/// Per-sentence chunks plus de-duplicated stem sets for each content
/// POS class, intended for downstream graph/entity building.
#[derive(Debug, Clone)]
pub struct ChunkExtraction {
    /// All sentence chunks
    pub chunks: Vec<SentenceChunk>,
    /// Unique nouns found (stems)
    pub unique_nouns: HashSet<String>,
    /// Unique verbs found (stems)
    pub unique_verbs: HashSet<String>,
    /// Unique adjectives found (stems)
    pub unique_adjectives: HashSet<String>,
    /// Proper nouns (likely named entities)
    pub proper_nouns: HashSet<String>,
}
141
142impl ChunkExtraction {
143    /// Get all unique content word stems
144    pub fn all_content_stems(&self) -> HashSet<String> {
145        let mut all = self.unique_nouns.clone();
146        all.extend(self.unique_verbs.clone());
147        all.extend(self.unique_adjectives.clone());
148        all.extend(self.proper_nouns.clone());
149        all
150    }
151
152    /// Get all co-occurrence pairs across all chunks
153    pub fn all_cooccurrence_pairs(&self) -> Vec<(String, String)> {
154        let mut all_pairs = Vec::new();
155        for chunk in &self.chunks {
156            for (w1, w2) in chunk.cooccurrence_pairs() {
157                all_pairs.push((w1.to_string(), w2.to_string()));
158            }
159        }
160        all_pairs
161    }
162}
163
164// ============================================================================
165// TEMPORAL EXTRACTION MODULE
166// ============================================================================
167// Extracts temporal references from natural language text.
168// Based on TEMPR approach from Hindsight paper (89.6% accuracy on LoCoMo).
169// Key insight: Temporal filtering is critical for multi-hop retrieval.
170
/// A temporal reference extracted from text
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TemporalRef {
    /// The extracted date (normalized to NaiveDate)
    pub date: NaiveDate,
    /// Original text that was parsed (e.g., "May 7, 2023", "yesterday")
    pub original_text: String,
    /// Confidence in the extraction (0.0-1.0). `extract_temporal_refs`
    /// assigns 0.9 to explicit regex matches, 0.8 to a whole-text parse,
    /// and 0.7 to per-phrase parses.
    pub confidence: f32,
    /// Position in original text (character offset)
    pub position: usize,
    /// Type of temporal reference
    pub ref_type: TemporalRefType,
}
185
/// Type of temporal reference
///
/// Assigned by `classify_temporal_ref` (regex-extracted dates are always
/// tagged `Absolute`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum TemporalRefType {
    /// Absolute date (May 7, 2023) — also the fallback when no other
    /// category matches.
    Absolute,
    /// Relative date (yesterday, last week)
    Relative,
    /// Day of week (on Monday, last Tuesday)
    DayOfWeek,
    /// Month reference (in May, last March)
    Month,
    /// Year reference (in 2023, last year)
    Year,
}
200
/// Result of temporal extraction from text
#[derive(Debug, Clone, Default)]
pub struct TemporalExtraction {
    /// All temporal references found, sorted by their position field
    pub refs: Vec<TemporalRef>,
    /// Earliest date mentioned (None when no refs were found)
    pub earliest: Option<NaiveDate>,
    /// Latest date mentioned (None when no refs were found)
    pub latest: Option<NaiveDate>,
}
211
212impl TemporalExtraction {
213    /// Check if any temporal references were found
214    pub fn has_temporal_refs(&self) -> bool {
215        !self.refs.is_empty()
216    }
217
218    /// Get date range (earliest, latest) if temporal refs exist
219    pub fn date_range(&self) -> Option<(NaiveDate, NaiveDate)> {
220        match (self.earliest, self.latest) {
221            (Some(e), Some(l)) => Some((e, l)),
222            (Some(e), None) => Some((e, e)),
223            (None, Some(l)) => Some((l, l)),
224            (None, None) => None,
225        }
226    }
227}
228
229/// Extract temporal references from text
230///
231/// Uses date_time_parser crate for natural language date parsing.
232/// Handles:
233/// - Absolute dates: "May 7, 2023", "2023-05-07", "07/05/2023"
234/// - Relative dates: "yesterday", "last week", "3 days ago"
235/// - Day of week: "on Monday", "last Tuesday"
236/// - Month/year: "in May", "last year", "2023"
237pub fn extract_temporal_refs(text: &str) -> TemporalExtraction {
238    let now = Utc::now();
239    let mut refs = Vec::new();
240    let mut earliest: Option<NaiveDate> = None;
241    let mut latest: Option<NaiveDate> = None;
242
243    // Helper to validate date is in reasonable range (1900-2100)
244    let is_valid_date = |date: &NaiveDate| -> bool {
245        let year = date.year();
246        year >= 1900 && year <= 2100
247    };
248
249    // Try dateparser on the full text (returns Result, never panics)
250    if let Ok(parsed) = dateparser::parse(text) {
251        let date = parsed.date_naive();
252        if is_valid_date(&date) {
253            refs.push(TemporalRef {
254                date,
255                original_text: text.to_string(),
256                confidence: 0.8,
257                position: 0,
258                ref_type: classify_temporal_ref(text, &date, &now),
259            });
260            update_bounds(&mut earliest, &mut latest, date);
261        }
262    }
263
264    // Try parsing individual sentences/phrases
265    for (pos, sentence) in split_temporal_phrases(text).iter().enumerate() {
266        if let Ok(parsed) = dateparser::parse(sentence) {
267            let date = parsed.date_naive();
268            if !is_valid_date(&date) {
269                continue;
270            }
271            if refs.iter().any(|r| r.date == date) {
272                continue;
273            }
274            refs.push(TemporalRef {
275                date,
276                original_text: sentence.to_string(),
277                confidence: 0.7,
278                position: pos,
279                ref_type: classify_temporal_ref(sentence, &date, &now),
280            });
281            update_bounds(&mut earliest, &mut latest, date);
282        }
283    }
284
285    // Also use regex-based extraction for explicit date patterns
286    let explicit_dates = extract_explicit_dates(text);
287    for (date, original, pos) in explicit_dates {
288        if !is_valid_date(&date) {
289            continue;
290        }
291        if refs.iter().any(|r| r.date == date) {
292            continue;
293        }
294        refs.push(TemporalRef {
295            date,
296            original_text: original,
297            confidence: 0.9,
298            position: pos,
299            ref_type: TemporalRefType::Absolute,
300        });
301        update_bounds(&mut earliest, &mut latest, date);
302    }
303
304    // Sort by position in text
305    refs.sort_by_key(|r| r.position);
306
307    TemporalExtraction {
308        refs,
309        earliest,
310        latest,
311    }
312}
313
314/// Classify the type of temporal reference
315fn classify_temporal_ref(text: &str, date: &NaiveDate, now: &DateTime<Utc>) -> TemporalRefType {
316    let text_lower = text.to_lowercase();
317    let today = now.date_naive();
318
319    // Check for relative indicators
320    if text_lower.contains("yesterday")
321        || text_lower.contains("ago")
322        || text_lower.contains("last")
323        || text_lower.contains("previous")
324        || text_lower.contains("before")
325        || text_lower.contains("earlier")
326    {
327        return TemporalRefType::Relative;
328    }
329
330    // Check for day of week
331    let days = [
332        "monday",
333        "tuesday",
334        "wednesday",
335        "thursday",
336        "friday",
337        "saturday",
338        "sunday",
339    ];
340    if days.iter().any(|d| text_lower.contains(d)) {
341        return TemporalRefType::DayOfWeek;
342    }
343
344    // Check for month names without day
345    let months = [
346        "january",
347        "february",
348        "march",
349        "april",
350        "may",
351        "june",
352        "july",
353        "august",
354        "september",
355        "october",
356        "november",
357        "december",
358    ];
359    let has_month = months.iter().any(|m| text_lower.contains(m));
360    let has_day = text.chars().any(|c| c.is_ascii_digit());
361
362    if has_month && !has_day {
363        return TemporalRefType::Month;
364    }
365
366    // Check for year-only reference
367    if text.len() == 4 && text.chars().all(|c| c.is_ascii_digit()) {
368        return TemporalRefType::Year;
369    }
370
371    // If date is today or very close, might be relative
372    let diff = (today - *date).num_days().abs();
373    if diff <= 7 && text_lower.contains("this") {
374        return TemporalRefType::Relative;
375    }
376
377    TemporalRefType::Absolute
378}
379
/// Split text into temporal-relevant phrases
///
/// Splits on the FIRST temporal marker (or punctuation sequence) found in
/// the text, keeping only trimmed parts longer than 3 characters. If that
/// yields nothing, falls back to splitting on sentence boundaries ('.').
fn split_temporal_phrases(text: &str) -> Vec<String> {
    // Markers that typically precede a temporal expression, plus
    // punctuation; checked in order, first one present wins.
    let markers = [
        " on ", " in ", " at ", " during ", " since ", " until ", " before ", " after ",
        " around ", ", ", ". ", "! ", "? ",
    ];

    // Keep a fragment only if, after trimming, it is longer than 3 chars.
    let keep = |part: &str| {
        let trimmed = part.trim();
        if trimmed.len() > 3 {
            Some(trimmed.to_string())
        } else {
            None
        }
    };

    let mut phrases: Vec<String> = match markers.iter().find(|m| text.contains(**m)) {
        Some(marker) => text.split(marker).filter_map(keep).collect(),
        None => Vec::new(),
    };

    // Fallback: no marker produced usable fragments — try sentence
    // boundaries instead.
    if phrases.is_empty() {
        phrases = text.split('.').filter_map(keep).collect();
    }

    phrases
}
416
417/// Extract explicit date patterns that date_time_parser might miss
418fn extract_explicit_dates(text: &str) -> Vec<(NaiveDate, String, usize)> {
419    use regex::Regex;
420
421    let mut results = Vec::new();
422
423    // Pattern: "Month Day, Year" (e.g., "May 7, 2023")
424    let month_day_year =
425        Regex::new(r"(?i)(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),?\s+(\d{4})")
426            .unwrap();
427
428    for cap in month_day_year.captures_iter(text) {
429        let month_str = &cap[1];
430        let day: u32 = cap[2].parse().unwrap_or(1);
431        let year: i32 = cap[3].parse().unwrap_or(2000);
432        let month = month_to_num(month_str);
433
434        if let Some(date) = NaiveDate::from_ymd_opt(year, month, day) {
435            let pos = cap.get(0).map(|m| m.start()).unwrap_or(0);
436            results.push((date, cap[0].to_string(), pos));
437        }
438    }
439
440    // Pattern: "YYYY-MM-DD"
441    let iso_date = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
442    for cap in iso_date.captures_iter(text) {
443        let year: i32 = cap[1].parse().unwrap_or(2000);
444        let month: u32 = cap[2].parse().unwrap_or(1);
445        let day: u32 = cap[3].parse().unwrap_or(1);
446
447        if let Some(date) = NaiveDate::from_ymd_opt(year, month, day) {
448            let pos = cap.get(0).map(|m| m.start()).unwrap_or(0);
449            results.push((date, cap[0].to_string(), pos));
450        }
451    }
452
453    // Pattern: "MM/DD/YYYY" or "DD/MM/YYYY" (assume US format MM/DD)
454    let slash_date = Regex::new(r"(\d{1,2})/(\d{1,2})/(\d{4})").unwrap();
455    for cap in slash_date.captures_iter(text) {
456        let month: u32 = cap[1].parse().unwrap_or(1);
457        let day: u32 = cap[2].parse().unwrap_or(1);
458        let year: i32 = cap[3].parse().unwrap_or(2000);
459
460        if let Some(date) = NaiveDate::from_ymd_opt(year, month, day) {
461            let pos = cap.get(0).map(|m| m.start()).unwrap_or(0);
462            results.push((date, cap[0].to_string(), pos));
463        }
464    }
465
466    results
467}
468
/// Convert month name to number
///
/// Accepts full English month names and common abbreviations, case
/// insensitively. Unrecognized input falls back to 1 (January), matching
/// the lenient parsing style of the callers.
fn month_to_num(month: &str) -> u32 {
    const MONTHS: [(&str, u32); 24] = [
        ("january", 1),
        ("jan", 1),
        ("february", 2),
        ("feb", 2),
        ("march", 3),
        ("mar", 3),
        ("april", 4),
        ("apr", 4),
        ("may", 5),
        ("june", 6),
        ("jun", 6),
        ("july", 7),
        ("jul", 7),
        ("august", 8),
        ("aug", 8),
        ("september", 9),
        ("sep", 9),
        ("sept", 9),
        ("october", 10),
        ("oct", 10),
        ("november", 11),
        ("nov", 11),
        ("december", 12),
        ("dec", 12),
    ];

    let key = month.to_lowercase();
    MONTHS
        .iter()
        .find(|(name, _)| *name == key)
        .map_or(1, |&(_, num)| num)
}
487
488/// Update earliest/latest bounds
489fn update_bounds(
490    earliest: &mut Option<NaiveDate>,
491    latest: &mut Option<NaiveDate>,
492    date: NaiveDate,
493) {
494    match earliest {
495        Some(e) if date < *e => *earliest = Some(date),
496        None => *earliest = Some(date),
497        _ => {}
498    }
499    match latest {
500        Some(l) if date > *l => *latest = Some(date),
501        None => *latest = Some(date),
502        _ => {}
503    }
504}
505
506// ============================================================================
507// TEMPORAL QUERY DETECTION
508// ============================================================================
509// Detect when a query is asking about time/dates.
510
/// Query temporal intent
///
/// Produced by `detect_temporal_intent`; drives whether retrieval filters
/// by time (`requires_temporal_filtering`) or answers with a date
/// (`asks_for_temporal_answer`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TemporalIntent {
    /// Query is asking "when" something happened
    WhenQuestion,
    /// Query references a specific time period
    SpecificTime,
    /// Query asks about temporal ordering (before/after)
    Ordering,
    /// Query asks about duration (how long)
    Duration,
    /// No temporal intent detected
    None,
}
525
526/// Detect temporal intent in a query
527pub fn detect_temporal_intent(query: &str) -> TemporalIntent {
528    let query_lower = query.to_lowercase();
529
530    // "When" questions are highest priority
531    if query_lower.starts_with("when")
532        || query_lower.contains(" when ")
533        || query_lower.contains("what date")
534        || query_lower.contains("what day")
535        || query_lower.contains("what time")
536    {
537        return TemporalIntent::WhenQuestion;
538    }
539
540    // Duration questions
541    if query_lower.contains("how long")
542        || query_lower.contains("how many days")
543        || query_lower.contains("how many weeks")
544        || query_lower.contains("how many months")
545        || query_lower.contains("how many years")
546    {
547        return TemporalIntent::Duration;
548    }
549
550    // Ordering questions
551    if query_lower.contains("before or after")
552        || query_lower.contains("first or")
553        || query_lower.contains("earlier or later")
554        || query_lower.contains("which came first")
555        || query_lower.contains("in what order")
556    {
557        return TemporalIntent::Ordering;
558    }
559
560    // Specific time references
561    let time_indicators = [
562        "yesterday",
563        "today",
564        "last week",
565        "last month",
566        "last year",
567        "this week",
568        "this month",
569        "this year",
570        "in january",
571        "in february",
572        "in march",
573        "in april",
574        "in may",
575        "in june",
576        "in july",
577        "in august",
578        "in september",
579        "in october",
580        "in november",
581        "in december",
582        "on monday",
583        "on tuesday",
584        "on wednesday",
585        "on thursday",
586        "on friday",
587        "on saturday",
588        "on sunday",
589        " ago",
590        " days ago",
591        " weeks ago",
592        " months ago",
593        " years ago",
594    ];
595
596    if time_indicators.iter().any(|t| query_lower.contains(t)) {
597        return TemporalIntent::SpecificTime;
598    }
599
600    // Check for date patterns
601    let extraction = extract_temporal_refs(query);
602    if extraction.has_temporal_refs() {
603        return TemporalIntent::SpecificTime;
604    }
605
606    TemporalIntent::None
607}
608
609/// Check if a query requires temporal filtering for accurate retrieval
610///
611/// Returns true if the query has a temporal component that should be used
612/// to filter/rank memories by their temporal references.
613///
614/// IMPORTANT: "When did X happen?" questions (WhenQuestion) return FALSE
615/// because they are asking FOR a date, not filtering BY a date.
616/// We should search semantically for X and extract the date from results.
617///
618/// "What happened in May 2023?" (SpecificTime) returns TRUE because
619/// it's filtering BY a specific time period.
620pub fn requires_temporal_filtering(query: &str) -> bool {
621    let intent = detect_temporal_intent(query);
622    matches!(
623        intent,
624        // WhenQuestion is EXCLUDED - it asks FOR a date, not BY a date
625        TemporalIntent::SpecificTime | TemporalIntent::Duration | TemporalIntent::Ordering
626    )
627}
628
629/// Check if a query is asking FOR a temporal answer (when did X happen?)
630///
631/// These queries should use semantic search on the event X, then extract
632/// the date from the retrieved content.
633pub fn asks_for_temporal_answer(query: &str) -> bool {
634    matches!(detect_temporal_intent(query), TemporalIntent::WhenQuestion)
635}
636
637// ============================================================================
638// ATTRIBUTE QUERY DETECTION
639// ============================================================================
640// Detect queries asking for specific attributes of entities.
641// These queries need fact-first retrieval, not semantic similarity.
642//
643// Examples:
644// - "What is Caroline's relationship status?" → entity=Caroline, attribute=relationship_status
645// - "What is Melanie's job?" → entity=Melanie, attribute=job
646// - "Where does Caroline live?" → entity=Caroline, attribute=location
647
/// Type of query for routing to appropriate retrieval strategy
///
/// Produced by `classify_query`, which checks attribute patterns first,
/// then temporal ("when") questions, and falls back to `Exploratory`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum QueryType {
    /// Attribute query: "What is X's Y?" - needs fact lookup
    Attribute(AttributeQuery),
    /// Temporal query: "When did X do Y?" - needs temporal filtering
    Temporal,
    /// Exploratory query: general semantic search
    Exploratory,
}
658
/// Extracted attribute query components
///
/// Built by `detect_attribute_query` and its pattern helpers.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AttributeQuery {
    /// The entity being asked about (e.g., "Caroline"); first letter capitalized
    pub entity: String,
    /// The attribute being requested, normalized to snake_case
    /// (e.g., "relationship_status")
    pub attribute: String,
    /// Attribute synonyms for matching (e.g., ["status", "single", "married", "dating"])
    pub attribute_synonyms: Vec<String>,
    /// Original query text
    // NOTE(review): the possessive/of patterns store the lowercased,
    // trimmed query here, while the where/how-old/is patterns store the
    // raw query — confirm which form downstream consumers expect.
    pub original_query: String,
}
671
672/// Classify a query to determine retrieval strategy
673pub fn classify_query(query: &str) -> QueryType {
674    // First check for attribute queries
675    if let Some(attr_query) = detect_attribute_query(query) {
676        return QueryType::Attribute(attr_query);
677    }
678
679    // Check for temporal queries
680    if asks_for_temporal_answer(query) {
681        return QueryType::Temporal;
682    }
683
684    // Default to exploratory
685    QueryType::Exploratory
686}
687
688/// Detect and extract attribute query components
689///
690/// Patterns detected:
691/// - "What is X's Y?" / "What is X's Y"
692/// - "What is the Y of X?"
693/// - "What Y does X have?"
694/// - "Is X Y?" (boolean attribute)
695/// - "Where does/is X?" (location attribute)
696/// - "How old is X?" (age attribute)
697pub fn detect_attribute_query(query: &str) -> Option<AttributeQuery> {
698    let query_lower = query.to_lowercase();
699    let query_trimmed = query_lower.trim().trim_end_matches('?');
700
701    // Pattern 1: "What is X's Y" / "What's X's Y"
702    if let Some(result) = extract_possessive_pattern(query_trimmed) {
703        return Some(result);
704    }
705
706    // Pattern 2: "What is the Y of X"
707    if let Some(result) = extract_of_pattern(query_trimmed) {
708        return Some(result);
709    }
710
711    // Pattern 3: "Where does/is X" (location attribute)
712    if query_lower.starts_with("where does") || query_lower.starts_with("where is") {
713        if let Some(entity) = extract_entity_after_verb(query_trimmed) {
714            return Some(AttributeQuery {
715                entity,
716                attribute: "location".to_string(),
717                attribute_synonyms: vec![
718                    "live".to_string(),
719                    "lives".to_string(),
720                    "living".to_string(),
721                    "resides".to_string(),
722                    "located".to_string(),
723                    "address".to_string(),
724                    "home".to_string(),
725                    "place".to_string(),
726                ],
727                original_query: query.to_string(),
728            });
729        }
730    }
731
732    // Pattern 4: "How old is X" (age attribute)
733    if query_lower.starts_with("how old") {
734        if let Some(entity) = extract_entity_after_verb(query_trimmed) {
735            return Some(AttributeQuery {
736                entity,
737                attribute: "age".to_string(),
738                attribute_synonyms: vec![
739                    "age".to_string(),
740                    "years old".to_string(),
741                    "born".to_string(),
742                    "birthday".to_string(),
743                ],
744                original_query: query.to_string(),
745            });
746        }
747    }
748
749    // Pattern 5: "Is X married/single/..." (boolean relationship status)
750    if query_lower.starts_with("is ") {
751        let status_words = [
752            "married",
753            "single",
754            "divorced",
755            "engaged",
756            "dating",
757            "in a relationship",
758        ];
759        for status in &status_words {
760            if query_lower.contains(status) {
761                // Extract entity between "is" and status word
762                let after_is = &query_trimmed[3..]; // Skip "is "
763                if let Some(pos) = after_is.find(status) {
764                    let entity = after_is[..pos].trim().to_string();
765                    if !entity.is_empty()
766                        && entity.chars().next().map_or(false, |c| c.is_alphabetic())
767                    {
768                        return Some(AttributeQuery {
769                            entity: capitalize_first(&entity),
770                            attribute: "relationship_status".to_string(),
771                            attribute_synonyms: vec![
772                                "single".to_string(),
773                                "married".to_string(),
774                                "divorced".to_string(),
775                                "engaged".to_string(),
776                                "dating".to_string(),
777                                "relationship".to_string(),
778                                "partner".to_string(),
779                                "spouse".to_string(),
780                                "status".to_string(),
781                            ],
782                            original_query: query.to_string(),
783                        });
784                    }
785                }
786            }
787        }
788    }
789
790    None
791}
792
793/// Extract "X's Y" pattern from query
794fn extract_possessive_pattern(query: &str) -> Option<AttributeQuery> {
795    // Find possessive marker ('s or s')
796    let possessive_patterns = [
797        ("what is ", "'s "),
798        ("what's ", "'s "),
799        ("what is ", "' "),
800        ("what's ", "' "),
801    ];
802
803    for (prefix, possessive) in possessive_patterns {
804        if let Some(start) = query.find(prefix) {
805            let after_prefix = &query[start + prefix.len()..];
806            if let Some(pos_pos) = after_prefix.find(possessive) {
807                let entity = after_prefix[..pos_pos].trim();
808                let attribute = after_prefix[pos_pos + possessive.len()..].trim();
809
810                if !entity.is_empty() && !attribute.is_empty() {
811                    return Some(AttributeQuery {
812                        entity: capitalize_first(entity),
813                        attribute: normalize_attribute(attribute),
814                        attribute_synonyms: get_attribute_synonyms(attribute),
815                        original_query: query.to_string(),
816                    });
817                }
818            }
819        }
820    }
821
822    None
823}
824
825/// Extract "the Y of X" pattern from query
826fn extract_of_pattern(query: &str) -> Option<AttributeQuery> {
827    // Pattern: "what is the Y of X"
828    let prefixes = ["what is the ", "what's the "];
829
830    for prefix in prefixes {
831        if let Some(start) = query.find(prefix) {
832            let after_prefix = &query[start + prefix.len()..];
833            if let Some(of_pos) = after_prefix.find(" of ") {
834                let attribute = after_prefix[..of_pos].trim();
835                let entity = after_prefix[of_pos + 4..].trim();
836
837                if !entity.is_empty() && !attribute.is_empty() {
838                    return Some(AttributeQuery {
839                        entity: capitalize_first(entity),
840                        attribute: normalize_attribute(attribute),
841                        attribute_synonyms: get_attribute_synonyms(attribute),
842                        original_query: query.to_string(),
843                    });
844                }
845            }
846        }
847    }
848
849    None
850}
851
852/// Extract entity after a verb like "is" or "does"
853fn extract_entity_after_verb(query: &str) -> Option<String> {
854    let verbs = [" is ", " does "];
855    for verb in verbs {
856        if let Some(pos) = query.find(verb) {
857            let after_verb = query[pos + verb.len()..].trim();
858            // Take first word(s) as entity (stop at common words)
859            let stop_words = ["live", "work", "do", "have", "go", "stay", "come"];
860            let words: Vec<&str> = after_verb.split_whitespace().collect();
861            let mut entity_words = Vec::new();
862            for word in words {
863                if stop_words.contains(&word) {
864                    break;
865                }
866                entity_words.push(word);
867            }
868            if !entity_words.is_empty() {
869                return Some(capitalize_first(&entity_words.join(" ")));
870            }
871        }
872    }
873    None
874}
875
/// Normalize an attribute name (e.g., "relationship status" → "relationship_status")
///
/// Trims, lowercases, and maps spaces and hyphens to underscores in a
/// single character pass.
fn normalize_attribute(attr: &str) -> String {
    attr.trim()
        .to_lowercase()
        .chars()
        .map(|c| if c == ' ' || c == '-' { '_' } else { c })
        .collect()
}
883
/// Get synonyms for common attributes
///
/// Matches the attribute (case-insensitively, by substring) against known
/// families — relationship status, job, name, age — and returns the
/// corresponding synonym list. Unknown attributes fall back to the
/// lowercased attribute plus its space-separated variant.
fn get_attribute_synonyms(attribute: &str) -> Vec<String> {
    let attr_lower = attribute.to_lowercase();
    let owned = |words: &[&str]| -> Vec<String> {
        words.iter().map(|w| w.to_string()).collect()
    };
    let mentions_any =
        |keys: &[&str]| keys.iter().any(|k| attr_lower.contains(k));

    // Relationship status synonyms
    if mentions_any(&["relationship", "status", "marital"]) {
        return owned(&[
            "single",
            "married",
            "divorced",
            "engaged",
            "dating",
            "relationship",
            "partner",
            "spouse",
            "single parent",
            "status",
            "marital",
        ]);
    }

    // Job/occupation synonyms
    if mentions_any(&["job", "occupation", "work"]) {
        return owned(&[
            "job",
            "work",
            "occupation",
            "profession",
            "career",
            "employed",
            "works as",
        ]);
    }

    // Name synonyms
    if attr_lower.contains("name") {
        return owned(&["name", "called", "named"]);
    }

    // Age synonyms
    if attr_lower.contains("age") {
        return owned(&["age", "old", "years", "born", "birthday"]);
    }

    // Default: the attribute itself plus its space-separated variation.
    vec![attr_lower.clone(), attr_lower.replace('_', " ")]
}
947
/// Capitalize first letter of a string
///
/// Returns the empty string for empty input. Uses Unicode-aware
/// uppercasing, so a single char may expand to multiple chars.
fn capitalize_first(s: &str) -> String {
    let mut rest = s.chars();
    rest.next()
        .map(|first| first.to_uppercase().chain(rest).collect::<String>())
        .unwrap_or_default()
}
956
957/// Extract chunks from text using shallow parsing
958///
959/// This function:
960/// 1. Splits text into sentences
961/// 2. Tags each word with its POS (noun, verb, adjective, etc.)
962/// 3. Returns chunks that can be used for:
963///    - Entity extraction (all nouns, not just YAKE top-N)
964///    - Co-occurrence edge creation (words in same sentence)
965///
966/// Unlike YAKE, this doesn't rank by frequency - ALL content words are extracted.
967pub fn extract_chunks(text: &str) -> ChunkExtraction {
968    let stemmer = Stemmer::create(Algorithm::English);
969    let sentences = split_sentences(text);
970
971    let mut chunks = Vec::with_capacity(sentences.len());
972    let mut unique_nouns = HashSet::new();
973    let mut unique_verbs = HashSet::new();
974    let mut unique_adjectives = HashSet::new();
975    let mut proper_nouns = HashSet::new();
976
977    for (sentence_idx, sentence) in sentences.iter().enumerate() {
978        let words = tokenize_with_case(sentence);
979        let mut tagged_words = Vec::with_capacity(words.len());
980
981        for (position, (word, is_capitalized)) in words.iter().enumerate() {
982            let word_lower = word.to_lowercase();
983
984            // Skip very short words
985            if word_lower.len() < 2 {
986                continue;
987            }
988
989            let stem = stemmer.stem(&word_lower).to_string();
990            let pos = classify_pos_for_chunking(&word_lower, *is_capitalized, position, &words);
991
992            match pos {
993                PosTag::Noun => {
994                    unique_nouns.insert(stem.clone());
995                }
996                PosTag::Verb => {
997                    unique_verbs.insert(stem.clone());
998                }
999                PosTag::Adjective => {
1000                    unique_adjectives.insert(stem.clone());
1001                }
1002                PosTag::ProperNoun => {
1003                    proper_nouns.insert(word.clone());
1004                    unique_nouns.insert(stem.clone()); // Also add to nouns
1005                }
1006                _ => {}
1007            }
1008
1009            if pos != PosTag::StopWord {
1010                tagged_words.push(TaggedWord {
1011                    text: word.clone(),
1012                    stem,
1013                    pos,
1014                    position,
1015                });
1016            }
1017        }
1018
1019        if !tagged_words.is_empty() {
1020            chunks.push(SentenceChunk {
1021                text: sentence.clone(),
1022                sentence_idx,
1023                words: tagged_words,
1024            });
1025        }
1026    }
1027
1028    ChunkExtraction {
1029        chunks,
1030        unique_nouns,
1031        unique_verbs,
1032        unique_adjectives,
1033        proper_nouns,
1034    }
1035}
1036
/// Split text into sentences on `.`, `!`, `?`, and newlines.
///
/// Smarter than a plain `split('.')`: common abbreviations ("Dr.", "Mr.",
/// "etc.") do not terminate a sentence, and fragments of three characters
/// or fewer are merged into the following sentence.
fn split_sentences(text: &str) -> Vec<String> {
    // Abbreviations whose trailing period should not end a sentence.
    const ABBREVIATIONS: &[&str] = &[
        "mr", "mrs", "ms", "dr", "prof", "sr", "jr", "vs", "etc", "eg", "ie", "st", "ave",
        "rd", "blvd",
    ];

    let mut sentences = Vec::new();
    let mut buffer = String::new();

    for ch in text.chars() {
        buffer.push(ch);

        if !matches!(ch, '.' | '!' | '?' | '\n') {
            continue;
        }

        let candidate = buffer.trim();
        // Too short to be a real sentence: keep accumulating.
        if candidate.is_empty() || candidate.len() <= 3 {
            continue;
        }

        // Alphabetic characters of the final token, for abbreviation matching.
        let tail: String = candidate
            .split_whitespace()
            .last()
            .unwrap_or("")
            .chars()
            .filter(|c| c.is_alphabetic())
            .collect();
        let ends_in_abbrev = ABBREVIATIONS.contains(&tail.to_lowercase().as_str());

        // A newline always terminates; a period after an abbreviation does not.
        if !ends_in_abbrev || ch == '\n' {
            sentences.push(candidate.to_string());
            buffer.clear();
        }
    }

    // Flush whatever remains as the final sentence.
    let leftover = buffer.trim();
    if !leftover.is_empty() {
        sentences.push(leftover.to_string());
    }

    sentences
}
1094
/// Tokenize preserving case information for proper noun detection
///
/// Splits on whitespace, strips surrounding punctuation from each token
/// (keeping apostrophes, so "don't" survives intact), and pairs each
/// cleaned token with whether its first character is uppercase.
/// Tokens that are empty after cleaning are dropped.
fn tokenize_with_case(text: &str) -> Vec<(String, bool)> {
    let mut tokens = Vec::new();
    for raw in text.split_whitespace() {
        let cleaned = raw.trim_matches(|c: char| !c.is_alphanumeric() && c != '\'');
        if cleaned.is_empty() {
            continue;
        }
        let starts_upper = matches!(cleaned.chars().next(), Some(c) if c.is_uppercase());
        tokens.push((cleaned.to_string(), starts_upper));
    }
    tokens
}
1112
1113/// Classify POS for chunking purposes
1114fn classify_pos_for_chunking(
1115    word: &str,
1116    is_capitalized: bool,
1117    position: usize,
1118    _context: &[(String, bool)],
1119) -> PosTag {
1120    // Check stop words first
1121    if is_stop_word(word) {
1122        return PosTag::StopWord;
1123    }
1124
1125    // Capitalized words not at sentence start are likely proper nouns
1126    if is_capitalized && position > 0 {
1127        return PosTag::ProperNoun;
1128    }
1129
1130    // Check verb indicators
1131    if is_verb(word) {
1132        return PosTag::Verb;
1133    }
1134
1135    // Check adjective indicators
1136    if is_adjective(word) {
1137        return PosTag::Adjective;
1138    }
1139
1140    // Check noun indicators
1141    if is_noun_for_chunking(word) {
1142        return PosTag::Noun;
1143    }
1144
1145    // Default: if it's a content word (not too short, not a stop word), treat as noun
1146    // This is the "80% rule" - unknown words are usually nouns in English
1147    if word.len() >= 4 {
1148        return PosTag::Noun;
1149    }
1150
1151    PosTag::Other
1152}
1153
/// Noun detection for chunking (more aggressive than query parsing)
///
/// Checks the word against a large noun vocabulary (domain-specific plus
/// conversational), then against derivational noun suffixes, then against
/// an -er/-or agent-noun heuristic.
///
/// Performance fix: the vocabulary lookup now uses a `HashSet` built once
/// on first use (via `OnceLock`), replacing the original O(n) linear scan
/// over a ~600-entry slice that ran for every classified word. Duplicate
/// entries in the original list collapse harmlessly in the set; the
/// classification behavior is unchanged.
fn is_noun_for_chunking(word: &str) -> bool {
    static NOUN_SET: std::sync::OnceLock<HashSet<&'static str>> = std::sync::OnceLock::new();
    let noun_set = NOUN_SET.get_or_init(|| {
        const WORDS: &[&str] = &[
            // Memory/graph domain
            "memory", "graph", "node", "edge", "entity", "embedding", "vector", "index",
            "query", "retrieval", "activation", "potentiation", "consolidation", "decay",
            "strength", "weight", "threshold", "importance",
            // Robotics
            "robot", "drone", "sensor", "lidar", "camera", "motor", "actuator", "obstacle",
            "path", "waypoint", "location", "coordinates", "position", "battery", "power",
            "energy", "voltage", "current",
            // Systems
            "system", "module", "component", "unit", "device", "temperature", "pressure",
            "humidity", "speed", "velocity", "signal", "communication", "network", "link",
            "connection", "navigation", "guidance", "control", "steering", "data",
            "information", "message", "command", "response",
            // Software engineering
            "function", "method", "class", "struct", "interface", "package", "library",
            "framework", "api", "endpoint", "request", "error", "exception", "bug", "fix",
            "feature", "test", "benchmark", "performance", "latency", "throughput", "cache",
            "buffer", "queue", "stack", "heap", "thread", "process", "server", "client",
            "database", "table", "column", "row", "schema", "migration", "deployment",
            "container", "cluster", "replica",
            // People, time, tasks, status
            "person", "people", "user", "agent", "operator", "time", "date", "day", "hour",
            "minute", "second", "area", "zone", "region", "sector", "space", "task",
            "mission", "goal", "objective", "target", "warning", "alert", "notification",
            "level", "status", "state", "condition", "mode", "type", "kind", "version",
            "release", "update", "change", "result", "output", "input", "value", "key",
            "name", "id", "identifier",
            // Conversational: places and transport
            "sunrise", "sunset", "lake", "mountain", "beach", "forest", "garden", "park",
            "city", "town", "village", "country", "house", "home", "room", "building",
            "street", "road", "car", "bus", "train", "plane", "boat", "bicycle",
            // Food and drink
            "food", "drink", "water", "coffee", "tea", "breakfast", "lunch", "dinner",
            "meal",
            // Media, leisure, events
            "book", "movie", "music", "song", "art", "painting", "photo", "picture",
            "video", "game", "sport", "team", "player", "match", "race", "trip",
            "vacation", "holiday", "weekend", "morning", "evening", "night", "week",
            "month", "year", "birthday", "wedding", "party", "event", "meeting", "lesson",
            "course", "school", "college", "university", "job", "work", "office",
            "company", "business", "project", "plan", "idea",
            // People and relationships
            "thought", "feeling", "emotion", "love", "friend", "family", "parent", "child",
            "kid", "baby", "mother", "father", "sister", "brother", "wife", "husband",
            "partner", "group", "community", "society", "culture", "tradition",
            // Communication
            "story", "history", "news", "article", "blog", "post", "comment", "email",
            "letter", "phone", "call", "text", "chat", "conversation", "discussion",
            "talk", "speech", "presentation", "question", "answer", "problem", "solution",
            "issue", "challenge", "opportunity", "success", "failure",
            // Abstract concepts
            "experience", "skill", "knowledge", "wisdom", "truth", "fact", "opinion",
            "belief", "principle", "rule", "law", "policy", "decision", "choice",
            "option", "alternative", "reason", "cause", "effect", "impact", "influence",
            "authority", "responsibility", "duty", "right", "freedom", "justice", "peace",
            "war", "conflict", "agreement", "contract", "deal", "price", "cost", "money",
            "dollar", "euro", "pound", "budget", "investment", "profit", "loss", "risk",
            "reward", "benefit", "advantage", "disadvantage", "weakness", "threat",
            "strategy", "tactic", "approach", "technique", "tool", "resource", "material",
            "product", "service", "quality", "quantity", "size", "shape", "color",
            "sound", "smell", "taste", "touch", "sight", "sense", "mind", "body", "heart",
            "soul", "spirit",
            // Health and wellbeing
            "health", "illness", "disease", "medicine", "doctor", "nurse", "hospital",
            "clinic", "therapy", "treatment", "care", "support", "help", "advice",
            "counseling", "coaching", "mentoring", "training", "education", "learning",
            "teaching", "research", "study", "experiment", "discovery", "invention",
            "innovation", "technology", "science", "math", "physics", "chemistry",
            "biology", "psychology", "sociology", "philosophy", "religion",
            "spirituality", "meditation", "yoga", "exercise", "fitness", "diet",
            "nutrition", "sleep", "rest", "relaxation",
            // Emotions and virtues
            "stress", "anxiety", "depression", "happiness", "joy", "sadness", "anger",
            "fear", "surprise", "disgust", "trust", "hope", "faith", "courage",
            "confidence", "pride", "shame", "guilt", "regret", "gratitude", "empathy",
            "compassion", "kindness", "generosity", "honesty", "integrity", "loyalty",
            "respect", "tolerance", "patience", "persistence", "determination",
            "motivation", "inspiration", "creativity", "imagination", "curiosity",
            "wonder", "beauty",
            // Arts and expression
            "dance", "theater", "film", "literature", "poetry", "writing", "reading",
            "speaking", "listening", "expression", "interpretation", "understanding",
            "meaning", "purpose", "dream", "vision", "passion", "interest", "hobby",
            "activity", "adventure", "journey", "way", "direction", "destination",
            "origin", "beginning", "end", "start", "finish", "progress", "growth",
            "development", "evolution", "transformation", "transition", "shift",
            "movement", "action", "reaction", "behavior", "habit", "pattern", "routine",
            "schedule", "procedure", "step", "stage", "phase", "cycle", "circle", "loop",
            "sequence", "order", "arrangement", "organization", "structure",
            "relationship", "bond", "tie", "association", "affiliation", "membership",
            "participation", "involvement", "engagement", "commitment", "dedication",
            "devotion", "allegiance", "backing", "endorsement", "approval", "acceptance",
            "recognition", "acknowledgment", "appreciation", "thanks", "praise",
            "compliment", "criticism", "feedback", "evaluation", "assessment",
            "judgment", "view", "perspective", "angle", "aspect", "dimension", "element",
            "part", "piece", "section", "segment", "portion", "share", "fraction",
            "percentage", "ratio", "proportion", "balance", "equilibrium", "harmony",
            "unity", "diversity", "variety", "difference", "similarity", "comparison",
            "contrast", "distinction", "separation", "division", "classification",
            "category", "sort", "species", "edition", "model", "design", "style",
            "format", "layout", "configuration", "setup", "installation",
        ];
        WORDS.iter().copied().collect()
    });

    if noun_set.contains(word) {
        return true;
    }

    // Derivational noun suffixes (e.g. -tion, -ness, -ity).
    const NOUN_SUFFIXES: &[&str] = &[
        "tion", "sion", "ment", "ness", "ity", "ance", "ence", "age", "ure", "dom",
        "ship", "hood", "ism", "ist",
    ];
    if NOUN_SUFFIXES.iter().any(|suffix| word.ends_with(suffix)) {
        return true;
    }

    // -er/-or agent nouns (e.g. "runner"), excluding likely comparatives
    // such as "better"/"bigger"/"older" whose remaining stem ends in t/g/d.
    if (word.ends_with("er") || word.ends_with("or")) && word.len() > 4 {
        let stem = &word[..word.len() - 2];
        if !stem.ends_with('t') && !stem.ends_with('g') && !stem.ends_with('d') {
            return true;
        }
    }

    false
}
1795
/// Focal entity extracted from query (noun)
#[derive(Debug, Clone)]
pub struct FocalEntity {
    /// Surface form as it appeared in the query
    pub text: String,
    /// Stemmed form for matching
    pub stem: String,
    /// Information-content weight used for importance scoring
    pub ic_weight: f32,
    /// True if entity is part of a compound noun
    pub is_compound: bool,
    /// True if preceded by negation
    pub negated: bool,
}
1808
/// Discriminative modifier (adjective/qualifier)
#[derive(Debug, Clone)]
pub struct Modifier {
    /// Surface form as it appeared in the query
    pub text: String,
    /// Stemmed form for matching
    pub stem: String,
    /// IC weight for importance scoring (Lioma & Ounis 2006)
    pub ic_weight: f32,
    /// True if preceded by negation
    pub negated: bool,
}
1820
/// Relational context (verb)
#[derive(Debug, Clone)]
pub struct Relation {
    /// Surface form as it appeared in the query
    pub text: String,
    /// Stemmed form for matching
    pub stem: String,
    /// IC weight for importance scoring (Lioma & Ounis 2006)
    pub ic_weight: f32,
    /// True if preceded by negation
    pub negated: bool,
}
1832
/// Query intent type for retrieval strategy selection (SHO-D6)
///
/// Determines the optimal retrieval approach based on query structure:
/// - Needle: Looking for specific facts → favor BM25/Vector (high precision)
/// - Exploratory: Seeking related concepts → favor Graph (high recall)
/// - Hybrid: Balanced query → use all retrieval methods equally
///
/// `Hybrid` is the default variant (derived `Default` via `#[default]`,
/// replacing the previous hand-written `impl Default`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum QueryIntent {
    /// Needle query: Seeking specific information
    /// Examples: "What is John's email?", "When did we deploy v2.0?"
    /// Strategy: High BM25/Vector weight, low Graph weight
    Needle,

    /// Exploratory query: Seeking related concepts and associations
    /// Examples: "Tell me about the project", "What do we know about security?"
    /// Strategy: Low BM25/Vector weight, high Graph weight
    Exploratory,

    /// Hybrid query: Balanced between specific and exploratory
    /// Examples: "How does authentication work?", "What are the deployment options?"
    /// Strategy: Balanced weights for all retrieval methods
    #[default]
    Hybrid,
}
1862
/// Complete linguistic analysis of a query
///
/// Aggregates the POS-based extraction results; consumers can derive term
/// weights via [`QueryAnalysis::to_ic_weights`] and stem sets via
/// [`QueryAnalysis::all_stems`].
#[derive(Debug, Clone)]
pub struct QueryAnalysis {
    /// Focal entities (nouns) - primary search targets
    pub focal_entities: Vec<FocalEntity>,

    /// Discriminative modifiers (adjectives) - quality refiners
    pub discriminative_modifiers: Vec<Modifier>,

    /// Relational context (verbs) - graph traversal guides
    pub relational_context: Vec<Relation>,

    /// Compound nouns detected (e.g., "machine learning", "neural network")
    pub compound_nouns: Vec<String>,

    /// Original query text (retained for logging/debugging)
    pub original_query: String,

    /// True if query contains negation
    pub has_negation: bool,

    /// Detected query intent for retrieval strategy selection (SHO-D6)
    pub intent: QueryIntent,
}
1887
1888impl QueryAnalysis {
1889    /// Calculate weighted importance of this query (for ranking)
1890    pub fn total_weight(&self) -> f32 {
1891        let entity_weight: f32 = self.focal_entities.iter().map(|e| e.ic_weight).sum();
1892
1893        let modifier_weight: f32 = self
1894            .discriminative_modifiers
1895            .iter()
1896            .map(|m| m.ic_weight)
1897            .sum();
1898
1899        let relation_weight: f32 = self.relational_context.iter().map(|r| r.ic_weight).sum();
1900
1901        // Compound nouns get bonus weight
1902        let compound_bonus = self.compound_nouns.len() as f32 * 0.5;
1903
1904        entity_weight + modifier_weight + relation_weight + compound_bonus
1905    }
1906
1907    /// Get all stems for efficient matching
1908    pub fn all_stems(&self) -> HashSet<String> {
1909        let mut stems = HashSet::new();
1910        for e in &self.focal_entities {
1911            stems.insert(e.stem.clone());
1912        }
1913        for m in &self.discriminative_modifiers {
1914            stems.insert(m.stem.clone());
1915        }
1916        for r in &self.relational_context {
1917            stems.insert(r.stem.clone());
1918        }
1919        stems
1920    }
1921
1922    /// Get non-negated entity stems (for positive matching)
1923    pub fn positive_entity_stems(&self) -> Vec<&str> {
1924        self.focal_entities
1925            .iter()
1926            .filter(|e| !e.negated)
1927            .map(|e| e.stem.as_str())
1928            .collect()
1929    }
1930
1931    /// Get negated entity stems (for exclusion)
1932    pub fn negated_entity_stems(&self) -> Vec<&str> {
1933        self.focal_entities
1934            .iter()
1935            .filter(|e| e.negated)
1936            .map(|e| e.stem.as_str())
1937            .collect()
1938    }
1939
    /// Convert analysis to IC weights HashMap for BM25 term boosting
    ///
    /// Returns a mapping of lowercase terms to their IC weights:
    /// - Nouns (focal entities): IC_NOUN = 1.5
    /// - Adjectives (modifiers): IC_ADJECTIVE = 0.9
    /// - Verbs (relations): IC_VERB = 0.7
    ///
    /// Based on Lioma & Ounis (2006) - nouns carry more information content.
    ///
    /// Convenience wrapper: delegates to `to_ic_weights_with_yake` with
    /// YAKE keyword boosting enabled.
    pub fn to_ic_weights(&self) -> std::collections::HashMap<String, f32> {
        self.to_ic_weights_with_yake(true)
    }
1951
1952    /// Convert analysis to IC weights with optional YAKE boosting
1953    ///
1954    /// When use_yake=true, extracts discriminative keywords using YAKE algorithm
1955    /// and boosts their weights. This is critical for multi-hop queries where
1956    /// discriminative terms like "sunrise" must outweigh common terms like "Melanie".
1957    pub fn to_ic_weights_with_yake(
1958        &self,
1959        use_yake: bool,
1960    ) -> std::collections::HashMap<String, f32> {
1961        use crate::embeddings::keywords::{KeywordConfig, KeywordExtractor};
1962
1963        let mut weights = std::collections::HashMap::new();
1964
1965        // YAKE keyword extraction for discriminative term boosting
1966        // YAKE identifies statistically rare/important terms in the query
1967        if use_yake {
1968            let config = KeywordConfig {
1969                max_keywords: 5,
1970                ngrams: 2,
1971                min_length: 3,
1972                ..Default::default()
1973            };
1974            let extractor = KeywordExtractor::with_config(config);
1975            let keywords = extractor.extract(&self.original_query);
1976
1977            // Boost YAKE-identified keywords with high weights
1978            // YAKE importance is 0.0-1.0 where higher = more discriminative
1979            // Key insight: Only boost SINGLE words, not bigrams
1980            // Bigrams like "Melanie paint" match too broadly - we need specific terms
1981            for kw in keywords {
1982                let term = kw.text.to_lowercase();
1983
1984                // Skip bigrams/trigrams - they match documents with either word
1985                // which defeats the purpose of discriminative keyword boosting
1986                if term.contains(' ') {
1987                    continue;
1988                }
1989
1990                // Aggressive boost for single discriminative words
1991                // Scale: importance 0.5 → boost 3.5, importance 1.0 → boost 6.0
1992                // This ensures rare words like "sunrise" dominate over common words
1993                let yake_boost = 1.0 + (kw.importance * 5.0);
1994                weights
1995                    .entry(term)
1996                    .and_modify(|w: &mut f32| *w = w.max(yake_boost))
1997                    .or_insert(yake_boost);
1998            }
1999        }
2000
2001        // Add focal entities (nouns) with highest IC weight
2002        for entity in &self.focal_entities {
2003            let term = entity.text.to_lowercase();
2004            weights
2005                .entry(term)
2006                .and_modify(|w: &mut f32| *w = w.max(entity.ic_weight))
2007                .or_insert(entity.ic_weight);
2008            // Also add stem for fuzzy matching
2009            if entity.stem != entity.text.to_lowercase() {
2010                weights
2011                    .entry(entity.stem.clone())
2012                    .and_modify(|w: &mut f32| *w = w.max(entity.ic_weight))
2013                    .or_insert(entity.ic_weight);
2014            }
2015        }
2016
2017        // Add discriminative modifiers (adjectives)
2018        for modifier in &self.discriminative_modifiers {
2019            let term = modifier.text.to_lowercase();
2020            weights
2021                .entry(term)
2022                .and_modify(|w: &mut f32| *w = w.max(modifier.ic_weight))
2023                .or_insert(modifier.ic_weight);
2024            if modifier.stem != modifier.text.to_lowercase() {
2025                weights
2026                    .entry(modifier.stem.clone())
2027                    .and_modify(|w: &mut f32| *w = w.max(modifier.ic_weight))
2028                    .or_insert(modifier.ic_weight);
2029            }
2030        }
2031
2032        // Add relational context (verbs)
2033        for relation in &self.relational_context {
2034            let term = relation.text.to_lowercase();
2035            weights
2036                .entry(term)
2037                .and_modify(|w: &mut f32| *w = w.max(relation.ic_weight))
2038                .or_insert(relation.ic_weight);
2039            if relation.stem != relation.text.to_lowercase() {
2040                weights
2041                    .entry(relation.stem.clone())
2042                    .and_modify(|w: &mut f32| *w = w.max(relation.ic_weight))
2043                    .or_insert(relation.ic_weight);
2044            }
2045        }
2046
2047        // Boost compound nouns (they carry more specific meaning)
2048        for compound in &self.compound_nouns {
2049            // Each word in compound gets a small boost
2050            for word in compound.split_whitespace() {
2051                let term = word.to_lowercase();
2052                weights.entry(term).and_modify(|w: &mut f32| *w *= 1.2);
2053            }
2054        }
2055
2056        weights
2057    }
2058
2059    /// Get maximum YAKE keyword discriminativeness score for dynamic weight adjustment
2060    ///
2061    /// Returns (max_importance, discriminative_keywords) where:
2062    /// - max_importance: 0.0-1.0, higher means more discriminative keywords found
2063    /// - discriminative_keywords: keywords with importance > 0.5 (for logging)
2064    ///
2065    /// Use this to dynamically adjust BM25/vector weights in hybrid search:
2066    /// - High discriminativeness (>0.6) → boost BM25 weight (keyword matching critical)
2067    /// - Low discriminativeness (<0.3) → trust vector more (semantic similarity better)
2068    pub fn keyword_discriminativeness(&self) -> (f32, Vec<String>) {
2069        use crate::embeddings::keywords::{KeywordConfig, KeywordExtractor};
2070
2071        let config = KeywordConfig {
2072            max_keywords: 5,
2073            ngrams: 2,
2074            min_length: 2, // Allow short terms like "AI", "ML", "Go", etc.
2075            ..Default::default()
2076        };
2077        let extractor = KeywordExtractor::with_config(config);
2078        let keywords = extractor.extract(&self.original_query);
2079
2080        let mut max_importance = 0.0f32;
2081        let mut discriminative = Vec::new();
2082
2083        for kw in keywords {
2084            if kw.importance > max_importance {
2085                max_importance = kw.importance;
2086            }
2087            // Keywords with importance > 0.5 are considered discriminative
2088            if kw.importance > 0.5 {
2089                discriminative.push(kw.text.to_lowercase());
2090            }
2091        }
2092
2093        (max_importance, discriminative)
2094    }
2095
2096    /// Get phrase boosts for BM25 exact phrase matching
2097    ///
2098    /// Returns compound nouns and adjacent noun pairs as phrases with boost weights.
2099    /// Phrase matching significantly improves retrieval for multi-word concepts
2100    /// like "support group", "machine learning", "LGBTQ community".
2101    pub fn to_phrase_boosts(&self) -> Vec<(String, f32)> {
2102        let mut phrases = Vec::new();
2103
2104        // Add compound nouns with high boost (they are specific concepts)
2105        for compound in &self.compound_nouns {
2106            // Compound nouns get 2.0x boost for exact phrase match
2107            phrases.push((compound.to_lowercase(), 2.0));
2108        }
2109
2110        // Also detect adjacent nouns that might form natural phrases
2111        // Even if not in compound_nouns, adjacent entities may form useful phrases
2112        if self.focal_entities.len() >= 2 {
2113            for i in 0..self.focal_entities.len() - 1 {
2114                let e1 = &self.focal_entities[i];
2115                let e2 = &self.focal_entities[i + 1];
2116                // Only if not negated and not already a compound
2117                if !e1.negated && !e2.negated {
2118                    let phrase = format!("{} {}", e1.text.to_lowercase(), e2.text.to_lowercase());
2119                    if !self
2120                        .compound_nouns
2121                        .iter()
2122                        .any(|c| c.to_lowercase() == phrase)
2123                    {
2124                        // Adjacent nouns get 1.5x boost (lower than explicit compounds)
2125                        phrases.push((phrase, 1.5));
2126                    }
2127                }
2128            }
2129        }
2130
2131        phrases
2132    }
2133}
2134
/// Token with linguistic annotations
///
/// Intermediate representation produced by `annotate_tokens`: one entry per
/// tokenized word, carrying the data later copied into the `FocalEntity` /
/// `Modifier` / `Relation` result structures.
#[derive(Debug)]
struct AnnotatedToken {
    // Lowercased token text as produced by `tokenize`.
    text: String,
    // Snowball English stem of `text`.
    stem: String,
    // Heuristic part-of-speech tag from `classify_pos`.
    pos: PartOfSpeech,
    // True when the token falls inside a negation scope (up to 3 words
    // after a negation word such as "not" / "without").
    negated: bool,
    // Zero-based index of the token within the tokenized query.
    position: usize,
}
2144
/// Heuristic part-of-speech tag assigned by `classify_pos`.
///
/// `Eq` is derived alongside `PartialEq` (clippy `derive_partial_eq_without_eq`),
/// consistent with the public POS enum at the top of this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PartOfSpeech {
    /// Entity-bearing word (becomes a focal entity).
    Noun,
    /// Qualifier (becomes a discriminative modifier).
    Adjective,
    /// Relational word (lower retrieval priority).
    Verb,
    /// High-frequency function word; skipped during extraction.
    StopWord,
    /// Negation marker ("not", "without", ...); opens a negation scope.
    Negation,
    /// Unclassified word; treated as a domain-specific noun downstream.
    Unknown,
}
2154
/// Parse query using linguistic analysis with Porter2 stemming
///
/// Pipeline: tokenize → POS/negation annotation → compound-noun detection →
/// bucket content words into focal entities (nouns/unknowns), discriminative
/// modifiers (adjectives), and relational context (verbs) → intent
/// classification for retrieval-strategy selection.
pub fn analyze_query(query_text: &str) -> QueryAnalysis {
    let stemmer = Stemmer::create(Algorithm::English);
    let words = tokenize(query_text);

    // Empty or punctuation-only queries short-circuit to a neutral analysis
    // with Hybrid intent.
    if words.is_empty() {
        return QueryAnalysis {
            focal_entities: Vec::new(),
            discriminative_modifiers: Vec::new(),
            relational_context: Vec::new(),
            compound_nouns: Vec::new(),
            original_query: query_text.to_string(),
            has_negation: false,
            intent: QueryIntent::Hybrid,
        };
    }

    // Annotate each token with POS and negation scope
    let annotated = annotate_tokens(&words, &stemmer);

    // Detect compound nouns
    let compound_nouns = detect_compound_nouns(&annotated);

    // Build result structures
    let mut focal_entities = Vec::new();
    let mut discriminative_modifiers = Vec::new();
    let mut relational_context = Vec::new();
    let mut has_negation = false;

    // Track which tokens are part of compounds
    let compound_positions: HashSet<usize> = compound_positions(&annotated, &compound_nouns);

    for token in &annotated {
        // Negation words themselves are not emitted as terms; they only set
        // the query-level flag.
        if token.pos == PartOfSpeech::Negation {
            has_negation = true;
            continue;
        }
        if token.pos == PartOfSpeech::StopWord {
            continue;
        }

        let is_compound = compound_positions.contains(&token.position);

        match token.pos {
            PartOfSpeech::Noun | PartOfSpeech::Unknown => {
                // Unknown words are likely domain-specific nouns
                let weight = calculate_term_weight(&token.text, IC_NOUN);
                focal_entities.push(FocalEntity {
                    text: token.text.clone(),
                    stem: token.stem.clone(),
                    ic_weight: weight,
                    is_compound,
                    negated: token.negated,
                });
            }
            PartOfSpeech::Adjective => {
                let weight = calculate_term_weight(&token.text, IC_ADJECTIVE);
                discriminative_modifiers.push(Modifier {
                    text: token.text.clone(),
                    stem: token.stem.clone(),
                    ic_weight: weight,
                    negated: token.negated,
                });
            }
            PartOfSpeech::Verb => {
                let weight = calculate_term_weight(&token.text, IC_VERB);
                relational_context.push(Relation {
                    text: token.text.clone(),
                    stem: token.stem.clone(),
                    ic_weight: weight,
                    negated: token.negated,
                });
            }
            _ => {}
        }
    }

    // Add compound nouns as high-weight entities
    for compound in &compound_nouns {
        // NOTE(review): the whole phrase (including the space) is handed to
        // the stemmer as one token, so only the final word's suffix can be
        // stemmed — confirm this is the intended stem form for compounds.
        let stem = stemmer.stem(compound).to_string();
        focal_entities.push(FocalEntity {
            text: compound.clone(),
            stem,
            ic_weight: IC_NOUN * 1.5, // Compound bonus
            is_compound: true,
            negated: false,
        });
    }

    // Detect query intent for retrieval strategy (SHO-D6)
    // Note: compounds appended above intentionally count toward the
    // entity/relation balance used by the intent heuristic.
    let intent = detect_query_intent(query_text, &focal_entities, &relational_context);

    QueryAnalysis {
        focal_entities,
        discriminative_modifiers,
        relational_context,
        compound_nouns,
        original_query: query_text.to_string(),
        has_negation,
        intent,
    }
}
2257
2258/// Detect query intent for retrieval strategy selection (SHO-D6)
2259///
2260/// Analyzes query structure to determine optimal retrieval approach:
2261/// - Needle: Specific fact-seeking queries → favor BM25/Vector
2262/// - Exploratory: Association-seeking queries → favor Graph
2263/// - Hybrid: Balanced queries → use all methods equally
2264fn detect_query_intent(
2265    query_text: &str,
2266    focal_entities: &[FocalEntity],
2267    relational_context: &[Relation],
2268) -> QueryIntent {
2269    let lower = query_text.to_lowercase();
2270
2271    // Needle indicators: seeking specific facts
2272    let needle_starters = [
2273        "what is", "what's", "who is", "who's", "where is", "where's", "when did", "when was",
2274        "which", "how much", "how many", "find", "get me", "show me", "list", "give me",
2275    ];
2276
2277    let needle_patterns = [
2278        "'s email",
2279        "'s phone",
2280        "'s address",
2281        "'s name",
2282        "email of",
2283        "phone of",
2284        "address of",
2285        "name of",
2286        "id of",
2287        "password",
2288        "api key",
2289        "token",
2290    ];
2291
2292    // Exploratory indicators: seeking associations and context
2293    let exploratory_starters = [
2294        "tell me about",
2295        "explain",
2296        "describe",
2297        "what do we know about",
2298        "summarize",
2299        "overview",
2300        "recap",
2301        "context",
2302        "related to",
2303        "associated with",
2304        "connected to",
2305        "how does",
2306        "how do",
2307        "why does",
2308        "why do",
2309    ];
2310
2311    let exploratory_patterns = [
2312        "all about",
2313        "everything about",
2314        "more about",
2315        "history of",
2316        "background",
2317        "related",
2318    ];
2319
2320    // Check for needle patterns
2321    for starter in needle_starters.iter() {
2322        if lower.starts_with(starter) {
2323            return QueryIntent::Needle;
2324        }
2325    }
2326
2327    for pattern in needle_patterns.iter() {
2328        if lower.contains(pattern) {
2329            return QueryIntent::Needle;
2330        }
2331    }
2332
2333    // Check for exploratory patterns
2334    for starter in exploratory_starters.iter() {
2335        if lower.starts_with(starter) || lower.contains(starter) {
2336            return QueryIntent::Exploratory;
2337        }
2338    }
2339
2340    for pattern in exploratory_patterns.iter() {
2341        if lower.contains(pattern) {
2342            return QueryIntent::Exploratory;
2343        }
2344    }
2345
2346    // Heuristic: If query has many entities and few relations → Needle
2347    // If query has few entities and many relations → Exploratory
2348    let entity_count = focal_entities.len();
2349    let relation_count = relational_context.len();
2350
2351    if entity_count > 0 && relation_count == 0 {
2352        // Pure entity query (e.g., "John's project") → Needle
2353        QueryIntent::Needle
2354    } else if relation_count > entity_count {
2355        // More relations than entities → Exploratory
2356        QueryIntent::Exploratory
2357    } else {
2358        // Balanced → Hybrid
2359        QueryIntent::Hybrid
2360    }
2361}
2362
/// Tokenize query text into lowercase words
///
/// Splits on whitespace, strips leading/trailing non-alphanumeric characters
/// (interior ones survive, e.g. "don't"), lowercases, and drops tokens that
/// end up empty (pure punctuation).
fn tokenize(text: &str) -> Vec<String> {
    text.split_whitespace()
        .filter_map(|raw| {
            let cleaned = raw
                .trim_matches(|c: char| !c.is_alphanumeric())
                .to_lowercase();
            if cleaned.is_empty() {
                None
            } else {
                Some(cleaned)
            }
        })
        .collect()
}
2373
2374/// Annotate tokens with POS tags and negation scope
2375fn annotate_tokens(words: &[String], stemmer: &Stemmer) -> Vec<AnnotatedToken> {
2376    let mut annotated = Vec::with_capacity(words.len());
2377    let mut in_negation_scope = false;
2378    let mut negation_distance = 0;
2379
2380    for (i, word) in words.iter().enumerate() {
2381        let stem = stemmer.stem(word).to_string();
2382        let pos = classify_pos(word, i, words);
2383
2384        // Track negation scope (extends 2-3 words after negation)
2385        if pos == PartOfSpeech::Negation {
2386            in_negation_scope = true;
2387            negation_distance = 0;
2388        } else if in_negation_scope {
2389            negation_distance += 1;
2390            if negation_distance > 3 {
2391                in_negation_scope = false;
2392            }
2393        }
2394
2395        let negated = in_negation_scope && pos != PartOfSpeech::Negation;
2396
2397        annotated.push(AnnotatedToken {
2398            text: word.clone(),
2399            stem,
2400            pos,
2401            negated,
2402            position: i,
2403        });
2404    }
2405
2406    annotated
2407}
2408
2409/// Classify part of speech using heuristics
2410fn classify_pos(word: &str, position: usize, context: &[String]) -> PartOfSpeech {
2411    // Check negation first
2412    if is_negation(word) {
2413        return PartOfSpeech::Negation;
2414    }
2415
2416    // Check stop words
2417    if is_stop_word(word) {
2418        return PartOfSpeech::StopWord;
2419    }
2420
2421    // Use suffix patterns and context for classification
2422    if is_verb(word) {
2423        return PartOfSpeech::Verb;
2424    }
2425
2426    if is_adjective(word) {
2427        return PartOfSpeech::Adjective;
2428    }
2429
2430    if is_noun(word, position, context) {
2431        return PartOfSpeech::Noun;
2432    }
2433
2434    // Default to unknown (treated as noun for domain terms)
2435    PartOfSpeech::Unknown
2436}
2437
2438/// Detect compound nouns (bigrams that commonly co-occur)
2439fn detect_compound_nouns(tokens: &[AnnotatedToken]) -> Vec<String> {
2440    let mut compounds = Vec::new();
2441
2442    // Common compound noun patterns
2443    const COMPOUND_PATTERNS: &[(&str, &str)] = &[
2444        // Tech/AI compounds
2445        ("machine", "learning"),
2446        ("deep", "learning"),
2447        ("neural", "network"),
2448        ("natural", "language"),
2449        ("language", "model"),
2450        ("artificial", "intelligence"),
2451        ("knowledge", "graph"),
2452        ("vector", "database"),
2453        ("memory", "system"),
2454        ("data", "structure"),
2455        ("source", "code"),
2456        ("error", "handling"),
2457        ("unit", "test"),
2458        ("integration", "test"),
2459        ("api", "endpoint"),
2460        ("web", "server"),
2461        ("file", "system"),
2462        ("operating", "system"),
2463        ("database", "schema"),
2464        ("user", "interface"),
2465        ("command", "line"),
2466        ("version", "control"),
2467        ("pull", "request"),
2468        ("code", "review"),
2469        ("bug", "fix"),
2470        ("feature", "request"),
2471        // Domain-specific
2472        ("spreading", "activation"),
2473        ("hebbian", "learning"),
2474        ("long", "term"),
2475        ("short", "term"),
2476        ("working", "memory"),
2477        ("semantic", "search"),
2478        ("graph", "traversal"),
2479        ("edge", "device"),
2480        ("air", "gapped"),
2481        // Social/community
2482        ("support", "group"),
2483        ("pride", "parade"),
2484        ("poetry", "reading"),
2485        ("civil", "rights"),
2486        ("human", "rights"),
2487        ("social", "media"),
2488        ("community", "center"),
2489        ("discussion", "group"),
2490        ("therapy", "session"),
2491        ("art", "therapy"),
2492        ("group", "therapy"),
2493    ];
2494
2495    // Check for known compound patterns
2496    for i in 0..tokens.len().saturating_sub(1) {
2497        let t1 = &tokens[i];
2498        let t2 = &tokens[i + 1];
2499
2500        // Skip if either is a stop word or verb
2501        if t1.pos == PartOfSpeech::StopWord || t2.pos == PartOfSpeech::StopWord {
2502            continue;
2503        }
2504
2505        for (w1, w2) in COMPOUND_PATTERNS {
2506            if (t1.stem == *w1 || t1.text == *w1) && (t2.stem == *w2 || t2.text == *w2) {
2507                compounds.push(format!("{} {}", t1.text, t2.text));
2508                break;
2509            }
2510        }
2511
2512        // Heuristic: Noun + Noun often forms compound
2513        if (t1.pos == PartOfSpeech::Noun || t1.pos == PartOfSpeech::Unknown)
2514            && (t2.pos == PartOfSpeech::Noun || t2.pos == PartOfSpeech::Unknown)
2515        {
2516            // Check for common suffixes that indicate compound-worthy nouns
2517            if has_compound_suffix(&t1.text) || has_compound_suffix(&t2.text) {
2518                let compound = format!("{} {}", t1.text, t2.text);
2519                if !compounds.contains(&compound) {
2520                    compounds.push(compound);
2521                }
2522            }
2523        }
2524    }
2525
2526    compounds
2527}
2528
/// Check if word has suffix that often appears in compounds
///
/// These are nominalizing/agentive endings ("-tion", "-er", "-ism", ...)
/// that mark a word as compound-worthy in the noun-noun heuristic.
fn has_compound_suffix(word: &str) -> bool {
    const SUFFIXES: &[&str] = &[
        "tion", "ment", "ing", "ness", "ity", "ance", "ence", "er", "or", "ist", "ism",
    ];
    SUFFIXES.iter().any(|suffix| word.ends_with(suffix))
}
2543
2544/// Get positions of tokens that are part of compounds
2545fn compound_positions(tokens: &[AnnotatedToken], compounds: &[String]) -> HashSet<usize> {
2546    let mut positions = HashSet::new();
2547
2548    for compound in compounds {
2549        let parts: Vec<&str> = compound.split_whitespace().collect();
2550        if parts.len() < 2 {
2551            continue;
2552        }
2553
2554        for i in 0..tokens.len().saturating_sub(parts.len() - 1) {
2555            let mut matches = true;
2556            for (j, part) in parts.iter().enumerate() {
2557                if tokens[i + j].text != *part {
2558                    matches = false;
2559                    break;
2560                }
2561            }
2562            if matches {
2563                for j in 0..parts.len() {
2564                    positions.insert(i + j);
2565                }
2566            }
2567        }
2568    }
2569
2570    positions
2571}
2572
/// Calculate term weight with IDF-like rarity boost
///
/// Longer words and words with technical (nominalizing) suffixes tend to be
/// rarer and more discriminative, so they receive small multiplicative
/// boosts on top of the POS-derived base weight.
fn calculate_term_weight(word: &str, base_weight: f32) -> f32 {
    // Length buckets: >8 chars → 1.2x, >5 chars → 1.1x, otherwise neutral.
    let length_factor = match word.len() {
        n if n > 8 => 1.2,
        n if n > 5 => 1.1,
        _ => 1.0,
    };

    // Suffixes like "-tion" / "-ment" hint at technical terms.
    let is_technical = ["tion", "ment", "ness", "ity"]
        .iter()
        .any(|suffix| word.ends_with(suffix));
    let suffix_factor = if is_technical { 1.1 } else { 1.0 };

    base_weight * length_factor * suffix_factor
}
2597
/// Check if word is negation
///
/// Covers bare negators, "without", and common contracted auxiliaries.
fn is_negation(word: &str) -> bool {
    matches!(
        word,
        "not" | "no"
            | "never"
            | "none"
            | "nothing"
            | "neither"
            | "nobody"
            | "nowhere"
            | "without"
            | "cannot"
            | "can't"
            | "won't"
            | "don't"
            | "doesn't"
            | "didn't"
            | "isn't"
            | "aren't"
            | "wasn't"
            | "weren't"
            | "hasn't"
            | "haven't"
            | "hadn't"
            | "shouldn't"
            | "wouldn't"
            | "couldn't"
            | "mustn't"
    )
}
2630
/// Check if word is a noun (entity)
///
/// A word counts as a noun when it:
/// 1. appears in the curated domain-noun list,
/// 2. carries a typical noun-forming suffix ("-tion", "-ment", "-ness", ...),
///    or
/// 3. is immediately preceded by a determiner ("a", "an", "the", "this",
///    "that") or a possessive ("john's", "users'").
fn is_noun(word: &str, position: usize, context: &[String]) -> bool {
    // Domain-specific nouns (expanded list).
    // Cleanup: "module" and "response" were listed twice; the duplicates
    // were dead entries and have been removed.
    const NOUN_INDICATORS: &[&str] = &[
        // Core memory/cognitive terms
        "memory", "graph", "node", "edge", "entity", "embedding", "vector", "index",
        "query", "retrieval", "activation", "potentiation", "consolidation", "decay",
        "strength", "weight", "threshold", "importance",
        // Tech terms
        "robot", "drone", "sensor", "lidar", "camera", "motor", "actuator", "obstacle",
        "path", "waypoint", "location", "coordinates", "position", "battery", "power",
        "energy", "voltage", "current", "system", "module", "component", "unit",
        "device", "temperature", "pressure", "humidity", "speed", "velocity", "signal",
        "communication", "network", "link", "connection", "navigation", "guidance",
        "control", "steering", "data", "information", "message", "command", "response",
        // Software terms
        "function", "method", "class", "struct", "interface", "package", "library",
        "framework", "api", "endpoint", "request", "error", "exception", "bug", "fix",
        "feature", "test", "benchmark", "performance", "latency", "throughput",
        "cache", "buffer", "queue", "stack", "heap", "thread", "process", "server",
        "client", "database", "table", "column", "row", "schema", "migration",
        "deployment", "container", "cluster", "replica",
        // General nouns
        "person", "people", "user", "agent", "operator", "time", "date", "day",
        "hour", "minute", "second", "area", "zone", "region", "sector", "space",
        "task", "mission", "goal", "objective", "target", "warning", "alert",
        "notification", "level", "status", "state", "condition", "mode", "type",
        "kind", "version", "release", "update", "change", "result", "output",
        "input", "value", "key", "name", "id", "identifier",
    ];

    if NOUN_INDICATORS.contains(&word) {
        return true;
    }

    // Typical noun-forming suffixes.
    const NOUN_SUFFIXES: &[&str] = &[
        "tion", "sion", "ment", "ness", "ity", "ance", "ence", "er", "or", "ist",
        "ism", "age", "ure", "dom",
    ];
    if NOUN_SUFFIXES.iter().any(|suffix| word.ends_with(suffix)) {
        // Guard against very short "-er" words. NOTE(review): the original
        // intent ("avoid verb forms like 'better', 'faster'") is only
        // partially met — those words are 6 chars long and still pass the
        // len < 5 cutoff; confirm whether the cutoff should be raised.
        if !(word.ends_with("er") && word.len() < 5) {
            return true;
        }
    }

    // A determiner or possessive immediately before the word strongly
    // suggests a noun ("the cache", "john's laptop").
    if position > 0 {
        if let Some(prev) = context.get(position - 1) {
            let lowered = prev.to_lowercase();
            if matches!(lowered.as_str(), "a" | "an" | "the" | "this" | "that") {
                return true;
            }
            if prev.ends_with("'s") || prev.ends_with("s'") {
                return true;
            }
        }
    }

    false
}
2833
/// Check if word is an adjective (qualifier)
///
/// Matches a curated list of common adjectives, then falls back to typical
/// adjectival suffixes ("-ful", "-able", "-al", ...). Suffix matches are
/// filtered through an exception list of common nouns.
///
/// Bug fix: the exception list previously held only four "-al" nouns, so
/// domain nouns present in the noun-indicator list — e.g. "memory",
/// "library", "signal", "goal" — matched the "-ory"/"-ary"/"-al" suffixes
/// here and (because `classify_pos` tests adjectives before nouns) were
/// misclassified as adjectives, never reaching the noun check.
fn is_adjective(word: &str) -> bool {
    const ADJECTIVE_INDICATORS: &[&str] = &[
        // Colors
        "red", "blue", "green", "yellow", "orange", "purple", "black", "white",
        "gray", "grey", "pink", "brown",
        // Sizes
        "big", "small", "large", "tiny", "huge", "massive", "mini", "micro",
        "high", "low", "tall", "short", "long", "wide", "narrow",
        // States
        "hot", "cold", "warm", "cool", "frozen", "heated", "fast", "slow",
        "quick", "rapid", "gradual", "active", "inactive", "enabled", "disabled",
        "open", "closed", "locked", "unlocked", "full", "empty", "partial",
        "complete", "valid", "invalid", "correct", "incorrect", "true", "false",
        // Quality
        "good", "bad", "excellent", "poor", "optimal", "suboptimal", "normal",
        "abnormal", "stable", "unstable", "safe", "unsafe", "dangerous",
        "hazardous", "new", "old", "recent", "ancient", "current", "latest",
        "first", "last", "next", "previous", "primary", "secondary", "main",
        "important", "critical", "minor", "major",
        // Technical
        "autonomous", "manual", "automatic", "remote", "digital", "analog",
        "electronic", "mechanical", "wireless", "wired", "connected",
        "disconnected", "local", "global", "private", "public", "static",
        "dynamic", "mutable", "immutable", "sync", "async", "concurrent",
        "parallel", "serial", "sequential", "optional", "required", "default",
        "custom",
    ];

    if ADJECTIVE_INDICATORS.contains(&word) {
        return true;
    }

    // Common adjective suffixes (excluding verb participles)
    const ADJECTIVE_SUFFIXES: &[&str] = &[
        "ful", "less", "ous", "ive", "able", "ible", "al", "ic", "ary", "ory",
    ];
    if ADJECTIVE_SUFFIXES.iter().any(|suffix| word.ends_with(suffix)) {
        // Common nouns carrying adjectival suffixes — these must not be
        // classified as adjectives, or they can never be recognized as nouns.
        const EXCEPTIONS: &[&str] = &[
            "animal", "interval", "arrival", "approval", "goal", "signal",
            "terminal", "memory", "library", "history", "directory", "category",
            "factory", "repository", "summary", "dictionary", "boundary",
        ];
        if !EXCEPTIONS.contains(&word) {
            return true;
        }
    }

    false
}
2986
/// Check if word is a verb (relational, lower priority)
///
/// Matches against a closed lexicon of auxiliary/modal verbs, common action
/// verbs, and technical verbs, with all inflected forms listed explicitly.
/// The slice is folded into a `HashSet` on first use, so the per-token
/// membership check is O(1) instead of a ~240-entry linear scan.
fn is_verb(word: &str) -> bool {
    const VERB_INDICATORS: &[&str] = &[
        // Auxiliary/modal verbs
        "is", "are", "was", "were", "be", "been", "being", "has", "have", "had",
        "do", "does", "did", "can", "could", "will", "would", "shall", "should",
        "may", "might", "must",
        // Common action verbs
        "go", "goes", "went", "gone", "going", "get", "gets", "got", "gotten",
        "getting", "make", "makes", "made", "making", "take", "takes", "took",
        "taken", "taking", "see", "sees", "saw", "seen", "seeing", "give",
        "gives", "gave", "given", "giving", "use", "uses", "used", "using",
        "find", "finds", "found", "finding", "know", "knows", "knew", "known",
        "knowing", "think", "thinks", "thought", "thinking", "want", "wants",
        "wanted", "wanting", "need", "needs", "needed", "needing", "try",
        "tries", "tried", "trying",
        // Technical verbs
        "detect", "detects", "detected", "detecting", "observe", "observes",
        "observed", "observing", "measure", "measures", "measured", "measuring",
        "sense", "senses", "sensed", "sensing", "scan", "scans", "scanned",
        "scanning", "navigate", "navigates", "navigated", "navigating", "move",
        "moves", "moved", "moving", "stop", "stops", "stopped", "stopping",
        "start", "starts", "started", "starting", "reach", "reaches",
        "reached", "reaching", "avoid", "avoids", "avoided", "avoiding",
        "block", "blocks", "blocked", "blocking", "create", "creates",
        "created", "creating", "delete", "deletes", "deleted", "deleting",
        "update", "updates", "updated", "updating", "read", "reads", "reading",
        "write", "writes", "wrote", "written", "writing", "run", "runs", "ran",
        "running", "execute", "executes", "executed", "executing", "call",
        "calls", "called", "calling", "return", "returns", "returned",
        "returning", "store", "stores", "stored", "storing", "load", "loads",
        "loaded", "loading", "save", "saves", "saved", "saving", "fetch",
        "fetches", "fetched", "fetching", "send", "sends", "sent", "sending",
        "receive", "receives", "received", "receiving", "connect", "connects",
        "connected", "connecting", "disconnect", "disconnects",
        "disconnected", "disconnecting", "process", "processes", "processed",
        "processing", "handle", "handles", "handled", "handling", "parse",
        "parses", "parsed", "parsing", "compile", "compiles", "compiled",
        "compiling", "build", "builds", "built", "building", "test", "tests",
        "tested", "testing", "deploy", "deploys", "deployed", "deploying",
        "install", "installs", "installed", "installing", "configure",
        "configures", "configured", "configuring", "initialize",
        "initializes", "initialized", "initializing", "shutdown", "shutdowns",
        "terminate", "terminates", "terminated", "terminating",
    ];

    // Previously `VERB_INDICATORS.contains(&word)` — a linear scan executed
    // for every token. Build the lookup set once, lazily and thread-safely.
    use std::sync::OnceLock;
    static VERB_SET: OnceLock<HashSet<&'static str>> = OnceLock::new();
    VERB_SET
        .get_or_init(|| VERB_INDICATORS.iter().copied().collect())
        .contains(word)
}
3239
/// Check if word is a stop word (no information content)
///
/// Closed lexicon of articles, demonstratives, prepositions, conjunctions,
/// pronouns, question words, fillers, subordinators, and quantifiers. The
/// slice is folded into a `HashSet` on first use, so the per-token check is
/// O(1) instead of a linear scan.
fn is_stop_word(word: &str) -> bool {
    const STOP_WORDS: &[&str] = &[
        // Articles
        "a", "an", "the",
        // Demonstratives
        "this", "that", "these", "those",
        // Prepositions
        "at", "in", "on", "to", "for", "of", "from", "by", "with", "about",
        "into", "through", "during", "before", "after", "above", "below",
        "between", "under", "over",
        // Conjunctions
        "and", "or", "but", "nor", "so", "yet", "both", "either", "neither",
        // Pronouns (also covers relative that/which/who/whom/whose, which
        // were previously duplicated in a separate "Relative" section)
        "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us",
        "them", "my", "your", "his", "its", "our", "their", "mine", "yours",
        "hers", "ours", "theirs", "who", "whom", "whose", "which", "what",
        "whoever", "whatever", "whichever",
        // Question words (when not seeking info)
        "how", "when", "where", "why",
        // Common filler
        "just", "only", "even", "also", "too", "very", "really", "quite",
        "rather", "almost", "already", "still", "always", "never", "ever",
        "often", "sometimes", "usually", "perhaps", "maybe", "probably",
        "possibly", "certainly", "definitely", "actually", "basically",
        "essentially", "simply", "merely",
        // Subordinators/comparatives (be forms handled separately as verbs)
        "as", "if", "then", "than", "because", "although", "though", "unless",
        "until", "while", "whereas", "whether", "since",
        // Others
        "some", "any", "all", "each", "every", "many", "much", "more", "most",
        "few", "less", "least", "other", "another", "such", "same",
        "different", "own", "several",
    ];

    // Previously `STOP_WORDS.contains(&word)` — a linear scan executed for
    // every token, over a list that carried five duplicate entries. Build the
    // lookup set once, lazily and thread-safely.
    use std::sync::OnceLock;
    static STOP_SET: OnceLock<HashSet<&'static str>> = OnceLock::new();
    STOP_SET
        .get_or_init(|| STOP_WORDS.iter().copied().collect())
        .contains(word)
}
3394
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_noun_detection() {
        // Content nouns in a robotics query should surface as focal entities.
        let analysis = analyze_query("robot detected obstacle at coordinates");
        let has_noun = |t: &str| analysis.focal_entities.iter().any(|e| e.text == t);

        assert!(has_noun("robot"));
        assert!(has_noun("obstacle"));
        assert!(has_noun("coordinates"));
    }

    #[test]
    fn test_adjective_detection() {
        // Descriptive modifiers should be classified as adjectives.
        let analysis = analyze_query("red large obstacle in path");
        let has_adj = |t: &str| {
            analysis
                .discriminative_modifiers
                .iter()
                .any(|m| m.text == t)
        };

        assert!(has_adj("red"));
        assert!(has_adj("large"));
    }

    #[test]
    fn test_verb_detection() {
        // The action word should land in relational context.
        let analysis = analyze_query("robot detected obstacle");
        assert!(analysis
            .relational_context
            .iter()
            .any(|r| r.text == "detected"));
    }

    #[test]
    fn test_information_content_weights() {
        let analysis = analyze_query("sensor detected red obstacle");

        // Every term must carry at least its POS-class base IC weight
        // (10% tolerance for small adjustments).
        assert!(analysis
            .focal_entities
            .iter()
            .all(|e| e.ic_weight >= IC_NOUN * 0.9));
        assert!(analysis
            .discriminative_modifiers
            .iter()
            .all(|m| m.ic_weight >= IC_ADJECTIVE * 0.9));
        assert!(analysis
            .relational_context
            .iter()
            .all(|r| r.ic_weight >= IC_VERB * 0.9));
    }

    #[test]
    fn test_stemming() {
        let analysis = analyze_query("running detection algorithms");
        let has_stem = |s: &str| analysis.focal_entities.iter().any(|e| e.stem == s);

        // Porter reduces "detection" -> "detect" and "algorithms" -> "algorithm".
        assert!(has_stem("detect"));
        assert!(has_stem("algorithm"));
    }

    #[test]
    fn test_compound_noun_detection() {
        let analysis = analyze_query("machine learning neural network");
        let has_compound = |c: &str| analysis.compound_nouns.iter().any(|n| n == c);

        assert!(has_compound("machine learning"));
        assert!(has_compound("neural network"));
    }

    #[test]
    fn test_negation_detection() {
        let analysis = analyze_query("not working correctly");

        assert!(analysis.has_negation);
        // At least one focal entity must fall inside the negation scope.
        assert!(analysis.focal_entities.iter().any(|e| e.negated));
    }

    #[test]
    fn test_negation_scope() {
        let analysis = analyze_query("the sensor is not detecting obstacles properly");

        assert!(analysis.has_negation);
        // The verb inside the scope ("detecting") carries the negated flag.
        assert!(analysis
            .relational_context
            .iter()
            .any(|r| r.negated && r.text == "detecting"));
    }

    #[test]
    fn test_all_stems_helper() {
        let analysis = analyze_query("fast robot detecting obstacles");
        let stems = analysis.all_stems();

        // "obstacl" is the Porter stem of "obstacles".
        for expected in ["robot", "fast", "detect", "obstacl"] {
            assert!(stems.contains(expected));
        }
    }

    #[test]
    fn test_positive_and_negated_stems() {
        let analysis = analyze_query("working memory not failed");

        let positive = analysis.positive_entity_stems();
        let _negated = analysis.negated_entity_stems();

        // "memory" precedes the negation, so its stem ("memori") is positive.
        assert!(positive.iter().any(|s| s.contains("memori")));

        // "failed" follows "not", but its POS class (verb vs noun) is
        // context-dependent, so no assertion is made on the negated side.
    }

    #[test]
    fn test_empty_query() {
        // An empty string must yield an empty, negation-free analysis.
        let analysis = analyze_query("");

        assert!(analysis.focal_entities.is_empty());
        assert!(analysis.discriminative_modifiers.is_empty());
        assert!(analysis.relational_context.is_empty());
        assert!(!analysis.has_negation);
    }

    #[test]
    fn test_stop_words_filtered() {
        // Every token here is either a stop word or a be-form verb, so only
        // the relational bucket should be non-empty.
        let analysis = analyze_query("the a an is are was were");

        assert!(analysis.focal_entities.is_empty());
        assert!(analysis.discriminative_modifiers.is_empty());
        assert!(!analysis.relational_context.is_empty());
    }

    #[test]
    fn test_total_weight_calculation() {
        // A content-rich query must accumulate a positive total weight.
        let analysis = analyze_query("fast robot detecting red obstacles");
        assert!(analysis.total_weight() > 0.0);
    }

    #[test]
    fn test_to_ic_weights() {
        use crate::constants::{IC_ADJECTIVE, IC_NOUN, IC_VERB};

        let analysis = analyze_query("fast robot detecting obstacles");
        let weights = analysis.to_ic_weights();

        assert!(!weights.is_empty(), "Weights should not be empty");

        // POS tagging may vary per term, so only require that at least one
        // weight matches one of the base IC values
        // (nouns 1.5, adjectives 0.9, verbs 0.7).
        let matches_noun = weights.values().any(|&w| (w - IC_NOUN).abs() < 0.01);
        let matches_adj = weights.values().any(|&w| (w - IC_ADJECTIVE).abs() < 0.01);
        let matches_verb = weights.values().any(|&w| (w - IC_VERB).abs() < 0.01);

        assert!(
            matches_noun || matches_adj || matches_verb,
            "Should have at least one IC weight type. Weights: {:?}",
            weights
        );
    }

    #[test]
    fn test_to_phrase_boosts() {
        // Query containing known compound-noun patterns.
        let analysis = analyze_query("machine learning model for semantic search");
        let phrases = analysis.to_phrase_boosts();

        let detected = |name: &str| phrases.iter().any(|(p, _)| p == name);
        assert!(
            detected("machine learning") || detected("semantic search"),
            "Should detect 'machine learning' or 'semantic search' as phrase. Found: {:?}",
            phrases
        );

        // Every boost must be at least neutral (compounds get the higher 2.0).
        for (phrase, boost) in &phrases {
            assert!(
                *boost >= 1.0,
                "Phrase '{}' should have boost >= 1.0, got {}",
                phrase,
                boost
            );
        }
    }

    #[test]
    fn test_to_phrase_boosts_support_group() {
        // LoCoMo-style temporal query containing a known compound.
        let analysis = analyze_query("when did she go to the support group");
        let phrases = analysis.to_phrase_boosts();

        assert!(
            phrases.iter().any(|(p, _)| p == "support group"),
            "Should detect 'support group' as phrase. Found: {:?}",
            phrases
        );
    }
}