// Source: offline_intelligence/utils/topic_extractor.rs

//! Common topic extraction utilities.
2
3use crate::memory::Message;
4use lazy_static::lazy_static;
5
6lazy_static! {
7    static ref STOP_WORDS: Vec<&'static str> = vec![
8        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
9        "of", "with", "by", "is", "am", "are", "was", "were", "be", "been",
10        "being", "have", "has", "had", "do", "does", "did", "will", "would",
11        "shall", "should", "may", "might", "must", "can", "could", "i", "you",
12        "he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
13        "my", "your", "his", "its", "our", "their", "mine", "yours", "hers",
14        "ours", "theirs", "this", "that", "these", "those",
15    ];
16}
17
/// Extract topics from text with configurable parameters.
///
/// The two fields bound the output: how many topics a call may return and how
/// short a word may be before it is ignored.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TopicExtractor {
    /// Upper bound on the number of topics returned by one extraction call.
    max_topics: usize,
    /// Minimum length a word must have to be kept; compared against `str::len`,
    /// i.e. the word's size in UTF-8 bytes.
    min_word_length: usize,
}
23
24impl Default for TopicExtractor {
25    fn default() -> Self {
26        Self {
27            max_topics: 3,
28            min_word_length: 3,
29        }
30    }
31}
32
33impl TopicExtractor {
34    pub fn new(max_topics: usize, min_word_length: usize) -> Self {
35        Self {
36            max_topics,
37            min_word_length,
38        }
39    }
40    
41    /// Extract topics from a single text
42    pub fn extract_from_text(&self, text: &str) -> Vec<String> {
43        let mut topics = Vec::new();
44        let text_lower = text.to_lowercase();
45        let words: Vec<&str> = text_lower.split_whitespace().collect();
46        
47        // Look for question patterns
48        let question_words = ["what", "how", "why", "when", "where", "who", "which"];
49        for i in 0..words.len().saturating_sub(1) {
50            if question_words.contains(&words[i]) {
51                let topic = self.extract_topic_phrase(&words, i + 1, 4);
52                if !topic.is_empty() {
53                    topics.push(topic);
54                }
55            }
56            
57            // Look for "about" pattern
58            if words[i] == "about" || words[i] == "regarding" || words[i] == "discussing" {
59                let topic = self.extract_topic_phrase(&words, i + 1, 3);
60                if !topic.is_empty() {
61                    topics.push(topic);
62                }
63            }
64        }
65        
66        // Fallback: extract significant words
67        if topics.is_empty() {
68            let significant: Vec<&str> = words.iter()
69                .filter(|&&word| {
70                    word.len() >= self.min_word_length &&
71                    !STOP_WORDS.contains(&word) &&
72                    (word.ends_with("ing") || word.ends_with("tion") || 
73                     word.starts_with("what") || word.starts_with("how"))
74                })
75                .take(self.max_topics * 2)
76                .copied()
77                .collect();
78            
79            if !significant.is_empty() {
80                topics.push(significant.join(" "));
81            }
82        }
83        
84        // Deduplicate and limit
85        topics.sort();
86        topics.dedup();
87        topics.truncate(self.max_topics);
88        
89        // Capitalize first letter
90        topics.iter_mut().for_each(|topic| {
91            if !topic.is_empty() {
92                let mut chars: Vec<char> = topic.chars().collect();
93                if !chars.is_empty() {
94                    chars[0] = chars[0].to_uppercase().next().unwrap_or(chars[0]);
95                    *topic = chars.into_iter().collect();
96                }
97            }
98        });
99        
100        topics
101    }
102    
103    /// Extract topics from messages
104    pub fn extract_from_messages(&self, messages: &[Message], recent_count: usize) -> Vec<String> {
105        let recent_messages: Vec<&Message> = messages.iter()
106            .rev()
107            .take(recent_count)
108            .collect();
109        
110        let mut all_topics = Vec::new();
111        for message in recent_messages {
112            let topics = self.extract_from_text(&message.content);
113            all_topics.extend(topics);
114        }
115        
116        // Deduplicate and limit
117        all_topics.sort();
118        all_topics.dedup();
119        all_topics.truncate(self.max_topics);
120        
121        all_topics
122    }
123    
124    /// Helper to extract topic phrase starting from position
125    fn extract_topic_phrase(&self, words: &[&str], start: usize, max_words: usize) -> String {
126        let end = (start + max_words).min(words.len());
127        if start >= end {
128            return String::new();
129        }
130        
131        let phrase_words: Vec<&str> = words[start..end].iter()
132            .filter(|&&word| word.len() >= self.min_word_length && !STOP_WORDS.contains(&word))
133            .copied()
134            .collect();
135        
136        if phrase_words.is_empty() {
137            String::new()
138        } else {
139            phrase_words.join(" ")
140        }
141    }
142    
143    /// Check if a word is a stop word
144    pub fn is_stop_word(word: &str) -> bool {
145        STOP_WORDS.contains(&word.to_lowercase().as_str())
146    }
147}