offline_intelligence/utils/
topic_extractor.rs

1//! Common topic extraction utilities
2
3use crate::memory::Message;
4use regex::Regex;
5use lazy_static::lazy_static;
6
/// Lowercase English function words (articles, conjunctions, prepositions,
/// auxiliary verbs and pronouns) that are never kept as part of a topic.
///
/// The list is a plain compile-time slice: it consists only of string
/// literals, so no lazy initialisation or heap allocation is required.
static STOP_WORDS: &[&str] = &[
    "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
    "of", "with", "by", "is", "am", "are", "was", "were", "be", "been",
    "being", "have", "has", "had", "do", "does", "did", "will", "would",
    "shall", "should", "may", "might", "must", "can", "could", "i", "you",
    "he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
    "my", "your", "his", "its", "our", "their", "mine", "yours", "hers",
    "ours", "theirs", "this", "that", "these", "those",
];
18
/// Extract topics from text with configurable parameters
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TopicExtractor {
    /// Upper bound on the number of topics an extraction call returns.
    max_topics: usize,
    /// Minimum length a word must have to be kept as part of a topic.
    min_word_length: usize,
}
24
25impl Default for TopicExtractor {
26    fn default() -> Self {
27        Self {
28            max_topics: 3,
29            min_word_length: 3,
30        }
31    }
32}
33
34impl TopicExtractor {
35    pub fn new(max_topics: usize, min_word_length: usize) -> Self {
36        Self {
37            max_topics,
38            min_word_length,
39        }
40    }
41    
42    /// Extract topics from a single text
43    pub fn extract_from_text(&self, text: &str) -> Vec<String> {
44        let mut topics = Vec::new();
45        let text_lower = text.to_lowercase();
46        let words: Vec<&str> = text_lower.split_whitespace().collect();
47        
48        // Look for question patterns
49        let question_words = ["what", "how", "why", "when", "where", "who", "which"];
50        for i in 0..words.len().saturating_sub(1) {
51            if question_words.contains(&words[i]) {
52                let topic = self.extract_topic_phrase(&words, i + 1, 4);
53                if !topic.is_empty() {
54                    topics.push(topic);
55                }
56            }
57            
58            // Look for "about" pattern
59            if words[i] == "about" || words[i] == "regarding" || words[i] == "discussing" {
60                let topic = self.extract_topic_phrase(&words, i + 1, 3);
61                if !topic.is_empty() {
62                    topics.push(topic);
63                }
64            }
65        }
66        
67        // Fallback: extract significant words
68        if topics.is_empty() {
69            let significant: Vec<&str> = words.iter()
70                .filter(|&&word| {
71                    word.len() >= self.min_word_length &&
72                    !STOP_WORDS.contains(&word) &&
73                    (word.ends_with("ing") || word.ends_with("tion") || 
74                     word.starts_with("what") || word.starts_with("how"))
75                })
76                .take(self.max_topics * 2)
77                .copied()
78                .collect();
79            
80            if !significant.is_empty() {
81                topics.push(significant.join(" "));
82            }
83        }
84        
85        // Deduplicate and limit
86        topics.sort();
87        topics.dedup();
88        topics.truncate(self.max_topics);
89        
90        // Capitalize first letter
91        topics.iter_mut().for_each(|topic| {
92            if !topic.is_empty() {
93                let mut chars: Vec<char> = topic.chars().collect();
94                if !chars.is_empty() {
95                    chars[0] = chars[0].to_uppercase().next().unwrap_or(chars[0]);
96                    *topic = chars.into_iter().collect();
97                }
98            }
99        });
100        
101        topics
102    }
103    
104    /// Extract topics from messages
105    pub fn extract_from_messages(&self, messages: &[Message], recent_count: usize) -> Vec<String> {
106        let recent_messages: Vec<&Message> = messages.iter()
107            .rev()
108            .take(recent_count)
109            .collect();
110        
111        let mut all_topics = Vec::new();
112        for message in recent_messages {
113            let topics = self.extract_from_text(&message.content);
114            all_topics.extend(topics);
115        }
116        
117        // Deduplicate and limit
118        all_topics.sort();
119        all_topics.dedup();
120        all_topics.truncate(self.max_topics);
121        
122        all_topics
123    }
124    
125    /// Helper to extract topic phrase starting from position
126    fn extract_topic_phrase(&self, words: &[&str], start: usize, max_words: usize) -> String {
127        let end = (start + max_words).min(words.len());
128        if start >= end {
129            return String::new();
130        }
131        
132        let phrase_words: Vec<&str> = words[start..end].iter()
133            .filter(|&&word| word.len() >= self.min_word_length && !STOP_WORDS.contains(&word))
134            .copied()
135            .collect();
136        
137        if phrase_words.is_empty() {
138            String::new()
139        } else {
140            phrase_words.join(" ")
141        }
142    }
143    
144    /// Check if a word is a stop word
145    pub fn is_stop_word(word: &str) -> bool {
146        STOP_WORDS.contains(&word.to_lowercase().as_str())
147    }
148}