offline_intelligence/utils/
topic_extractor.rs1use crate::memory::Message;
4use lazy_static::lazy_static;
5
6lazy_static! {
7 static ref STOP_WORDS: Vec<&'static str> = vec![
8 "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
9 "of", "with", "by", "is", "am", "are", "was", "were", "be", "been",
10 "being", "have", "has", "had", "do", "does", "did", "will", "would",
11 "shall", "should", "may", "might", "must", "can", "could", "i", "you",
12 "he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
13 "my", "your", "his", "its", "our", "their", "mine", "yours", "hers",
14 "ours", "theirs", "this", "that", "these", "those",
15 ];
16}
17
/// Heuristic, keyword-based topic extractor for free-form text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TopicExtractor {
    /// Upper bound on the number of topics returned by a single extraction.
    max_topics: usize,
    /// Words shorter than this are never included in a topic.
    min_word_length: usize,
}
23
24impl Default for TopicExtractor {
25 fn default() -> Self {
26 Self {
27 max_topics: 3,
28 min_word_length: 3,
29 }
30 }
31}
32
33impl TopicExtractor {
34 pub fn new(max_topics: usize, min_word_length: usize) -> Self {
35 Self {
36 max_topics,
37 min_word_length,
38 }
39 }
40
41 pub fn extract_from_text(&self, text: &str) -> Vec<String> {
43 let mut topics = Vec::new();
44 let text_lower = text.to_lowercase();
45 let words: Vec<&str> = text_lower.split_whitespace().collect();
46
47 let question_words = ["what", "how", "why", "when", "where", "who", "which"];
49 for i in 0..words.len().saturating_sub(1) {
50 if question_words.contains(&words[i]) {
51 let topic = self.extract_topic_phrase(&words, i + 1, 4);
52 if !topic.is_empty() {
53 topics.push(topic);
54 }
55 }
56
57 if words[i] == "about" || words[i] == "regarding" || words[i] == "discussing" {
59 let topic = self.extract_topic_phrase(&words, i + 1, 3);
60 if !topic.is_empty() {
61 topics.push(topic);
62 }
63 }
64 }
65
66 if topics.is_empty() {
68 let significant: Vec<&str> = words.iter()
69 .filter(|&&word| {
70 word.len() >= self.min_word_length &&
71 !STOP_WORDS.contains(&word) &&
72 (word.ends_with("ing") || word.ends_with("tion") ||
73 word.starts_with("what") || word.starts_with("how"))
74 })
75 .take(self.max_topics * 2)
76 .copied()
77 .collect();
78
79 if !significant.is_empty() {
80 topics.push(significant.join(" "));
81 }
82 }
83
84 topics.sort();
86 topics.dedup();
87 topics.truncate(self.max_topics);
88
89 topics.iter_mut().for_each(|topic| {
91 if !topic.is_empty() {
92 let mut chars: Vec<char> = topic.chars().collect();
93 if !chars.is_empty() {
94 chars[0] = chars[0].to_uppercase().next().unwrap_or(chars[0]);
95 *topic = chars.into_iter().collect();
96 }
97 }
98 });
99
100 topics
101 }
102
103 pub fn extract_from_messages(&self, messages: &[Message], recent_count: usize) -> Vec<String> {
105 let recent_messages: Vec<&Message> = messages.iter()
106 .rev()
107 .take(recent_count)
108 .collect();
109
110 let mut all_topics = Vec::new();
111 for message in recent_messages {
112 let topics = self.extract_from_text(&message.content);
113 all_topics.extend(topics);
114 }
115
116 all_topics.sort();
118 all_topics.dedup();
119 all_topics.truncate(self.max_topics);
120
121 all_topics
122 }
123
124 fn extract_topic_phrase(&self, words: &[&str], start: usize, max_words: usize) -> String {
126 let end = (start + max_words).min(words.len());
127 if start >= end {
128 return String::new();
129 }
130
131 let phrase_words: Vec<&str> = words[start..end].iter()
132 .filter(|&&word| word.len() >= self.min_word_length && !STOP_WORDS.contains(&word))
133 .copied()
134 .collect();
135
136 if phrase_words.is_empty() {
137 String::new()
138 } else {
139 phrase_words.join(" ")
140 }
141 }
142
143 pub fn is_stop_word(word: &str) -> bool {
145 STOP_WORDS.contains(&word.to_lowercase().as_str())
146 }
147}