// offline_intelligence/utils/topic_extractor.rs
use crate::memory::Message;
use regex::Regex;
use lazy_static::lazy_static;

7lazy_static! {
8 static ref STOP_WORDS: Vec<&'static str> = vec![
9 "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
10 "of", "with", "by", "is", "am", "are", "was", "were", "be", "been",
11 "being", "have", "has", "had", "do", "does", "did", "will", "would",
12 "shall", "should", "may", "might", "must", "can", "could", "i", "you",
13 "he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
14 "my", "your", "his", "its", "our", "their", "mine", "yours", "hers",
15 "ours", "theirs", "this", "that", "these", "those",
16 ];
17}
18
/// Heuristic, keyword-based topic extractor for free-form text.
///
/// The two fields bound how many topics are reported and how short a word may
/// be before it is discarded.
#[derive(Debug, Clone)]
pub struct TopicExtractor {
    /// Upper bound on the number of topics returned by an extraction call.
    max_topics: usize,
    /// Minimum length a word must have to be kept as part of a topic.
    min_word_length: usize,
}

25impl Default for TopicExtractor {
26 fn default() -> Self {
27 Self {
28 max_topics: 3,
29 min_word_length: 3,
30 }
31 }
32}
33
34impl TopicExtractor {
35 pub fn new(max_topics: usize, min_word_length: usize) -> Self {
36 Self {
37 max_topics,
38 min_word_length,
39 }
40 }
41
42 pub fn extract_from_text(&self, text: &str) -> Vec<String> {
44 let mut topics = Vec::new();
45 let text_lower = text.to_lowercase();
46 let words: Vec<&str> = text_lower.split_whitespace().collect();
47
48 let question_words = ["what", "how", "why", "when", "where", "who", "which"];
50 for i in 0..words.len().saturating_sub(1) {
51 if question_words.contains(&words[i]) {
52 let topic = self.extract_topic_phrase(&words, i + 1, 4);
53 if !topic.is_empty() {
54 topics.push(topic);
55 }
56 }
57
58 if words[i] == "about" || words[i] == "regarding" || words[i] == "discussing" {
60 let topic = self.extract_topic_phrase(&words, i + 1, 3);
61 if !topic.is_empty() {
62 topics.push(topic);
63 }
64 }
65 }
66
67 if topics.is_empty() {
69 let significant: Vec<&str> = words.iter()
70 .filter(|&&word| {
71 word.len() >= self.min_word_length &&
72 !STOP_WORDS.contains(&word) &&
73 (word.ends_with("ing") || word.ends_with("tion") ||
74 word.starts_with("what") || word.starts_with("how"))
75 })
76 .take(self.max_topics * 2)
77 .copied()
78 .collect();
79
80 if !significant.is_empty() {
81 topics.push(significant.join(" "));
82 }
83 }
84
85 topics.sort();
87 topics.dedup();
88 topics.truncate(self.max_topics);
89
90 topics.iter_mut().for_each(|topic| {
92 if !topic.is_empty() {
93 let mut chars: Vec<char> = topic.chars().collect();
94 if !chars.is_empty() {
95 chars[0] = chars[0].to_uppercase().next().unwrap_or(chars[0]);
96 *topic = chars.into_iter().collect();
97 }
98 }
99 });
100
101 topics
102 }
103
104 pub fn extract_from_messages(&self, messages: &[Message], recent_count: usize) -> Vec<String> {
106 let recent_messages: Vec<&Message> = messages.iter()
107 .rev()
108 .take(recent_count)
109 .collect();
110
111 let mut all_topics = Vec::new();
112 for message in recent_messages {
113 let topics = self.extract_from_text(&message.content);
114 all_topics.extend(topics);
115 }
116
117 all_topics.sort();
119 all_topics.dedup();
120 all_topics.truncate(self.max_topics);
121
122 all_topics
123 }
124
125 fn extract_topic_phrase(&self, words: &[&str], start: usize, max_words: usize) -> String {
127 let end = (start + max_words).min(words.len());
128 if start >= end {
129 return String::new();
130 }
131
132 let phrase_words: Vec<&str> = words[start..end].iter()
133 .filter(|&&word| word.len() >= self.min_word_length && !STOP_WORDS.contains(&word))
134 .copied()
135 .collect();
136
137 if phrase_words.is_empty() {
138 String::new()
139 } else {
140 phrase_words.join(" ")
141 }
142 }
143
144 pub fn is_stop_word(word: &str) -> bool {
146 STOP_WORDS.contains(&word.to_lowercase().as_str())
147 }
148}