Skip to main content

kaiba/domain/services/
emphasis_parser.rs

//! EmphasisParser - Markdown emphasis extraction service
//!
//! Parses Markdown documents to extract emphasized text (bold, italic, code)
//! and generates EmphasisNodes with contextual information for GraphKai.
5
6use pulldown_cmark::{Event, Parser, Tag, TagEnd};
7use uuid::Uuid;
8
9use crate::domain::entities::{EmphasisNode, EmphasisParseResult, TextPosition};
10use crate::domain::value_objects::EmphasisStyle;
11
/// Configuration for emphasis parsing.
///
/// Both limits are applied to the UTF-8 encoded source, so they are measured
/// in bytes rather than in user-perceived characters.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EmphasisParserConfig {
    /// Size of the context window kept before and after an emphasis span.
    ///
    /// Despite the name, the parser subtracts/adds this value to byte offsets
    /// into the document, so multi-byte characters consume more of the budget.
    pub context_chars: usize,
    /// Minimum length an emphasized text must have to be extracted.
    ///
    /// Compared against `str::len`, i.e. the UTF-8 byte length of the text.
    pub min_text_length: usize,
}
20
21impl Default for EmphasisParserConfig {
22    fn default() -> Self {
23        Self {
24            context_chars: 200, // ~50 tokens worth of context
25            min_text_length: 2,
26        }
27    }
28}
29
30/// Service for parsing emphasis from Markdown documents
31pub struct EmphasisParser {
32    config: EmphasisParserConfig,
33}
34
35impl EmphasisParser {
36    pub fn new() -> Self {
37        Self {
38            config: EmphasisParserConfig::default(),
39        }
40    }
41
42    pub fn with_config(config: EmphasisParserConfig) -> Self {
43        Self { config }
44    }
45
46    /// Parse a Markdown document and extract all emphasis nodes
47    pub fn parse(&self, doc_id: Uuid, content: &str) -> EmphasisParseResult {
48        let mut result = EmphasisParseResult::new();
49        let parser = Parser::new(content);
50
51        // Track current emphasis state
52        let mut in_bold = false;
53        let mut in_italic = false;
54        let mut current_text = String::new();
55        let mut text_start_offset: usize = 0;
56
57        for (event, range) in parser.into_offset_iter() {
58            match event {
59                Event::Start(Tag::Strong) => {
60                    in_bold = true;
61                    current_text.clear();
62                    text_start_offset = range.start;
63                }
64                Event::End(TagEnd::Strong) => {
65                    if !current_text.is_empty() && current_text.len() >= self.config.min_text_length
66                    {
67                        let style = if in_italic {
68                            EmphasisStyle::BoldItalic
69                        } else {
70                            EmphasisStyle::Bold
71                        };
72                        let position = self.calculate_position(content, text_start_offset);
73                        let contextual_text =
74                            self.extract_context(content, text_start_offset, range.end);
75
76                        result.add_node(EmphasisNode::new(
77                            doc_id,
78                            current_text.clone(),
79                            style,
80                            position,
81                            contextual_text,
82                        ));
83                    }
84                    in_bold = false;
85                    current_text.clear();
86                }
87                Event::Start(Tag::Emphasis) => {
88                    if !in_bold {
89                        current_text.clear();
90                        text_start_offset = range.start;
91                    }
92                    in_italic = true;
93                }
94                Event::End(TagEnd::Emphasis) => {
95                    if !in_bold
96                        && !current_text.is_empty()
97                        && current_text.len() >= self.config.min_text_length
98                    {
99                        let position = self.calculate_position(content, text_start_offset);
100                        let contextual_text =
101                            self.extract_context(content, text_start_offset, range.end);
102
103                        result.add_node(EmphasisNode::new(
104                            doc_id,
105                            current_text.clone(),
106                            EmphasisStyle::Italic,
107                            position,
108                            contextual_text,
109                        ));
110                    }
111                    in_italic = false;
112                    if !in_bold {
113                        current_text.clear();
114                    }
115                }
116                Event::Code(code) => {
117                    if code.len() >= self.config.min_text_length {
118                        let position = self.calculate_position(content, range.start);
119                        let contextual_text = self.extract_context(content, range.start, range.end);
120
121                        result.add_node(EmphasisNode::new(
122                            doc_id,
123                            code.to_string(),
124                            EmphasisStyle::Code,
125                            position,
126                            contextual_text,
127                        ));
128                    }
129                }
130                Event::Text(text) => {
131                    if in_bold || in_italic {
132                        current_text.push_str(&text);
133                    }
134                }
135                _ => {}
136            }
137        }
138
139        result
140    }
141
142    /// Calculate line/column position from byte offset
143    fn calculate_position(&self, content: &str, byte_offset: usize) -> TextPosition {
144        let prefix = &content[..byte_offset.min(content.len())];
145        let line = prefix.matches('\n').count() + 1;
146        let column = prefix
147            .rfind('\n')
148            .map(|pos| byte_offset - pos)
149            .unwrap_or(byte_offset + 1);
150
151        TextPosition::new(byte_offset, line, column)
152    }
153
154    /// Extract surrounding context for an emphasis
155    fn extract_context(&self, content: &str, start: usize, end: usize) -> String {
156        let context_start = start.saturating_sub(self.config.context_chars);
157        let context_end = (end + self.config.context_chars).min(content.len());
158
159        // Find word boundaries
160        let actual_start = content[..context_start]
161            .rfind(char::is_whitespace)
162            .map(|p| p + 1)
163            .unwrap_or(context_start);
164
165        let actual_end = content[context_end..]
166            .find(char::is_whitespace)
167            .map(|p| context_end + p)
168            .unwrap_or(context_end);
169
170        content[actual_start..actual_end].to_string()
171    }
172}
173
174impl Default for EmphasisParser {
175    fn default() -> Self {
176        Self::new()
177    }
178}
179
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs a default-configured parser over `content` with a fresh document id.
    fn parse_default(content: &str) -> EmphasisParseResult {
        EmphasisParser::new().parse(Uuid::new_v4(), content)
    }

    #[test]
    fn test_parse_bold() {
        let result = parse_default("This is **important** text.");

        assert_eq!(result.bold_count, 1);
        let node = &result.nodes[0];
        assert_eq!(node.text, "important");
        assert_eq!(node.style, EmphasisStyle::Bold);
    }

    #[test]
    fn test_parse_italic() {
        let result = parse_default("This is *emphasized* text.");

        assert_eq!(result.italic_count, 1);
        let node = &result.nodes[0];
        assert_eq!(node.text, "emphasized");
        assert_eq!(node.style, EmphasisStyle::Italic);
    }

    #[test]
    fn test_parse_bold_italic() {
        let result = parse_default("This is ***critical*** concept.");

        // Triple asterisks nest emphasis and strong; the parser is expected to
        // report the pair as a single BoldItalic node.
        assert_eq!(result.total_count(), 1);
        let node = &result.nodes[0];
        assert_eq!(node.text, "critical");
        assert_eq!(node.style, EmphasisStyle::BoldItalic);
    }

    #[test]
    fn test_parse_code() {
        let result = parse_default("Use the `println!` macro.");

        assert_eq!(result.code_count, 1);
        let node = &result.nodes[0];
        assert_eq!(node.text, "println!");
        assert_eq!(node.style, EmphasisStyle::Code);
    }

    #[test]
    fn test_parse_mixed() {
        let content = r#"
# GraphKai Architecture

The **knowledge graph** stores *semantic relationships* between concepts.
Use `EmphasisNode` to represent emphasized text.
"#;

        let result = parse_default(content);

        // One node of each kind, and nothing else.
        assert_eq!(result.bold_count, 1);
        assert_eq!(result.italic_count, 1);
        assert_eq!(result.code_count, 1);
        assert_eq!(result.total_count(), 3);
    }

    #[test]
    fn test_context_extraction() {
        let result = parse_default("The quick brown fox **jumps** over the lazy dog.");

        // The context must include the emphasis itself and words on both sides.
        let context = &result.nodes[0].contextual_text;
        assert!(context.contains("jumps"));
        assert!(context.contains("fox"));
        assert!(context.contains("over"));
    }

    #[test]
    fn test_position_tracking() {
        let result = parse_default("Line 1\nLine 2 with **bold** text\nLine 3");

        assert_eq!(result.nodes[0].position.line, 2);
    }

    #[test]
    fn test_min_length_filter() {
        let config = EmphasisParserConfig {
            context_chars: 200,
            min_text_length: 5,
        };
        let parser = EmphasisParser::with_config(config);

        let result = parser.parse(
            Uuid::new_v4(),
            "Skip **a** but include **longer text** here.",
        );

        // "a" is below the 5-byte minimum, so only the longer span survives.
        assert_eq!(result.bold_count, 1);
        assert_eq!(result.nodes[0].text, "longer text");
    }
}