kaiba/domain/services/
emphasis_parser.rs1use pulldown_cmark::{Event, Parser, Tag, TagEnd};
7use uuid::Uuid;
8
9use crate::domain::entities::{EmphasisNode, EmphasisParseResult, TextPosition};
10use crate::domain::value_objects::EmphasisStyle;
11
/// Tunable settings for [`EmphasisParser`].
#[derive(Debug, Clone)]
pub struct EmphasisParserConfig {
    /// How much of the surrounding source is captured on each side of an
    /// emphasized span when building its contextual text. The parser applies
    /// this value to byte offsets into the source.
    pub context_chars: usize,
    /// Minimum length an emphasized run must have to be reported. The parser
    /// compares this against the text's `len()`, i.e. its UTF-8 byte length.
    pub min_text_length: usize,
}

impl Default for EmphasisParserConfig {
    /// Returns the stock settings: 200 units of context on each side and a
    /// minimum emphasized-text length of 2.
    fn default() -> Self {
        EmphasisParserConfig {
            context_chars: 200,
            min_text_length: 2,
        }
    }
}
29
30pub struct EmphasisParser {
32 config: EmphasisParserConfig,
33}
34
35impl EmphasisParser {
36 pub fn new() -> Self {
37 Self {
38 config: EmphasisParserConfig::default(),
39 }
40 }
41
42 pub fn with_config(config: EmphasisParserConfig) -> Self {
43 Self { config }
44 }
45
46 pub fn parse(&self, doc_id: Uuid, content: &str) -> EmphasisParseResult {
48 let mut result = EmphasisParseResult::new();
49 let parser = Parser::new(content);
50
51 let mut in_bold = false;
53 let mut in_italic = false;
54 let mut current_text = String::new();
55 let mut text_start_offset: usize = 0;
56
57 for (event, range) in parser.into_offset_iter() {
58 match event {
59 Event::Start(Tag::Strong) => {
60 in_bold = true;
61 current_text.clear();
62 text_start_offset = range.start;
63 }
64 Event::End(TagEnd::Strong) => {
65 if !current_text.is_empty() && current_text.len() >= self.config.min_text_length
66 {
67 let style = if in_italic {
68 EmphasisStyle::BoldItalic
69 } else {
70 EmphasisStyle::Bold
71 };
72 let position = self.calculate_position(content, text_start_offset);
73 let contextual_text =
74 self.extract_context(content, text_start_offset, range.end);
75
76 result.add_node(EmphasisNode::new(
77 doc_id,
78 current_text.clone(),
79 style,
80 position,
81 contextual_text,
82 ));
83 }
84 in_bold = false;
85 current_text.clear();
86 }
87 Event::Start(Tag::Emphasis) => {
88 if !in_bold {
89 current_text.clear();
90 text_start_offset = range.start;
91 }
92 in_italic = true;
93 }
94 Event::End(TagEnd::Emphasis) => {
95 if !in_bold
96 && !current_text.is_empty()
97 && current_text.len() >= self.config.min_text_length
98 {
99 let position = self.calculate_position(content, text_start_offset);
100 let contextual_text =
101 self.extract_context(content, text_start_offset, range.end);
102
103 result.add_node(EmphasisNode::new(
104 doc_id,
105 current_text.clone(),
106 EmphasisStyle::Italic,
107 position,
108 contextual_text,
109 ));
110 }
111 in_italic = false;
112 if !in_bold {
113 current_text.clear();
114 }
115 }
116 Event::Code(code) => {
117 if code.len() >= self.config.min_text_length {
118 let position = self.calculate_position(content, range.start);
119 let contextual_text = self.extract_context(content, range.start, range.end);
120
121 result.add_node(EmphasisNode::new(
122 doc_id,
123 code.to_string(),
124 EmphasisStyle::Code,
125 position,
126 contextual_text,
127 ));
128 }
129 }
130 Event::Text(text) => {
131 if in_bold || in_italic {
132 current_text.push_str(&text);
133 }
134 }
135 _ => {}
136 }
137 }
138
139 result
140 }
141
142 fn calculate_position(&self, content: &str, byte_offset: usize) -> TextPosition {
144 let prefix = &content[..byte_offset.min(content.len())];
145 let line = prefix.matches('\n').count() + 1;
146 let column = prefix
147 .rfind('\n')
148 .map(|pos| byte_offset - pos)
149 .unwrap_or(byte_offset + 1);
150
151 TextPosition::new(byte_offset, line, column)
152 }
153
154 fn extract_context(&self, content: &str, start: usize, end: usize) -> String {
156 let context_start = start.saturating_sub(self.config.context_chars);
157 let context_end = (end + self.config.context_chars).min(content.len());
158
159 let actual_start = content[..context_start]
161 .rfind(char::is_whitespace)
162 .map(|p| p + 1)
163 .unwrap_or(context_start);
164
165 let actual_end = content[context_end..]
166 .find(char::is_whitespace)
167 .map(|p| context_end + p)
168 .unwrap_or(context_end);
169
170 content[actual_start..actual_end].to_string()
171 }
172}
173
174impl Default for EmphasisParser {
175 fn default() -> Self {
176 Self::new()
177 }
178}
179
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `content` with a default-configured parser and a fresh document id.
    fn parse_default(content: &str) -> EmphasisParseResult {
        EmphasisParser::new().parse(Uuid::new_v4(), content)
    }

    #[test]
    fn test_parse_bold() {
        let result = parse_default("This is **important** text.");

        assert_eq!(result.bold_count, 1);
        let node = &result.nodes[0];
        assert_eq!(node.text, "important");
        assert_eq!(node.style, EmphasisStyle::Bold);
    }

    #[test]
    fn test_parse_italic() {
        let result = parse_default("This is *emphasized* text.");

        assert_eq!(result.italic_count, 1);
        let node = &result.nodes[0];
        assert_eq!(node.text, "emphasized");
        assert_eq!(node.style, EmphasisStyle::Italic);
    }

    #[test]
    fn test_parse_bold_italic() {
        let result = parse_default("This is ***critical*** concept.");

        // A triple-delimiter span must come out as a single combined node.
        assert_eq!(result.total_count(), 1);
        let node = &result.nodes[0];
        assert_eq!(node.text, "critical");
        assert_eq!(node.style, EmphasisStyle::BoldItalic);
    }

    #[test]
    fn test_parse_code() {
        let result = parse_default("Use the `println!` macro.");

        assert_eq!(result.code_count, 1);
        let node = &result.nodes[0];
        assert_eq!(node.text, "println!");
        assert_eq!(node.style, EmphasisStyle::Code);
    }

    #[test]
    fn test_parse_mixed() {
        // The raw string stays at column 0: indenting it would change the Markdown.
        let content = r#"
# GraphKai Architecture

The **knowledge graph** stores *semantic relationships* between concepts.
Use `EmphasisNode` to represent emphasized text.
"#;

        let result = parse_default(content);

        assert_eq!(result.bold_count, 1);
        assert_eq!(result.italic_count, 1);
        assert_eq!(result.code_count, 1);
        assert_eq!(result.total_count(), 3);
    }

    #[test]
    fn test_context_extraction() {
        let result = parse_default("The quick brown fox **jumps** over the lazy dog.");

        let context = &result.nodes[0].contextual_text;
        assert!(context.contains("jumps"));
        assert!(context.contains("fox"));
        assert!(context.contains("over"));
    }

    #[test]
    fn test_position_tracking() {
        let result = parse_default("Line 1\nLine 2 with **bold** text\nLine 3");

        assert_eq!(result.nodes[0].position.line, 2);
    }

    #[test]
    fn test_min_length_filter() {
        let config = EmphasisParserConfig {
            context_chars: 200,
            min_text_length: 5,
        };
        let parser = EmphasisParser::with_config(config);

        let result = parser.parse(
            Uuid::new_v4(),
            "Skip **a** but include **longer text** here.",
        );

        assert_eq!(result.bold_count, 1);
        assert_eq!(result.nodes[0].text, "longer text");
    }
}