llm_toolkit/extract/
extractors.rs1use super::core::{ContentExtractor, ExtractionStrategy};
2
3use super::error::ParseError;
4use log::debug;
5use regex::Regex;
6
/// Extractor that pulls an answer out of free-form text by trying one or
/// more extraction strategies (tags, JSON brackets, keywords, regex, ...).
#[derive(Debug, Clone)]
pub struct FlexibleExtractor {
    /// When `true`, extraction methods emit `debug!` log lines describing
    /// which strategy is being tried and whether it succeeded.
    debug_mode: bool,
}
11
12impl FlexibleExtractor {
13 pub fn new() -> Self {
14 Self { debug_mode: false }
15 }
16
17 pub fn with_debug(mut self) -> Self {
18 self.debug_mode = true;
19 self
20 }
21
22 pub fn standard_extraction_strategies() -> Vec<ExtractionStrategy> {
23 vec![
24 ExtractionStrategy::TaggedContent("answer".to_string()),
25 ExtractionStrategy::JsonBrackets,
26 ExtractionStrategy::FirstJsonObject,
27 ]
28 }
29
30 pub fn extract(&self, text: &str) -> Result<String, ParseError> {
32 if self.debug_mode {
33 debug!("Extracting content from text: {}", text);
34 }
35 self.extract_with_strategies(text, &Self::standard_extraction_strategies())
36 }
37
38 pub fn extract_with_strategy(
40 &self,
41 text: &str,
42 strategy: &ExtractionStrategy,
43 ) -> Option<String> {
44 if self.debug_mode {
45 debug!("Trying extraction strategy: {:?}", strategy);
46 }
47
48 match strategy {
49 ExtractionStrategy::TaggedContent(tag) => self.extract_tagged(text, tag),
50 ExtractionStrategy::JsonBrackets => self.extract_json_like(text),
51 ExtractionStrategy::FirstJsonObject => self.extract_first_json_object(text),
52 ExtractionStrategy::KeywordSearch(keywords) => self.extract_by_keywords(text, keywords),
53 ExtractionStrategy::RegexPattern(pattern) => self.extract_pattern(text, pattern),
54 ExtractionStrategy::OriginalText => Some(text.to_string()),
55 }
56 }
57
58 pub fn extract_with_strategies(
60 &self,
61 text: &str,
62 strategies: &[ExtractionStrategy],
63 ) -> Result<String, ParseError> {
64 let mut errors = Vec::new();
65
66 for strategy in strategies {
67 if let Some(result) = self.extract_with_strategy(text, strategy) {
68 if self.debug_mode {
69 debug!("Successfully extracted with strategy: {:?}", strategy);
70 }
71 return Ok(result);
72 } else {
73 errors.push(format!("Strategy {:?} failed", strategy));
74 }
75 }
76
77 Err(ParseError::AllStrategiesFailed(errors))
78 }
79
80 fn extract_first_json_entity(&self, text: &str) -> Option<String> {
82 let mut bracket_count = 0;
83 let mut start_pos = None;
84 let mut in_string = false;
85 let mut escape_next = false;
86 let mut opening_char = None;
87
88 for (i, ch) in text.char_indices() {
89 if escape_next {
90 escape_next = false;
91 continue;
92 }
93
94 match ch {
95 '\\' if in_string => escape_next = true,
96 '"' => in_string = !in_string,
97 '{' | '[' if !in_string => {
98 if bracket_count == 0 {
99 start_pos = Some(i);
100 opening_char = Some(ch);
101 }
102 bracket_count += 1;
103 }
104 '}' | ']' if !in_string => {
105 bracket_count -= 1;
106 if bracket_count == 0
107 && let Some(p) = start_pos
108 && let Some(opening) = opening_char
109 {
110 let is_valid =
112 (opening == '{' && ch == '}') || (opening == '[' && ch == ']');
113 if is_valid {
114 return Some(text[p..=i].to_string());
115 }
116 }
117 }
118 _ => {}
119 }
120 }
121
122 None
123 }
124
125 fn extract_first_json_object(&self, text: &str) -> Option<String> {
127 self.extract_first_json_entity(text)
128 }
129
130 fn extract_by_keywords(&self, text: &str, keywords: &[String]) -> Option<String> {
132 let lower_text = text.to_lowercase();
133
134 for keyword in keywords {
135 if lower_text.contains(&keyword.to_lowercase()) {
136 return Some(keyword.clone());
138 }
139 }
140
141 None
142 }
143}
144
145impl Default for FlexibleExtractor {
146 fn default() -> Self {
147 Self::new()
148 }
149}
150
151impl ContentExtractor for FlexibleExtractor {
152 fn extract_tagged(&self, text: &str, tag: &str) -> Option<String> {
153 let pattern = format!(r"(?s)<{tag}>(.*?)</{tag}>", tag = regex::escape(tag));
155
156 if let Ok(regex) = Regex::new(&pattern)
157 && let Some(captures) = regex.captures(text)
158 && let Some(content) = captures.get(1)
159 {
160 return Some(content.as_str().trim().to_string());
161 }
162
163 if self.debug_mode {
164 debug!("Failed to extract tagged content with tag: {}", tag);
165 }
166
167 None
168 }
169
170 fn extract_json_like(&self, text: &str) -> Option<String> {
171 let result = self.extract_first_json_entity(text);
173
174 if result.is_none() && self.debug_mode {
175 debug!("Failed to extract JSON-like content");
176 }
177
178 result
179 }
180
181 fn extract_pattern(&self, text: &str, pattern: &str) -> Option<String> {
182 if let Ok(regex) = Regex::new(pattern)
183 && let Some(captures) = regex.captures(text)
184 {
185 if captures.len() > 1 {
187 return captures.get(1).map(|m| m.as_str().to_string());
188 } else {
189 return captures.get(0).map(|m| m.as_str().to_string());
190 }
191 }
192
193 if self.debug_mode {
194 debug!("Failed to extract with pattern: {}", pattern);
195 }
196
197 None
198 }
199}
200
/// Extractor for the contents of a fenced Markdown code block.
#[derive(Debug, Clone)]
pub struct MarkdownCodeBlockExtractor {
    /// Language tag the opening fence must carry; `None` accepts any fence.
    pub language: Option<String>,
}
206
207impl Default for MarkdownCodeBlockExtractor {
208 fn default() -> Self {
209 Self::new()
210 }
211}
212
213impl MarkdownCodeBlockExtractor {
214 pub fn new() -> Self {
216 Self { language: None }
217 }
218
219 pub fn with_language(language: String) -> Self {
221 Self {
222 language: Some(language),
223 }
224 }
225
226 pub fn extract(&self, text: &str) -> Result<String, ParseError> {
228 let pattern = if let Some(ref lang) = self.language {
229 format!(
231 r"(?m)^\s*```\s*{}\s*\n((?:.*\n)*?)^\s*```\s*$",
232 regex::escape(lang)
233 )
234 } else {
235 r"(?m)^\s*```[^\n]*\n((?:.*\n)*?)^\s*```\s*$".to_string()
237 };
238
239 let regex = Regex::new(&pattern)
240 .map_err(|e| ParseError::InvalidFormat(format!("Failed to compile regex: {}", e)))?;
241
242 if let Some(captures) = regex.captures(text)
243 && let Some(content) = captures.get(1)
244 {
245 let extracted = content.as_str().trim_end();
247 return Ok(extracted.to_string());
248 }
249
250 Err(ParseError::TagExtractionFailed(format!(
251 "No markdown code block found{}",
252 if let Some(ref lang) = self.language {
253 format!(" with language '{}'", lang)
254 } else {
255 String::new()
256 }
257 )))
258 }
259}
260
#[cfg(test)]
mod tests {
    use super::*;

    /// Tagged content is returned trimmed, with or without surrounding whitespace.
    #[test]
    fn test_extract_tagged_content() {
        let extractor = FlexibleExtractor::new();

        let plain = extractor.extract_tagged("<answer>Hello World</answer>", "answer");
        assert_eq!(plain.as_deref(), Some("Hello World"));

        let padded = extractor.extract_tagged("<answer>\n Hello World \n</answer>", "answer");
        assert_eq!(padded.as_deref(), Some("Hello World"));
    }

    /// A JSON object embedded in prose is returned without the prose.
    #[test]
    fn test_extract_json_like() {
        let extractor = FlexibleExtractor::new();
        let input = "Here is some JSON: {\"key\": \"value\"} and more text";

        let extracted = extractor.extract_json_like(input);

        assert_eq!(extracted.as_deref(), Some("{\"key\": \"value\"}"));
    }

    /// Only the first of several JSON objects is returned.
    #[test]
    fn test_extract_first_json_object() {
        let extractor = FlexibleExtractor::new();
        let input = "Some text {\"first\": \"object\"} more text {\"second\": \"object\"}";

        let extracted = extractor.extract_first_json_object(input);

        assert_eq!(extracted.as_deref(), Some("{\"first\": \"object\"}"));
    }

    /// Arrays are extracted by both the first-object and JSON-like paths.
    #[test]
    fn test_extract_json_array() {
        let extractor = FlexibleExtractor::new();
        let input = "Here is an array: [{\"key\": \"value\"}] and more text";
        let expected = Some("[{\"key\": \"value\"}]");

        let via_first_object = extractor.extract_first_json_object(input);
        assert_eq!(via_first_object.as_deref(), expected);

        let via_json_like = extractor.extract_json_like(input);
        assert_eq!(via_json_like.as_deref(), expected);
    }

    /// Keyword matching ignores case and returns the keyword as supplied.
    #[test]
    fn test_extract_by_keywords() {
        let extractor = FlexibleExtractor::new();
        let keywords = ["Comfort".to_string(), "Debug".to_string()];

        let hit = extractor.extract_by_keywords("This is about comfort and support", &keywords);

        assert_eq!(hit.as_deref(), Some("Comfort"));
    }

    /// The first strategy that succeeds determines the result.
    #[test]
    fn test_extraction_strategies() {
        let extractor = FlexibleExtractor::new();
        let strategies = [
            ExtractionStrategy::TaggedContent("answer".to_string()),
            ExtractionStrategy::JsonBrackets,
            ExtractionStrategy::OriginalText,
        ];
        let input = "<answer>{\"type\": \"success\"}</answer>";

        let extracted = extractor
            .extract_with_strategies(input, &strategies)
            .expect("tagged content strategy should succeed");

        assert_eq!(extracted, "{\"type\": \"success\"}");
    }
}