llm_toolkit/extract/
extractors.rs

1use super::core::{ContentExtractor, ExtractionStrategy};
2
3use super::error::ParseError;
4use log::debug;
5use regex::Regex;
6
/// Flexible content extractor with multiple strategies.
///
/// Construct it with [`FlexibleExtractor::new`] and optionally chain
/// [`FlexibleExtractor::with_debug`] to enable debug logging.
#[derive(Debug, Clone)]
pub struct FlexibleExtractor {
    /// When `true`, extraction attempts and failures are reported via `log::debug!`.
    debug_mode: bool,
}
11
12impl FlexibleExtractor {
13    pub fn new() -> Self {
14        Self { debug_mode: false }
15    }
16
17    pub fn with_debug(mut self) -> Self {
18        self.debug_mode = true;
19        self
20    }
21
22    pub fn standard_extraction_strategies() -> Vec<ExtractionStrategy> {
23        vec![
24            ExtractionStrategy::TaggedContent("answer".to_string()),
25            ExtractionStrategy::JsonBrackets,
26            ExtractionStrategy::FirstJsonObject,
27        ]
28    }
29
30    /// Standard extraction
31    pub fn extract(&self, text: &str) -> Result<String, ParseError> {
32        if self.debug_mode {
33            debug!("Extracting content from text: {}", text);
34        }
35        self.extract_with_strategies(text, &Self::standard_extraction_strategies())
36    }
37
38    /// Extract content using specified strategy
39    pub fn extract_with_strategy(
40        &self,
41        text: &str,
42        strategy: &ExtractionStrategy,
43    ) -> Option<String> {
44        if self.debug_mode {
45            debug!("Trying extraction strategy: {:?}", strategy);
46        }
47
48        match strategy {
49            ExtractionStrategy::TaggedContent(tag) => self.extract_tagged(text, tag),
50            ExtractionStrategy::JsonBrackets => self.extract_json_like(text),
51            ExtractionStrategy::FirstJsonObject => self.extract_first_json_object(text),
52            ExtractionStrategy::KeywordSearch(keywords) => self.extract_by_keywords(text, keywords),
53            ExtractionStrategy::RegexPattern(pattern) => self.extract_pattern(text, pattern),
54            ExtractionStrategy::OriginalText => Some(text.to_string()),
55        }
56    }
57
58    /// Try multiple extraction strategies in order
59    pub fn extract_with_strategies(
60        &self,
61        text: &str,
62        strategies: &[ExtractionStrategy],
63    ) -> Result<String, ParseError> {
64        let mut errors = Vec::new();
65
66        for strategy in strategies {
67            if let Some(result) = self.extract_with_strategy(text, strategy) {
68                if self.debug_mode {
69                    debug!("Successfully extracted with strategy: {:?}", strategy);
70                }
71                return Ok(result);
72            } else {
73                errors.push(format!("Strategy {:?} failed", strategy));
74            }
75        }
76
77        Err(ParseError::AllStrategiesFailed(errors))
78    }
79
80    /// Extract first complete JSON entity (object or array) from text
81    fn extract_first_json_entity(&self, text: &str) -> Option<String> {
82        let mut bracket_count = 0;
83        let mut start_pos = None;
84        let mut in_string = false;
85        let mut escape_next = false;
86        let mut opening_char = None;
87
88        for (i, ch) in text.char_indices() {
89            if escape_next {
90                escape_next = false;
91                continue;
92            }
93
94            match ch {
95                '\\' if in_string => escape_next = true,
96                '"' => in_string = !in_string,
97                '{' | '[' if !in_string => {
98                    if bracket_count == 0 {
99                        start_pos = Some(i);
100                        opening_char = Some(ch);
101                    }
102                    bracket_count += 1;
103                }
104                '}' | ']' if !in_string => {
105                    bracket_count -= 1;
106                    if bracket_count == 0
107                        && let Some(p) = start_pos
108                        && let Some(opening) = opening_char
109                    {
110                        // Verify matching brackets
111                        let is_valid =
112                            (opening == '{' && ch == '}') || (opening == '[' && ch == ']');
113                        if is_valid {
114                            return Some(text[p..=i].to_string());
115                        }
116                    }
117                }
118                _ => {}
119            }
120        }
121
122        None
123    }
124
125    /// Extract first complete JSON object from text
126    fn extract_first_json_object(&self, text: &str) -> Option<String> {
127        self.extract_first_json_entity(text)
128    }
129
130    /// Extract content based on keyword matching
131    fn extract_by_keywords(&self, text: &str, keywords: &[String]) -> Option<String> {
132        let lower_text = text.to_lowercase();
133
134        for keyword in keywords {
135            if lower_text.contains(&keyword.to_lowercase()) {
136                // Return the keyword as the extracted content
137                return Some(keyword.clone());
138            }
139        }
140
141        None
142    }
143}
144
145impl Default for FlexibleExtractor {
146    fn default() -> Self {
147        Self::new()
148    }
149}
150
151impl ContentExtractor for FlexibleExtractor {
152    fn extract_tagged(&self, text: &str, tag: &str) -> Option<String> {
153        // Create regex pattern for XML-like tags
154        let pattern = format!(r"(?s)<{tag}>(.*?)</{tag}>", tag = regex::escape(tag));
155
156        if let Ok(regex) = Regex::new(&pattern)
157            && let Some(captures) = regex.captures(text)
158            && let Some(content) = captures.get(1)
159        {
160            return Some(content.as_str().trim().to_string());
161        }
162
163        if self.debug_mode {
164            debug!("Failed to extract tagged content with tag: {}", tag);
165        }
166
167        None
168    }
169
170    fn extract_json_like(&self, text: &str) -> Option<String> {
171        // Delegate to extract_first_json_entity for proper handling
172        let result = self.extract_first_json_entity(text);
173
174        if result.is_none() && self.debug_mode {
175            debug!("Failed to extract JSON-like content");
176        }
177
178        result
179    }
180
181    fn extract_pattern(&self, text: &str, pattern: &str) -> Option<String> {
182        if let Ok(regex) = Regex::new(pattern)
183            && let Some(captures) = regex.captures(text)
184        {
185            // Return the first capture group, or the whole match if no groups
186            if captures.len() > 1 {
187                return captures.get(1).map(|m| m.as_str().to_string());
188            } else {
189                return captures.get(0).map(|m| m.as_str().to_string());
190            }
191        }
192
193        if self.debug_mode {
194            debug!("Failed to extract with pattern: {}", pattern);
195        }
196
197        None
198    }
199}
200
/// Extractor for Markdown fenced code blocks.
#[derive(Debug, Clone)]
pub struct MarkdownCodeBlockExtractor {
    /// Optional language to filter by (e.g., "rust", "python").
    /// `None` accepts a code block with any (or no) language specifier.
    pub language: Option<String>,
}
206
207impl Default for MarkdownCodeBlockExtractor {
208    fn default() -> Self {
209        Self::new()
210    }
211}
212
213impl MarkdownCodeBlockExtractor {
214    /// Create a new extractor for any code block
215    pub fn new() -> Self {
216        Self { language: None }
217    }
218
219    /// Create a new extractor for a specific language
220    pub fn with_language(language: String) -> Self {
221        Self {
222            language: Some(language),
223        }
224    }
225
226    /// Extract content from a markdown code block
227    pub fn extract(&self, text: &str) -> Result<String, ParseError> {
228        let pattern = if let Some(ref lang) = self.language {
229            // Match code block with specific language
230            format!(
231                r"(?m)^\s*```\s*{}\s*\n((?:.*\n)*?)^\s*```\s*$",
232                regex::escape(lang)
233            )
234        } else {
235            // Match any code block (with or without language specifier)
236            r"(?m)^\s*```[^\n]*\n((?:.*\n)*?)^\s*```\s*$".to_string()
237        };
238
239        let regex = Regex::new(&pattern)
240            .map_err(|e| ParseError::InvalidFormat(format!("Failed to compile regex: {}", e)))?;
241
242        if let Some(captures) = regex.captures(text)
243            && let Some(content) = captures.get(1)
244        {
245            // Trim surrounding newlines but preserve internal formatting
246            let extracted = content.as_str().trim_end();
247            return Ok(extracted.to_string());
248        }
249
250        Err(ParseError::TagExtractionFailed(format!(
251            "No markdown code block found{}",
252            if let Some(ref lang) = self.language {
253                format!(" with language '{}'", lang)
254            } else {
255                String::new()
256            }
257        )))
258    }
259}
260
#[cfg(test)]
mod tests {
    use super::*;

    /// Tagged content is returned trimmed, with or without surrounding whitespace.
    #[test]
    fn test_extract_tagged_content() {
        let extractor = FlexibleExtractor::new();
        let inputs = [
            "<answer>Hello World</answer>",
            "<answer>\n  Hello World  \n</answer>",
        ];

        for input in inputs {
            let extracted = extractor.extract_tagged(input, "answer");
            assert_eq!(extracted.as_deref(), Some("Hello World"));
        }
    }

    /// A JSON object embedded in prose is isolated from the surrounding text.
    #[test]
    fn test_extract_json_like() {
        let extractor = FlexibleExtractor::new();
        let input = "Here is some JSON: {\"key\": \"value\"} and more text";

        let extracted = extractor.extract_json_like(input);

        assert_eq!(extracted.as_deref(), Some("{\"key\": \"value\"}"));
    }

    /// Only the first of several objects is returned.
    #[test]
    fn test_extract_first_json_object() {
        let extractor = FlexibleExtractor::new();
        let input = "Some text {\"first\": \"object\"} more text {\"second\": \"object\"}";

        let extracted = extractor.extract_first_json_object(input);

        assert_eq!(extracted.as_deref(), Some("{\"first\": \"object\"}"));
    }

    /// Arrays are extracted as whole entities through both entry points.
    #[test]
    fn test_extract_json_array() {
        let extractor = FlexibleExtractor::new();
        let input = "Here is an array: [{\"key\": \"value\"}] and more text";
        let expected = Some("[{\"key\": \"value\"}]");

        let via_object = extractor.extract_first_json_object(input);
        assert_eq!(via_object.as_deref(), expected);

        // Same input through extract_json_like.
        let via_json_like = extractor.extract_json_like(input);
        assert_eq!(via_json_like.as_deref(), expected);
    }

    /// Keyword matching ignores case and returns the keyword as supplied.
    #[test]
    fn test_extract_by_keywords() {
        let extractor = FlexibleExtractor::new();
        let keywords = ["Comfort", "Debug"].map(String::from);
        let input = "This is about comfort and support";

        let extracted = extractor.extract_by_keywords(input, &keywords);

        assert_eq!(extracted.as_deref(), Some("Comfort"));
    }

    /// The first strategy in the list that succeeds determines the result.
    #[test]
    fn test_extraction_strategies() {
        let extractor = FlexibleExtractor::new();
        let strategies = [
            ExtractionStrategy::TaggedContent("answer".to_string()),
            ExtractionStrategy::JsonBrackets,
            ExtractionStrategy::OriginalText,
        ];
        let input = "<answer>{\"type\": \"success\"}</answer>";

        let outcome = extractor.extract_with_strategies(input, &strategies);

        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap(), "{\"type\": \"success\"}");
    }
}