llm_toolkit/extract/
extractors.rs

1use super::core::{ContentExtractor, ExtractionStrategy};
2
3use super::error::ParseError;
4use log::debug;
5use regex::Regex;
6
/// Flexible content extractor that tries one or more [`ExtractionStrategy`]
/// values against free-form text.
///
/// Build one with [`FlexibleExtractor::new`] and optionally chain
/// [`FlexibleExtractor::with_debug`] to enable `debug!` tracing.
#[derive(Debug, Clone)]
pub struct FlexibleExtractor {
    /// When `true`, extraction methods emit `debug!` log messages.
    debug_mode: bool,
}
11
12impl FlexibleExtractor {
13    pub fn new() -> Self {
14        Self { debug_mode: false }
15    }
16
17    pub fn with_debug(mut self) -> Self {
18        self.debug_mode = true;
19        self
20    }
21
22    pub fn standard_extraction_strategies() -> Vec<ExtractionStrategy> {
23        vec![
24            ExtractionStrategy::TaggedContent("answer".to_string()),
25            ExtractionStrategy::JsonBrackets,
26            ExtractionStrategy::FirstJsonObject,
27        ]
28    }
29
30    /// Standard extraction
31    pub fn extract(&self, text: &str) -> Result<String, ParseError> {
32        if self.debug_mode {
33            debug!("Extracting content from text: {}", text);
34        }
35        self.extract_with_strategies(text, &Self::standard_extraction_strategies())
36    }
37
38    /// Extract content using specified strategy
39    pub fn extract_with_strategy(
40        &self,
41        text: &str,
42        strategy: &ExtractionStrategy,
43    ) -> Option<String> {
44        if self.debug_mode {
45            debug!("Trying extraction strategy: {:?}", strategy);
46        }
47
48        match strategy {
49            ExtractionStrategy::TaggedContent(tag) => self.extract_tagged(text, tag),
50            ExtractionStrategy::JsonBrackets => self.extract_json_like(text),
51            ExtractionStrategy::FirstJsonObject => self.extract_first_json_object(text),
52            ExtractionStrategy::KeywordSearch(keywords) => self.extract_by_keywords(text, keywords),
53            ExtractionStrategy::RegexPattern(pattern) => self.extract_pattern(text, pattern),
54            ExtractionStrategy::OriginalText => Some(text.to_string()),
55        }
56    }
57
58    /// Try multiple extraction strategies in order
59    pub fn extract_with_strategies(
60        &self,
61        text: &str,
62        strategies: &[ExtractionStrategy],
63    ) -> Result<String, ParseError> {
64        let mut errors = Vec::new();
65
66        for strategy in strategies {
67            if let Some(result) = self.extract_with_strategy(text, strategy) {
68                if self.debug_mode {
69                    debug!("Successfully extracted with strategy: {:?}", strategy);
70                }
71                return Ok(result);
72            } else {
73                errors.push(format!("Strategy {:?} failed", strategy));
74            }
75        }
76
77        Err(ParseError::AllStrategiesFailed(errors))
78    }
79
80    /// Extract first complete JSON object from text
81    fn extract_first_json_object(&self, text: &str) -> Option<String> {
82        let mut brace_count = 0;
83        let mut start_pos = None;
84        let mut in_string = false;
85        let mut escape_next = false;
86
87        for (i, ch) in text.char_indices() {
88            if escape_next {
89                escape_next = false;
90                continue;
91            }
92
93            match ch {
94                '\\' if in_string => escape_next = true,
95                '"' => in_string = !in_string,
96                '{' if !in_string => {
97                    if brace_count == 0 {
98                        start_pos = Some(i);
99                    }
100                    brace_count += 1;
101                }
102                '}' if !in_string => {
103                    brace_count -= 1;
104                    if brace_count == 0 {
105                        if let Some(p) = start_pos {
106                            return Some(text[p..=i].to_string());
107                        }
108                    }
109                }
110                _ => {}
111            }
112        }
113
114        None
115    }
116
117    /// Extract content based on keyword matching
118    fn extract_by_keywords(&self, text: &str, keywords: &[String]) -> Option<String> {
119        let lower_text = text.to_lowercase();
120
121        for keyword in keywords {
122            if lower_text.contains(&keyword.to_lowercase()) {
123                // Return the keyword as the extracted content
124                return Some(keyword.clone());
125            }
126        }
127
128        None
129    }
130}
131
132impl Default for FlexibleExtractor {
133    fn default() -> Self {
134        Self::new()
135    }
136}
137
138impl ContentExtractor for FlexibleExtractor {
139    fn extract_tagged(&self, text: &str, tag: &str) -> Option<String> {
140        // Create regex pattern for XML-like tags
141        let pattern = format!(r"(?s)<{tag}>(.*?)</{tag}>", tag = regex::escape(tag));
142
143        if let Ok(regex) = Regex::new(&pattern) {
144            if let Some(captures) = regex.captures(text) {
145                if let Some(content) = captures.get(1) {
146                    return Some(content.as_str().trim().to_string());
147                }
148            }
149        }
150
151        if self.debug_mode {
152            debug!("Failed to extract tagged content with tag: {}", tag);
153        }
154
155        None
156    }
157
158    fn extract_json_like(&self, text: &str) -> Option<String> {
159        // Find JSON-like content within braces
160        if let Some(start) = text.find('{') {
161            if let Some(end) = text.rfind('}') {
162                if end > start {
163                    return Some(text[start..=end].to_string());
164                }
165            }
166        }
167
168        if self.debug_mode {
169            debug!("Failed to extract JSON-like content");
170        }
171
172        None
173    }
174
175    fn extract_pattern(&self, text: &str, pattern: &str) -> Option<String> {
176        if let Ok(regex) = Regex::new(pattern) {
177            if let Some(captures) = regex.captures(text) {
178                // Return the first capture group, or the whole match if no groups
179                if captures.len() > 1 {
180                    return captures.get(1).map(|m| m.as_str().to_string());
181                } else {
182                    return captures.get(0).map(|m| m.as_str().to_string());
183                }
184            }
185        }
186
187        if self.debug_mode {
188            debug!("Failed to extract with pattern: {}", pattern);
189        }
190
191        None
192    }
193}
194
/// Extractor for Markdown fenced code blocks (delimited by triple backticks).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownCodeBlockExtractor {
    /// Optional language to filter by (e.g., "rust", "python").
    /// `None` accepts a code block with any language tag, or none at all.
    pub language: Option<String>,
}
200
201impl Default for MarkdownCodeBlockExtractor {
202    fn default() -> Self {
203        Self::new()
204    }
205}
206
207impl MarkdownCodeBlockExtractor {
208    /// Create a new extractor for any code block
209    pub fn new() -> Self {
210        Self { language: None }
211    }
212
213    /// Create a new extractor for a specific language
214    pub fn with_language(language: String) -> Self {
215        Self {
216            language: Some(language),
217        }
218    }
219
220    /// Extract content from a markdown code block
221    pub fn extract(&self, text: &str) -> Result<String, ParseError> {
222        let pattern = if let Some(ref lang) = self.language {
223            // Match code block with specific language
224            format!(
225                r"(?m)^\s*```\s*{}\s*\n((?:.*\n)*?)^\s*```\s*$",
226                regex::escape(lang)
227            )
228        } else {
229            // Match any code block (with or without language specifier)
230            r"(?m)^\s*```[^\n]*\n((?:.*\n)*?)^\s*```\s*$".to_string()
231        };
232
233        let regex = Regex::new(&pattern)
234            .map_err(|e| ParseError::InvalidFormat(format!("Failed to compile regex: {}", e)))?;
235
236        if let Some(captures) = regex.captures(text) {
237            if let Some(content) = captures.get(1) {
238                // Trim surrounding newlines but preserve internal formatting
239                let extracted = content.as_str().trim_end();
240                return Ok(extracted.to_string());
241            }
242        }
243
244        Err(ParseError::TagExtractionFailed(format!(
245            "No markdown code block found{}",
246            if let Some(ref lang) = self.language {
247                format!(" with language '{}'", lang)
248            } else {
249                String::new()
250            }
251        )))
252    }
253}
254
#[cfg(test)]
mod tests {
    use super::*;

    /// Tagged content comes back without the tags and with surrounding
    /// whitespace trimmed.
    #[test]
    fn test_extract_tagged_content() {
        let extractor = FlexibleExtractor::new();
        let samples = [
            "<answer>Hello World</answer>",
            "<answer>\n  Hello World  \n</answer>",
        ];

        for &sample in samples.iter() {
            let extracted = extractor.extract_tagged(sample, "answer");
            assert_eq!(extracted.as_deref(), Some("Hello World"));
        }
    }

    /// The JSON-like strategy returns the span from the first `{` to the
    /// last `}`.
    #[test]
    fn test_extract_json_like() {
        let input = r#"Here is some JSON: {"key": "value"} and more text"#;
        let extracted = FlexibleExtractor::new().extract_json_like(input);
        assert_eq!(extracted.as_deref(), Some(r#"{"key": "value"}"#));
    }

    /// Only the first balanced object is returned when several are present.
    #[test]
    fn test_extract_first_json_object() {
        let input = r#"Some text {"first": "object"} more text {"second": "object"}"#;
        let extracted = FlexibleExtractor::new().extract_first_json_object(input);
        assert_eq!(extracted.as_deref(), Some(r#"{"first": "object"}"#));
    }

    /// Keyword matching ignores case and yields the keyword as supplied.
    #[test]
    fn test_extract_by_keywords() {
        let keywords: Vec<String> = ["Comfort", "Debug"]
            .iter()
            .map(|word| word.to_string())
            .collect();

        let hit = FlexibleExtractor::new()
            .extract_by_keywords("This is about comfort and support", &keywords);
        assert_eq!(hit.as_deref(), Some("Comfort"));
    }

    /// The first strategy that succeeds decides the result.
    #[test]
    fn test_extraction_strategies() {
        let ordered = vec![
            ExtractionStrategy::TaggedContent("answer".to_string()),
            ExtractionStrategy::JsonBrackets,
            ExtractionStrategy::OriginalText,
        ];
        let input = r#"<answer>{"type": "success"}</answer>"#;

        let outcome = FlexibleExtractor::new().extract_with_strategies(input, &ordered);
        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap(), r#"{"type": "success"}"#);
    }
}