llm_toolkit/extract/
extractors.rs

1use super::core::{ContentExtractor, ExtractionStrategy};
2
3use super::error::ParseError;
4use fuzzy_parser::sanitize_json;
5use log::debug;
6use regex::Regex;
7
/// Flexible content extractor with multiple strategies.
///
/// The only state is a flag that controls whether extraction steps emit
/// `debug!` log lines.
#[derive(Debug, Clone)]
pub struct FlexibleExtractor {
    /// When `true`, extraction methods log their progress and failures.
    debug_mode: bool,
}
12
13impl FlexibleExtractor {
14    pub fn new() -> Self {
15        Self { debug_mode: false }
16    }
17
18    pub fn with_debug(mut self) -> Self {
19        self.debug_mode = true;
20        self
21    }
22
23    pub fn standard_extraction_strategies() -> Vec<ExtractionStrategy> {
24        vec![
25            ExtractionStrategy::TaggedContent("answer".to_string()),
26            ExtractionStrategy::JsonBrackets,
27            ExtractionStrategy::FirstJsonObject,
28        ]
29    }
30
31    /// Standard extraction
32    pub fn extract(&self, text: &str) -> Result<String, ParseError> {
33        if self.debug_mode {
34            debug!("Extracting content from text: {}", text);
35        }
36        self.extract_with_strategies(text, &Self::standard_extraction_strategies())
37    }
38
39    /// Extract content using specified strategy
40    pub fn extract_with_strategy(
41        &self,
42        text: &str,
43        strategy: &ExtractionStrategy,
44    ) -> Option<String> {
45        if self.debug_mode {
46            debug!("Trying extraction strategy: {:?}", strategy);
47        }
48
49        match strategy {
50            ExtractionStrategy::TaggedContent(tag) => self.extract_tagged(text, tag),
51            ExtractionStrategy::JsonBrackets => self.extract_json_like(text),
52            ExtractionStrategy::FirstJsonObject => self.extract_first_json_object(text),
53            ExtractionStrategy::KeywordSearch(keywords) => self.extract_by_keywords(text, keywords),
54            ExtractionStrategy::RegexPattern(pattern) => self.extract_pattern(text, pattern),
55            ExtractionStrategy::OriginalText => Some(text.to_string()),
56        }
57    }
58
59    /// Try multiple extraction strategies in order
60    pub fn extract_with_strategies(
61        &self,
62        text: &str,
63        strategies: &[ExtractionStrategy],
64    ) -> Result<String, ParseError> {
65        let mut errors = Vec::new();
66
67        for strategy in strategies {
68            if let Some(result) = self.extract_with_strategy(text, strategy) {
69                if self.debug_mode {
70                    debug!("Successfully extracted with strategy: {:?}", strategy);
71                }
72                return Ok(result);
73            } else {
74                errors.push(format!("Strategy {:?} failed", strategy));
75            }
76        }
77
78        Err(ParseError::AllStrategiesFailed(errors))
79    }
80
81    /// Extract first complete JSON entity (object or array) from text
82    fn extract_first_json_entity(&self, text: &str) -> Option<String> {
83        let mut bracket_count = 0;
84        let mut start_pos = None;
85        let mut in_string = false;
86        let mut escape_next = false;
87        let mut opening_char = None;
88
89        for (i, ch) in text.char_indices() {
90            if escape_next {
91                escape_next = false;
92                continue;
93            }
94
95            match ch {
96                '\\' if in_string => escape_next = true,
97                '"' => in_string = !in_string,
98                '{' | '[' if !in_string => {
99                    if bracket_count == 0 {
100                        start_pos = Some(i);
101                        opening_char = Some(ch);
102                    }
103                    bracket_count += 1;
104                }
105                '}' | ']' if !in_string => {
106                    bracket_count -= 1;
107                    if bracket_count == 0
108                        && let Some(p) = start_pos
109                        && let Some(opening) = opening_char
110                    {
111                        // Verify matching brackets
112                        let is_valid =
113                            (opening == '{' && ch == '}') || (opening == '[' && ch == ']');
114                        if is_valid {
115                            return Some(text[p..=i].to_string());
116                        }
117                    }
118                }
119                _ => {}
120            }
121        }
122
123        None
124    }
125
126    /// Extract first complete JSON object from text
127    fn extract_first_json_object(&self, text: &str) -> Option<String> {
128        self.extract_first_json_entity(text)
129            .map(|json| sanitize_json(&json))
130    }
131
132    /// Extract content based on keyword matching
133    fn extract_by_keywords(&self, text: &str, keywords: &[String]) -> Option<String> {
134        let lower_text = text.to_lowercase();
135
136        for keyword in keywords {
137            if lower_text.contains(&keyword.to_lowercase()) {
138                // Return the keyword as the extracted content
139                return Some(keyword.clone());
140            }
141        }
142
143        None
144    }
145}
146
147impl Default for FlexibleExtractor {
148    fn default() -> Self {
149        Self::new()
150    }
151}
152
153impl ContentExtractor for FlexibleExtractor {
154    fn extract_tagged(&self, text: &str, tag: &str) -> Option<String> {
155        // Create regex pattern for XML-like tags
156        let pattern = format!(r"(?s)<{tag}>(.*?)</{tag}>", tag = regex::escape(tag));
157
158        if let Ok(regex) = Regex::new(&pattern)
159            && let Some(captures) = regex.captures(text)
160            && let Some(content) = captures.get(1)
161        {
162            return Some(content.as_str().trim().to_string());
163        }
164
165        if self.debug_mode {
166            debug!("Failed to extract tagged content with tag: {}", tag);
167        }
168
169        None
170    }
171
172    fn extract_json_like(&self, text: &str) -> Option<String> {
173        // Delegate to extract_first_json_entity for proper handling
174        let result = self
175            .extract_first_json_entity(text)
176            .map(|json| sanitize_json(&json));
177
178        if result.is_none() && self.debug_mode {
179            debug!("Failed to extract JSON-like content");
180        }
181
182        result
183    }
184
185    fn extract_pattern(&self, text: &str, pattern: &str) -> Option<String> {
186        if let Ok(regex) = Regex::new(pattern)
187            && let Some(captures) = regex.captures(text)
188        {
189            // Return the first capture group, or the whole match if no groups
190            if captures.len() > 1 {
191                return captures.get(1).map(|m| m.as_str().to_string());
192            } else {
193                return captures.get(0).map(|m| m.as_str().to_string());
194            }
195        }
196
197        if self.debug_mode {
198            debug!("Failed to extract with pattern: {}", pattern);
199        }
200
201        None
202    }
203}
204
/// Extractor for Markdown fenced code blocks.
#[derive(Debug, Clone)]
pub struct MarkdownCodeBlockExtractor {
    /// Optional language to filter by (e.g., "rust", "python").
    /// `None` accepts a code block with any (or no) language tag.
    pub language: Option<String>,
}
210
211impl Default for MarkdownCodeBlockExtractor {
212    fn default() -> Self {
213        Self::new()
214    }
215}
216
217impl MarkdownCodeBlockExtractor {
218    /// Create a new extractor for any code block
219    pub fn new() -> Self {
220        Self { language: None }
221    }
222
223    /// Create a new extractor for a specific language
224    pub fn with_language(language: String) -> Self {
225        Self {
226            language: Some(language),
227        }
228    }
229
230    /// Extract content from a markdown code block
231    pub fn extract(&self, text: &str) -> Result<String, ParseError> {
232        let pattern = if let Some(ref lang) = self.language {
233            // Match code block with specific language
234            format!(
235                r"(?m)^\s*```\s*{}\s*\n((?:.*\n)*?)^\s*```\s*$",
236                regex::escape(lang)
237            )
238        } else {
239            // Match any code block (with or without language specifier)
240            r"(?m)^\s*```[^\n]*\n((?:.*\n)*?)^\s*```\s*$".to_string()
241        };
242
243        let regex = Regex::new(&pattern)
244            .map_err(|e| ParseError::InvalidFormat(format!("Failed to compile regex: {}", e)))?;
245
246        if let Some(captures) = regex.captures(text)
247            && let Some(content) = captures.get(1)
248        {
249            // Trim surrounding newlines but preserve internal formatting
250            let extracted = content.as_str().trim_end();
251            return Ok(extracted.to_string());
252        }
253
254        Err(ParseError::TagExtractionFailed(format!(
255            "No markdown code block found{}",
256            if let Some(ref lang) = self.language {
257                format!(" with language '{}'", lang)
258            } else {
259                String::new()
260            }
261        )))
262    }
263}
264
#[cfg(test)]
mod tests {
    use super::*;

    /// Shared fixture: an extractor with debug logging off.
    fn extractor() -> FlexibleExtractor {
        FlexibleExtractor::new()
    }

    /// Check that `extract_first_json_object` maps every input to its expected output.
    fn assert_first_json(cases: &[(&str, &str)]) {
        let extractor = extractor();
        for (input, expected) in cases {
            assert_eq!(
                extractor.extract_first_json_object(input).as_deref(),
                Some(*expected)
            );
        }
    }

    #[test]
    fn test_extract_tagged_content() {
        let extractor = extractor();

        // Plain content, then content padded with whitespace and newlines.
        for input in [
            "<answer>Hello World</answer>",
            "<answer>\n  Hello World  \n</answer>",
        ] {
            assert_eq!(
                extractor.extract_tagged(input, "answer").as_deref(),
                Some("Hello World")
            );
        }
    }

    #[test]
    fn test_extract_json_like() {
        let input = "Here is some JSON: {\"key\": \"value\"} and more text";
        assert_eq!(
            extractor().extract_json_like(input).as_deref(),
            Some("{\"key\": \"value\"}")
        );
    }

    #[test]
    fn test_extract_first_json_object() {
        assert_first_json(&[(
            "Some text {\"first\": \"object\"} more text {\"second\": \"object\"}",
            "{\"first\": \"object\"}",
        )]);
    }

    #[test]
    fn test_extract_json_array() {
        let input = "Here is an array: [{\"key\": \"value\"}] and more text";
        let expected = "[{\"key\": \"value\"}]";

        assert_first_json(&[(input, expected)]);

        // Test via extract_json_like as well
        assert_eq!(
            extractor().extract_json_like(input).as_deref(),
            Some(expected)
        );
    }

    #[test]
    fn test_extract_by_keywords() {
        let keywords = vec!["Comfort".to_string(), "Debug".to_string()];

        let found = extractor().extract_by_keywords("This is about comfort and support", &keywords);
        assert_eq!(found.as_deref(), Some("Comfort"));
    }

    #[test]
    fn test_extraction_strategies() {
        let strategies = vec![
            ExtractionStrategy::TaggedContent("answer".to_string()),
            ExtractionStrategy::JsonBrackets,
            ExtractionStrategy::OriginalText,
        ];

        let outcome = extractor()
            .extract_with_strategies("<answer>{\"type\": \"success\"}</answer>", &strategies);
        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap(), "{\"type\": \"success\"}");
    }

    #[test]
    fn test_clean_json_trailing_commas_object() {
        assert_first_json(&[
            // Test trailing comma in object
            (
                r#"{"name": "Alice", "age": 30,}"#,
                r#"{"name": "Alice", "age": 30}"#,
            ),
            // Test trailing comma with whitespace
            (
                r#"{"name": "Bob", "age": 25, }"#,
                r#"{"name": "Bob", "age": 25 }"#,
            ),
        ]);
    }

    #[test]
    fn test_clean_json_trailing_commas_array() {
        assert_first_json(&[
            // Test trailing comma in array
            (
                r#"["apple", "banana", "cherry",]"#,
                r#"["apple", "banana", "cherry"]"#,
            ),
            // Test trailing comma with whitespace
            (r#"[1, 2, 3, ]"#, r#"[1, 2, 3 ]"#),
        ]);
    }

    #[test]
    fn test_clean_json_trailing_commas_nested() {
        // Test nested structures with trailing commas
        assert_first_json(&[(
            r#"{"items": [{"a": 1,}, {"b": 2,},], "count": 2,}"#,
            r#"{"items": [{"a": 1}, {"b": 2}], "count": 2}"#,
        )]);
    }

    #[test]
    fn test_clean_json_preserves_commas_in_strings() {
        assert_first_json(&[
            // Commas inside strings should be preserved
            (
                r#"{"message": "Hello, world", "items": "a, b, c"}"#,
                r#"{"message": "Hello, world", "items": "a, b, c"}"#,
            ),
            // Test with trailing comma but commas in string values
            (
                r#"{"msg": "test, data", "val": 1,}"#,
                r#"{"msg": "test, data", "val": 1}"#,
            ),
        ]);
    }

    #[test]
    fn test_clean_json_valid_json_unchanged() {
        // Valid JSON without trailing commas should remain unchanged
        let object = r#"{"name": "Alice", "age": 30}"#;
        let array = r#"["a", "b", "c"]"#;

        assert_first_json(&[(object, object), (array, array)]);
    }

    #[test]
    fn test_extract_json_like_with_trailing_commas() {
        // extract_json_like should also clean trailing commas
        let input = "Here's the data: {\"result\": \"success\", \"code\": 200,}";
        assert_eq!(
            extractor().extract_json_like(input).as_deref(),
            Some(r#"{"result": "success", "code": 200}"#)
        );
    }
}
430}