llm_toolkit/extract/
extractors.rs

1use super::core::{ContentExtractor, ExtractionStrategy};
2
3use super::error::ParseError;
4use log::debug;
5use regex::Regex;
6
/// Content extractor that can try several extraction strategies in turn.
///
/// Build one with [`FlexibleExtractor::new`] and optionally chain
/// [`FlexibleExtractor::with_debug`] to turn on debug logging.
#[derive(Debug, Clone)]
pub struct FlexibleExtractor {
    /// When `true`, extraction attempts and failures are reported through `log::debug!`.
    debug_mode: bool,
}
11
12impl FlexibleExtractor {
13    pub fn new() -> Self {
14        Self { debug_mode: false }
15    }
16
17    pub fn with_debug(mut self) -> Self {
18        self.debug_mode = true;
19        self
20    }
21
22    pub fn standard_extraction_strategies() -> Vec<ExtractionStrategy> {
23        vec![
24            ExtractionStrategy::TaggedContent("answer".to_string()),
25            ExtractionStrategy::JsonBrackets,
26            ExtractionStrategy::FirstJsonObject,
27        ]
28    }
29
30    /// Standard extraction
31    pub fn extract(&self, text: &str) -> Result<String, ParseError> {
32        if self.debug_mode {
33            debug!("Extracting content from text: {}", text);
34        }
35        self.extract_with_strategies(text, &Self::standard_extraction_strategies())
36    }
37
38    /// Extract content using specified strategy
39    pub fn extract_with_strategy(
40        &self,
41        text: &str,
42        strategy: &ExtractionStrategy,
43    ) -> Option<String> {
44        if self.debug_mode {
45            debug!("Trying extraction strategy: {:?}", strategy);
46        }
47
48        match strategy {
49            ExtractionStrategy::TaggedContent(tag) => self.extract_tagged(text, tag),
50            ExtractionStrategy::JsonBrackets => self.extract_json_like(text),
51            ExtractionStrategy::FirstJsonObject => self.extract_first_json_object(text),
52            ExtractionStrategy::KeywordSearch(keywords) => self.extract_by_keywords(text, keywords),
53            ExtractionStrategy::RegexPattern(pattern) => self.extract_pattern(text, pattern),
54            ExtractionStrategy::OriginalText => Some(text.to_string()),
55        }
56    }
57
58    /// Try multiple extraction strategies in order
59    pub fn extract_with_strategies(
60        &self,
61        text: &str,
62        strategies: &[ExtractionStrategy],
63    ) -> Result<String, ParseError> {
64        let mut errors = Vec::new();
65
66        for strategy in strategies {
67            if let Some(result) = self.extract_with_strategy(text, strategy) {
68                if self.debug_mode {
69                    debug!("Successfully extracted with strategy: {:?}", strategy);
70                }
71                return Ok(result);
72            } else {
73                errors.push(format!("Strategy {:?} failed", strategy));
74            }
75        }
76
77        Err(ParseError::AllStrategiesFailed(errors))
78    }
79
80    /// Extract first complete JSON entity (object or array) from text
81    fn extract_first_json_entity(&self, text: &str) -> Option<String> {
82        let mut bracket_count = 0;
83        let mut start_pos = None;
84        let mut in_string = false;
85        let mut escape_next = false;
86        let mut opening_char = None;
87
88        for (i, ch) in text.char_indices() {
89            if escape_next {
90                escape_next = false;
91                continue;
92            }
93
94            match ch {
95                '\\' if in_string => escape_next = true,
96                '"' => in_string = !in_string,
97                '{' | '[' if !in_string => {
98                    if bracket_count == 0 {
99                        start_pos = Some(i);
100                        opening_char = Some(ch);
101                    }
102                    bracket_count += 1;
103                }
104                '}' | ']' if !in_string => {
105                    bracket_count -= 1;
106                    if bracket_count == 0
107                        && let Some(p) = start_pos
108                        && let Some(opening) = opening_char
109                    {
110                        // Verify matching brackets
111                        let is_valid =
112                            (opening == '{' && ch == '}') || (opening == '[' && ch == ']');
113                        if is_valid {
114                            return Some(text[p..=i].to_string());
115                        }
116                    }
117                }
118                _ => {}
119            }
120        }
121
122        None
123    }
124
125    /// Clean trailing commas from JSON string (common LLM output issue)
126    ///
127    /// Removes trailing commas before closing braces/brackets:
128    /// - `{"a": 1,}` → `{"a": 1}`
129    /// - `["a", "b",]` → `["a", "b"]`
130    ///
131    /// Handles:
132    /// - Optional whitespace between comma and bracket: `, }` or `,}`
133    /// - Preserves commas inside strings
134    /// - Nested structures
135    fn clean_json_trailing_commas(json: &str) -> String {
136        let mut result = String::with_capacity(json.len());
137        let mut in_string = false;
138        let mut escape_next = false;
139        let chars: Vec<char> = json.chars().collect();
140
141        for i in 0..chars.len() {
142            let ch = chars[i];
143
144            if escape_next {
145                escape_next = false;
146                result.push(ch);
147                continue;
148            }
149
150            match ch {
151                '\\' if in_string => {
152                    escape_next = true;
153                    result.push(ch);
154                }
155                '"' => {
156                    in_string = !in_string;
157                    result.push(ch);
158                }
159                ',' if !in_string => {
160                    // Look ahead to check if this is a trailing comma
161                    let mut j = i + 1;
162                    // Skip whitespace
163                    while j < chars.len() && chars[j].is_whitespace() {
164                        j += 1;
165                    }
166                    // If next non-whitespace is } or ], skip the comma
167                    if j < chars.len() && (chars[j] == '}' || chars[j] == ']') {
168                        // Skip this comma (trailing comma)
169                        continue;
170                    } else {
171                        // Keep this comma (not trailing)
172                        result.push(ch);
173                    }
174                }
175                _ => result.push(ch),
176            }
177        }
178
179        result
180    }
181
182    /// Extract first complete JSON object from text
183    fn extract_first_json_object(&self, text: &str) -> Option<String> {
184        self.extract_first_json_entity(text)
185            .map(|json| Self::clean_json_trailing_commas(&json))
186    }
187
188    /// Extract content based on keyword matching
189    fn extract_by_keywords(&self, text: &str, keywords: &[String]) -> Option<String> {
190        let lower_text = text.to_lowercase();
191
192        for keyword in keywords {
193            if lower_text.contains(&keyword.to_lowercase()) {
194                // Return the keyword as the extracted content
195                return Some(keyword.clone());
196            }
197        }
198
199        None
200    }
201}
202
203impl Default for FlexibleExtractor {
204    fn default() -> Self {
205        Self::new()
206    }
207}
208
209impl ContentExtractor for FlexibleExtractor {
210    fn extract_tagged(&self, text: &str, tag: &str) -> Option<String> {
211        // Create regex pattern for XML-like tags
212        let pattern = format!(r"(?s)<{tag}>(.*?)</{tag}>", tag = regex::escape(tag));
213
214        if let Ok(regex) = Regex::new(&pattern)
215            && let Some(captures) = regex.captures(text)
216            && let Some(content) = captures.get(1)
217        {
218            return Some(content.as_str().trim().to_string());
219        }
220
221        if self.debug_mode {
222            debug!("Failed to extract tagged content with tag: {}", tag);
223        }
224
225        None
226    }
227
228    fn extract_json_like(&self, text: &str) -> Option<String> {
229        // Delegate to extract_first_json_entity for proper handling
230        let result = self
231            .extract_first_json_entity(text)
232            .map(|json| Self::clean_json_trailing_commas(&json));
233
234        if result.is_none() && self.debug_mode {
235            debug!("Failed to extract JSON-like content");
236        }
237
238        result
239    }
240
241    fn extract_pattern(&self, text: &str, pattern: &str) -> Option<String> {
242        if let Ok(regex) = Regex::new(pattern)
243            && let Some(captures) = regex.captures(text)
244        {
245            // Return the first capture group, or the whole match if no groups
246            if captures.len() > 1 {
247                return captures.get(1).map(|m| m.as_str().to_string());
248            } else {
249                return captures.get(0).map(|m| m.as_str().to_string());
250            }
251        }
252
253        if self.debug_mode {
254            debug!("Failed to extract with pattern: {}", pattern);
255        }
256
257        None
258    }
259}
260
/// Extractor for fenced Markdown code blocks.
#[derive(Debug, Clone)]
pub struct MarkdownCodeBlockExtractor {
    /// Optional language to filter by (e.g., "rust", "python").
    /// `None` accepts a code block with any language tag or none.
    pub language: Option<String>,
}
266
267impl Default for MarkdownCodeBlockExtractor {
268    fn default() -> Self {
269        Self::new()
270    }
271}
272
273impl MarkdownCodeBlockExtractor {
274    /// Create a new extractor for any code block
275    pub fn new() -> Self {
276        Self { language: None }
277    }
278
279    /// Create a new extractor for a specific language
280    pub fn with_language(language: String) -> Self {
281        Self {
282            language: Some(language),
283        }
284    }
285
286    /// Extract content from a markdown code block
287    pub fn extract(&self, text: &str) -> Result<String, ParseError> {
288        let pattern = if let Some(ref lang) = self.language {
289            // Match code block with specific language
290            format!(
291                r"(?m)^\s*```\s*{}\s*\n((?:.*\n)*?)^\s*```\s*$",
292                regex::escape(lang)
293            )
294        } else {
295            // Match any code block (with or without language specifier)
296            r"(?m)^\s*```[^\n]*\n((?:.*\n)*?)^\s*```\s*$".to_string()
297        };
298
299        let regex = Regex::new(&pattern)
300            .map_err(|e| ParseError::InvalidFormat(format!("Failed to compile regex: {}", e)))?;
301
302        if let Some(captures) = regex.captures(text)
303            && let Some(content) = captures.get(1)
304        {
305            // Trim surrounding newlines but preserve internal formatting
306            let extracted = content.as_str().trim_end();
307            return Ok(extracted.to_string());
308        }
309
310        Err(ParseError::TagExtractionFailed(format!(
311            "No markdown code block found{}",
312            if let Some(ref lang) = self.language {
313                format!(" with language '{}'", lang)
314            } else {
315                String::new()
316            }
317        )))
318    }
319}
320
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the `Some(String)` an extraction is expected to return.
    fn some(expected: &str) -> Option<String> {
        Some(expected.to_owned())
    }

    /// Tagged content comes back trimmed, with or without padding inside the tag.
    #[test]
    fn test_extract_tagged_content() {
        let extractor = FlexibleExtractor::new();
        let inputs = [
            "<answer>Hello World</answer>",
            "<answer>\n  Hello World  \n</answer>",
        ];

        for &input in &inputs {
            assert_eq!(extractor.extract_tagged(input, "answer"), some("Hello World"));
        }
    }

    /// A JSON object embedded in prose is picked out verbatim.
    #[test]
    fn test_extract_json_like() {
        let extracted = FlexibleExtractor::new()
            .extract_json_like("Here is some JSON: {\"key\": \"value\"} and more text");
        assert_eq!(extracted, some(r#"{"key": "value"}"#));
    }

    /// Only the first of several JSON objects is returned.
    #[test]
    fn test_extract_first_json_object() {
        let extracted = FlexibleExtractor::new().extract_first_json_object(
            "Some text {\"first\": \"object\"} more text {\"second\": \"object\"}",
        );
        assert_eq!(extracted, some(r#"{"first": "object"}"#));
    }

    /// Top-level arrays are extracted by both JSON entry points.
    #[test]
    fn test_extract_json_array() {
        let extractor = FlexibleExtractor::new();
        let text = "Here is an array: [{\"key\": \"value\"}] and more text";
        let expected = some(r#"[{"key": "value"}]"#);

        assert_eq!(extractor.extract_first_json_object(text), expected);
        // Test via extract_json_like as well
        assert_eq!(extractor.extract_json_like(text), expected);
    }

    /// Keyword matching ignores case and returns the keyword as given.
    #[test]
    fn test_extract_by_keywords() {
        let extractor = FlexibleExtractor::new();
        let keywords = vec!["Comfort".to_string(), "Debug".to_string()];

        let found = extractor.extract_by_keywords("This is about comfort and support", &keywords);
        assert_eq!(found, some("Comfort"));
    }

    /// The first strategy that succeeds wins; here that is the tagged content.
    #[test]
    fn test_extraction_strategies() {
        let extractor = FlexibleExtractor::new();
        let strategies = [
            ExtractionStrategy::TaggedContent("answer".to_string()),
            ExtractionStrategy::JsonBrackets,
            ExtractionStrategy::OriginalText,
        ];

        let outcome =
            extractor.extract_with_strategies("<answer>{\"type\": \"success\"}</answer>", &strategies);
        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap(), r#"{"type": "success"}"#);
    }

    /// Trailing commas in objects are removed; whitespace after them is kept.
    #[test]
    fn test_clean_json_trailing_commas_object() {
        let extractor = FlexibleExtractor::new();
        let cases = [
            // Trailing comma directly before the brace
            (
                r#"{"name": "Alice", "age": 30,}"#,
                r#"{"name": "Alice", "age": 30}"#,
            ),
            // Trailing comma followed by whitespace
            (
                r#"{"name": "Bob", "age": 25, }"#,
                r#"{"name": "Bob", "age": 25 }"#,
            ),
        ];

        for &(input, expected) in &cases {
            assert_eq!(extractor.extract_first_json_object(input), some(expected));
        }
    }

    /// Trailing commas in arrays are removed; whitespace after them is kept.
    #[test]
    fn test_clean_json_trailing_commas_array() {
        let extractor = FlexibleExtractor::new();
        let cases = [
            // Trailing comma directly before the bracket
            (
                r#"["apple", "banana", "cherry",]"#,
                r#"["apple", "banana", "cherry"]"#,
            ),
            // Trailing comma followed by whitespace
            (r#"[1, 2, 3, ]"#, r#"[1, 2, 3 ]"#),
        ];

        for &(input, expected) in &cases {
            assert_eq!(extractor.extract_first_json_object(input), some(expected));
        }
    }

    /// Trailing commas are removed at every nesting level.
    #[test]
    fn test_clean_json_trailing_commas_nested() {
        let cleaned = FlexibleExtractor::new()
            .extract_first_json_object(r#"{"items": [{"a": 1,}, {"b": 2,},], "count": 2,}"#);
        assert_eq!(
            cleaned,
            some(r#"{"items": [{"a": 1}, {"b": 2}], "count": 2}"#)
        );
    }

    /// Commas inside string values survive, even next to a real trailing comma.
    #[test]
    fn test_clean_json_preserves_commas_in_strings() {
        let extractor = FlexibleExtractor::new();
        let cases = [
            // Commas inside strings should be preserved
            (
                r#"{"message": "Hello, world", "items": "a, b, c"}"#,
                r#"{"message": "Hello, world", "items": "a, b, c"}"#,
            ),
            // Trailing comma present alongside commas in string values
            (
                r#"{"msg": "test, data", "val": 1,}"#,
                r#"{"msg": "test, data", "val": 1}"#,
            ),
        ];

        for &(input, expected) in &cases {
            assert_eq!(extractor.extract_first_json_object(input), some(expected));
        }
    }

    /// Valid JSON without trailing commas passes through untouched.
    #[test]
    fn test_clean_json_valid_json_unchanged() {
        let extractor = FlexibleExtractor::new();
        let untouched = [r#"{"name": "Alice", "age": 30}"#, r#"["a", "b", "c"]"#];

        for &text in &untouched {
            assert_eq!(extractor.extract_first_json_object(text), some(text));
        }
    }

    /// `extract_json_like` cleans trailing commas just like the object path.
    #[test]
    fn test_extract_json_like_with_trailing_commas() {
        let cleaned = FlexibleExtractor::new()
            .extract_json_like("Here's the data: {\"result\": \"success\", \"code\": 200,}");
        assert_eq!(cleaned, some(r#"{"result": "success", "code": 200}"#));
    }
}