Skip to main content

llm_toolkit/extract/
extractors.rs

1use super::core::{ContentExtractor, ExtractionStrategy};
2
3use super::error::ParseError;
4use fuzzy_parser::sanitize_json;
5use regex::Regex;
6
// Debug logging shim: with the `log` feature enabled every invocation is
// forwarded verbatim to `log::debug!`; without it the invocation expands to
// nothing at all.
#[cfg(feature = "log")]
macro_rules! debug_log {
    ($($tokens:tt)*) => {
        log::debug!($($tokens)*)
    };
}

#[cfg(not(feature = "log"))]
macro_rules! debug_log {
    ($($tokens:tt)*) => {};
}
17
/// Flexible content extractor with multiple strategies.
///
/// Build one with `FlexibleExtractor::new()` (or `Default::default()`) and
/// optionally chain `with_debug()` to turn on debug logging.
#[derive(Debug, Clone)]
pub struct FlexibleExtractor {
    /// When `true`, the extraction methods report their progress and failures
    /// through the `debug_log!` macro.
    debug_mode: bool,
}
22
23impl FlexibleExtractor {
24    pub fn new() -> Self {
25        Self { debug_mode: false }
26    }
27
28    pub fn with_debug(mut self) -> Self {
29        self.debug_mode = true;
30        self
31    }
32
33    pub fn standard_extraction_strategies() -> Vec<ExtractionStrategy> {
34        vec![
35            ExtractionStrategy::TaggedContent("answer".to_string()),
36            ExtractionStrategy::JsonBrackets,
37            ExtractionStrategy::FirstJsonObject,
38        ]
39    }
40
41    /// Standard extraction
42    pub fn extract(&self, text: &str) -> Result<String, ParseError> {
43        if self.debug_mode {
44            debug_log!("Extracting content from text: {}", text);
45        }
46        self.extract_with_strategies(text, &Self::standard_extraction_strategies())
47    }
48
49    /// Extract content using specified strategy
50    pub fn extract_with_strategy(
51        &self,
52        text: &str,
53        strategy: &ExtractionStrategy,
54    ) -> Option<String> {
55        if self.debug_mode {
56            debug_log!("Trying extraction strategy: {:?}", strategy);
57        }
58
59        match strategy {
60            ExtractionStrategy::TaggedContent(tag) => self.extract_tagged(text, tag),
61            ExtractionStrategy::JsonBrackets => self.extract_json_like(text),
62            ExtractionStrategy::FirstJsonObject => self.extract_first_json_object(text),
63            ExtractionStrategy::KeywordSearch(keywords) => self.extract_by_keywords(text, keywords),
64            ExtractionStrategy::RegexPattern(pattern) => self.extract_pattern(text, pattern),
65            ExtractionStrategy::OriginalText => Some(text.to_string()),
66        }
67    }
68
69    /// Try multiple extraction strategies in order
70    pub fn extract_with_strategies(
71        &self,
72        text: &str,
73        strategies: &[ExtractionStrategy],
74    ) -> Result<String, ParseError> {
75        let mut errors = Vec::new();
76
77        for strategy in strategies {
78            if let Some(result) = self.extract_with_strategy(text, strategy) {
79                if self.debug_mode {
80                    debug_log!("Successfully extracted with strategy: {:?}", strategy);
81                }
82                return Ok(result);
83            } else {
84                errors.push(format!("Strategy {:?} failed", strategy));
85            }
86        }
87
88        Err(ParseError::AllStrategiesFailed(errors))
89    }
90
91    /// Extract first complete JSON entity (object or array) from text
92    fn extract_first_json_entity(&self, text: &str) -> Option<String> {
93        let mut bracket_count = 0;
94        let mut start_pos = None;
95        let mut in_string = false;
96        let mut escape_next = false;
97        let mut opening_char = None;
98
99        for (i, ch) in text.char_indices() {
100            if escape_next {
101                escape_next = false;
102                continue;
103            }
104
105            match ch {
106                '\\' if in_string => escape_next = true,
107                '"' => in_string = !in_string,
108                '{' | '[' if !in_string => {
109                    if bracket_count == 0 {
110                        start_pos = Some(i);
111                        opening_char = Some(ch);
112                    }
113                    bracket_count += 1;
114                }
115                '}' | ']' if !in_string => {
116                    bracket_count -= 1;
117                    if bracket_count == 0
118                        && let Some(p) = start_pos
119                        && let Some(opening) = opening_char
120                    {
121                        // Verify matching brackets
122                        let is_valid =
123                            (opening == '{' && ch == '}') || (opening == '[' && ch == ']');
124                        if is_valid {
125                            return Some(text[p..=i].to_string());
126                        }
127                    }
128                }
129                _ => {}
130            }
131        }
132
133        None
134    }
135
136    /// Extract first complete JSON object from text
137    fn extract_first_json_object(&self, text: &str) -> Option<String> {
138        self.extract_first_json_entity(text)
139            .map(|json| sanitize_json(&json))
140    }
141
142    /// Extract content based on keyword matching
143    fn extract_by_keywords(&self, text: &str, keywords: &[String]) -> Option<String> {
144        let lower_text = text.to_lowercase();
145
146        for keyword in keywords {
147            if lower_text.contains(&keyword.to_lowercase()) {
148                // Return the keyword as the extracted content
149                return Some(keyword.clone());
150            }
151        }
152
153        None
154    }
155}
156
157impl Default for FlexibleExtractor {
158    fn default() -> Self {
159        Self::new()
160    }
161}
162
163impl ContentExtractor for FlexibleExtractor {
164    fn extract_tagged(&self, text: &str, tag: &str) -> Option<String> {
165        // Create regex pattern for XML-like tags
166        let pattern = format!(r"(?s)<{tag}>(.*?)</{tag}>", tag = regex::escape(tag));
167
168        if let Ok(regex) = Regex::new(&pattern)
169            && let Some(captures) = regex.captures(text)
170            && let Some(content) = captures.get(1)
171        {
172            return Some(content.as_str().trim().to_string());
173        }
174
175        if self.debug_mode {
176            debug_log!("Failed to extract tagged content with tag: {}", tag);
177        }
178
179        None
180    }
181
182    fn extract_json_like(&self, text: &str) -> Option<String> {
183        // Delegate to extract_first_json_entity for proper handling
184        let result = self
185            .extract_first_json_entity(text)
186            .map(|json| sanitize_json(&json));
187
188        if result.is_none() && self.debug_mode {
189            debug_log!("Failed to extract JSON-like content");
190        }
191
192        result
193    }
194
195    fn extract_pattern(&self, text: &str, pattern: &str) -> Option<String> {
196        if let Ok(regex) = Regex::new(pattern)
197            && let Some(captures) = regex.captures(text)
198        {
199            // Return the first capture group, or the whole match if no groups
200            if captures.len() > 1 {
201                return captures.get(1).map(|m| m.as_str().to_string());
202            } else {
203                return captures.get(0).map(|m| m.as_str().to_string());
204            }
205        }
206
207        if self.debug_mode {
208            debug_log!("Failed to extract with pattern: {}", pattern);
209        }
210
211        None
212    }
213}
214
/// Extractor for Markdown code blocks.
#[derive(Debug, Clone)]
pub struct MarkdownCodeBlockExtractor {
    /// Optional language to filter by (e.g., "rust", "python").
    /// `None` accepts a fenced block with any (or no) language specifier.
    pub language: Option<String>,
}
220
221impl Default for MarkdownCodeBlockExtractor {
222    fn default() -> Self {
223        Self::new()
224    }
225}
226
227impl MarkdownCodeBlockExtractor {
228    /// Create a new extractor for any code block
229    pub fn new() -> Self {
230        Self { language: None }
231    }
232
233    /// Create a new extractor for a specific language
234    pub fn with_language(language: String) -> Self {
235        Self {
236            language: Some(language),
237        }
238    }
239
240    /// Extract content from a markdown code block
241    pub fn extract(&self, text: &str) -> Result<String, ParseError> {
242        let pattern = if let Some(ref lang) = self.language {
243            // Match code block with specific language
244            format!(
245                r"(?m)^\s*```\s*{}\s*\n((?:.*\n)*?)^\s*```\s*$",
246                regex::escape(lang)
247            )
248        } else {
249            // Match any code block (with or without language specifier)
250            r"(?m)^\s*```[^\n]*\n((?:.*\n)*?)^\s*```\s*$".to_string()
251        };
252
253        let regex = Regex::new(&pattern)
254            .map_err(|e| ParseError::InvalidFormat(format!("Failed to compile regex: {}", e)))?;
255
256        if let Some(captures) = regex.captures(text)
257            && let Some(content) = captures.get(1)
258        {
259            // Trim surrounding newlines but preserve internal formatting
260            let extracted = content.as_str().trim_end();
261            return Ok(extracted.to_string());
262        }
263
264        Err(ParseError::TagExtractionFailed(format!(
265            "No markdown code block found{}",
266            if let Some(ref lang) = self.language {
267                format!(" with language '{}'", lang)
268            } else {
269                String::new()
270            }
271        )))
272    }
273}
274
#[cfg(test)]
mod tests {
    use super::*;

    /// Tagged content is returned with surrounding whitespace trimmed.
    #[test]
    fn test_extract_tagged_content() {
        let extractor = FlexibleExtractor::new();

        let plain = extractor.extract_tagged("<answer>Hello World</answer>", "answer");
        assert_eq!(plain.as_deref(), Some("Hello World"));

        let padded = extractor.extract_tagged("<answer>\n  Hello World  \n</answer>", "answer");
        assert_eq!(padded.as_deref(), Some("Hello World"));
    }

    /// A JSON object embedded in prose is located.
    #[test]
    fn test_extract_json_like() {
        let extractor = FlexibleExtractor::new();

        let found =
            extractor.extract_json_like(r#"Here is some JSON: {"key": "value"} and more text"#);
        assert_eq!(found.as_deref(), Some(r#"{"key": "value"}"#));
    }

    /// Only the first of several objects is returned.
    #[test]
    fn test_extract_first_json_object() {
        let extractor = FlexibleExtractor::new();

        let found = extractor.extract_first_json_object(
            r#"Some text {"first": "object"} more text {"second": "object"}"#,
        );
        assert_eq!(found.as_deref(), Some(r#"{"first": "object"}"#));
    }

    /// Arrays are extracted as well, through both entry points.
    #[test]
    fn test_extract_json_array() {
        let extractor = FlexibleExtractor::new();
        let input = r#"Here is an array: [{"key": "value"}] and more text"#;
        let expected = Some(r#"[{"key": "value"}]"#);

        assert_eq!(extractor.extract_first_json_object(input).as_deref(), expected);

        // Same expectation via extract_json_like.
        assert_eq!(extractor.extract_json_like(input).as_deref(), expected);
    }

    /// Keyword matching ignores case and yields the keyword as given.
    #[test]
    fn test_extract_by_keywords() {
        let extractor = FlexibleExtractor::new();
        let keywords: Vec<String> = ["Comfort", "Debug"].iter().map(|k| k.to_string()).collect();

        let hit = extractor.extract_by_keywords("This is about comfort and support", &keywords);
        assert_eq!(hit.as_deref(), Some("Comfort"));
    }

    /// The first strategy that succeeds determines the result.
    #[test]
    fn test_extraction_strategies() {
        let extractor = FlexibleExtractor::new();
        let strategies = [
            ExtractionStrategy::TaggedContent("answer".to_string()),
            ExtractionStrategy::JsonBrackets,
            ExtractionStrategy::OriginalText,
        ];

        let outcome =
            extractor.extract_with_strategies(r#"<answer>{"type": "success"}</answer>"#, &strategies);
        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap(), r#"{"type": "success"}"#);
    }

    /// Trailing commas in objects are removed, with or without whitespace.
    #[test]
    fn test_clean_json_trailing_commas_object() {
        let extractor = FlexibleExtractor::new();
        let cases = [
            // Plain trailing comma.
            (
                r#"{"name": "Alice", "age": 30,}"#,
                r#"{"name": "Alice", "age": 30}"#,
            ),
            // Trailing comma followed by whitespace.
            (
                r#"{"name": "Bob", "age": 25, }"#,
                r#"{"name": "Bob", "age": 25 }"#,
            ),
        ];

        for (input, expected) in cases {
            assert_eq!(
                extractor.extract_first_json_object(input).as_deref(),
                Some(expected)
            );
        }
    }

    /// Trailing commas in arrays are removed, with or without whitespace.
    #[test]
    fn test_clean_json_trailing_commas_array() {
        let extractor = FlexibleExtractor::new();
        let cases = [
            // Plain trailing comma.
            (
                r#"["apple", "banana", "cherry",]"#,
                r#"["apple", "banana", "cherry"]"#,
            ),
            // Trailing comma followed by whitespace.
            (r#"[1, 2, 3, ]"#, r#"[1, 2, 3 ]"#),
        ];

        for (input, expected) in cases {
            assert_eq!(
                extractor.extract_first_json_object(input).as_deref(),
                Some(expected)
            );
        }
    }

    /// Trailing commas are removed at every nesting level.
    #[test]
    fn test_clean_json_trailing_commas_nested() {
        let extractor = FlexibleExtractor::new();

        let cleaned = extractor
            .extract_first_json_object(r#"{"items": [{"a": 1,}, {"b": 2,},], "count": 2,}"#);
        assert_eq!(
            cleaned.as_deref(),
            Some(r#"{"items": [{"a": 1}, {"b": 2}], "count": 2}"#)
        );
    }

    /// Commas that are part of string values are left untouched.
    #[test]
    fn test_clean_json_preserves_commas_in_strings() {
        let extractor = FlexibleExtractor::new();

        // No trailing comma: the text must come back unchanged.
        let untouched = r#"{"message": "Hello, world", "items": "a, b, c"}"#;
        assert_eq!(
            extractor.extract_first_json_object(untouched).as_deref(),
            Some(untouched)
        );

        // Trailing comma plus commas inside a string value.
        let cleaned = extractor.extract_first_json_object(r#"{"msg": "test, data", "val": 1,}"#);
        assert_eq!(
            cleaned.as_deref(),
            Some(r#"{"msg": "test, data", "val": 1}"#)
        );
    }

    /// Already valid JSON is returned exactly as it was given.
    #[test]
    fn test_clean_json_valid_json_unchanged() {
        let extractor = FlexibleExtractor::new();

        for input in [r#"{"name": "Alice", "age": 30}"#, r#"["a", "b", "c"]"#] {
            assert_eq!(
                extractor.extract_first_json_object(input),
                Some(input.to_string())
            );
        }
    }

    /// `extract_json_like` cleans trailing commas too.
    #[test]
    fn test_extract_json_like_with_trailing_commas() {
        let extractor = FlexibleExtractor::new();

        let cleaned = extractor
            .extract_json_like(r#"Here's the data: {"result": "success", "code": 200,}"#);
        assert_eq!(
            cleaned.as_deref(),
            Some(r#"{"result": "success", "code": 200}"#)
        );
    }
}