1use super::core::{ContentExtractor, ExtractionStrategy};
2
3use super::error::ParseError;
4use fuzzy_parser::sanitize_json;
5use log::debug;
6use regex::Regex;
7
/// Pulls structured content out of free-form text using one or more
/// extraction strategies.
///
/// Derives `Debug` and `Clone` so the extractor can be logged and copied
/// like any other small configuration value.
#[derive(Debug, Clone)]
pub struct FlexibleExtractor {
    /// When `true`, extraction attempts and failures are reported via `debug!`.
    debug_mode: bool,
}
12
13impl FlexibleExtractor {
14 pub fn new() -> Self {
15 Self { debug_mode: false }
16 }
17
18 pub fn with_debug(mut self) -> Self {
19 self.debug_mode = true;
20 self
21 }
22
23 pub fn standard_extraction_strategies() -> Vec<ExtractionStrategy> {
24 vec![
25 ExtractionStrategy::TaggedContent("answer".to_string()),
26 ExtractionStrategy::JsonBrackets,
27 ExtractionStrategy::FirstJsonObject,
28 ]
29 }
30
31 pub fn extract(&self, text: &str) -> Result<String, ParseError> {
33 if self.debug_mode {
34 debug!("Extracting content from text: {}", text);
35 }
36 self.extract_with_strategies(text, &Self::standard_extraction_strategies())
37 }
38
39 pub fn extract_with_strategy(
41 &self,
42 text: &str,
43 strategy: &ExtractionStrategy,
44 ) -> Option<String> {
45 if self.debug_mode {
46 debug!("Trying extraction strategy: {:?}", strategy);
47 }
48
49 match strategy {
50 ExtractionStrategy::TaggedContent(tag) => self.extract_tagged(text, tag),
51 ExtractionStrategy::JsonBrackets => self.extract_json_like(text),
52 ExtractionStrategy::FirstJsonObject => self.extract_first_json_object(text),
53 ExtractionStrategy::KeywordSearch(keywords) => self.extract_by_keywords(text, keywords),
54 ExtractionStrategy::RegexPattern(pattern) => self.extract_pattern(text, pattern),
55 ExtractionStrategy::OriginalText => Some(text.to_string()),
56 }
57 }
58
59 pub fn extract_with_strategies(
61 &self,
62 text: &str,
63 strategies: &[ExtractionStrategy],
64 ) -> Result<String, ParseError> {
65 let mut errors = Vec::new();
66
67 for strategy in strategies {
68 if let Some(result) = self.extract_with_strategy(text, strategy) {
69 if self.debug_mode {
70 debug!("Successfully extracted with strategy: {:?}", strategy);
71 }
72 return Ok(result);
73 } else {
74 errors.push(format!("Strategy {:?} failed", strategy));
75 }
76 }
77
78 Err(ParseError::AllStrategiesFailed(errors))
79 }
80
81 fn extract_first_json_entity(&self, text: &str) -> Option<String> {
83 let mut bracket_count = 0;
84 let mut start_pos = None;
85 let mut in_string = false;
86 let mut escape_next = false;
87 let mut opening_char = None;
88
89 for (i, ch) in text.char_indices() {
90 if escape_next {
91 escape_next = false;
92 continue;
93 }
94
95 match ch {
96 '\\' if in_string => escape_next = true,
97 '"' => in_string = !in_string,
98 '{' | '[' if !in_string => {
99 if bracket_count == 0 {
100 start_pos = Some(i);
101 opening_char = Some(ch);
102 }
103 bracket_count += 1;
104 }
105 '}' | ']' if !in_string => {
106 bracket_count -= 1;
107 if bracket_count == 0
108 && let Some(p) = start_pos
109 && let Some(opening) = opening_char
110 {
111 let is_valid =
113 (opening == '{' && ch == '}') || (opening == '[' && ch == ']');
114 if is_valid {
115 return Some(text[p..=i].to_string());
116 }
117 }
118 }
119 _ => {}
120 }
121 }
122
123 None
124 }
125
126 fn extract_first_json_object(&self, text: &str) -> Option<String> {
128 self.extract_first_json_entity(text)
129 .map(|json| sanitize_json(&json))
130 }
131
132 fn extract_by_keywords(&self, text: &str, keywords: &[String]) -> Option<String> {
134 let lower_text = text.to_lowercase();
135
136 for keyword in keywords {
137 if lower_text.contains(&keyword.to_lowercase()) {
138 return Some(keyword.clone());
140 }
141 }
142
143 None
144 }
145}
146
147impl Default for FlexibleExtractor {
148 fn default() -> Self {
149 Self::new()
150 }
151}
152
153impl ContentExtractor for FlexibleExtractor {
154 fn extract_tagged(&self, text: &str, tag: &str) -> Option<String> {
155 let pattern = format!(r"(?s)<{tag}>(.*?)</{tag}>", tag = regex::escape(tag));
157
158 if let Ok(regex) = Regex::new(&pattern)
159 && let Some(captures) = regex.captures(text)
160 && let Some(content) = captures.get(1)
161 {
162 return Some(content.as_str().trim().to_string());
163 }
164
165 if self.debug_mode {
166 debug!("Failed to extract tagged content with tag: {}", tag);
167 }
168
169 None
170 }
171
172 fn extract_json_like(&self, text: &str) -> Option<String> {
173 let result = self
175 .extract_first_json_entity(text)
176 .map(|json| sanitize_json(&json));
177
178 if result.is_none() && self.debug_mode {
179 debug!("Failed to extract JSON-like content");
180 }
181
182 result
183 }
184
185 fn extract_pattern(&self, text: &str, pattern: &str) -> Option<String> {
186 if let Ok(regex) = Regex::new(pattern)
187 && let Some(captures) = regex.captures(text)
188 {
189 if captures.len() > 1 {
191 return captures.get(1).map(|m| m.as_str().to_string());
192 } else {
193 return captures.get(0).map(|m| m.as_str().to_string());
194 }
195 }
196
197 if self.debug_mode {
198 debug!("Failed to extract with pattern: {}", pattern);
199 }
200
201 None
202 }
203}
204
/// Extracts the contents of a fenced (triple-backtick) markdown code block.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so configured extractors
/// can be logged, copied and compared.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownCodeBlockExtractor {
    /// Language tag the opening fence must carry; `None` accepts any fence.
    pub language: Option<String>,
}
210
211impl Default for MarkdownCodeBlockExtractor {
212 fn default() -> Self {
213 Self::new()
214 }
215}
216
217impl MarkdownCodeBlockExtractor {
218 pub fn new() -> Self {
220 Self { language: None }
221 }
222
223 pub fn with_language(language: String) -> Self {
225 Self {
226 language: Some(language),
227 }
228 }
229
230 pub fn extract(&self, text: &str) -> Result<String, ParseError> {
232 let pattern = if let Some(ref lang) = self.language {
233 format!(
235 r"(?m)^\s*```\s*{}\s*\n((?:.*\n)*?)^\s*```\s*$",
236 regex::escape(lang)
237 )
238 } else {
239 r"(?m)^\s*```[^\n]*\n((?:.*\n)*?)^\s*```\s*$".to_string()
241 };
242
243 let regex = Regex::new(&pattern)
244 .map_err(|e| ParseError::InvalidFormat(format!("Failed to compile regex: {}", e)))?;
245
246 if let Some(captures) = regex.captures(text)
247 && let Some(content) = captures.get(1)
248 {
249 let extracted = content.as_str().trim_end();
251 return Ok(extracted.to_string());
252 }
253
254 Err(ParseError::TagExtractionFailed(format!(
255 "No markdown code block found{}",
256 if let Some(ref lang) = self.language {
257 format!(" with language '{}'", lang)
258 } else {
259 String::new()
260 }
261 )))
262 }
263}
264
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the extractor under test with debug logging off.
    fn plain() -> FlexibleExtractor {
        FlexibleExtractor::new()
    }

    /// Tagged content is returned without surrounding whitespace.
    #[test]
    fn test_extract_tagged_content() {
        let subject = plain();

        let compact = subject.extract_tagged("<answer>Hello World</answer>", "answer");
        assert_eq!(compact.as_deref(), Some("Hello World"));

        let padded = subject.extract_tagged("<answer>\n Hello World \n</answer>", "answer");
        assert_eq!(padded.as_deref(), Some("Hello World"));
    }

    /// A JSON object embedded in prose is isolated.
    #[test]
    fn test_extract_json_like() {
        let subject = plain();

        let found =
            subject.extract_json_like("Here is some JSON: {\"key\": \"value\"} and more text");
        assert_eq!(found.as_deref(), Some("{\"key\": \"value\"}"));
    }

    /// Only the first of several objects is returned.
    #[test]
    fn test_extract_first_json_object() {
        let subject = plain();

        let found = subject.extract_first_json_object(
            "Some text {\"first\": \"object\"} more text {\"second\": \"object\"}",
        );
        assert_eq!(found.as_deref(), Some("{\"first\": \"object\"}"));
    }

    /// Arrays are extracted by both JSON entry points.
    #[test]
    fn test_extract_json_array() {
        let subject = plain();
        let input = "Here is an array: [{\"key\": \"value\"}] and more text";
        let expected = Some("[{\"key\": \"value\"}]");

        assert_eq!(subject.extract_first_json_object(input).as_deref(), expected);
        assert_eq!(subject.extract_json_like(input).as_deref(), expected);
    }

    /// Keyword matching ignores case and returns the keyword as listed.
    #[test]
    fn test_extract_by_keywords() {
        let subject = plain();
        let keywords = vec!["Comfort".to_string(), "Debug".to_string()];

        let hit = subject.extract_by_keywords("This is about comfort and support", &keywords);
        assert_eq!(hit.as_deref(), Some("Comfort"));
    }

    /// The first successful strategy in the list wins.
    #[test]
    fn test_extraction_strategies() {
        let subject = plain();
        let strategies = vec![
            ExtractionStrategy::TaggedContent("answer".to_string()),
            ExtractionStrategy::JsonBrackets,
            ExtractionStrategy::OriginalText,
        ];

        let outcome = subject
            .extract_with_strategies("<answer>{\"type\": \"success\"}</answer>", &strategies);
        assert!(outcome.is_ok());
        assert_eq!(outcome.ok().as_deref(), Some("{\"type\": \"success\"}"));
    }

    /// Trailing commas in objects are removed.
    #[test]
    fn test_clean_json_trailing_commas_object() {
        let subject = plain();
        let cleaned = |raw: &str| subject.extract_first_json_object(raw);

        assert_eq!(
            cleaned(r#"{"name": "Alice", "age": 30,}"#).as_deref(),
            Some(r#"{"name": "Alice", "age": 30}"#)
        );
        assert_eq!(
            cleaned(r#"{"name": "Bob", "age": 25, }"#).as_deref(),
            Some(r#"{"name": "Bob", "age": 25 }"#)
        );
    }

    /// Trailing commas in arrays are removed.
    #[test]
    fn test_clean_json_trailing_commas_array() {
        let subject = plain();
        let cleaned = |raw: &str| subject.extract_first_json_object(raw);

        assert_eq!(
            cleaned(r#"["apple", "banana", "cherry",]"#).as_deref(),
            Some(r#"["apple", "banana", "cherry"]"#)
        );
        assert_eq!(
            cleaned(r#"[1, 2, 3, ]"#).as_deref(),
            Some(r#"[1, 2, 3 ]"#)
        );
    }

    /// Trailing commas are removed at every nesting level.
    #[test]
    fn test_clean_json_trailing_commas_nested() {
        let subject = plain();

        let cleaned = subject
            .extract_first_json_object(r#"{"items": [{"a": 1,}, {"b": 2,},], "count": 2,}"#);
        assert_eq!(
            cleaned.as_deref(),
            Some(r#"{"items": [{"a": 1}, {"b": 2}], "count": 2}"#)
        );
    }

    /// Commas inside string values are left untouched.
    #[test]
    fn test_clean_json_preserves_commas_in_strings() {
        let subject = plain();
        let cleaned = |raw: &str| subject.extract_first_json_object(raw);

        assert_eq!(
            cleaned(r#"{"message": "Hello, world", "items": "a, b, c"}"#).as_deref(),
            Some(r#"{"message": "Hello, world", "items": "a, b, c"}"#)
        );
        assert_eq!(
            cleaned(r#"{"msg": "test, data", "val": 1,}"#).as_deref(),
            Some(r#"{"msg": "test, data", "val": 1}"#)
        );
    }

    /// Already-valid JSON comes back unchanged.
    #[test]
    fn test_clean_json_valid_json_unchanged() {
        let subject = plain();

        let object = r#"{"name": "Alice", "age": 30}"#;
        assert_eq!(
            subject.extract_first_json_object(object).as_deref(),
            Some(object)
        );

        let array = r#"["a", "b", "c"]"#;
        assert_eq!(
            subject.extract_first_json_object(array).as_deref(),
            Some(array)
        );
    }

    /// The trait entry point also strips trailing commas.
    #[test]
    fn test_extract_json_like_with_trailing_commas() {
        let subject = plain();

        let cleaned = subject
            .extract_json_like("Here's the data: {\"result\": \"success\", \"code\": 200,}");
        assert_eq!(
            cleaned.as_deref(),
            Some(r#"{"result": "success", "code": 200}"#)
        );
    }
}