1use super::core::{ContentExtractor, ExtractionStrategy};
2
3use super::error::ParseError;
4use log::debug;
5use regex::Regex;
6
/// Extractor that pulls structured content (tagged sections, JSON, keyword
/// hits, regex matches) out of free-form text.
pub struct FlexibleExtractor {
    /// When `true`, the extraction methods report their progress through
    /// `log::debug!`.
    debug_mode: bool,
}
11
12impl FlexibleExtractor {
13 pub fn new() -> Self {
14 Self { debug_mode: false }
15 }
16
17 pub fn with_debug(mut self) -> Self {
18 self.debug_mode = true;
19 self
20 }
21
22 pub fn standard_extraction_strategies() -> Vec<ExtractionStrategy> {
23 vec![
24 ExtractionStrategy::TaggedContent("answer".to_string()),
25 ExtractionStrategy::JsonBrackets,
26 ExtractionStrategy::FirstJsonObject,
27 ]
28 }
29
30 pub fn extract(&self, text: &str) -> Result<String, ParseError> {
32 if self.debug_mode {
33 debug!("Extracting content from text: {}", text);
34 }
35 self.extract_with_strategies(text, &Self::standard_extraction_strategies())
36 }
37
38 pub fn extract_with_strategy(
40 &self,
41 text: &str,
42 strategy: &ExtractionStrategy,
43 ) -> Option<String> {
44 if self.debug_mode {
45 debug!("Trying extraction strategy: {:?}", strategy);
46 }
47
48 match strategy {
49 ExtractionStrategy::TaggedContent(tag) => self.extract_tagged(text, tag),
50 ExtractionStrategy::JsonBrackets => self.extract_json_like(text),
51 ExtractionStrategy::FirstJsonObject => self.extract_first_json_object(text),
52 ExtractionStrategy::KeywordSearch(keywords) => self.extract_by_keywords(text, keywords),
53 ExtractionStrategy::RegexPattern(pattern) => self.extract_pattern(text, pattern),
54 ExtractionStrategy::OriginalText => Some(text.to_string()),
55 }
56 }
57
58 pub fn extract_with_strategies(
60 &self,
61 text: &str,
62 strategies: &[ExtractionStrategy],
63 ) -> Result<String, ParseError> {
64 let mut errors = Vec::new();
65
66 for strategy in strategies {
67 if let Some(result) = self.extract_with_strategy(text, strategy) {
68 if self.debug_mode {
69 debug!("Successfully extracted with strategy: {:?}", strategy);
70 }
71 return Ok(result);
72 } else {
73 errors.push(format!("Strategy {:?} failed", strategy));
74 }
75 }
76
77 Err(ParseError::AllStrategiesFailed(errors))
78 }
79
80 fn extract_first_json_entity(&self, text: &str) -> Option<String> {
82 let mut bracket_count = 0;
83 let mut start_pos = None;
84 let mut in_string = false;
85 let mut escape_next = false;
86 let mut opening_char = None;
87
88 for (i, ch) in text.char_indices() {
89 if escape_next {
90 escape_next = false;
91 continue;
92 }
93
94 match ch {
95 '\\' if in_string => escape_next = true,
96 '"' => in_string = !in_string,
97 '{' | '[' if !in_string => {
98 if bracket_count == 0 {
99 start_pos = Some(i);
100 opening_char = Some(ch);
101 }
102 bracket_count += 1;
103 }
104 '}' | ']' if !in_string => {
105 bracket_count -= 1;
106 if bracket_count == 0
107 && let Some(p) = start_pos
108 && let Some(opening) = opening_char
109 {
110 let is_valid =
112 (opening == '{' && ch == '}') || (opening == '[' && ch == ']');
113 if is_valid {
114 return Some(text[p..=i].to_string());
115 }
116 }
117 }
118 _ => {}
119 }
120 }
121
122 None
123 }
124
/// Removes trailing commas from JSON-like text.
///
/// A comma is dropped when it is outside a string literal and the next
/// non-whitespace character is `}` or `]`. Whitespace between the removed
/// comma and the closing bracket is kept as-is. Commas inside double-quoted
/// strings (with backslash escapes honoured) are never touched.
///
/// The input is walked once via `char_indices`, looking ahead on the
/// remaining slice, so no intermediate `Vec<char>` copy of the input is made.
fn clean_json_trailing_commas(json: &str) -> String {
    let mut cleaned = String::with_capacity(json.len());
    let mut in_string = false;
    let mut escape_next = false;

    for (idx, ch) in json.char_indices() {
        if escape_next {
            // Character following a backslash inside a string: copy verbatim.
            escape_next = false;
            cleaned.push(ch);
            continue;
        }

        match ch {
            '\\' if in_string => escape_next = true,
            '"' => in_string = !in_string,
            ',' if !in_string => {
                // `,` is a single byte, so `idx + 1` is a valid char boundary.
                let next_significant = json[idx + 1..].chars().find(|c| !c.is_whitespace());
                if matches!(next_significant, Some('}' | ']')) {
                    // Trailing comma: skip it without emitting anything.
                    continue;
                }
            }
            _ => {}
        }

        cleaned.push(ch);
    }

    cleaned
}
181
182 fn extract_first_json_object(&self, text: &str) -> Option<String> {
184 self.extract_first_json_entity(text)
185 .map(|json| Self::clean_json_trailing_commas(&json))
186 }
187
188 fn extract_by_keywords(&self, text: &str, keywords: &[String]) -> Option<String> {
190 let lower_text = text.to_lowercase();
191
192 for keyword in keywords {
193 if lower_text.contains(&keyword.to_lowercase()) {
194 return Some(keyword.clone());
196 }
197 }
198
199 None
200 }
201}
202
203impl Default for FlexibleExtractor {
204 fn default() -> Self {
205 Self::new()
206 }
207}
208
209impl ContentExtractor for FlexibleExtractor {
210 fn extract_tagged(&self, text: &str, tag: &str) -> Option<String> {
211 let pattern = format!(r"(?s)<{tag}>(.*?)</{tag}>", tag = regex::escape(tag));
213
214 if let Ok(regex) = Regex::new(&pattern)
215 && let Some(captures) = regex.captures(text)
216 && let Some(content) = captures.get(1)
217 {
218 return Some(content.as_str().trim().to_string());
219 }
220
221 if self.debug_mode {
222 debug!("Failed to extract tagged content with tag: {}", tag);
223 }
224
225 None
226 }
227
228 fn extract_json_like(&self, text: &str) -> Option<String> {
229 let result = self
231 .extract_first_json_entity(text)
232 .map(|json| Self::clean_json_trailing_commas(&json));
233
234 if result.is_none() && self.debug_mode {
235 debug!("Failed to extract JSON-like content");
236 }
237
238 result
239 }
240
241 fn extract_pattern(&self, text: &str, pattern: &str) -> Option<String> {
242 if let Ok(regex) = Regex::new(pattern)
243 && let Some(captures) = regex.captures(text)
244 {
245 if captures.len() > 1 {
247 return captures.get(1).map(|m| m.as_str().to_string());
248 } else {
249 return captures.get(0).map(|m| m.as_str().to_string());
250 }
251 }
252
253 if self.debug_mode {
254 debug!("Failed to extract with pattern: {}", pattern);
255 }
256
257 None
258 }
259}
260
/// Extractor for the contents of fenced Markdown code blocks.
pub struct MarkdownCodeBlockExtractor {
    /// Language tag the opening fence must carry; `None` accepts any fence.
    pub language: Option<String>,
}
266
267impl Default for MarkdownCodeBlockExtractor {
268 fn default() -> Self {
269 Self::new()
270 }
271}
272
273impl MarkdownCodeBlockExtractor {
274 pub fn new() -> Self {
276 Self { language: None }
277 }
278
279 pub fn with_language(language: String) -> Self {
281 Self {
282 language: Some(language),
283 }
284 }
285
286 pub fn extract(&self, text: &str) -> Result<String, ParseError> {
288 let pattern = if let Some(ref lang) = self.language {
289 format!(
291 r"(?m)^\s*```\s*{}\s*\n((?:.*\n)*?)^\s*```\s*$",
292 regex::escape(lang)
293 )
294 } else {
295 r"(?m)^\s*```[^\n]*\n((?:.*\n)*?)^\s*```\s*$".to_string()
297 };
298
299 let regex = Regex::new(&pattern)
300 .map_err(|e| ParseError::InvalidFormat(format!("Failed to compile regex: {}", e)))?;
301
302 if let Some(captures) = regex.captures(text)
303 && let Some(content) = captures.get(1)
304 {
305 let extracted = content.as_str().trim_end();
307 return Ok(extracted.to_string());
308 }
309
310 Err(ParseError::TagExtractionFailed(format!(
311 "No markdown code block found{}",
312 if let Some(ref lang) = self.language {
313 format!(" with language '{}'", lang)
314 } else {
315 String::new()
316 }
317 )))
318 }
319}
320
#[cfg(test)]
mod tests {
    use super::*;

    /// Tagged content is returned with surrounding whitespace trimmed.
    #[test]
    fn test_extract_tagged_content() {
        let subject = FlexibleExtractor::new();
        let inputs = [
            "<answer>Hello World</answer>",
            "<answer>\n Hello World \n</answer>",
        ];

        for input in inputs {
            assert_eq!(
                subject.extract_tagged(input, "answer"),
                Some("Hello World".to_string())
            );
        }
    }

    /// A JSON object embedded in prose is returned on its own.
    #[test]
    fn test_extract_json_like() {
        let subject = FlexibleExtractor::new();
        let input = "Here is some JSON: {\"key\": \"value\"} and more text";

        let extracted = subject.extract_json_like(input);

        assert_eq!(extracted, Some("{\"key\": \"value\"}".to_string()));
    }

    /// Only the first of several JSON objects is returned.
    #[test]
    fn test_extract_first_json_object() {
        let subject = FlexibleExtractor::new();
        let input = "Some text {\"first\": \"object\"} more text {\"second\": \"object\"}";

        let extracted = subject.extract_first_json_object(input);

        assert_eq!(extracted, Some("{\"first\": \"object\"}".to_string()));
    }

    /// Top-level arrays are extracted by both JSON entry points.
    #[test]
    fn test_extract_json_array() {
        let subject = FlexibleExtractor::new();
        let input = "Here is an array: [{\"key\": \"value\"}] and more text";
        let expected = Some("[{\"key\": \"value\"}]".to_string());

        assert_eq!(subject.extract_first_json_object(input), expected);
        assert_eq!(subject.extract_json_like(input), expected);
    }

    /// Keyword matching ignores case and returns the keyword as configured.
    #[test]
    fn test_extract_by_keywords() {
        let subject = FlexibleExtractor::new();
        let keywords = ["Comfort".to_string(), "Debug".to_string()];

        let hit = subject.extract_by_keywords("This is about comfort and support", &keywords);

        assert_eq!(hit, Some("Comfort".to_string()));
    }

    /// The first strategy that succeeds determines the result.
    #[test]
    fn test_extraction_strategies() {
        let subject = FlexibleExtractor::new();
        let strategies = [
            ExtractionStrategy::TaggedContent("answer".to_string()),
            ExtractionStrategy::JsonBrackets,
            ExtractionStrategy::OriginalText,
        ];
        let input = "<answer>{\"type\": \"success\"}</answer>";

        let outcome = subject.extract_with_strategies(input, &strategies);

        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap(), "{\"type\": \"success\"}");
    }

    /// Trailing commas before `}` are removed; intervening whitespace stays.
    #[test]
    fn test_clean_json_trailing_commas_object() {
        let subject = FlexibleExtractor::new();
        let cases = [
            (
                r#"{"name": "Alice", "age": 30,}"#,
                r#"{"name": "Alice", "age": 30}"#,
            ),
            (
                r#"{"name": "Bob", "age": 25, }"#,
                r#"{"name": "Bob", "age": 25 }"#,
            ),
        ];

        for (input, expected) in cases {
            assert_eq!(
                subject.extract_first_json_object(input),
                Some(expected.to_string())
            );
        }
    }

    /// Trailing commas before `]` are removed; intervening whitespace stays.
    #[test]
    fn test_clean_json_trailing_commas_array() {
        let subject = FlexibleExtractor::new();
        let cases = [
            (
                r#"["apple", "banana", "cherry",]"#,
                r#"["apple", "banana", "cherry"]"#,
            ),
            (r#"[1, 2, 3, ]"#, r#"[1, 2, 3 ]"#),
        ];

        for (input, expected) in cases {
            assert_eq!(
                subject.extract_first_json_object(input),
                Some(expected.to_string())
            );
        }
    }

    /// Trailing commas are removed at every nesting level.
    #[test]
    fn test_clean_json_trailing_commas_nested() {
        let subject = FlexibleExtractor::new();
        let input = r#"{"items": [{"a": 1,}, {"b": 2,},], "count": 2,}"#;

        let cleaned = subject.extract_first_json_object(input);

        assert_eq!(
            cleaned,
            Some(r#"{"items": [{"a": 1}, {"b": 2}], "count": 2}"#.to_string())
        );
    }

    /// Commas inside string literals are never removed.
    #[test]
    fn test_clean_json_preserves_commas_in_strings() {
        let subject = FlexibleExtractor::new();
        let cases = [
            (
                r#"{"message": "Hello, world", "items": "a, b, c"}"#,
                r#"{"message": "Hello, world", "items": "a, b, c"}"#,
            ),
            (
                r#"{"msg": "test, data", "val": 1,}"#,
                r#"{"msg": "test, data", "val": 1}"#,
            ),
        ];

        for (input, expected) in cases {
            assert_eq!(
                subject.extract_first_json_object(input),
                Some(expected.to_string())
            );
        }
    }

    /// Already-valid JSON passes through unchanged.
    #[test]
    fn test_clean_json_valid_json_unchanged() {
        let subject = FlexibleExtractor::new();
        let inputs = [r#"{"name": "Alice", "age": 30}"#, r#"["a", "b", "c"]"#];

        for input in inputs {
            assert_eq!(
                subject.extract_first_json_object(input),
                Some(input.to_string())
            );
        }
    }

    /// `extract_json_like` also strips trailing commas.
    #[test]
    fn test_extract_json_like_with_trailing_commas() {
        let subject = FlexibleExtractor::new();
        let input = "Here's the data: {\"result\": \"success\", \"code\": 200,}";

        let cleaned = subject.extract_json_like(input);

        assert_eq!(
            cleaned,
            Some(r#"{"result": "success", "code": 200}"#.to_string())
        );
    }
}