1use super::core::{ContentExtractor, ExtractionStrategy};
2
3use super::error::ParseError;
4use fuzzy_parser::sanitize_json;
5use regex::Regex;
6
// Debug-logging shim used throughout this file. With the `log` feature
// enabled, every token passed to `debug_log!` is handed to `log::debug!`
// unchanged.
#[cfg(feature = "log")]
macro_rules! debug_log {
    ($($fmt_args:tt)*) => {
        log::debug!($($fmt_args)*)
    };
}
12
// Fallback for builds without the `log` feature: the invocation expands to
// nothing, so the arguments are discarded at compile time and never evaluated.
#[cfg(not(feature = "log"))]
macro_rules! debug_log {
    ($($ignored:tt)*) => {};
}
17
/// Extractor that pulls a piece of content out of free-form text by trying
/// one or more extraction strategies.
///
/// `Debug` and `Clone` are derived so the extractor can be printed in
/// diagnostics and duplicated by callers.
#[derive(Debug, Clone)]
pub struct FlexibleExtractor {
    /// When `true`, the extraction methods emit `debug_log!` messages.
    debug_mode: bool,
}
22
23impl FlexibleExtractor {
24 pub fn new() -> Self {
25 Self { debug_mode: false }
26 }
27
28 pub fn with_debug(mut self) -> Self {
29 self.debug_mode = true;
30 self
31 }
32
33 pub fn standard_extraction_strategies() -> Vec<ExtractionStrategy> {
34 vec![
35 ExtractionStrategy::TaggedContent("answer".to_string()),
36 ExtractionStrategy::JsonBrackets,
37 ExtractionStrategy::FirstJsonObject,
38 ]
39 }
40
41 pub fn extract(&self, text: &str) -> Result<String, ParseError> {
43 if self.debug_mode {
44 debug_log!("Extracting content from text: {}", text);
45 }
46 self.extract_with_strategies(text, &Self::standard_extraction_strategies())
47 }
48
49 pub fn extract_with_strategy(
51 &self,
52 text: &str,
53 strategy: &ExtractionStrategy,
54 ) -> Option<String> {
55 if self.debug_mode {
56 debug_log!("Trying extraction strategy: {:?}", strategy);
57 }
58
59 match strategy {
60 ExtractionStrategy::TaggedContent(tag) => self.extract_tagged(text, tag),
61 ExtractionStrategy::JsonBrackets => self.extract_json_like(text),
62 ExtractionStrategy::FirstJsonObject => self.extract_first_json_object(text),
63 ExtractionStrategy::KeywordSearch(keywords) => self.extract_by_keywords(text, keywords),
64 ExtractionStrategy::RegexPattern(pattern) => self.extract_pattern(text, pattern),
65 ExtractionStrategy::OriginalText => Some(text.to_string()),
66 }
67 }
68
69 pub fn extract_with_strategies(
71 &self,
72 text: &str,
73 strategies: &[ExtractionStrategy],
74 ) -> Result<String, ParseError> {
75 let mut errors = Vec::new();
76
77 for strategy in strategies {
78 if let Some(result) = self.extract_with_strategy(text, strategy) {
79 if self.debug_mode {
80 debug_log!("Successfully extracted with strategy: {:?}", strategy);
81 }
82 return Ok(result);
83 } else {
84 errors.push(format!("Strategy {:?} failed", strategy));
85 }
86 }
87
88 Err(ParseError::AllStrategiesFailed(errors))
89 }
90
91 fn extract_first_json_entity(&self, text: &str) -> Option<String> {
93 let mut bracket_count = 0;
94 let mut start_pos = None;
95 let mut in_string = false;
96 let mut escape_next = false;
97 let mut opening_char = None;
98
99 for (i, ch) in text.char_indices() {
100 if escape_next {
101 escape_next = false;
102 continue;
103 }
104
105 match ch {
106 '\\' if in_string => escape_next = true,
107 '"' => in_string = !in_string,
108 '{' | '[' if !in_string => {
109 if bracket_count == 0 {
110 start_pos = Some(i);
111 opening_char = Some(ch);
112 }
113 bracket_count += 1;
114 }
115 '}' | ']' if !in_string => {
116 bracket_count -= 1;
117 if bracket_count == 0
118 && let Some(p) = start_pos
119 && let Some(opening) = opening_char
120 {
121 let is_valid =
123 (opening == '{' && ch == '}') || (opening == '[' && ch == ']');
124 if is_valid {
125 return Some(text[p..=i].to_string());
126 }
127 }
128 }
129 _ => {}
130 }
131 }
132
133 None
134 }
135
136 fn extract_first_json_object(&self, text: &str) -> Option<String> {
138 self.extract_first_json_entity(text)
139 .map(|json| sanitize_json(&json))
140 }
141
142 fn extract_by_keywords(&self, text: &str, keywords: &[String]) -> Option<String> {
144 let lower_text = text.to_lowercase();
145
146 for keyword in keywords {
147 if lower_text.contains(&keyword.to_lowercase()) {
148 return Some(keyword.clone());
150 }
151 }
152
153 None
154 }
155}
156
157impl Default for FlexibleExtractor {
158 fn default() -> Self {
159 Self::new()
160 }
161}
162
163impl ContentExtractor for FlexibleExtractor {
164 fn extract_tagged(&self, text: &str, tag: &str) -> Option<String> {
165 let pattern = format!(r"(?s)<{tag}>(.*?)</{tag}>", tag = regex::escape(tag));
167
168 if let Ok(regex) = Regex::new(&pattern)
169 && let Some(captures) = regex.captures(text)
170 && let Some(content) = captures.get(1)
171 {
172 return Some(content.as_str().trim().to_string());
173 }
174
175 if self.debug_mode {
176 debug_log!("Failed to extract tagged content with tag: {}", tag);
177 }
178
179 None
180 }
181
182 fn extract_json_like(&self, text: &str) -> Option<String> {
183 let result = self
185 .extract_first_json_entity(text)
186 .map(|json| sanitize_json(&json));
187
188 if result.is_none() && self.debug_mode {
189 debug_log!("Failed to extract JSON-like content");
190 }
191
192 result
193 }
194
195 fn extract_pattern(&self, text: &str, pattern: &str) -> Option<String> {
196 if let Ok(regex) = Regex::new(pattern)
197 && let Some(captures) = regex.captures(text)
198 {
199 if captures.len() > 1 {
201 return captures.get(1).map(|m| m.as_str().to_string());
202 } else {
203 return captures.get(0).map(|m| m.as_str().to_string());
204 }
205 }
206
207 if self.debug_mode {
208 debug_log!("Failed to extract with pattern: {}", pattern);
209 }
210
211 None
212 }
213}
214
/// Extractor for the contents of a fenced (triple-backtick) markdown code
/// block.
///
/// `Debug` and `Clone` are derived so the extractor can be printed in
/// diagnostics and duplicated by callers.
#[derive(Debug, Clone)]
pub struct MarkdownCodeBlockExtractor {
    /// Language tag the opening fence must carry; `None` accepts any fence.
    pub language: Option<String>,
}
220
221impl Default for MarkdownCodeBlockExtractor {
222 fn default() -> Self {
223 Self::new()
224 }
225}
226
227impl MarkdownCodeBlockExtractor {
228 pub fn new() -> Self {
230 Self { language: None }
231 }
232
233 pub fn with_language(language: String) -> Self {
235 Self {
236 language: Some(language),
237 }
238 }
239
240 pub fn extract(&self, text: &str) -> Result<String, ParseError> {
242 let pattern = if let Some(ref lang) = self.language {
243 format!(
245 r"(?m)^\s*```\s*{}\s*\n((?:.*\n)*?)^\s*```\s*$",
246 regex::escape(lang)
247 )
248 } else {
249 r"(?m)^\s*```[^\n]*\n((?:.*\n)*?)^\s*```\s*$".to_string()
251 };
252
253 let regex = Regex::new(&pattern)
254 .map_err(|e| ParseError::InvalidFormat(format!("Failed to compile regex: {}", e)))?;
255
256 if let Some(captures) = regex.captures(text)
257 && let Some(content) = captures.get(1)
258 {
259 let extracted = content.as_str().trim_end();
261 return Ok(extracted.to_string());
262 }
263
264 Err(ParseError::TagExtractionFailed(format!(
265 "No markdown code block found{}",
266 if let Some(ref lang) = self.language {
267 format!(" with language '{}'", lang)
268 } else {
269 String::new()
270 }
271 )))
272 }
273}
274
#[cfg(test)]
mod tests {
    use super::*;

    /// Tag extraction returns the trimmed text between the tags.
    #[test]
    fn test_extract_tagged_content() {
        let extractor = FlexibleExtractor::new();
        let expected = Some("Hello World".to_string());

        let plain = extractor.extract_tagged("<answer>Hello World</answer>", "answer");
        assert_eq!(plain, expected);

        let padded = extractor.extract_tagged("<answer>\n Hello World \n</answer>", "answer");
        assert_eq!(padded, expected);
    }

    /// A JSON object embedded in prose is returned without the prose.
    #[test]
    fn test_extract_json_like() {
        let extractor = FlexibleExtractor::new();
        let input = "Here is some JSON: {\"key\": \"value\"} and more text";

        let expected = "{\"key\": \"value\"}".to_string();
        assert_eq!(extractor.extract_json_like(input), Some(expected));
    }

    /// Only the first of several objects is returned.
    #[test]
    fn test_extract_first_json_object() {
        let extractor = FlexibleExtractor::new();
        let input = "Some text {\"first\": \"object\"} more text {\"second\": \"object\"}";

        let expected = "{\"first\": \"object\"}".to_string();
        assert_eq!(extractor.extract_first_json_object(input), Some(expected));
    }

    /// Arrays are extracted by both JSON helpers.
    #[test]
    fn test_extract_json_array() {
        let extractor = FlexibleExtractor::new();
        let input = "Here is an array: [{\"key\": \"value\"}] and more text";
        let expected = Some("[{\"key\": \"value\"}]".to_string());

        let via_object = extractor.extract_first_json_object(input);
        assert_eq!(via_object, expected);

        let via_json_like = extractor.extract_json_like(input);
        assert_eq!(via_json_like, expected);
    }

    /// Keyword matching ignores case and returns the keyword as given.
    #[test]
    fn test_extract_by_keywords() {
        let extractor = FlexibleExtractor::new();
        let keywords = vec!["Comfort".to_string(), "Debug".to_string()];

        let found = extractor.extract_by_keywords("This is about comfort and support", &keywords);
        assert_eq!(found, Some("Comfort".to_string()));
    }

    /// The first strategy in the list that succeeds determines the result.
    #[test]
    fn test_extraction_strategies() {
        let extractor = FlexibleExtractor::new();
        let strategies = vec![
            ExtractionStrategy::TaggedContent("answer".to_string()),
            ExtractionStrategy::JsonBrackets,
            ExtractionStrategy::OriginalText,
        ];

        let outcome = extractor
            .extract_with_strategies("<answer>{\"type\": \"success\"}</answer>", &strategies);
        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap(), "{\"type\": \"success\"}");
    }

    /// Trailing commas before `}` are removed.
    #[test]
    fn test_clean_json_trailing_commas_object() {
        let extractor = FlexibleExtractor::new();
        let cases = [
            (
                r#"{"name": "Alice", "age": 30,}"#,
                r#"{"name": "Alice", "age": 30}"#,
            ),
            (
                r#"{"name": "Bob", "age": 25, }"#,
                r#"{"name": "Bob", "age": 25 }"#,
            ),
        ];

        for (input, expected) in cases {
            let cleaned = extractor.extract_first_json_object(input);
            assert_eq!(cleaned, Some(expected.to_string()));
        }
    }

    /// Trailing commas before `]` are removed.
    #[test]
    fn test_clean_json_trailing_commas_array() {
        let extractor = FlexibleExtractor::new();
        let cases = [
            (
                r#"["apple", "banana", "cherry",]"#,
                r#"["apple", "banana", "cherry"]"#,
            ),
            (r#"[1, 2, 3, ]"#, r#"[1, 2, 3 ]"#),
        ];

        for (input, expected) in cases {
            let cleaned = extractor.extract_first_json_object(input);
            assert_eq!(cleaned, Some(expected.to_string()));
        }
    }

    /// Trailing commas are removed at every nesting level.
    #[test]
    fn test_clean_json_trailing_commas_nested() {
        let extractor = FlexibleExtractor::new();
        let input = r#"{"items": [{"a": 1,}, {"b": 2,},], "count": 2,}"#;

        let cleaned = extractor.extract_first_json_object(input);
        let expected = r#"{"items": [{"a": 1}, {"b": 2}], "count": 2}"#;
        assert_eq!(cleaned, Some(expected.to_string()));
    }

    /// Commas inside string values are left alone.
    #[test]
    fn test_clean_json_preserves_commas_in_strings() {
        let extractor = FlexibleExtractor::new();

        let untouched = r#"{"message": "Hello, world", "items": "a, b, c"}"#;
        assert_eq!(
            extractor.extract_first_json_object(untouched),
            Some(r#"{"message": "Hello, world", "items": "a, b, c"}"#.to_string())
        );

        let with_trailing = r#"{"msg": "test, data", "val": 1,}"#;
        assert_eq!(
            extractor.extract_first_json_object(with_trailing),
            Some(r#"{"msg": "test, data", "val": 1}"#.to_string())
        );
    }

    /// Already-valid JSON comes back unchanged.
    #[test]
    fn test_clean_json_valid_json_unchanged() {
        let extractor = FlexibleExtractor::new();
        let valid_inputs = [r#"{"name": "Alice", "age": 30}"#, r#"["a", "b", "c"]"#];

        for input in valid_inputs {
            let cleaned = extractor.extract_first_json_object(input);
            assert_eq!(cleaned, Some(input.to_string()));
        }
    }

    /// `extract_json_like` applies the same trailing-comma cleanup.
    #[test]
    fn test_extract_json_like_with_trailing_commas() {
        let extractor = FlexibleExtractor::new();
        let input = "Here's the data: {\"result\": \"success\", \"code\": 200,}";

        let cleaned = extractor.extract_json_like(input);
        let expected = r#"{"result": "success", "code": 200}"#;
        assert_eq!(cleaned, Some(expected.to_string()));
    }
}