llm_toolkit/extract/
extractors.rs1use super::core::{ContentExtractor, ExtractionStrategy};
2
3use super::error::ParseError;
4use log::debug;
5use regex::Regex;
6
/// Extractor that pulls a payload (tagged content, JSON, keyword, regex match)
/// out of free-form text by trying one or more extraction strategies.
///
/// Derives `Debug` and `Clone` so the extractor can be logged and duplicated
/// like any other plain configuration value.
#[derive(Debug, Clone)]
pub struct FlexibleExtractor {
    /// When `true`, extraction steps emit `debug!` log messages.
    debug_mode: bool,
}
11
12impl FlexibleExtractor {
13 pub fn new() -> Self {
14 Self { debug_mode: false }
15 }
16
17 pub fn with_debug(mut self) -> Self {
18 self.debug_mode = true;
19 self
20 }
21
22 pub fn standard_extraction_strategies() -> Vec<ExtractionStrategy> {
23 vec![
24 ExtractionStrategy::TaggedContent("answer".to_string()),
25 ExtractionStrategy::JsonBrackets,
26 ExtractionStrategy::FirstJsonObject,
27 ]
28 }
29
30 pub fn extract(&self, text: &str) -> Result<String, ParseError> {
32 if self.debug_mode {
33 debug!("Extracting content from text: {}", text);
34 }
35 self.extract_with_strategies(text, &Self::standard_extraction_strategies())
36 }
37
38 pub fn extract_with_strategy(
40 &self,
41 text: &str,
42 strategy: &ExtractionStrategy,
43 ) -> Option<String> {
44 if self.debug_mode {
45 debug!("Trying extraction strategy: {:?}", strategy);
46 }
47
48 match strategy {
49 ExtractionStrategy::TaggedContent(tag) => self.extract_tagged(text, tag),
50 ExtractionStrategy::JsonBrackets => self.extract_json_like(text),
51 ExtractionStrategy::FirstJsonObject => self.extract_first_json_object(text),
52 ExtractionStrategy::KeywordSearch(keywords) => self.extract_by_keywords(text, keywords),
53 ExtractionStrategy::RegexPattern(pattern) => self.extract_pattern(text, pattern),
54 ExtractionStrategy::OriginalText => Some(text.to_string()),
55 }
56 }
57
58 pub fn extract_with_strategies(
60 &self,
61 text: &str,
62 strategies: &[ExtractionStrategy],
63 ) -> Result<String, ParseError> {
64 let mut errors = Vec::new();
65
66 for strategy in strategies {
67 if let Some(result) = self.extract_with_strategy(text, strategy) {
68 if self.debug_mode {
69 debug!("Successfully extracted with strategy: {:?}", strategy);
70 }
71 return Ok(result);
72 } else {
73 errors.push(format!("Strategy {:?} failed", strategy));
74 }
75 }
76
77 Err(ParseError::AllStrategiesFailed(errors))
78 }
79
80 fn extract_first_json_object(&self, text: &str) -> Option<String> {
82 let mut brace_count = 0;
83 let mut start_pos = None;
84 let mut in_string = false;
85 let mut escape_next = false;
86
87 for (i, ch) in text.char_indices() {
88 if escape_next {
89 escape_next = false;
90 continue;
91 }
92
93 match ch {
94 '\\' if in_string => escape_next = true,
95 '"' => in_string = !in_string,
96 '{' if !in_string => {
97 if brace_count == 0 {
98 start_pos = Some(i);
99 }
100 brace_count += 1;
101 }
102 '}' if !in_string => {
103 brace_count -= 1;
104 if brace_count == 0 {
105 if let Some(p) = start_pos {
106 return Some(text[p..=i].to_string());
107 }
108 }
109 }
110 _ => {}
111 }
112 }
113
114 None
115 }
116
117 fn extract_by_keywords(&self, text: &str, keywords: &[String]) -> Option<String> {
119 let lower_text = text.to_lowercase();
120
121 for keyword in keywords {
122 if lower_text.contains(&keyword.to_lowercase()) {
123 return Some(keyword.clone());
125 }
126 }
127
128 None
129 }
130}
131
132impl Default for FlexibleExtractor {
133 fn default() -> Self {
134 Self::new()
135 }
136}
137
138impl ContentExtractor for FlexibleExtractor {
139 fn extract_tagged(&self, text: &str, tag: &str) -> Option<String> {
140 let pattern = format!(r"(?s)<{tag}>(.*?)</{tag}>", tag = regex::escape(tag));
142
143 if let Ok(regex) = Regex::new(&pattern) {
144 if let Some(captures) = regex.captures(text) {
145 if let Some(content) = captures.get(1) {
146 return Some(content.as_str().trim().to_string());
147 }
148 }
149 }
150
151 if self.debug_mode {
152 debug!("Failed to extract tagged content with tag: {}", tag);
153 }
154
155 None
156 }
157
158 fn extract_json_like(&self, text: &str) -> Option<String> {
159 if let Some(start) = text.find('{') {
161 if let Some(end) = text.rfind('}') {
162 if end > start {
163 return Some(text[start..=end].to_string());
164 }
165 }
166 }
167
168 if self.debug_mode {
169 debug!("Failed to extract JSON-like content");
170 }
171
172 None
173 }
174
175 fn extract_pattern(&self, text: &str, pattern: &str) -> Option<String> {
176 if let Ok(regex) = Regex::new(pattern) {
177 if let Some(captures) = regex.captures(text) {
178 if captures.len() > 1 {
180 return captures.get(1).map(|m| m.as_str().to_string());
181 } else {
182 return captures.get(0).map(|m| m.as_str().to_string());
183 }
184 }
185 }
186
187 if self.debug_mode {
188 debug!("Failed to extract with pattern: {}", pattern);
189 }
190
191 None
192 }
193}
194
/// Extractor for the contents of a fenced markdown code block.
///
/// Derives `Debug` and `Clone` so the extractor can be logged and duplicated
/// like any other plain configuration value.
#[derive(Debug, Clone)]
pub struct MarkdownCodeBlockExtractor {
    /// Language label the opening fence must carry; `None` accepts any fence.
    pub language: Option<String>,
}
200
201impl Default for MarkdownCodeBlockExtractor {
202 fn default() -> Self {
203 Self::new()
204 }
205}
206
207impl MarkdownCodeBlockExtractor {
208 pub fn new() -> Self {
210 Self { language: None }
211 }
212
213 pub fn with_language(language: String) -> Self {
215 Self {
216 language: Some(language),
217 }
218 }
219
220 pub fn extract(&self, text: &str) -> Result<String, ParseError> {
222 let pattern = if let Some(ref lang) = self.language {
223 format!(
225 r"(?m)^\s*```\s*{}\s*\n((?:.*\n)*?)^\s*```\s*$",
226 regex::escape(lang)
227 )
228 } else {
229 r"(?m)^\s*```[^\n]*\n((?:.*\n)*?)^\s*```\s*$".to_string()
231 };
232
233 let regex = Regex::new(&pattern)
234 .map_err(|e| ParseError::InvalidFormat(format!("Failed to compile regex: {}", e)))?;
235
236 if let Some(captures) = regex.captures(text) {
237 if let Some(content) = captures.get(1) {
238 let extracted = content.as_str().trim_end();
240 return Ok(extracted.to_string());
241 }
242 }
243
244 Err(ParseError::TagExtractionFailed(format!(
245 "No markdown code block found{}",
246 if let Some(ref lang) = self.language {
247 format!(" with language '{}'", lang)
248 } else {
249 String::new()
250 }
251 )))
252 }
253}
254
#[cfg(test)]
mod tests {
    use super::*;

    /// Tagged content is returned without the tags and with surrounding
    /// whitespace trimmed.
    #[test]
    fn test_extract_tagged_content() {
        let extractor = FlexibleExtractor::new();

        let plain = extractor.extract_tagged("<answer>Hello World</answer>", "answer");
        assert_eq!(plain, Some("Hello World".to_string()));

        let padded = extractor.extract_tagged("<answer>\n Hello World \n</answer>", "answer");
        assert_eq!(padded, Some("Hello World".to_string()));
    }

    /// The span from the first `{` to the last `}` is returned.
    #[test]
    fn test_extract_json_like() {
        let extractor = FlexibleExtractor::new();
        let input = "Here is some JSON: {\"key\": \"value\"} and more text";

        let expected = Some("{\"key\": \"value\"}".to_string());
        assert_eq!(extractor.extract_json_like(input), expected);
    }

    /// Only the first balanced object is returned when several are present.
    #[test]
    fn test_extract_first_json_object() {
        let extractor = FlexibleExtractor::new();
        let input = "Some text {\"first\": \"object\"} more text {\"second\": \"object\"}";

        let expected = Some("{\"first\": \"object\"}".to_string());
        assert_eq!(extractor.extract_first_json_object(input), expected);
    }

    /// Keyword matching ignores case and returns the keyword as configured.
    #[test]
    fn test_extract_by_keywords() {
        let extractor = FlexibleExtractor::new();
        let keywords: Vec<String> = ["Comfort", "Debug"]
            .iter()
            .map(|word| word.to_string())
            .collect();

        let found = extractor.extract_by_keywords("This is about comfort and support", &keywords);
        assert_eq!(found, Some("Comfort".to_string()));
    }

    /// The first strategy that succeeds wins; here the tagged-content one.
    #[test]
    fn test_extraction_strategies() {
        let extractor = FlexibleExtractor::new();
        let strategies = [
            ExtractionStrategy::TaggedContent("answer".to_string()),
            ExtractionStrategy::JsonBrackets,
            ExtractionStrategy::OriginalText,
        ];

        let outcome = extractor
            .extract_with_strategies("<answer>{\"type\": \"success\"}</answer>", &strategies);
        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap(), "{\"type\": \"success\"}");
    }
}