Skip to main content

langextract_rust/
resolver.rs

1//! Output resolution and parsing functionality.
2
3use crate::{
4    data::{FormatType, Extraction}, 
5    exceptions::{LangExtractError, LangExtractResult}, 
6    ExtractConfig
7};
8use serde_json::Value;
9use std::fs;
10use std::path::Path;
11use uuid::Uuid;
12use regex::Regex;
13
/// Switches that control how model output is validated and preserved.
#[derive(Debug, Clone)]
pub struct ValidationConfig {
    /// Turns schema validation on or off.
    pub enable_schema_validation: bool,
    /// Turns type coercion on or off (e.g., string "25" -> number 25).
    pub enable_type_coercion: bool,
    /// When set, every expected field must be present.
    pub require_all_fields: bool,
    /// When set, raw model outputs are written to files.
    pub save_raw_outputs: bool,
    /// Directory that receives raw outputs (defaults to "./raw_outputs").
    pub raw_outputs_dir: String,
    /// Quality threshold for extractions (0.0 to 1.0).
    pub quality_threshold: f32,
}

impl Default for ValidationConfig {
    /// Builds the stock configuration: schema validation, type coercion and
    /// raw-output saving are on, missing fields are tolerated, raw outputs
    /// go to `./raw_outputs`, and the quality threshold is `0.0`.
    fn default() -> Self {
        let raw_outputs_dir = String::from("./raw_outputs");

        Self {
            raw_outputs_dir,
            quality_threshold: 0.0,
            save_raw_outputs: true,
            require_all_fields: false,
            enable_type_coercion: true,
            enable_schema_validation: true,
        }
    }
}
43
/// Results of validation process
///
/// Returned next to the parsed extractions by `Resolver::validate_and_parse`.
#[derive(Debug, Clone)]
pub struct ValidationResult {
    /// Whether validation passed
    pub is_valid: bool,
    /// Validation errors encountered
    pub errors: Vec<ValidationError>,
    /// Validation warnings
    pub warnings: Vec<ValidationWarning>,
    /// Corrected/coerced data (if any)
    pub corrected_data: Option<Value>,
    /// Path to saved raw output file; `None` when no raw output file was written
    pub raw_output_file: Option<String>,
    /// Type coercion details (if any)
    pub coercion_summary: Option<CoercionSummary>,
}
60
/// Validation error details
///
/// Only `message` is mandatory; the remaining fields carry optional context.
#[derive(Debug, Clone)]
pub struct ValidationError {
    /// Error message
    pub message: String,
    /// Field path where error occurred (`None` when the error is not tied to a field)
    pub field_path: Option<String>,
    /// Expected value or type, rendered as text
    pub expected: Option<String>,
    /// Actual value found, rendered as text
    pub actual: Option<String>,
}
73
/// Validation warning details
///
/// Collected in `ValidationResult::warnings`.
#[derive(Debug, Clone)]
pub struct ValidationWarning {
    /// Warning message
    pub message: String,
    /// Field path where warning occurred (`None` when not tied to a field)
    pub field_path: Option<String>,
}
82
/// Summary of type coercion operations performed
///
/// Aggregates the individual [`CoercionDetail`] records together with
/// success/failure counts.
#[derive(Debug, Clone)]
pub struct CoercionSummary {
    /// Number of successful coercions
    pub successful_coercions: usize,
    /// Number of failed coercion attempts
    pub failed_coercions: usize,
    /// Details of each coercion attempt
    pub coercion_details: Vec<CoercionDetail>,
}
93
/// Details of a single coercion operation
///
/// Produced by `TypeCoercer::coerce_value`; one record describes one attempt
/// to turn a string value into a more specific JSON value.
#[derive(Debug, Clone)]
pub struct CoercionDetail {
    /// Field name being coerced
    pub field_name: String,
    /// Original value
    pub original_value: String,
    /// Coerced value (if successful)
    pub coerced_value: Option<Value>,
    /// Target type attempted.
    /// Note: `TypeCoercer::coerce_value` fills this with `Integer` as a
    /// placeholder when coercion is disabled or no pattern applied, so it is
    /// only meaningful together with `success` / `error_message`.
    pub target_type: CoercionTargetType,
    /// Whether coercion was successful
    pub success: bool,
    /// Error message if coercion failed
    pub error_message: Option<String>,
}
110
/// Types that can be coerced to
///
/// This is a plain tag with no payload, so it is `Copy` and can be compared,
/// hashed and used as a map/set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CoercionTargetType {
    /// Whole number.
    Integer,
    /// Floating-point number.
    Float,
    /// Boolean flag.
    Boolean,
    /// Monetary amount.
    Currency,
    /// Percentage value.
    Percentage,
    /// E-mail address.
    Email,
    /// Telephone number.
    PhoneNumber,
    /// Calendar date.
    Date,
    /// Web URL.
    Url,
}
124
/// Type coercion engine
///
/// Holds one compiled pattern per recognizable value kind; the patterns are
/// built once in `TypeCoercer::new` and reused for every coercion call.
pub struct TypeCoercer {
    // When false, `coerce_value` reports failure without trying any pattern.
    enable_coercion: bool,
    // Pre-compiled regex patterns for performance
    // Recognizes signed whole numbers.
    integer_regex: Regex,
    // Recognizes decimal / scientific-notation numbers.
    float_regex: Regex,
    // Recognizes `$`-prefixed amounts and amounts with a magnitude suffix.
    currency_regex: Regex,
    // Recognizes numbers followed by `%`.
    percentage_regex: Regex,
    // Recognizes e-mail addresses.
    email_regex: Regex,
    // Recognizes 3-3-4 digit phone numbers.
    phone_regex: Regex,
    // Recognizes date strings.
    date_regex: Regex,
    // Recognizes http/https URLs.
    url_regex: Regex,
}
138
139impl TypeCoercer {
140    /// Create a new type coercer
141    pub fn new(enable_coercion: bool) -> Self {
142        Self {
143            enable_coercion,
144            integer_regex: Regex::new(r"^[+-]?\d+$").unwrap(),
145            float_regex: Regex::new(r"^[+-]?\d*\.?\d+([eE][+-]?\d+)?$").unwrap(),
146            currency_regex: Regex::new(r"^\$+([\d,]+(?:\.\d{1,2})?)\s*(?:million|M|billion|B|thousand|K)?$|^([\d,]+(?:\.\d{1,2})?)\s*(?:million|M|billion|B|thousand|K)$").unwrap(),
147            percentage_regex: Regex::new(r"^(\d*\.?\d+)%$").unwrap(),
148            email_regex: Regex::new(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$").unwrap(),
149            phone_regex: Regex::new(r"^\(?([0-9]{3})\)?[-. ]?([0-9]{3})[-. ]?([0-9]{4})$").unwrap(),
150            date_regex: Regex::new(r"^\d{4}-\d{2}-\d{2}|\d{1,2}\/\d{1,2}\/\d{4}|\w+ \d{1,2}, \d{4}$").unwrap(),
151            url_regex: Regex::new(r"^https?://[^\s/$.?#].[^\s]*$").unwrap(),
152        }
153    }
154
155    /// Attempt to coerce a string value to a more appropriate type
156    pub fn coerce_value(&self, field_name: &str, value: &str) -> CoercionDetail {
157        if !self.enable_coercion {
158            return CoercionDetail {
159                field_name: field_name.to_string(),
160                original_value: value.to_string(),
161                coerced_value: None,
162                target_type: CoercionTargetType::Integer, // Default
163                success: false,
164                error_message: Some("Type coercion disabled".to_string()),
165            };
166        }
167
168        let trimmed_value = value.trim();
169
170        // Try different coercion types in order of specificity
171        
172        // 1. Try percentage first (very specific pattern)
173        if let Some(result) = self.try_coerce_percentage(field_name, trimmed_value) {
174            return result;
175        }
176
177        // 2. Try email (very specific pattern)
178        if let Some(result) = self.try_coerce_email(field_name, trimmed_value) {
179            return result;
180        }
181
182        // 3. Try phone number (very specific pattern)
183        if let Some(result) = self.try_coerce_phone(field_name, trimmed_value) {
184            return result;
185        }
186
187        // 4. Try URL (very specific pattern)
188        if let Some(result) = self.try_coerce_url(field_name, trimmed_value) {
189            return result;
190        }
191
192        // 5. Try date (very specific pattern)
193        if let Some(result) = self.try_coerce_date(field_name, trimmed_value) {
194            return result;
195        }
196
197        // 6. Try currency (specific patterns with $ or units)
198        if let Some(result) = self.try_coerce_currency(field_name, trimmed_value) {
199            return result;
200        }
201
202        // 7. Try integer (before boolean so "1"/"0" become integers, not booleans)
203        if let Some(result) = self.try_coerce_integer(field_name, trimmed_value) {
204            return result;
205        }
206
207        // 8. Try float (more general numeric pattern)
208        if let Some(result) = self.try_coerce_float(field_name, trimmed_value) {
209            return result;
210        }
211
212        // 9. Try boolean (keyword strings only, after numeric coercion)
213        if let Some(result) = self.try_coerce_boolean(field_name, trimmed_value) {
214            return result;
215        }
216
217        // No coercion possible
218        CoercionDetail {
219            field_name: field_name.to_string(),
220            original_value: value.to_string(),
221            coerced_value: None,
222            target_type: CoercionTargetType::Integer, // Default
223            success: false,
224            error_message: Some("No applicable coercion found".to_string()),
225        }
226    }
227
228    fn try_coerce_integer(&self, field_name: &str, value: &str) -> Option<CoercionDetail> {
229        if self.integer_regex.is_match(value) {
230            match value.parse::<i64>() {
231                Ok(num) => Some(CoercionDetail {
232                    field_name: field_name.to_string(),
233                    original_value: value.to_string(),
234                    coerced_value: Some(Value::Number(serde_json::Number::from(num))),
235                    target_type: CoercionTargetType::Integer,
236                    success: true,
237                    error_message: None,
238                }),
239                Err(e) => Some(CoercionDetail {
240                    field_name: field_name.to_string(),
241                    original_value: value.to_string(),
242                    coerced_value: None,
243                    target_type: CoercionTargetType::Integer,
244                    success: false,
245                    error_message: Some(format!("Integer parse error: {}", e)),
246                }),
247            }
248        } else {
249            None
250        }
251    }
252
253    fn try_coerce_float(&self, field_name: &str, value: &str) -> Option<CoercionDetail> {
254        if self.float_regex.is_match(value) {
255            match value.parse::<f64>() {
256                Ok(num) => Some(CoercionDetail {
257                    field_name: field_name.to_string(),
258                    original_value: value.to_string(),
259                    coerced_value: Some(Value::Number(serde_json::Number::from_f64(num).unwrap_or_else(|| serde_json::Number::from(0)))),
260                    target_type: CoercionTargetType::Float,
261                    success: true,
262                    error_message: None,
263                }),
264                Err(e) => Some(CoercionDetail {
265                    field_name: field_name.to_string(),
266                    original_value: value.to_string(),
267                    coerced_value: None,
268                    target_type: CoercionTargetType::Float,
269                    success: false,
270                    error_message: Some(format!("Float parse error: {}", e)),
271                }),
272            }
273        } else {
274            None
275        }
276    }
277
278    fn try_coerce_boolean(&self, field_name: &str, value: &str) -> Option<CoercionDetail> {
279        let lower_value = value.to_lowercase();
280        match lower_value.as_str() {
281            "true" | "yes" | "y" | "on" | "enabled" => Some(CoercionDetail {
282                field_name: field_name.to_string(),
283                original_value: value.to_string(),
284                coerced_value: Some(Value::Bool(true)),
285                target_type: CoercionTargetType::Boolean,
286                success: true,
287                error_message: None,
288            }),
289            "false" | "no" | "n" | "off" | "disabled" => Some(CoercionDetail {
290                field_name: field_name.to_string(),
291                original_value: value.to_string(),
292                coerced_value: Some(Value::Bool(false)),
293                target_type: CoercionTargetType::Boolean,
294                success: true,
295                error_message: None,
296            }),
297            _ => None,
298        }
299    }
300
301    fn try_coerce_currency(&self, field_name: &str, value: &str) -> Option<CoercionDetail> {
302        if let Some(captures) = self.currency_regex.captures(value) {
303            // Try first capture group (for $amounts), then second (for amounts with units)
304            let amount_str = captures.get(1).or_else(|| captures.get(2))?;
305            let amount_clean = amount_str.as_str().replace(",", "");
306            if let Ok(mut amount) = amount_clean.parse::<f64>() {
307                // Handle suffixes
308                let lower_value = value.to_lowercase();
309                if lower_value.contains("million") || lower_value.contains("m") {
310                    amount *= 1_000_000.0;
311                } else if lower_value.contains("billion") || lower_value.contains("b") {
312                    amount *= 1_000_000_000.0;
313                } else if lower_value.contains("thousand") || lower_value.contains("k") {
314                    amount *= 1_000.0;
315                }
316
317                return Some(CoercionDetail {
318                    field_name: field_name.to_string(),
319                    original_value: value.to_string(),
320                    coerced_value: Some(Value::Number(serde_json::Number::from_f64(amount).unwrap_or_else(|| serde_json::Number::from(0)))),
321                    target_type: CoercionTargetType::Currency,
322                    success: true,
323                    error_message: None,
324                });
325            }
326        }
327        None
328    }
329
330    fn try_coerce_percentage(&self, field_name: &str, value: &str) -> Option<CoercionDetail> {
331        if let Some(captures) = self.percentage_regex.captures(value) {
332            if let Some(percent_str) = captures.get(1) {
333                if let Ok(percent) = percent_str.as_str().parse::<f64>() {
334                    return Some(CoercionDetail {
335                        field_name: field_name.to_string(),
336                        original_value: value.to_string(),
337                        coerced_value: Some(Value::Number(serde_json::Number::from_f64(percent / 100.0).unwrap_or_else(|| serde_json::Number::from(0)))),
338                        target_type: CoercionTargetType::Percentage,
339                        success: true,
340                        error_message: None,
341                    });
342                }
343            }
344        }
345        None
346    }
347
348    fn try_coerce_email(&self, field_name: &str, value: &str) -> Option<CoercionDetail> {
349        if self.email_regex.is_match(value) {
350            Some(CoercionDetail {
351                field_name: field_name.to_string(),
352                original_value: value.to_string(),
353                coerced_value: Some(Value::Object({
354                    let mut obj = serde_json::Map::new();
355                    obj.insert("email".to_string(), Value::String(value.to_string()));
356                    obj.insert("type".to_string(), Value::String("email".to_string()));
357                    obj
358                })),
359                target_type: CoercionTargetType::Email,
360                success: true,
361                error_message: None,
362            })
363        } else {
364            None
365        }
366    }
367
368    fn try_coerce_phone(&self, field_name: &str, value: &str) -> Option<CoercionDetail> {
369        if let Some(captures) = self.phone_regex.captures(value) {
370            let area = captures.get(1)?.as_str();
371            let exchange = captures.get(2)?.as_str();
372            let number = captures.get(3)?.as_str();
373            let formatted = format!("({}) {}-{}", area, exchange, number);
374
375            Some(CoercionDetail {
376                field_name: field_name.to_string(),
377                original_value: value.to_string(),
378                coerced_value: Some(Value::Object({
379                    let mut obj = serde_json::Map::new();
380                    obj.insert("phone".to_string(), Value::String(formatted));
381                    obj.insert("area_code".to_string(), Value::String(area.to_string()));
382                    obj.insert("type".to_string(), Value::String("phone".to_string()));
383                    obj
384                })),
385                target_type: CoercionTargetType::PhoneNumber,
386                success: true,
387                error_message: None,
388            })
389        } else {
390            None
391        }
392    }
393
394    fn try_coerce_date(&self, field_name: &str, value: &str) -> Option<CoercionDetail> {
395        if self.date_regex.is_match(value) {
396            Some(CoercionDetail {
397                field_name: field_name.to_string(),
398                original_value: value.to_string(),
399                coerced_value: Some(Value::Object({
400                    let mut obj = serde_json::Map::new();
401                    obj.insert("date".to_string(), Value::String(value.to_string()));
402                    obj.insert("type".to_string(), Value::String("date".to_string()));
403                    obj
404                })),
405                target_type: CoercionTargetType::Date,
406                success: true,
407                error_message: None,
408            })
409        } else {
410            None
411        }
412    }
413
414    fn try_coerce_url(&self, field_name: &str, value: &str) -> Option<CoercionDetail> {
415        if self.url_regex.is_match(value) {
416            Some(CoercionDetail {
417                field_name: field_name.to_string(),
418                original_value: value.to_string(),
419                coerced_value: Some(Value::Object({
420                    let mut obj = serde_json::Map::new();
421                    obj.insert("url".to_string(), Value::String(value.to_string()));
422                    obj.insert("type".to_string(), Value::String("url".to_string()));
423                    obj
424                })),
425                target_type: CoercionTargetType::Url,
426                success: true,
427                error_message: None,
428            })
429        } else {
430            None
431        }
432    }
433}
434
/// Resolver for parsing language model outputs with validation
///
/// Built via `Resolver::new` or `Resolver::with_validation_config`.
pub struct Resolver {
    /// Whether to expect fenced output
    fence_output: bool,
    /// Output format type (copied from the `ExtractConfig` at construction)
    format_type: FormatType,
    /// Validation configuration
    validation_config: ValidationConfig,
    /// Type coercion engine, enabled according to
    /// `validation_config.enable_type_coercion`
    type_coercer: TypeCoercer,
}
446
447impl Resolver {
448    /// Create a new resolver
449    pub fn new(config: &ExtractConfig, fence_output: bool) -> LangExtractResult<Self> {
450        let validation_config = ValidationConfig {
451            save_raw_outputs: config.debug, // Enable for debug mode by default
452            ..Default::default()
453        };
454
455        // Create raw outputs directory if it doesn't exist
456        if validation_config.save_raw_outputs {
457            if let Err(e) = fs::create_dir_all(&validation_config.raw_outputs_dir) {
458                log::warn!("Failed to create raw outputs directory: {}", e);
459            }
460        }
461
462        let type_coercer = TypeCoercer::new(validation_config.enable_type_coercion);
463
464        Ok(Self {
465            fence_output,
466            format_type: config.format_type,
467            validation_config,
468            type_coercer,
469        })
470    }
471
472    /// Create a new resolver with custom validation config
473    pub fn with_validation_config(
474        config: &ExtractConfig, 
475        fence_output: bool, 
476        validation_config: ValidationConfig
477    ) -> LangExtractResult<Self> {
478        // Create raw outputs directory if it doesn't exist
479        if validation_config.save_raw_outputs {
480            if let Err(e) = fs::create_dir_all(&validation_config.raw_outputs_dir) {
481                log::warn!("Failed to create raw outputs directory: {}", e);
482            }
483        }
484
485        let type_coercer = TypeCoercer::new(validation_config.enable_type_coercion);
486
487        Ok(Self {
488            fence_output,
489            format_type: config.format_type,
490            validation_config,
491            type_coercer,
492        })
493    }
494
    /// Get whether this resolver expects fenced output
    ///
    /// Returns the flag that was passed to the constructor.
    pub fn fence_output(&self) -> bool {
        self.fence_output
    }
499
500    /// Save raw model output to a file for debugging/recovery
501    pub fn save_raw_output(&self, raw_output: &str, metadata: Option<&str>) -> LangExtractResult<String> {
502        if !self.validation_config.save_raw_outputs {
503            return Err(LangExtractError::configuration("Raw output saving is disabled"));
504        }
505
506        // Ensure output directory exists
507        let output_dir = Path::new(&self.validation_config.raw_outputs_dir);
508        if !output_dir.exists() {
509            fs::create_dir_all(output_dir).map_err(|e| {
510                LangExtractError::IoError(e)
511            })?;
512        }
513
514        let timestamp = chrono::Utc::now().format("%Y%m%d_%H%M%S").to_string();
515        let unique_id = Uuid::new_v4().to_string()[..8].to_string();
516        let filename = format!("raw_output_{}_{}.txt", timestamp, unique_id);
517        let filepath = output_dir.join(&filename);
518
519        let mut content = String::new();
520        content.push_str(&format!("=== Raw Model Output ===\n"));
521        content.push_str(&format!("Timestamp: {}\n", chrono::Utc::now().to_rfc3339()));
522        if let Some(meta) = metadata {
523            content.push_str(&format!("Metadata: {}\n", meta));
524        }
525        content.push_str(&format!("Format: {:?}\n", self.format_type));
526        content.push_str(&format!("Content Length: {} chars\n", raw_output.len()));
527        content.push_str(&format!("=== Output Content ===\n"));
528        content.push_str(raw_output);
529        content.push_str("\n=== End Output ===\n");
530
531        fs::write(&filepath, content).map_err(|e| {
532            LangExtractError::IoError(e)
533        })?;
534
535        let path_str = filepath.to_string_lossy().to_string();
536        log::info!("Saved raw output to: {}", path_str);
537        Ok(path_str)
538    }
539
540    /// Validate and parse model response with raw data preservation
541    #[tracing::instrument(skip_all, fields(response_len = raw_response.len(), num_expected_fields = expected_fields.len()))]
542    pub fn validate_and_parse(&self, raw_response: &str, expected_fields: &[String]) -> LangExtractResult<(Vec<Extraction>, ValidationResult)> {
543        // Step 1: Always save raw output first if enabled
544        let raw_file_path = if self.validation_config.save_raw_outputs {
545            match self.save_raw_output(raw_response, Some("validation_parse")) {
546                Ok(path) => {
547                    log::debug!("Raw output saved to: {}", path);
548                    Some(path)
549                }
550                Err(e) => {
551                    log::warn!("Failed to save raw output: {}", e);
552                    None
553                }
554            }
555        } else {
556            None
557        };
558
559        // Step 2: Attempt to parse the response with enhanced cleaning and repair
560        log::debug!("Parsing model response...");
561        let parse_result = self.parse_response_with_repair(raw_response, expected_fields);
562        
563        // Step 3: Validate the parsed data
564        let mut validation_result = match &parse_result {
565            Ok(extractions) => {
566                log::debug!("Successfully parsed {} potential extractions", extractions.len());
567                self.validate_extractions(extractions, expected_fields)
568            }
569            Err(parse_error) => {
570                log::debug!("Failed to parse model response");
571                // If parsing failed, create validation result with error
572                ValidationResult {
573                    is_valid: false,
574                    errors: vec![ValidationError {
575                        message: format!("Failed to parse response: {}", parse_error),
576                        field_path: None,
577                        expected: Some("Valid JSON structure".to_string()),
578                        actual: Some("Unparseable content".to_string()),
579                    }],
580                    warnings: vec![],
581                    corrected_data: None,
582                    raw_output_file: raw_file_path.clone(), // Set the path here
583                    coercion_summary: None,
584                }
585            }
586        };
587
588        // Step 4: Set the raw output file path in the validation result (update if not already set)
589        if validation_result.raw_output_file.is_none() {
590            validation_result.raw_output_file = raw_file_path.clone();
591        }
592
593        // Step 5: Return results - even if validation fails, we preserve the raw data
594        match parse_result {
595            Ok(extractions) => Ok((extractions, validation_result)),
596            Err(e) => {
597                // Improved error reporting
598                match &validation_result.raw_output_file {
599                    Some(path) => {
600                        log::warn!("Parse failed but raw data saved to: {}", path);
601                        log::warn!("Parse failed - check raw output at: {}", path);
602                    }
603                    None => {
604                        log::warn!("Parse failed and no raw data was saved");
605                        log::warn!("Parse failed and raw data could not be saved");
606                    }
607                }
608                Err(e)
609            }
610        }
611    }
612
613    /// Clean and preprocess response before parsing
614    fn clean_response(&self, response: &str) -> String {
615        let mut cleaned = response.to_string();
616
617        // Simple approach: remove code fence markers step by step
618        // First, handle fenced blocks with language specifiers
619        cleaned = cleaned.replace("```json", "");
620        cleaned = cleaned.replace("```yaml", "");
621        cleaned = cleaned.replace("```python", "");
622        cleaned = cleaned.replace("```javascript", "");
623        cleaned = cleaned.replace("```rust", "");
624        cleaned = cleaned.replace("```", "");
625
626        // Trim whitespace
627        cleaned.trim().to_string()
628    }
629
630    /// Detect and repair malformed JSON where multiple extraction classes are crammed into a single extraction_text
631    fn detect_and_repair_malformed_json(&self, json: &serde_json::Value, expected_fields: &[String]) -> Option<serde_json::Value> {
632        // Check if this looks like malformed JSON where multiple classes are in a single extraction_text
633        if let Some(obj) = json.as_object() {
634            // Only proceed if we have exactly one key-value pair
635            if obj.len() == 1 {
636                if let Some((single_key, single_value)) = obj.iter().next() {
637                    if let Some(extraction_text) = single_value.as_str() {
638                        // Check if the extraction_text contains multiple expected field names
639                        let mut found_fields = Vec::new();
640
641                        for field in expected_fields {
642                            // Look for patterns like "field_name: value" or "field_name - value" or "field_name=value"
643                            let patterns = [
644                                format!(r"(?i){}[:\-=]\s*([^\n\r,]*)", regex::escape(field)),
645                                format!(r"(?i){}\s*[:\-=]\s*([^\n\r,]*)", regex::escape(field)),
646                                format!(r"(?i){}[:\-=]\s*([^,\n\r]+)", regex::escape(field)),
647                            ];
648
649                            for pattern in &patterns {
650                                if let Ok(regex) = Regex::new(pattern) {
651                                    if regex.is_match(extraction_text) {
652                                        found_fields.push(field.clone());
653                                        break; // Found this field, move to next
654                                    }
655                                }
656                            }
657                        }
658
659                        // If we found multiple expected fields in the single extraction_text,
660                        // this is likely malformed and should be re-parsed
661                        if found_fields.len() > 1 {
662                            log::debug!("Detected malformed JSON: {} extraction classes found in single extraction_text '{}'",
663                                     found_fields.len(), single_key);
664
665                            // Try to extract individual field values
666                            let mut repaired_obj = serde_json::Map::new();
667
668                            for field in &found_fields {
669                                let patterns = [
670                                    format!(r"(?i){}[:\-=]\s*([^\n\r,]*)", regex::escape(field)),
671                                    format!(r"(?i){}\s*[:\-=]\s*([^\n\r,]*)", regex::escape(field)),
672                                ];
673
674                                for pattern in &patterns {
675                                    if let Ok(regex) = Regex::new(pattern) {
676                                        if let Some(captures) = regex.captures(extraction_text) {
677                                            if let Some(value_match) = captures.get(1) {
678                                                let value = value_match.as_str().trim();
679                                                if !value.is_empty() {
680                                                    repaired_obj.insert(field.clone(), serde_json::Value::String(value.to_string()));
681                                                    break;
682                                                }
683                                            }
684                                        }
685                                    }
686                                }
687                            }
688
689                            if !repaired_obj.is_empty() {
690                                log::debug!("Successfully repaired malformed JSON, extracted {} fields", repaired_obj.len());
691                                return Some(serde_json::Value::Object(repaired_obj));
692                            }
693                        }
694                    }
695                }
696            }
697        }
698
699        None // No repair needed
700    }
701
702    /// Parse response with enhanced cleaning and repair capabilities
703    fn parse_response_with_repair(&self, response: &str, expected_fields: &[String]) -> LangExtractResult<Vec<Extraction>> {
704        // First, clean the response (remove code fences, etc.)
705        let cleaned_response = self.clean_response(response);
706        log::debug!("Cleaned response length: {} chars", cleaned_response.len());
707
708        // Try to parse as JSON first
709        if let Ok(json_value) = serde_json::from_str::<serde_json::Value>(&cleaned_response) {
710            log::debug!("Parsed JSON successfully");
711
712            // Check if the JSON needs repair (malformed case with multiple classes in single extraction_text)
713            if let Some(repaired_json) = self.detect_and_repair_malformed_json(&json_value, expected_fields) {
714                log::debug!("Applied JSON repair logic");
715                return self.parse_json_response(&repaired_json);
716            } else {
717                return self.parse_json_response(&json_value);
718            }
719        }
720
721        // If that fails, try to extract JSON from the response (in case it's wrapped)
722        if let Some(json_start) = cleaned_response.find('{') {
723            if let Some(json_end) = cleaned_response.rfind('}') {
724                let json_str = &cleaned_response[json_start..=json_end];
725                if let Ok(json_value) = serde_json::from_str::<serde_json::Value>(json_str) {
726                    log::debug!("Extracted and parsed JSON from wrapped content");
727
728                    // Check if the extracted JSON needs repair
729                    if let Some(repaired_json) = self.detect_and_repair_malformed_json(&json_value, expected_fields) {
730                        log::debug!("Applied JSON repair logic to extracted content");
731                        return self.parse_json_response(&repaired_json);
732                    } else {
733                        return self.parse_json_response(&json_value);
734                    }
735                }
736            }
737        }
738
739        Err(LangExtractError::parsing(
740            format!("Could not parse response as JSON after cleaning: {}", cleaned_response)
741        ))
742    }
743
744
745
746    /// Parse JSON response into extractions
747    fn parse_json_response(&self, json: &serde_json::Value) -> LangExtractResult<Vec<Extraction>> {
748        let mut extractions = Vec::new();
749
750        // Handle array at top level
751        if let Some(array) = json.as_array() {
752            for (index, item) in array.iter().enumerate() {
753                extractions.extend(self.parse_single_item(item, Some(index))?);
754            }
755            return Ok(extractions);
756        }
757
758        // Handle object with data/results wrapper
759        if let Some(obj) = json.as_object() {
760            if let Some(data_array) = obj.get("data").and_then(|v| v.as_array()) {
761                for (index, item) in data_array.iter().enumerate() {
762                    extractions.extend(self.parse_single_item(item, Some(index))?);
763                }
764                return Ok(extractions);
765            }
766            if let Some(results_array) = obj.get("results").and_then(|v| v.as_array()) {
767                for (index, item) in results_array.iter().enumerate() {
768                    extractions.extend(self.parse_single_item(item, Some(index))?);
769                }
770                return Ok(extractions);
771            }
772
773            // Handle flat JSON structure like {"name": "John", "age": "25"}
774            extractions.extend(self.parse_single_item(json, None)?);
775        }
776
777        Ok(extractions)
778    }
779
780    /// Parse a single item (object or primitive) into extractions
781    fn parse_single_item(&self, item: &serde_json::Value, index: Option<usize>) -> LangExtractResult<Vec<Extraction>> {
782        let mut extractions = Vec::new();
783
784        match item {
785            Value::Object(obj) => {
786                for (key, value) in obj {
787                    let extraction_text = match value {
788                        Value::String(s) => s.clone(),
789                        Value::Number(n) => n.to_string(),
790                        Value::Bool(b) => b.to_string(),
791                        Value::Array(_) | Value::Object(_) => value.to_string(),
792                        Value::Null => continue,
793                    };
794
795                    let mut extraction = Extraction::new(key.clone(), extraction_text);
796                    if let Some(idx) = index {
797                        extraction.group_index = Some(idx);
798                    }
799                    extractions.push(extraction);
800                }
801            }
802            Value::String(s) => {
803                let extraction_class = if let Some(idx) = index {
804                    format!("item_{}", idx)
805                } else {
806                    "text".to_string()
807                };
808                extractions.push(Extraction::new(extraction_class, s.clone()));
809            }
810            _ => {
811                return Err(LangExtractError::parsing(
812                    format!("Unsupported item type: {:?}", item)
813                ));
814            }
815        }
816
817        Ok(extractions)
818    }
819
820    /// Validate extractions against expected schema
821    fn validate_extractions(&self, extractions: &[Extraction], expected_fields: &[String]) -> ValidationResult {
822        let mut errors = Vec::new();
823        let mut warnings = Vec::new();
824        let mut is_valid = true;
825        let mut coercion_details = Vec::new();
826
827        // Check for required fields if enabled
828        if self.validation_config.require_all_fields {
829            let extraction_classes: std::collections::HashSet<_> = 
830                extractions.iter().map(|e| &e.extraction_class).collect();
831            
832            for expected_field in expected_fields {
833                if !extraction_classes.contains(expected_field) {
834                    errors.push(ValidationError {
835                        message: format!("Required field '{}' is missing", expected_field),
836                        field_path: Some(expected_field.clone()),
837                        expected: Some("Present".to_string()),
838                        actual: Some("Missing".to_string()),
839                    });
840                    is_valid = false;
841                }
842            }
843        }
844
845        // Validate individual extractions and attempt type coercion
846        for extraction in extractions {
847            // Check for empty extraction text
848            if extraction.extraction_text.trim().is_empty() {
849                warnings.push(ValidationWarning {
850                    message: format!("Empty extraction text for field '{}'", extraction.extraction_class),
851                    field_path: Some(extraction.extraction_class.clone()),
852                });
853            }
854
855            // Check extraction text length
856            if extraction.extraction_text.len() > 1000 {
857                warnings.push(ValidationWarning {
858                    message: format!("Very long extraction text ({} chars) for field '{}'", 
859                        extraction.extraction_text.len(), extraction.extraction_class),
860                    field_path: Some(extraction.extraction_class.clone()),
861                });
862            }
863
864            // Attempt type coercion if enabled
865            if self.validation_config.enable_type_coercion {
866                let coercion_result = self.type_coercer.coerce_value(
867                    &extraction.extraction_class, 
868                    &extraction.extraction_text
869                );
870                coercion_details.push(coercion_result);
871            }
872        }
873
874        // Quality check - too few extractions might indicate poor model performance
875        if extractions.len() < expected_fields.len() / 2 {
876            warnings.push(ValidationWarning {
877                message: format!("Low extraction count: found {} but expected around {}", 
878                    extractions.len(), expected_fields.len()),
879                field_path: None,
880            });
881        }
882
883        // Build corrected data from coerced values
884        let corrected_data = if !coercion_details.is_empty() && coercion_details.iter().any(|d| d.success) {
885            let mut corrected_obj = serde_json::Map::new();
886            
887            for detail in &coercion_details {
888                if detail.success {
889                    if let Some(ref coerced_value) = detail.coerced_value {
890                        corrected_obj.insert(detail.field_name.clone(), coerced_value.clone());
891                    }
892                } else {
893                    // Keep original value as string if coercion failed
894                    corrected_obj.insert(detail.field_name.clone(), Value::String(detail.original_value.clone()));
895                }
896            }
897            
898            Some(Value::Object(corrected_obj))
899        } else {
900            None
901        };
902
903        // Create coercion summary
904        let coercion_summary = if !coercion_details.is_empty() {
905            let successful_coercions = coercion_details.iter().filter(|d| d.success).count();
906            let failed_coercions = coercion_details.len() - successful_coercions;
907            
908            Some(CoercionSummary {
909                successful_coercions,
910                failed_coercions,
911                coercion_details,
912            })
913        } else {
914            None
915        };
916
917        ValidationResult {
918            is_valid: is_valid && errors.is_empty(),
919            errors,
920            warnings,
921            corrected_data,
922            raw_output_file: None, // Set by caller
923            coercion_summary,
924        }
925    }
926}
927
928#[cfg(test)]
929mod tests {
930    use super::*;
931    use crate::ExtractConfig;
932    use std::fs;
933    use tempfile::TempDir;
934
935    fn create_test_config() -> ExtractConfig {
936        ExtractConfig {
937            debug: true,
938            ..Default::default()
939        }
940    }
941
942    fn create_test_resolver() -> Resolver {
943        let config = create_test_config();
944        Resolver::new(&config, true).unwrap()
945    }
946
947    fn create_test_resolver_with_temp_dir(temp_dir: &TempDir) -> Resolver {
948        let config = create_test_config();
949        let validation_config = ValidationConfig {
950            save_raw_outputs: true,
951            raw_outputs_dir: temp_dir.path().to_string_lossy().to_string(),
952            ..Default::default()
953        };
954        Resolver::with_validation_config(&config, true, validation_config).unwrap()
955    }
956
957    #[test]
958    fn test_validation_config_default() {
959        let config = ValidationConfig::default();
960        assert!(config.enable_schema_validation);
961        assert!(config.enable_type_coercion);
962        assert!(!config.require_all_fields);
963        assert!(config.save_raw_outputs);
964        assert_eq!(config.raw_outputs_dir, "./raw_outputs");
965        assert_eq!(config.quality_threshold, 0.0);
966    }
967
968    #[test]
969    fn test_raw_output_saving() {
970        let temp_dir = TempDir::new().unwrap();
971        let resolver = create_test_resolver_with_temp_dir(&temp_dir);
972        
973        let test_output = r#"{"person": "John Doe", "age": "30"}"#;
974        let result = resolver.save_raw_output(test_output, Some("test_metadata"));
975        
976        assert!(result.is_ok());
977        let file_path = result.unwrap();
978        assert!(std::path::Path::new(&file_path).exists());
979        
980        let content = fs::read_to_string(&file_path).unwrap();
981        assert!(content.contains("Raw Model Output"));
982        assert!(content.contains("test_metadata"));
983        assert!(content.contains(test_output));
984    }
985
986    #[test]
987    fn test_parse_valid_json() {
988        let resolver = create_test_resolver();
989        let json_response = r#"[{"person": "John Doe", "age": "30"}]"#;
990        let expected_fields = vec!["person".to_string(), "age".to_string()];
991
992        let result = resolver.parse_response_with_repair(json_response, &expected_fields);
993        assert!(result.is_ok());
994        
995        let extractions = result.unwrap();
996        assert_eq!(extractions.len(), 2);
997        
998        // Check that we have both fields (order may vary)
999        let classes: std::collections::HashSet<_> = extractions.iter()
1000            .map(|e| e.extraction_class.as_str()).collect();
1001        assert!(classes.contains("person"));
1002        assert!(classes.contains("age"));
1003        
1004        // Check the values
1005        let person_extraction = extractions.iter().find(|e| e.extraction_class == "person").unwrap();
1006        assert_eq!(person_extraction.extraction_text, "John Doe");
1007        let age_extraction = extractions.iter().find(|e| e.extraction_class == "age").unwrap();
1008        assert_eq!(age_extraction.extraction_text, "30");
1009    }
1010
1011    #[test]
1012    fn test_parse_wrapped_json() {
1013        let resolver = create_test_resolver();
1014        let json_response = r#"{"data": [{"name": "Alice", "city": "NYC"}]}"#;
1015        let expected_fields = vec!["name".to_string(), "city".to_string()];
1016
1017        let result = resolver.parse_response_with_repair(json_response, &expected_fields);
1018        assert!(result.is_ok());
1019        
1020        let extractions = result.unwrap();
1021        assert_eq!(extractions.len(), 2);
1022        
1023        // Check that we have both fields (order may vary)
1024        let classes: std::collections::HashSet<_> = extractions.iter()
1025            .map(|e| e.extraction_class.as_str()).collect();
1026        assert!(classes.contains("name"));
1027        assert!(classes.contains("city"));
1028        
1029        // Check the values
1030        let name_extraction = extractions.iter().find(|e| e.extraction_class == "name").unwrap();
1031        assert_eq!(name_extraction.extraction_text, "Alice");
1032        let city_extraction = extractions.iter().find(|e| e.extraction_class == "city").unwrap();
1033        assert_eq!(city_extraction.extraction_text, "NYC");
1034    }
1035
1036    #[test]
1037    fn test_parse_invalid_json() {
1038        let resolver = create_test_resolver();
1039        let invalid_response = r#"This is not JSON at all!"#;
1040        let expected_fields = vec!["name".to_string()];
1041
1042        let result = resolver.parse_response_with_repair(invalid_response, &expected_fields);
1043        assert!(result.is_err());
1044    }
1045
1046    #[test]
1047    fn test_validation_required_fields() {
1048        let resolver = create_test_resolver();
1049        let extractions = vec![
1050            Extraction::new("person".to_string(), "John".to_string()),
1051        ];
1052        let expected_fields = vec!["person".to_string(), "age".to_string()];
1053        
1054        // Test with require_all_fields = false (default)
1055        let result = resolver.validate_extractions(&extractions, &expected_fields);
1056        assert!(result.is_valid); // Should pass because require_all_fields is false
1057        
1058        // Test with require_all_fields = true
1059        let config = create_test_config();
1060        let validation_config = ValidationConfig {
1061            require_all_fields: true,
1062            save_raw_outputs: false,
1063            ..Default::default()
1064        };
1065        let resolver = Resolver::with_validation_config(&config, true, validation_config).unwrap();
1066        let result = resolver.validate_extractions(&extractions, &expected_fields);
1067        assert!(!result.is_valid); // Should fail because "age" is missing
1068        assert_eq!(result.errors.len(), 1);
1069        assert!(result.errors[0].message.contains("age"));
1070    }
1071
1072    #[test]
1073    fn test_validation_empty_extractions() {
1074        let resolver = create_test_resolver();
1075        let extractions = vec![
1076            Extraction::new("person".to_string(), "".to_string()), // Empty text
1077            Extraction::new("age".to_string(), "25".to_string()),
1078        ];
1079        let expected_fields = vec!["person".to_string(), "age".to_string()];
1080        
1081        let result = resolver.validate_extractions(&extractions, &expected_fields);
1082        assert!(result.is_valid); // Valid despite warnings
1083        assert_eq!(result.warnings.len(), 1); // One warning for empty text
1084        assert!(result.warnings[0].message.contains("Empty extraction text"));
1085    }
1086
1087    #[test]
1088    fn test_validation_low_extraction_count() {
1089        let resolver = create_test_resolver();
1090        let extractions = vec![
1091            Extraction::new("person".to_string(), "John".to_string()),
1092        ];
1093        let expected_fields = vec![
1094            "person".to_string(), 
1095            "age".to_string(), 
1096            "city".to_string(), 
1097            "email".to_string()
1098        ]; // 4 expected, only 1 found
1099        
1100        let result = resolver.validate_extractions(&extractions, &expected_fields);
1101        assert!(result.is_valid); // Still valid, just warnings
1102        assert!(!result.warnings.is_empty());
1103        assert!(result.warnings.iter().any(|w| w.message.contains("Low extraction count")));
1104    }
1105
1106    #[test]
1107    fn test_validate_and_parse_success() {
1108        let temp_dir = TempDir::new().unwrap();
1109        let resolver = create_test_resolver_with_temp_dir(&temp_dir);
1110        
1111        let valid_json = r#"{"person": "John Doe", "age": "30"}"#;
1112        let expected_fields = vec!["person".to_string(), "age".to_string()];
1113        
1114        let result = resolver.validate_and_parse(valid_json, &expected_fields);
1115        assert!(result.is_ok());
1116        
1117        let (extractions, validation_result) = result.unwrap();
1118        assert_eq!(extractions.len(), 2);
1119        assert!(validation_result.is_valid);
1120        assert!(validation_result.raw_output_file.is_some());
1121        
1122        // Verify the raw output file was created
1123        let raw_file = validation_result.raw_output_file.unwrap();
1124        assert!(std::path::Path::new(&raw_file).exists());
1125    }
1126
1127    #[test]
1128    fn test_validate_and_parse_parse_failure() {
1129        let temp_dir = TempDir::new().unwrap();
1130        let resolver = create_test_resolver_with_temp_dir(&temp_dir);
1131
1132        let invalid_json = "This is definitely not JSON!";
1133        let expected_fields = vec!["person".to_string()];
1134
1135        let result = resolver.validate_and_parse(invalid_json, &expected_fields);
1136        assert!(result.is_err());
1137        // Raw output should still be saved even on parse failure
1138    }
1139
1140    #[test]
1141    fn test_clean_response_removes_code_fences() {
1142        let temp_dir = TempDir::new().unwrap();
1143        let resolver = create_test_resolver_with_temp_dir(&temp_dir);
1144
1145        // Test various code fence patterns
1146        let test_cases = vec![
1147            (r#"```json{"name": "John"}```"#, r#"{"name": "John"}"#),
1148            (r#"```yaml{"name": "John"}```"#, r#"{"name": "John"}"#),
1149            (r#"```{"name": "John"}```"#, r#"{"name": "John"}"#),
1150            (r#"```python{"name": "John"}```"#, r#"{"name": "John"}"#),
1151            (r#"Some text ```json{"name": "John"}``` more text"#, r#"Some text {"name": "John"} more text"#),
1152        ];
1153
1154        for (input, expected) in test_cases {
1155            let cleaned = resolver.clean_response(input);
1156            assert_eq!(cleaned, expected, "Failed to clean: {}", input);
1157        }
1158    }
1159
1160    #[test]
1161    fn test_detect_and_repair_malformed_json() {
1162        let temp_dir = TempDir::new().unwrap();
1163        let resolver = create_test_resolver_with_temp_dir(&temp_dir);
1164        let expected_fields = vec!["name".to_string(), "age".to_string(), "city".to_string()];
1165
1166        // Test case: malformed JSON with multiple classes in single extraction_text
1167        let malformed_json: serde_json::Value = serde_json::json!({
1168            "person": "name: John Doe, age: 30, city: New York"
1169        });
1170
1171        let repaired = resolver.detect_and_repair_malformed_json(&malformed_json, &expected_fields);
1172        assert!(repaired.is_some(), "Should detect malformed JSON");
1173
1174        let repaired = repaired.unwrap();
1175        if let Some(obj) = repaired.as_object() {
1176            // Should extract name, age, and city as separate fields
1177            assert!(obj.contains_key("name"), "Should extract name field");
1178            assert!(obj.contains_key("age"), "Should extract age field");
1179            assert!(obj.contains_key("city"), "Should extract city field");
1180
1181            assert_eq!(obj.get("name").unwrap().as_str().unwrap(), "John Doe");
1182            assert_eq!(obj.get("age").unwrap().as_str().unwrap(), "30");
1183            assert_eq!(obj.get("city").unwrap().as_str().unwrap(), "New York");
1184        } else {
1185            panic!("Repaired JSON should be an object");
1186        }
1187
1188        // Test case: well-formed JSON should not be repaired
1189        let well_formed_json: serde_json::Value = serde_json::json!({
1190            "name": "John Doe",
1191            "age": "30",
1192            "city": "New York"
1193        });
1194
1195        let repaired = resolver.detect_and_repair_malformed_json(&well_formed_json, &expected_fields);
1196        assert!(repaired.is_none(), "Well-formed JSON should not be repaired");
1197    }
1198
1199    #[test]
1200    fn test_parse_response_with_code_fences() {
1201        let temp_dir = TempDir::new().unwrap();
1202        let resolver = create_test_resolver_with_temp_dir(&temp_dir);
1203        let expected_fields = vec!["name".to_string(), "age".to_string()];
1204
1205        // Test parsing response wrapped in code fences
1206        let fenced_response = r#"```json
1207{
1208  "name": "Alice",
1209  "age": "25"
1210}
1211```"#;
1212
1213        let result = resolver.parse_response_with_repair(fenced_response, &expected_fields);
1214        assert!(result.is_ok(), "Should parse fenced JSON successfully");
1215
1216        let extractions = result.unwrap();
1217        assert_eq!(extractions.len(), 2, "Should extract 2 fields");
1218
1219        let names: Vec<_> = extractions.iter().filter(|e| e.extraction_class == "name").collect();
1220        let ages: Vec<_> = extractions.iter().filter(|e| e.extraction_class == "age").collect();
1221
1222        assert_eq!(names.len(), 1);
1223        assert_eq!(ages.len(), 1);
1224        assert_eq!(names[0].extraction_text, "Alice");
1225        assert_eq!(ages[0].extraction_text, "25");
1226    }
1227
1228    #[test]
1229    fn test_parse_response_with_malformed_repair() {
1230        let temp_dir = TempDir::new().unwrap();
1231        let resolver = create_test_resolver_with_temp_dir(&temp_dir);
1232        let expected_fields = vec!["name".to_string(), "age".to_string(), "profession".to_string()];
1233
1234        // Test parsing malformed response that should be repaired
1235        let malformed_response = r#"{
1236  "person": "name: Bob Smith, age: 35, profession: engineer"
1237}"#;
1238
1239        let result = resolver.parse_response_with_repair(malformed_response, &expected_fields);
1240        assert!(result.is_ok(), "Should parse and repair malformed JSON successfully");
1241
1242        let extractions = result.unwrap();
1243        assert_eq!(extractions.len(), 3, "Should extract 3 separate fields after repair");
1244
1245        let name_found = extractions.iter().any(|e| e.extraction_class == "name" && e.extraction_text == "Bob Smith");
1246        let age_found = extractions.iter().any(|e| e.extraction_class == "age" && e.extraction_text == "35");
1247        let profession_found = extractions.iter().any(|e| e.extraction_class == "profession" && e.extraction_text == "engineer");
1248
1249        assert!(name_found, "Should find extracted name");
1250        assert!(age_found, "Should find extracted age");
1251        assert!(profession_found, "Should find extracted profession");
1252    }
1253
1254    // Type Coercion Tests
1255    mod type_coercion_tests {
1256        use super::*;
1257
1258        fn create_coercion_resolver() -> Resolver {
1259            let config = create_test_config();
1260            let validation_config = ValidationConfig {
1261                enable_type_coercion: true,
1262                save_raw_outputs: false,
1263                ..Default::default()
1264            };
1265            Resolver::with_validation_config(&config, true, validation_config).unwrap()
1266        }
1267
1268        #[test]
1269        fn test_integer_coercion() {
1270            let resolver = create_coercion_resolver();
1271            let extractions = vec![
1272                Extraction::new("age".to_string(), "25".to_string()),
1273                Extraction::new("count".to_string(), "-10".to_string()),
1274                Extraction::new("year".to_string(), "2024".to_string()),
1275            ];
1276            let expected_fields = vec!["age".to_string(), "count".to_string(), "year".to_string()];
1277            
1278            let result = resolver.validate_extractions(&extractions, &expected_fields);
1279            assert!(result.is_valid);
1280            
1281            let coercion_summary = result.coercion_summary.unwrap();
1282            assert_eq!(coercion_summary.successful_coercions, 3);
1283            assert_eq!(coercion_summary.failed_coercions, 0);
1284            
1285            // Check specific coercions
1286            let age_coercion = coercion_summary.coercion_details.iter()
1287                .find(|d| d.field_name == "age").unwrap();
1288            assert!(age_coercion.success);
1289            assert_eq!(age_coercion.target_type, CoercionTargetType::Integer);
1290            assert_eq!(age_coercion.coerced_value.as_ref().unwrap().as_i64().unwrap(), 25);
1291        }
1292
1293        #[test]
1294        fn test_float_coercion() {
1295            let resolver = create_coercion_resolver();
1296            let extractions = vec![
1297                Extraction::new("score".to_string(), "94.7".to_string()),
1298                Extraction::new("percentage".to_string(), "-12.5".to_string()),
1299                Extraction::new("scientific".to_string(), "1.23e-4".to_string()),
1300            ];
1301            let expected_fields = vec!["score".to_string(), "percentage".to_string(), "scientific".to_string()];
1302            
1303            let result = resolver.validate_extractions(&extractions, &expected_fields);
1304            assert!(result.is_valid);
1305            
1306            let coercion_summary = result.coercion_summary.unwrap();
1307            assert_eq!(coercion_summary.successful_coercions, 3);
1308            
1309            let score_coercion = coercion_summary.coercion_details.iter()
1310                .find(|d| d.field_name == "score").unwrap();
1311            assert!(score_coercion.success);
1312            assert_eq!(score_coercion.target_type, CoercionTargetType::Float);
1313            assert!((score_coercion.coerced_value.as_ref().unwrap().as_f64().unwrap() - 94.7).abs() < 0.01);
1314        }
1315
1316        #[test]
1317        fn test_boolean_coercion() {
1318            let resolver = create_coercion_resolver();
1319            let extractions = vec![
1320                Extraction::new("active".to_string(), "true".to_string()),
1321                Extraction::new("enabled".to_string(), "yes".to_string()),
1322                Extraction::new("disabled".to_string(), "false".to_string()),
1323                Extraction::new("off".to_string(), "no".to_string()),
1324                Extraction::new("binary".to_string(), "1".to_string()),
1325                Extraction::new("zero".to_string(), "0".to_string()),
1326            ];
1327            let expected_fields = vec!["active".to_string(), "enabled".to_string(), "disabled".to_string(), "off".to_string(), "binary".to_string(), "zero".to_string()];
1328
1329            let result = resolver.validate_extractions(&extractions, &expected_fields);
1330            assert!(result.is_valid);
1331
1332            let coercion_summary = result.coercion_summary.unwrap();
1333            assert_eq!(coercion_summary.successful_coercions, 6);
1334
1335            let active_coercion = coercion_summary.coercion_details.iter()
1336                .find(|d| d.field_name == "active").unwrap();
1337            assert!(active_coercion.success);
1338            assert_eq!(active_coercion.target_type, CoercionTargetType::Boolean);
1339            assert_eq!(active_coercion.coerced_value.as_ref().unwrap().as_bool().unwrap(), true);
1340
1341            // "1" and "0" should now coerce to integers, not booleans
1342            let binary_coercion = coercion_summary.coercion_details.iter()
1343                .find(|d| d.field_name == "binary").unwrap();
1344            assert!(binary_coercion.success);
1345            assert_eq!(binary_coercion.target_type, CoercionTargetType::Integer);
1346            assert_eq!(binary_coercion.coerced_value.as_ref().unwrap().as_i64().unwrap(), 1);
1347
1348            let zero_coercion = coercion_summary.coercion_details.iter()
1349                .find(|d| d.field_name == "zero").unwrap();
1350            assert!(zero_coercion.success);
1351            assert_eq!(zero_coercion.target_type, CoercionTargetType::Integer);
1352            assert_eq!(zero_coercion.coerced_value.as_ref().unwrap().as_i64().unwrap(), 0);
1353        }
1354
1355        #[test]
1356        fn test_currency_coercion() {
1357            let resolver = create_coercion_resolver();
1358            let extractions = vec![
1359                Extraction::new("funding".to_string(), "$1.5 million".to_string()),
1360                Extraction::new("budget".to_string(), "$2.3M".to_string()),
1361                Extraction::new("salary".to_string(), "$75,000".to_string()),
1362                Extraction::new("value".to_string(), "500K".to_string()),
1363                Extraction::new("debt".to_string(), "$1.2 billion".to_string()),
1364            ];
1365            let expected_fields = vec!["funding".to_string(), "budget".to_string(), "salary".to_string(), "value".to_string(), "debt".to_string()];
1366            
1367            let result = resolver.validate_extractions(&extractions, &expected_fields);
1368            assert!(result.is_valid);
1369            
1370            let coercion_summary = result.coercion_summary.unwrap();
1371            assert_eq!(coercion_summary.successful_coercions, 5);
1372            
1373            let funding_coercion = coercion_summary.coercion_details.iter()
1374                .find(|d| d.field_name == "funding").unwrap();
1375            assert!(funding_coercion.success);
1376            assert_eq!(funding_coercion.target_type, CoercionTargetType::Currency);
1377            assert!((funding_coercion.coerced_value.as_ref().unwrap().as_f64().unwrap() - 1_500_000.0).abs() < 1.0);
1378        }
1379
1380        #[test]
1381        fn test_percentage_coercion() {
1382            let resolver = create_coercion_resolver();
1383            let extractions = vec![
1384                Extraction::new("accuracy".to_string(), "94.7%".to_string()),
1385                Extraction::new("completion".to_string(), "100%".to_string()),
1386                Extraction::new("error_rate".to_string(), "0.5%".to_string()),
1387            ];
1388            let expected_fields = vec!["accuracy".to_string(), "completion".to_string(), "error_rate".to_string()];
1389            
1390            let result = resolver.validate_extractions(&extractions, &expected_fields);
1391            assert!(result.is_valid);
1392            
1393            let coercion_summary = result.coercion_summary.unwrap();
1394            assert_eq!(coercion_summary.successful_coercions, 3);
1395            
1396            let accuracy_coercion = coercion_summary.coercion_details.iter()
1397                .find(|d| d.field_name == "accuracy").unwrap();
1398            assert!(accuracy_coercion.success);
1399            assert_eq!(accuracy_coercion.target_type, CoercionTargetType::Percentage);
1400            assert!((accuracy_coercion.coerced_value.as_ref().unwrap().as_f64().unwrap() - 0.947).abs() < 0.001);
1401        }
1402
1403        #[test]
1404        fn test_email_coercion() {
1405            let resolver = create_coercion_resolver();
1406            let extractions = vec![
1407                Extraction::new("contact".to_string(), "john.doe@example.com".to_string()),
1408                Extraction::new("support".to_string(), "support@company.org".to_string()),
1409                Extraction::new("invalid".to_string(), "not-an-email".to_string()),
1410            ];
1411            let expected_fields = vec!["contact".to_string(), "support".to_string(), "invalid".to_string()];
1412            
1413            let result = resolver.validate_extractions(&extractions, &expected_fields);
1414            assert!(result.is_valid);
1415            
1416            let coercion_summary = result.coercion_summary.unwrap();
1417            assert_eq!(coercion_summary.successful_coercions, 2); // Only 2 valid emails
1418            assert_eq!(coercion_summary.failed_coercions, 1);
1419            
1420            let contact_coercion = coercion_summary.coercion_details.iter()
1421                .find(|d| d.field_name == "contact").unwrap();
1422            assert!(contact_coercion.success);
1423            assert_eq!(contact_coercion.target_type, CoercionTargetType::Email);
1424            let coerced_obj = contact_coercion.coerced_value.as_ref().unwrap().as_object().unwrap();
1425            assert_eq!(coerced_obj.get("email").unwrap().as_str().unwrap(), "john.doe@example.com");
1426        }
1427
1428        #[test]
1429        fn test_phone_coercion() {
1430            let resolver = create_coercion_resolver();
1431            let extractions = vec![
1432                Extraction::new("phone1".to_string(), "(617) 555-1234".to_string()),
1433                Extraction::new("phone2".to_string(), "617-555-1234".to_string()),
1434                Extraction::new("phone3".to_string(), "617.555.1234".to_string()),
1435                Extraction::new("phone4".to_string(), "6175551234".to_string()),
1436                Extraction::new("invalid".to_string(), "123-45".to_string()),
1437            ];
1438            let expected_fields = vec!["phone1".to_string(), "phone2".to_string(), "phone3".to_string(), "phone4".to_string(), "invalid".to_string()];
1439            
1440            let result = resolver.validate_extractions(&extractions, &expected_fields);
1441            assert!(result.is_valid);
1442            
1443            let coercion_summary = result.coercion_summary.unwrap();
1444            assert_eq!(coercion_summary.successful_coercions, 4); // 4 valid phone numbers
1445            assert_eq!(coercion_summary.failed_coercions, 1);
1446            
1447            let phone1_coercion = coercion_summary.coercion_details.iter()
1448                .find(|d| d.field_name == "phone1").unwrap();
1449            assert!(phone1_coercion.success);
1450            assert_eq!(phone1_coercion.target_type, CoercionTargetType::PhoneNumber);
1451            let coerced_obj = phone1_coercion.coerced_value.as_ref().unwrap().as_object().unwrap();
1452            assert_eq!(coerced_obj.get("phone").unwrap().as_str().unwrap(), "(617) 555-1234");
1453        }
1454
1455        #[test]
1456        fn test_no_coercion_when_disabled() {
1457            let config = create_test_config();
1458            let validation_config = ValidationConfig {
1459                enable_type_coercion: false, // Disabled
1460                save_raw_outputs: false,
1461                ..Default::default()
1462            };
1463            let resolver = Resolver::with_validation_config(&config, true, validation_config).unwrap();
1464            
1465            let extractions = vec![
1466                Extraction::new("age".to_string(), "25".to_string()),
1467            ];
1468            let expected_fields = vec!["age".to_string()];
1469            
1470            let result = resolver.validate_extractions(&extractions, &expected_fields);
1471            assert!(result.is_valid);
1472            assert!(result.coercion_summary.is_none()); // No coercion attempted
1473        }
1474
1475        #[test]
1476        fn test_mixed_coercion_results() {
1477            let resolver = create_coercion_resolver();
1478            let extractions = vec![
1479                Extraction::new("age".to_string(), "25".to_string()), // Should coerce to integer
1480                Extraction::new("name".to_string(), "John Doe".to_string()), // No coercion
1481                Extraction::new("email".to_string(), "john@example.com".to_string()), // Should coerce to email
1482                Extraction::new("invalid_number".to_string(), "abc123".to_string()), // Should fail coercion
1483                Extraction::new("percentage".to_string(), "95%".to_string()), // Should coerce to percentage
1484            ];
1485            let expected_fields = vec!["age".to_string(), "name".to_string(), "email".to_string(), "invalid_number".to_string(), "percentage".to_string()];
1486            
1487            let result = resolver.validate_extractions(&extractions, &expected_fields);
1488            assert!(result.is_valid);
1489            
1490            let coercion_summary = result.coercion_summary.unwrap();
1491            assert_eq!(coercion_summary.successful_coercions, 3); // age, email, percentage
1492            assert_eq!(coercion_summary.failed_coercions, 2); // name, invalid_number
1493            
1494            // Check that successful coercions have the right types
1495            let successful_types: Vec<_> = coercion_summary.coercion_details.iter()
1496                .filter(|d| d.success)
1497                .map(|d| &d.target_type)
1498                .collect();
1499            assert!(successful_types.contains(&&CoercionTargetType::Integer));
1500            assert!(successful_types.contains(&&CoercionTargetType::Email));
1501            assert!(successful_types.contains(&&CoercionTargetType::Percentage));
1502        }
1503
1504        #[test]
1505        fn test_corrected_data_generation() {
1506            let resolver = create_coercion_resolver();
1507            let extractions = vec![
1508                Extraction::new("age".to_string(), "25".to_string()),
1509                Extraction::new("price".to_string(), "$19.99".to_string()),
1510                Extraction::new("active".to_string(), "true".to_string()),
1511                Extraction::new("invalid".to_string(), "not_a_number".to_string()),
1512            ];
1513            let expected_fields = vec!["age".to_string(), "price".to_string(), "active".to_string(), "invalid".to_string()];
1514            
1515            let result = resolver.validate_extractions(&extractions, &expected_fields);
1516            assert!(result.is_valid);
1517            
1518            // Check that corrected data was generated
1519            let corrected_data = result.corrected_data.unwrap();
1520            let corrected_obj = corrected_data.as_object().unwrap();
1521            
1522            // Check coerced values
1523            assert_eq!(corrected_obj.get("age").unwrap().as_i64().unwrap(), 25);
1524            assert_eq!(corrected_obj.get("price").unwrap().as_f64().unwrap(), 19.99);
1525            assert_eq!(corrected_obj.get("active").unwrap().as_bool().unwrap(), true);
1526            
1527            // Check that failed coercion keeps original string value
1528            assert_eq!(corrected_obj.get("invalid").unwrap().as_str().unwrap(), "not_a_number");
1529        }
1530    }
1531}