polars_redis/
infer.rs

1//! Schema inference for Redis data.
2//!
3//! This module provides functionality to infer Polars schemas from Redis data
4//! by sampling keys and analyzing field values.
5//!
6//! # Confidence Scores
7//!
8//! The `_with_confidence` variants of inference functions return detailed
9//! statistics about type inference quality:
10//!
11//! ```ignore
12//! let result = infer_hash_schema_with_confidence(url, pattern, sample_size)?;
13//!
14//! for (field, info) in &result.field_info {
15//!     if info.confidence < 0.9 {
16//!         println!("Warning: {} has low confidence ({:.0}%)", field, info.confidence * 100.0);
17//!     }
18//! }
19//! ```
20
21use std::collections::{HashMap, HashSet};
22
23use redis::AsyncCommands;
24use redis::aio::ConnectionManager;
25use tokio::runtime::Runtime;
26
27use crate::connection::RedisConnection;
28use crate::error::{Error, Result};
29use crate::schema::RedisType;
30
/// Inferred schema from Redis data.
///
/// This is the plain result type returned by the inference entry points in
/// this module: a list of field names with their inferred types plus the
/// number of keys that were sampled to produce it.
#[derive(Debug, Clone)]
pub struct InferredSchema {
    /// Field names and their inferred types.
    ///
    /// The inference functions in this module sort this list alphabetically
    /// by field name before returning it.
    pub fields: Vec<(String, RedisType)>,
    /// Number of keys sampled.
    pub sample_count: usize,
}
39
/// Detailed inference information for a single field.
///
/// Values of this type are produced by the string-value inference in this
/// module, one per field, alongside the inferred type itself.
#[derive(Debug, Clone)]
pub struct FieldInferenceInfo {
    /// The inferred type for this field.
    pub inferred_type: RedisType,
    /// Confidence score from 0.0 to 1.0.
    /// 1.0 means all sampled values matched the inferred type.
    ///
    /// Computed as `valid` divided by the number of non-null samples; a field
    /// with no non-null samples is reported with a confidence of 1.0.
    pub confidence: f64,
    /// Total number of samples for this field.
    ///
    /// This count includes null/missing entries.
    pub samples: usize,
    /// Number of samples that successfully parsed as the inferred type.
    ///
    /// Only non-null samples are counted here.
    pub valid: usize,
    /// Number of null/missing values.
    pub nulls: usize,
    /// Type candidates that were considered, with their match counts.
    ///
    /// Keys are the type names `"int64"`, `"float64"`, `"bool"` and `"utf8"`;
    /// each value is the number of non-null samples accepted by that type.
    /// The map is empty when the field had no non-null samples.
    pub type_candidates: HashMap<String, usize>,
}
57
58impl FieldInferenceInfo {
59    /// Check if confidence is above a threshold (default: 0.9).
60    pub fn is_confident(&self, threshold: f64) -> bool {
61        self.confidence >= threshold
62    }
63
64    /// Get the percentage of null values.
65    pub fn null_ratio(&self) -> f64 {
66        if self.samples == 0 {
67            0.0
68        } else {
69            self.nulls as f64 / self.samples as f64
70        }
71    }
72}
73
/// Inferred schema with detailed confidence information.
///
/// Carries the same `fields` and `sample_count` as [`InferredSchema`], plus a
/// per-field [`FieldInferenceInfo`] describing how well the sampled values
/// matched the inferred type.
#[derive(Debug, Clone)]
pub struct InferredSchemaWithConfidence {
    /// Field names and their inferred types.
    ///
    /// The inference functions in this module sort this list alphabetically
    /// by field name before returning it.
    pub fields: Vec<(String, RedisType)>,
    /// Number of keys sampled.
    pub sample_count: usize,
    /// Detailed inference information for each field.
    ///
    /// Keyed by field name. Being a `HashMap`, it has no defined iteration
    /// order.
    pub field_info: HashMap<String, FieldInferenceInfo>,
}
84
85impl InferredSchemaWithConfidence {
86    /// Convert to a basic InferredSchema (discards confidence info).
87    pub fn to_basic(&self) -> InferredSchema {
88        InferredSchema {
89            fields: self.fields.clone(),
90            sample_count: self.sample_count,
91        }
92    }
93
94    /// Get fields with confidence below a threshold.
95    pub fn low_confidence_fields(&self, threshold: f64) -> Vec<(&str, f64)> {
96        self.field_info
97            .iter()
98            .filter(|(_, info)| info.confidence < threshold)
99            .map(|(name, info)| (name.as_str(), info.confidence))
100            .collect()
101    }
102
103    /// Check if all fields have confidence above a threshold.
104    pub fn all_confident(&self, threshold: f64) -> bool {
105        self.field_info
106            .values()
107            .all(|info| info.confidence >= threshold)
108    }
109
110    /// Get overall average confidence across all fields.
111    pub fn average_confidence(&self) -> f64 {
112        if self.field_info.is_empty() {
113            1.0
114        } else {
115            let sum: f64 = self.field_info.values().map(|info| info.confidence).sum();
116            sum / self.field_info.len() as f64
117        }
118    }
119}
120
121impl InferredSchema {
122    /// Convert to a list of (field_name, type_string) tuples for Python.
123    pub fn to_type_strings(&self) -> Vec<(String, String)> {
124        self.fields
125            .iter()
126            .map(|(name, dtype)| {
127                let type_str = match dtype {
128                    RedisType::Utf8 => "utf8",
129                    RedisType::Int64 => "int64",
130                    RedisType::Float64 => "float64",
131                    RedisType::Boolean => "bool",
132                    RedisType::Date => "date",
133                    RedisType::Datetime => "datetime",
134                };
135                (name.clone(), type_str.to_string())
136            })
137            .collect()
138    }
139
140    /// Apply schema overwrite - merge user-specified types with inferred types.
141    ///
142    /// User-specified types take precedence over inferred types. Fields that
143    /// exist in the overwrite but not in the inferred schema are added.
144    ///
145    /// # Arguments
146    /// * `overwrite` - User-specified field types that override inferred types
147    ///
148    /// # Returns
149    /// A new `InferredSchema` with merged fields.
150    ///
151    /// # Example
152    /// ```
153    /// use polars_redis::infer::InferredSchema;
154    /// use polars_redis::schema::RedisType;
155    ///
156    /// let inferred = InferredSchema {
157    ///     fields: vec![
158    ///         ("name".to_string(), RedisType::Utf8),
159    ///         ("age".to_string(), RedisType::Utf8),  // Inferred as string
160    ///         ("score".to_string(), RedisType::Float64),
161    ///     ],
162    ///     sample_count: 10,
163    /// };
164    ///
165    /// // Override age to be Int64
166    /// let overwrite = vec![
167    ///     ("age".to_string(), RedisType::Int64),
168    /// ];
169    ///
170    /// let merged = inferred.with_overwrite(&overwrite);
171    /// // merged.fields now has age as Int64
172    /// ```
173    pub fn with_overwrite(self, overwrite: &[(String, RedisType)]) -> Self {
174        let overwrite_map: HashMap<&str, &RedisType> =
175            overwrite.iter().map(|(k, v)| (k.as_str(), v)).collect();
176
177        // Track which fields exist in the original schema
178        let existing_fields: HashSet<String> = self.fields.iter().map(|(k, _)| k.clone()).collect();
179
180        // Start with existing fields, applying overwrites
181        let mut fields: Vec<(String, RedisType)> = self
182            .fields
183            .into_iter()
184            .map(|(name, dtype)| {
185                if let Some(&override_type) = overwrite_map.get(name.as_str()) {
186                    (name, *override_type)
187                } else {
188                    (name, dtype)
189                }
190            })
191            .collect();
192
193        // Add any fields from overwrite that weren't in the inferred schema
194        for (name, dtype) in overwrite {
195            if !existing_fields.contains(name) {
196                fields.push((name.clone(), *dtype));
197            }
198        }
199
200        // Re-sort alphabetically
201        fields.sort_by(|a, b| a.0.cmp(&b.0));
202
203        Self {
204            fields,
205            sample_count: self.sample_count,
206        }
207    }
208}
209
210/// Infer schema from Redis hashes.
211///
212/// # Arguments
213/// * `url` - Redis connection URL
214/// * `pattern` - Key pattern to match
215/// * `sample_size` - Maximum number of keys to sample
216/// * `type_inference` - Whether to infer types (if false, all fields are Utf8)
217///
218/// # Returns
219/// An `InferredSchema` with field names and types.
220pub fn infer_hash_schema(
221    url: &str,
222    pattern: &str,
223    sample_size: usize,
224    type_inference: bool,
225) -> Result<InferredSchema> {
226    let runtime =
227        Runtime::new().map_err(|e| Error::Runtime(format!("Failed to create runtime: {}", e)))?;
228
229    let connection = RedisConnection::new(url)?;
230    let mut conn = runtime.block_on(connection.get_connection_manager())?;
231
232    runtime.block_on(infer_hash_schema_async(
233        &mut conn,
234        pattern,
235        sample_size,
236        type_inference,
237    ))
238}
239
240/// Infer schema from Redis hashes with detailed confidence information.
241///
242/// This function returns confidence scores for each field, indicating how
243/// reliably the type was inferred. Use this when you need to:
244/// - Validate schema quality before processing large datasets
245/// - Identify fields that may need schema overrides
246/// - Debug type inference issues
247///
248/// # Arguments
249/// * `url` - Redis connection URL
250/// * `pattern` - Key pattern to match
251/// * `sample_size` - Maximum number of keys to sample
252///
253/// # Returns
254/// An `InferredSchemaWithConfidence` with field types and confidence data.
255///
256/// # Example
257/// ```ignore
258/// let result = infer_hash_schema_with_confidence(
259///     "redis://localhost:6379",
260///     "user:*",
261///     100,
262/// )?;
263///
264/// // Check for low-confidence fields
265/// for (field, confidence) in result.low_confidence_fields(0.9) {
266///     eprintln!("Warning: {} has {:.0}% confidence", field, confidence * 100.0);
267/// }
268///
269/// // Decide whether to proceed
270/// if result.all_confident(0.8) {
271///     let schema = result.to_basic();
272///     // Use schema for reading
273/// } else {
274///     // Consider using schema overrides
275/// }
276/// ```
277pub fn infer_hash_schema_with_confidence(
278    url: &str,
279    pattern: &str,
280    sample_size: usize,
281) -> Result<InferredSchemaWithConfidence> {
282    let runtime =
283        Runtime::new().map_err(|e| Error::Runtime(format!("Failed to create runtime: {}", e)))?;
284
285    let connection = RedisConnection::new(url)?;
286    let mut conn = runtime.block_on(connection.get_connection_manager())?;
287
288    runtime.block_on(infer_hash_schema_with_confidence_async(
289        &mut conn,
290        pattern,
291        sample_size,
292    ))
293}
294
295/// Async implementation of hash schema inference with confidence.
296async fn infer_hash_schema_with_confidence_async(
297    conn: &mut ConnectionManager,
298    pattern: &str,
299    sample_size: usize,
300) -> Result<InferredSchemaWithConfidence> {
301    // Collect sample keys
302    let keys = scan_sample_keys(conn, pattern, sample_size).await?;
303
304    if keys.is_empty() {
305        return Ok(InferredSchemaWithConfidence {
306            fields: vec![],
307            sample_count: 0,
308            field_info: HashMap::new(),
309        });
310    }
311
312    // Collect all field names and their values
313    let mut field_values: HashMap<String, Vec<Option<String>>> = HashMap::new();
314
315    for key in &keys {
316        let hash_data: HashMap<String, String> = conn.hgetall(key).await?;
317
318        // Track which fields this hash has
319        let fields_in_hash: HashSet<&String> = hash_data.keys().collect();
320
321        // Add values for fields that exist
322        for (field, value) in &hash_data {
323            field_values
324                .entry(field.clone())
325                .or_default()
326                .push(Some(value.clone()));
327        }
328
329        // Add None for fields that don't exist in this hash but exist in others
330        for (field, values) in &mut field_values {
331            if !fields_in_hash.contains(field) {
332                values.push(None);
333            }
334        }
335    }
336
337    // Infer types for each field with confidence
338    let mut fields: Vec<(String, RedisType)> = Vec::new();
339    let mut field_info: HashMap<String, FieldInferenceInfo> = HashMap::new();
340
341    for (name, values) in field_values {
342        let (dtype, info) = infer_type_from_values_with_confidence(&values);
343        fields.push((name.clone(), dtype));
344        field_info.insert(name, info);
345    }
346
347    // Sort fields alphabetically for consistent ordering
348    fields.sort_by(|a, b| a.0.cmp(&b.0));
349
350    Ok(InferredSchemaWithConfidence {
351        fields,
352        sample_count: keys.len(),
353        field_info,
354    })
355}
356
357/// Async implementation of hash schema inference.
358async fn infer_hash_schema_async(
359    conn: &mut ConnectionManager,
360    pattern: &str,
361    sample_size: usize,
362    type_inference: bool,
363) -> Result<InferredSchema> {
364    // Collect sample keys
365    let keys = scan_sample_keys(conn, pattern, sample_size).await?;
366
367    if keys.is_empty() {
368        return Ok(InferredSchema {
369            fields: vec![],
370            sample_count: 0,
371        });
372    }
373
374    // Collect all field names and their values
375    let mut field_values: HashMap<String, Vec<Option<String>>> = HashMap::new();
376
377    for key in &keys {
378        let hash_data: HashMap<String, String> = conn.hgetall(key).await?;
379
380        // Track which fields this hash has
381        let fields_in_hash: HashSet<&String> = hash_data.keys().collect();
382
383        // Add values for fields that exist
384        for (field, value) in &hash_data {
385            field_values
386                .entry(field.clone())
387                .or_default()
388                .push(Some(value.clone()));
389        }
390
391        // Add None for fields that don't exist in this hash but exist in others
392        for (field, values) in &mut field_values {
393            if !fields_in_hash.contains(field) {
394                values.push(None);
395            }
396        }
397    }
398
399    // Infer types for each field
400    let mut fields: Vec<(String, RedisType)> = field_values
401        .into_iter()
402        .map(|(name, values)| {
403            let dtype = if type_inference {
404                infer_type_from_values(&values)
405            } else {
406                RedisType::Utf8
407            };
408            (name, dtype)
409        })
410        .collect();
411
412    // Sort fields alphabetically for consistent ordering
413    fields.sort_by(|a, b| a.0.cmp(&b.0));
414
415    Ok(InferredSchema {
416        fields,
417        sample_count: keys.len(),
418    })
419}
420
421/// Infer schema from RedisJSON documents.
422///
423/// # Arguments
424/// * `url` - Redis connection URL
425/// * `pattern` - Key pattern to match
426/// * `sample_size` - Maximum number of keys to sample
427///
428/// # Returns
429/// An `InferredSchema` with field names and types.
430pub fn infer_json_schema(url: &str, pattern: &str, sample_size: usize) -> Result<InferredSchema> {
431    let runtime =
432        Runtime::new().map_err(|e| Error::Runtime(format!("Failed to create runtime: {}", e)))?;
433
434    let connection = RedisConnection::new(url)?;
435    let mut conn = runtime.block_on(connection.get_connection_manager())?;
436
437    runtime.block_on(infer_json_schema_async(&mut conn, pattern, sample_size))
438}
439
440/// Async implementation of JSON schema inference.
441async fn infer_json_schema_async(
442    conn: &mut ConnectionManager,
443    pattern: &str,
444    sample_size: usize,
445) -> Result<InferredSchema> {
446    // Collect sample keys
447    let keys = scan_sample_keys(conn, pattern, sample_size).await?;
448
449    if keys.is_empty() {
450        return Ok(InferredSchema {
451            fields: vec![],
452            sample_count: 0,
453        });
454    }
455
456    // Collect all field names and their values
457    let mut field_values: HashMap<String, Vec<Option<serde_json::Value>>> = HashMap::new();
458
459    for key in &keys {
460        // Fetch JSON document
461        let json_str: Option<String> = redis::cmd("JSON.GET")
462            .arg(key)
463            .arg("$")
464            .query_async(conn)
465            .await?;
466
467        if let Some(json_str) = json_str {
468            // Parse JSON - Redis returns an array wrapper
469            if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&json_str) {
470                let doc = match parsed {
471                    serde_json::Value::Array(mut arr) if !arr.is_empty() => arr.remove(0),
472                    other => other,
473                };
474
475                if let serde_json::Value::Object(obj) = doc {
476                    let fields_in_doc: HashSet<&String> = obj.keys().collect();
477
478                    // Add values for fields that exist
479                    for (field, value) in &obj {
480                        field_values
481                            .entry(field.clone())
482                            .or_default()
483                            .push(Some(value.clone()));
484                    }
485
486                    // Add None for fields that don't exist in this doc but exist in others
487                    for (field, values) in &mut field_values {
488                        if !fields_in_doc.contains(field) {
489                            values.push(None);
490                        }
491                    }
492                }
493            }
494        }
495    }
496
497    // Infer types for each field
498    let mut fields: Vec<(String, RedisType)> = field_values
499        .into_iter()
500        .map(|(name, values)| {
501            let dtype = infer_type_from_json_values(&values);
502            (name, dtype)
503        })
504        .collect();
505
506    // Sort fields alphabetically for consistent ordering
507    fields.sort_by(|a, b| a.0.cmp(&b.0));
508
509    Ok(InferredSchema {
510        fields,
511        sample_count: keys.len(),
512    })
513}
514
515/// Scan for sample keys matching a pattern.
516async fn scan_sample_keys(
517    conn: &mut ConnectionManager,
518    pattern: &str,
519    max_keys: usize,
520) -> Result<Vec<String>> {
521    let mut keys = Vec::new();
522    let mut cursor: u64 = 0;
523
524    loop {
525        let (new_cursor, batch): (u64, Vec<String>) = redis::cmd("SCAN")
526            .arg(cursor)
527            .arg("MATCH")
528            .arg(pattern)
529            .arg("COUNT")
530            .arg(100)
531            .query_async(conn)
532            .await?;
533
534        keys.extend(batch);
535        cursor = new_cursor;
536
537        if cursor == 0 || keys.len() >= max_keys {
538            break;
539        }
540    }
541
542    // Truncate to max_keys
543    keys.truncate(max_keys);
544    Ok(keys)
545}
546
547/// Infer type from a collection of string values.
548fn infer_type_from_values(values: &[Option<String>]) -> RedisType {
549    infer_type_from_values_with_confidence(values).0
550}
551
552/// Infer type from a collection of string values with detailed confidence info.
553///
554/// Returns (inferred_type, FieldInferenceInfo).
555fn infer_type_from_values_with_confidence(
556    values: &[Option<String>],
557) -> (RedisType, FieldInferenceInfo) {
558    let total_samples = values.len();
559    let null_count = values.iter().filter(|v| v.is_none()).count();
560    let non_null_values: Vec<&str> = values.iter().filter_map(|v| v.as_deref()).collect();
561
562    if non_null_values.is_empty() {
563        return (
564            RedisType::Utf8,
565            FieldInferenceInfo {
566                inferred_type: RedisType::Utf8,
567                confidence: 1.0, // No data means we default to Utf8 with full confidence
568                samples: total_samples,
569                valid: 0,
570                nulls: null_count,
571                type_candidates: HashMap::new(),
572            },
573        );
574    }
575
576    // Count how many values parse successfully for each type
577    let mut type_candidates: HashMap<String, usize> = HashMap::new();
578
579    let int_count = non_null_values
580        .iter()
581        .filter(|v| v.parse::<i64>().is_ok())
582        .count();
583    let float_count = non_null_values
584        .iter()
585        .filter(|v| v.parse::<f64>().is_ok())
586        .count();
587    let bool_count = non_null_values
588        .iter()
589        .filter(|v| is_boolean_string(v.to_lowercase().as_str()))
590        .count();
591
592    type_candidates.insert("int64".to_string(), int_count);
593    type_candidates.insert("float64".to_string(), float_count);
594    type_candidates.insert("bool".to_string(), bool_count);
595    type_candidates.insert("utf8".to_string(), non_null_values.len()); // Everything is valid as Utf8
596
597    let non_null_count = non_null_values.len();
598
599    // Determine best type (most specific that matches all values)
600    let (inferred_type, valid_count) = if int_count == non_null_count {
601        (RedisType::Int64, int_count)
602    } else if float_count == non_null_count {
603        (RedisType::Float64, float_count)
604    } else if bool_count == non_null_count {
605        (RedisType::Boolean, bool_count)
606    } else {
607        // Fall back to Utf8 - use the best non-Utf8 candidate for confidence
608        let best_specific = [
609            (RedisType::Int64, int_count),
610            (RedisType::Float64, float_count),
611            (RedisType::Boolean, bool_count),
612        ]
613        .into_iter()
614        .max_by_key(|(_, count)| *count);
615
616        if let Some((best_type, best_count)) = best_specific {
617            if best_count > 0 && best_count as f64 / non_null_count as f64 >= 0.5 {
618                // More than half match a specific type, but not all - low confidence
619                (best_type, best_count)
620            } else {
621                (RedisType::Utf8, non_null_count)
622            }
623        } else {
624            (RedisType::Utf8, non_null_count)
625        }
626    };
627
628    // Calculate confidence as ratio of valid values to total non-null values
629    let confidence = if non_null_count == 0 {
630        1.0
631    } else {
632        valid_count as f64 / non_null_count as f64
633    };
634
635    (
636        inferred_type,
637        FieldInferenceInfo {
638            inferred_type,
639            confidence,
640            samples: total_samples,
641            valid: valid_count,
642            nulls: null_count,
643            type_candidates,
644        },
645    )
646}
647
648/// Infer type from a collection of JSON values.
649fn infer_type_from_json_values(values: &[Option<serde_json::Value>]) -> RedisType {
650    let non_null_values: Vec<&serde_json::Value> =
651        values.iter().filter_map(|v| v.as_ref()).collect();
652
653    if non_null_values.is_empty() {
654        return RedisType::Utf8;
655    }
656
657    // Check if all values are the same JSON type
658    let first_type = json_value_type(non_null_values[0]);
659
660    if non_null_values
661        .iter()
662        .all(|v| json_value_type(v) == first_type)
663    {
664        match first_type {
665            "boolean" => RedisType::Boolean,
666            "integer" => RedisType::Int64,
667            "number" => RedisType::Float64,
668            _ => RedisType::Utf8,
669        }
670    } else {
671        // Mixed types - check if all numeric
672        if non_null_values
673            .iter()
674            .all(|v| matches!(json_value_type(v), "integer" | "number"))
675        {
676            RedisType::Float64
677        } else {
678            RedisType::Utf8
679        }
680    }
681}
682
683/// Get the type of a JSON value as a string.
684fn json_value_type(value: &serde_json::Value) -> &'static str {
685    match value {
686        serde_json::Value::Null => "null",
687        serde_json::Value::Bool(_) => "boolean",
688        serde_json::Value::Number(n) => {
689            if n.is_i64() || n.is_u64() {
690                "integer"
691            } else {
692                "number"
693            }
694        }
695        serde_json::Value::String(_) => "string",
696        serde_json::Value::Array(_) => "array",
697        serde_json::Value::Object(_) => "object",
698    }
699}
700
/// Check if a string represents a boolean value.
///
/// The comparison is exact and case-sensitive: only the lowercase
/// spellings listed below are accepted, so callers that want
/// case-insensitive matching must lowercase the input first.
fn is_boolean_string(s: &str) -> bool {
    const BOOLEAN_LITERALS: [&str; 10] = [
        "true", "false", "1", "0", "yes", "no", "t", "f", "y", "n",
    ];

    BOOLEAN_LITERALS.iter().any(|literal| *literal == s)
}
708
709#[cfg(test)]
710mod tests {
711    use super::*;
712
713    #[test]
714    fn test_infer_type_int() {
715        let values = vec![
716            Some("1".to_string()),
717            Some("42".to_string()),
718            Some("-10".to_string()),
719        ];
720        assert!(matches!(infer_type_from_values(&values), RedisType::Int64));
721    }
722
723    #[test]
724    fn test_infer_type_float() {
725        let values = vec![
726            Some("1.5".to_string()),
727            Some("42.0".to_string()),
728            Some("-10.25".to_string()),
729        ];
730        assert!(matches!(
731            infer_type_from_values(&values),
732            RedisType::Float64
733        ));
734    }
735
736    #[test]
737    fn test_infer_type_mixed_numeric() {
738        // Mix of int-looking and float-looking strings -> Float64
739        let values = vec![
740            Some("1".to_string()),
741            Some("42.5".to_string()),
742            Some("-10".to_string()),
743        ];
744        assert!(matches!(
745            infer_type_from_values(&values),
746            RedisType::Float64
747        ));
748    }
749
750    #[test]
751    fn test_infer_type_bool() {
752        let values = vec![
753            Some("true".to_string()),
754            Some("false".to_string()),
755            Some("True".to_string()),
756        ];
757        assert!(matches!(
758            infer_type_from_values(&values),
759            RedisType::Boolean
760        ));
761    }
762
763    #[test]
764    fn test_infer_type_string() {
765        let values = vec![
766            Some("hello".to_string()),
767            Some("world".to_string()),
768            Some("123abc".to_string()),
769        ];
770        assert!(matches!(infer_type_from_values(&values), RedisType::Utf8));
771    }
772
773    #[test]
774    fn test_infer_type_with_nulls() {
775        let values = vec![Some("42".to_string()), None, Some("100".to_string())];
776        assert!(matches!(infer_type_from_values(&values), RedisType::Int64));
777    }
778
779    #[test]
780    fn test_infer_type_all_nulls() {
781        let values: Vec<Option<String>> = vec![None, None, None];
782        assert!(matches!(infer_type_from_values(&values), RedisType::Utf8));
783    }
784
785    #[test]
786    fn test_infer_json_type_bool() {
787        let values = vec![
788            Some(serde_json::Value::Bool(true)),
789            Some(serde_json::Value::Bool(false)),
790        ];
791        assert!(matches!(
792            infer_type_from_json_values(&values),
793            RedisType::Boolean
794        ));
795    }
796
797    #[test]
798    fn test_infer_json_type_int() {
799        let values = vec![
800            Some(serde_json::json!(42)),
801            Some(serde_json::json!(-10)),
802            Some(serde_json::json!(0)),
803        ];
804        assert!(matches!(
805            infer_type_from_json_values(&values),
806            RedisType::Int64
807        ));
808    }
809
810    #[test]
811    fn test_infer_json_type_float() {
812        let values = vec![
813            Some(serde_json::json!(42.5)),
814            Some(serde_json::json!(-10.25)),
815        ];
816        assert!(matches!(
817            infer_type_from_json_values(&values),
818            RedisType::Float64
819        ));
820    }
821
822    #[test]
823    fn test_infer_json_type_string() {
824        let values = vec![
825            Some(serde_json::json!("hello")),
826            Some(serde_json::json!("world")),
827        ];
828        assert!(matches!(
829            infer_type_from_json_values(&values),
830            RedisType::Utf8
831        ));
832    }
833
834    #[test]
835    fn test_schema_overwrite_basic() {
836        let inferred = InferredSchema {
837            fields: vec![
838                ("age".to_string(), RedisType::Utf8),
839                ("name".to_string(), RedisType::Utf8),
840                ("score".to_string(), RedisType::Float64),
841            ],
842            sample_count: 10,
843        };
844
845        // Override age to Int64
846        let overwrite = vec![("age".to_string(), RedisType::Int64)];
847        let merged = inferred.with_overwrite(&overwrite);
848
849        assert_eq!(merged.fields.len(), 3);
850        assert_eq!(merged.sample_count, 10);
851
852        // Find age and verify it's Int64
853        let age_field = merged.fields.iter().find(|(n, _)| n == "age").unwrap();
854        assert!(matches!(age_field.1, RedisType::Int64));
855
856        // name should still be Utf8
857        let name_field = merged.fields.iter().find(|(n, _)| n == "name").unwrap();
858        assert!(matches!(name_field.1, RedisType::Utf8));
859    }
860
861    #[test]
862    fn test_schema_overwrite_adds_new_fields() {
863        let inferred = InferredSchema {
864            fields: vec![("name".to_string(), RedisType::Utf8)],
865            sample_count: 5,
866        };
867
868        // Add a field that wasn't inferred
869        let overwrite = vec![("extra_field".to_string(), RedisType::Int64)];
870        let merged = inferred.with_overwrite(&overwrite);
871
872        assert_eq!(merged.fields.len(), 2);
873
874        // extra_field should be added
875        let extra = merged
876            .fields
877            .iter()
878            .find(|(n, _)| n == "extra_field")
879            .unwrap();
880        assert!(matches!(extra.1, RedisType::Int64));
881    }
882
883    #[test]
884    fn test_schema_overwrite_empty() {
885        let inferred = InferredSchema {
886            fields: vec![
887                ("a".to_string(), RedisType::Utf8),
888                ("b".to_string(), RedisType::Int64),
889            ],
890            sample_count: 10,
891        };
892
893        let overwrite: Vec<(String, RedisType)> = vec![];
894        let merged = inferred.with_overwrite(&overwrite);
895
896        assert_eq!(merged.fields.len(), 2);
897    }
898
899    #[test]
900    fn test_schema_overwrite_multiple() {
901        let inferred = InferredSchema {
902            fields: vec![
903                ("a".to_string(), RedisType::Utf8),
904                ("b".to_string(), RedisType::Utf8),
905                ("c".to_string(), RedisType::Utf8),
906            ],
907            sample_count: 10,
908        };
909
910        let overwrite = vec![
911            ("a".to_string(), RedisType::Int64),
912            ("c".to_string(), RedisType::Boolean),
913            ("d".to_string(), RedisType::Float64),
914        ];
915        let merged = inferred.with_overwrite(&overwrite);
916
917        assert_eq!(merged.fields.len(), 4);
918
919        let a = merged.fields.iter().find(|(n, _)| n == "a").unwrap();
920        assert!(matches!(a.1, RedisType::Int64));
921
922        let b = merged.fields.iter().find(|(n, _)| n == "b").unwrap();
923        assert!(matches!(b.1, RedisType::Utf8));
924
925        let c = merged.fields.iter().find(|(n, _)| n == "c").unwrap();
926        assert!(matches!(c.1, RedisType::Boolean));
927
928        let d = merged.fields.iter().find(|(n, _)| n == "d").unwrap();
929        assert!(matches!(d.1, RedisType::Float64));
930    }
931
932    // ========================================================================
933    // Property-Based Tests
934    // ========================================================================
935
936    /// Helper to infer type from a single value.
937    fn infer_single(s: &str) -> RedisType {
938        infer_type_from_values(&[Some(s.to_string())])
939    }
940
941    /// Helper to infer type from a single JSON value.
942    fn infer_single_json(v: &serde_json::Value) -> RedisType {
943        infer_type_from_json_values(&[Some(v.clone())])
944    }
945
946    mod proptest_tests {
947        use super::*;
948        use proptest::prelude::*;
949
950        proptest! {
951            /// Any valid i64 should be inferred as Int64.
952            #[test]
953            fn prop_infer_int64(value in any::<i64>()) {
954                let result = infer_single(&value.to_string());
955                prop_assert_eq!(result, RedisType::Int64);
956            }
957
958            /// Any valid f64 with decimal should be inferred as Float64.
959            #[test]
960            fn prop_infer_float64(value in any::<f64>().prop_filter("Must be finite", |v| v.is_finite())) {
961                // Format with decimal to ensure it's recognized as float
962                let s = format!("{:.1}", value);
963                let result = infer_single(&s);
964                prop_assert_eq!(result, RedisType::Float64);
965            }
966
967            /// Arbitrary non-numeric, non-boolean strings should be inferred as Utf8.
968            #[test]
969            fn prop_infer_utf8(s in "[a-zA-Z]{2}[a-zA-Z0-9]*") {
970                // Exclude strings that could be booleans (t, f, y, n, true, false, yes, no)
971                let lower = s.to_lowercase();
972                prop_assume!(!matches!(lower.as_str(), "true" | "false" | "yes" | "no" | "t" | "f" | "y" | "n"));
973                let result = infer_single(&s);
974                prop_assert_eq!(result, RedisType::Utf8);
975            }
976
977            /// Boolean strings should be inferred as Boolean.
978            #[test]
979            fn prop_infer_boolean(b in prop::bool::ANY) {
980                let s = if b { "true" } else { "false" };
981                let result = infer_single(s);
982                prop_assert_eq!(result, RedisType::Boolean);
983            }
984
985            /// Schema overwrite should preserve sample_count.
986            #[test]
987            fn prop_overwrite_preserves_sample_count(count in 1usize..1000) {
988                let inferred = InferredSchema {
989                    fields: vec![("x".to_string(), RedisType::Utf8)],
990                    sample_count: count,
991                };
992                let merged = inferred.with_overwrite(&[("x".to_string(), RedisType::Int64)]);
993                prop_assert_eq!(merged.sample_count, count);
994            }
995
996            /// Schema overwrite should include all original fields.
997            #[test]
998            fn prop_overwrite_includes_originals(
999                field_count in 1usize..20,
1000            ) {
1001                let fields: Vec<(String, RedisType)> = (0..field_count)
1002                    .map(|i| (format!("field_{}", i), RedisType::Utf8))
1003                    .collect();
1004
1005                let inferred = InferredSchema {
1006                    fields: fields.clone(),
1007                    sample_count: 10,
1008                };
1009
1010                let merged = inferred.with_overwrite(&[]);
1011                prop_assert_eq!(merged.fields.len(), field_count);
1012            }
1013
1014            /// Schema overwrite with same field should replace type.
1015            #[test]
1016            fn prop_overwrite_replaces_type(
1017                field_name in "[a-z]+",
1018            ) {
1019                let inferred = InferredSchema {
1020                    fields: vec![(field_name.clone(), RedisType::Utf8)],
1021                    sample_count: 5,
1022                };
1023
1024                let merged = inferred.with_overwrite(&[(field_name.clone(), RedisType::Int64)]);
1025
1026                let field = merged.fields.iter().find(|(n, _)| n == &field_name).unwrap();
1027                prop_assert!(matches!(field.1, RedisType::Int64));
1028            }
1029        }
1030    }
1031
1032    // ========================================================================
1033    // Edge Case Tests
1034    // ========================================================================
1035
/// Strings made up solely of whitespace are classified as Utf8.
#[test]
fn test_infer_type_whitespace() {
    let blanks = ["   ", "\t", "\n"];
    for blank in blanks.iter() {
        assert_eq!(infer_single(blank), RedisType::Utf8);
    }
}
1043
/// Prefixed integer literals are expected to stay strings rather than
/// being parsed as numbers.
#[test]
fn test_infer_type_special_numbers() {
    // Hexadecimal, octal and binary notation, in that order.
    let prefixed = ["0xFF", "0o777", "0b1010"];
    for literal in prefixed.iter() {
        assert_eq!(infer_single(literal), RedisType::Utf8);
    }
}
1055
/// Unusual but valid numeric spellings keep their numeric type.
#[test]
fn test_infer_type_numeric_edge_cases() {
    // Leading zeros and an explicit plus sign are still integers.
    let integers = ["007", "+42"];
    for text in integers.iter() {
        assert_eq!(infer_single(text), RedisType::Int64);
    }

    // Scientific notation, with either exponent letter case, is a float.
    let floats = ["1e10", "1E10", "1.5e-3"];
    for text in floats.iter() {
        assert_eq!(infer_single(text), RedisType::Float64);
    }
}
1069
/// Checks which words are recognised as booleans and which are not.
#[test]
fn test_infer_type_boolean_variations() {
    // Lowercase spellings that must be detected as booleans.
    let booleans = ["true", "false", "yes", "no"];
    for word in booleans.iter() {
        assert_eq!(infer_single(word), RedisType::Boolean);
    }

    // Words that merely resemble "yes"/"no" must remain plain strings.
    let look_alikes = ["yep", "nope"];
    for word in look_alikes.iter() {
        assert_eq!(infer_single(word), RedisType::Utf8);
    }
}
1082
/// Container JSON values (objects and arrays) are expected to be typed as Utf8.
#[test]
fn test_infer_json_type_nested() {
    let containers = [
        // A nested object.
        serde_json::json!({"inner": {"deep": 123}}),
        // A plain array.
        serde_json::json!([1, 2, 3]),
    ];

    for value in containers.iter() {
        assert_eq!(infer_single_json(value), RedisType::Utf8);
    }
}
1093
/// Field names are matched case-sensitively when overwriting.
#[test]
fn test_schema_overwrite_case_sensitive() {
    let original = vec![("Name".to_string(), RedisType::Utf8)];
    let inferred = InferredSchema {
        fields: original,
        sample_count: 5,
    };

    // "name" differs from "Name" only by case, so it must be appended as a
    // second field instead of replacing the first one.
    let lowercase = [("name".to_string(), RedisType::Int64)];
    let merged = inferred.with_overwrite(&lowercase);

    assert_eq!(merged.fields.len(), 2);
}
1105
1106    // ========================================================================
1107    // Confidence Score Tests
1108    // ========================================================================
1109
/// A sample made only of integers yields Int64 with full confidence.
#[test]
fn test_confidence_all_integers() {
    let values: Vec<Option<String>> = ["1", "42", "-10"]
        .iter()
        .map(|text| Some(text.to_string()))
        .collect();

    let (dtype, info) = infer_type_from_values_with_confidence(&values);

    assert_eq!(dtype, RedisType::Int64);
    assert_eq!(info.confidence, 1.0);

    // Every sample is present and counts as a valid integer.
    assert_eq!(info.samples, 3);
    assert_eq!(info.valid, 3);
    assert_eq!(info.nulls, 0);
    assert_eq!(info.type_candidates.get("int64").copied(), Some(3));
}
1126
/// Missing values are counted separately and do not lower the confidence.
#[test]
fn test_confidence_with_nulls() {
    let present = |text: &str| Some(text.to_string());
    let values = vec![present("42"), None, present("100"), None];

    let (dtype, info) = infer_type_from_values_with_confidence(&values);

    assert_eq!(dtype, RedisType::Int64);
    // Confidence is computed over the non-null values only.
    assert_eq!(info.confidence, 1.0);

    // Four samples in total: two usable, two missing.
    assert_eq!(info.samples, 4);
    assert_eq!(info.valid, 2);
    assert_eq!(info.nulls, 2);

    let ratio_error = (info.null_ratio() - 0.5).abs();
    assert!(ratio_error < 0.001);
}
1139
/// A sample mixing numbers and words produces a reduced confidence score.
#[test]
fn test_confidence_mixed_types_low_confidence() {
    // Three integers followed by two words. The integers also parse as
    // floats, so the float64 candidate count is 3 as well.
    let values: Vec<Option<String>> = ["1", "2", "3", "hello", "world"]
        .iter()
        .map(|text| Some(text.to_string()))
        .collect();

    let (dtype, info) = infer_type_from_values_with_confidence(&values);

    // The expected outcome is Float64 at 3/5 = 60% confidence.
    assert_eq!(dtype, RedisType::Float64);
    let confidence_error = (info.confidence - 0.6).abs();
    assert!(confidence_error < 0.001);

    // 60% is below a 0.9 threshold but above a 0.5 threshold.
    assert!(!info.is_confident(0.9));
    assert!(info.is_confident(0.5));
}
1159
/// With nothing but missing values, inference defaults to Utf8.
#[test]
fn test_confidence_all_nulls() {
    let values: Vec<Option<String>> = vec![None; 3];

    let (dtype, info) = infer_type_from_values_with_confidence(&values);

    // The default type is reported with full confidence.
    assert_eq!(dtype, RedisType::Utf8);
    assert_eq!(info.confidence, 1.0);

    // All three samples are nulls; none are valid.
    assert_eq!(info.samples, 3);
    assert_eq!(info.valid, 0);
    assert_eq!(info.nulls, 3);
}
1171
/// An empty sample defaults to Utf8 with full confidence and zero samples.
#[test]
fn test_confidence_empty() {
    let no_values: Vec<Option<String>> = Vec::new();

    let (dtype, info) = infer_type_from_values_with_confidence(&no_values);

    assert_eq!(dtype, RedisType::Utf8);
    assert_eq!(info.confidence, 1.0);
    assert_eq!(info.samples, 0);
}
1181
/// `is_confident` compares the stored confidence against the given threshold.
#[test]
fn test_field_inference_info_is_confident() {
    // 85 of 100 samples matched, i.e. a confidence of 0.85.
    let confidence = 0.85;
    let info = FieldInferenceInfo {
        inferred_type: RedisType::Int64,
        confidence,
        samples: 100,
        valid: 85,
        nulls: 0,
        type_candidates: HashMap::new(),
    };

    // 0.85 clears a 0.8 threshold but not a 0.9 one.
    let clears_low = info.is_confident(0.8);
    let clears_high = info.is_confident(0.9);
    assert!(clears_low);
    assert!(!clears_high);
}
1196
/// `null_ratio` reports the share of null samples.
#[test]
fn test_field_inference_info_null_ratio() {
    // 25 nulls out of 100 samples.
    let info = FieldInferenceInfo {
        inferred_type: RedisType::Int64,
        confidence: 1.0,
        samples: 100,
        valid: 75,
        nulls: 25,
        type_candidates: HashMap::new(),
    };

    let ratio = info.null_ratio();
    assert!((ratio - 0.25).abs() < 0.001);
}
1210
/// `to_basic` keeps the field list and the sample count.
#[test]
fn test_inferred_schema_with_confidence_to_basic() {
    let age_info = FieldInferenceInfo {
        inferred_type: RedisType::Int64,
        confidence: 1.0,
        samples: 10,
        valid: 10,
        nulls: 0,
        type_candidates: HashMap::new(),
    };
    let field_info: HashMap<String, FieldInferenceInfo> =
        vec![("age".to_string(), age_info)].into_iter().collect();

    let schema = InferredSchemaWithConfidence {
        fields: vec![("age".to_string(), RedisType::Int64)],
        sample_count: 10,
        field_info,
    };

    let basic = schema.to_basic();
    assert_eq!(basic.fields.len(), 1);
    assert_eq!(basic.sample_count, 10);
}
1236
/// `low_confidence_fields` returns only the fields below the threshold.
#[test]
fn test_inferred_schema_with_confidence_low_confidence_fields() {
    // Builds per-field info for 100 samples with no nulls.
    let info_with = |inferred_type: RedisType, confidence: f64, valid: usize| FieldInferenceInfo {
        inferred_type,
        confidence,
        samples: 100,
        valid,
        nulls: 0,
        type_candidates: HashMap::new(),
    };

    let field_info: HashMap<String, FieldInferenceInfo> = vec![
        ("good".to_string(), info_with(RedisType::Int64, 0.95, 95)),
        ("bad".to_string(), info_with(RedisType::Float64, 0.6, 60)),
    ]
    .into_iter()
    .collect();

    let schema = InferredSchemaWithConfidence {
        fields: vec![
            ("bad".to_string(), RedisType::Float64),
            ("good".to_string(), RedisType::Int64),
        ],
        sample_count: 100,
        field_info,
    };

    // With a 0.9 threshold only "bad" (0.6) should be reported.
    let low = schema.low_confidence_fields(0.9);
    assert_eq!(low.len(), 1);

    let reported = &low[0];
    assert_eq!(reported.0, "bad");
    assert!((reported.1 - 0.6).abs() < 0.001);
}
1277
/// `all_confident` holds only when every field reaches the threshold.
#[test]
fn test_inferred_schema_with_confidence_all_confident() {
    // Builds per-field info for 100 samples with no nulls.
    let info_with = |inferred_type: RedisType, confidence: f64, valid: usize| FieldInferenceInfo {
        inferred_type,
        confidence,
        samples: 100,
        valid,
        nulls: 0,
        type_candidates: HashMap::new(),
    };

    let field_info: HashMap<String, FieldInferenceInfo> = vec![
        ("a".to_string(), info_with(RedisType::Int64, 0.95, 95)),
        ("b".to_string(), info_with(RedisType::Utf8, 1.0, 100)),
    ]
    .into_iter()
    .collect();

    let schema = InferredSchemaWithConfidence {
        fields: vec![
            ("a".to_string(), RedisType::Int64),
            ("b".to_string(), RedisType::Utf8),
        ],
        sample_count: 100,
        field_info,
    };

    // Both fields reach 0.9, but "a" (0.95) falls short of 0.99.
    let passes_090 = schema.all_confident(0.9);
    let passes_099 = schema.all_confident(0.99);
    assert!(passes_090);
    assert!(!passes_099);
}
1316
/// `average_confidence` is the mean of the per-field confidence scores.
#[test]
fn test_inferred_schema_with_confidence_average() {
    // Builds per-field info for 100 samples with no nulls.
    let info_with = |inferred_type: RedisType, confidence: f64, valid: usize| FieldInferenceInfo {
        inferred_type,
        confidence,
        samples: 100,
        valid,
        nulls: 0,
        type_candidates: HashMap::new(),
    };

    let field_info: HashMap<String, FieldInferenceInfo> = vec![
        ("a".to_string(), info_with(RedisType::Int64, 1.0, 100)),
        ("b".to_string(), info_with(RedisType::Float64, 0.8, 80)),
    ]
    .into_iter()
    .collect();

    let schema = InferredSchemaWithConfidence {
        fields: vec![
            ("a".to_string(), RedisType::Int64),
            ("b".to_string(), RedisType::Float64),
        ],
        sample_count: 100,
        field_info,
    };

    // (1.0 + 0.8) / 2 = 0.9
    let average = schema.average_confidence();
    assert!((average - 0.9).abs() < 0.001);
}
1354
/// The per-type candidate counts reflect how many values fit each type.
#[test]
fn test_confidence_type_candidates() {
    let values: Vec<Option<String>> = ["1", "2", "3.5"]
        .iter()
        .map(|text| Some(text.to_string()))
        .collect();

    let (_, info) = infer_type_from_values_with_confidence(&values);
    let candidates = &info.type_candidates;

    // Every value is a valid float, but "3.5" is not a valid integer.
    assert_eq!(candidates.get("float64"), Some(&3));
    assert_eq!(candidates.get("int64"), Some(&2));
    assert_eq!(candidates.get("utf8"), Some(&3));
}
1369}