dsq_core/
lib.rs

1//! dsq-core: Core library for dsq data processing
2//!
3//! This crate provides the core functionality for dsq, a data processing tool that extends
4//! jq-ish syntax to work with structured data formats like Parquet, Avro, CSV, and more.
5//! dsq leverages `Polars` `DataFrames` to provide high-performance
6//! data manipulation across multiple file formats.
7//!
8//! # Features
9//!
10//! - **Format Flexibility**: Support for CSV, TSV, Parquet, Avro, JSON Lines, Arrow, and JSON
11//! - **Performance**: Built on `Polars` `DataFrames` with lazy evaluation and columnar operations
12//! - **Type Safety**: Proper type handling with clear error messages
13//!
14//! # Quick Start
15//!
16//! ```rust,ignore
17//! use dsq_core::{Value, ops, io};
18//!
19//! // Read a CSV file
20//! let data = io::read_file_sync("data.csv", &io::ReadOptions::default())?;
21//!
22//! // Apply operations
23//! let result = ops::OperationPipeline::new()
24//!     .select(vec!["name".to_string(), "age".to_string()])
25//!     .sort(vec![ops::SortOptions::desc("age".to_string())])
26//!     .head(10)
27//!     .execute(data)?;
28//!
29//! // Write to Parquet
30//! io::write_file_sync(&result, "output.parquet", &io::WriteOptions::default())?;
31//! # Ok::<(), dsq_core::Error>(())
32//! ```
33//!
34//! # Architecture
35//!
36//! The library is organized into several key modules:
37//!
//! - [`Value`] - Core value type (re-exported from `dsq_shared`) that bridges JSON and `DataFrames`
39//! - [`ops`] - Data operations (select, filter, aggregate, join, transform)
40//! - [`io`] - Input/output for various file formats
41//! - [`filter`] - jq-compatible filter compilation and execution
42//! - [`error`] - Error handling and result types
43//! - [`format`] - File format detection and metadata
44//!
45//! # Examples
46//!
47//! ## Basic `DataFrame` Operations
48//!
49//! ```rust,ignore
50//! use dsq_core::{Value, ops::basic::*};
51//! use polars::prelude::*;
52//!
53//! let df = df! {
54//!     "name" => ["Alice", "Bob", "Charlie"],
55//!     "age" => [30, 25, 35],
56//!     "department" => ["Engineering", "Sales", "Engineering"]
57//! }?;
58//!
59//! let data = Value::DataFrame(df);
60//!
61//! // Select columns
62//! let selected = select_columns(&data, &["name".to_string(), "age".to_string()])?;
63//!
64//! // Sort by age
65//! let sorted = sort_by_columns(&selected, &[SortOptions::desc("age")])?;
66//!
67//! // Take first 2 rows
68//! let result = head(&sorted, 2)?;
69//! # Ok::<(), dsq_core::Error>(())
70//! ```
71//!
72//! ## Aggregation Operations
73//!
74//! ```rust,ignore
75//! use dsq_core::{Value, ops::aggregate::*};
76//!
77//! // Group by department and calculate statistics
78//! let aggregated = group_by_agg(
79//!     &data,
80//!     &["department".to_string()],
81//!     &[
82//!         AggregationFunction::Count,
83//!         AggregationFunction::Mean("age".to_string()),
84//!         AggregationFunction::Sum("salary".to_string()),
85//!     ]
86//! )?;
87//! # Ok::<(), dsq_core::Error>(())
88//! ```
89//!
90//! ## Join Operations
91//!
92//! ```rust,ignore
93//! use dsq_core::{Value, ops::join::*};
94//!
95//! let keys = JoinKeys::on(vec!["id".to_string()]);
96//! let options = JoinOptions {
97//!     join_type: JoinType::Inner,
98//!     ..Default::default()
99//! };
100//!
101//! let joined = join(&left_data, &right_data, &keys, &options)?;
102//! # Ok::<(), dsq_core::Error>(())
103//! ```
104//!
105//! ## Format Conversion
106//!
107//! ```rust,ignore
108//! use dsq_core::io;
109//!
110//! // Convert CSV to Parquet
111//! io::convert_file(
112//!     "data.csv",
113//!     "data.parquet",
114//!     &io::ReadOptions::default(),
115//!     &io::WriteOptions::default()
116//! )?;
117//! # Ok::<(), dsq_core::Error>(())
118//! ```
119//!
120//! ## Filter Execution
121//!
122//! ```rust,ignore
123//! use dsq_core::filter::{FilterExecutor, ExecutorConfig};
124//!
125//! let mut executor = FilterExecutor::with_config(
126//!     ExecutorConfig {
127//!         lazy_evaluation: true,
128//!         dataframe_optimizations: true,
129//!         ..Default::default()
130//!     }
131//! );
132//!
133//! // Execute jq-style filter on DataFrame
134//! let result = executor.execute_str(
135//!     r#"map(select(.age > 30)) | sort_by(.name)"#,
136//!     data
137//! )?;
138//! # Ok::<(), dsq_core::Error>(())
139//! ```
140//!
141//! # Error Handling
142//!
143//! All operations return `Result<T>` where errors are represented by the [`Error`] type:
144//!
145//! ```rust,ignore
146//! use dsq_core::{Error, Result, TypeError, FormatError};
147//!
148//! match some_operation() {
149//!     Ok(value) => println!("Success: {:?}", value),
150//!     Err(Error::Type(TypeError::InvalidConversion { from, to })) => {
151//!         eprintln!("Cannot convert from {} to {}", from, to);
152//!     }
153//!     Err(Error::Format(FormatError::Unknown(format))) => {
154//!         eprintln!("Unknown format: {}", format);
155//!     }
156//!     Err(e) => eprintln!("Other error: {}", e),
157//! }
158//! # fn some_operation() -> Result<()> { Ok(()) }
159//! ```
160//!
161//! # Performance Tips
162//!
163//! - Use lazy evaluation for large datasets with [`LazyFrame`](polars::prelude::LazyFrame)
164//! - Prefer columnar operations over row-by-row processing
165//! - Use appropriate data types to minimize memory usage
166//! - Consider using streaming for very large files that don't fit in memory
167//! - Enable DataFrame-specific optimizations in the filter executor
168//!
169//! # Feature Flags
170//!
171//! This crate supports several optional features:
172//!
173//! - `default` - Includes all commonly used functionality
//! - `io` - Enables the `io` module and the file I/O convenience re-exports
//! - `io-csv` - CSV/TSV reading and writing support
175//! - `io-parquet` - Parquet format support
176//! - `io-json` - JSON and JSON Lines support
177//! - `io-avro` - Avro format support (planned)
178//! - `io-arrow` - Arrow IPC format support
179//! - `filter` - jq-compatible filter compilation and execution
180//! - `repl` - Interactive REPL support (for CLI usage)
181
182pub use dsq_shared::{BuildInfo, VERSION};
183
184// Re-export format types from dsq-formats
185#[cfg(not(target_arch = "wasm32"))]
186pub use dsq_formats::{format::detect_format_from_content, format::FormatOptions, DataFormat};
187
188// Data operation modules
189pub mod ops;
190
191// Error handling
192/// Error types and handling
193pub mod error;
194
195// I/O modules - feature-gated
196#[cfg(feature = "io")]
197pub mod io;
198
199// Filter system modules - feature-gated
200#[cfg(feature = "filter")]
201pub mod filter;
202
203// Re-export commonly used types and functions
204pub use crate::error::{Error, FilterError, FormatError, Result, TypeError};
205
206pub use dsq_shared::value::Value;
207
208// Re-export key operation types
209pub use ops::{
210    recommended_batch_size, supports_operation, Operation, OperationPipeline, OperationType,
211};
212
213// Re-export basic operations
214pub use ops::basic::{
215    count, filter_values, head, reverse, select_columns, slice, sort_by_columns, tail, unique,
216    SortOptions,
217};
218
219// Re-export aggregation operations
220pub use ops::aggregate::{
221    group_by, group_by_agg, pivot, unpivot, AggregationFunction, WindowFunction,
222};
223
224// Re-export join operations
225pub use ops::join::{
226    inner_join, join, left_join, outer_join, right_join, JoinKeys, JoinOptions, JoinType,
227};
228
229// Re-export transformation operations
230pub use ops::transform::Transform;
231
232// Re-export utility functions
233pub use utils::{array, object};
234
235// Re-export I/O convenience functions
236#[cfg(feature = "io")]
237pub use io::{
238    convert_file, inspect_file, read_file, read_file_lazy, write_file, FileInfo, ReadOptions,
239    WriteOptions,
240};
241
242// Re-export filter system
243#[cfg(feature = "filter")]
244pub use filter::{
245    execute_filter, execute_filter_with_config, explain_filter, ExecutionResult, ExecutorConfig,
246    FilterCompiler, FilterExecutor,
247};
248
249/// Prelude module for convenient imports
250///
251/// This module re-exports the most commonly used types and functions,
252/// allowing users to import everything they need with a single use statement.
253///
254/// # Examples
255///
256/// ```rust
257/// use dsq_core::prelude::*;
258///
259/// // Now you have access to Value, Error, Result, common operations, etc.
260/// let data = Value::array(vec![Value::int(1), Value::int(2), Value::int(3)]);
261/// let length = count(&data)?;
262/// # Ok::<(), Error>(())
263/// ```
264pub mod prelude {
265    // Core types
266    pub use crate::{Error, Result, Value};
267
268    // Operations
269    pub use crate::ops::aggregate::{group_by, group_by_agg, AggregationFunction, WindowFunction};
270    pub use crate::ops::basic::{
271        count, filter_values, head, reverse, select_columns, slice, sort_by_columns, tail, unique,
272        SortOptions,
273    };
274    pub use crate::ops::join::{
275        inner_join, join, left_join, outer_join, right_join, JoinKeys, JoinOptions, JoinType,
276    };
277    pub use crate::ops::transform::Transform;
278    pub use crate::ops::{Operation, OperationPipeline, OperationType};
279
280    // I/O (if available)
281    #[cfg(feature = "io")]
282    pub use crate::io::{convert_file, read_file, write_file, ReadOptions, WriteOptions};
283
284    // Filter system (if available)
285    #[cfg(feature = "filter")]
286    pub use crate::filter::{execute_filter, ExecutorConfig, FilterExecutor};
287
288    // Re-export polars types that users commonly need
289    pub use polars::prelude::{DataFrame, LazyFrame, Series};
290}
291
292/// Build information for dsq-core
293pub const BUILD_INFO: BuildInfo = BuildInfo {
294    version: VERSION,
295    git_hash: option_env!("VERGEN_GIT_SHA"),
296    build_date: option_env!("VERGEN_BUILD_TIMESTAMP"),
297    rust_version: option_env!("VERGEN_RUSTC_SEMVER"),
298    features: &[
299        #[cfg(feature = "io")]
300        "io",
301        #[cfg(feature = "filter")]
302        "filter",
303        #[cfg(feature = "repl")]
304        "repl",
305    ],
306};
307
308/// Utility functions for working with dsq
309pub mod utils {
310    use std::collections::HashMap;
311
312    use crate::{Error, Result, Value};
313
314    /// Create a `Value::Object` from key-value pairs
315    ///
316    /// # Examples
317    ///
318    /// ```rust
319    /// use dsq_core::object;
320    /// use dsq_core::Value;
321    ///
322    /// let obj = object([
323    ///     ("name", Value::string("Alice")),
324    ///     ("age", Value::int(30)),
325    /// ]);
326    /// ```
327    pub fn object<I, K>(pairs: I) -> Value
328    where
329        I: IntoIterator<Item = (K, Value)>,
330        K: Into<String>,
331    {
332        let map: HashMap<String, Value> = pairs.into_iter().map(|(k, v)| (k.into(), v)).collect();
333        Value::Object(map)
334    }
335
336    /// Create a `Value::Array` from values
337    ///
338    /// # Examples
339    ///
340    /// ```rust
341    /// use dsq_core::utils::array;
342    /// use dsq_core::Value;
343    ///
344    /// let arr = array([
345    ///     Value::int(1),
346    ///     Value::int(2),
347    ///     Value::int(3),
348    /// ]);
349    /// ```
350    pub fn array<I>(values: I) -> Value
351    where
352        I: IntoIterator<Item = Value>,
353    {
354        Value::Array(values.into_iter().collect())
355    }
356
357    /// Try to extract a `DataFrame` from a Value
358    ///
359    /// # Examples
360    ///
361    /// ```rust
362    /// use dsq_core::utils::extract_dataframe;
363    /// use dsq_core::Value;
364    /// use polars::prelude::*;
365    ///
366    /// let df = df! {
367    ///     "name" => ["Alice", "Bob"],
368    ///     "age" => [30, 25]
369    /// }.unwrap();
370    ///
371    /// let value = Value::DataFrame(df.clone());
372    /// let extracted = extract_dataframe(&value).unwrap();
373    /// assert_eq!(extracted.height(), df.height());
374    /// ```
375    pub fn extract_dataframe(value: &Value) -> Result<&polars::prelude::DataFrame> {
376        match value {
377            Value::DataFrame(df) => Ok(df),
378            _ => Err(Error::operation(format!(
379                "Expected DataFrame, got {}",
380                value.type_name()
381            ))),
382        }
383    }
384
385    /// Try to convert any Value to a `DataFrame`
386    ///
387    /// # Examples
388    ///
389    /// ```rust
390    /// use dsq_core::utils::{object, array, to_dataframe};
391    /// use dsq_core::Value;
392    ///
393    /// let data = array([
394    ///     object([("name", Value::string("Alice")), ("age", Value::int(30))]),
395    ///     object([("name", Value::string("Bob")), ("age", Value::int(25))]),
396    /// ]);
397    ///
398    /// let df = to_dataframe(&data).unwrap();
399    /// assert_eq!(df.height(), 2);
400    /// ```
401    pub fn to_dataframe(value: &Value) -> Result<polars::prelude::DataFrame> {
402        Ok(value.to_dataframe()?)
403    }
404
405    /// Pretty print a Value for debugging
406    ///
407    /// # Examples
408    ///
409    /// ```rust
410    /// use dsq_core::utils::{object, pretty_print};
411    /// use dsq_core::Value;
412    ///
413    /// let obj = object([
414    ///     ("name", Value::string("Alice")),
415    ///     ("age", Value::int(30)),
416    /// ]);
417    ///
418    /// pretty_print(&obj);
419    /// ```
420    pub fn pretty_print(value: &Value) {
421        match value.to_json() {
422            Ok(json) => {
423                if let Ok(pretty) = serde_json::to_string_pretty(&json) {
424                    println!("{pretty}");
425                } else {
426                    println!("{value}");
427                }
428            }
429            Err(_) => println!("{value}"),
430        }
431    }
432
433    /// Get basic statistics about a Value
434    ///
435    /// Returns information like type, length, memory usage estimates, etc.
436    #[must_use]
437    pub fn value_stats(value: &Value) -> ValueStats {
438        ValueStats::from_value(value)
439    }
440
441    /// Statistics about a Value
442    #[derive(Debug, Clone)]
443    pub struct ValueStats {
444        /// Value type name
445        pub type_name: String,
446        /// Length (for arrays, strings, `DataFrames`, etc.)
447        pub length: Option<usize>,
448        /// Width (for `DataFrames`, objects)
449        pub width: Option<usize>,
450        /// Estimated memory usage in bytes
451        pub estimated_size: Option<usize>,
452        /// Whether the value is null/empty
453        pub is_empty: bool,
454    }
455
456    impl ValueStats {
457        fn from_value(value: &Value) -> Self {
458            let type_name = value.type_name().to_string();
459            let length = value.len();
460            let is_empty = value.is_empty();
461
462            let (width, estimated_size) = match value {
463                Value::DataFrame(df) => (Some(df.width()), Some(df.estimated_size())),
464                Value::Object(obj) => (
465                    Some(obj.len()),
466                    Some(obj.len() * 64), // Rough estimate
467                ),
468                Value::Array(arr) => (
469                    None,
470                    Some(arr.len() * 32), // Rough estimate
471                ),
472                Value::String(s) => (None, Some(s.len())),
473                _ => (None, Some(8)), // Rough estimate for scalars
474            };
475
476            Self {
477                type_name,
478                length,
479                width,
480                estimated_size,
481                is_empty,
482            }
483        }
484    }
485
486    impl std::fmt::Display for ValueStats {
487        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
488            write!(f, "Type: {}", self.type_name)?;
489
490            if let Some(length) = self.length {
491                write!(f, ", Length: {length}")?;
492            }
493
494            if let Some(width) = self.width {
495                write!(f, ", Width: {width}")?;
496            }
497
498            if let Some(size) = self.estimated_size {
499                write!(f, ", Size: ~{size} bytes")?;
500            }
501
502            if self.is_empty {
503                write!(f, " (empty)")?;
504            }
505
506            Ok(())
507        }
508    }
509}
510
511#[cfg(test)]
512mod tests {
513    use std::collections::HashMap;
514    #[allow(unused_imports)]
515    use std::path::Path;
516
517    use polars::prelude::*;
518
519    #[allow(unused_imports)]
520    use crate::utils::{array, extract_dataframe, object, to_dataframe, value_stats};
521
522    use super::*;
523
524    #[test]
525    fn test_version_info() {
526        assert!(!VERSION.is_empty());
527        println!("{BUILD_INFO}");
528    }
529
530    #[test]
531    fn test_utils_object() {
532        let obj = object([("name", Value::string("Alice")), ("age", Value::int(30))]);
533
534        match obj {
535            Value::Object(map) => {
536                assert_eq!(map.len(), 2);
537                assert_eq!(map.get("name"), Some(&Value::string("Alice")));
538                assert_eq!(map.get("age"), Some(&Value::int(30)));
539            }
540            _ => panic!("Expected object"),
541        }
542    }
543
544    #[test]
545    fn test_utils_array() {
546        let arr = array([Value::int(1), Value::int(2), Value::int(3)]);
547
548        match arr {
549            Value::Array(vec) => {
550                assert_eq!(vec.len(), 3);
551                assert_eq!(vec[0], Value::int(1));
552            }
553            _ => panic!("Expected array"),
554        }
555    }
556
557    #[test]
558    fn test_value_stats() {
559        let obj = object([("name", Value::string("Alice")), ("age", Value::int(30))]);
560
561        let stats = value_stats(&obj);
562        assert_eq!(stats.type_name, "object");
563        assert_eq!(stats.width, Some(2));
564        assert!(!stats.is_empty);
565    }
566
567    #[test]
568    fn test_extract_dataframe() {
569        use polars::prelude::*;
570
571        let df = df! {
572            "name" => ["Alice", "Bob"],
573            "age" => [30, 25]
574        }
575        .unwrap();
576
577        let value = Value::DataFrame(df.clone());
578        let extracted = extract_dataframe(&value).unwrap();
579        assert_eq!(extracted.height(), 2);
580
581        let non_df = Value::int(42);
582        assert!(extract_dataframe(&non_df).is_err());
583    }
584
585    #[test]
586    fn test_to_dataframe_conversion() {
587        let data = array([
588            object([("name", Value::string("Alice")), ("age", Value::int(30))]),
589            object([("name", Value::string("Bob")), ("age", Value::int(25))]),
590        ]);
591
592        let df = to_dataframe(&data).unwrap();
593        assert_eq!(df.height(), 2);
594        assert_eq!(df.width(), 2);
595    }
596
597    #[test]
598    fn test_value_type_checks() {
599        let null_val = Value::null();
600        assert!(null_val.is_null());
601        assert!(!null_val.is_dataframe());
602
603        let df_val = Value::dataframe(
604            df! {
605                "name" => ["Alice"],
606                "age" => [30]
607            }
608            .unwrap(),
609        );
610        assert!(df_val.is_dataframe());
611        assert!(!df_val.is_null());
612    }
613
614    #[test]
615    fn test_value_type_name() {
616        assert_eq!(Value::null().type_name(), "null");
617        assert_eq!(Value::bool(true).type_name(), "boolean");
618        assert_eq!(Value::int(42).type_name(), "integer");
619        assert_eq!(Value::float(3.14).type_name(), "float");
620        assert_eq!(Value::string("hello").type_name(), "string");
621        assert_eq!(Value::array(vec![]).type_name(), "array");
622        assert_eq!(Value::object(HashMap::new()).type_name(), "object");
623    }
624
625    #[test]
626    fn test_value_len_and_empty() {
627        assert_eq!(Value::string("hello").len(), Some(5));
628        assert_eq!(
629            Value::array(vec![Value::int(1), Value::int(2)]).len(),
630            Some(2)
631        );
632        assert_eq!(Value::null().len(), None);
633        assert_eq!(Value::int(42).len(), None);
634
635        assert!(Value::string("").is_empty());
636        assert!(Value::array(vec![]).is_empty());
637        assert!(!Value::string("hello").is_empty());
638        assert!(!Value::null().is_empty());
639    }
640
641    #[test]
642    fn test_value_index() {
643        let arr = Value::array(vec![Value::int(10), Value::int(20), Value::int(30)]);
644        assert_eq!(arr.index(0).unwrap(), Value::int(10));
645        assert_eq!(arr.index(1).unwrap(), Value::int(20));
646        assert_eq!(arr.index(-1).unwrap(), Value::int(30)); // negative indexing
647        assert_eq!(arr.index(10).unwrap(), Value::Null); // out of bounds
648
649        let s = Value::string("hello");
650        assert_eq!(s.index(0).unwrap(), Value::string("h"));
651        assert_eq!(s.index(4).unwrap(), Value::string("o"));
652        assert_eq!(s.index(-1).unwrap(), Value::string("o"));
653    }
654
655    #[test]
656    fn test_value_field() {
657        let mut obj = HashMap::new();
658        obj.insert("name".to_string(), Value::string("Alice"));
659        obj.insert("age".to_string(), Value::int(30));
660        let obj_val = Value::object(obj);
661
662        assert_eq!(obj_val.field("name").unwrap(), Value::string("Alice"));
663        assert_eq!(obj_val.field("age").unwrap(), Value::int(30));
664        assert_eq!(obj_val.field("missing").unwrap(), Value::Null);
665
666        // Test field access on array
667        let arr = Value::array(vec![obj_val.clone()]);
668        let names = arr.field("name").unwrap();
669        match names {
670            Value::Array(names_arr) => {
671                assert_eq!(names_arr.len(), 1);
672                assert_eq!(names_arr[0], Value::string("Alice"));
673            }
674            _ => panic!("Expected array"),
675        }
676    }
677
678    #[test]
679    fn test_value_to_json() {
680        // Test basic types
681        assert_eq!(Value::null().to_json().unwrap(), serde_json::Value::Null);
682        assert_eq!(
683            Value::bool(true).to_json().unwrap(),
684            serde_json::json!(true)
685        );
686        assert_eq!(Value::int(42).to_json().unwrap(), serde_json::json!(42));
687        assert_eq!(
688            Value::float(std::f64::consts::PI).to_json().unwrap(),
689            serde_json::json!(std::f64::consts::PI)
690        );
691        assert_eq!(
692            Value::string("hello").to_json().unwrap(),
693            serde_json::json!("hello")
694        );
695
696        // Test array
697        let arr = Value::array(vec![Value::int(1), Value::int(2)]);
698        let expected = serde_json::json!([1, 2]);
699        assert_eq!(arr.to_json().unwrap(), expected);
700
701        // Test object
702        let mut obj = HashMap::new();
703        obj.insert("name".to_string(), Value::string("Alice"));
704        obj.insert("age".to_string(), Value::int(30));
705        let obj_val = Value::object(obj);
706        let expected = serde_json::json!({"name": "Alice", "age": 30});
707        assert_eq!(obj_val.to_json().unwrap(), expected);
708    }
709
710    #[test]
711    fn test_value_from_json() {
712        assert_eq!(Value::from_json(serde_json::Value::Null), Value::null());
713        assert_eq!(Value::from_json(serde_json::json!(true)), Value::bool(true));
714        assert_eq!(Value::from_json(serde_json::json!(42)), Value::int(42));
715        assert_eq!(
716            Value::from_json(serde_json::json!(3.14)),
717            Value::float(3.14)
718        );
719        assert_eq!(
720            Value::from_json(serde_json::json!("hello")),
721            Value::string("hello")
722        );
723
724        let json_arr = serde_json::json!([1, 2, 3]);
725        let val_arr = Value::from_json(json_arr);
726        match val_arr {
727            Value::Array(arr) => {
728                assert_eq!(arr.len(), 3);
729                assert_eq!(arr[0], Value::int(1));
730            }
731            _ => panic!("Expected array"),
732        }
733
734        let json_obj = serde_json::json!({"name": "Alice", "age": 30});
735        let val_obj = Value::from_json(json_obj);
736        match val_obj {
737            Value::Object(obj) => {
738                assert_eq!(obj.get("name"), Some(&Value::string("Alice")));
739                assert_eq!(obj.get("age"), Some(&Value::int(30)));
740            }
741            _ => panic!("Expected object"),
742        }
743    }
744
745    #[cfg(feature = "filter")]
746    mod filter_tests {
747        use std::fs;
748        use std::io::Write;
749
750        use polars::prelude::*;
751        use tempfile::NamedTempFile;
752
753        use crate::utils::{array, object};
754
755        use super::*;
756
757        fn create_mock_data() -> Value {
758            // Create mock data similar to the example datasets
759
760            array(vec![
761                object(vec![
762                    ("title".to_string(), Value::string("Book A")),
763                    ("genre".to_string(), Value::string("Fiction")),
764                    ("price".to_string(), Value::float(19.99)),
765                    ("author".to_string(), Value::string("Author A")),
766                ]),
767                object(vec![
768                    ("title".to_string(), Value::string("Book B")),
769                    ("genre".to_string(), Value::string("Non-Fiction")),
770                    ("price".to_string(), Value::float(24.99)),
771                    ("author".to_string(), Value::string("Author B")),
772                ]),
773                object(vec![
774                    ("title".to_string(), Value::string("Book C")),
775                    ("genre".to_string(), Value::string("Fiction")),
776                    ("price".to_string(), Value::float(15.99)),
777                    ("author".to_string(), Value::string("Author C")),
778                ]),
779                object(vec![
780                    ("title".to_string(), Value::string("Book D")),
781                    ("genre".to_string(), Value::string("Fiction")),
782                    ("price".to_string(), Value::float(29.99)),
783                    ("author".to_string(), Value::string("Author D")),
784                ]),
785                object(vec![
786                    ("title".to_string(), Value::string("Book E")),
787                    ("genre".to_string(), Value::string("Science")),
788                    ("price".to_string(), Value::float(34.99)),
789                    ("author".to_string(), Value::string("Author E")),
790                ]),
791                object(vec![
792                    ("title".to_string(), Value::string("Book F")),
793                    ("genre".to_string(), Value::string("Fiction")),
794                    ("price".to_string(), Value::float(12.99)),
795                    ("author".to_string(), Value::string("Author F")),
796                ]),
797                object(vec![
798                    ("title".to_string(), Value::string("Book G")),
799                    ("genre".to_string(), Value::string("Non-Fiction")),
800                    ("price".to_string(), Value::float(22.99)),
801                    ("author".to_string(), Value::string("Author G")),
802                ]),
803            ])
804        }
805
806        fn create_employee_data() -> Value {
807            // Create mock employee data
808
809            array(vec![
810                object(vec![
811                    ("name".to_string(), Value::string("Alice Johnson")),
812                    ("department".to_string(), Value::string("Sales")),
813                    ("salary".to_string(), Value::int(75000)),
814                    ("age".to_string(), Value::int(32)),
815                ]),
816                object(vec![
817                    ("name".to_string(), Value::string("Bob Smith")),
818                    ("department".to_string(), Value::string("Engineering")),
819                    ("salary".to_string(), Value::int(82000)),
820                    ("age".to_string(), Value::int(28)),
821                ]),
822                object(vec![
823                    ("name".to_string(), Value::string("Carol Williams")),
824                    ("department".to_string(), Value::string("Sales")),
825                    ("salary".to_string(), Value::int(68000)),
826                    ("age".to_string(), Value::int(35)),
827                ]),
828                object(vec![
829                    ("name".to_string(), Value::string("David Brown")),
830                    ("department".to_string(), Value::string("Engineering")),
831                    ("salary".to_string(), Value::int(95000)),
832                    ("age".to_string(), Value::int(41)),
833                ]),
834                object(vec![
835                    ("name".to_string(), Value::string("Eve Davis")),
836                    ("department".to_string(), Value::string("Marketing")),
837                    ("salary".to_string(), Value::int(62000)),
838                    ("age".to_string(), Value::int(29)),
839                ]),
840                object(vec![
841                    ("name".to_string(), Value::string("Frank Miller")),
842                    ("department".to_string(), Value::string("Engineering")),
843                    ("salary".to_string(), Value::int(88000)),
844                    ("age".to_string(), Value::int(33)),
845                ]),
846                object(vec![
847                    ("name".to_string(), Value::string("Grace Wilson")),
848                    ("department".to_string(), Value::string("Sales")),
849                    ("salary".to_string(), Value::int(71000)),
850                    ("age".to_string(), Value::int(26)),
851                ]),
852                object(vec![
853                    ("name".to_string(), Value::string("Henry Moore")),
854                    ("department".to_string(), Value::string("Engineering")),
855                    ("salary".to_string(), Value::int(102000)),
856                    ("age".to_string(), Value::int(38)),
857                ]),
858                object(vec![
859                    ("name".to_string(), Value::string("Ivy Taylor")),
860                    ("department".to_string(), Value::string("Marketing")),
861                    ("salary".to_string(), Value::int(65000)),
862                    ("age".to_string(), Value::int(31)),
863                ]),
864                object(vec![
865                    ("name".to_string(), Value::string("Jack Anderson")),
866                    ("department".to_string(), Value::string("Sales")),
867                    ("salary".to_string(), Value::int(79000)),
868                    ("age".to_string(), Value::int(30)),
869                ]),
870            ])
871        }
872
873        #[test]
874        fn test_example_002_query_on_csv() {
875            let query = r"group_by(.genre) | map({
876  genre: .[0].genre,
877  count: length,
878  avg_price: (map(.price) | add / length)
879})";
880
881            let data = create_mock_data();
882            let result = crate::filter::execute_filter(query, &data);
883            assert!(
884                result.is_ok(),
885                "Failed to execute query on CSV: {:?}",
886                result.err()
887            );
888
889            let value = result.unwrap();
890            // Verify the result structure
891            match value {
892                Value::Array(arr) => {
893                    assert_eq!(arr.len(), 3); // 3 unique genres: Fiction, Non-Fiction, Science
894                                              // Check that each item has genre, count, and avg_price
895                    for item in &arr {
896                        match item {
897                            Value::Object(obj) => {
898                                assert!(obj.contains_key("genre"));
899                                assert!(obj.contains_key("count"));
900                                assert!(obj.contains_key("avg_price"));
901                            }
902                            _ => panic!("Expected object in result array"),
903                        }
904                    }
905                }
906                _ => panic!("Expected array result"),
907            }
908        }
909
910        #[test]
911        fn test_example_002_query_on_json() {
912            let query = r"group_by(.genre) | map({
913  genre: .[0].genre,
914  count: length,
915  avg_price: (map(.price) | add / length)
916})";
917
918            let data = create_mock_data();
919            let result = crate::filter::execute_filter(query, &data);
920            assert!(
921                result.is_ok(),
922                "Failed to execute query on JSON: {:?}",
923                result.err()
924            );
925
926            let value = result.unwrap();
927            match value {
928                Value::Array(arr) => {
929                    assert_eq!(arr.len(), 3); // 3 unique genres: Fiction, Non-Fiction, Science
930                }
931                _ => panic!("Expected array result"),
932            }
933        }
934
935        #[test]
936        fn test_example_002_query_on_tsv() {
937            let query = r"group_by(.genre) | map({
938  genre: .[0].genre,
939  count: length,
940  avg_price: (map(.price) | add / length)
941})";
942
943            let data = create_mock_data();
944            let result = crate::filter::execute_filter(query, &data);
945            assert!(
946                result.is_ok(),
947                "Failed to execute query on TSV: {:?}",
948                result.err()
949            );
950
951            let value = result.unwrap();
952            match value {
953                Value::Array(arr) => {
954                    assert_eq!(arr.len(), 3); // 3 unique genres: Fiction, Non-Fiction, Science
955                }
956                _ => panic!("Expected array result"),
957            }
958        }
959
960        #[test]
961        fn test_example_002_query_on_parquet() {
962            let query = r"group_by(.genre) | map({
963  genre: .[0].genre,
964  count: length,
965  avg_price: (map(.price) | add / length)
966})";
967
968            let data = create_mock_data();
969            let result = crate::filter::execute_filter(query, &data);
970            assert!(
971                result.is_ok(),
972                "Failed to execute query on Parquet: {:?}",
973                result.err()
974            );
975
976            let value = result.unwrap();
977            match value {
978                Value::Array(arr) => {
979                    assert_eq!(arr.len(), 3); // 3 unique genres: Fiction, Non-Fiction, Science
980                }
981                _ => panic!("Expected array result"),
982            }
983        }
984
985        #[test]
986        fn test_example_002_query_from_file() {
987            let query_path = "examples/example_002/query.dsq";
988            if Path::new(query_path).exists() {
989                let query = fs::read_to_string(query_path).unwrap();
990
991                // Test on mock data
992                let data = create_mock_data();
993                let result = execute_filter(&query, &data);
994                assert!(
995                    result.is_ok(),
996                    "Failed to execute query from file on mock data: {:?}",
997                    result.err()
998                );
999            } else {
1000                println!("Skipping query file test - query.dsq not found");
1001            }
1002        }
1003
1004        #[test]
1005        fn test_example_085_query_on_csv() {
1006            let query = r"group_by(.department) | map({
1007  dept: .[0].department,
1008  count: length,
1009  avg_salary: (map(.salary) | add / length)
1010})";
1011
1012            let data = create_employee_data();
1013            let result = execute_filter(query, &data);
1014            assert!(
1015                result.is_ok(),
1016                "Failed to execute query on mock data: {:?}",
1017                result.err()
1018            );
1019
1020            let value = result.unwrap();
1021            // Verify the result structure
1022            match value {
1023                Value::Array(arr) => {
1024                    assert_eq!(arr.len(), 3); // 3 departments: Sales, Marketing, Engineering
1025                                              // Check that each item has dept, count, and avg_salary
1026                    for item in &arr {
1027                        match item {
1028                            Value::Object(obj) => {
1029                                assert!(obj.contains_key("dept"));
1030                                assert!(obj.contains_key("count"));
1031                                assert!(obj.contains_key("avg_salary"));
1032                            }
1033                            _ => panic!("Expected object in result array"),
1034                        }
1035                    }
1036                }
1037                _ => panic!("Expected array result"),
1038            }
1039        }
1040
1041        #[test]
1042        fn test_example_085_query_from_file() {
1043            let query_path = "examples/example_085/query.dsq";
1044            if Path::new(query_path).exists() {
1045                let query = fs::read_to_string(query_path).unwrap();
1046
1047                // Test on mock data
1048                let data = create_employee_data();
1049                let result = execute_filter(&query, &data);
1050                assert!(
1051                    result.is_ok(),
1052                    "Failed to execute query from file on mock data: {:?}",
1053                    result.err()
1054                );
1055            } else {
1056                println!("Skipping query file test - query.dsq not found");
1057            }
1058        }
1059
1060        #[test]
1061        fn test_example_075_query_on_csv() {
1062            let query = r"map(.salary += 5000) | map({name, new_salary: .salary, department})";
1063
1064            let data = create_employee_data();
1065            let result = execute_filter(query, &data);
1066            assert!(
1067                result.is_ok(),
1068                "Failed to execute query on mock data: {:?}",
1069                result.err()
1070            );
1071
1072            let value = result.unwrap();
1073            // Verify the result structure
1074            match value {
1075                Value::Array(arr) => {
1076                    assert_eq!(arr.len(), 10); // 10 employees
1077                                               // Check that each item has name, new_salary, and department
1078                    for item in &arr {
1079                        match item {
1080                            Value::Object(obj) => {
1081                                assert!(obj.contains_key("name"));
1082                                assert!(obj.contains_key("new_salary"));
1083                                assert!(obj.contains_key("department"));
1084                                // Check that new_salary is salary + 5000
1085                                if let (
1086                                    Some(Value::String(name)),
1087                                    Some(Value::Int(new_salary)),
1088                                    Some(Value::String(_dept)),
1089                                ) = (
1090                                    obj.get("name"),
1091                                    obj.get("new_salary"),
1092                                    obj.get("department"),
1093                                ) {
1094                                    // Verify specific expected values from the mock data
1095                                    match name.as_str() {
1096                                        "Alice Johnson" => assert_eq!(*new_salary, 80000), // 75000 + 5000
1097                                        "Bob Smith" => assert_eq!(*new_salary, 87000), // 82000 + 5000
1098                                        "Carol Williams" => assert_eq!(*new_salary, 73000), // 68000 + 5000
1099                                        "David Brown" => assert_eq!(*new_salary, 100000), // 95000 + 5000
1100                                        "Eve Davis" => assert_eq!(*new_salary, 67000), // 62000 + 5000
1101                                        "Frank Miller" => assert_eq!(*new_salary, 93000), // 88000 + 5000
1102                                        "Grace Wilson" => assert_eq!(*new_salary, 76000), // 71000 + 5000
1103                                        "Henry Moore" => assert_eq!(*new_salary, 107000), // 102000 + 5000
1104                                        "Ivy Taylor" => assert_eq!(*new_salary, 70000), // 65000 + 5000
1105                                        "Jack Anderson" => assert_eq!(*new_salary, 84000), // 79000 + 5000
1106                                        _ => panic!("Unexpected employee name: {name}"),
1107                                    }
1108                                } else {
1109                                    panic!(
1110                                        "Expected string name, int new_salary, string department"
1111                                    );
1112                                }
1113                            }
1114                            _ => panic!("Expected object in result array"),
1115                        }
1116                    }
1117                }
1118                _ => panic!("Expected array result"),
1119            }
1120        }
1121
1122        #[test]
1123        #[ignore = "parser does not support field names with spaces"]
1124        fn test_csv_with_spaces_in_field_names() {
1125            // Test CSV data with spaces in field names
1126            let csv_data = r"id,US City Name,population,country
11271,New York,8500000,USA
11282,Los Angeles,4000000,USA
11293,London,9000000,UK
11304,Paris,2200000,France";
1131
1132            // Create a temporary file
1133            let mut temp_file = NamedTempFile::new().unwrap();
1134            temp_file.write_all(csv_data.as_bytes()).unwrap();
1135            let path = temp_file.path();
1136
1137            // Parse the CSV data
1138            let result = io::read_file_sync(path, &io::ReadOptions::default());
1139            assert!(
1140                result.is_ok(),
1141                "Failed to parse CSV with spaces in field names: {:?}",
1142                result.err()
1143            );
1144
1145            let value = result.unwrap();
1146            match value {
1147                Value::DataFrame(df) => {
1148                    // Check that the DataFrame has the correct columns
1149                    let column_names = df.get_column_names();
1150                    assert!(column_names.contains(&&PlSmallStr::from("id")));
1151                    assert!(column_names.contains(&&PlSmallStr::from("US City Name")));
1152                    assert!(column_names.contains(&&PlSmallStr::from("population")));
1153                    assert!(column_names.contains(&&PlSmallStr::from("country")));
1154
1155                    // Note: Bracket notation with spaces in field names is not yet supported
1156                    // This is a known limitation tracked in TODO.md
1157                    let _ = df; // Use df to avoid warning
1158                }
1159                _ => panic!("Expected DataFrame result"),
1160            }
1161        }
1162    }
1163}