polars_view/data_filter.rs
1use crate::{
2 Arguments, DEFAULT_OVERRIDE_REGEX, DEFAULT_QUERY, FileExtension, PathExtension,
3 PolarsViewError, PolarsViewResult, UniqueElements, sql_commands,
4};
5use egui::{
6 Align, CollapsingHeader, Color32, DragValue, Frame, Grid, Layout, Stroke, TextEdit, Ui, Vec2,
7};
8use polars::{io::RowIndex, prelude::*};
9use regex::Regex;
10use tokio::task::spawn_blocking;
11
12use std::{
13 fmt::Debug,
14 fs::File,
15 num::NonZero,
16 path::{Path, PathBuf},
17 sync::Arc,
18};
19
// --- Constants ---

/// Comma-separated list of values treated as null/missing during CSV parsing.
/// The raw string literal (`r#""#`) avoids escaping the embedded double quotes;
/// the default set is the empty quoted string (`""`) and `<N/D>`.
pub static NULL_VALUES: &str = r#""", <N/D>"#;

/// Default delimiter used for CSV parsing if not specified or detected.
/// Using `&'static str` for common, immutable delimiters saves memory allocation.
pub static DEFAULT_CSV_DELIMITER: &str = ";";

/// Default name for the row number column if added.
pub const DEFAULT_INDEX_COLUMN_NAME: &str = "Row Number";

/// Default regex used to select string columns for normalization
/// (matches column names starting with "Val").
const DEFAULT_NORM_REGEX: &str = "^Val.*$";

/// Default regex used to select columns to drop
/// (matches column names starting with "Temp").
const DEFAULT_DROP_REGEX: &str = "^Temp.*$";

/// Default starting offset for the row index column (e.g., 1 for 1-based).
const DEFAULT_INDEX_COLUMN_OFFSET: u32 = 1;

/// Default number of rows scanned for schema inference (CSV, JSON, NDJson).
const DEFAULT_INFER_SCHEMA_ROWS: usize = 200;

/// Upper bound on retry attempts, to prevent potential infinite loops
/// (e.g., when a schema keeps changing between attempts).
pub const MAX_ATTEMPTS: u32 = 1000;
46
47// --- DataFilter Struct ---
48
/// Holds configuration parameters related to **loading and querying** data.
///
/// This struct focuses on settings that define how data is initially read from a file
/// and transformed via SQL queries or basic processing like null column removal.
///
/// Instances are created from `Arguments`, updated by the UI in `render_query`, and passed
/// to `DataFrameContainer::load_data`. Changes here typically trigger a data reload/requery.
#[derive(Debug, Clone, PartialEq)] // PartialEq allows simple change detection (before/after UI render)
pub struct DataFilter {
    /// The canonical, absolute path to the data file.
    pub absolute_path: PathBuf,
    /// The name assigned to the loaded DataFrame for use in SQL queries.
    pub table_name: String,
    /// The character used to separate columns in a CSV file.
    /// May be overwritten by `get_df_and_extension` when delimiter auto-detection succeeds.
    pub csv_delimiter: String,
    /// When true, the next load should re-read the data from the file
    /// (set by `render_query` when delimiter or inference length change).
    pub read_data_from_file: bool,
    /// The schema (column names and data types) of the most recently loaded DataFrame.
    /// Used by `sql_commands` for generating relevant examples.
    pub schema: Arc<Schema>,
    /// Maximum rows to scan for schema inference (CSV, JSON, NDJson).
    pub infer_schema_rows: usize,
    /// Flag to control removal of all-null columns after loading/querying.
    pub exclude_null_cols: bool,
    /// Comma-separated string of values to interpret as nulls during CSV parsing.
    /// Parsed by `parse_null_values`.
    pub null_values: String,

    /// Regex patterns matching columns to force read as String type.
    ///
    /// List of column names to force reading as String, overriding inference.
    /// Useful for columns with large IDs/keys that look numeric.
    pub force_string_patterns: Option<String>,

    /// Flag indicating if the `query` should be executed during the next `load_data`.
    /// Set by `render_query` if relevant UI fields change or the Apply button is clicked.
    pub apply_sql: bool,
    /// The SQL query string entered by the user.
    pub query: String,

    // --- Index Column ---
    /// Flag indicating if a row index column should be added.
    pub add_row_index: bool,
    /// The desired name for the row index column (checked for uniqueness in `get_row_index`).
    pub index_column_name: String,
    /// The starting value for the row index column (e.g., 0 or 1).
    pub index_column_offset: u32,

    // --- Normalize Columns ---
    /// Flag indicating whether string columns will be normalized.
    pub normalize: bool,
    /// Regex pattern to select string columns for normalization.
    pub normalize_regex: String,

    // --- Drop Columns ---
    /// Flag indicating whether columns matching `drop_regex` should be removed.
    pub drop: bool,
    /// Regex pattern selecting column names to drop.
    pub drop_regex: String,
}
107
108impl Default for DataFilter {
109 /// Creates default `DataFilter` with sensible initial values.
110 fn default() -> Self {
111 DataFilter {
112 absolute_path: PathBuf::new(),
113 table_name: "AllData".to_string(),
114 csv_delimiter: DEFAULT_CSV_DELIMITER.to_string(),
115 read_data_from_file: true,
116 schema: Schema::default().into(),
117 infer_schema_rows: DEFAULT_INFER_SCHEMA_ROWS,
118 exclude_null_cols: false,
119 null_values: NULL_VALUES.to_string(),
120
121 force_string_patterns: DEFAULT_OVERRIDE_REGEX.map(ToString::to_string),
122
123 apply_sql: false,
124 query: DEFAULT_QUERY.to_string(),
125
126 // --- NEW DEFAULTS ---
127 add_row_index: false, // Default to false
128 index_column_name: DEFAULT_INDEX_COLUMN_NAME.to_string(),
129 index_column_offset: DEFAULT_INDEX_COLUMN_OFFSET,
130 // --- END NEW DEFAULTS ---
131
132 // --- NEW FIELDS for Normalize Columns ---
133 normalize: false,
134 normalize_regex: DEFAULT_NORM_REGEX.to_string(),
135 // --- END NEW FIELDS ---
136 drop: false,
137 drop_regex: DEFAULT_DROP_REGEX.to_string(),
138 }
139 }
140}
141
142// --- Methods ---
143
144impl DataFilter {
145 /// Creates a new `DataFilter` instance configured from command-line `Arguments`.
146 /// This is typically called once at application startup in `main.rs`.
147 ///
148 /// ### Arguments
149 /// * `args`: Parsed command-line arguments (`crate::Arguments`).
150 ///
151 /// ### Returns
152 /// A `PolarsViewResult` containing the configured `DataFilter` or an error
153 /// (e.g., if the path cannot be canonicalized).
154 pub fn new(args: &Arguments) -> PolarsViewResult<Self> {
155 // Ensure the path exists and get its absolute, canonical form.
156 let absolute_path = args.path.canonicalize()?;
157
158 // Determine apply_sql state from the CLI argument
159 let apply_sql = args.query.is_some();
160 let query = args
161 .query
162 .clone()
163 .unwrap_or_else(|| DEFAULT_QUERY.to_string()); // Use CLI arg or default
164
165 // Determine normalization state from the CLI argument
166 let normalize = args.regex.is_some();
167 let normalize_regex = args
168 .regex
169 .clone()
170 .unwrap_or_else(|| DEFAULT_NORM_REGEX.to_string()); // Use CLI arg or default
171
172 // Use or_else: takes a closure executed only if the first option is None.
173 // This avoids the .to_string() for the default unless actually needed.
174 let force_string_patterns = args
175 .force_string_patterns // This is Option<String>
176 .clone() // Clone the Option<String> from args if needed later, otherwise maybe take ownership
177 .or(DEFAULT_OVERRIDE_REGEX.map(ToString::to_string)); // Use CLI arg or default
178
179 Ok(DataFilter {
180 absolute_path,
181 table_name: args.table_name.clone(),
182 csv_delimiter: args.delimiter.clone(),
183
184 apply_sql, // Directly set based on CLI argument presence
185 query, // Directly set based on CLI argument value (or default)
186
187 exclude_null_cols: args.exclude_null_cols,
188 null_values: args.null_values.clone(), // Use user-provided nulls.
189
190 force_string_patterns,
191
192 normalize, // Directly set based on CLI argument presence
193 normalize_regex, // Directly set based on CLI argument value (or default)
194 ..Default::default() // Use defaults for `schema`, `infer_schema_rows`.
195 })
196 }
197
198 /// Sets the data source path, canonicalizing it.
199 pub fn set_path(&mut self, path: &Path) -> PolarsViewResult<()> {
200 self.absolute_path = path.canonicalize()?;
201 tracing::debug!("absolute_path set to: {:#?}", self.absolute_path);
202 Ok(())
203 }
204
205 /// Gets the file extension from `absolute_path` in lowercase.
206 pub fn get_extension(&self) -> Option<String> {
207 self.absolute_path.extension_as_lowercase()
208 }
209
210 /// Determines the configuration for an optional row index column by resolving a unique name
211 /// against the provided schema.
212 ///
213 /// If `self.add_row_index` is true, this method finds a unique name based on
214 /// `self.index_column_name` and the provided `schema`, returning a `Some(RowIndex)`.
215 /// If the name resolution fails, it returns the specific PolarsError.
216 /// If `self.add_row_index` is false, it returns `Ok(None)`.
217 ///
218 /// ### Arguments
219 /// * `schema`: The schema against which the index column name should be checked for uniqueness.
220 /// This should be the schema of the DataFrame *before* adding the index column.
221 ///
222 /// ### Returns
223 /// `PolarsResult<Option<RowIndex>>`: Ok(Some) if config is resolved, Ok(None) if disabled, Err if resolution fails.
224 pub fn get_row_index(&self, schema: &Schema) -> PolarsResult<Option<RowIndex>> {
225 // Check the main feature flag
226 if !self.add_row_index {
227 tracing::trace!("Row index addition disabled in filter.");
228 return Ok(None); // Feature disabled, return None config
229 }
230
231 // Feature is enabled. Resolve a unique name using the helper.
232 let unique_name = resolve_unique_column_name(
233 // Use the helper function
234 &self.index_column_name, // Base name
235 schema, // Schema to check uniqueness against
236 )?; // Propagate potential error from unique name resolution
237
238 // If we successfully resolved a unique name, return the full RowIndex config
239 let index_offset = self.index_column_offset;
240 Ok(Some(RowIndex {
241 name: unique_name,
242 offset: index_offset,
243 }))
244 }
245
246 /// Determines the `FileExtension` and orchestrates loading the DataFrame using the appropriate Polars reader.
247 /// This method centralizes the file-type-specific loading logic. Called by `DataFrameContainer::load_data`.
248 ///
249 /// **Important:** It mutates `self` by potentially updating `csv_delimiter` if automatic
250 /// detection during `read_csv_data` finds a different working delimiter than initially configured.
251 ///
252 /// ### Returns
253 /// A `PolarsViewResult` containing a tuple: `(DataFrame, FileExtension)` on success,
254 /// or a `PolarsViewError` (e.g., `FileType`, `CsvParsing`) on failure.
255 pub async fn get_df_and_extension(&mut self) -> PolarsViewResult<(DataFrame, FileExtension)> {
256 // Determine the file extension type using the helper from `extension.rs`.
257 let extension = FileExtension::from_path(&self.absolute_path);
258
259 // Match on the determined extension to call the correct reader function.
260 let (df, detected_delimiter) = match &extension {
261 FileExtension::Csv => self.read_csv_data().await?,
262 FileExtension::Json => self.read_json_data().await?,
263 FileExtension::NDJson => self.read_ndjson_data().await?,
264 FileExtension::Parquet => self.read_parquet_data().await?,
265 // Handle unsupported or missing extensions with specific errors.
266 FileExtension::Unknown(ext) => {
267 return Err(PolarsViewError::FileType(format!(
268 "Unsupported extension: `{}` for file: `{}`",
269 ext,
270 self.absolute_path.display()
271 )));
272 }
273 FileExtension::Missing => {
274 return Err(PolarsViewError::FileType(format!(
275 "Missing extension for file: `{}`",
276 self.absolute_path.display()
277 )));
278 }
279 };
280
281 // If reading a CSV successfully detected a working delimiter, update the filters state.
282 // This ensures the UI reflects the delimiter actually used.
283 if let Some(byte) = detected_delimiter {
284 self.csv_delimiter = (byte as char).to_string();
285 }
286
287 tracing::debug!(
288 "fn get_df_and_extension(): Successfully loaded DataFrame with extension: {:?}",
289 extension
290 );
291
292 Ok((df, extension)) // Return the loaded DataFrame and the detected extension.
293 }
294
295 // --- Data Reading Helper Methods ---
296
297 /// Reads a standard JSON file into a Polars DataFrame.
298 /// Configures the reader using settings from `self` (e.g., `infer_schema_rows`).
299 ///
300 /// ### Returns
301 /// A `PolarsViewResult` containing `(DataFrame, None)` (delimiter is not applicable to JSON).
302 async fn read_json_data(&self) -> PolarsViewResult<(DataFrame, Option<u8>)> {
303 tracing::debug!("Reading JSON data from: {}", self.absolute_path.display());
304 let file = File::open(&self.absolute_path)?;
305 let infer_schema_rows_for_task = self.infer_schema_rows;
306
307 // Execute the blocking read operation on a separate thread
308 let df = execute_polars_blocking(move || {
309 JsonReader::new(file)
310 .infer_schema_len(NonZero::new(infer_schema_rows_for_task))
311 .finish()
312 })
313 .await?;
314
315 tracing::debug!("JSON read complete. Shape: {:?}", df.shape());
316
317 Ok((df, None))
318 }
319
320 /// Reads a Newline-Delimited JSON (NDJson / JSON Lines) file into a Polars DataFrame.
321 /// Uses `LazyJsonLineReader` for potentially better performance/memory usage on large files.
322 ///
323 /// ### Returns
324 /// A `PolarsViewResult` containing `(DataFrame, None)`.
325 async fn read_ndjson_data(&self) -> PolarsViewResult<(DataFrame, Option<u8>)> {
326 tracing::debug!("Reading NDJSON data from: {}", self.absolute_path.display());
327
328 // Clone data from self needed for the task closure.
329 let path_buf_for_task = PlPath::Local(self.absolute_path.clone().into());
330 let infer_schema_rows_for_task = self.infer_schema_rows;
331
332 // *** Use the helper function ***
333 let df = execute_polars_blocking(move || {
334 // 'move' captures path_buf_for_task, infer_schema_rows_for_task
335 // This code runs on the blocking thread.
336 let lazyframe = LazyJsonLineReader::new(path_buf_for_task) // Use cloned path
337 .low_memory(false) // Option to optimize for memory.
338 .with_infer_schema_length(NonZero::new(infer_schema_rows_for_task))
339 .with_ignore_errors(true)
340 .finish()?; // Returns PolarsResult<LazyFrame> (this finish() isn't the main blocking part)
341
342 // Collect the lazy frame - THIS IS THE BLOCKING PART
343 lazyframe.with_new_streaming(true).collect() // Returns PolarsResult<DataFrame>
344 })
345 .await?; // await the helper function
346
347 tracing::debug!("NDJSON read complete. Shape: {:?}", df.shape());
348 Ok((df, None))
349 }
350
351 /// Reads an Apache Parquet file into a Polars DataFrame.
352 ///
353 /// ### Returns
354 /// A `PolarsViewResult` containing `(DataFrame, None)`.
355 async fn read_parquet_data(&self) -> PolarsViewResult<(DataFrame, Option<u8>)> {
356 tracing::debug!(
357 "Reading Parquet data from: {}",
358 self.absolute_path.display()
359 );
360
361 // Clone data from self needed for the task closure.
362 let path_buf_for_task = PlPath::Local(self.absolute_path.clone().into());
363 let args = ScanArgsParquet {
364 // ScanArgsParquet should be Send
365 low_memory: false, // Configure scan arguments as needed.
366 ..Default::default()
367 };
368
369 let df = execute_polars_blocking(move || {
370 // Use `LazyFrame::scan_parquet` for efficient scanning.
371 let lazyframe = LazyFrame::scan_parquet(path_buf_for_task, args)?; // Returns PolarsResult<LazyFrame>
372
373 // Collect into an eager DataFrame - THIS IS THE BLOCKING/COMPUTE PART.
374 lazyframe.with_new_streaming(true).collect() // Returns PolarsResult<DataFrame>
375 })
376 .await?; // await the helper function
377
378 tracing::debug!("Parquet read complete. Shape: {:?}", df.shape());
379
380 Ok((df, None))
381 }
382
383 /// Reads a CSV file, attempting automatic delimiter detection if the initial one fails.
384 /// Iterates through common delimiters and tries reading a small chunk first for efficiency.
385 ///
386 /// ### Returns
387 /// A `PolarsViewResult` containing `(DataFrame, Option<u8>)` where `Option<u8>` is the
388 /// *successfully used* delimiter byte. Returns `Err(PolarsViewError::CsvParsing)` if
389 /// no common delimiter works.
390 async fn read_csv_data(&self) -> PolarsViewResult<(DataFrame, Option<u8>)> {
391 // Get the currently configured separator byte. Error if invalid (e.g., empty string).
392 let initial_separator = self.get_csv_separator()?;
393
394 // List of common delimiters to try, starting with the configured one.
395 let mut delimiters_to_try = vec![initial_separator, b',', b';', b'|', b'\t', b':'];
396 // Remove duplicates if the initial separator is already in the common list.
397 delimiters_to_try.unique();
398 tracing::debug!(
399 "Attempting CSV read. Delimiters to try: {:?}",
400 delimiters_to_try
401 .iter()
402 .map(|&b| b as char)
403 .collect::<Vec<_>>()
404 );
405
406 // Look at the next element of the iterator without consuming it.
407 let mut iterator = delimiters_to_try.iter().peekable();
408
409 // Iterate through the potential delimiters.
410 while let Some(&delimiter) = iterator.next() {
411 // If peek() returns None, it means the current item was the last one
412 let is_last_element = iterator.peek().is_none();
413
414 // 1. Quick Check: Try reading only a small number of rows (NROWS_CHECK).
415 // This fails fast if the delimiter is fundamentally wrong (e.g., results in 1 column).
416 if let Ok(schema) = self
417 .attempt_csv_parse_structure(delimiter, is_last_element)
418 .await
419 {
420 // 2. Full Read: If the quick check passed, attempt to read the entire file.
421 tracing::debug!(
422 "Trying to read full CSV file with delimiter: '{}'",
423 delimiter as char
424 );
425 match self.attempt_read_csv(delimiter, &schema).await {
426 Ok(lazyframe) => {
427 // Success! Return the DataFrame and the delimiter that worked.
428 tracing::info!(
429 "Successfully read CSV with delimiter: '{}'",
430 delimiter as char
431 );
432
433 // Execute the lazy plan and collect into an eager DataFrame on a blocking thread
434 let df = execute_polars_blocking(move || {
435 lazyframe.with_new_streaming(true).collect()
436 })
437 .await?;
438
439 tracing::debug!("Data collection complete. Shape: {:?}", df.shape());
440 return Ok((df, Some(delimiter)));
441 }
442 Err(e) => {
443 // Full read failed even after quick check passed. Log and try next delimiter.
444 tracing::warn!(
445 "Full CSV read failed with delimiter '{}' after quick check passed: {}",
446 delimiter as char,
447 e
448 );
449 continue; // Try the next delimiter.
450 }
451 }
452 }
453 // If quick check fails, implicitly try the next delimiter.
454 }
455
456 // If all delimiters failed, return a parsing error.
457 let msg = format!(
458 "Failed to read CSV '{}' with common delimiters. Check format or specify delimiter.",
459 self.absolute_path.display()
460 );
461 let error = PolarsViewError::CsvParsing(msg);
462 tracing::error!("{}", error);
463 Err(error)
464 }
465
466 /// Retrieves the CSV separator byte from the `csv_delimiter` String configuration.
467 ///
468 /// ### Returns
469 /// `Ok(u8)` containing the first byte, or `Err(PolarsViewError::InvalidDelimiter)`
470 /// if the string is empty or contains multi-byte characters (only first byte is used).
471 fn get_csv_separator(&self) -> PolarsViewResult<u8> {
472 self.csv_delimiter
473 .as_bytes() // Convert String to byte slice.
474 .first() // Get the first byte.
475 .copied() // Copy the byte out of the Option<&u8>.
476 // Map `None` (empty string) to an InvalidDelimiter error.
477 .ok_or_else(|| PolarsViewError::InvalidDelimiter(self.csv_delimiter.clone()))
478 }
479
    /// Attempts to parse the CSV structure from the initial chunk of the file
    /// using a specific delimiter, and validates the resulting column count.
    ///
    /// ### Arguments
    /// * `delimiter`: candidate separator byte to probe with.
    /// * `is_last_element`: when true (last candidate in the detection loop),
    ///   the width validation is skipped so the best-effort result is accepted.
    ///
    /// ### Returns
    /// The schema of the probed chunk on success; `PolarsViewError::CsvParsing`
    /// when the width suggests the delimiter is wrong.
    async fn attempt_csv_parse_structure(
        &self,
        delimiter: u8,
        is_last_element: bool,
    ) -> PolarsViewResult<Arc<Schema>> {
        // Number of data rows to read after the header for this quick probe.
        // 100 is a heuristic: enough context without reading the whole file.
        const ROW_LIMIT: usize = 100;

        tracing::debug!(
            "Trying to parse CSV with delimiter: '{}'",
            delimiter as char,
        );

        let file_path = &self.absolute_path;

        // Partial read of the file with the candidate delimiter.
        let data_frame = read_csv_partial_from_path(delimiter, ROW_LIMIT, file_path).await?;

        // Width validation: a wrong delimiter typically collapses everything
        // into a single column. This check drives the detection loop in
        // `read_csv_data`.
        // NOTE(review): the threshold is raised to 2 when `add_row_index` is set,
        // yet the index column is added *after* loading — confirm this is intended.
        let min_expected_cols_on_success = if self.add_row_index { 2 } else { 1 }; // Assumes index is added *later*

        if data_frame.width() <= min_expected_cols_on_success && !is_last_element {
            tracing::warn!(
                "CSV read with delimiter '{}' resulted in {} columns (expected > {}). Assuming incorrect delimiter.",
                delimiter as char,
                data_frame.width(),
                min_expected_cols_on_success
            );
            // Signal a likely delimiter mismatch so the caller tries the next candidate.
            return Err(PolarsViewError::CsvParsing(format!(
                "Delimiter '{}' likely incorrect (resulted in {} columns)",
                delimiter as char,
                data_frame.width()
            )));
        }

        tracing::debug!(
            "CSV read successful with delimiter '{}'. Final shape (rows, columns): {:?}",
            delimiter as char,
            data_frame.shape()
        );

        Ok(data_frame.schema().clone())
    }
530
    /// Configures and finalizes a `LazyCsvReader` for the full-file read using
    /// the given delimiter and the schema obtained from the structural probe.
    ///
    /// ### Arguments
    /// * `delimiter`: separator byte that passed the structural probe.
    /// * `previous_scheme`: schema from the probe, used to build dtype overrides.
    ///
    /// ### Returns
    /// A configured `LazyFrame` (not yet collected), or an error from regex
    /// compilation / reader configuration.
    async fn attempt_read_csv(
        &self,
        delimiter: u8,
        previous_scheme: &Arc<Schema>,
    ) -> PolarsViewResult<LazyFrame> {
        tracing::debug!(
            "Attempting CSV read with delimiter: '{}'",
            delimiter as char,
        );

        /*
        Why dtype overrides are needed:

        When LazyCsvReader infers data types (with_infer_schema_length()), it looks at the first N rows.
        If a problematic column contains only digits in those initial rows,
        Polars will likely infer a numeric type (like Int64, UInt64, or Float64).

        However, a standard 64-bit integer or float cannot represent an arbitrarily long sequence of digits (like a 44-digit key).
        When the reader later encounters these huge numbers and tries to parse them into the inferred numeric type, the parsing fails.
        With with_ignore_errors(true), instead of stopping, Polars replaces the unparseable value with null.
        If all values in that column exceed the capacity of the inferred numeric type, the entire column becomes null, appearing "empty".

        Solution: force matching columns to String via .with_dtype_overwrite(dtypes_opt).
        */

        let mut dtypes_opt: Option<Arc<Schema>> = None;

        if let Some(force_string_patterns) = &self.force_string_patterns {
            // Build dtype overrides from the probed headers and the configured
            // regex patterns; regex compilation errors propagate to the caller.
            let override_schema = build_dtype_override_schema(
                previous_scheme,
                force_string_patterns,
            )?;

            // Only install overrides when at least one column matched.
            if !override_schema.is_empty() {
                dtypes_opt = Some(Arc::new(override_schema));
            };
        }

        let plpath = PlPath::Local(self.absolute_path.clone().into());

        // Configure the LazyCsvReader from `self`. Null replacement and row-index
        // addition are deliberately deferred to later processing steps
        // (see replace_values_with_null() / add_row_index_column()).
        let lazyframe = LazyCsvReader::new(plpath)
            .with_low_memory(false) // Can be set to true for lower memory usage at cost of speed.
            .with_encoding(CsvEncoding::LossyUtf8) // Gracefully handle potential encoding errors.
            .with_has_header(true) // Assume a header row.
            .with_try_parse_dates(true) // Attempt automatic date parsing.
            .with_separator(delimiter) // Use the specified delimiter.
            .with_infer_schema_length(Some(self.infer_schema_rows)) // Use filter setting for inference.
            .with_dtype_overwrite(dtypes_opt) // Force-String overrides built above.
            .with_ignore_errors(true) // Rows with parsing errors become nulls instead of stopping the read.
            .with_missing_is_null(true) // Treat missing fields as null.
            .with_null_values(None) // Nulls handled later by fn replace_values_with_null()
            .with_n_rows(None) // No row limit for the full read.
            .with_decimal_comma(false) // Files using ',' as decimal separator are normalized later.
            .with_row_index(None) // Index added later by fn add_row_index_column()
            .with_rechunk(true) // Rechunk the memory to contiguous chunks when parsing is done.
            .finish()?; // Finalize configuration and create the LazyFrame.

        Ok(lazyframe)
    }
593
594 /// Parses the comma-separated `null_values` string into a `Vec<&str>`,
595 /// removing surrounding double quotes if present.
596 ///
597 /// Logic:
598 /// 1. Splits the input string (`self.null_values`) by commas.
599 /// 2. Iterates through each resulting substring (`s`).
600 /// 3. For each substring:
601 /// a. Trims leading and trailing whitespace.
602 /// b. Checks if the `trimmed` string has at least 2 characters AND starts with `"` AND ends with `"`.
603 /// c. If true, returns a slice (`&str`) representing the content *between* the quotes.
604 /// Example: `"\"\""` becomes `""`, `" N/A "` becomes `"N/A"`, `" " "` becomes `" "`.
605 /// d. If false (no surrounding quotes), returns a slice (`&str`) of the `trimmed` string itself.
606 /// Example: `<N/D>` remains `<N/D>`, ` NA ` becomes `NA`.
607 /// 4. Collects all the resulting string slices into a `Vec<&str>`.
608 ///
609 /// Example Input: `"\"\", \" \", <N/D>, NA "`
610 /// Example Output: `vec!["", " ", "<N/D>", "NA"]`
611 pub fn parse_null_values(&self) -> Vec<&str> {
612 self.null_values
613 .split(',') // 1. Split the string by commas.
614 .map(|s| {
615 // For each part resulting from the split:
616 // 3a. Trim leading/trailing whitespace.
617 let trimmed = s.trim();
618 // 3b. Check if it's quoted (length >= 2, starts/ends with ").
619 if trimmed.len() >= 2 && trimmed.starts_with('"') && trimmed.ends_with('"') {
620 // 3c. If quoted, return the slice between the quotes.
621 trimmed[1..trimmed.len() - 1].trim()
622 } else {
623 // 3d. If not quoted, return the trimmed slice directly.
624 trimmed
625 }
626 })
627 .collect() // 4. Collect the processed slices into a vector.
628 }
629
630 // --- UI Rendering Methods ---
631
    /// Renders the UI widgets for configuring data filters within the "Query" collapsing header.
    /// This function is called by `layout.rs::render_side_panel`.
    ///
    /// **Crucially, it takes `&mut self`. Widgets modify `self` directly.**
    /// It compares the state of `self` *before* and *after* rendering the widgets.
    /// If any change occurred (user typed in a field, clicked a checkbox), `self.apply_sql`
    /// is set so the SQL query is re-applied upon reload.
    ///
    /// The `layout.rs` code uses the return value:
    /// - `Some(new_filters)` triggers an asynchronous `DataFrameContainer::load_data` call.
    /// - `None` means no action is taken this frame.
    ///
    /// Note: `Some` is returned only when the "Apply SQL commands" button is clicked
    /// while `apply_sql` is set — merely editing a field marks the state dirty but
    /// does not return `Some` by itself.
    ///
    /// ### Arguments
    /// * `ui`: The `egui::Ui` context for drawing the widgets.
    ///
    /// ### Returns
    /// * `Some(DataFilter)`: the modified filter state, when the user applied changes.
    /// * `None`: otherwise.
    pub fn render_query(&mut self, ui: &mut Ui) -> Option<DataFilter> {
        // Snapshot the state *before* rendering so changes can be detected afterwards.
        let filters_before_render = self.clone();
        let mut result = None;

        let width_min = 450.0; // Minimum width for the grid area.

        // Two-column grid: label on the left, input widget on the right.
        let grid = Grid::new("data_query_grid")
            .num_columns(2)
            .spacing([10.0, 20.0]) // Horizontal and vertical spacing.
            .striped(true); // Alternating row backgrounds.

        // Allocate UI space for the grid.
        ui.allocate_ui_with_layout(
            Vec2::new(ui.available_width(), ui.available_height()), // Occupy available width.
            Layout::top_down(Align::LEFT),
            |ui| {
                grid.show(ui, |ui| {
                    ui.set_min_width(width_min);

                    // --- Render Individual Filter Widgets ---
                    // Each `render_*` method takes `&mut self` and mutates it in place.

                    self.render_add_row_number(ui);

                    self.render_exclude_null_cols(ui);

                    self.render_exclude_columns(ui);

                    self.render_normalize_numbers(ui);

                    self.render_null_values(ui);

                    // Schema inference length applies only to formats that infer schemas.
                    if matches!(
                        self.get_extension().as_deref(), // Extension as &str
                        Some("csv" | "json" | "ndjson") // Formats with schema inference
                    ) {
                        self.render_schema_length_input(ui);
                    }

                    // CSV-specific settings: delimiter.
                    if self.get_extension().as_deref() == Some("csv") {
                        self.render_csv_delimiter(ui);
                    }

                    // Input for table name used in SQL.
                    self.render_table_name_input(ui);

                    // Multiline input for the SQL query.
                    self.render_sql_query_input(ui);

                    // --- Change Detection & Apply Button ---

                    // Any difference vs. the pre-render snapshot marks SQL for re-application.
                    if *self != filters_before_render {
                        self.apply_sql = true;
                        tracing::debug!("Change detected in DataFilter UI.");
                    }

                    // Delimiter or inference-length changes require re-reading the file itself.
                    if (self.csv_delimiter != filters_before_render.csv_delimiter)
                        || (self.infer_schema_rows != filters_before_render.infer_schema_rows)
                    {
                        self.read_data_from_file = true;
                    }

                    // "Apply SQL commands" button: returns the edited filter state when clicked.
                    ui.label(""); // Empty left cell, for alignment.
                    ui.with_layout(Layout::top_down(Align::Center), |ui| {
                        if ui.button("Apply SQL commands").clicked() {
                            if self.apply_sql {
                                // Result contains the DataFilter after the user's edits.
                                result = Some(self.clone());
                            }

                            tracing::debug!("Apply SQL commands: {}", self.apply_sql);
                        }
                    });
                    ui.end_row();
                }); // End grid.show
            }, // End allocate_ui_with_layout
        ); // End allocation

        // Display the SQL examples section (collapsible), below the grid.
        self.render_sql_examples(ui);

        result // Return the potentially updated filters.
    }
743
744 // --- Helper Rendering Methods ---
745
746 fn render_add_row_number(&mut self, ui: &mut Ui) {
747 // --- Row 1: Feature Checkbox ---
748 ui.label("Add Row Number:");
749 // The checkbox directly modifies `self.add_row_index`
750 ui.checkbox(&mut self.add_row_index, "")
751 .on_hover_text("Add a new column that counts the rows (first column).");
752 ui.end_row();
753
754 // --- Conditional Configuration Inputs ---
755 // These rows are only added to the grid if the checkbox is checked.
756 if self.add_row_index {
757 // --- Index Name Input ---
758 // Use simple indentation in the label for visual structure
759 ui.label("\tName:");
760 let name_edit =
761 TextEdit::singleline(&mut self.index_column_name).desired_width(f32::INFINITY); // Use available width in the grid cell
762 ui.add(name_edit)
763 .on_hover_text("Name for the new index column (uniqueness checked later).");
764 ui.end_row();
765
766 // --- Index Offset Input ---
767 // Use simple indentation
768 ui.label("\tOffset:");
769 let offset_drag = DragValue::new(&mut self.index_column_offset)
770 .speed(1) // Increment by 1
771 .range(0..=u32::MAX); // Allow 0-based or 1-based commonly
772 ui.add(offset_drag)
773 .on_hover_text("Starting value for the index (e.g., 0 or 1).");
774 ui.end_row();
775 }
776 // No 'else' needed. If add_row_index is false, these rows are simply skipped.
777 }
778
779 /// Renders the checkbox for the "Remove Null Cols" option.
780 /// Modifies `self.exclude_null_cols` directly.
781 fn render_exclude_null_cols(&mut self, ui: &mut Ui) {
782 ui.label("Exclude Null Cols:");
783 ui.checkbox(&mut self.exclude_null_cols, "")
784 .on_hover_text("Remove columns containing only null values.");
785 ui.end_row();
786 }
787
    /// Renders the "Remove Columns" checkbox and, when enabled, the regex input
    /// used to select column names to drop. Mutates `self.drop` / `self.drop_regex`.
    fn render_exclude_columns(&mut self, ui: &mut Ui) {
        // Feature toggle row.
        ui.label("Remove Columns:");
        ui.checkbox(&mut self.drop, "")
            .on_hover_text("Remove columns whose names match the specified regex pattern.");
        ui.end_row();

        // The regex input row is only added to the grid while the feature is on.
        if self.drop {
            // Regex input (tab in the label gives a simple visual indent).
            ui.label("\tRegex:");
            let name_edit = TextEdit::singleline(&mut self.drop_regex).desired_width(f32::INFINITY); // Use available width in the grid cell
            ui.add(name_edit).on_hover_text(
                "Enter the regex pattern to identify columns to drop by name.\n\n\
                Format Requirements:\n\
                - Use `*` to drop ALL columns.\n\
                - Use `^YourPattern$` to match the entire column name.\n \
                (Must start with `^` and end with `$`).\n\n\
                Regex Examples:\n\
                - `^Temp.*$` (Matches columns starting with 'Temp')\n\
                - `^Value B$` (Matches the exact column named 'Value B')\n\
                - `^(ID|Key|Index)$` (Matches 'ID', 'Key', or 'Index' exactly)\n\
                - `^.*_OLD$` (Matches columns ending with '_OLD')\n\n\
                (Invalid regex syntax or format will cause errors.)",
            );
            ui.end_row();
        }
    }
818
819 fn render_normalize_numbers(&mut self, ui: &mut Ui) {
820 // --- Row 1: Feature Checkbox ---
821 ui.label("Normalize Columns:");
822 ui.checkbox(&mut self.normalize, "").on_hover_text(
823 "Normalize Euro-style number strings in selected column names (via regex) to Float64.\n\
824 Example: '1.234,56' (String) to '1234.56' (Float64).",
825 );
826 ui.end_row();
827
828 // --- Conditional Configuration Inputs ---
829 // These rows are only added to the grid if the checkbox is checked.
830 if self.normalize {
831 // --- Regex Input ---
832 // Use simple indentation in the label for visual structure
833 ui.label("\tRegex:");
834 let name_edit =
835 TextEdit::singleline(&mut self.normalize_regex).desired_width(f32::INFINITY); // Use available width in the grid cell
836 ui.add(name_edit).on_hover_text(
837 r#"
838Enter a regex pattern to select String columns by name.
839
840Rules:
841- Use '*' for ALL String columns (caution!).
842- Use '^PATTERN$' for specific names (matches entire name).
843
844Example Columns:
845Row Number, Value1, Value2, ValueA, Valor, Total, SubTotal, Last Info
846
847Example Patterns:
8481. To select 'Value1', 'Value2':
849 ^Value\d$
850
8512. To select 'Value1', 'Value2', 'ValueA':
852 ^Value.*$
853
8543. To select 'Value1', 'Value2', 'ValueA', 'Valor':
855 ^Val.*$
856
8574. To select 'Value1', 'Value2', 'ValueA', 'Valor', 'Total', 'SubTotal':
858 ^(Val|.*Total).*$
859
8605. To select only 'Last Info' (note the space):
861 ^Last Info$
862
863(Applies only to columns that Polars identifies as String type.)"#,
864 );
865 ui.end_row();
866 }
867 }
868
869 /// Renders the `TextEdit` widget for specifying custom null values.
870 /// Modifies `self.null_values` directly based on user input.
871 fn render_null_values(&mut self, ui: &mut Ui) {
872 // Null Values Input Label
873 ui.label("Null Values:");
874
875 // Single-line text edit widget bound to the `self.null_values` string.
876 let null_values_edit =
877 TextEdit::singleline(&mut self.null_values).desired_width(f32::INFINITY); // Take available horizontal space.
878
879 // Add the widget to the UI and set its hover text.
880 ui.add(null_values_edit).on_hover_text(
881 "Comma-separated values to interpret as null during loading.\n\
882 Leading/trailing whitespace for each value is automatically trimmed.",
883 );
884
885 // End the row in the parent Grid layout.
886 ui.end_row();
887 }
888
889 /// Renders the `DragValue` widget for setting `infer_schema_rows`.
890 /// Modifies `self.infer_schema_rows` directly.
891 fn render_schema_length_input(&mut self, ui: &mut Ui) {
892 ui.label("Infer Rows:");
893 ui.add(
894 DragValue::new(&mut self.infer_schema_rows)
895 .speed(1) // Increment/decrement speed.
896 .range(0..=usize::MAX), // 0: No inference
897 )
898 .on_hover_text(
899 "Number of rows to scan for inferring data types (CSV/JSON)\n0: No inference",
900 );
901 ui.end_row();
902 }
903
904 /// Renders the `TextEdit` widgets for CSV-specific settings: delimiter.
905 /// Modifies `self.csv_delimiter` directly.
906 fn render_csv_delimiter(&mut self, ui: &mut Ui) {
907 // CSV Delimiter Input
908 ui.label("CSV Delimiter:");
909 let csv_delimiter_edit = TextEdit::singleline(&mut self.csv_delimiter)
910 .char_limit(1) // Restrict to a single character.
911 .desired_width(f32::INFINITY);
912 ui.add(csv_delimiter_edit)
913 .on_hover_text("Enter the single character CSV delimiter");
914 ui.end_row();
915 }
916
917 /// Renders the `TextEdit` widget for the SQL table name.
918 /// Modifies `self.table_name` directly.
919 fn render_table_name_input(&mut self, ui: &mut Ui) {
920 ui.label("SQL Table Name:");
921 let table_name_edit =
922 TextEdit::singleline(&mut self.table_name).desired_width(f32::INFINITY);
923 ui.add(table_name_edit)
924 .on_hover_text("Name of the table to use in SQL queries (e.g., FROM TableName)");
925 ui.end_row();
926 }
927
928 /*
929 /// Renders the multiline `TextEdit` widget for the SQL query.
930 /// Modifies `self.query` directly.
931 fn render_sql_query_input(&mut self, ui: &mut Ui) {
932 ui.label("SQL Query:");
933 let query_edit = TextEdit::multiline(&mut self.query)
934 .desired_width(f32::INFINITY)
935 // Set a reasonable initial height for the multiline input.
936 .desired_rows(4);
937 ui.add(query_edit)
938 .on_hover_text("Enter SQL query to filter/transform data (uses Polars SQL syntax)");
939 ui.end_row();
940 }
941 */
942
    /// Renders tabbed SQL examples and the editable query input `self.query`.
    /// Handles selecting examples and editing the query. Tabs will wrap if needed.
    ///
    /// ### Logic
    /// 1. Generate SQL examples via `sql_commands` using `self.schema`.
    /// 2. Manage the selected tab index using `egui::Memory` (persisted per widget id).
    /// 3. Render **wrapping horizontal tabs** for examples using `ui.horizontal_wrapped`.
    /// 4. On tab click: update index, copy the example text into `self.query`.
    /// 5. Render a multiline `TextEdit` bound to `&mut self.query`.
    ///
    /// Note: Actual *triggering* of reload happens in `render_query` based on overall
    /// state change detection or Apply click.
    fn render_sql_query_input(&mut self, ui: &mut Ui) {
        ui.label("SQL Query:"); // Label for the whole section
        ui.vertical(|ui| {
            // Group the examples and editor vertically.
            // Keep the group from collapsing below a usable width.
            ui.set_min_width(300.0);

            // 1. Generate examples based on the current schema.
            let examples = sql_commands(&self.schema);
            if examples.is_empty() {
                // No schema/examples available: render only the editor (taller,
                // since there are no tabs) and return early.
                ui.add(
                    TextEdit::multiline(&mut self.query)
                        .desired_width(f32::INFINITY)
                        .desired_rows(8) // Slightly more rows if no examples
                        .font(egui::TextStyle::Monospace),
                );
                return; // Skip rendering examples if none exist
            }

            // 2. Fetch the selected tab index from egui Memory so the selection
            // persists across frames (keyed by this widget's id).
            let tab_id = ui.id().with("sql_query_tab_index");
            let mut selected_tab_index =
                ui.memory_mut(|mem| *mem.data.get_persisted_mut_or_default::<usize>(tab_id));

            // Clamp the stored index in case the number of examples shrank since
            // it was persisted. NOTE(review): the clamped value is only written
            // back to Memory on the next tab click (step 4), never here.
            selected_tab_index = selected_tab_index.min(examples.len().saturating_sub(1));

            // 3. Render tabs, wrapping onto new lines as needed.
            ui.separator();
            ui.label("Examples:"); // Label for the tabs

            // Use horizontal_wrapped to lay out tabs, wrapping them onto new lines.
            ui.horizontal_wrapped(|ui| {
                // One selectable label ("tab") per example.
                for i in 0..examples.len() {
                    let is_selected = selected_tab_index == i;
                    let tab_name = format!("{}", i + 1); // Simple number for the tab

                    // Create the selectable label (acting as a tab).
                    let resp = ui
                        .selectable_label(is_selected, tab_name)
                        // Show the first line of the SQL query as hover text.
                        .on_hover_text(
                            examples
                                .get(i) // Safely get the example string
                                .and_then(|s| s.lines().next()) // Get the first line
                                .unwrap_or(""), // Default to empty string if error/empty
                        );

                    // 4. Handle tab click (only when switching to a different tab).
                    if resp.clicked() && !is_selected {
                        selected_tab_index = i; // Update the selected index

                        // Copy the clicked example into the query editor; the change
                        // is detected by `render_query` comparing before/after state.
                        if let Some(example_query) = examples.get(i) {
                            self.query = example_query.clone(); // Set editor text
                            tracing::debug!(
                                "Switched SQL Query tab to Example {}, query text updated.",
                                i + 1
                            );
                        }

                        // Store the newly selected index back into egui's memory.
                        ui.memory_mut(|mem| mem.data.insert_persisted(tab_id, selected_tab_index));
                    }
                }
            }); // End horizontal_wrapped

            ui.separator(); // Separator between tabs and editor

            // 5. Render the ACTIVE query editor below the tabs.
            ui.add(
                TextEdit::multiline(&mut self.query)
                    .desired_width(f32::INFINITY) // Take full available width
                    .desired_rows(6) // Set preferred number of visible lines
                    .font(egui::TextStyle::Monospace), // Use a monospace font for SQL
            )
            .on_hover_text(
                "Enter SQL query (Polars SQL).\n\
                Click Example tabs above.\n\
                Changes trigger reload on Apply/focus change.",
            );
        }); // End vertical group
        ui.end_row(); // End the row in the parent Grid layout
    }
1040
1041 /// Renders the collapsible section displaying SQL command examples.
1042 /// Uses `sql_commands` to generate examples relevant to the current `self.schema`.
1043 fn render_sql_examples(&self, ui: &mut Ui) {
1044 CollapsingHeader::new("SQL Command Examples")
1045 .default_open(false)
1046 .show(ui, |ui| {
1047 // Tip about quoting identifiers.
1048 let quoting_tip = "Tip: Use double quotes (\") or backticks (`) around column names with spaces or special characters (e.g., \"Column Name\" or `Column Name`).";
1049 ui.label(quoting_tip);
1050
1051 // Frame around the examples.
1052 Frame::default()
1053 .stroke(Stroke::new(1.0, Color32::GRAY))
1054 .outer_margin(2.0)
1055 .inner_margin(10.0)
1056 .show(ui, |ui| {
1057 // Link to Polars SQL documentation.
1058 ui.vertical_centered(|ui| {
1059 let polars_sql_url = "https://docs.pola.rs/api/python/stable/reference/sql/index.html";
1060 ui.hyperlink_to("Polars SQL Reference", polars_sql_url).on_hover_text(polars_sql_url);
1061 });
1062 ui.separator();
1063
1064 // Generate and display SQL examples based on the current schema.
1065 // The `sql_commands` function (in `sqls.rs`) dynamically creates these.
1066 let examples = sql_commands(&self.schema);
1067 let mut ex_num = Vec::new();
1068 for (index, example) in examples.iter().enumerate() {
1069 ex_num.push(format!("Example {count}:\n{example}", count = index + 1));
1070 }
1071
1072 // Make the examples selectable for easy copying.
1073 ui.add(egui::Label::new(ex_num.join("\n\n")).selectable(true));
1074 });
1075 });
1076 }
1077}
1078
1079/// Reads a CSV file from the specified path using Polars, applying given options
1080/// and limiting the number of data rows read.
1081///
1082/// This function configures a Polars CsvReader with specific parsing and reading
1083/// options and executes the read operation directly from the file path.
1084/// It's suitable for getting the schema (when `infer_schema_length(Some(0))`) or
1085/// reading a limited number of initial data rows (`with_n_rows`).
1086///
1087/// ### Configuration Behavior:
1088/// - `has_header(true)`: Assumes the file has a header row for column names.
1089/// - `infer_schema_length(Some(0))`: Instructs Polars to infer column names from the
1090/// header row *only*, using default types (typically String), without using data
1091/// rows to guess types. If combined with `n_rows > 0`, it reads data
1092/// rows but ignores their content for type inference.
1093/// - `with_n_rows(n_rows)`: Limits the number of *data* rows parsed after the header.
1094/// - `ignore_errors(true)`: Skips rows/fields with parsing errors rather than stopping.
1095/// - `missing_is_null(true)`: Treats empty fields (`""`) as null values.
1096pub async fn read_csv_partial_from_path(
1097 delimiter: u8,
1098 n_rows: usize,
1099 path: &Path,
1100) -> PolarsViewResult<DataFrame> {
1101 tracing::debug!("Read a CSV file using Polars limited to {} rows.", n_rows,);
1102
1103 // 1. Define the CSV parsing options.
1104 let csv_parse_options = CsvParseOptions::default()
1105 .with_encoding(CsvEncoding::LossyUtf8) // Handle potentially non-strict UTF8
1106 .with_missing_is_null(true) // Treat empty fields as nulls
1107 .with_separator(delimiter); // Set the chosen delimiter
1108
1109 // 2. Define the main CSV reading options.
1110 let csv_read_options = CsvReadOptions::default()
1111 .with_parse_options(csv_parse_options) // Apply the parsing sub-options
1112 .with_has_header(true) // File has a header row
1113 .with_infer_schema_length(Some(0)) // Number of rows to use for schema inference (0 means header only)
1114 .with_ignore_errors(true) // Allow skipping rows/fields that fail to parse
1115 .with_n_rows(Some(n_rows)) // Limits the number of rows to read.
1116 .try_into_reader_with_file_path(Some(path.to_path_buf()))?;
1117
1118 // 3. Execute the blocking read operation on a separate thread
1119 let df = execute_polars_blocking(move || csv_read_options.finish()).await?;
1120
1121 tracing::debug!("Partial CSV read complete. Shape: {:?}", df.shape());
1122 Ok(df)
1123}
1124
1125/// Builds a Polars Schema specifying DataType::String overrides for columns
1126/// whose names match a given regex pattern or wildcard.
1127///
1128/// This function creates a schema intended for the `with_dtypes` option
1129/// of Polars readers (like `LazyCsvReader`), ensuring specific columns are treated
1130/// as text regardless of their inferred type.
1131fn build_dtype_override_schema(
1132 input_schema: &Arc<Schema>,
1133 regex_pattern: &str,
1134) -> PolarsViewResult<Schema> {
1135 let mut overrides_schema = Schema::default(); // Initialize the resulting schema
1136
1137 // --- Handle Wildcard Case ("*") ---
1138 // If the pattern is "*", override ALL columns to String.
1139 if regex_pattern.trim() == "*" {
1140 tracing::debug!(
1141 "Wildcard pattern '{regex_pattern}' provided. Overriding all columns to String."
1142 );
1143 return Ok(input_schema.as_ref().clone()); // Return the fully populated override schema
1144 }
1145
1146 // --- Handle Specific Regex Pattern Case ---
1147 // If it's not a wildcard, compile the regex pattern.
1148
1149 // Validate the required ^...$ format *before* compiling
1150 if !(regex_pattern.starts_with('^') && regex_pattern.ends_with('$')) {
1151 return Err(PolarsViewError::InvalidRegexPattern(
1152 regex_pattern.to_string(),
1153 ));
1154 }
1155
1156 // Attempt to compile the regex
1157 let compiled_regex = match Regex::new(regex_pattern) {
1158 Ok(re) => re,
1159 Err(e) => {
1160 // Return specific error for invalid syntax
1161 return Err(PolarsViewError::InvalidRegexSyntax {
1162 pattern: regex_pattern.to_string(),
1163 error: e.to_string(),
1164 });
1165 }
1166 };
1167
1168 // Check this compiled regex against each actual header name.
1169 for col_name in input_schema.iter_names() {
1170 if compiled_regex.is_match(col_name) {
1171 // Insert the override into the schema.
1172 overrides_schema.insert(col_name.clone(), DataType::String);
1173 }
1174 }
1175
1176 // Log the final outcome for debugging purposes.
1177 if !overrides_schema.is_empty() {
1178 tracing::debug!(
1179 override_cols = ?overrides_schema.iter_names().collect::<Vec<_>>(),
1180 "Pattern '{}' matched {} columns: ",
1181 regex_pattern,
1182 overrides_schema.len()
1183 );
1184 } else {
1185 tracing::debug!("Provided regex patterns did not match any header columns.");
1186 }
1187
1188 Ok(overrides_schema) // Return the successfully built schema (might be empty)
1189}
1190
1191/// Helper function to find a unique column name based on a base name and schema.
1192/// Appends suffixes "_1", "_2", etc., if the base name conflicts with existing column names.
1193fn resolve_unique_column_name(base_name: &str, schema: &Schema) -> PolarsResult<PlSmallStr> {
1194 // Check if the base name is available first (most common case)
1195 if schema.get(base_name).is_none() {
1196 tracing::debug!("Base name '{}' is available.", base_name);
1197 return Ok(base_name.into());
1198 }
1199
1200 // Base name conflicts, generate alternative names with suffixes.
1201 tracing::debug!(
1202 "Base name '{}' conflicts. Searching unique name.",
1203 base_name
1204 );
1205 let mut suffix_counter = 1u32;
1206 loop {
1207 let candidate_name = format!("{base_name}_{suffix_counter}");
1208
1209 if schema.get(&candidate_name).is_none() {
1210 // Found a unique name
1211 tracing::debug!("Found unique name: '{}'.", candidate_name);
1212 return Ok(candidate_name.into()); // Return the unique name
1213 }
1214
1215 // Safety check for potential overflow and limit attempts
1216 suffix_counter = suffix_counter.checked_add(1).unwrap_or(MAX_ATTEMPTS); // If overflow, go to max attempts
1217
1218 // Prevent infinite loops. Return error if a unique name cannot be found after max attempts.
1219 if suffix_counter >= MAX_ATTEMPTS {
1220 let msg = format!(
1221 "Failed to find a unique column name starting with '{base_name}' after {MAX_ATTEMPTS} attempts."
1222 );
1223 tracing::error!("{}", msg);
1224 return Err(PolarsError::ComputeError(msg.into()));
1225 }
1226 }
1227}
1228
1229/// Executes a potentially blocking Polars operation on a separate Tokio blocking thread.
1230///
1231/// Wraps the closure `op` which is expected to return a `PolarsResult<T>`,
1232/// runs it with `spawn_blocking`, awaits the result, and maps both the
1233/// `JoinError` and the inner `PolarsError` to `PolarsViewError`.
1234///
1235/// ### Arguments
1236/// * `op`: A closure that performs the blocking work and returns `PolarsResult<T>`.
1237/// It must be `Send` and have a `'static` lifetime, meaning it must
1238/// take ownership of or only use data that can be moved across threads
1239/// and lives for the duration of the program (or the task).
1240///
1241/// ### Returns
1242/// A `PolarsViewResult<T>` containing the result of the operation `T` on success,
1243/// or a mapped `PolarsViewError` if the spawned task fails (`TokioJoin`) or
1244/// the Polars operation itself fails (`Polars`).
1245async fn execute_polars_blocking<T, F>(op: F) -> PolarsViewResult<T>
1246where
1247 // F is the type of the closure
1248 F: FnOnce() -> Result<T, PolarsError> + Send + 'static, // The closure trait bounds
1249 // T is the success type returned by the closure (e.g., DataFrame)
1250 T: Debug + Send + 'static, // The success type must be Send and have static lifetime
1251 // PolarsError: Debug,
1252{
1253 // Spawn the blocking task
1254 let result_from_task = spawn_blocking(op).await; // Result<Result<T, PolarsError>, JoinError>
1255
1256 // Map JoinError to PolarsViewError::TokioJoin
1257 let polars_result = result_from_task.map_err(PolarsViewError::from)?; // Requires PolarsViewError::from(JoinError)
1258
1259 // Map PolarsError to PolarsViewError::Polars
1260 let final_result = polars_result.map_err(PolarsViewError::from)?; // Requires PolarsViewError::from(PolarsError)
1261
1262 Ok(final_result) // Return the successfully extracted value or the mapped PolarsError
1263}
1264
1265//----------------------------------------------------------------------------//
1266// Tests //
1267//----------------------------------------------------------------------------//
1268
/// Run tests with:
/// cargo test -- --show-output tests_override_columns
#[cfg(test)]
mod tests_override_columns {
    use super::*;
    use std::{fs::File, io::Write};
    use tempfile::NamedTempFile;

    // --- Test Setup Helper ---
    /// Writes `content` to a temporary CSV file and builds a `DataFilter`
    /// pointing at it. The returned `NamedTempFile` handle must be kept alive
    /// for the duration of the test, or the file is deleted.
    fn setup_test_csv(
        content: &str, // CSV content as string
        delimiter: char,
        force_string_patterns: Option<String>, // Regex Columns to configure for override
    ) -> PolarsViewResult<(NamedTempFile, DataFilter)> {
        let temp_file = NamedTempFile::new()?;
        let file_path = temp_file.path().to_path_buf();

        // Write content to the temp file
        let mut file = File::create(&file_path)?;
        file.write_all(content.as_bytes())?;
        file.flush()?; // Ensure data is written

        // Create DataFilter using struct update syntax (Clippy Fix)
        let filter = DataFilter {
            absolute_path: file_path, // Set specific value
            force_string_patterns, // Set specific value
            csv_delimiter: delimiter.to_string(), // Set specific value
            ..Default::default() // Fill the rest with defaults
        };

        Ok((temp_file, filter))
    }

    // --- Test Case 1: Override applied correctly ---
    #[tokio::test] // Requires tokio features in dev-dependencies
    async fn test_csv_read_with_override_success() -> PolarsViewResult<()> {
        println!("\n--- Test: Override Applied Successfully ---");
        // 1. Define CSV Content with large numbers AS TEXT
        let csv_content = "\
long_id;value;text
12345678901234567890123456789012345678901234;10.5;abc
98765432109876543210987654321098765432109876;20.0;def
12345;30.7;ghi";
        // No need for df_input - the csv_content is the direct input representation
        println!("Input CSV Content:\n{csv_content}\n");

        // 2. Define Expected Output DataFrame (long_id is String)
        let df_expected = df!(
            "long_id" => &[
                "12345678901234567890123456789012345678901234",
                "98765432109876543210987654321098765432109876",
                "12345"
            ],
            "value" => &[10.5, 20.0, 30.7],
            "text" => &["abc", "def", "ghi"]
        )
        .expect("Failed to create expected DataFrame");
        println!("Expected DF (After Read):\n{df_expected}");

        // 3. Setup: Use helper to create CSV and Filter WITH the override
        let delimiter = ';';
        let col_regex = "^long_id$".to_string();
        let (_temp_file, filter) = // Keep _temp_file handle!
            setup_test_csv(csv_content, delimiter, Some(col_regex))?;

        let schema = filter
            .attempt_csv_parse_structure(delimiter as u8, false)
            .await?;
        println!("schema: {schema:#?}");

        // 4. Execute the function under test
        let lazyframe = filter.attempt_read_csv(delimiter as u8, &schema).await?;
        println!("get lazyframe");

        // Execute the lazy plan and collect into an eager DataFrame.
        // Collect is blocking, so route it through the shared helper.
        let df_output =
            execute_polars_blocking(move || lazyframe.with_new_streaming(true).collect()).await?;

        println!("Output DF (Actual Read):\n{df_output}");

        // 5. Assertions
        assert_eq!(
            df_output.schema().get("long_id"),
            Some(&DataType::String),
            "Schema Check Failed: 'long_id' should be DataType::String"
        );
        assert_eq!(
            df_output.schema().get("value"),
            Some(&DataType::Float64),
            "Schema Check Failed: 'value' should be DataType::Float64"
        );
        assert_eq!(
            df_output.schema().get("text"),
            Some(&DataType::String),
            "Schema Check Failed: 'text' should be DataType::String"
        );
        assert_eq!(
            df_output, df_expected,
            "Content Check Failed: Output DF does not match expected DF"
        );

        Ok(())
    }

    // --- Test Case 2: Override *not* applied (expect nulls) ---
    #[tokio::test] // Requires tokio features in dev-dependencies
    async fn test_csv_read_without_override_yields_nulls() -> PolarsViewResult<()> {
        println!("\n--- Test: No Override Applied (Expect Nulls) ---");
        // 1. Define CSV Content (same large numbers AS TEXT)
        let csv_content = "\
long_id;value;text
12345678901234567890123456789012345678901234;10.5;abc
98765432109876543210987654321098765432109876;20.0;def";
        println!("Input CSV Content:\n{csv_content}\n");

        // 2. Define Expected Output Pattern (long_id should be all nulls)
        let df_expected_pattern = df!(
            "long_id" => Series::new_null("long_id".into(), 2).cast(&DataType::Int64)?, // Series of 2 nulls
            "value" => &[10.5, 20.0],
            "text" => &["abc", "def"]
        )
        .expect("Failed to create expected pattern DataFrame");
        println!("Expected DF Pattern (After Read, note long_id nulls):\n{df_expected_pattern}");

        // 3. Setup: Use helper with a regex that matches NO column name,
        // so no String override is applied to 'long_id'.
        let delimiter = ';';
        let col_regex = "^Col Name$".to_string();
        let (_temp_file, filter) = setup_test_csv(csv_content, delimiter, Some(col_regex))?;

        let schema = filter
            .attempt_csv_parse_structure(delimiter as u8, false)
            .await?;
        println!("schema: {schema:#?}");

        // 4. Execute the function under test
        let lazyframe = filter.attempt_read_csv(delimiter as u8, &schema).await?;
        println!("get lazyframe");

        // Execute the lazy plan and collect into an eager DataFrame.
        // Use the shared `execute_polars_blocking` helper (consistent with test 1)
        // instead of hand-rolling spawn_blocking + double map_err.
        let df_output =
            execute_polars_blocking(move || lazyframe.with_new_streaming(true).collect()).await?;

        println!("Output DF (Actual Read):\n{df_output}");

        // 5. Assertions
        let long_id_col = df_output.column("long_id")?;
        assert!(
            long_id_col.is_null().all(), // Verify ALL values are null
            "Content Check Failed: 'long_id' column should be all nulls without override. Type: {:?}, Null count: {}",
            long_id_col.dtype(),
            long_id_col.null_count()
        );
        // Verify other columns match the pattern
        assert_eq!(
            df_output.column("value")?,
            df_expected_pattern.column("value")?
        );
        assert_eq!(
            df_output.column("text")?,
            df_expected_pattern.column("text")?
        );

        Ok(())
    }
}