polars_view/data_filter.rs
1use crate::{
2 Arguments, DEFAULT_OVERRIDE_REGEX, DEFAULT_QUERY, FileExtension, PathExtension,
3 PolarsViewError, PolarsViewResult, UniqueElements, sql_commands,
4};
5use egui::{
6 Align, CollapsingHeader, Color32, DragValue, Frame, Grid, Layout, Stroke, TextEdit, Ui, Vec2,
7};
8use polars::{io::RowIndex, prelude::*};
9use regex::Regex;
10use tokio::task::spawn_blocking;
11
12use std::{
13 fmt::Debug,
14 fs::File,
15 num::NonZero,
16 path::{Path, PathBuf},
17 sync::Arc,
18};
19
// --- Constants ---

/// Comma-separated list of values treated as null/missing during CSV parsing.
/// The raw string literal (`r#""#`) avoids escaping the embedded double quotes;
/// the default set is the empty quoted string (`""`) and `<N/D>`.
pub static NULL_VALUES: &str = r#""", <N/D>"#;

/// Default delimiter used for CSV parsing if not specified or detected.
/// Using `&'static str` for common, immutable delimiters saves memory allocation.
pub static DEFAULT_CSV_DELIMITER: &str = ";";

/// Default name for the row number column if added.
pub const DEFAULT_INDEX_COLUMN_NAME: &str = "Row Number";

/// Default regex used to select string columns for normalization
/// (matches column names starting with "Val").
const DEFAULT_NORM_REGEX: &str = "^Val.*$";

/// Default regex used to select columns to drop
/// (matches column names starting with "Temp").
const DEFAULT_DROP_REGEX: &str = "^Temp.*$";

/// Default starting offset for the row index column (e.g., 1 for 1-based).
const DEFAULT_INDEX_COLUMN_OFFSET: u32 = 1;

/// Default number of rows scanned for schema inference (CSV, JSON, NDJson).
const DEFAULT_INFER_SCHEMA_ROWS: usize = 200;

/// Upper bound on retry attempts, to prevent potential infinite loops
/// (e.g., when a schema keeps changing between attempts).
pub const MAX_ATTEMPTS: u32 = 1000;
46
47// --- DataFilter Struct ---
48
/// Holds configuration parameters related to **loading and querying** data.
///
/// This struct focuses on settings that define how data is initially read from a file
/// and transformed via SQL queries or basic processing like null column removal.
///
/// Instances are created from `Arguments`, updated by the UI in `render_query`, and passed
/// to `DataFrameContainer::load_data`. Changes here typically trigger a data reload/requery.
#[derive(Debug, Clone, PartialEq)] // PartialEq allows simple change detection (before/after UI render)
pub struct DataFilter {
    /// The canonical, absolute path to the data file.
    pub absolute_path: PathBuf,
    /// The name assigned to the loaded DataFrame for use in SQL queries.
    pub table_name: String,
    /// The character used to separate columns in a CSV file.
    /// May be overwritten by `get_df_and_extension` when delimiter auto-detection succeeds.
    pub csv_delimiter: String,
    /// When true, the next load should re-read the data from the file
    /// (set by `render_query` when delimiter or inference length change).
    pub read_data_from_file: bool,
    /// The schema (column names and data types) of the most recently loaded DataFrame.
    /// Used by `sql_commands` for generating relevant examples.
    pub schema: Arc<Schema>,
    /// Maximum rows to scan for schema inference (CSV, JSON, NDJson).
    pub infer_schema_rows: usize,
    /// Flag to control removal of all-null columns after loading/querying.
    pub exclude_null_cols: bool,
    /// Comma-separated string of values to interpret as nulls during CSV parsing.
    /// Parsed by `parse_null_values`.
    pub null_values: String,

    /// Regex patterns matching columns to force read as String type.
    ///
    /// List of column names to force reading as String, overriding inference.
    /// Useful for columns with large IDs/keys that look numeric.
    pub force_string_patterns: Option<String>,

    /// Flag indicating if the `query` should be executed during the next `load_data`.
    /// Set by `render_query` if relevant UI fields change or the Apply button is clicked.
    pub apply_sql: bool,
    /// The SQL query string entered by the user.
    pub query: String,

    // --- Index Column ---
    /// Flag indicating if a row index column should be added.
    pub add_row_index: bool,
    /// The desired name for the row index column (checked for uniqueness in `get_row_index`).
    pub index_column_name: String,
    /// The starting value for the row index column (e.g., 0 or 1).
    pub index_column_offset: u32,

    // --- Normalize Columns ---
    /// Flag indicating whether string columns will be normalized.
    pub normalize: bool,
    /// Regex pattern to select string columns for normalization.
    pub normalize_regex: String,

    // --- Drop Columns ---
    /// Flag indicating whether columns matching `drop_regex` should be removed.
    pub drop: bool,
    /// Regex pattern selecting column names to drop.
    pub drop_regex: String,
}
107
108impl Default for DataFilter {
109 /// Creates default `DataFilter` with sensible initial values.
110 fn default() -> Self {
111 DataFilter {
112 absolute_path: PathBuf::new(),
113 table_name: "AllData".to_string(),
114 csv_delimiter: DEFAULT_CSV_DELIMITER.to_string(),
115 read_data_from_file: true,
116 schema: Schema::default().into(),
117 infer_schema_rows: DEFAULT_INFER_SCHEMA_ROWS,
118 exclude_null_cols: false,
119 null_values: NULL_VALUES.to_string(),
120
121 force_string_patterns: DEFAULT_OVERRIDE_REGEX.map(ToString::to_string),
122
123 apply_sql: false,
124 query: DEFAULT_QUERY.to_string(),
125
126 // --- NEW DEFAULTS ---
127 add_row_index: false, // Default to false
128 index_column_name: DEFAULT_INDEX_COLUMN_NAME.to_string(),
129 index_column_offset: DEFAULT_INDEX_COLUMN_OFFSET,
130 // --- END NEW DEFAULTS ---
131
132 // --- NEW FIELDS for Normalize Columns ---
133 normalize: false,
134 normalize_regex: DEFAULT_NORM_REGEX.to_string(),
135 // --- END NEW FIELDS ---
136 drop: false,
137 drop_regex: DEFAULT_DROP_REGEX.to_string(),
138 }
139 }
140}
141
142// --- Methods ---
143
144impl DataFilter {
145 /// Creates a new `DataFilter` instance configured from command-line `Arguments`.
146 /// This is typically called once at application startup in `main.rs`.
147 ///
148 /// ### Arguments
149 /// * `args`: Parsed command-line arguments (`crate::Arguments`).
150 ///
151 /// ### Returns
152 /// A `PolarsViewResult` containing the configured `DataFilter` or an error
153 /// (e.g., if the path cannot be canonicalized).
154 pub fn new(args: &Arguments) -> PolarsViewResult<Self> {
155 // Ensure the path exists and get its absolute, canonical form.
156 let absolute_path = args.path.canonicalize()?;
157
158 // Determine apply_sql state from the CLI argument
159 let apply_sql = args.query.is_some();
160 let query = args
161 .query
162 .clone()
163 .unwrap_or_else(|| DEFAULT_QUERY.to_string()); // Use CLI arg or default
164
165 // Determine normalization state from the CLI argument
166 let normalize = args.regex.is_some();
167 let normalize_regex = args
168 .regex
169 .clone()
170 .unwrap_or_else(|| DEFAULT_NORM_REGEX.to_string()); // Use CLI arg or default
171
172 // Use or_else: takes a closure executed only if the first option is None.
173 // This avoids the .to_string() for the default unless actually needed.
174 let force_string_patterns = args
175 .force_string_patterns // This is Option<String>
176 .clone() // Clone the Option<String> from args if needed later, otherwise maybe take ownership
177 .or(DEFAULT_OVERRIDE_REGEX.map(ToString::to_string)); // Use CLI arg or default
178
179 Ok(DataFilter {
180 absolute_path,
181 table_name: args.table_name.clone(),
182 csv_delimiter: args.delimiter.clone(),
183
184 apply_sql, // Directly set based on CLI argument presence
185 query, // Directly set based on CLI argument value (or default)
186
187 exclude_null_cols: args.exclude_null_cols,
188 null_values: args.null_values.clone(), // Use user-provided nulls.
189
190 force_string_patterns,
191
192 normalize, // Directly set based on CLI argument presence
193 normalize_regex, // Directly set based on CLI argument value (or default)
194 ..Default::default() // Use defaults for `schema`, `infer_schema_rows`.
195 })
196 }
197
198 /// Sets the data source path, canonicalizing it.
199 pub fn set_path(&mut self, path: &Path) -> PolarsViewResult<()> {
200 self.absolute_path = path.canonicalize()?;
201 tracing::debug!("absolute_path set to: {:#?}", self.absolute_path);
202 Ok(())
203 }
204
205 /// Gets the file extension from `absolute_path` in lowercase.
206 pub fn get_extension(&self) -> Option<String> {
207 self.absolute_path.extension_as_lowercase()
208 }
209
210 /// Determines the configuration for an optional row index column by resolving a unique name
211 /// against the provided schema.
212 ///
213 /// If `self.add_row_index` is true, this method finds a unique name based on
214 /// `self.index_column_name` and the provided `schema`, returning a `Some(RowIndex)`.
215 /// If the name resolution fails, it returns the specific PolarsError.
216 /// If `self.add_row_index` is false, it returns `Ok(None)`.
217 ///
218 /// ### Arguments
219 /// * `schema`: The schema against which the index column name should be checked for uniqueness.
220 /// This should be the schema of the DataFrame *before* adding the index column.
221 ///
222 /// ### Returns
223 /// `PolarsResult<Option<RowIndex>>`: Ok(Some) if config is resolved, Ok(None) if disabled, Err if resolution fails.
224 pub fn get_row_index(&self, schema: &Schema) -> PolarsResult<Option<RowIndex>> {
225 // Check the main feature flag
226 if !self.add_row_index {
227 tracing::trace!("Row index addition disabled in filter.");
228 return Ok(None); // Feature disabled, return None config
229 }
230
231 // Feature is enabled. Resolve a unique name using the helper.
232 let unique_name = resolve_unique_column_name(
233 // Use the helper function
234 &self.index_column_name, // Base name
235 schema, // Schema to check uniqueness against
236 )?; // Propagate potential error from unique name resolution
237
238 // If we successfully resolved a unique name, return the full RowIndex config
239 let index_offset = self.index_column_offset;
240 Ok(Some(RowIndex {
241 name: unique_name,
242 offset: index_offset,
243 }))
244 }
245
246 /// Determines the `FileExtension` and orchestrates loading the DataFrame using the appropriate Polars reader.
247 /// This method centralizes the file-type-specific loading logic. Called by `DataFrameContainer::load_data`.
248 ///
249 /// **Important:** It mutates `self` by potentially updating `csv_delimiter` if automatic
250 /// detection during `read_csv_data` finds a different working delimiter than initially configured.
251 ///
252 /// ### Returns
253 /// A `PolarsViewResult` containing a tuple: `(DataFrame, FileExtension)` on success,
254 /// or a `PolarsViewError` (e.g., `FileType`, `CsvParsing`) on failure.
255 pub async fn get_df_and_extension(&mut self) -> PolarsViewResult<(DataFrame, FileExtension)> {
256 // Determine the file extension type using the helper from `extension.rs`.
257 let extension = FileExtension::from_path(&self.absolute_path);
258
259 // Match on the determined extension to call the correct reader function.
260 let (df, detected_delimiter) = match &extension {
261 FileExtension::Csv => self.read_csv_data().await?,
262 FileExtension::Json => self.read_json_data().await?,
263 FileExtension::NDJson => self.read_ndjson_data().await?,
264 FileExtension::Parquet => self.read_parquet_data().await?,
265 // Handle unsupported or missing extensions with specific errors.
266 FileExtension::Unknown(ext) => {
267 return Err(PolarsViewError::FileType(format!(
268 "Unsupported extension: `{}` for file: `{}`",
269 ext,
270 self.absolute_path.display()
271 )));
272 }
273 FileExtension::Missing => {
274 return Err(PolarsViewError::FileType(format!(
275 "Missing extension for file: `{}`",
276 self.absolute_path.display()
277 )));
278 }
279 };
280
281 // If reading a CSV successfully detected a working delimiter, update the filters state.
282 // This ensures the UI reflects the delimiter actually used.
283 if let Some(byte) = detected_delimiter {
284 self.csv_delimiter = (byte as char).to_string();
285 }
286
287 tracing::debug!(
288 "fn get_df_and_extension(): Successfully loaded DataFrame with extension: {:?}",
289 extension
290 );
291
292 Ok((df, extension)) // Return the loaded DataFrame and the detected extension.
293 }
294
295 // --- Data Reading Helper Methods ---
296
297 /// Reads a standard JSON file into a Polars DataFrame.
298 /// Configures the reader using settings from `self` (e.g., `infer_schema_rows`).
299 ///
300 /// ### Returns
301 /// A `PolarsViewResult` containing `(DataFrame, None)` (delimiter is not applicable to JSON).
302 async fn read_json_data(&self) -> PolarsViewResult<(DataFrame, Option<u8>)> {
303 tracing::debug!("Reading JSON data from: {}", self.absolute_path.display());
304 let file = File::open(&self.absolute_path)?;
305 let infer_schema_rows_for_task = self.infer_schema_rows;
306
307 // Execute the blocking read operation on a separate thread
308 let df = execute_polars_blocking(move || {
309 JsonReader::new(file)
310 .infer_schema_len(NonZero::new(infer_schema_rows_for_task))
311 .finish()
312 })
313 .await?;
314
315 tracing::debug!("JSON read complete. Shape: {:?}", df.shape());
316
317 Ok((df, None))
318 }
319
320 /// Reads a Newline-Delimited JSON (NDJson / JSON Lines) file into a Polars DataFrame.
321 /// Uses `LazyJsonLineReader` for potentially better performance/memory usage on large files.
322 ///
323 /// ### Returns
324 /// A `PolarsViewResult` containing `(DataFrame, None)`.
325 async fn read_ndjson_data(&self) -> PolarsViewResult<(DataFrame, Option<u8>)> {
326 tracing::debug!("Reading NDJSON data from: {}", self.absolute_path.display());
327
328 // Clone data from self needed for the task closure.
329 let path_buf_for_task = PlPath::Local(self.absolute_path.clone().into());
330 let infer_schema_rows_for_task = self.infer_schema_rows;
331
332 // *** Use the helper function ***
333 let df = execute_polars_blocking(move || {
334 // 'move' captures path_buf_for_task, infer_schema_rows_for_task
335 // This code runs on the blocking thread.
336 let lazyframe = LazyJsonLineReader::new(path_buf_for_task) // Use cloned path
337 .low_memory(false) // Option to optimize for memory.
338 .with_infer_schema_length(NonZero::new(infer_schema_rows_for_task))
339 .with_ignore_errors(true)
340 .finish()?; // Returns PolarsResult<LazyFrame> (this finish() isn't the main blocking part)
341
342 // Collect the lazy frame - THIS IS THE BLOCKING PART
343 lazyframe.with_new_streaming(true).collect() // Returns PolarsResult<DataFrame>
344 })
345 .await?; // await the helper function
346
347 tracing::debug!("NDJSON read complete. Shape: {:?}", df.shape());
348 Ok((df, None))
349 }
350
351 /// Reads an Apache Parquet file into a Polars DataFrame.
352 ///
353 /// ### Returns
354 /// A `PolarsViewResult` containing `(DataFrame, None)`.
355 async fn read_parquet_data(&self) -> PolarsViewResult<(DataFrame, Option<u8>)> {
356 tracing::debug!(
357 "Reading Parquet data from: {}",
358 self.absolute_path.display()
359 );
360
361 // Clone data from self needed for the task closure.
362 let path_buf_for_task = PlPath::Local(self.absolute_path.clone().into());
363 let args = ScanArgsParquet {
364 // ScanArgsParquet should be Send
365 low_memory: false, // Configure scan arguments as needed.
366 ..Default::default()
367 };
368
369 let df = execute_polars_blocking(move || {
370 // Use `LazyFrame::scan_parquet` for efficient scanning.
371 let lazyframe = LazyFrame::scan_parquet(path_buf_for_task, args)?; // Returns PolarsResult<LazyFrame>
372
373 // Collect into an eager DataFrame - THIS IS THE BLOCKING/COMPUTE PART.
374 lazyframe.with_new_streaming(true).collect() // Returns PolarsResult<DataFrame>
375 })
376 .await?; // await the helper function
377
378 tracing::debug!("Parquet read complete. Shape: {:?}", df.shape());
379
380 Ok((df, None))
381 }
382
383 /// Reads a CSV file, attempting automatic delimiter detection if the initial one fails.
384 /// Iterates through common delimiters and tries reading a small chunk first for efficiency.
385 ///
386 /// ### Returns
387 /// A `PolarsViewResult` containing `(DataFrame, Option<u8>)` where `Option<u8>` is the
388 /// *successfully used* delimiter byte. Returns `Err(PolarsViewError::CsvParsing)` if
389 /// no common delimiter works.
390 async fn read_csv_data(&self) -> PolarsViewResult<(DataFrame, Option<u8>)> {
391 // Get the currently configured separator byte. Error if invalid (e.g., empty string).
392 let initial_separator = self.get_csv_separator()?;
393
394 // List of common delimiters to try, starting with the configured one.
395 let mut delimiters_to_try = vec![initial_separator, b',', b';', b'|', b'\t', b':'];
396 // Remove duplicates if the initial separator is already in the common list.
397 delimiters_to_try.unique();
398 tracing::debug!(
399 "Attempting CSV read. Delimiters to try: {:?}",
400 delimiters_to_try
401 .iter()
402 .map(|&b| b as char)
403 .collect::<Vec<_>>()
404 );
405
406 // Look at the next element of the iterator without consuming it.
407 let mut iterator = delimiters_to_try.iter().peekable();
408
409 // Iterate through the potential delimiters.
410 while let Some(&delimiter) = iterator.next() {
411 // If peek() returns None, it means the current item was the last one
412 let is_last_element = iterator.peek().is_none();
413
414 // 1. Quick Check: Try reading only a small number of rows (NROWS_CHECK).
415 // This fails fast if the delimiter is fundamentally wrong (e.g., results in 1 column).
416 if let Ok(schema) = self
417 .attempt_csv_parse_structure(delimiter, is_last_element)
418 .await
419 {
420 // 2. Full Read: If the quick check passed, attempt to read the entire file.
421 tracing::debug!(
422 "Trying to read full CSV file with delimiter: '{}'",
423 delimiter as char
424 );
425 match self.attempt_read_csv(delimiter, &schema).await {
426 Ok(lazyframe) => {
427 // Success! Return the DataFrame and the delimiter that worked.
428 tracing::info!(
429 "Successfully read CSV with delimiter: '{}'",
430 delimiter as char
431 );
432
433 // Execute the lazy plan and collect into an eager DataFrame on a blocking thread
434 let df = execute_polars_blocking(move || {
435 lazyframe.with_new_streaming(true).collect()
436 })
437 .await?;
438
439 tracing::debug!("Data collection complete. Shape: {:?}", df.shape());
440 return Ok((df, Some(delimiter)));
441 }
442 Err(e) => {
443 // Full read failed even after quick check passed. Log and try next delimiter.
444 tracing::warn!(
445 "Full CSV read failed with delimiter '{}' after quick check passed: {}",
446 delimiter as char,
447 e
448 );
449 continue; // Try the next delimiter.
450 }
451 }
452 }
453 // If quick check fails, implicitly try the next delimiter.
454 }
455
456 // If all delimiters failed, return a parsing error.
457 let msg = format!(
458 "Failed to read CSV '{}' with common delimiters. Check format or specify delimiter.",
459 self.absolute_path.display()
460 );
461 let error = PolarsViewError::CsvParsing(msg);
462 tracing::error!("{}", error);
463 Err(error)
464 }
465
466 /// Retrieves the CSV separator byte from the `csv_delimiter` String configuration.
467 ///
468 /// ### Returns
469 /// `Ok(u8)` containing the first byte, or `Err(PolarsViewError::InvalidDelimiter)`
470 /// if the string is empty or contains multi-byte characters (only first byte is used).
471 fn get_csv_separator(&self) -> PolarsViewResult<u8> {
472 self.csv_delimiter
473 .as_bytes() // Convert String to byte slice.
474 .first() // Get the first byte.
475 .copied() // Copy the byte out of the Option<&u8>.
476 // Map `None` (empty string) to an InvalidDelimiter error.
477 .ok_or_else(|| PolarsViewError::InvalidDelimiter(self.csv_delimiter.clone()))
478 }
479
    /// Attempts to parse the CSV structure from the initial chunk of the file
    /// using a specific delimiter, and validates the resulting column count.
    ///
    /// ### Arguments
    /// * `delimiter`: candidate separator byte to probe with.
    /// * `is_last_element`: when true (last candidate in the detection loop),
    ///   the width validation is skipped so the best-effort result is accepted.
    ///
    /// ### Returns
    /// The schema of the probed chunk on success; `PolarsViewError::CsvParsing`
    /// when the width suggests the delimiter is wrong.
    async fn attempt_csv_parse_structure(
        &self,
        delimiter: u8,
        is_last_element: bool,
    ) -> PolarsViewResult<Arc<Schema>> {
        // Number of data rows to read after the header for this quick probe.
        // 100 is a heuristic: enough context without reading the whole file.
        const ROW_LIMIT: usize = 100;

        tracing::debug!(
            "Trying to parse CSV with delimiter: '{}'",
            delimiter as char,
        );

        let file_path = &self.absolute_path;

        // Partial read of the file with the candidate delimiter.
        let data_frame = read_csv_partial_from_path(delimiter, ROW_LIMIT, file_path).await?;

        // Width validation: a wrong delimiter typically collapses everything
        // into a single column. This check drives the detection loop in
        // `read_csv_data`.
        // NOTE(review): the threshold is raised to 2 when `add_row_index` is set,
        // yet the index column is added *after* loading — confirm this is intended.
        let min_expected_cols_on_success = if self.add_row_index { 2 } else { 1 }; // Assumes index is added *later*

        if data_frame.width() <= min_expected_cols_on_success && !is_last_element {
            tracing::warn!(
                "CSV read with delimiter '{}' resulted in {} columns (expected > {}). Assuming incorrect delimiter.",
                delimiter as char,
                data_frame.width(),
                min_expected_cols_on_success
            );
            // Signal a likely delimiter mismatch so the caller tries the next candidate.
            return Err(PolarsViewError::CsvParsing(format!(
                "Delimiter '{}' likely incorrect (resulted in {} columns)",
                delimiter as char,
                data_frame.width()
            )));
        }

        tracing::debug!(
            "CSV read successful with delimiter '{}'. Final shape (rows, columns): {:?}",
            delimiter as char,
            data_frame.shape()
        );

        Ok(data_frame.schema().clone())
    }
530
    /// Configures and finalizes a `LazyCsvReader` for the full-file read using
    /// the given delimiter and the schema obtained from the structural probe.
    ///
    /// ### Arguments
    /// * `delimiter`: separator byte that passed the structural probe.
    /// * `previous_scheme`: schema from the probe, used to build dtype overrides.
    ///
    /// ### Returns
    /// A configured `LazyFrame` (not yet collected), or an error from regex
    /// compilation / reader configuration.
    async fn attempt_read_csv(
        &self,
        delimiter: u8,
        previous_scheme: &Arc<Schema>,
    ) -> PolarsViewResult<LazyFrame> {
        tracing::debug!(
            "Attempting CSV read with delimiter: '{}'",
            delimiter as char,
        );

        /*
        Why dtype overrides are needed:

        When LazyCsvReader infers data types (with_infer_schema_length()), it looks at the first N rows.
        If a problematic column contains only digits in those initial rows,
        Polars will likely infer a numeric type (like Int64, UInt64, or Float64).

        However, a standard 64-bit integer or float cannot represent an arbitrarily long sequence of digits (like a 44-digit key).
        When the reader later encounters these huge numbers and tries to parse them into the inferred numeric type, the parsing fails.
        With with_ignore_errors(true), instead of stopping, Polars replaces the unparseable value with null.
        If all values in that column exceed the capacity of the inferred numeric type, the entire column becomes null, appearing "empty".

        Solution: force matching columns to String via .with_dtype_overwrite(dtypes_opt).
        */

        let mut dtypes_opt: Option<Arc<Schema>> = None;

        if let Some(force_string_patterns) = &self.force_string_patterns {
            // Build dtype overrides from the probed headers and the configured
            // regex patterns; regex compilation errors propagate to the caller.
            let override_schema = build_dtype_override_schema(
                previous_scheme,
                force_string_patterns,
            )?;

            // Only install overrides when at least one column matched.
            if !override_schema.is_empty() {
                dtypes_opt = Some(Arc::new(override_schema));
            };
        }

        let plpath = PlPath::Local(self.absolute_path.clone().into());

        // Configure the LazyCsvReader from `self`. Null replacement and row-index
        // addition are deliberately deferred to later processing steps
        // (see replace_values_with_null() / add_row_index_column()).
        let lazyframe = LazyCsvReader::new(plpath)
            .with_low_memory(false) // Can be set to true for lower memory usage at cost of speed.
            .with_encoding(CsvEncoding::LossyUtf8) // Gracefully handle potential encoding errors.
            .with_has_header(true) // Assume a header row.
            .with_try_parse_dates(true) // Attempt automatic date parsing.
            .with_separator(delimiter) // Use the specified delimiter.
            .with_infer_schema_length(Some(self.infer_schema_rows)) // Use filter setting for inference.
            .with_dtype_overwrite(dtypes_opt) // Force-String overrides built above.
            .with_ignore_errors(true) // Rows with parsing errors become nulls instead of stopping the read.
            .with_missing_is_null(true) // Treat missing fields as null.
            .with_null_values(None) // Nulls handled later by fn replace_values_with_null()
            .with_n_rows(None) // No row limit for the full read.
            .with_decimal_comma(false) // Files using ',' as decimal separator are normalized later.
            .with_row_index(None) // Index added later by fn add_row_index_column()
            .with_rechunk(true) // Rechunk the memory to contiguous chunks when parsing is done.
            .finish()?; // Finalize configuration and create the LazyFrame.

        Ok(lazyframe)
    }
593
594 /// Parses the comma-separated `null_values` string into a `Vec<&str>`,
595 /// removing surrounding double quotes if present.
596 ///
597 /// Logic:
598 /// 1. Splits the input string (`self.null_values`) by commas.
599 /// 2. Iterates through each resulting substring (`s`).
600 /// 3. For each substring:
601 /// a. Trims leading and trailing whitespace.
602 /// b. Checks if the `trimmed` string has at least 2 characters AND starts with `"` AND ends with `"`.
603 /// c. If true, returns a slice (`&str`) representing the content *between* the quotes.
604 /// Example: `"\"\""` becomes `""`, `" N/A "` becomes `"N/A"`, `" " "` becomes `" "`.
605 /// d. If false (no surrounding quotes), returns a slice (`&str`) of the `trimmed` string itself.
606 /// Example: `<N/D>` remains `<N/D>`, ` NA ` becomes `NA`.
607 /// 4. Collects all the resulting string slices into a `Vec<&str>`.
608 ///
609 /// Example Input: `"\"\", \" \", <N/D>, NA "`
610 /// Example Output: `vec!["", " ", "<N/D>", "NA"]`
611 pub fn parse_null_values(&self) -> Vec<&str> {
612 self.null_values
613 .split(',') // 1. Split the string by commas.
614 .map(|s| {
615 // For each part resulting from the split:
616 // 3a. Trim leading/trailing whitespace.
617 let trimmed = s.trim();
618 // 3b. Check if it's quoted (length >= 2, starts/ends with ").
619 if trimmed.len() >= 2 && trimmed.starts_with('"') && trimmed.ends_with('"') {
620 // 3c. If quoted, return the slice between the quotes.
621 trimmed[1..trimmed.len() - 1].trim()
622 } else {
623 // 3d. If not quoted, return the trimmed slice directly.
624 trimmed
625 }
626 })
627 .collect() // 4. Collect the processed slices into a vector.
628 }
629
630 // --- UI Rendering Methods ---
631
    /// Renders the UI widgets for configuring data filters within the "Query" collapsing header.
    /// This function is called by `layout.rs::render_side_panel`.
    ///
    /// **Crucially, it takes `&mut self`. Widgets modify `self` directly.**
    /// It compares the state of `self` *before* and *after* rendering the widgets.
    /// If any change occurred (user typed in a field, clicked a checkbox), `self.apply_sql`
    /// is set so the SQL query is re-applied upon reload.
    ///
    /// The `layout.rs` code uses the return value:
    /// - `Some(new_filters)` triggers an asynchronous `DataFrameContainer::load_data` call.
    /// - `None` means no action is taken this frame.
    ///
    /// Note: `Some` is returned only when the "Apply SQL commands" button is clicked
    /// while `apply_sql` is set — merely editing a field marks the state dirty but
    /// does not return `Some` by itself.
    ///
    /// ### Arguments
    /// * `ui`: The `egui::Ui` context for drawing the widgets.
    ///
    /// ### Returns
    /// * `Some(DataFilter)`: the modified filter state, when the user applied changes.
    /// * `None`: otherwise.
    pub fn render_query(&mut self, ui: &mut Ui) -> Option<DataFilter> {
        // Snapshot the state *before* rendering so changes can be detected afterwards.
        let filters_before_render = self.clone();
        let mut result = None;

        let width_min = 450.0; // Minimum width for the grid area.

        // Two-column grid: label on the left, input widget on the right.
        let grid = Grid::new("data_query_grid")
            .num_columns(2)
            .spacing([10.0, 20.0]) // Horizontal and vertical spacing.
            .striped(true); // Alternating row backgrounds.

        // Allocate UI space for the grid.
        ui.allocate_ui_with_layout(
            Vec2::new(ui.available_width(), ui.available_height()), // Occupy available width.
            Layout::top_down(Align::LEFT),
            |ui| {
                grid.show(ui, |ui| {
                    ui.set_min_width(width_min);

                    // --- Render Individual Filter Widgets ---
                    // Each `render_*` method takes `&mut self` and mutates it in place.

                    self.render_add_row_number(ui);

                    self.render_exclude_null_cols(ui);

                    self.render_exclude_columns(ui);

                    self.render_normalize_numbers(ui);

                    self.render_null_values(ui);

                    // Schema inference length applies only to formats that infer schemas.
                    if matches!(
                        self.get_extension().as_deref(), // Extension as &str
                        Some("csv" | "json" | "ndjson") // Formats with schema inference
                    ) {
                        self.render_schema_length_input(ui);
                    }

                    // CSV-specific settings: delimiter.
                    if self.get_extension().as_deref() == Some("csv") {
                        self.render_csv_delimiter(ui);
                    }

                    // Input for table name used in SQL.
                    self.render_table_name_input(ui);

                    // Multiline input for the SQL query.
                    self.render_sql_query_input(ui);

                    // --- Change Detection & Apply Button ---

                    // Any difference vs. the pre-render snapshot marks SQL for re-application.
                    if *self != filters_before_render {
                        self.apply_sql = true;
                        tracing::debug!("Change detected in DataFilter UI.");
                    }

                    // Delimiter or inference-length changes require re-reading the file itself.
                    if (self.csv_delimiter != filters_before_render.csv_delimiter)
                        || (self.infer_schema_rows != filters_before_render.infer_schema_rows)
                    {
                        self.read_data_from_file = true;
                    }

                    // "Apply SQL commands" button: returns the edited filter state when clicked.
                    ui.label(""); // Empty left cell, for alignment.
                    ui.with_layout(Layout::top_down(Align::Center), |ui| {
                        if ui.button("Apply SQL commands").clicked() {
                            if self.apply_sql {
                                // Result contains the DataFilter after the user's edits.
                                result = Some(self.clone());
                            }

                            tracing::debug!("Apply SQL commands: {}", self.apply_sql);
                        }
                    });
                    ui.end_row();
                }); // End grid.show
            }, // End allocate_ui_with_layout
        ); // End allocation

        // Display the SQL examples section (collapsible), below the grid.
        self.render_sql_examples(ui);

        result // Return the potentially updated filters.
    }
743
744 // --- Helper Rendering Methods ---
745
746 fn render_add_row_number(&mut self, ui: &mut Ui) {
747 // --- Row 1: Feature Checkbox ---
748 ui.label("Add Row Number:");
749 // The checkbox directly modifies `self.add_row_index`
750 ui.checkbox(&mut self.add_row_index, "")
751 .on_hover_text("Add a new column that counts the rows (first column).");
752 ui.end_row();
753
754 // --- Conditional Configuration Inputs ---
755 // These rows are only added to the grid if the checkbox is checked.
756 if self.add_row_index {
757 // --- Index Name Input ---
758 // Use simple indentation in the label for visual structure
759 ui.label("\tName:");
760 let name_edit =
761 TextEdit::singleline(&mut self.index_column_name).desired_width(f32::INFINITY); // Use available width in the grid cell
762 ui.add(name_edit)
763 .on_hover_text("Name for the new index column (uniqueness checked later).");
764 ui.end_row();
765
766 // --- Index Offset Input ---
767 // Use simple indentation
768 ui.label("\tOffset:");
769 let offset_drag = DragValue::new(&mut self.index_column_offset)
770 .speed(1) // Increment by 1
771 .range(0..=u32::MAX); // Allow 0-based or 1-based commonly
772 ui.add(offset_drag)
773 .on_hover_text("Starting value for the index (e.g., 0 or 1).");
774 ui.end_row();
775 }
776 // No 'else' needed. If add_row_index is false, these rows are simply skipped.
777 }
778
779 /// Renders the checkbox for the "Remove Null Cols" option.
780 /// Modifies `self.exclude_null_cols` directly.
781 fn render_exclude_null_cols(&mut self, ui: &mut Ui) {
782 ui.label("Exclude Null Cols:");
783 ui.checkbox(&mut self.exclude_null_cols, "")
784 .on_hover_text("Remove columns containing only null values.");
785 ui.end_row();
786 }
787
    /// Renders the "Remove Columns" checkbox and, when enabled, the regex input
    /// used to select column names to drop. Mutates `self.drop` / `self.drop_regex`.
    fn render_exclude_columns(&mut self, ui: &mut Ui) {
        // Feature toggle row.
        ui.label("Remove Columns:");
        ui.checkbox(&mut self.drop, "")
            .on_hover_text("Remove columns whose names match the specified regex pattern.");
        ui.end_row();

        // The regex input row is only added to the grid while the feature is on.
        if self.drop {
            // Regex input (tab in the label gives a simple visual indent).
            ui.label("\tRegex:");
            let name_edit = TextEdit::singleline(&mut self.drop_regex).desired_width(f32::INFINITY); // Use available width in the grid cell
            ui.add(name_edit).on_hover_text(
                "Enter the regex pattern to identify columns to drop by name.\n\n\
                Format Requirements:\n\
                - Use `*` to drop ALL columns.\n\
                - Use `^YourPattern$` to match the entire column name.\n \
                (Must start with `^` and end with `$`).\n\n\
                Regex Examples:\n\
                - `^Temp.*$` (Matches columns starting with 'Temp')\n\
                - `^Value B$` (Matches the exact column named 'Value B')\n\
                - `^(ID|Key|Index)$` (Matches 'ID', 'Key', or 'Index' exactly)\n\
                - `^.*_OLD$` (Matches columns ending with '_OLD')\n\n\
                (Invalid regex syntax or format will cause errors.)",
            );
            ui.end_row();
        }
    }
818
819 fn render_normalize_numbers(&mut self, ui: &mut Ui) {
820 // --- Row 1: Feature Checkbox ---
821 ui.label("Normalize Columns:");
822 ui.checkbox(&mut self.normalize, "").on_hover_text(
823 "Normalize Euro-style number strings in selected column names (via regex) to Float64.\n\
824 Example: '1.234,56' (String) to '1234.56' (Float64).",
825 );
826 ui.end_row();
827
828 // --- Conditional Configuration Inputs ---
829 // These rows are only added to the grid if the checkbox is checked.
830 if self.normalize {
831 // --- Regex Input ---
832 // Use simple indentation in the label for visual structure
833 ui.label("\tRegex:");
834 let name_edit =
835 TextEdit::singleline(&mut self.normalize_regex).desired_width(f32::INFINITY); // Use available width in the grid cell
836 ui.add(name_edit).on_hover_text(
837 r#"
838Enter a regex pattern to select String columns by name.
839
840Rules:
841- Use '*' for ALL String columns (caution!).
842- Use '^PATTERN$' for specific names (matches entire name).
843
844Example Columns:
845Row Number, Value1, Value2, ValueA, Valor, Total, SubTotal, Last Info
846
847Example Patterns:
8481. To select 'Value1', 'Value2':
849 ^Value\d$
850
8512. To select 'Value1', 'Value2', 'ValueA':
852 ^Value.*$
853
8543. To select 'Value1', 'Value2', 'ValueA', 'Valor':
855 ^Val.*$
856
8574. To select 'Value1', 'Value2', 'ValueA', 'Valor', 'Total', 'SubTotal':
858 ^(Val|.*Total).*$
859
8605. To select only 'Last Info' (note the space):
861 ^Last Info$
862
863(Applies only to columns that Polars identifies as String type.)"#,
864 );
865 ui.end_row();
866 }
867 }
868
869 /// Renders the `TextEdit` widget for specifying custom null values.
870 /// Modifies `self.null_values` directly based on user input.
871 fn render_null_values(&mut self, ui: &mut Ui) {
872 // Null Values Input Label
873 ui.label("Null Values:");
874
875 // Single-line text edit widget bound to the `self.null_values` string.
876 let null_values_edit =
877 TextEdit::singleline(&mut self.null_values).desired_width(f32::INFINITY); // Take available horizontal space.
878
879 // Add the widget to the UI and set its hover text.
880 ui.add(null_values_edit).on_hover_text(
881 "Comma-separated values to interpret as null during loading.\n\
882 Leading/trailing whitespace for each value is automatically trimmed.",
883 );
884
885 // End the row in the parent Grid layout.
886 ui.end_row();
887 }
888
889 /// Renders the `DragValue` widget for setting `infer_schema_rows`.
890 /// Modifies `self.infer_schema_rows` directly.
891 fn render_schema_length_input(&mut self, ui: &mut Ui) {
892 ui.label("Infer Rows:");
893 ui.add(
894 DragValue::new(&mut self.infer_schema_rows)
895 .speed(1) // Increment/decrement speed.
896 .range(0..=usize::MAX), // 0: No inference
897 )
898 .on_hover_text(
899 "Number of rows to scan for inferring data types (CSV/JSON)\n0: No inference",
900 );
901 ui.end_row();
902 }
903
904 /// Renders the `TextEdit` widgets for CSV-specific settings: delimiter.
905 /// Modifies `self.csv_delimiter` directly.
906 fn render_csv_delimiter(&mut self, ui: &mut Ui) {
907 // CSV Delimiter Input
908 ui.label("CSV Delimiter:");
909 let csv_delimiter_edit = TextEdit::singleline(&mut self.csv_delimiter)
910 .char_limit(1) // Restrict to a single character.
911 .desired_width(f32::INFINITY);
912 ui.add(csv_delimiter_edit)
913 .on_hover_text("Enter the single character CSV delimiter");
914 ui.end_row();
915 }
916
917 /// Renders the `TextEdit` widget for the SQL table name.
918 /// Modifies `self.table_name` directly.
919 fn render_table_name_input(&mut self, ui: &mut Ui) {
920 ui.label("SQL Table Name:");
921 let table_name_edit =
922 TextEdit::singleline(&mut self.table_name).desired_width(f32::INFINITY);
923 ui.add(table_name_edit)
924 .on_hover_text("Name of the table to use in SQL queries (e.g., FROM TableName)");
925 ui.end_row();
926 }
927
928 /*
929 /// Renders the multiline `TextEdit` widget for the SQL query.
930 /// Modifies `self.query` directly.
931 fn render_sql_query_input(&mut self, ui: &mut Ui) {
932 ui.label("SQL Query:");
933 let query_edit = TextEdit::multiline(&mut self.query)
934 .desired_width(f32::INFINITY)
935 // Set a reasonable initial height for the multiline input.
936 .desired_rows(4);
937 ui.add(query_edit)
938 .on_hover_text("Enter SQL query to filter/transform data (uses Polars SQL syntax)");
939 ui.end_row();
940 }
941 */
942
    /// Renders tabbed SQL examples and the editable query input `self.query`.
    /// Handles selecting examples and editing the query. Tabs will wrap if needed.
    ///
    /// ### Logic
    /// 1. Generate SQL examples via `sql_commands` using `self.schema`.
    /// 2. Manage the selected tab index using `egui::Memory` (persisted per widget id).
    /// 3. Render **wrapping horizontal tabs** for examples using `ui.horizontal_wrapped`.
    /// 4. On tab click: update index, copy the example text into `self.query`.
    /// 5. Render a multiline `TextEdit` bound to `&mut self.query`.
    ///
    /// Note: Actual *triggering* of reload happens in `render_query` based on overall
    /// state change detection or Apply click.
    fn render_sql_query_input(&mut self, ui: &mut Ui) {
        ui.label("SQL Query:"); // Label for the whole section
        ui.vertical(|ui| {
            // Group the examples and editor vertically.
            // Keep the group from collapsing below a usable width.
            ui.set_min_width(300.0);

            // 1. Generate examples based on the current schema.
            let examples = sql_commands(&self.schema);
            if examples.is_empty() {
                // No schema/examples available: render only the editor (taller,
                // since there are no tabs) and return early.
                ui.add(
                    TextEdit::multiline(&mut self.query)
                        .desired_width(f32::INFINITY)
                        .desired_rows(8) // Slightly more rows if no examples
                        .font(egui::TextStyle::Monospace),
                );
                return; // Skip rendering examples if none exist
            }

            // 2. Fetch the selected tab index from egui Memory so the selection
            // persists across frames (keyed by this widget's id).
            let tab_id = ui.id().with("sql_query_tab_index");
            let mut selected_tab_index =
                ui.memory_mut(|mem| *mem.data.get_persisted_mut_or_default::<usize>(tab_id));

            // Clamp the stored index in case the number of examples shrank since
            // it was persisted. NOTE(review): the clamped value is only written
            // back to Memory on the next tab click (step 4), never here.
            selected_tab_index = selected_tab_index.min(examples.len().saturating_sub(1));

            // 3. Render tabs, wrapping onto new lines as needed.
            ui.separator();
            ui.label("Examples:"); // Label for the tabs

            // Use horizontal_wrapped to lay out tabs, wrapping them onto new lines.
            ui.horizontal_wrapped(|ui| {
                // One selectable label ("tab") per example.
                for i in 0..examples.len() {
                    let is_selected = selected_tab_index == i;
                    let tab_name = format!("{}", i + 1); // Simple number for the tab

                    // Create the selectable label (acting as a tab).
                    let resp = ui
                        .selectable_label(is_selected, tab_name)
                        // Show the first line of the SQL query as hover text.
                        .on_hover_text(
                            examples
                                .get(i) // Safely get the example string
                                .and_then(|s| s.lines().next()) // Get the first line
                                .unwrap_or(""), // Default to empty string if error/empty
                        );

                    // 4. Handle tab click (only when switching to a different tab).
                    if resp.clicked() && !is_selected {
                        selected_tab_index = i; // Update the selected index

                        // Copy the clicked example into the query editor; the change
                        // is detected by `render_query` comparing before/after state.
                        if let Some(example_query) = examples.get(i) {
                            self.query = example_query.clone(); // Set editor text
                            tracing::debug!(
                                "Switched SQL Query tab to Example {}, query text updated.",
                                i + 1
                            );
                        }

                        // Store the newly selected index back into egui's memory.
                        ui.memory_mut(|mem| mem.data.insert_persisted(tab_id, selected_tab_index));
                    }
                }
            }); // End horizontal_wrapped

            ui.separator(); // Separator between tabs and editor

            // 5. Render the ACTIVE query editor below the tabs.
            ui.add(
                TextEdit::multiline(&mut self.query)
                    .desired_width(f32::INFINITY) // Take full available width
                    .desired_rows(6) // Set preferred number of visible lines
                    .font(egui::TextStyle::Monospace), // Use a monospace font for SQL
            )
            .on_hover_text(
                "Enter SQL query (Polars SQL).\n\
                Click Example tabs above.\n\
                Changes trigger reload on Apply/focus change.",
            );
        }); // End vertical group
        ui.end_row(); // End the row in the parent Grid layout
    }
1040
1041 /// Renders the collapsible section displaying SQL command examples.
1042 /// Uses `sql_commands` to generate examples relevant to the current `self.schema`.
1043 fn render_sql_examples(&self, ui: &mut Ui) {
1044 CollapsingHeader::new("SQL Command Examples")
1045 .default_open(false)
1046 .show(ui, |ui| {
1047 // Tip about quoting identifiers.
1048 let quoting_tip = "Tip: Use double quotes (\") or backticks (`) around column names with spaces or special characters (e.g., \"Column Name\" or `Column Name`).";
1049 ui.label(quoting_tip);
1050
1051 // Frame around the examples.
1052 Frame::default()
1053 .stroke(Stroke::new(1.0, Color32::GRAY))
1054 .outer_margin(2.0)
1055 .inner_margin(10.0)
1056 .show(ui, |ui| {
1057 // Link to Polars SQL documentation.
1058 ui.vertical_centered(|ui| {
1059 let polars_sql_url = "https://docs.pola.rs/api/python/stable/reference/sql/index.html";
1060 ui.hyperlink_to("Polars SQL Reference", polars_sql_url).on_hover_text(polars_sql_url);
1061 });
1062 ui.separator();
1063
1064 // Generate and display SQL examples based on the current schema.
1065 // The `sql_commands` function (in `sqls.rs`) dynamically creates these.
1066 let examples = sql_commands(&self.schema);
1067 let mut ex_num = Vec::new();
1068 for (index, example) in examples.iter().enumerate() {
1069 ex_num.push(format!("Example {count}:\n{example}", count = index + 1));
1070 }
1071
1072 // Make the examples selectable for easy copying.
1073 ui.add(egui::Label::new(ex_num.join("\n\n")).selectable(true));
1074 });
1075 });
1076 }
1077}
1078
1079/// Reads a CSV file from the specified path using Polars, applying given options
1080/// and limiting the number of data rows read.
1081///
1082/// This function configures a Polars CsvReader with specific parsing and reading
1083/// options and executes the read operation directly from the file path.
1084/// It's suitable for getting the schema (when `infer_schema_length(Some(0))`) or
1085/// reading a limited number of initial data rows (`with_n_rows`).
1086///
1087/// ### Configuration Behavior:
1088/// - `has_header(true)`: Assumes the file has a header row for column names.
1089/// - `infer_schema_length(Some(0))`: Instructs Polars to infer column names from the
1090/// header row *only*, using default types (typically String), without using data
1091/// rows to guess types. If combined with `n_rows > 0`, it reads data
1092/// rows but ignores their content for type inference.
1093/// - `with_n_rows(n_rows)`: Limits the number of *data* rows parsed after the header.
1094/// - `ignore_errors(true)`: Skips rows/fields with parsing errors rather than stopping.
1095/// - `missing_is_null(true)`: Treats empty fields (`""`) as null values.
1096pub async fn read_csv_partial_from_path(
1097 delimiter: u8,
1098 n_rows: usize,
1099 path: &Path,
1100) -> PolarsViewResult<DataFrame> {
1101 tracing::debug!("Read a CSV file using Polars limited to {} rows.", n_rows,);
1102
1103 // 1. Define the CSV parsing options.
1104 let csv_parse_options = CsvParseOptions::default()
1105 .with_encoding(CsvEncoding::LossyUtf8) // Handle potentially non-strict UTF8
1106 .with_missing_is_null(true) // Treat empty fields as nulls
1107 .with_separator(delimiter); // Set the chosen delimiter
1108
1109 // 2. Define the main CSV reading options.
1110 let csv_read_options = CsvReadOptions::default()
1111 .with_parse_options(csv_parse_options) // Apply the parsing sub-options
1112 .with_has_header(true) // File has a header row
1113 .with_infer_schema_length(Some(0)) // Number of rows to use for schema inference (0 means header only)
1114 .with_ignore_errors(true) // Allow skipping rows/fields that fail to parse
1115 .with_n_rows(Some(n_rows)) // Limits the number of rows to read.
1116 .try_into_reader_with_file_path(Some(path.to_path_buf()))?;
1117
1118 // 3. Execute the blocking read operation on a separate thread
1119 let df = execute_polars_blocking(move || csv_read_options.finish()).await?;
1120
1121 tracing::debug!("Partial CSV read complete. Shape: {:?}", df.shape());
1122 Ok(df)
1123}
1124
1125/// Builds a Polars Schema specifying DataType::String overrides for columns
1126/// whose names match a given regex pattern or wildcard.
1127///
1128/// This function creates a schema intended for the `with_dtypes` option
1129/// of Polars readers (like `LazyCsvReader`), ensuring specific columns are treated
1130/// as text regardless of their inferred type.
1131fn build_dtype_override_schema(
1132 input_schema: &Arc<Schema>,
1133 regex_pattern: &str,
1134) -> PolarsViewResult<Schema> {
1135 let mut overrides_schema = Schema::default(); // Initialize the resulting schema
1136
1137 // --- Handle Wildcard Case ("*") ---
1138 // If the pattern is "*", override ALL columns to String.
1139 if regex_pattern.trim() == "*" {
1140 tracing::debug!(
1141 "Wildcard pattern '{regex_pattern}' provided. Overriding all columns to String."
1142 );
1143 return Ok(input_schema.as_ref().clone()); // Return the fully populated override schema
1144 }
1145
1146 // --- Handle Specific Regex Pattern Case ---
1147 // If it's not a wildcard, compile the regex pattern.
1148
1149 // Validate the required ^...$ format *before* compiling
1150 if !(regex_pattern.starts_with('^') && regex_pattern.ends_with('$')) {
1151 return Err(PolarsViewError::InvalidRegexPattern(
1152 regex_pattern.to_string(),
1153 ));
1154 }
1155
1156 // Attempt to compile the regex
1157 let compiled_regex = match Regex::new(regex_pattern) {
1158 Ok(re) => re,
1159 Err(e) => {
1160 // Return specific error for invalid syntax
1161 return Err(PolarsViewError::InvalidRegexSyntax {
1162 pattern: regex_pattern.to_string(),
1163 error: e.to_string(),
1164 });
1165 }
1166 };
1167
1168 // Check this compiled regex against each actual header name.
1169 for col_name in input_schema.iter_names() {
1170 if compiled_regex.is_match(col_name) {
1171 // Insert the override into the schema.
1172 overrides_schema.insert(col_name.clone(), DataType::String);
1173 }
1174 }
1175
1176 // Log the final outcome for debugging purposes.
1177 if !overrides_schema.is_empty() {
1178 tracing::debug!(
1179 override_cols = ?overrides_schema.iter_names().collect::<Vec<_>>(),
1180 "Pattern '{}' matched {} columns: ",
1181 regex_pattern,
1182 overrides_schema.len()
1183 );
1184 } else {
1185 tracing::debug!("Provided regex patterns did not match any header columns.");
1186 }
1187
1188 Ok(overrides_schema) // Return the successfully built schema (might be empty)
1189}
1190
1191/// Helper function to find a unique column name based on a base name and schema.
1192/// Appends suffixes "_1", "_2", etc., if the base name conflicts with existing column names.
1193fn resolve_unique_column_name(base_name: &str, schema: &Schema) -> PolarsResult<PlSmallStr> {
1194 // Check if the base name is available first (most common case)
1195 if schema.get(base_name).is_none() {
1196 tracing::debug!("Base name '{}' is available.", base_name);
1197 return Ok(base_name.into());
1198 }
1199
1200 // Base name conflicts, generate alternative names with suffixes.
1201 tracing::debug!(
1202 "Base name '{}' conflicts. Searching unique name.",
1203 base_name
1204 );
1205 let mut suffix_counter = 1u32;
1206 loop {
1207 let candidate_name = format!("{base_name}_{suffix_counter}");
1208
1209 if schema.get(&candidate_name).is_none() {
1210 // Found a unique name
1211 tracing::debug!("Found unique name: '{}'.", candidate_name);
1212 return Ok(candidate_name.into()); // Return the unique name
1213 }
1214
1215 // Safety check for potential overflow and limit attempts
1216 suffix_counter = suffix_counter.checked_add(1).unwrap_or(MAX_ATTEMPTS); // If overflow, go to max attempts
1217
1218 // Prevent infinite loops. Return error if a unique name cannot be found after max attempts.
1219 if suffix_counter >= MAX_ATTEMPTS {
1220 let msg = format!(
1221 "Failed to find a unique column name starting with '{base_name}' after {MAX_ATTEMPTS} attempts."
1222 );
1223 tracing::error!("{}", msg);
1224 return Err(PolarsError::ComputeError(msg.into()));
1225 }
1226 }
1227}
1228
1229/// Executes a potentially blocking Polars operation on a separate Tokio blocking thread.
1230///
1231/// Wraps the closure `op` which is expected to return a `PolarsResult<T>`,
1232/// runs it with `spawn_blocking`, awaits the result, and maps both the
1233/// `JoinError` and the inner `PolarsError` to `PolarsViewError`.
1234///
1235/// ### Arguments
1236/// * `op`: A closure that performs the blocking work and returns `PolarsResult<T>`.
1237/// It must be `Send` and have a `'static` lifetime, meaning it must
1238/// take ownership of or only use data that can be moved across threads
1239/// and lives for the duration of the program (or the task).
1240///
1241/// ### Returns
1242/// A `PolarsViewResult<T>` containing the result of the operation `T` on success,
1243/// or a mapped `PolarsViewError` if the spawned task fails (`TokioJoin`) or
1244/// the Polars operation itself fails (`Polars`).
1245async fn execute_polars_blocking<T, F>(op: F) -> PolarsViewResult<T>
1246where
1247 // F is the type of the closure
1248 F: FnOnce() -> Result<T, PolarsError> + Send + 'static, // The closure trait bounds
1249 // T is the success type returned by the closure (e.g., DataFrame)
1250 T: Debug + Send + 'static, // The success type must be Send and have static lifetime
1251 // PolarsError: Debug,
1252{
1253 // Spawn the blocking task
1254 let result_from_task = spawn_blocking(op).await; // Result<Result<T, PolarsError>, JoinError>
1255
1256 // Map JoinError to PolarsViewError::TokioJoin
1257 let polars_result = result_from_task.map_err(PolarsViewError::from)?; // Requires PolarsViewError::from(JoinError)
1258
1259 // Map PolarsError to PolarsViewError::Polars
1260 let final_result = polars_result.map_err(PolarsViewError::from)?; // Requires PolarsViewError::from(PolarsError)
1261
1262 Ok(final_result) // Return the successfully extracted value or the mapped PolarsError
1263}
1264
1265//----------------------------------------------------------------------------//
1266// Tests //
1267//----------------------------------------------------------------------------//
1268
/// Run tests with:
/// cargo test -- --show-output tests_override_columns
#[cfg(test)]
mod tests_override_columns {
    use super::*;
    use std::{fs::File, io::Write};
    use tempfile::NamedTempFile;

    // --- Test Setup Helper ---
    /// Writes `content` to a temporary CSV file and builds a `DataFilter`
    /// pointing at it. The returned `NamedTempFile` handle must be kept alive
    /// for the duration of the test, or the file is deleted.
    fn setup_test_csv(
        content: &str, // CSV content as string
        delimiter: char,
        force_string_patterns: Option<String>, // Regex Columns to configure for override
    ) -> PolarsViewResult<(NamedTempFile, DataFilter)> {
        let temp_file = NamedTempFile::new()?;
        let file_path = temp_file.path().to_path_buf();

        // Write content to the temp file
        let mut file = File::create(&file_path)?;
        file.write_all(content.as_bytes())?;
        file.flush()?; // Ensure data is written

        // Create DataFilter using struct update syntax (Clippy Fix)
        let filter = DataFilter {
            absolute_path: file_path, // Set specific value
            force_string_patterns, // Set specific value
            csv_delimiter: delimiter.to_string(), // Set specific value
            ..Default::default() // Fill the rest with defaults
        };

        Ok((temp_file, filter))
    }

    // --- Test Case 1: Override applied correctly ---
    #[tokio::test] // Requires tokio features in dev-dependencies
    async fn test_csv_read_with_override_success() -> PolarsViewResult<()> {
        println!("\n--- Test: Override Applied Successfully ---");
        // 1. Define CSV Content with large numbers AS TEXT
        let csv_content = "\
long_id;value;text
12345678901234567890123456789012345678901234;10.5;abc
98765432109876543210987654321098765432109876;20.0;def
12345;30.7;ghi";
        // No need for df_input - the csv_content is the direct input representation
        println!("Input CSV Content:\n{csv_content}\n");

        // 2. Define Expected Output DataFrame (long_id is String)
        let df_expected = df!(
            "long_id" => &[
                "12345678901234567890123456789012345678901234",
                "98765432109876543210987654321098765432109876",
                "12345"
            ],
            "value" => &[10.5, 20.0, 30.7],
            "text" => &["abc", "def", "ghi"]
        )
        .expect("Failed to create expected DataFrame");
        println!("Expected DF (After Read):\n{df_expected}");

        // 3. Setup: Use helper to create CSV and Filter WITH the override
        let delimiter = ';';
        let col_regex = "^long_id$".to_string();
        let (_temp_file, filter) = // Keep _temp_file handle!
            setup_test_csv(csv_content, delimiter, Some(col_regex))?;

        let schema = filter
            .attempt_csv_parse_structure(delimiter as u8, false)
            .await?;
        println!("schema: {schema:#?}");

        // 4. Execute the function under test
        let lazyframe = filter.attempt_read_csv(delimiter as u8, &schema).await?;
        println!("get lazyframe");

        // Execute the lazy plan and collect into an eager DataFrame.
        // Collect is blocking, so route it through the shared helper.
        let df_output =
            execute_polars_blocking(move || lazyframe.with_new_streaming(true).collect()).await?;

        println!("Output DF (Actual Read):\n{df_output}");

        // 5. Assertions
        assert_eq!(
            df_output.schema().get("long_id"),
            Some(&DataType::String),
            "Schema Check Failed: 'long_id' should be DataType::String"
        );
        assert_eq!(
            df_output.schema().get("value"),
            Some(&DataType::Float64),
            "Schema Check Failed: 'value' should be DataType::Float64"
        );
        assert_eq!(
            df_output.schema().get("text"),
            Some(&DataType::String),
            "Schema Check Failed: 'text' should be DataType::String"
        );
        assert_eq!(
            df_output, df_expected,
            "Content Check Failed: Output DF does not match expected DF"
        );

        Ok(())
    }

    // --- Test Case 2: Override *not* applied (expect nulls) ---
    #[tokio::test] // Requires tokio features in dev-dependencies
    async fn test_csv_read_without_override_yields_nulls() -> PolarsViewResult<()> {
        println!("\n--- Test: No Override Applied (Expect Nulls) ---");
        // 1. Define CSV Content (same large numbers AS TEXT)
        let csv_content = "\
long_id;value;text
12345678901234567890123456789012345678901234;10.5;abc
98765432109876543210987654321098765432109876;20.0;def";
        println!("Input CSV Content:\n{csv_content}\n");

        // 2. Define Expected Output Pattern (long_id should be all nulls)
        let df_expected_pattern = df!(
            "long_id" => Series::new_null("long_id".into(), 2).cast(&DataType::Int64)?, // Series of 2 nulls
            "value" => &[10.5, 20.0],
            "text" => &["abc", "def"]
        )
        .expect("Failed to create expected pattern DataFrame");
        println!("Expected DF Pattern (After Read, note long_id nulls):\n{df_expected_pattern}");

        // 3. Setup: Use helper with a regex that matches NO column name,
        // so no String override is applied to 'long_id'.
        let delimiter = ';';
        let col_regex = "^Col Name$".to_string();
        let (_temp_file, filter) = setup_test_csv(csv_content, delimiter, Some(col_regex))?;

        let schema = filter
            .attempt_csv_parse_structure(delimiter as u8, false)
            .await?;
        println!("schema: {schema:#?}");

        // 4. Execute the function under test
        let lazyframe = filter.attempt_read_csv(delimiter as u8, &schema).await?;
        println!("get lazyframe");

        // Execute the lazy plan and collect into an eager DataFrame.
        // Use the shared `execute_polars_blocking` helper (consistent with test 1)
        // instead of hand-rolling spawn_blocking + double map_err.
        let df_output =
            execute_polars_blocking(move || lazyframe.with_new_streaming(true).collect()).await?;

        println!("Output DF (Actual Read):\n{df_output}");

        // 5. Assertions
        let long_id_col = df_output.column("long_id")?;
        assert!(
            long_id_col.is_null().all(), // Verify ALL values are null
            "Content Check Failed: 'long_id' column should be all nulls without override. Type: {:?}, Null count: {}",
            long_id_col.dtype(),
            long_id_col.null_count()
        );
        // Verify other columns match the pattern
        assert_eq!(
            df_output.column("value")?,
            df_expected_pattern.column("value")?
        );
        assert_eq!(
            df_output.column("text")?,
            df_expected_pattern.column("text")?
        );

        Ok(())
    }
}