sql_cli/data/
direct_csv_loader.rs

1/// Direct CSV to `DataTable` loader - bypasses JSON intermediate format
2use crate::data::datatable::{DataColumn, DataRow, DataTable};
3use crate::data::value_parsing::parse_value;
4use anyhow::Result;
5use csv;
6use std::fs::File;
7use std::io::{BufRead, BufReader};
8use std::path::Path;
9use tracing::{debug, info};
10
11pub struct DirectCsvLoader;
12
13impl DirectCsvLoader {
14    /// Load CSV directly into `DataTable` without JSON intermediate
15    pub fn load_csv_direct<P: AsRef<Path>>(path: P, table_name: &str) -> Result<DataTable> {
16        let path = path.as_ref();
17        info!("Direct CSV load: Loading {} into DataTable", path.display());
18
19        // Track memory before loading
20        crate::utils::memory_tracker::track_memory("direct_csv_start");
21
22        let file = File::open(path)?;
23        let mut reader = csv::Reader::from_reader(file);
24
25        // Get headers and create columns
26        let headers = reader.headers()?.clone(); // Clone to release the borrow
27        let mut table = DataTable::new(table_name);
28
29        for header in &headers {
30            table.add_column(DataColumn::new(header.to_string()));
31        }
32
33        crate::utils::memory_tracker::track_memory("direct_csv_headers");
34
35        // Re-open file to read raw lines for null detection
36        let file2 = File::open(path)?;
37        let mut line_reader = BufReader::new(file2);
38        let mut raw_line = String::new();
39        // Skip header line
40        line_reader.read_line(&mut raw_line)?;
41
42        // Read rows directly into DataTable
43        let mut row_count = 0;
44        for result in reader.records() {
45            let record = result?;
46
47            // Read the corresponding raw line
48            raw_line.clear();
49            line_reader.read_line(&mut raw_line)?;
50
51            let mut values = Vec::with_capacity(headers.len());
52
53            for (i, field) in record.iter().enumerate() {
54                // Check if this field is truly null (between commas) vs empty string
55                let is_null = field.is_empty() && Self::is_null_field(&raw_line, i);
56                let value = parse_value(field, is_null);
57                values.push(value);
58            }
59
60            table
61                .add_row(DataRow::new(values))
62                .map_err(|e| anyhow::anyhow!(e))?;
63            row_count += 1;
64
65            // Track memory every 5000 rows
66            if row_count % 5000 == 0 {
67                crate::utils::memory_tracker::track_memory(&format!("direct_csv_{row_count}rows"));
68            }
69        }
70
71        // Infer column types from the data
72        table.infer_column_types();
73
74        crate::utils::memory_tracker::track_memory("direct_csv_complete");
75
76        info!(
77            "Direct CSV load complete: {} rows, {} columns, ~{} MB",
78            table.row_count(),
79            table.column_count(),
80            table.estimate_memory_size() / 1024 / 1024
81        );
82
83        Ok(table)
84    }
85
86    /// Helper to detect if a field in the raw CSV line is a null (unquoted empty)
87    fn is_null_field(raw_line: &str, field_index: usize) -> bool {
88        let mut comma_count = 0;
89        let mut in_quotes = false;
90        let mut field_start = 0;
91        let mut prev_char = ' ';
92
93        for (i, ch) in raw_line.chars().enumerate() {
94            if ch == '"' && prev_char != '\\' {
95                in_quotes = !in_quotes;
96            }
97
98            if ch == ',' && !in_quotes {
99                if comma_count == field_index {
100                    let field_end = i;
101                    let field_content = &raw_line[field_start..field_end].trim();
102                    // If empty, check if it was quoted (quoted empty = empty string, unquoted empty = NULL)
103                    if field_content.is_empty() {
104                        return true; // Unquoted empty field -> NULL
105                    }
106                    // If it starts and ends with quotes but is empty inside, it's an empty string, not NULL
107                    if field_content.starts_with('"')
108                        && field_content.ends_with('"')
109                        && field_content.len() == 2
110                    {
111                        return false; // Quoted empty field -> empty string
112                    }
113                    return false; // Non-empty field -> not NULL
114                }
115                comma_count += 1;
116                field_start = i + 1;
117            }
118            prev_char = ch;
119        }
120
121        // Check last field
122        if comma_count == field_index {
123            let field_content = raw_line[field_start..]
124                .trim()
125                .trim_end_matches('\n')
126                .trim_end_matches('\r');
127            // If empty, check if it was quoted
128            if field_content.is_empty() {
129                return true; // Unquoted empty field -> NULL
130            }
131            // If it starts and ends with quotes but is empty inside, it's an empty string, not NULL
132            if field_content.starts_with('"')
133                && field_content.ends_with('"')
134                && field_content.len() == 2
135            {
136                return false; // Quoted empty field -> empty string
137            }
138            return false; // Non-empty field -> not NULL
139        }
140
141        false // Field not found -> not NULL (shouldn't happen)
142    }
143
144    /// Execute a SQL query directly on a `DataTable` (no JSON)
145    pub fn query_datatable(table: &DataTable, sql: &str) -> Result<DataTable> {
146        // For now, just return a reference/clone of the table
147        // In the future, this would apply WHERE/ORDER BY/etc directly on DataTable
148        debug!("Direct query on DataTable: {}", sql);
149
150        // Simple SELECT * for now
151        if sql.trim().to_uppercase().starts_with("SELECT *") {
152            Ok(table.clone())
153        } else {
154            // TODO: Implement proper SQL execution on DataTable
155            Ok(table.clone())
156        }
157    }
158}