1#![allow(clippy::should_implement_trait)]
6use super::functions::detect_delimiter;
7#[allow(unused_imports)]
8use super::functions::*;
9#[allow(unused_imports)]
10use super::functions_2::*;
11
/// One data row of a CSV table.
#[allow(dead_code)]
pub struct CsvRecord {
    /// Cell text for this row, in column order.
    pub fields: Vec<String>,
}
18#[allow(dead_code)]
20pub struct CsvFile {
21 pub headers: Vec<String>,
23 pub records: Vec<CsvRecord>,
25}
26impl CsvFile {
27 pub fn new(headers: Vec<String>) -> Self {
29 CsvFile {
30 headers,
31 records: Vec::new(),
32 }
33 }
34 pub fn add_record(&mut self, fields: Vec<String>) {
36 self.records.push(CsvRecord { fields });
37 }
38 pub fn add_record_f64(&mut self, values: &[f64]) {
40 let fields = values.iter().map(|v| format!("{}", v)).collect();
41 self.records.push(CsvRecord { fields });
42 }
43 pub fn record_count(&self) -> usize {
45 self.records.len()
46 }
47 pub fn column_count(&self) -> usize {
49 self.headers.len()
50 }
51 pub fn get_column_f64(&self, col_idx: usize) -> Result<Vec<f64>, String> {
53 if col_idx >= self.headers.len() {
54 return Err(format!("Column index {} out of range", col_idx));
55 }
56 let mut out = Vec::with_capacity(self.records.len());
57 for (row, rec) in self.records.iter().enumerate() {
58 let s = rec
59 .fields
60 .get(col_idx)
61 .ok_or_else(|| format!("Row {} has no field at column {}", row, col_idx))?;
62 let v: f64 = s
63 .trim()
64 .parse()
65 .map_err(|e| format!("Row {}, col {}: parse error: {}", row, col_idx, e))?;
66 out.push(v);
67 }
68 Ok(out)
69 }
70 pub fn get_column_by_name(&self, name: &str) -> Option<usize> {
72 self.headers.iter().position(|h| h == name)
73 }
74 #[allow(clippy::inherent_to_string)]
76 pub fn to_string(&self) -> String {
77 self.to_string_with_delimiter(',')
78 }
79 #[allow(dead_code)]
81 pub fn to_string_with_delimiter(&self, delim: char) -> String {
82 let mut out = String::new();
83 let d = delim.to_string();
84 out.push_str(&self.headers.join(&d));
85 out.push('\n');
86 for rec in &self.records {
87 out.push_str(&rec.fields.join(&d));
88 out.push('\n');
89 }
90 out
91 }
92 pub fn from_str(s: &str) -> Result<Self, String> {
94 Self::from_str_with_delimiter(s, ',')
95 }
96 #[allow(dead_code)]
98 pub fn from_str_with_delimiter(s: &str, delim: char) -> Result<Self, String> {
99 let mut lines = s.lines();
100 let header_line = lines.next().ok_or("Empty CSV input")?;
101 let headers: Vec<String> = header_line
102 .split(delim)
103 .map(|f| f.trim().to_string())
104 .collect();
105 if headers.is_empty() || headers.iter().all(|h| h.is_empty()) {
106 return Err("No headers found".to_string());
107 }
108 let mut records = Vec::new();
109 for line in lines {
110 if line.trim().is_empty() {
111 continue;
112 }
113 let fields: Vec<String> = line.split(delim).map(|f| f.trim().to_string()).collect();
114 records.push(CsvRecord { fields });
115 }
116 Ok(CsvFile { headers, records })
117 }
118 pub fn filter_rows(&self, col_idx: usize, pred: impl Fn(f64) -> bool) -> CsvFile {
121 let mut out = CsvFile::new(self.headers.clone());
122 for rec in &self.records {
123 if let Some(s) = rec.fields.get(col_idx)
124 && let Ok(v) = s.trim().parse::<f64>()
125 && pred(v)
126 {
127 out.records.push(CsvRecord {
128 fields: rec.fields.clone(),
129 });
130 }
131 }
132 out
133 }
134 #[allow(dead_code)]
136 pub fn infer_column_type(&self, col_idx: usize) -> ColumnType {
137 if col_idx >= self.headers.len() {
138 return ColumnType::Text;
139 }
140 let mut all_int = true;
141 let mut all_float = true;
142 let mut any_value = false;
143 for rec in &self.records {
144 if let Some(s) = rec.fields.get(col_idx) {
145 let s = s.trim();
146 if s.is_empty() {
147 continue;
148 }
149 any_value = true;
150 if s.parse::<i64>().is_err() {
151 all_int = false;
152 }
153 if s.parse::<f64>().is_err() {
154 all_float = false;
155 }
156 }
157 }
158 if !any_value {
159 return ColumnType::Text;
160 }
161 if all_int {
162 ColumnType::Integer
163 } else if all_float {
164 ColumnType::Float
165 } else {
166 ColumnType::Text
167 }
168 }
169 #[allow(dead_code)]
171 pub fn select_columns(&self, col_indices: &[usize]) -> CsvFile {
172 let headers: Vec<String> = col_indices
173 .iter()
174 .filter_map(|&i| self.headers.get(i).cloned())
175 .collect();
176 let mut out = CsvFile::new(headers);
177 for rec in &self.records {
178 let fields: Vec<String> = col_indices
179 .iter()
180 .map(|&i| rec.fields.get(i).cloned().unwrap_or_default())
181 .collect();
182 out.records.push(CsvRecord { fields });
183 }
184 out
185 }
186 #[allow(dead_code)]
189 pub fn select_columns_by_name(&self, names: &[&str]) -> CsvFile {
190 let indices: Vec<usize> = names
191 .iter()
192 .filter_map(|n| self.get_column_by_name(n))
193 .collect();
194 self.select_columns(&indices)
195 }
196 #[allow(dead_code)]
199 pub fn normalize_headers(&mut self) {
200 for h in &mut self.headers {
201 let normalized: String = h
202 .trim()
203 .to_lowercase()
204 .chars()
205 .map(|c| {
206 if c.is_alphanumeric() || c == '_' {
207 c
208 } else {
209 '_'
210 }
211 })
212 .collect();
213 *h = normalized;
214 }
215 }
216 #[allow(dead_code)]
219 pub fn column_stats(&self, col_idx: usize) -> Option<ColumnStats> {
220 let values = self.get_column_f64(col_idx).ok()?;
221 if values.is_empty() {
222 return None;
223 }
224 let mut min = f64::INFINITY;
225 let mut max = f64::NEG_INFINITY;
226 let mut sum = 0.0;
227 for &v in &values {
228 if v < min {
229 min = v;
230 }
231 if v > max {
232 max = v;
233 }
234 sum += v;
235 }
236 let count = values.len();
237 Some(ColumnStats {
238 min,
239 max,
240 mean: sum / count as f64,
241 count,
242 sum,
243 })
244 }
245 #[allow(dead_code)]
248 pub fn all_column_stats(&self) -> Vec<(String, ColumnStats)> {
249 let mut result = Vec::new();
250 for i in 0..self.headers.len() {
251 if let Some(stats) = self.column_stats(i) {
252 result.push((self.headers[i].clone(), stats));
253 }
254 }
255 result
256 }
257 #[allow(dead_code)]
259 pub fn get_column_strings(&self, col_idx: usize) -> Result<Vec<String>, String> {
260 if col_idx >= self.headers.len() {
261 return Err(format!("Column index {} out of range", col_idx));
262 }
263 let mut out = Vec::with_capacity(self.records.len());
264 for (row, rec) in self.records.iter().enumerate() {
265 let s = rec
266 .fields
267 .get(col_idx)
268 .ok_or_else(|| format!("Row {} has no field at column {}", row, col_idx))?;
269 out.push(s.trim().to_string());
270 }
271 Ok(out)
272 }
273 #[allow(dead_code)]
275 pub fn get_column_i64(&self, col_idx: usize) -> Result<Vec<i64>, String> {
276 if col_idx >= self.headers.len() {
277 return Err(format!("Column index {} out of range", col_idx));
278 }
279 let mut out = Vec::with_capacity(self.records.len());
280 for (row, rec) in self.records.iter().enumerate() {
281 let s = rec
282 .fields
283 .get(col_idx)
284 .ok_or_else(|| format!("Row {} has no field at column {}", row, col_idx))?;
285 let v: i64 = s
286 .trim()
287 .parse()
288 .map_err(|e| format!("Row {}, col {}: parse error: {}", row, col_idx, e))?;
289 out.push(v);
290 }
291 Ok(out)
292 }
293 #[allow(dead_code)]
295 pub fn sort_by_column(&mut self, col_idx: usize) {
296 self.records.sort_by(|a, b| {
297 let va = a
298 .fields
299 .get(col_idx)
300 .and_then(|s| s.trim().parse::<f64>().ok())
301 .unwrap_or(f64::NAN);
302 let vb = b
303 .fields
304 .get(col_idx)
305 .and_then(|s| s.trim().parse::<f64>().ok())
306 .unwrap_or(f64::NAN);
307 va.partial_cmp(&vb).unwrap_or(std::cmp::Ordering::Equal)
308 });
309 }
310}
311#[allow(dead_code)]
313pub struct CsvWriter {
314 pub(super) headers: Vec<String>,
315 pub(super) delimiter: char,
316 pub(super) lines: Vec<String>,
317}
318impl CsvWriter {
319 pub fn new(headers: Vec<String>, delimiter: char) -> Self {
321 let header_line = headers.join(&delimiter.to_string());
322 Self {
323 headers,
324 delimiter,
325 lines: vec![header_line],
326 }
327 }
328 pub fn write_row(&mut self, values: &[&str]) {
331 let n = self.headers.len();
332 let row: Vec<&str> = (0..n)
333 .map(|i| values.get(i).copied().unwrap_or(""))
334 .collect();
335 self.lines.push(row.join(&self.delimiter.to_string()));
336 }
337 pub fn write_row_f64(&mut self, values: &[f64]) {
339 let strs: Vec<String> = values.iter().map(|v| format!("{v:.6}")).collect();
340 let refs: Vec<&str> = strs.iter().map(String::as_str).collect();
341 self.write_row(&refs);
342 }
343 pub fn finish(self) -> String {
345 self.lines.join("\n")
346 }
347 pub fn row_count(&self) -> usize {
349 self.lines.len().saturating_sub(1)
350 }
351}
/// Selector for an aggregation operation.
///
/// Only the variant names are defined here; the code that applies them is
/// outside this block.
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(dead_code)]
pub enum AggOp {
    /// Sum aggregation.
    Sum,
    /// Mean aggregation.
    Mean,
    /// Minimum aggregation.
    Min,
    /// Maximum aggregation.
    Max,
    /// Standard-deviation aggregation.
    Std,
    /// Count aggregation.
    Count,
}
369#[allow(dead_code)]
373pub struct TimeSeriesCsv {
374 pub csv: CsvFile,
376 pub time_column: String,
378}
379impl TimeSeriesCsv {
380 pub fn new(csv: CsvFile, time_column: &str) -> Self {
382 Self {
383 csv,
384 time_column: time_column.to_owned(),
385 }
386 }
387 pub fn from_str(s: &str, time_column: &str) -> Result<Self, String> {
389 let csv = CsvFile::from_str(s)?;
390 Ok(Self::new(csv, time_column))
391 }
392 pub fn times(&self) -> Option<Vec<f64>> {
395 let idx = self
396 .csv
397 .headers
398 .iter()
399 .position(|h| h == &self.time_column)?;
400 self.csv.get_column_f64(idx).ok()
401 }
402 pub fn column_f64(&self, name: &str) -> Option<Vec<f64>> {
404 let idx = self.csv.headers.iter().position(|h| h == name)?;
405 self.csv.get_column_f64(idx).ok()
406 }
407 pub fn n_steps(&self) -> usize {
409 self.csv.records.len()
410 }
411 pub fn duration(&self) -> f64 {
413 let ts = self.times().unwrap_or_default();
414 if ts.len() < 2 {
415 return 0.0;
416 }
417 let min = ts.iter().cloned().fold(f64::INFINITY, f64::min);
418 let max = ts.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
419 max - min
420 }
421}
422#[allow(dead_code)]
424#[derive(Debug, Clone)]
425pub struct CsvSchema {
426 pub columns: Vec<(String, ColumnType)>,
428}
429impl CsvSchema {
430 pub fn new(columns: Vec<(String, ColumnType)>) -> Self {
432 Self { columns }
433 }
434 pub fn len(&self) -> usize {
436 self.columns.len()
437 }
438 pub fn is_empty(&self) -> bool {
440 self.columns.is_empty()
441 }
442 pub fn validate(&self, csv: &CsvFile) -> Vec<String> {
448 let mut errors = Vec::new();
449 if csv.headers.len() != self.columns.len() {
450 errors.push(format!(
451 "column count mismatch: schema has {} columns, file has {}",
452 self.columns.len(),
453 csv.headers.len()
454 ));
455 return errors;
456 }
457 for (col_idx, (name, expected_type)) in self.columns.iter().enumerate() {
458 if csv.headers[col_idx] != *name {
459 errors.push(format!(
460 "column {} name mismatch: expected '{}', got '{}'",
461 col_idx, name, csv.headers[col_idx]
462 ));
463 }
464 for (row_idx, record) in csv.records.iter().enumerate() {
465 if col_idx >= record.fields.len() {
466 errors.push(format!("row {} column {} missing", row_idx, col_idx));
467 continue;
468 }
469 let v = &record.fields[col_idx];
470 match expected_type {
471 ColumnType::Integer => {
472 if v.parse::<i64>().is_err() {
473 errors.push(format!(
474 "row {} column '{}': expected Integer, got '{}'",
475 row_idx, name, v
476 ));
477 }
478 }
479 ColumnType::Float => {
480 if v.parse::<f64>().is_err() {
481 errors.push(format!(
482 "row {} column '{}': expected Float, got '{}'",
483 row_idx, name, v
484 ));
485 }
486 }
487 ColumnType::Text => {}
488 }
489 }
490 }
491 errors
492 }
493}
494pub struct LazyCsvIter<'a> {
500 pub(super) lines: std::str::Lines<'a>,
501 pub(super) delimiter: char,
502 pub headers: Vec<String>,
504}
505impl<'a> LazyCsvIter<'a> {
506 pub fn new(input: &'a str, delimiter: char) -> Self {
509 let mut lines = input.lines();
510 let headers = lines
511 .next()
512 .map(|h| {
513 h.split(delimiter)
514 .map(str::trim)
515 .map(String::from)
516 .collect()
517 })
518 .unwrap_or_default();
519 Self {
520 lines,
521 delimiter,
522 headers,
523 }
524 }
525}
/// Collected validation messages for a CSV input.
#[derive(Debug, Default)]
#[allow(dead_code)]
pub struct CsvValidationReport {
    /// One entry per problem found; empty when nothing was reported.
    pub errors: Vec<String>,
}

impl CsvValidationReport {
    /// `true` when no errors were recorded.
    pub fn is_valid(&self) -> bool {
        self.error_count() == 0
    }

    /// Number of recorded errors.
    pub fn error_count(&self) -> usize {
        self.errors.len()
    }
}
/// Value type attributed to a CSV column.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq)]
pub enum ColumnType {
    /// Whole numbers.
    Integer,
    /// Floating-point numbers.
    Float,
    /// Arbitrary text.
    Text,
}
554#[allow(dead_code)]
556#[derive(Debug, Clone)]
557pub enum CsvColumnData {
558 Integer(Vec<i64>),
560 Float(Vec<f64>),
562 Text(Vec<String>),
564}
565#[allow(dead_code)]
566impl CsvColumnData {
567 pub fn len(&self) -> usize {
569 match self {
570 CsvColumnData::Integer(v) => v.len(),
571 CsvColumnData::Float(v) => v.len(),
572 CsvColumnData::Text(v) => v.len(),
573 }
574 }
575 pub fn is_empty(&self) -> bool {
577 self.len() == 0
578 }
579 pub fn column_type(&self) -> ColumnType {
581 match self {
582 CsvColumnData::Integer(_) => ColumnType::Integer,
583 CsvColumnData::Float(_) => ColumnType::Float,
584 CsvColumnData::Text(_) => ColumnType::Text,
585 }
586 }
587}
588#[allow(dead_code)]
593#[derive(Debug, Clone)]
594pub struct CsvDataFrame {
595 pub column_names: Vec<String>,
597 pub columns: Vec<CsvColumnData>,
599}
600#[allow(dead_code)]
601impl CsvDataFrame {
602 pub fn from_csv(csv: &CsvFile) -> Self {
604 let mut column_names = csv.headers.clone();
605 let mut columns: Vec<CsvColumnData> = Vec::with_capacity(csv.headers.len());
606 for col_idx in 0..csv.headers.len() {
607 let col_type = csv.infer_column_type(col_idx);
608 let col_data = match col_type {
609 ColumnType::Integer => {
610 let vals: Vec<i64> = csv
611 .records
612 .iter()
613 .map(|r| {
614 r.fields
615 .get(col_idx)
616 .and_then(|s| s.trim().parse::<i64>().ok())
617 .unwrap_or(0)
618 })
619 .collect();
620 CsvColumnData::Integer(vals)
621 }
622 ColumnType::Float => {
623 let vals: Vec<f64> = csv
624 .records
625 .iter()
626 .map(|r| {
627 r.fields
628 .get(col_idx)
629 .and_then(|s| s.trim().parse::<f64>().ok())
630 .unwrap_or(f64::NAN)
631 })
632 .collect();
633 CsvColumnData::Float(vals)
634 }
635 ColumnType::Text => {
636 let vals: Vec<String> = csv
637 .records
638 .iter()
639 .map(|r| {
640 r.fields
641 .get(col_idx)
642 .map(|s| s.trim().to_string())
643 .unwrap_or_default()
644 })
645 .collect();
646 CsvColumnData::Text(vals)
647 }
648 };
649 columns.push(col_data);
650 }
651 for (i, name) in column_names.iter_mut().enumerate() {
652 if name.is_empty() {
653 *name = format!("col_{}", i);
654 }
655 }
656 CsvDataFrame {
657 column_names,
658 columns,
659 }
660 }
661 pub fn from_str(s: &str) -> std::result::Result<Self, String> {
665 let csv = CsvFile::from_str(s).map_err(|e| format!("line 1: {e}"))?;
666 Ok(Self::from_csv(&csv))
667 }
668 pub fn from_str_with_delimiter(s: &str, delim: char) -> std::result::Result<Self, String> {
670 let csv = CsvFile::from_str_with_delimiter(s, delim).map_err(|e| format!("line 1: {e}"))?;
671 Ok(Self::from_csv(&csv))
672 }
673 pub fn n_rows(&self) -> usize {
675 self.columns.first().map(|c| c.len()).unwrap_or(0)
676 }
677 pub fn n_cols(&self) -> usize {
679 self.columns.len()
680 }
681 pub fn column_index(&self, name: &str) -> Option<usize> {
683 self.column_names.iter().position(|n| n == name)
684 }
685 pub fn column(&self, idx: usize) -> Option<&CsvColumnData> {
687 self.columns.get(idx)
688 }
689 pub fn column_by_name(&self, name: &str) -> Option<&CsvColumnData> {
691 let idx = self.column_index(name)?;
692 self.column(idx)
693 }
694 pub fn float_column(&self, name: &str) -> Option<&Vec<f64>> {
696 match self.column_by_name(name)? {
697 CsvColumnData::Float(v) => Some(v),
698 _ => None,
699 }
700 }
701 pub fn integer_column(&self, name: &str) -> Option<&Vec<i64>> {
703 match self.column_by_name(name)? {
704 CsvColumnData::Integer(v) => Some(v),
705 _ => None,
706 }
707 }
708 pub fn text_column(&self, name: &str) -> Option<&Vec<String>> {
710 match self.column_by_name(name)? {
711 CsvColumnData::Text(v) => Some(v),
712 _ => None,
713 }
714 }
715 pub fn to_csv_string(&self) -> String {
717 let mut out = self.column_names.join(",");
718 out.push('\n');
719 let n_rows = self.n_rows();
720 for row in 0..n_rows {
721 let fields: Vec<String> = self
722 .columns
723 .iter()
724 .map(|col| match col {
725 CsvColumnData::Integer(v) => {
726 v.get(row).map(|x| x.to_string()).unwrap_or_default()
727 }
728 CsvColumnData::Float(v) => {
729 v.get(row).map(|x| format!("{}", x)).unwrap_or_default()
730 }
731 CsvColumnData::Text(v) => v.get(row).cloned().unwrap_or_default(),
732 })
733 .collect();
734 out.push_str(&fields.join(","));
735 out.push('\n');
736 }
737 out
738 }
739}
740#[allow(dead_code)]
745pub struct StreamingCsvReader<'a> {
746 pub delimiter: char,
748 pub headers: Vec<String>,
750 pub(super) lines: std::str::Lines<'a>,
751 pub(super) row: usize,
753}
754#[allow(dead_code)]
755impl<'a> StreamingCsvReader<'a> {
756 pub fn new(input: &'a str, delimiter: char) -> Self {
758 let mut lines = input.lines();
759 let headers = lines
760 .next()
761 .map(|h| {
762 h.split(delimiter)
763 .map(str::trim)
764 .map(String::from)
765 .collect()
766 })
767 .unwrap_or_default();
768 Self {
769 delimiter,
770 headers,
771 lines,
772 row: 0,
773 }
774 }
775 pub fn auto(input: &'a str) -> Self {
777 let delim = detect_delimiter(input);
778 Self::new(input, delim)
779 }
780 pub fn n_cols(&self) -> usize {
782 self.headers.len()
783 }
784 pub fn current_row(&self) -> usize {
786 self.row
787 }
788 pub fn next_row(&mut self) -> Option<Vec<String>> {
790 loop {
791 let line = self.lines.next()?;
792 if line.trim().is_empty() {
793 continue;
794 }
795 self.row += 1;
796 return Some(
797 line.split(self.delimiter)
798 .map(str::trim)
799 .map(String::from)
800 .collect(),
801 );
802 }
803 }
804 pub fn collect_all(mut self) -> CsvFile {
806 let mut file = CsvFile::new(self.headers.clone());
807 while let Some(fields) = self.next_row() {
808 file.add_record(fields);
809 }
810 file
811 }
812}
/// Summary statistics for one numeric column.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct ColumnStats {
    /// Smallest value.
    pub min: f64,
    /// Largest value.
    pub max: f64,
    /// Arithmetic mean (`sum / count`).
    pub mean: f64,
    /// Number of values summarized.
    pub count: usize,
    /// Sum of all values.
    pub sum: f64,
}
/// One frame of a trajectory: a title and a list of 3-component positions.
#[derive(Debug, Clone, Default)]
#[allow(dead_code)]
pub struct TrajectoryFrame {
    /// Title text for the frame.
    pub title: String,
    /// One `[f64; 3]` position per atom.
    pub positions: Vec<[f64; 3]>,
}

#[allow(dead_code)]
impl TrajectoryFrame {
    /// Creates a frame with an empty title and no positions.
    pub fn new() -> Self {
        TrajectoryFrame {
            title: String::new(),
            positions: Vec::new(),
        }
    }

    /// Number of stored positions.
    pub fn n_atoms(&self) -> usize {
        self.positions.len()
    }
}