tidy_viewer/datatype.rs
1//! # Data Type Module
2//!
3//! This module provides data type inference, formatting, and validation functions
4//! for tabular data processing. It handles various data types including numbers,
5//! dates, times, logical values, and missing values.
6//!
7//! ## Key Functions
8//!
9//! - **Type Detection**: Functions to identify data types (`is_integer`, `is_double`, etc.)
10//! - **Formatting**: Functions to format data with significant figures and width constraints
11//! - **Column Analysis**: Functions to analyze entire columns for type consistency
12//! - **Width Calculation**: Functions to calculate optimal column widths
13//!
14//! ## Usage Examples
15//!
16//! ```rust
17//! use tidy_viewer::datatype::{is_integer, is_double, format_strings, ValueType};
18//!
19//! // Detect data types
20//! assert!(is_integer("123"));
21//! assert!(is_double("123.45"));
22//!
23//! // Format a column of data
24//! let data = vec!["123", "456.78", "NA"];
25//! let formatted = format_strings(&data, 2, 20, 3, false, 13);
26//!
27//! // Infer column type
28//! let col_type = tidy_viewer::get_col_data_type(&data);
29//! ```
30
31use itertools::Itertools;
32use lazy_static::lazy_static;
33use regex::Regex;
34use std::str::FromStr;
35use unicode_truncate::UnicodeTruncateStr;
36use unicode_width::UnicodeWidthStr;
37
38pub mod sigfig;
39
/// Classification of a value in tabular data.
///
/// The same enum is used both for a single cell (see
/// `infer_type_from_string`) and for a whole column (see
/// `get_col_data_type`). It is a plain field-less enum that derives `Copy`
/// and `Eq`, so values can be passed by value and compared with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ValueType {
    /// Boolean/logical values (true/false, T/F, 1/0)
    Boolean,
    /// Integer numbers
    Integer,
    /// Floating-point numbers
    Double,
    /// Date values (YYYY-MM-DD format)
    Date,
    /// Time values (HH:MM:SS format)
    Time,
    /// Date and time values
    DateTime,
    /// Text/character data
    Character,
    /// Missing or null values
    Na,
}
63
/// Checks if a string represents a logical/boolean value.
///
/// The whole string must be exactly one of the following spellings; no
/// surrounding whitespace or extra characters are tolerated:
/// - `true`, `false`, `TRUE`, `FALSE`
/// - `t`, `f`, `T`, `F`
/// - `True`, `False`
/// - `1`, `0`
///
/// # Examples
///
/// ```rust
/// use tidy_viewer::datatype::is_logical;
///
/// assert!(is_logical("true"));
/// assert!(is_logical("FALSE"));
/// assert!(is_logical("1"));
/// assert!(is_logical("0"));
/// assert!(!is_logical("hello"));
/// assert!(!is_logical("Trueish"));
/// assert!(!is_logical("NOT TRUE"));
/// ```
pub fn is_logical(text: &str) -> bool {
    // col_logical -l, T,F,TRUE,FALSE,True,False,true,false,t,f,1,0
    //
    // An exact comparison against the closed set of spellings guarantees
    // that partial matches (a prefix or suffix that merely contains one of
    // the spellings) are rejected.
    matches!(
        text,
        "true"
            | "false"
            | "t"
            | "f"
            | "TRUE"
            | "FALSE"
            | "T"
            | "F"
            | "True"
            | "False"
            | "1"
            | "0"
    )
}
92
93/// Checks if a string represents an integer value.
94///
95/// Recognizes integers with optional leading sign and whitespace.
96///
97/// # Examples
98///
99/// ```rust
100/// use tidy_viewer::datatype::is_integer;
101///
102/// assert!(is_integer("123"));
103/// assert!(is_integer("-456"));
104/// assert!(is_integer(" 789 "));
105/// assert!(!is_integer("123.45"));
106/// assert!(!is_integer("abc"));
107/// ```
108pub fn is_integer(text: &str) -> bool {
109 //let integer = "5";
110 lazy_static! {
111 static ref R: Regex = Regex::new(r"^\s*([+-]?[1-9][0-9]*|0)\s*$").unwrap();
112 }
113 R.is_match(text)
114}
115
/// Checks if a string is a plain decimal number with a leading minus sign.
///
/// Leading and trailing whitespace is ignored. After the `-` the text must
/// consist of ASCII digits with at most one decimal point, and must contain
/// at least one digit (`-5`, `-3.14`, `-.5` and `-5.` are accepted; `-`,
/// `-.` and `-1a2` are not). Scientific notation is not recognised.
///
/// # Examples
///
/// ```rust
/// use tidy_viewer::datatype::is_negative_number;
///
/// assert!(is_negative_number("-42"));
/// assert!(is_negative_number(" -3.14 "));
/// assert!(!is_negative_number("42"));
/// assert!(!is_negative_number("-abc"));
/// ```
pub fn is_negative_number(text: &str) -> bool {
    let unsigned = match text.trim().strip_prefix('-') {
        Some(rest) => rest,
        None => return false,
    };

    // Split on the first dot only: a second dot stays in `fract` and is
    // then rejected by the digit check below.
    let (whole, fract) = match unsigned.split_once('.') {
        Some(parts) => parts,
        None => (unsigned, ""),
    };

    let all_digits = |part: &str| part.bytes().all(|b| b.is_ascii_digit());
    let has_digit = !(whole.is_empty() && fract.is_empty());

    has_digit && all_digits(whole) && all_digits(fract)
}
122
/// Checks if a string can be read as a floating-point number.
///
/// Surrounding whitespace is ignored; the rest is handed to the standard
/// `f64` parser, so anything that parser accepts counts as a double. That
/// includes plain integers and exponent forms such as `1e5`, as well as
/// the special spellings the parser understands (for example `inf` and
/// `NaN`).
///
/// # Examples
///
/// ```rust
/// use tidy_viewer::datatype::is_double;
///
/// assert!(is_double("123.45"));
/// assert!(is_double("-456.78"));
/// assert!(is_double("123"));
/// assert!(!is_double("abc"));
/// ```
pub fn is_double(text: &str) -> bool {
    let candidate = text.trim();
    candidate.parse::<f64>().is_ok()
}
138
139/// Checks if a string represents a number in scientific notation.
140///
141/// # Examples
142///
143/// ```rust
144/// use tidy_viewer::datatype::is_scientific_notation;
145///
146/// assert!(is_scientific_notation("1.23e-4"));
147/// assert!(is_scientific_notation("1.23E+4"));
148/// assert!(!is_scientific_notation("123.45"));
149/// assert!(!is_scientific_notation("abc"));
150/// ```
151pub fn is_scientific_notation(text: &str) -> bool {
152 lazy_static! {
153 static ref R: Regex = Regex::new(r"^[+-]?[0-9]*\.?[0-9]+[eE][+-]?[0-9]+$").unwrap();
154 }
155 R.is_match(text.trim())
156}
157
158/// Checks if a string represents a time value (HH:MM:SS format).
159///
160/// # Examples
161///
162/// ```rust
163/// use tidy_viewer::datatype::is_time;
164///
165/// assert!(is_time("11:59:37"));
166/// assert!(is_time("23:45:12"));
167/// assert!(!is_time("25:00:00")); // Invalid hour
168/// assert!(!is_time("abc"));
169/// ```
170pub fn is_time(text: &str) -> bool {
171 //let time = "11:59:37 UTC";
172 //https://stackoverflow.com/a/25873711
173 lazy_static! {
174 static ref R: Regex =
175 Regex::new(r"^(?:[01][0-9]|2[0123]):(?:[012345][0-9]):(?:[012345][0-9])$").unwrap();
176 }
177 R.is_match(text)
178}
179
/// Checks if a string starts with a calendar date in `YYYY-MM-DD` form.
///
/// Surrounding whitespace is ignored. The trimmed text must begin with
/// four ASCII digits, `-`, two ASCII digits, `-`, two ASCII digits. The
/// date must then either end the text or be followed by whitespace or a
/// `T`, so that a timestamp such as `2020-10-09 11:59:37` is still
/// recognised by its date part. Text that merely contains a date-like
/// run of characters somewhere inside it (`12345-67-89`, `x2020-10-09`)
/// is rejected. Month and day ranges are not validated.
///
/// # Examples
///
/// ```rust
/// use tidy_viewer::datatype::is_date;
///
/// assert!(is_date("2020-10-09"));
/// assert!(is_date("1999-12-31"));
/// assert!(!is_date("2020/10/09")); // Wrong format
/// assert!(!is_date("12345-67-89")); // Date-like substring only
/// assert!(!is_date("abc"));
/// ```
pub fn is_date(text: &str) -> bool {
    const DATE_LEN: usize = 10; // "YYYY-MM-DD"

    let bytes = text.trim().as_bytes();
    if bytes.len() < DATE_LEN {
        return false;
    }
    let (date, rest) = bytes.split_at(DATE_LEN);

    // Positions 4 and 7 hold the dashes; every other position is a digit.
    let well_formed = date.iter().enumerate().all(|(i, b)| match i {
        4 | 7 => *b == b'-',
        _ => b.is_ascii_digit(),
    });

    // Require a clean boundary so that `2020-10-099` is not accepted.
    let ends_cleanly = match rest.first() {
        None => true,
        Some(next) => *next == b'T' || next.is_ascii_whitespace(),
    };

    well_formed && ends_cleanly
}
198
/// Checks if a string represents a date-time value.
///
/// The text may begin with a `YYYY-MM-DD` date followed by exactly one
/// space or `T`. What follows (or the text itself when there is no date)
/// must start with a 24-hour `HH:MM:SS` time: hours `00`-`23`, minutes and
/// seconds `00`-`59`. Anything after the seconds, such as a time-zone
/// name or fractional seconds, is ignored. The input is not trimmed.
///
/// A time without a date is accepted as well, which keeps every string
/// that this function has historically matched matching.
///
/// # Examples
///
/// ```rust
/// use tidy_viewer::datatype::is_date_time;
///
/// assert!(is_date_time("2020-10-09 11:59:37 UTC"));
/// assert!(is_date_time("2020-10-09T23:45:12"));
/// assert!(is_date_time("11:59:37"));
/// assert!(is_date_time("23:45:12"));
/// assert!(!is_date_time("2020-10-09")); // Date only
/// assert!(!is_date_time("25:00:00")); // Invalid hour
/// assert!(!is_date_time("abc"));
/// ```
pub fn is_date_time(text: &str) -> bool {
    /// Skips a leading `YYYY-MM-DD` plus its ` `/`T` separator when present;
    /// otherwise returns the input unchanged.
    fn skip_date(bytes: &[u8]) -> &[u8] {
        match bytes {
            [y0, y1, y2, y3, b'-', m0, m1, b'-', d0, d1, b' ' | b'T', rest @ ..]
                if [y0, y1, y2, y3, m0, m1, d0, d1]
                    .iter()
                    .all(|b| b.is_ascii_digit()) =>
            {
                rest
            }
            _ => bytes,
        }
    }

    /// True when `bytes` begins with a valid `HH:MM:SS`.
    fn starts_with_time(bytes: &[u8]) -> bool {
        match bytes {
            [h0, h1, b':', m0, m1, b':', s0, s1, ..] => {
                let hour_ok = matches!(
                    (*h0, *h1),
                    (b'0'..=b'1', b'0'..=b'9') | (b'2', b'0'..=b'3')
                );
                let minute_ok = matches!((*m0, *m1), (b'0'..=b'5', b'0'..=b'9'));
                let second_ok = matches!((*s0, *s1), (b'0'..=b'5', b'0'..=b'9'));
                hour_ok && minute_ok && second_ok
            }
            _ => false,
        }
    }

    //let datetime = "2020-10-09 11:59:37 UTC";
    starts_with_time(skip_date(text.as_bytes()))
}
220
/// Checks if a string represents a missing/null value.
///
/// The whole string must be exactly one of the recognised markers; the
/// input is not trimmed and no other capitalisations are accepted:
/// - the empty string
/// - `NA`, `Na`, `na`
/// - `N/A`, `n/a`
/// - `NaN`, `NAN`, `Nan`, `nan`
/// - `NULL`, `Null`, `null`
/// - `None`
/// - `missing`
///
/// # Examples
///
/// ```rust
/// use tidy_viewer::datatype::is_na;
///
/// assert!(is_na(""));
/// assert!(is_na("NA"));
/// assert!(is_na("null"));
/// assert!(is_na("NULL"));
/// assert!(is_na("N/A"));
/// assert!(!is_na("hello"));
/// ```
pub fn is_na(text: &str) -> bool {
    matches!(
        text,
        ""
            | "NA"
            | "Na"
            | "na"
            | "N/A"
            | "n/a"
            // Accepted by earlier versions of this check; kept so that
            // existing inputs keep being classified the same way.
            | "n/"
            | "NaN"
            | "NAN"
            | "Nan"
            | "nan"
            | "NULL"
            | "Null"
            | "null"
            | "None"
            | "missing"
    )
}
248
249#[allow(dead_code)]
250pub fn is_na_string_padded(text: &str) -> bool {
251 lazy_static! {
252 static ref R: Regex = Regex::new(
253 r"^$|(^|\s)(?:N(?:(?:(?:AN|a[Nn]|/A)|[Aa])|ull)|n(?:ull|an?|/a?)|(?:missing))\s*$"
254 )
255 .unwrap();
256 }
257 R.is_match(text)
258}
259
260// utilities
261
262/// Infers the data type of a string value.
263///
264/// This function analyzes a string and returns the most appropriate `ValueType`.
265/// The inference follows a priority order: NA → Boolean → Integer → Double → Date → Time → Character.
266///
267/// # Examples
268///
269/// ```rust
270/// use tidy_viewer::datatype::{infer_type_from_string, ValueType};
271///
272/// assert_eq!(infer_type_from_string(""), ValueType::Na);
273/// assert_eq!(infer_type_from_string("true"), ValueType::Boolean);
274/// assert_eq!(infer_type_from_string("123"), ValueType::Integer);
275/// assert_eq!(infer_type_from_string("123.45"), ValueType::Double);
276/// assert_eq!(infer_type_from_string("2020-10-09"), ValueType::Date);
277/// assert_eq!(infer_type_from_string("11:59:37"), ValueType::Time);
278/// assert_eq!(infer_type_from_string("hello"), ValueType::Character);
279/// ```
280pub fn infer_type_from_string(text: &str) -> ValueType {
281 if is_time(text) {
282 ValueType::Time
283 } else if is_logical(text) {
284 ValueType::Boolean
285 } else if is_integer(text) {
286 ValueType::Integer
287 } else if is_date_time(text) {
288 ValueType::DateTime
289 } else if is_date(text) {
290 ValueType::Date
291 } else if is_double(text) {
292 ValueType::Double
293 } else if text.is_empty() | is_na(text) {
294 ValueType::Na
295 } else {
296 ValueType::Character
297 }
298}
299
300/// Formats a column of strings with consistent width and significant figures.
301///
302/// This is the main formatting function that processes an entire column of data.
303/// It applies significant figure formatting to numeric values, handles NA values,
304/// and ensures all strings fit within the specified width constraints.
305///
306/// # Arguments
307///
308/// * `vec_col` - Vector of string references representing the column data
309/// * `lower_column_width` - Minimum column width
310/// * `upper_column_width` - Maximum column width
311/// * `sigfig` - Number of significant figures for numeric values
312/// * `preserve_scientific` - Whether to preserve scientific notation
313/// * `max_decimal_width` - Maximum width for decimal places
314///
315/// # Returns
316///
317/// A vector of formatted strings with consistent width and formatting.
318///
319/// # Examples
320///
321/// ```rust
322/// use tidy_viewer::datatype::format_strings;
323///
324/// let data = vec!["123.456", "NA", "-42.1", "hello"];
325/// let formatted = format_strings(&data, 2, 20, 3, false, 13);
326///
327/// // All formatted strings will have consistent width and formatting
328/// assert_eq!(formatted.len(), 4);
329/// ```
330pub fn format_strings(
331 vec_col: &[&str],
332 lower_column_width: usize,
333 upper_column_width: usize,
334 sigfig: i64,
335 preserve_scientific: bool,
336 max_decimal_width: usize,
337) -> Vec<String> {
338 let ellipsis = '\u{2026}';
339
340 let strings_and_fracts: Vec<(String, usize, usize)> = vec_col
341 .iter()
342 .map(|&string| format_if_na(string))
343 .map(|string| format_if_num(&string, sigfig, preserve_scientific, max_decimal_width))
344 .map(|string| {
345 // the string, and the length of its fractional digits if any
346 let (lhs, rhs) = if is_double(&string) {
347 let mut split = string.split('.');
348 (
349 split.next().map(|lhs| lhs.len()).unwrap_or_default(),
350 split.next().map(|rhs| rhs.len()).unwrap_or_default(),
351 )
352 } else {
353 (0, 0)
354 };
355 (string, lhs, rhs)
356 })
357 .collect();
358
359 let max_fract: usize = strings_and_fracts
360 .iter()
361 .map(|(_, _, fract)| *fract)
362 .max()
363 .unwrap_or_default();
364 let max_whole: usize = strings_and_fracts
365 .iter()
366 .map(|(_, whole, _)| *whole)
367 .max()
368 .unwrap_or_default();
369
370 let strings_and_widths: Vec<(String, usize)> = strings_and_fracts
371 .into_iter()
372 .map(|(mut string, whole, fract)| {
373 if max_fract > 0 && is_double(&string) {
374 if whole < max_whole {
375 let mut s = String::new();
376 s.push_str(&" ".repeat(max_whole - whole));
377 s.push_str(&string);
378 string = s;
379 }
380
381 string.push_str(&" ".repeat(max_fract - fract));
382 } else if max_fract > 0 && is_na(&string) {
383 if 2 < max_whole {
384 let mut s = String::new();
385 s.push_str(&" ".repeat(max_whole - 2));
386 s.push_str(&string);
387 string = s;
388 }
389
390 string.push_str(&" ".repeat(max_fract - fract));
391 }
392 let len = UnicodeWidthStr::width(string.as_str());
393 // the string and its length
394 (string, len)
395 })
396 .collect();
397
398 let max_width: usize = strings_and_widths
399 .iter()
400 .map(|(_, width)| *width)
401 .max()
402 .unwrap_or_default()
403 .clamp(lower_column_width, upper_column_width);
404
405 strings_and_widths
406 .into_iter()
407 .map(|(string, len)| {
408 if len > max_width {
409 let (rv, _) = string.unicode_truncate(max_width - 1);
410 let spacer: &str = " ";
411 let string_and_ellipses = [rv.to_string(), ellipsis.to_string()].join("");
412 [string_and_ellipses, spacer.to_string()].join("")
413 } else {
414 let add_space = max_width - len + 1;
415 let borrowed_string: &str = &" ".repeat(add_space);
416 [string, "".to_string()].join(borrowed_string)
417 }
418 })
419 .collect()
420}
421
422pub fn format_if_na(text: &str) -> String {
423 // todo add repeat strings for NA
424 let missing_string_value = "NA";
425 let string = if is_na(text) {
426 missing_string_value
427 } else {
428 text
429 };
430 string.to_string()
431}
432
433pub fn format_if_num(
434 text: &str,
435 sigfig: i64,
436 preserve_scientific: bool,
437 max_decimal_width: usize,
438) -> String {
439 // If preserve_scientific is enabled and the input is already in scientific notation, keep it
440 if preserve_scientific && is_scientific_notation(text) {
441 return text.to_string();
442 }
443
444 if let Ok(val) = text.parse::<f64>() {
445 let decimal_formatted = sigfig::DecimalSplits { val, sigfig }.final_string();
446
447 // Check if we should auto-switch to scientific notation based on decimal width
448 if decimal_formatted.len() > max_decimal_width {
449 // Format in scientific notation with appropriate precision
450 if val.abs() < 1e-4 || val.abs() >= 10f64.powi(sigfig as i32) {
451 return format!(
452 "{:.precision$e}",
453 val,
454 precision = (sigfig - 1).max(0) as usize
455 );
456 }
457 }
458
459 decimal_formatted
460 } else {
461 text.to_string()
462 }
463}
464
465pub fn get_col_data_type(col: &[&str]) -> ValueType {
466 // counts the frequency of the datatypes in the column
467 // returns the most frequent while ignoring NA values.
468 col.iter()
469 .map(|x| infer_type_from_string(x))
470 .filter(|x| !matches!(x, &ValueType::Na))
471 .group_by(|&x| x)
472 .into_iter()
473 .map(|(key, group)| (key, group.count()))
474 .max_by_key(|&(_, count)| count)
475 .map(|(key, _)| key)
476 .unwrap()
477}
478
/// Parses a delimiter argument into the single byte it stands for.
///
/// A one-byte string yields that byte. The two-character escape `\t`
/// (a backslash followed by `t`) yields the tab byte.
///
/// # Errors
///
/// Any other input is rejected with a message that reports the number of
/// bytes received and echoes the input.
pub fn parse_delimiter(src: &str) -> Result<u8, String> {
    if let [single] = src.as_bytes() {
        return Ok(*single);
    }
    if src == "\\t" {
        return Ok(b'\t');
    }
    Err(format!(
        "expected one byte as delimiter, got {} bytes (\"{}\")",
        src.len(),
        src
    ))
}
491
492/// Calculates the optimal width for a column based on its content.
493///
494/// This function analyzes all strings in a column and determines the optimal width
495/// that accommodates the content while respecting minimum and maximum constraints.
496/// It uses Unicode width calculation to handle multi-byte characters correctly.
497///
498/// # Arguments
499///
500/// * `column` - Vector of formatted strings representing the column
501/// * `min_width` - Minimum allowed column width
502/// * `max_width` - Maximum allowed column width
503///
504/// # Returns
505///
506/// The calculated optimal width for the column.
507///
508/// # Examples
509///
510/// ```rust
511/// use tidy_viewer::datatype::calculate_column_width;
512///
513/// let column = vec!["123".to_string(), "456.78".to_string(), "hello".to_string()];
514/// let width = calculate_column_width(&column, 2, 20);
515///
516/// assert!(width >= 2 && width <= 20);
517/// ```
518#[allow(dead_code)]
519pub fn calculate_column_width(column: &[String], min_width: usize, max_width: usize) -> usize {
520 let max_content_width = column
521 .iter()
522 .map(|cell| unicode_width::UnicodeWidthStr::width(cell.as_str()))
523 .max()
524 .unwrap_or(0);
525
526 max_content_width.clamp(min_width, max_width)
527}
528
#[cfg(test)]
mod tests {
    use crate::datatype::{format_if_num, is_scientific_notation, parse_delimiter};

    /// Every single-byte string is returned as that byte.
    #[test]
    fn one_byte_delimiter() {
        let cases = [
            (",", b','),
            (";", b';'),
            ("|", b'|'),
            (" ", b' '),
            ("\t", b'\t'),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(parse_delimiter(input), Ok(expected));
        }
    }

    /// The two-character escape `\t` is translated to a tab byte.
    #[test]
    fn tab_delimiter() {
        assert_eq!(parse_delimiter("\\t"), Ok(b'\t'));
    }

    /// Empty and multi-byte inputs are rejected with a descriptive message.
    #[test]
    fn delimiter_wrong_length() {
        let cases = [
            ("", "expected one byte as delimiter, got 0 bytes (\"\")"),
            (
                "too long",
                "expected one byte as delimiter, got 8 bytes (\"too long\")",
            ),
            ("\\n", "expected one byte as delimiter, got 2 bytes (\"\\n\")"),
        ];
        for &(input, message) in cases.iter() {
            assert_eq!(parse_delimiter(input), Err(message.to_string()));
        }
    }

    #[test]
    fn test_is_scientific_notation() {
        // Valid scientific notation
        let valid = [
            "1.23e-7",
            "5.67e15",
            "-4.56e-10",
            "+2.34e8",
            "1e5",
            "3.14E-2",
            "7.849613446523261e-05",
        ];
        for input in valid.iter() {
            assert!(
                is_scientific_notation(input),
                "{:?} should be scientific notation",
                input
            );
        }

        // Invalid scientific notation (should be false)
        let invalid = ["1.23", "123", "0.0001", "e5", "1.23e", "text", ""];
        for input in invalid.iter() {
            assert!(
                !is_scientific_notation(input),
                "{:?} should not be scientific notation",
                input
            );
        }
    }

    #[test]
    fn test_format_if_num_preserve_scientific() {
        // (input, preserve_scientific, expected), all with sigfig 3 and width 13
        let cases = [
            // Scientific input is kept verbatim when preservation is on
            ("1.23e-7", true, "1.23e-7"),
            ("5.67e15", true, "5.67e15"),
            ("-4.56e-10", true, "-4.56e-10"),
            // Plain numbers still go through sigfig formatting
            ("1.23456", true, "1.23"),
            ("123.456", true, "123."),
            // Without preservation, scientific input is converted to decimal
            ("1.23e-7", false, "0.000000123"),
        ];
        for &(input, preserve, expected) in cases.iter() {
            assert_eq!(format_if_num(input, 3, preserve, 13), expected);
        }
    }

    #[test]
    fn test_format_if_num_max_decimal_width() {
        // (input, max_decimal_width, expected), all with sigfig 3, no preservation
        let cases = [
            // Very small number should be converted to scientific notation
            ("0.000000123", 8, "1.23e-7"),
            // Large number should be converted to scientific notation
            ("123456789012345", 8, "1.23e14"),
            // Normal number within threshold should stay decimal
            ("3.14159", 8, "3.14"),
            // A higher threshold keeps the decimal rendering
            ("0.000000123", 15, "0.000000123"),
        ];
        for &(input, width, expected) in cases.iter() {
            assert_eq!(format_if_num(input, 3, false, width), expected);
        }
    }

    #[test]
    fn test_format_if_num_combined_flags() {
        // Scientific notation input should be preserved regardless of width
        assert_eq!(format_if_num("1.23e-7", 3, true, 5), "1.23e-7");

        // Long decimal should be auto-converted even with preserve_scientific
        assert_eq!(format_if_num("0.000000123", 3, true, 8), "1.23e-7");
    }
}