use crate::csv::{Delimiters, Error};
use itertools::Itertools;
use regex::Regex;
use std::collections::HashMap;
use std::io::{BufRead, BufReader, Read, Seek};
use tracing::{debug, warn};
/// Guesses the field separator and the decimal separator used in one line of CSV text.
///
/// # Arguments
///
/// * `line` - the text to inspect.
/// * `field_separator_hint` - an already known field separator. When it is `Some`, it is
///   returned unchanged and no field separator detection is performed.
///
/// # Returns
///
/// A tuple `(field_separator, decimal_separator)`. Either element is `None` when the line
/// gives no evidence for it.
///
/// # Errors
///
/// Propagates regex compilation errors, and returns `Error::InvalidAccess` if a matched
/// capture group unexpectedly contains no character.
fn guess_format_from_line(
    line: &str,
    field_separator_hint: Option<char>,
) -> Result<(Option<char>, Option<char>), Error> {
    let field_separator = match field_separator_hint {
        Some(hint) => Some(hint),
        // A semicolon anywhere in the line takes precedence over ',' and '|'.
        None if line.contains(';') => Some(';'),
        None => {
            // First ',' or '|' that directly follows a word character.
            let field_sep_regex = Regex::new(r"\w([,|])[\W\w]")?;
            let first_match = field_sep_regex.captures(line);
            first_match
                .map(|cap| {
                    cap[1].chars().next().ok_or_else(|| {
                        Error::InvalidAccess(format!(
                            "Could not capture field separator for guessing from '{line}'"
                        ))
                    })
                })
                .transpose()?
        }
    };

    // The field separator cannot double as the decimal separator, so drop it from the
    // candidates. With no field separator known, every candidate stays in.
    let decimal_separator_candidates = [',', '.'];
    let context_acceptable_candidates = decimal_separator_candidates
        .into_iter()
        .filter(|candidate| Some(*candidate) != field_separator)
        .join("");
    let decimal_separator_regex_string = format!(r"\d([{context_acceptable_candidates}])\d");
    debug!(
        "Regex for decimal sep: '{}'",
        decimal_separator_regex_string.as_str()
    );
    let decimal_separator_regex = Regex::new(decimal_separator_regex_string.as_str())?;

    // Count how often each candidate sits between two digits and remember which one was
    // matched last; the latter is only used to break ties below.
    let mut separators: HashMap<char, usize> = HashMap::new();
    let mut last_seen: Option<char> = None;
    for capture in decimal_separator_regex.captures_iter(line) {
        let sep = capture[1].chars().next().ok_or_else(|| {
            Error::InvalidAccess(format!(
                "Could not capture decimal separator for guessing from '{line}'"
            ))
        })?;
        *separators.entry(sep).or_insert(0) += 1;
        last_seen = Some(sep);
    }
    debug!(
        "Found separator candidates with occurrence count: {:?}",
        separators
    );

    // The most frequent candidate wins. Ties must not be left to `HashMap` iteration order,
    // which is randomized and would make the guess differ between runs. They are resolved in
    // favour of the separator matched last in the line (in a number that uses both a grouping
    // and a decimal separator, the decimal separator is the later one), and finally by the
    // character itself so the key is always unique.
    let decimal_separator = separators
        .into_iter()
        .max_by_key(|&(sep, count)| (count, Some(sep) == last_seen, sep))
        .map(|(sep, _)| sep);

    Ok((field_separator, decimal_separator))
}
/// Guesses the CSV delimiters of `input` by inspecting it line by line.
///
/// Lines are passed to `guess_format_from_line` until both a field separator and a decimal
/// separator have been found or the input is exhausted. A field separator found on an earlier
/// line is handed on as the hint for the following lines. Afterwards `input` is rewound to the
/// beginning of the stream so that it can be read again by the caller.
///
/// If no field separator could be guessed, the field delimiter of `Delimiters::default()` is
/// used. The decimal separator stays `None` when none was found.
///
/// A line that cannot be read (I/O failure or invalid UTF-8) ends the scan: a warning is
/// logged and the result is built from what was learned up to that point.
///
/// # Errors
///
/// Returns the error of `guess_format_from_line` if guessing on a line fails (in that case
/// `input` is not rewound), or the I/O error raised while rewinding `input`.
pub(crate) fn guess_format_from_reader<R: Read + Seek>(
    input: &mut R,
) -> Result<Delimiters, Error> {
    let mut format = (None, None);
    debug!("Guessing format from reader...");
    // Reborrow so the buffered reader only holds `input` for the duration of the loop.
    for line in BufReader::new(&mut *input).lines() {
        let line = match line {
            Ok(line) => line,
            Err(err) => {
                // Best effort: keep whatever was guessed so far, but do not hide the failure.
                warn!(
                    "Stopping format guessing, could not read line from input: {}",
                    err
                );
                break;
            }
        };
        debug!("Guessing format from line: '{}'", line.as_str());
        format = guess_format_from_line(line.as_str(), format.0)?;
        debug!("Current format: {:?}", format);
        if format.0.is_some() && format.1.is_some() {
            break;
        }
    }
    input.rewind()?;

    if format.0.is_none() {
        warn!("Could not guess field delimiter, setting to default");
        format.0 = Delimiters::default().field_delimiter;
    }
    let delim = Delimiters {
        field_delimiter: format.0,
        decimal_separator: format.1,
    };
    debug!(
        "Inferring of csv delimiters resulted in decimal separators: '{:?}', field delimiter: '{:?}'",
        delim.decimal_separator, delim.field_delimiter
    );
    Ok(delim)
}
/// Tests for the delimiter guessing, both on single lines and on CSV files from the
/// repository's test data directories.
#[cfg(test)]
mod format_guessing_tests {
    use super::*;
    use std::fs::File;

    /// Opens the file at `path` and runs the reader-based guessing on it.
    fn guess_from_file(path: &str) -> Delimiters {
        let mut file = File::open(path).unwrap();
        guess_format_from_reader(&mut file).unwrap()
    }

    /// Asserts that guessing on the file at `path` yields exactly the given delimiters.
    fn assert_file_format(
        path: &str,
        field_delimiter: Option<char>,
        decimal_separator: Option<char>,
    ) {
        assert_eq!(
            guess_from_file(path),
            Delimiters {
                field_delimiter,
                decimal_separator
            }
        );
    }

    #[test]
    fn format_detection_basics() {
        // Comma separated fields, dot as decimal separator.
        let format = guess_format_from_line(
            "-0.969654597744788,-0.215275534510198,0.115869999295192,7.04555232210696",
            None,
        )
        .unwrap();
        assert_eq!(format, (Some(','), Some('.')));

        // Semicolon separated fields, dot as decimal separator.
        let format = guess_format_from_line(
            "-0.969654597744788;-0.215275534510198;0.115869999295192;7.04555232210696",
            None,
        )
        .unwrap();
        assert_eq!(format, (Some(';'), Some('.')));

        // Semicolon separated fields, comma as decimal separator.
        let format = guess_format_from_line(
            "-0,969654597744788;-0,215275534510198;0,115869999295192;7,04555232210696",
            None,
        )
        .unwrap();
        assert_eq!(format, (Some(';'), Some(',')));

        // A given hint is returned as the field separator without re-detection.
        let format = guess_format_from_line("1,5|2,5", Some('|')).unwrap();
        assert_eq!(format, (Some('|'), Some(',')));
    }

    #[test]
    fn format_detection_from_file() {
        assert_file_format("tests/csv/data/Annotations.csv", Some(','), Some('.'));
    }

    #[test]
    fn format_detection_from_file_metrology_special() {
        assert_file_format(
            "tests/csv/data/Multi_Apply_Rotation.csv",
            Some(','),
            Some('.'),
        );
    }

    #[test]
    fn format_detection_from_file_metrology_other_special() {
        assert_file_format("tests/csv/data/CM_quality_threshold.csv", Some(','), None);
    }

    #[test]
    fn format_detection_from_file_analysis_pia_table() {
        assert_file_format(
            "tests/csv/data/easy_pore_export_annoration_table_result.csv",
            Some(';'),
            Some(','),
        );
    }

    #[test]
    fn format_detection_from_file_no_field_sep() {
        assert_file_format("tests/csv/data/no_field_sep.csv", None, Some('.'));
    }

    #[test]
    fn format_detection_from_file_semicolon_formatting() {
        assert_file_format(
            "tests/integ/data/display_of_status_message_in_cm_tables/expected/Volume1.csv",
            Some(';'),
            Some(','),
        );
    }

    #[test]
    fn format_detection_from_file_semicolon_separators() {
        assert_file_format("tests/csv/data/Components.csv", Some(';'), Some(','));
    }

    #[test]
    fn format_detection_from_file_dot_comma_formatting() {
        assert_file_format(
            "tests/integ/data/display_of_status_message_in_cm_tables/actual/Volume1.csv",
            Some(','),
            Some('.'),
        );
    }
}