mule/
separator_inference.rs1use crate::dataset_file::DatasetFile;
2use crate::errors::Result;
3use itertools::Itertools;
4use std::collections::HashMap;
5use std::path::Path;
6use tokio_stream::StreamExt;
7
/// Candidate field separators that [`infer_separator`] counts on every line.
static COMMON_SEPARATORS: [&str; 3] = [
    ",",  // comma
    "\t", // tab
    "|",  // pipe
];
9
10pub async fn infer_separator(path: impl AsRef<Path>) -> Result<String> {
12 let mut counts: HashMap<&str, usize> = HashMap::default();
13 let mut lines = DatasetFile::new(path).read_lines().await?;
14 while let Some(line_res) = lines.next().await {
15 let line = line_res?;
16 for sep in COMMON_SEPARATORS.iter() {
17 *counts.entry(sep).or_default() += line.clone().matches(sep).count();
18 }
19 }
20 let sep = counts
21 .into_iter()
22 .sorted_by_key(|(_, v)| *v)
23 .last()
24 .map(|(k, _)| k)
25 .unwrap_or(",");
26 Ok(sep.to_string())
27}
28
#[cfg(test)]
mod test {
    use super::*;

    /// Checks that the sample TSV and CSV datasets are detected as tab- and
    /// comma-separated respectively.
    ///
    /// Failures from `infer_separator` are propagated with `?` in both
    /// assertions so that the test reports them the same way.
    #[tokio::test]
    pub async fn test_separator_inference() -> Result<()> {
        // NOTE(review): the dataset paths are relative; presumably resolved
        // against the directory the test binary runs from — confirm.
        assert_eq!(infer_separator("datasets/sales-100.tsv").await?, "\t");
        assert_eq!(infer_separator("datasets/sales-100.csv").await?, ",");

        Ok(())
    }
}