mule/
separator_inference.rs

1use crate::dataset_file::DatasetFile;
2use crate::errors::Result;
3use itertools::Itertools;
4use std::collections::HashMap;
5use std::path::Path;
6use tokio_stream::StreamExt;
7
8static COMMON_SEPARATORS: [&str; 3] = [",", "\t", "|"];
9
10/// Infer the separator as the most commonly used separator in the file
11pub async fn infer_separator(path: impl AsRef<Path>) -> Result<String> {
12    let mut counts: HashMap<&str, usize> = HashMap::default();
13    let mut lines = DatasetFile::new(path).read_lines().await?;
14    while let Some(line_res) = lines.next().await {
15        let line = line_res?;
16        for sep in COMMON_SEPARATORS.iter() {
17            *counts.entry(sep).or_default() += line.clone().matches(sep).count();
18        }
19    }
20    let sep = counts
21        .into_iter()
22        .sorted_by_key(|(_, v)| *v)
23        .last()
24        .map(|(k, _)| k)
25        .unwrap_or(",");
26    Ok(sep.to_string())
27}
28
#[cfg(test)]
mod test {
    use super::*;

    /// Runs separator inference on the `sales-100` sample files under
    /// `datasets/` and checks the tab- and comma-separated variants.
    #[tokio::test]
    pub async fn test_separator_inference() -> Result<()> {
        // A failure to read the TSV sample is returned as the test's error.
        let tsv_separator = infer_separator("datasets/sales-100.tsv").await?;
        assert_eq!(tsv_separator, "\t");

        // A failure to read the CSV sample panics via `unwrap`.
        let csv_separator = infer_separator("datasets/sales-100.csv").await.unwrap();
        assert_eq!(csv_separator, ",");

        Ok(())
    }
}
43}