// clickhouse-format 0.3.0
//
// ClickHouse Formats
// Documentation
use core::marker::PhantomData;
use std::collections::HashMap;

use csv::{ReaderBuilder, StringRecord, StringRecordsIntoIter};
use serde::de::DeserializeOwned;

use crate::format_name::FormatName;

use super::{Output, OutputResult};

/// Deserializer for tab-separated ClickHouse output, producing rows of type `T`.
///
/// Column names and column types are both optional and are fixed when the
/// value is constructed.
pub struct TsvRawOutput<T> {
    /// Column names; when present they are handed to the record deserializer
    /// as the header for every row.
    names: Option<Vec<String>>,
    /// Column types; when present together with `names`, the two lists are
    /// zipped into the name-to-type map returned alongside the rows.
    types: Option<Vec<String>>,
    /// Marks the row type `T` without storing a value of it.
    phantom: PhantomData<T>,
}
impl<T> Default for TsvRawOutput<T> {
    fn default() -> Self {
        Self::new()
    }
}
impl<T> TsvRawOutput<T> {
    /// Creates an output with neither column names nor column types.
    pub fn new() -> Self {
        Self::from_raw_parts(None, None)
    }

    /// Creates an output that knows its column names but not their types.
    pub fn with_names(names: Vec<String>) -> Self {
        Self::from_raw_parts(Some(names), None)
    }

    /// Creates an output that knows both its column names and its column types.
    pub fn with_names_and_types(names: Vec<String>, types: Vec<String>) -> Self {
        Self::from_raw_parts(Some(names), Some(types))
    }

    /// Assembles an output from names and types that are each already optional.
    ///
    /// Every other constructor funnels through here so the struct literal is
    /// written exactly once.
    pub(crate) fn from_raw_parts(names: Option<Vec<String>>, types: Option<Vec<String>>) -> Self {
        Self {
            names,
            types,
            phantom: PhantomData,
        }
    }
}

/// [`Output`] implementation for the `TsvRaw` format.
///
/// Each tab-separated line of the input is deserialized into one `T`.
impl<T> Output for TsvRawOutput<T>
where
    T: DeserializeOwned,
{
    /// One deserialized line of the input.
    type Row = T;
    /// Column name to column type, available only when both were supplied at
    /// construction time.
    type Info = Option<HashMap<String, String>>;

    /// Reading and row deserialization both go through `csv`.
    type Error = csv::Error;

    /// Identifies this output as the `TsvRaw` format.
    fn format_name() -> FormatName {
        FormatName::TsvRaw
    }

    /// Parses `slice` as tab-separated records and deserializes every record.
    ///
    /// The input is treated as pure data: the first line is a row, not a
    /// header, and quote characters have no special meaning.
    ///
    /// # Errors
    ///
    /// Returns the first `csv::Error` produced while reading a record or
    /// deserializing it into `T`.
    fn deserialize(&self, slice: &[u8]) -> OutputResult<Self::Row, Self::Info, Self::Error> {
        let rdr = ReaderBuilder::new()
            .delimiter(b'\t')
            .has_headers(false)
            // Raw TSV values are written verbatim. With csv's default quoting a
            // value that starts with `"` would lose its quotes, or, if the
            // quote is never closed, swallow the following tabs and newlines.
            .quoting(false)
            .from_reader(slice);

        self.deserialize_with_records(rdr.into_records())
    }
}
impl<T> TsvRawOutput<T>
where
    T: DeserializeOwned,
{
    pub(crate) fn deserialize_with_records(
        &self,
        records: StringRecordsIntoIter<&[u8]>,
    ) -> OutputResult<<Self as Output>::Row, <Self as Output>::Info, <Self as Output>::Error> {
        let header = &self.names.to_owned().map(StringRecord::from);
        let mut data: Vec<T> = vec![];
        for record in records {
            let record = record?;
            let row: T = record.deserialize(header.as_ref())?;
            data.push(row);
        }

        let info = if let Some(types) = &self.types {
            self.names
                .to_owned()
                .map(|x| x.into_iter().zip(types.to_owned()).collect())
        } else {
            None
        };

        Ok((data, info))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    use std::{fs, path::PathBuf};

    use crate::test_helpers::{TestStringsRow, TEST_STRINGS_ROW_1};

    /// Reads the `TSVRaw` fixture and deserializes it twice: once into string
    /// maps keyed by explicit column names, once into `TestStringsRow`.
    #[test]
    fn simple() -> Result<(), Box<dyn std::error::Error>> {
        let fixture = PathBuf::new().join("tests/files/TSVRaw.tsv");
        let raw = fs::read_to_string(&fixture)?;

        // The fixture's file stem must parse to this output's format name.
        let stem = fixture.file_stem().unwrap().to_string_lossy();
        let expected_format: FormatName = stem.parse().unwrap();
        assert_eq!(
            TsvRawOutput::<HashMap<String, String>>::format_name(),
            expected_format
        );

        // Map rows: column names are supplied, types are not, so no info.
        let columns: Vec<String> = ["array1", "array2", "tuple1", "tuple2", "map1"]
            .iter()
            .map(|name| String::from(*name))
            .collect();
        let map_output = TsvRawOutput::<HashMap<String, String>>::with_names(columns);
        let (rows, info) = map_output.deserialize(raw.as_bytes())?;
        let first = rows.first().unwrap();
        assert_eq!(first["tuple1"], "(1,'a')");
        assert!(info.is_none());

        // Struct rows: no names or types at all.
        let struct_output = TsvRawOutput::<TestStringsRow>::new();
        let (rows, info) = struct_output.deserialize(raw.as_bytes())?;
        assert_eq!(rows.first(), Some(&*TEST_STRINGS_ROW_1));
        assert!(info.is_none());

        Ok(())
    }
}