chewdata/document/
mod.rs

1pub mod byte;
2#[cfg(feature = "csv")]
3pub mod csv;
4pub mod json;
5pub mod jsonl;
6#[cfg(feature = "parquet")]
7pub mod parquet;
8pub mod text;
9#[cfg(feature = "toml")]
10pub mod toml;
11#[cfg(feature = "xml")]
12pub mod xml;
13pub mod yaml;
14
15#[cfg(feature = "csv")]
16use self::csv::Csv;
17use self::jsonl::Jsonl;
18#[cfg(feature = "parquet")]
19use self::parquet::Parquet;
20use self::text::Text;
21#[cfg(feature = "toml")]
22use self::toml::Toml;
23#[cfg(feature = "xml")]
24use self::xml::Xml;
25use self::yaml::Yaml;
26use self::{byte::Byte, json::Json};
27use super::Metadata;
28use crate::DataSet;
29use serde::{Deserialize, Serialize};
30use std::io::{self, Error, ErrorKind, Result};
31
32#[derive(Debug, Deserialize, Serialize, Clone, PartialEq, Eq)]
33#[serde(tag = "type")]
34pub enum DocumentType {
35    #[cfg(feature = "csv")]
36    #[serde(rename = "csv")]
37    Csv(Csv),
38    #[serde(rename = "json")]
39    Json(Json),
40    #[serde(rename = "jsonl")]
41    Jsonl(Jsonl),
42    #[cfg(feature = "xml")]
43    #[serde(rename = "xml")]
44    Xml(Xml),
45    #[serde(rename = "yaml")]
46    #[serde(alias = "yml")]
47    Yaml(Yaml),
48    #[cfg(feature = "toml")]
49    #[serde(rename = "toml")]
50    Toml(Toml),
51    #[serde(rename = "text")]
52    #[serde(alias = "txt")]
53    Text(Text),
54    #[serde(rename = "byte")]
55    Byte(Byte),
56    #[cfg(feature = "parquet")]
57    #[serde(rename = "parquet")]
58    Parquet(Parquet),
59}
60
61impl Default for DocumentType {
62    fn default() -> Self {
63        DocumentType::Json(Json::default())
64    }
65}
66
67impl DocumentType {
68    pub fn boxed_inner(self) -> Box<dyn Document> {
69        match self {
70            #[cfg(feature = "csv")]
71            DocumentType::Csv(document) => Box::new(document),
72            DocumentType::Json(document) => Box::new(document),
73            DocumentType::Jsonl(document) => Box::new(document),
74            #[cfg(feature = "xml")]
75            DocumentType::Xml(document) => Box::new(document),
76            DocumentType::Yaml(document) => Box::new(document),
77            #[cfg(feature = "toml")]
78            DocumentType::Toml(document) => Box::new(document),
79            DocumentType::Text(document) => Box::new(document),
80            DocumentType::Byte(document) => Box::new(document),
81            #[cfg(feature = "parquet")]
82            DocumentType::Parquet(document) => Box::new(document),
83        }
84    }
85    pub fn ref_inner(&self) -> &dyn Document {
86        match self {
87            #[cfg(feature = "csv")]
88            DocumentType::Csv(document) => document,
89            DocumentType::Json(document) => document,
90            DocumentType::Jsonl(document) => document,
91            #[cfg(feature = "xml")]
92            DocumentType::Xml(document) => document,
93            DocumentType::Yaml(document) => document,
94            #[cfg(feature = "toml")]
95            DocumentType::Toml(document) => document,
96            DocumentType::Text(document) => document,
97            DocumentType::Byte(document) => document,
98            #[cfg(feature = "parquet")]
99            DocumentType::Parquet(document) => document,
100        }
101    }
102    pub fn ref_mut_inner(&mut self) -> &mut dyn Document {
103        match self {
104            #[cfg(feature = "csv")]
105            DocumentType::Csv(document) => document,
106            DocumentType::Json(document) => document,
107            DocumentType::Jsonl(document) => document,
108            #[cfg(feature = "xml")]
109            DocumentType::Xml(document) => document,
110            DocumentType::Yaml(document) => document,
111            #[cfg(feature = "toml")]
112            DocumentType::Toml(document) => document,
113            DocumentType::Text(document) => document,
114            DocumentType::Byte(document) => document,
115            #[cfg(feature = "parquet")]
116            DocumentType::Parquet(document) => document,
117        }
118    }
119    pub fn guess(metadata: &Metadata) -> Result<Box<dyn Document>> {
120        Ok(match &metadata.mime_subtype {
121            Some(mime_subtype) => match mime_subtype.as_str() {
122                #[cfg(feature = "csv")]
123                "csv" => Box::new(Csv {
124                    metadata: metadata.clone(),
125                    ..Default::default()
126                }),
127                "json" => Box::new(Json {
128                    metadata: metadata.clone(),
129                    ..Default::default()
130                }),
131                "x-ndjson" | "jsonl" => Box::new(Jsonl {
132                    metadata: metadata.clone(),
133                    ..Default::default()
134                }),
135                #[cfg(feature = "parquet")]
136                "parquet" => Box::new(Parquet {
137                    metadata: metadata.clone(),
138                    ..Default::default()
139                }),
140                "text" | "txt" => Box::new(Text {
141                    metadata: metadata.clone(),
142                }),
143                "octet-stream" => Box::new(Byte {
144                    metadata: metadata.clone(),
145                }),
146                #[cfg(feature = "toml")]
147                "toml" => Box::new(Toml {
148                    metadata: metadata.clone(),
149                }),
150                #[cfg(feature = "xml")]
151                "xml" => Box::new(Xml {
152                    metadata: metadata.clone(),
153                    ..Default::default()
154                }),
155                "x-yaml" => Box::new(Yaml {
156                    metadata: metadata.clone(),
157                }),
158                _ => {
159                    return Err(Error::new(
160                        ErrorKind::InvalidData,
161                        format!(
162                            "The document can't be guessed with this format '{}'",
163                            mime_subtype
164                        ),
165                    ))
166                }
167            },
168            None => DocumentType::default().boxed_inner(),
169        })
170    }
171}
172
173/// Every document_builder that implement this trait can get/write json_value through a connector.
174pub trait Document: Send + Sync + DocumentClone + std::fmt::Debug {
175    fn set_metadata(&mut self, metadata: Metadata);
176    fn metadata(&self) -> Metadata;
177    /// Check if the buffer has data
178    fn has_data(&self, buffer: &[u8]) -> io::Result<bool> {
179        Ok(!buffer.is_empty())
180    }
181    /// Check if it's possible to append new data into the end of the document
182    ///
183    /// True: Append the data to the end of the document
184    /// False: Replace the document
185    fn can_append(&self) -> bool {
186        true
187    }
188    /// Return the header data used to identify when the data start
189    ///             |--------|------|--------|
190    /// document => | header | data | footer |
191    ///             |--------|------|--------|
192    fn header(&self, _dataset: &DataSet) -> io::Result<Vec<u8>> {
193        Ok(Default::default())
194    }
195    /// Return the footer data used to identify when the data end
196    ///             |--------|------|--------|
197    /// document => | header | data | footer |
198    ///             |--------|------|--------|
199    fn footer(&self, _dataset: &DataSet) -> io::Result<Vec<u8>> {
200        Ok(Default::default())
201    }
202    /// Return the terminator to seperate lines of data
203    fn terminator(&self) -> io::Result<Vec<u8>> {
204        Ok(Default::default())
205    }
206    /// Set the entry path. The entry path is the path to reach the data into the document.
207    ///
208    /// For example, in json, the entry path for `{"field1":{"sub_field1":10}}` will be `/field1/sub_field1`
209    fn set_entry_path(&mut self, _entry_point: String) {}
210    /// Read buffer of bytes and transform it into dataset
211    fn read(&self, buffer: &[u8]) -> io::Result<DataSet>;
212    /// Write dataset into a buffer of bytes
213    fn write(&self, dataset: &DataSet) -> io::Result<Vec<u8>>;
214}
215
/// Helper trait that lets a `Box<dyn Document>` be cloned.
///
/// `Document` lists this trait as a supertrait, so `clone_box` is callable on
/// any `dyn Document`.
pub trait DocumentClone {
    /// Return a boxed copy of this document.
    fn clone_box(&self) -> Box<dyn Document>;
}
219
220impl<T> DocumentClone for T
221where
222    T: 'static + Document + Clone,
223{
224    fn clone_box(&self) -> Box<dyn Document> {
225        Box::new(self.clone())
226    }
227}
228
229impl Clone for Box<dyn Document> {
230    fn clone(&self) -> Box<dyn Document> {
231        self.clone_box()
232    }
233}
234
#[cfg(test)]
mod test {
    use super::*;

    /// Deserialize a JSON configuration into a `DocumentType`, panicking when
    /// the configuration is rejected.
    fn deserialize(config: &str) -> DocumentType {
        serde_json::from_str(config).expect("Can't deserialize the config")
    }

    #[cfg(feature = "csv")]
    #[test]
    fn it_should_deserialize_in_csv_type() {
        let expected = DocumentType::Csv(Csv::default());
        let result = deserialize(r#"{"type":"csv"}"#);
        assert_eq!(expected, result);
    }
    #[test]
    fn it_should_deserialize_in_json_type() {
        let expected = DocumentType::Json(Json::default());
        let result = deserialize(r#"{"type":"json"}"#);
        assert_eq!(expected, result);
    }
    #[test]
    fn it_should_deserialize_in_jsonl_type() {
        let expected = DocumentType::Jsonl(Jsonl::default());
        let result = deserialize(r#"{"type":"jsonl"}"#);
        assert_eq!(expected, result);
    }
    #[test]
    fn it_should_deserialize_in_yaml_type() {
        let expected = DocumentType::Yaml(Yaml::default());
        let result = deserialize(r#"{"type":"yaml"}"#);
        assert_eq!(expected, result);
    }
    #[cfg(feature = "xml")]
    #[test]
    fn it_should_deserialize_in_xml_type() {
        let expected = DocumentType::Xml(Xml::default());
        let result = deserialize(r#"{"type":"xml"}"#);
        assert_eq!(expected, result);
    }
    #[cfg(feature = "toml")]
    #[test]
    fn it_should_deserialize_in_toml_type() {
        let expected = DocumentType::Toml(Toml::default());
        let result = deserialize(r#"{"type":"toml"}"#);
        assert_eq!(expected, result);
    }
    // A configuration without the `type` tag must be rejected; the panic comes
    // from unwrapping the deserialization error.
    #[test]
    #[should_panic(expected = "missing field `type`")]
    fn it_should_not_deserialize_without_type() {
        let untagged = r#"{}"#;
        let _result: DocumentType = serde_json::from_str(untagged).unwrap();
    }
}