1pub mod byte;
2#[cfg(feature = "csv")]
3pub mod csv;
4pub mod json;
5pub mod jsonl;
6#[cfg(feature = "parquet")]
7pub mod parquet;
8pub mod text;
9#[cfg(feature = "toml")]
10pub mod toml;
11#[cfg(feature = "xml")]
12pub mod xml;
13pub mod yaml;
14
15#[cfg(feature = "csv")]
16use self::csv::Csv;
17use self::jsonl::Jsonl;
18#[cfg(feature = "parquet")]
19use self::parquet::Parquet;
20use self::text::Text;
21#[cfg(feature = "toml")]
22use self::toml::Toml;
23#[cfg(feature = "xml")]
24use self::xml::Xml;
25use self::yaml::Yaml;
26use self::{byte::Byte, json::Json};
27use super::Metadata;
28use crate::DataSet;
29use serde::{Deserialize, Serialize};
30use std::io::{self, Error, ErrorKind, Result};
31
32#[derive(Debug, Deserialize, Serialize, Clone, PartialEq, Eq)]
33#[serde(tag = "type")]
34pub enum DocumentType {
35 #[cfg(feature = "csv")]
36 #[serde(rename = "csv")]
37 Csv(Csv),
38 #[serde(rename = "json")]
39 Json(Json),
40 #[serde(rename = "jsonl")]
41 Jsonl(Jsonl),
42 #[cfg(feature = "xml")]
43 #[serde(rename = "xml")]
44 Xml(Xml),
45 #[serde(rename = "yaml")]
46 #[serde(alias = "yml")]
47 Yaml(Yaml),
48 #[cfg(feature = "toml")]
49 #[serde(rename = "toml")]
50 Toml(Toml),
51 #[serde(rename = "text")]
52 #[serde(alias = "txt")]
53 Text(Text),
54 #[serde(rename = "byte")]
55 Byte(Byte),
56 #[cfg(feature = "parquet")]
57 #[serde(rename = "parquet")]
58 Parquet(Parquet),
59}
60
61impl Default for DocumentType {
62 fn default() -> Self {
63 DocumentType::Json(Json::default())
64 }
65}
66
67impl DocumentType {
68 pub fn boxed_inner(self) -> Box<dyn Document> {
69 match self {
70 #[cfg(feature = "csv")]
71 DocumentType::Csv(document) => Box::new(document),
72 DocumentType::Json(document) => Box::new(document),
73 DocumentType::Jsonl(document) => Box::new(document),
74 #[cfg(feature = "xml")]
75 DocumentType::Xml(document) => Box::new(document),
76 DocumentType::Yaml(document) => Box::new(document),
77 #[cfg(feature = "toml")]
78 DocumentType::Toml(document) => Box::new(document),
79 DocumentType::Text(document) => Box::new(document),
80 DocumentType::Byte(document) => Box::new(document),
81 #[cfg(feature = "parquet")]
82 DocumentType::Parquet(document) => Box::new(document),
83 }
84 }
85 pub fn ref_inner(&self) -> &dyn Document {
86 match self {
87 #[cfg(feature = "csv")]
88 DocumentType::Csv(document) => document,
89 DocumentType::Json(document) => document,
90 DocumentType::Jsonl(document) => document,
91 #[cfg(feature = "xml")]
92 DocumentType::Xml(document) => document,
93 DocumentType::Yaml(document) => document,
94 #[cfg(feature = "toml")]
95 DocumentType::Toml(document) => document,
96 DocumentType::Text(document) => document,
97 DocumentType::Byte(document) => document,
98 #[cfg(feature = "parquet")]
99 DocumentType::Parquet(document) => document,
100 }
101 }
102 pub fn ref_mut_inner(&mut self) -> &mut dyn Document {
103 match self {
104 #[cfg(feature = "csv")]
105 DocumentType::Csv(document) => document,
106 DocumentType::Json(document) => document,
107 DocumentType::Jsonl(document) => document,
108 #[cfg(feature = "xml")]
109 DocumentType::Xml(document) => document,
110 DocumentType::Yaml(document) => document,
111 #[cfg(feature = "toml")]
112 DocumentType::Toml(document) => document,
113 DocumentType::Text(document) => document,
114 DocumentType::Byte(document) => document,
115 #[cfg(feature = "parquet")]
116 DocumentType::Parquet(document) => document,
117 }
118 }
119 pub fn guess(metadata: &Metadata) -> Result<Box<dyn Document>> {
120 Ok(match &metadata.mime_subtype {
121 Some(mime_subtype) => match mime_subtype.as_str() {
122 #[cfg(feature = "csv")]
123 "csv" => Box::new(Csv {
124 metadata: metadata.clone(),
125 ..Default::default()
126 }),
127 "json" => Box::new(Json {
128 metadata: metadata.clone(),
129 ..Default::default()
130 }),
131 "x-ndjson" | "jsonl" => Box::new(Jsonl {
132 metadata: metadata.clone(),
133 ..Default::default()
134 }),
135 #[cfg(feature = "parquet")]
136 "parquet" => Box::new(Parquet {
137 metadata: metadata.clone(),
138 ..Default::default()
139 }),
140 "text" | "txt" => Box::new(Text {
141 metadata: metadata.clone(),
142 }),
143 "octet-stream" => Box::new(Byte {
144 metadata: metadata.clone(),
145 }),
146 #[cfg(feature = "toml")]
147 "toml" => Box::new(Toml {
148 metadata: metadata.clone(),
149 }),
150 #[cfg(feature = "xml")]
151 "xml" => Box::new(Xml {
152 metadata: metadata.clone(),
153 ..Default::default()
154 }),
155 "x-yaml" => Box::new(Yaml {
156 metadata: metadata.clone(),
157 }),
158 _ => {
159 return Err(Error::new(
160 ErrorKind::InvalidData,
161 format!(
162 "The document can't be guessed with this format '{}'",
163 mime_subtype
164 ),
165 ))
166 }
167 },
168 None => DocumentType::default().boxed_inner(),
169 })
170 }
171}
172
173pub trait Document: Send + Sync + DocumentClone + std::fmt::Debug {
175 fn set_metadata(&mut self, metadata: Metadata);
176 fn metadata(&self) -> Metadata;
177 fn has_data(&self, buffer: &[u8]) -> io::Result<bool> {
179 Ok(!buffer.is_empty())
180 }
181 fn can_append(&self) -> bool {
186 true
187 }
188 fn header(&self, _dataset: &DataSet) -> io::Result<Vec<u8>> {
193 Ok(Default::default())
194 }
195 fn footer(&self, _dataset: &DataSet) -> io::Result<Vec<u8>> {
200 Ok(Default::default())
201 }
202 fn terminator(&self) -> io::Result<Vec<u8>> {
204 Ok(Default::default())
205 }
206 fn set_entry_path(&mut self, _entry_point: String) {}
210 fn read(&self, buffer: &[u8]) -> io::Result<DataSet>;
212 fn write(&self, dataset: &DataSet) -> io::Result<Vec<u8>>;
214}
215
216pub trait DocumentClone {
217 fn clone_box(&self) -> Box<dyn Document>;
218}
219
220impl<T> DocumentClone for T
221where
222 T: 'static + Document + Clone,
223{
224 fn clone_box(&self) -> Box<dyn Document> {
225 Box::new(self.clone())
226 }
227}
228
229impl Clone for Box<dyn Document> {
230 fn clone(&self) -> Box<dyn Document> {
231 self.clone_box()
232 }
233}
234
#[cfg(test)]
mod test {
    use super::*;

    /// Deserializes a JSON configuration into a [`DocumentType`], panicking
    /// with the shared message when the configuration is rejected.
    fn document_type_from(config: &str) -> DocumentType {
        serde_json::from_str(config).expect("Can't deserialize the config")
    }

    #[cfg(feature = "csv")]
    #[test]
    fn it_should_deserialize_in_csv_type() {
        assert_eq!(
            DocumentType::Csv(Csv::default()),
            document_type_from(r#"{"type":"csv"}"#)
        );
    }

    #[test]
    fn it_should_deserialize_in_json_type() {
        assert_eq!(
            DocumentType::Json(Json::default()),
            document_type_from(r#"{"type":"json"}"#)
        );
    }

    #[test]
    fn it_should_deserialize_in_jsonl_type() {
        assert_eq!(
            DocumentType::Jsonl(Jsonl::default()),
            document_type_from(r#"{"type":"jsonl"}"#)
        );
    }

    #[test]
    fn it_should_deserialize_in_yaml_type() {
        assert_eq!(
            DocumentType::Yaml(Yaml::default()),
            document_type_from(r#"{"type":"yaml"}"#)
        );
    }

    #[cfg(feature = "xml")]
    #[test]
    fn it_should_deserialize_in_xml_type() {
        assert_eq!(
            DocumentType::Xml(Xml::default()),
            document_type_from(r#"{"type":"xml"}"#)
        );
    }

    #[cfg(feature = "toml")]
    #[test]
    fn it_should_deserialize_in_toml_type() {
        assert_eq!(
            DocumentType::Toml(Toml::default()),
            document_type_from(r#"{"type":"toml"}"#)
        );
    }

    // The `type` tag is mandatory: an empty object must be rejected.
    #[test]
    #[should_panic(expected = "missing field `type`")]
    fn it_should_not_deserialize_without_type() {
        let _document_type = serde_json::from_str::<DocumentType>(r#"{}"#).unwrap();
    }
}