1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
//! CSV-based source and reader objects and implementation.

use std::collections::HashMap;
use std::fmt::Debug;
use std::path::PathBuf;
use std::str::FromStr;

use csv_sniffer::metadata::Metadata;
use csv_sniffer::Sniffer;

use cons::*;
use error::*;
use field::FieldIdent;
use field::Value;
use fieldlist::{FieldDesignator, FieldPayloadCons, FieldSpec, SpecCons};
use label::{TypedValue, Valued};
use source::decode::decode;
use source::file::{FileLocator, LocalFileReader, Uri};
use store::{AssocFrameLookup, AssocStorage, DataStore, IntoView, PushFrontFromValueIter};

/// CSV data source. Holds the location of the data file together with the CSV metadata computed
/// for it when the source is created with `CsvSource::new`. A reference to a `CsvSource` can be
/// passed to `CsvReader::new` to obtain a `CsvReader` object.
#[derive(Debug, Clone)]
pub struct CsvSource {
    // File source object for the CSV file; a new file reader is opened from it whenever the
    // file contents are needed
    src: FileLocator,
    // CSV file metadata (from `csv-sniffer` crate); this module reads its `dialect`,
    // `num_fields`, and `types` members
    metadata: Metadata,
}

impl CsvSource {
    /// Build a `CsvSource` for the file at `loc`. The file is opened and sniffed immediately so
    /// that its metadata (delimiter, quote character, preamble, etc.) is available afterwards.
    ///
    /// # Errors
    /// Returns an error if the file at `loc` cannot be opened, or if sniffing the CSV metadata
    /// fails.
    pub fn new<L: Into<FileLocator>>(loc: L) -> Result<CsvSource> {
        let src = loc.into();
        //TODO: make sample size configurable?
        let mut reader = LocalFileReader::new(&src)?;
        let sniffed = Sniffer::new().sniff_reader(&mut reader)?;

        Ok(CsvSource {
            src,
            metadata: sniffed,
        })
    }

    /// Borrow the `Metadata` that was computed for this CSV source at construction time.
    pub fn metadata(&self) -> &Metadata {
        &self.metadata
    }
}

/// Type alias for a [Cons](../../cons/struct.Cons.html)-list specifying the label, data type, and
/// source column index (carried as the `usize` payload) of each field of a CSV data source.
pub type CsvSrcSpecCons<Label, DType, Tail> = FieldPayloadCons<Label, DType, usize, Tail>;

/// A trait for converting an object into a [CsvSrcSpecCons](type.CsvSrcSpecCons.html).
///
/// This module implements it for `Nil` and for `SpecCons` lists whose tail also implements
/// `IntoCsvSrcSpec`.
pub trait IntoCsvSrcSpec {
    /// Resultant `CsvSrcSpecCons` object.
    type CsvSrcSpec;

    /// Convert this into a `CsvSrcSpecCons` cons-list. `headers` is a map of column header names
    /// to column indices. `num_fields` is the number of columns in the CSV file (for checking for
    /// indexing errors).
    ///
    /// # Errors
    /// The `SpecCons` implementation in this module returns `AgnesError::FieldNotFound` when a
    /// field designated by name is missing from `headers`, and `AgnesError::IndexError` when a
    /// field designated by index is not less than `num_fields`.
    fn into_csv_src_spec(
        self,
        headers: &HashMap<String, usize>,
        num_fields: usize,
    ) -> Result<Self::CsvSrcSpec>;
}
// Base case of the conversion: the empty list converts to the empty list.
impl IntoCsvSrcSpec for Nil {
    type CsvSrcSpec = Nil;

    // There are no fields left to resolve, so neither the header map nor the field count is used.
    fn into_csv_src_spec(self, _: &HashMap<String, usize>, _: usize) -> Result<Self::CsvSrcSpec> {
        Ok(Nil)
    }
}

// Recursive case of the conversion: resolves the head field designator to a column index, then
// converts the tail of the list.
impl<Label, DType, Tail> IntoCsvSrcSpec for SpecCons<Label, DType, Tail>
where
    Tail: IntoCsvSrcSpec,
{
    type CsvSrcSpec = CsvSrcSpecCons<Label, DType, Tail::CsvSrcSpec>;

    /// Resolve the head field of this specification to a column index and recurse into the tail.
    ///
    /// A field designated by name (`FieldDesignator::Expr`) is looked up in `headers`; a field
    /// designated by index (`FieldDesignator::Idx`) is checked against `num_fields`.
    ///
    /// # Errors
    /// * `AgnesError::FieldNotFound` if a designated name is not a key of `headers`.
    /// * `AgnesError::IndexError` if a designated index is not less than `num_fields`.
    /// * Any error produced while converting the tail.
    fn into_csv_src_spec(
        self,
        headers: &HashMap<String, usize>,
        num_fields: usize,
    ) -> Result<CsvSrcSpecCons<Label, DType, Tail::CsvSrcSpec>> {
        let idx = match *self.head.value_ref() {
            // Build the error lazily so the name is only copied when the lookup actually fails.
            FieldDesignator::Expr(ref s) => *headers
                .get(s)
                .ok_or_else(|| AgnesError::FieldNotFound(FieldIdent::Name(s.to_string())))?,
            FieldDesignator::Idx(idx) => {
                if idx >= num_fields {
                    return Err(AgnesError::IndexError {
                        index: idx,
                        len: num_fields,
                    });
                }
                idx
            }
        };
        Ok(Cons {
            head: TypedValue::from(idx).into(),
            tail: self.tail.into_csv_src_spec(headers, num_fields)?,
        })
    }
}

/// A trait for building a [DataStore](../../store/struct.DataStore.html) from a
/// [CsvSrcSpecCons](type.CsvSrcSpecCons.html).
///
/// This module implements it for `Nil` and for `CsvSrcSpecCons` lists whose tail also implements
/// `BuildDStore`.
pub trait BuildDStore {
    /// `Fields` type parameter of the resultant `DataStore`.
    type OutputFields: AssocStorage;

    /// Builds a `DataStore` from the source spec (`self`) and a CSV source `src`.
    fn build(&mut self, src: &CsvSource) -> Result<DataStore<Self::OutputFields>>;
}
// Base case of the build: an empty source spec yields an empty `DataStore`.
impl BuildDStore for Nil {
    type OutputFields = Nil;

    // No fields to load, so the CSV source is not touched.
    fn build(&mut self, _: &CsvSource) -> Result<DataStore<Self::OutputFields>> {
        let empty_store = DataStore::<Self::OutputFields>::empty();
        Ok(empty_store)
    }
}
// Recursive case of the build: loads the column for the head field of the source spec and pushes
// it onto the front of the `DataStore` built from the tail of the spec.
impl<Label, DType, Tail> BuildDStore for CsvSrcSpecCons<Label, DType, Tail>
where
    Tail: BuildDStore,
    DataStore<<Tail as BuildDStore>::OutputFields>: PushFrontFromValueIter<Label, DType>,
    Tail::OutputFields: PushBack<FieldSpec<Label, DType>>,
    <Tail::OutputFields as PushBack<FieldSpec<Label, DType>>>::Output: AssocStorage,
    Label: Debug,
    DType: FromStr + Debug + Default + Clone,
    ParseError: From<<DType as FromStr>::Err>,
{
    type OutputFields = <DataStore<<Tail as BuildDStore>::OutputFields> as PushFrontFromValueIter<
        Label,
        DType,
    >>::OutputFields;

    /// Builds a `DataStore` holding the head field of this spec in front of the fields built
    /// from the tail.
    ///
    /// A reader is opened on `src`, the tail's store is built, and then every record is visited:
    /// the cell at this field's column index is decoded and trimmed, an empty cell becomes
    /// `Value::Na`, and any other cell is parsed as `DType` into `Value::Exists`.
    ///
    /// # Errors
    /// Fails if the file cannot be opened or read as CSV, if building the tail fails, if a record
    /// has no cell at this field's column index (`AgnesError::FieldNotFound`), if a cell cannot
    /// be decoded, or if a non-empty cell does not parse as `DType` (`AgnesError::Parse`).
    fn build(&mut self, src: &CsvSource) -> Result<DataStore<Self::OutputFields>> {
        let file_reader = LocalFileReader::new(&src.src)?;
        let mut csv_reader = src.metadata.dialect.open_reader(file_reader)?;
        let ds = self.tail.build(src)?;

        // The column index does not change between records, so look it up once.
        let col_idx = *self.head.value_ref().value_ref();

        let values = csv_reader
            .byte_records()
            .map(|row| -> Result<Value<DType>> {
                let record = row?;
                // NOTE(review): the error names the literal "Field", not the actual label.
                let bytes = record.get(col_idx).ok_or_else(|| {
                    AgnesError::FieldNotFound(FieldIdent::Name(stringify![Field].to_string()))
                })?;
                let decoded = decode(bytes)?;
                let trimmed = decoded.trim();
                if trimmed.is_empty() {
                    Ok(Value::Na)
                } else {
                    trimmed
                        .parse::<DType>()
                        .map(Value::Exists)
                        .map_err(|e| AgnesError::Parse(e.into()))
                }
            })
            // Stops at the first failing record and returns its error.
            .collect::<Result<Vec<Value<DType>>>>()?;

        Ok(ds.push_front_from_value_iter::<Label, DType, _, _>(values))
    }
}

/// Object for reading CSV sources. Created with `CsvReader::new`, which resolves a field
/// specification against a `CsvSource`; `read` then loads the specified fields.
#[derive(Debug)]
pub struct CsvReader<CsvSpec> {
    // Copy of the CSV source this reader was created from (cloned in `CsvReader::new`)
    src: CsvSource,
    // Source specification produced by `IntoCsvSrcSpec::into_csv_src_spec` in `CsvReader::new`
    csv_src_spec: CsvSpec,
}

impl<CsvSrcSpec> CsvReader<CsvSrcSpec>
where
    CsvSrcSpec: Debug,
{
    /// Create a CSV reader for `src` from the field specification `spec`.
    ///
    /// If the sniffed dialect reports a header row, that row is read and used to map header
    /// names to column indices; otherwise the name map is empty. `spec` is then resolved against
    /// that map and the sniffed number of fields.
    ///
    /// # Errors
    /// Fails if the file cannot be opened or its header row cannot be read, if the header row
    /// length differs from the sniffed number of fields (`AgnesError::CsvDialect`), or if `spec`
    /// cannot be resolved.
    pub fn new<Spec>(src: &CsvSource, spec: Spec) -> Result<CsvReader<Spec::CsvSrcSpec>>
    where
        Spec: IntoCsvSrcSpec<CsvSrcSpec = CsvSrcSpec>,
    {
        let meta = &src.metadata;
        let mut rdr = meta.dialect.open_reader(LocalFileReader::new(&src.src)?)?;

        debug_assert_eq!(meta.num_fields, meta.types.len());

        // Header name -> column index; stays empty when the file has no header row.
        let mut name_to_idx = HashMap::new();
        if meta.dialect.header.has_header_row {
            let header_row = rdr.headers()?;
            if header_row.len() != meta.num_fields {
                return Err(AgnesError::CsvDialect(
                    "header row does not match sniffed number of fields in CSV file".into(),
                ));
            }
            for (col, name) in header_row.iter().enumerate() {
                name_to_idx.insert(name.to_string(), col);
            }
        }

        let csv_src_spec = spec.into_csv_src_spec(&name_to_idx, meta.num_fields)?;

        Ok(CsvReader {
            //TODO: remove source from here
            src: src.clone(),
            csv_src_spec,
        })
    }

    /// Load the fields described by this reader's source specification from its CSV source
    /// into a `DataStore` object.
    pub fn read(&mut self) -> Result<DataStore<CsvSrcSpec::OutputFields>>
    where
        CsvSrcSpec: BuildDStore,
    {
        let CsvReader {
            ref src,
            ref mut csv_src_spec,
        } = *self;
        csv_src_spec.build(src)
    }
}

/// Convenience function that loads the fields described by `spec` from the CSV file at a
/// [FileLocator](../file/enum.FileLocator.html) and returns a view over the resulting data.
///
/// Fails if unable to find or read file at location specified, or if creating the `CsvSource`
/// or `CsvReader` or reading the data fails.
pub fn load_csv<L: Into<FileLocator>, Spec>(
    loc: L,
    spec: Spec,
) -> Result<<DataStore<<Spec::CsvSrcSpec as BuildDStore>::OutputFields> as IntoView>::Output>
where
    Spec: IntoCsvSrcSpec,
    Spec::CsvSrcSpec: BuildDStore + Debug,
    <Spec::CsvSrcSpec as BuildDStore>::OutputFields: AssocFrameLookup,
{
    let src = CsvSource::new(loc)?;
    let mut reader = CsvReader::new(&src, spec)?;
    let store = reader.read()?;
    Ok(store.into_view())
}

/// Convenience function that loads a CSV file identified by a URI string.
///
/// The string is parsed as a `hyper::Uri`, converted into a file `Uri`, and handed to `load_csv`.
///
/// Fails if unable to parse `uri`, or if unable to find or read file at the location specified.
pub fn load_csv_from_uri<Spec>(
    uri: &str,
    spec: Spec,
) -> Result<<DataStore<<Spec::CsvSrcSpec as BuildDStore>::OutputFields> as IntoView>::Output>
where
    Spec: IntoCsvSrcSpec,
    Spec::CsvSrcSpec: BuildDStore + Debug,
    <Spec::CsvSrcSpec as BuildDStore>::OutputFields: AssocFrameLookup,
{
    let parsed = uri.parse::<hyper::Uri>()?;
    let locator = Uri::from_uri(parsed)?;
    load_csv(locator, spec)
}

/// Convenience function that loads a CSV file identified by a local file path.
///
/// The path is converted into a `PathBuf` and handed to `load_csv`.
///
/// Fails if unable to find or read file at the location specified.
pub fn load_csv_from_path<P, Spec>(
    path: P,
    spec: Spec,
) -> Result<<DataStore<<Spec::CsvSrcSpec as BuildDStore>::OutputFields> as IntoView>::Output>
where
    P: Into<PathBuf>,
    Spec: IntoCsvSrcSpec,
    Spec::CsvSrcSpec: BuildDStore + Debug,
    <Spec::CsvSrcSpec as BuildDStore>::OutputFields: AssocFrameLookup,
{
    let local_path: PathBuf = path.into();
    load_csv(local_path, spec)
}