1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
//! Record is an immutable collection of files

use std::io::{self, Read};
use hash::Hasher;

/// Record's file
///
/// This trait represents an abstraction of a file: something that has a name
/// and binary content to read.
pub trait File {
    /// Associated `Read` type: the reader through which the file's content is obtained
    type Read : Read;
    /// Returns file's name
    fn name(&self) -> &str;
    /// Returns a mutable reference to `Self::Read`
    ///
    /// This allows reading the content without giving up ownership of the file.
    fn read(&mut self) -> &mut Self::Read;
    /// Consumes itself and returns `Self::Read`
    fn into_read(self) -> Self::Read;
}

/// Any `(name, reader)` pair is a `File`: the first element provides the name
/// and the second element is the reader.
impl<S, R> File for (S, R) where S: AsRef<str>, R: Read {
    type Read = R;

    /// The name is the first element of the pair, viewed as `&str`
    fn name(&self) -> &str {
        let &(ref name, _) = self;
        name.as_ref()
    }

    /// The reader is the second element of the pair, borrowed mutably
    fn read(&mut self) -> &mut Self::Read {
        let &mut (_, ref mut reader) = self;
        reader
    }

    /// Drops the name and hands out the reader
    fn into_read(self) -> Self::Read {
        let (_, reader) = self;
        reader
    }

}

use std::marker::PhantomData;

/// A collection of always ordered files
///
/// With limited ways to construct this structure, it's
/// always ensured to have all its files sorted as required
/// by SIT for deterministic hashing.
///
/// The first field holds the files themselves; the `PhantomData` field
/// stores no data and only ties the structure to the `'a` lifetime.
pub struct OrderedFiles<'a, F: File>(Vec<F>, PhantomData<&'a ()>);

impl<'a, F: File> OrderedFiles<'a, F>  where F: 'a, F::Read: 'a {
    /// Returns a boxed version of itself
    ///
    /// It's useful to ensure type compatibility between branches,
    /// in one of which an intersection of differently-typed `File`s
    /// were ordered together.
    ///
    /// ```
    /// extern crate sit_core;
    ///
    /// use std::io::Cursor;
    /// use sit_core::record::{BoxedOrderedFiles, OrderedFiles};
    ///
    /// let files: OrderedFiles<_> = vec![("file", &b"hello"[..])].into();
    /// let extra: OrderedFiles<_> =  vec![("file", Cursor::new(String::from("world")))].into();
    ///
    /// let some_condition = true;
    ///
    /// let all_files = if some_condition {
    ///    files + extra
    /// } else {
    ///    files.boxed()
    /// };
    ///
    /// ```
    pub fn boxed(self) -> BoxedOrderedFiles<'a> {
        #[inline]
        fn boxed_file<'f, F: File + 'f>(file: F) -> (String, Box<Read + 'f>) where F::Read: 'f {
            (file.name().into(), Box::new(file.into_read()) as Box<Read + 'f>)
        }
        let files: Vec<_> = self.0.into_iter().map(boxed_file).collect();
        files.into()
    }
}

impl<'a, F: File> OrderedFiles<'a, F> {
    /// Deterministically hashes all ordered files and allows to process them as well
    ///
    /// For every file, in order:
    ///
    /// 1. the file's name is fed to `hasher`;
    /// 2. `per_file(file_name)` is called and the value it returns (from inside of `Ok(f_)`)
    ///    becomes the per-file state;
    /// 3. the file is read in chunks; every chunk is fed to `hasher` and then passed to
    ///    `per_chunk(f_, chunk)`, whose returned value replaces the per-file state.
    ///    When the end of the file is reached, `per_chunk` is called one last time
    ///    with an empty chunk.
    ///
    /// Only the bytes actually returned by each read are hashed, so the digest depends
    /// solely on file names and contents, not on how the reader happens to split the data.
    ///
    /// NOTE: earlier revisions of this method fed the whole internal buffer to the hasher
    /// on every read (including the unused tail); digests produced by those revisions
    /// differ from the ones produced here.
    ///
    /// This method's primary motivation is to allow hashing and saving files at the same time,
    /// to avoid re-reading them to accomplish both of the operations. By itself, however,
    /// this function doesn't do anything in term of saving files (or any other functionality),
    /// that is responsibility of `per_file` and `per_chunk` callbacks.
    ///
    /// # Errors
    ///
    /// Stops at and returns the first error produced by `per_file`, `per_chunk` or by
    /// reading a file (the latter converted into `E`).
    pub fn hash_and<PF, F_, PC, E>(mut self, hasher: &mut Hasher, per_file: PF, per_chunk: PC) -> Result<(), E>
        where PF: Fn(&str) -> Result<F_, E>, PC: Fn(F_, &[u8]) -> Result<F_, E>, E: From<io::Error> {
        let mut buf = vec![0; 4096];
        for file in self.0.iter_mut() {
            hasher.process(file.name().as_bytes());
            let mut file_processor = per_file(file.name())?;
            let reader = file.read();
            loop {
                let bytes_read = reader.read(&mut buf)?;
                // only the first `bytes_read` bytes of the buffer are valid data
                let chunk = &buf[..bytes_read];
                hasher.process(chunk);
                file_processor = per_chunk(file_processor, chunk)?;
                if bytes_read == 0 {
                    break;
                }
            }
        }
        Ok(())
    }
    /// Deterministically hashes all ordered files
    ///
    /// Equivalent to `hash_and` with callbacks that do nothing.
    pub fn hash(self, hasher: &mut Hasher) -> Result<(), io::Error> {
        self.hash_and(hasher, |_| Ok(()), |v, _| Ok(v))
    }
}

impl<'a, I, F> From<I> for OrderedFiles<'a, (String, F::Read)> where I: IntoIterator<Item=F>, F: File + 'a {
    /// Builds an ordered collection out of anything that iterates over files
    ///
    /// Each file name is normalized (backslashes are turned into slashes and the
    /// result is passed through `RelativePath::normalize`), after which the files
    /// are sorted by their normalized names.
    fn from(i: I) -> Self {
        use relative_path::RelativePath;

        let mut files = Vec::new();
        for file in i {
            // replace backslashes with slashes (Windows)
            let slashed = file.name().replace("\\", "/");
            let normalized = RelativePath::new(&slashed).normalize();
            files.push((String::from(normalized.as_str()), file.into_read()));
        }

        // the first tuple element is the name used for hashing
        files.sort_unstable_by(|left, right| left.0.cmp(&right.0));
        OrderedFiles(files, PhantomData)
    }
}

/// `OrderedFiles` whose entries are owned names paired with boxed readers
///
/// This is the type produced by `OrderedFiles::boxed` and by adding
/// collections of files together.
pub type BoxedOrderedFiles<'a> = OrderedFiles<'a, (String, Box<Read + 'a>)>;

use std::ops::{Add, Sub};

impl<'a, F1, F2> Add<OrderedFiles<'a, F2>> for OrderedFiles<'a, F1> where F1: File + 'a, F2: File + 'a, F1::Read: 'a, F2::Read: 'a {
    type Output = BoxedOrderedFiles<'a>;

    fn add(self, rhs: OrderedFiles<'a, F2>) -> Self::Output {
        let mut files = self.boxed().0;
        let mut rhs_files = rhs.boxed().0;
        files.append(&mut rhs_files);
        files.into()
    }
}

/// Adding an iterable of collections yields a boxed collection containing
/// the files of the left-hand side and of every collection yielded
impl<'a, F1, F2, I> Add<I> for OrderedFiles<'a, F1> where F1: File + 'a, F2: File + 'a, F1::Read: 'a, F2::Read: 'a, I: IntoIterator<Item = OrderedFiles<'a, F2>> {
    type Output = BoxedOrderedFiles<'a>;

    fn add(self, rhs: I) -> Self::Output {
        let mut combined = self.boxed().0;
        for other in rhs {
            combined.extend(other.boxed().0);
        }
        // the conversion re-establishes the ordering over the combined files
        combined.into()
    }
}

impl<'a, F, S> Sub<S> for OrderedFiles<'a, F> where F: File + 'a, S: AsRef<str> + 'a {
    type Output = Self;
    fn sub(self, rhs: S) -> Self::Output {
        let name = rhs.as_ref();
        let files: Vec<_> = self.0.into_iter().filter(|f| f.name() != name).collect();
        OrderedFiles(files, PhantomData)
    }
}

#[cfg(test)]
mod ordered_files_tests {
    use proptest::collection::*;
    use super::*;

    proptest! {
      // Files built from arbitrary names come out in non-decreasing name order
      #[test]
      fn sorted(ref i in vec("\\PC*", 0..10)) {
        let ordered_files = OrderedFiles::from(i.clone().into_iter().map(|v| (v, &[][..])));
        assert!(ordered_files.0.windows(2).all(|pair| pair[0].name() <= pair[1].name()));
      }

      // The sum of two collections is itself in non-decreasing name order
      #[test]
      fn add_sorted(ref i1 in vec("\\PC*", 0..10), ref i2 in vec("\\PC*", 0..10)) {
        let ordered_files1 = OrderedFiles::from(i1.clone().into_iter().map(|v| (v, &[][..])));
        let ordered_files2 = OrderedFiles::from(i2.clone().into_iter().map(|v| (v, &[][..])));
        let ordered_files = ordered_files1 + ordered_files2;
        assert!(ordered_files.0.windows(2).all(|pair| pair[0].name() <= pair[1].name()));
      }

      // Every file name of either operand is present in the sum
      #[test]
      fn add_includes(ref i1 in vec("\\PC*", 0..10), ref i2 in vec("\\PC*", 0..10)) {
        let ordered_files1 = OrderedFiles::from(i1.clone().into_iter().map(|v| (v, &[][..])));
        let ordered_files1_ = OrderedFiles::from(i1.clone().into_iter().map(|v| (v, &[][..])));
        let ordered_files2 = OrderedFiles::from(i2.clone().into_iter().map(|v| (v, &[][..])));
        let ordered_files2_ = OrderedFiles::from(i2.clone().into_iter().map(|v| (v, &[][..])));
        let ordered_files = ordered_files1 + ordered_files2;
        for expected in ordered_files1_.0.iter().chain(ordered_files2_.0.iter()) {
           assert!(ordered_files.0.iter().any(|f| f.name() == expected.name()));
        }
      }

      // Same as `add_includes`, but the right-hand side is an iterator of collections
      #[test]
      fn add_includes_iter(ref i1 in vec("\\PC*", 0..10), ref i2 in vec("\\PC*", 0..10)) {
        let ordered_files1 = OrderedFiles::from(i1.clone().into_iter().map(|v| (v, &[][..])));
        let ordered_files1_ = OrderedFiles::from(i1.clone().into_iter().map(|v| (v, &[][..])));
        let ordered_files2 = OrderedFiles::from(i2.clone().into_iter().map(|v| (v, &[][..])));
        let ordered_files2_ = OrderedFiles::from(i2.clone().into_iter().map(|v| (v, &[][..])));
        let ordered_files = ordered_files1 + ::std::iter::once(ordered_files2);
        for expected in ordered_files1_.0.iter().chain(ordered_files2_.0.iter()) {
           assert!(ordered_files.0.iter().any(|f| f.name() == expected.name()));
        }
      }

      // After subtracting a name, no remaining file carries it
      #[test]
      fn sub_excludes(ref names in vec("\\PC*", 0..10), i in 0..9) {
        prop_assume!(i as usize + 1 <= names.len());
        let ordered_files1 = OrderedFiles::from(names.clone().into_iter().map(|v| (v, &[][..])));
        let name = &names[i as usize];
        let ordered_files = ordered_files1 - name;
        assert!(ordered_files.0.iter().all(|f| f.name() != name));
      }

    }

    // Differently spelled but equivalent paths normalize to the same names
    #[test]
    fn ordered_files_normalizes() {
        let messy: OrderedFiles<_> = vec![("test/../hello", &b""[..]), ("/test0", &b""[..]), ("a\\b", &b""[..]), ("./test1", &b""[..])].into();
        let clean: OrderedFiles<_> = vec![("hello", &b""[..]), ("test0", &b""[..]), ("a/b", &b""[..]), ("test1", &b""[..])].into();
        let messy_names: Vec<&str> = messy.0.iter().map(|f| f.name()).collect();
        let clean_names: Vec<&str> = clean.0.iter().map(|f| f.name()).collect();
        assert_eq!(messy_names, clean_names);
    }
}

/// Record is an immutable collection of files
pub trait Record {
   /// Implementation's type for reading files
   type Read : ::std::io::Read;
   /// Implementation's type for non-encoded hash
   type Hash : AsRef<[u8]>;
   /// Implementation's type for file names
   type Str : AsRef<str>;
   /// Implementation's iterator type for listing files
   ///
   /// Every item is a pair of a file name and a reader for that file.
   type Iter : Iterator<Item=(Self::Str, Self::Read)>;
   /// Returns record hash
   fn hash(&self) -> Self::Hash;
   /// Returns encoded record hash
   ///
   /// The encoding is defined by its container (typically, the repository)
   /// and is intended to be human-readable and it MUST be an encoding of the
   /// byte array returned by [`hash`]
   ///
   /// [`hash`]: trait.Record.html#tymethod.hash
   fn encoded_hash(&self) -> Self::Str;

   /// Returns enclosing item's ID
   fn item_id(&self) -> Self::Str;

   /// Returns an iterator over files in the record
   fn file_iter(&self) -> Self::Iter;
}


use serde_json::{Value as JsonValue, Map as JsonMap};
use serde::Serializer;
use serde::ser::SerializeStruct;

pub trait RecordExt: Record {

   fn has_type<S: AsRef<str>>(&self, typ: S) -> bool {
      let len = 6 + typ.as_ref().len();
      self.file_iter().any(|(name, _)| {
         let name = name.as_ref();
         name.len() == len &&
         name.starts_with(".type/") &&
         name.ends_with(typ.as_ref())
      })
   }

   fn file<S: AsRef<str>>(&self, file: S) -> Option<Self::Read> {
      let file = file.as_ref();
      self.file_iter().find(|&(ref name, _)| name.as_ref() == file).and_then(|(_, reader)| Some(reader))
   }

   fn serde_serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> where
        S: Serializer {
       use std::io::Read;
       let mut record = serializer.serialize_struct("Record", 2)?;
       let mut files = JsonMap::new();
       let mut buf = Vec::new();
       for (name, mut reader) in self.file_iter() {
           let name = name.as_ref().into();
           match reader.read_to_end(&mut buf) {
               Ok(_) => {
                   match ::std::str::from_utf8(&buf) {
                       Err(_) => {
                           let mut typ = JsonMap::new();
                           typ.insert("type".into(), JsonValue::String("binary".into()));
                           files.insert(name, JsonValue::Object(typ));
                       },
                       Ok(str) => {
                           files.insert(name, JsonValue::String(str.into()));
                       }
                   }
               },
               Err(err) => {
                   let mut error = JsonMap::new();
                   error.insert("error".into(), JsonValue::String(format!("{}", err)));
                   files.insert(name, JsonValue::Object(error));
               }
           }
           buf.clear();
       }
       record.serialize_field("hash", self.encoded_hash().as_ref().into())?;
       record.serialize_field("files", &JsonValue::Object(files))?;
       record.end()
    }

}

/// Blanket implementation: every `Record` gets the `RecordExt` helpers
impl<T> RecordExt for T where T: Record {}