npyz 0.9.0

NumPy file format (de-)serialization. Fork of outdated npy-rs.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590

use std::io;
use core::fmt;

use py_literal::ParseError;
pub use py_literal::Value;
use byteorder::{LittleEndian, ReadBytesExt};
use num_bigint::Sign;

use crate::type_str::TypeStr;

/// Representation of a Numpy type
///
/// A `DType` is either a plain scalar type, a fixed-size array of another `DType`,
/// or a record made of named fields (see [`Field`]).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DType {
    /// Numpy type string. First character is `'>'` for big endian, `'<'` for little endian,
    /// or can be `'|'` if it doesn't matter.
    ///
    /// Examples: `>i4`, `<u8`, `>f8`, `|S7`. The number usually corresponds to the number of
    /// bytes (with the single exception of unicode strings `|U3`).
    Plain(TypeStr),

    /// Fixed-size inner array type.
    ///
    /// The first member is the length of this (outermost) dimension, the second is the
    /// element type, which may itself be another `Array`.
    ///
    /// This is only possible inside structured arrays, where fields can themselves be arrays.
    /// E.g. in the `DType` for `dtype=[('abc', 'i4', [2, 3])]`, the `DType` for `abc`
    /// will be `Array(2, Array(3, Plain("<i4")))`. In rust, such an array could be read using
    /// the following element type:
    #[cfg_attr(any(not(doctest), feature="derive"), doc = r##"
```
# #[allow(unused)]
#[derive(npyz::Serialize, npyz::Deserialize, npyz::AutoSerialize)]
struct Row {
    abc: [[i32; 3]; 2],
}
```
"##)]
    Array(u64, Box<DType>),

    /// A structured (record) type: an ordered list of named fields.
    Record(Vec<Field>),
}

#[derive(Debug, Clone, PartialEq, Eq)]
/// A field of a structured array dtype
///
/// Fields are the elements of [`DType::Record`]; each one pairs a name with the
/// (possibly nested) dtype of the data stored under that name.
pub struct Field {
    /// The name of the field
    pub name: String,

    /// The dtype of the field
    ///
    /// This may itself be a [`DType::Array`] or [`DType::Record`].
    pub dtype: DType,
}

impl DType {
    /// Numpy format description of record dtype.
    ///
    /// Calling `descr` on [`DType::Array`] will not produce a valid python expression
    /// (the string will only be suitable for error messages).
    pub fn descr(&self) -> String {
        use DType::*;
        match self {
            Plain(ty) => format!("'{}'", ty),
            Record(fields) => {
                fields.iter()
                    .map(|Field { name, dtype }| {
                        let name = PyUtf8StringLiteral(name);
                        match dtype {
                            ty@Plain(_) |
                            ty@Record(_) => format!("({}, {}), ", name, ty.descr()),

                            array@Array(..) => {
                                let (shape, elem_ty) = extract_full_array_shape(array);
                                let shape_str = shape.iter().fold(String::new(), |o, n| o + &format!("{},", n));
                                format!("({}, {}, ({})), ", name, elem_ty.descr(), shape_str)
                            },
                        }
                    }).fold("[".to_string(), |o, n| o + &n) + "]"
            },

            Array(n, inner) => format!("<< array {} of {} >>", n, inner.descr()),
        }
    }

    // Create from description AST
    pub(crate) fn from_descr(descr: &Value) -> io::Result<Self> {
        use DType::*;
        match descr {
            Value::String(string) => Ok(Self::new_scalar(convert_string_to_type_str(string)?)),
            Value::List(list) => Ok(Record(convert_list_to_record_fields(list)?)),
            _ => Err(invalid_data("must be string or list")),
        }
    }

    // not part of stable API, but needed by the serialize_array test
    #[doc(hidden)]
    pub fn parse(source: &str) -> io::Result<Self> {
        let descr = parse_header_text_to_io_result(source.as_bytes())?;
        Self::from_descr(&descr)
    }

    /// Construct a scalar `DType`. (one which is not a nested array or record type)
    pub fn new_scalar(ty: TypeStr) -> Self {
        DType::Plain(ty)
    }

    /// Return a `TypeStr` only if the `DType` is a primitive scalar. (no arrays or record types)
    pub(crate) fn as_scalar(&self) -> Option<&TypeStr> {
        match self {
            DType::Plain(ty) => Some(ty),
            _ => None,
        }
    }

    /// Get the number of bytes that each item of this type occupies.
    ///
    /// If this size is not fixed (e.g. `|O`) or would overflow the platform's `usize` type, returns `None`.
    /// You can differentiate between these two error cases by calling [`Self::has_variable_size`].
    pub fn num_bytes(&self) -> Option<usize> {
        match self {
            DType::Plain(ty) => ty.num_bytes(),
            DType::Array(n, inner) => inner.num_bytes()?.checked_mul(usize::try_from(*n).ok()?),
            DType::Record(fields) => {
                fields.iter().map(|field| field.dtype.num_bytes())
                    .fold(Some(0), |a, b| a?.checked_add(b?))
            },
        }
    }

    /// `true` if an array using this `DType` is stored using `pickle`.
    ///
    /// This is true if and only if the type contains an [`'O'`](TypeChar::Object) dtype.
    ///
    /// Pickled arrays present unique challenges, and most of the `npyz` crate does not support them
    /// beyond parsing or writing the file header.
    pub fn uses_pickled_array(&self) -> bool {
        match self {
            DType::Plain(ty) => ty.uses_pickled_array(),
            DType::Array(_, inner) => inner.uses_pickled_array(),
            DType::Record(fields) => fields.iter().any(|field| field.dtype.uses_pickled_array()),
        }
    }
}

/// Convert the elements of a record description list into [`Field`]s.
///
/// Each element must itself be a list or tuple describing one field; the first
/// element that is not (or that fails to convert) aborts with its error.
fn convert_list_to_record_fields(values: &[Value]) -> io::Result<Vec<Field>> {
    let mut fields = Vec::with_capacity(values.len());
    for value in values {
        let entry = match value {
            Value::List(items) | Value::Tuple(items) => items,
            _ => return Err(invalid_data("list must contain list or tuple")),
        };
        fields.push(convert_tuple_to_record_field(entry)?);
    }
    Ok(fields)
}

/// Convert one `(name, dtype)` or `(name, dtype, shape)` entry into a [`Field`].
///
/// When a shape is present, the dtype is wrapped in one [`DType::Array`] per dimension,
/// with the first dimension outermost.
fn convert_tuple_to_record_field(tuple: &[Value]) -> io::Result<Field> {
    let (name_value, descr_value, shape_value) = match tuple {
        [name, descr] => (name, descr, None),
        [name, descr, shape] => (name, descr, Some(shape)),
        _ => return Err(invalid_data("list entry must contain 2 or 3 items")),
    };

    let name = match name_value {
        Value::String(name) => name.clone(),
        _ => return Err(invalid_data("list entry must contain a string for id")),
    };

    let elem_dtype = DType::from_descr(descr_value)?;
    let dtype = match shape_value {
        // Wrap from the last dimension inwards so the first dimension ends up outermost.
        Some(shape) => convert_value_to_shape(shape)?
            .into_iter()
            .rev()
            .fold(elem_dtype, |inner, dim| DType::Array(dim, Box::new(inner))),
        None => elem_dtype,
    };

    Ok(Field { name, dtype })
}

/// Borrow the items of a python list or tuple value; `None` for any other kind of value.
fn convert_value_to_sequence(field: &Value) -> Option<&[Value]> {
    match field {
        Value::List(items) | Value::Tuple(items) => Some(items.as_slice()),
        _ => None,
    }
}

/// Convert a python integer value into a single shape dimension.
///
/// Fails with `InvalidData` if the value is not an integer, is negative,
/// or does not fit in a `u64`.
fn convert_value_to_shape_integer(number: &Value) -> io::Result<u64> {
    let bigint = match number {
        Value::Integer(bigint) => bigint,
        _ => return Err(invalid_data("dimension must be an integer")),
    };

    let (sign, digits) = bigint.to_u64_digits();
    match (sign, digits.as_slice()) {
        (Sign::Minus, _) => Err(invalid_data("dimension cannot be negative")),
        (Sign::NoSign, _) => Ok(0),
        // Exactly one u64 digit means the value fits.
        (_, [only]) => Ok(*only),
        _ => Err(invalid_data("dimension cannot be larger than u64")),
    }
}

/// Peel off every leading [`DType::Array`] layer.
///
/// Returns the collected dimensions (outermost first) together with the first
/// non-array dtype found underneath them.
fn extract_full_array_shape(mut dtype: &DType) -> (Vec<u64>, &DType) {
    let mut dims = Vec::new();
    loop {
        match dtype {
            DType::Array(dim, inner) => {
                dims.push(*dim);
                dtype = &**inner;
            },
            _ => return (dims, dtype),
        }
    }
}

/// Convert a python list or tuple of integers into a shape.
///
/// # Errors
///
/// Returns an `InvalidData` error if `field` is not a list or tuple, or if any
/// element is not a valid dimension (see `convert_value_to_shape_integer`).
pub(crate) fn convert_value_to_shape(field: &Value) -> io::Result<Vec<u64>> {
    // `ok_or_else` keeps the error (and its message allocation) off the success path.
    let dims = convert_value_to_sequence(field)
        .ok_or_else(|| invalid_data("shape must be list or tuple"))?;
    dims.iter().map(convert_value_to_shape_integer).collect()
}

/// Parse a numpy type string, reporting parse failures as `InvalidData` I/O errors.
fn convert_string_to_type_str(string: &str) -> io::Result<TypeStr> {
    string
        .parse::<TypeStr>()
        .map_err(|e| invalid_data(format_args!("invalid type string: {}", e)))
}

/// Build an [`io::Error`] of kind [`io::ErrorKind::InvalidData`] carrying `message` as its text.
fn invalid_data(message: impl ToString) -> io::Error {
    let text: String = message.to_string();
    io::Error::new(io::ErrorKind::InvalidData, text)
}

/// Read the pre-header and header text from `r` and parse the text as a python literal.
///
/// Exactly `header_size` bytes (as declared by the pre-header) are consumed for the text.
pub(crate) fn read_header(r: &mut dyn io::Read) -> io::Result<Value> {
    let pre_header = read_pre_header(r)?;

    // FIXME: properly account for encoding
    let _ = pre_header.version_props.encoding;

    let mut raw_text = vec![0u8; pre_header.header_size];
    r.read_exact(&mut raw_text)?;

    parse_header_text_to_io_result(&raw_text)
}

/// Parse header bytes as a UTF-8 encoded python literal.
///
/// A single trailing `\n`, if present, is dropped before parsing. Both UTF-8 and
/// literal-syntax failures are reported as `InvalidData` I/O errors.
fn parse_header_text_to_io_result(bytes: &[u8]) -> io::Result<Value> {
    let text_bytes = match bytes {
        [rest @ .., b'\n'] => rest,
        _ => bytes,
    };

    let text = std::str::from_utf8(text_bytes)
        .map_err(|_| invalid_data("could not parse utf-8"))?;

    text.parse::<Value>()
        .map_err(|e: ParseError| invalid_data(format_args!("could not parse Python expression: {}", e)))
}

/// The fixed-layout part of an NPY file that precedes the header text:
/// the magic string, the format version, and the header length field.
struct PreHeader {
    /// Properties implied by the file's format version.
    version_props: VersionProps,
    /// Length in bytes of the header text that follows the pre-header.
    header_size: usize,
}

fn read_pre_header(r: &mut dyn io::Read) -> io::Result<PreHeader> {
    let version = read_magic_and_version(r)?;
    let version_props = get_version_props(version)?;

    let header_size = match version_props.header_size_type {
        HeaderSizeType::U32 => r.read_u32::<LittleEndian>()? as usize,
        HeaderSizeType::U16 => r.read_u16::<LittleEndian>()? as usize,
    };

    Ok(PreHeader { version_props, header_size })
}

/// Read the 8-byte preamble (6-byte magic string plus major/minor version bytes).
///
/// Returns `(major, minor)`. A stream that ends early or does not start with the
/// magic string yields an `InvalidData` error; other I/O errors are passed through.
fn read_magic_and_version(r: &mut dyn io::Read) -> io::Result<(u8, u8)> {
    let magic_err = || invalid_data("magic not found for NPY file");

    let mut preamble = [0u8; 8];
    if let Err(e) = r.read_exact(&mut preamble) {
        // A truncated file is reported the same way as a wrong magic string.
        return Err(if e.kind() == io::ErrorKind::UnexpectedEof { magic_err() } else { e });
    }

    if !preamble.starts_with(b"\x93NUMPY") {
        return Err(magic_err());
    }

    let (major, minor) = (preamble[6], preamble[7]);
    Ok((major, minor))
}

/// Width of the little-endian header length field that follows the version bytes.
///
/// `get_version_props` selects `U16` for version (1, 0) and `U32` for versions
/// (2, 0) and (3, 0).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum HeaderSizeType { U16, U32 }

/// Text encoding of the header, as determined by the format version.
///
/// `get_version_props` selects `Ascii` for versions (1, 0) and (2, 0), and `Utf8`
/// for version (3, 0).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum HeaderEncoding {
    // Note: there is a suspicious phrase in the documentation:
    //
    //    "replaces the ASCII string (which in practice was latin1)"
    //
    // ...which suggests that this might actually be ANSI and not ASCII?
    Ascii,
    Utf8,
}
/// The format properties that differ between NPY versions.
#[derive(Debug, Clone)]
pub(crate) struct VersionProps {
    /// Width of the header length field.
    pub(crate) header_size_type: HeaderSizeType,
    /// Encoding of the header text.
    pub(crate) encoding: HeaderEncoding,
}
impl VersionProps {
    /// Number of bytes that precede the header text: the 8-byte magic-and-version
    /// preamble plus the header length field.
    pub fn bytes_before_text(&self) -> usize {
        let size_field_len = match self.header_size_type {
            HeaderSizeType::U16 => 2,
            HeaderSizeType::U32 => 4,
        };
        8 + size_field_len
    }
}

pub(crate) fn get_version_props(version: (u8, u8)) -> io::Result<VersionProps> {
    use self::HeaderSizeType::*;
    use self::HeaderEncoding::*;
    match version {
        (1, 0) => Ok(VersionProps { header_size_type: U16, encoding: Ascii }),
        (2, 0) => Ok(VersionProps { header_size_type: U32, encoding: Ascii }),
        (3, 0) => Ok(VersionProps { header_size_type: U32, encoding: Utf8 }),
        _ => Err(invalid_data(format_args!("unsupported version: ({}, {})", version.0, version.1))),
    }
}

/// Formats a python string literal.
///
/// The output is wrapped in single quotes; backslashes, single quotes, carriage
/// returns and line feeds are backslash-escaped and every other character is
/// written as-is.
///
/// Unlike the [`Display`] impl for [`py_literal`], the string is encoded in
/// UTF-8 (supported by NPY version 3), resulting in fewer escapes.
struct PyUtf8StringLiteral<'a>(&'a str);

impl fmt::Display for PyUtf8StringLiteral<'_> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let needs_escape = |c: char| matches!(c, '\\' | '\'' | '\r' | '\n');

        f.write_str("'")?;
        let mut rest = self.0;
        while let Some(pos) = rest.find(needs_escape) {
            // Copy the untouched run verbatim, then emit the escape for the special byte.
            f.write_str(&rest[..pos])?;
            let escape = match rest.as_bytes()[pos] {
                b'\\' => "\\\\",
                b'\'' => "\\'",
                b'\r' => "\\r",
                _ => "\\n",
            };
            f.write_str(escape)?;
            // All escaped characters are single-byte ASCII, so `pos + 1` is a char boundary.
            rest = &rest[pos + 1..];
        }
        f.write_str(rest)?;
        f.write_str("'")
    }
}

/// Pick the lowest NPY version that provides all of `required_props`.
///
/// UTF-8 headers need (3, 0); otherwise a 32-bit header length needs (2, 0);
/// otherwise (1, 0) suffices.
pub(crate) fn get_minimal_version(required_props: VersionProps) -> (u8, u8) {
    match (required_props.encoding, required_props.header_size_type) {
        (HeaderEncoding::Utf8, _) => (3, 0),
        (HeaderEncoding::Ascii, HeaderSizeType::U32) => (2, 0),
        (HeaderEncoding::Ascii, HeaderSizeType::U16) => (1, 0),
    }
}


/// Unit tests for dtype description formatting (`DType::descr`), description parsing
/// (`DType::from_descr`), size computation, and shape conversion error handling.
#[cfg(test)]
mod tests {
    use super::*;
    use std::error::Error;

    type TestResult = std::result::Result<(), Box<dyn Error>>;

    #[test]
    fn description_of_record_array_as_python_list_of_tuples() -> TestResult {
        let dtype = DType::Record(vec![
            Field {
                name: "float".to_string(),
                dtype: DType::Plain(">f4".parse()?),
            },
            Field {
                name: "byte".to_string(),
                dtype: DType::Plain("<u1".parse()?),
            }
        ]);
        let expected = "[('float', '>f4'), ('byte', '<u1'), ]";
        assert_eq!(dtype.descr(), expected);
        Ok(())
    }

    #[test]
    fn description_of_unstructured_primitive_array() -> TestResult {
        let dtype = DType::Plain(">f8".parse()?);
        assert_eq!(dtype.descr(), "'>f8'");
        Ok(())
    }

    #[test]
    fn description_of_nested_record_dtype() -> TestResult {
        let dtype = DType::Record(vec![
            Field {
                name: "parent".to_string(),
                dtype: DType::Record(vec![
                    Field {
                        name: "child".to_string(),
                        dtype: DType::Plain("<i4".parse()?),
                    },
                ]),
            }
        ]);
        assert_eq!(dtype.descr(), "[('parent', [('child', '<i4'), ]), ]");
        Ok(())
    }

    // Covers the `Array` branch of `descr` for record fields: nested arrays are
    // flattened into one shape tuple, and the result parses back to the same dtype.
    #[test]
    fn description_of_record_with_array_field_roundtrips() -> TestResult {
        let original_dtype = DType::Record(vec![
            Field {
                name: "a".to_string(),
                dtype: DType::Array(2, Box::new(
                    DType::Array(3, Box::new(DType::Plain(">f8".parse()?))),
                )),
            }
        ]);
        let text = original_dtype.descr();
        assert_eq!(text, "[('a', '>f8', (2,3,)), ]");
        assert_eq!(DType::from_descr(&parse(&text)).unwrap(), original_dtype);
        Ok(())
    }

    #[test]
    fn converts_simple_description_to_plain_dtype() -> TestResult {
        let dtype = ">f8";
        assert_eq!(
            DType::from_descr(&Value::String(dtype.to_string())).unwrap(),
            DType::Plain(dtype.parse()?),
        );
        Ok(())
    }

    #[test]
    fn converts_non_endian_description_to_plain_dtype() -> TestResult {
        let dtype = "|u1";
        assert_eq!(
            DType::from_descr(&Value::String(dtype.to_string())).unwrap(),
            DType::Plain(dtype.parse()?),
        );
        Ok(())
    }


    #[test]
    fn converts_object_description_to_plain_dtype() -> TestResult {
        let dtype = "|O";
        assert_eq!(
            DType::from_descr(&Value::String(dtype.to_string())).unwrap(),
            DType::Plain(dtype.parse()?),
        );
        Ok(())
    }

    #[test]
    fn converts_record_description_to_record_dtype() -> TestResult {
        let descr = parse("[('a', '<u2'), ('b', '<f4')]");
        let expected_dtype = DType::Record(vec![
            Field {
                name: "a".to_string(),
                dtype: DType::Plain("<u2".parse()?),
            },
            Field {
                name: "b".to_string(),
                dtype: DType::Plain("<f4".parse()?),
            }
        ]);
        assert_eq!(DType::from_descr(&descr).unwrap(), expected_dtype);
        Ok(())
    }

    #[test]
    fn handles_variable_size_record() -> TestResult {
        let descr = parse("[('a', '<u2'), ('b', '|O')]");
        let expected_dtype = DType::Record(vec![
            Field {
                name: "a".to_string(),
                dtype: DType::Plain("<u2".parse()?),
            },
            Field {
                name: "b".to_string(),
                dtype: DType::Plain("|O".parse()?),
            }
        ]);
        assert_eq!(DType::from_descr(&descr).unwrap(), expected_dtype);
        Ok(())
    }

    #[test]
    fn funny_member_name_roundtrips() -> TestResult {
        let original_dtype = DType::Record(vec![
            Field {
                name: " \'\"\r\n\\ ".to_string(),
                dtype: DType::Plain("<u2".parse()?),
            },
        ]);
        let descr = parse(&original_dtype.descr());
        assert_eq!(DType::from_descr(&descr).unwrap(), original_dtype);
        Ok(())
    }

    #[test]
    fn record_description_with_onedimensional_field_shape_declaration() -> TestResult {
        let descr = parse("[('a', '>f8', (1,))]");
        let expected_dtype = DType::Record(vec![
            Field {
                name: "a".to_string(),
                dtype: DType::Array(1, Box::new(DType::Plain(">f8".parse()?))),
            }
        ]);
        assert_eq!(DType::from_descr(&descr).unwrap(), expected_dtype);
        Ok(())
    }

    #[test]
    fn record_description_with_nested_record_field() -> TestResult {
        let descr = parse("[('parent', [('child', '<i4')])]");
        let expected_dtype = DType::Record(vec![
            Field {
                name: "parent".to_string(),
                dtype: DType::Record(vec![
                    Field {
                        name: "child".to_string(),
                        dtype: DType::Plain("<i4".parse()?),
                    },
                ]),
            }
        ]);
        assert_eq!(DType::from_descr(&descr).unwrap(), expected_dtype);
        Ok(())
    }

    #[test]
    fn nested_record_field_array() -> TestResult {
        let descr = parse("[('parent', [('child', '<i4')], (2,))]");
        let expected_dtype = DType::Record(vec![
            Field {
                name: "parent".to_string(),
                dtype: DType::Array(2, Box::new(DType::Record(vec![
                    Field {
                        name: "child".to_string(),
                        dtype: DType::Plain("<i4".parse()?),
                    },
                ]))),
            }
        ]);
        assert_eq!(DType::from_descr(&descr).unwrap(), expected_dtype);
        Ok(())
    }

    #[test]
    fn test_size_of_packed_data() -> TestResult {
        let descr = parse("[('a', '<u2'), ('b', '>i4')]");
        let dtype = DType::from_descr(&descr).unwrap();
        assert_eq!(dtype.num_bytes(), Some(6));
        Ok(())
    }

    #[test]
    fn test_variable_size() -> TestResult {
        let descr = parse("[('a', '<u2'), ('b', '|O')]");
        let dtype = DType::from_descr(&descr).unwrap();
        assert_eq!(dtype.num_bytes(), None);
        assert!(dtype.uses_pickled_array());
        Ok(())
    }

    #[test]
    fn errors_on_value_variants_that_cannot_be_converted() {
        let no_dtype = Value::Boolean(false);
        assert!(DType::from_descr(&no_dtype).is_err());
    }

    #[test]
    fn errors_when_record_list_does_not_contain_lists() {
        let faulty_list = parse("['a', 123]");
        assert!(DType::from_descr(&faulty_list).is_err());
    }

    #[test]
    fn errors_when_record_list_entry_contains_too_few_items() {
        let faulty_list = parse("[('a',)]");
        assert!(DType::from_descr(&faulty_list).is_err());
    }

    #[test]
    fn errors_when_record_list_entry_contains_too_many_items() {
        let faulty_list = parse("[('a', 1, 2, 3)]");
        assert!(DType::from_descr(&faulty_list).is_err());
    }

    #[test]
    fn errors_when_record_list_entry_contains_non_strings_for_id_or_dtype() {
        // Non-string id. The id is checked first, so this never reaches the dtype.
        let faulty_id = parse("[(1, 2)]");
        assert!(DType::from_descr(&faulty_id).is_err());

        // Valid id, but the dtype is neither a string nor a list.
        let faulty_dtype = parse("[('a', 2)]");
        assert!(DType::from_descr(&faulty_dtype).is_err());
    }

    #[test]
    fn errors_when_shape_is_not_a_list() {
        let no_shape = parse("1");
        assert!(convert_value_to_shape(&no_shape).is_err());
    }

    #[test]
    fn errors_when_shape_number_is_not_a_number() {
        let no_number = parse("[]");
        assert!(convert_value_to_shape_integer(&no_number).is_err());
    }

    #[test]
    fn errors_when_shape_number_is_negative() {
        assert!(convert_value_to_shape_integer(&parse("-1")).is_err());
    }

    #[test]
    fn errors_when_shape_number_is_larger_than_u64() {
        // 2^64, the smallest value that no longer fits in a u64.
        assert!(convert_value_to_shape_integer(&parse("18446744073709551616")).is_err());
    }

    /// Parse a python literal, panicking with the parser's message on failure.
    fn parse(source: &str) -> Value {
        source.parse().unwrap_or_else(|e| panic!("could not parse Python expression:\n{}", e))
    }
}