ion_rs/
reader.rs

1use std::io;
2use std::io::Read;
3use std::ops::Range;
4
5use delegate::delegate;
6
7use crate::binary::constants::v1_0::IVM;
8use crate::constants::v1_0::system_symbol_ids;
9use crate::data_source::ToIonDataSource;
10use crate::element::{Blob, Clob};
11use crate::raw_reader::{RawReader, RawStreamItem};
12use crate::raw_symbol_token::RawSymbolToken;
13use crate::result::{decoding_error, decoding_error_raw, IonResult};
14use crate::stream_reader::IonReader;
15use crate::symbol_table::SymbolTable;
16use crate::types::{Decimal, Int, Symbol, Timestamp};
17use crate::{BlockingRawBinaryReader, BlockingRawTextReader, IonType};
18use std::fmt::{Display, Formatter};
19
20use crate::types::Str;
/// Configures and constructs new instances of [Reader].
///
/// The builder currently has no fields; it exists so that settings can be added later
/// without changing how callers construct a [Reader].
pub struct ReaderBuilder {}
23
24impl ReaderBuilder {
25    /// Constructs a [ReaderBuilder] pre-populated with common default settings.
26    pub fn new() -> ReaderBuilder {
27        ReaderBuilder {
28            // Eventually, this will contain settings like a `Catalog` implementation.
29        }
30    }
31
32    /// Applies the specified settings to a new instance of `Reader`. This process involves
33    /// reading some data from the beginning of `input` to detect whether its content is
34    /// text or binary Ion. If this read operation fails, `build` will return an `Err`
35    /// describing the problem it encountered.
36    pub fn build<'a, I: 'a + ToIonDataSource>(self, input: I) -> IonResult<Reader<'a>> {
37        // Convert the provided input into an implementation of `BufRead`
38        let mut input = input.to_ion_data_source();
39        // Stack-allocated buffer to hold the first four bytes from input
40        let mut header: [u8; 4] = [0u8; 4];
41
42        // Read up to four bytes of input. This has to be done somewhat manually. Convenience
43        // functions like `read_exact` will return an error if the input doesn't contain the
44        // correct number of bytes, and there are legal Ion streams that have fewer than four
45        // bytes in them. (For example, the stream `1 `.)
46        let mut total_bytes_read = 0usize;
47        while total_bytes_read < IVM.len() {
48            let bytes_read = input.read(&mut header[total_bytes_read..])?;
49            // If `bytes_read` is zero, we reached the end of the file before we could get
50            // all four bytes. That means this isn't a (valid) binary stream. We'll assume
51            // it's text.
52            if bytes_read == 0 {
53                // `header` is a stack-allocated buffer that won't outlive this function call.
54                // If it were full, we could move the whole `[u8; 4]` into the reader. However,
55                // only some of it is populated and we can't use a slice of it because the array
56                // is short-lived. Instead we'll make a statically owned copy of the bytes that
57                // we can move into the reader.
58                let owned_header = Vec::from(&header[..total_bytes_read]);
59                // The file was too short to be binary Ion. Construct a text Reader.
60                return Self::make_text_reader(owned_header);
61            }
62            total_bytes_read += bytes_read;
63        }
64
65        // If we've reached this point, we successfully read 4 bytes from the file into `header`.
66        // Match against `header` to see if it contains the Ion 1.0 version marker.
67        match header {
68            [0xe0, 0x01, 0x00, 0xea] => {
69                // Binary Ion v1.0
70                let full_input = io::Cursor::new(header).chain(input);
71                Ok(Self::make_binary_reader(full_input)?)
72            }
73            [0xe0, major, minor, 0xea] => {
74                // Binary Ion v{major}.{minor}
75                decoding_error(format!(
76                    "cannot read Ion v{major}.{minor}; only v1.0 is supported"
77                ))
78            }
79            _ => {
80                // It's not binary, assume it's text
81                let full_input = io::Cursor::new(header).chain(input);
82                Ok(Self::make_text_reader(full_input)?)
83            }
84        }
85    }
86
87    fn make_text_reader<'a, I: 'a + ToIonDataSource>(data: I) -> IonResult<Reader<'a>> {
88        let raw_reader = Box::new(BlockingRawTextReader::new(data)?);
89        Ok(Reader {
90            raw_reader,
91            symbol_table: SymbolTable::new(),
92        })
93    }
94
95    fn make_binary_reader<'a, I: 'a + ToIonDataSource>(data: I) -> IonResult<Reader<'a>> {
96        let raw_reader = Box::new(BlockingRawBinaryReader::new(data)?);
97        Ok(Reader {
98            raw_reader,
99            symbol_table: SymbolTable::new(),
100        })
101    }
102}
103
104impl Default for ReaderBuilder {
105    fn default() -> Self {
106        ReaderBuilder::new()
107    }
108}
109
/// A Reader that uses dynamic dispatch to abstract over the format (text or binary) being
/// read by an underlying [RawReader].
///
/// The lifetime `'a` bounds the boxed raw reader, and therefore the data source it reads.
pub type Reader<'a> = UserReader<Box<dyn RawReader + 'a>>;
113
/// A streaming Ion reader that resolves symbol IDs into their corresponding text.
///
/// Reader itself is format-agnostic; all format-specific logic is handled by the
/// wrapped [RawReader] implementation.
pub struct UserReader<R: RawReader> {
    // Source of raw (unresolved) stream items; symbol IDs coming out of it are looked up
    // in `symbol_table`.
    raw_reader: R,
    // The symbol table currently in effect. It is reset when an Ion v1.0 version marker
    // is encountered and updated when a local symbol table struct is read.
    symbol_table: SymbolTable,
}
122
123impl<R: RawReader> UserReader<R> {
124    pub fn new(raw_reader: R) -> UserReader<R> {
125        UserReader {
126            raw_reader,
127            symbol_table: SymbolTable::new(),
128        }
129    }
130}
131
132// This module exists to allow our integration tests to directly construct a `UserReader`
133// with not-yet-supported settings. We want users to use `ReaderBuilder` instead; eventually,
134// `ReaderBuilder` will also work for the integration tests and we can remove this.
135// See: https://github.com/amazon-ion/ion-rust/issues/484
136#[doc(hidden)]
137pub mod integration_testing {
138    use crate::{RawReader, Reader, UserReader};
139
140    pub fn new_reader<'a, R: 'a + RawReader>(raw_reader: R) -> Reader<'a> {
141        UserReader::new(Box::new(raw_reader))
142    }
143}
144
/// Stream components that an application-level [Reader] implementation may encounter.
#[derive(Debug, Eq, PartialEq, Copy, Clone)]
pub enum StreamItem {
    /// A non-null Ion value and its corresponding Ion data type.
    Value(IonType),
    /// A null Ion value and its corresponding Ion data type.
    Null(IonType),
    /// Indicates that the reader is not positioned over anything. This can happen:
    /// * before the reader has begun processing the stream.
    /// * after the reader has stepped into a container, but before `next()` has been called
    /// * after the reader has stepped out of a container, but before `next()` has been called
    /// * after the reader has read the last item in a container
    Nothing,
}
159
160impl StreamItem {
161    /// If `is_null` is `true`, returns `StreamItem::Value(ion_type)`. Otherwise,
162    /// returns `StreamItem::Null(ion_type)`.
163    pub fn nullable_value(ion_type: IonType, is_null: bool) -> StreamItem {
164        if is_null {
165            StreamItem::Null(ion_type)
166        } else {
167            StreamItem::Value(ion_type)
168        }
169    }
170}
171
172impl Display for StreamItem {
173    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
174        use StreamItem::*;
175        match self {
176            Value(ion_type) => write!(f, "{ion_type}"),
177            Null(ion_type) => write!(f, "null.{ion_type}"),
178            Nothing => Ok(()),
179        }
180    }
181}
182
183impl<R: RawReader> UserReader<R> {
184    pub fn read_raw_symbol(&mut self) -> IonResult<RawSymbolToken> {
185        self.raw_reader.read_symbol()
186    }
187
188    pub fn raw_field_name_token(&mut self) -> IonResult<RawSymbolToken> {
189        self.raw_reader.field_name()
190    }
191
192    fn read_symbol_table(&mut self) -> IonResult<()> {
193        self.raw_reader.step_in()?;
194
195        let mut is_append = false;
196        let mut new_symbols = vec![];
197
198        // It's illegal for a symbol table to have multiple `symbols` or `imports` fields.
199        // Keep track of whether we've already encountered them.
200        let mut has_found_symbols_field = false;
201        let mut has_found_imports_field = false;
202
203        loop {
204            let ion_type = match self.raw_reader.next()? {
205                RawStreamItem::Value(ion_type) => ion_type,
206                RawStreamItem::Null(_) => continue,
207                RawStreamItem::Nothing => break,
208                RawStreamItem::VersionMarker(major, minor) => {
209                    return decoding_error(format!(
210                        "encountered Ion version marker for v{major}.{minor} in symbol table"
211                    ))
212                }
213            };
214
215            let field_id = self
216                .raw_reader
217                .field_name()
218                .expect("No field ID found inside $ion_symbol_table struct.");
219            match (field_id, ion_type) {
220                // The field name is either SID 6 or the text 'imports' and the
221                // field value is a non-null List
222                (symbol, IonType::List)
223                    if symbol.matches(system_symbol_ids::IMPORTS, "imports") =>
224                {
225                    // TODO: SST imports. This implementation only supports local symbol
226                    //       table imports and appends.
227                    return decoding_error("importing shared symbol tables is not yet supported");
228                }
229                // The field name is either SID 6 or the text 'imports' and the
230                // field value is a non-null symbol
231                (symbol, IonType::Symbol)
232                    if symbol.matches(system_symbol_ids::IMPORTS, "imports") =>
233                {
234                    if has_found_imports_field {
235                        return decoding_error("symbol table had multiple 'imports' fields");
236                    }
237                    has_found_imports_field = true;
238                    let import_symbol = self.raw_reader.read_symbol()?;
239                    if !import_symbol.matches(3, "$ion_symbol_table") {
240                        // Field name `imports` with a symbol other than $ion_symbol_table is ignored
241                        continue;
242                    }
243                    is_append = true;
244                }
245                // The field name is either SID 7 or the text 'imports' and the
246                // field value is a non-null list
247                (symbol, IonType::List)
248                    if symbol.matches(system_symbol_ids::SYMBOLS, "symbols") =>
249                {
250                    if has_found_symbols_field {
251                        return decoding_error("symbol table had multiple 'symbols' fields");
252                    }
253                    has_found_symbols_field = true;
254                    self.raw_reader.step_in()?;
255                    loop {
256                        use RawStreamItem::*;
257                        match self.raw_reader.next()? {
258                            Value(IonType::String) => {
259                                new_symbols.push(Some(self.raw_reader.read_string()?));
260                            }
261                            Value(_) | Null(_) => {
262                                // If we encounter a non-string or null, add a placeholder
263                                new_symbols.push(None);
264                            }
265                            VersionMarker(_, _) => {
266                                return decoding_error("Found IVM in symbol table.")
267                            }
268                            Nothing => break,
269                        }
270                    }
271                    self.raw_reader.step_out()?;
272                }
273                something_else => {
274                    unimplemented!("No support for {:?}", something_else);
275                }
276            }
277        }
278
279        if is_append {
280            // We're adding new symbols to the end of the symbol table.
281            for maybe_text in new_symbols.drain(..) {
282                let _sid = self.symbol_table.intern_or_add_placeholder(maybe_text);
283            }
284        } else {
285            // The symbol table has been set by defining new symbols without importing the current
286            // symbol table.
287            self.symbol_table.reset();
288            for maybe_text in new_symbols.drain(..) {
289                let _sid = self.symbol_table.intern_or_add_placeholder(maybe_text);
290            }
291        }
292
293        self.raw_reader.step_out()?;
294        Ok(())
295    }
296
297    fn raw_annotations(&mut self) -> impl Iterator<Item = RawSymbolToken> + '_ {
298        // RawReader implementations do not attempt to resolve each annotation into text.
299        // Additionally, they perform all I/O related to annotations in their implementations
300        // of Reader::next. As such, it's safe to call `unwrap()` on each raw annotation.
301        self.raw_reader.annotations().map(|a| a.unwrap())
302    }
303
304    pub fn symbol_table(&self) -> &SymbolTable {
305        &self.symbol_table
306    }
307}
308
309impl<R: RawReader> IonReader for UserReader<R> {
310    type Item = StreamItem;
311    type Symbol = Symbol;
312
313    fn current(&self) -> Self::Item {
314        if let Some(ion_type) = self.ion_type() {
315            return if self.is_null() {
316                StreamItem::Null(ion_type)
317            } else {
318                StreamItem::Value(ion_type)
319            };
320        }
321        StreamItem::Nothing
322    }
323
324    /// Advances the raw reader to the next user-level Ion value, processing any system-level directives
325    /// encountered along the way.
326    // v-- Clippy complains that `next` resembles `Iterator::next()`
327    #[allow(clippy::should_implement_trait)]
328    fn next(&mut self) -> IonResult<Self::Item> {
329        use RawStreamItem::*;
330        loop {
331            match self.raw_reader.next()? {
332                VersionMarker(1, 0) => {
333                    self.symbol_table.reset();
334                }
335                VersionMarker(major, minor) => {
336                    return decoding_error(format!(
337                        "Encountered a version marker for v{major}.{minor}, but only v1.0 is supported."
338                    ));
339                }
340                Value(IonType::Struct) => {
341                    // Top-level structs whose _first_ annotation is $ion_symbol_table are
342                    // interpreted as local symbol tables. Other trailing annotations (if any) are
343                    // ignored. If the first annotation is something other than `$ion_symbol_table`,
344                    // the struct is considered user data even if one of the trailing annotations
345                    // is `$ion_symbol_table`. For more information, see this section of the spec:
346                    // https://amazon-ion.github.io/ion-docs/docs/symbols.html#local-symbol-tables
347                    if self.raw_reader.depth() == 0 {
348                        let is_symtab = match self.raw_reader.annotations().next() {
349                            Some(Err(error)) => return Err(error),
350                            Some(Ok(symbol))
351                                if symbol.matches(
352                                    system_symbol_ids::ION_SYMBOL_TABLE,
353                                    "$ion_symbol_table",
354                                ) =>
355                            {
356                                true
357                            }
358                            _ => false,
359                        };
360                        // This logic cannot be merged into the `match` statement above because
361                        // `self.read_symbol_table()` requires a mutable borrow which is not
362                        // possible while iterating over the reader's annotations.
363                        if is_symtab {
364                            self.read_symbol_table()?;
365                            continue;
366                        }
367                    }
368                    return Ok(StreamItem::Value(IonType::Struct));
369                }
370                Value(ion_type) => return Ok(StreamItem::Value(ion_type)),
371                Null(ion_type) => return Ok(StreamItem::Null(ion_type)),
372                Nothing => return Ok(StreamItem::Nothing),
373            }
374        }
375    }
376
377    fn field_name(&self) -> IonResult<Self::Symbol> {
378        match self.raw_reader.field_name()? {
379            RawSymbolToken::SymbolId(sid) => {
380                self.symbol_table.symbol_for(sid).cloned().ok_or_else(|| {
381                    decoding_error_raw(format!("encountered field ID with unknown text: ${sid}"))
382                })
383            }
384            RawSymbolToken::Text(text) => Ok(Symbol::owned(text)),
385        }
386    }
387
388    fn annotations<'a>(&'a self) -> Box<dyn Iterator<Item = IonResult<Self::Symbol>> + 'a> {
389        let iterator = self
390            .raw_reader
391            .annotations()
392            .map(move |raw_token| match raw_token? {
393                RawSymbolToken::SymbolId(sid) => {
394                    self.symbol_table.symbol_for(sid).cloned().ok_or_else(|| {
395                        decoding_error_raw(format!("found annotation ID with unknown text: ${sid}"))
396                    })
397                }
398                RawSymbolToken::Text(text) => Ok(Symbol::owned(text)),
399            });
400        Box::new(iterator)
401    }
402
403    fn read_symbol(&mut self) -> IonResult<Self::Symbol> {
404        match self.raw_reader.read_symbol()? {
405            RawSymbolToken::SymbolId(symbol_id) => {
406                if let Some(symbol) = self.symbol_table.symbol_for(symbol_id) {
407                    Ok(symbol.clone())
408                } else {
409                    decoding_error(format!(
410                        "Found symbol ID ${symbol_id}, which is not defined."
411                    ))
412                }
413            }
414            RawSymbolToken::Text(text) => Ok(Symbol::owned(text)),
415        }
416    }
417
418    // The Reader needs to expose many of the same functions as the Cursor, but only some of those
419    // need to be re-defined to allow for system value processing. Any method listed here will be
420    // delegated to self.raw_reader directly.
421    delegate! {
422        to self.raw_reader {
423            fn is_null(&self) -> bool;
424            fn ion_version(&self) -> (u8, u8);
425            fn ion_type(&self) -> Option<IonType>;
426            fn read_null(&mut self) -> IonResult<IonType>;
427            fn read_bool(&mut self) -> IonResult<bool>;
428            fn read_int(&mut self) -> IonResult<Int>;
429            fn read_i64(&mut self) -> IonResult<i64>;
430            fn read_f32(&mut self) -> IonResult<f32>;
431            fn read_f64(&mut self) -> IonResult<f64>;
432            fn read_decimal(&mut self) -> IonResult<Decimal>;
433            fn read_string(&mut self) -> IonResult<Str>;
434            fn read_str(&mut self) -> IonResult<&str>;
435            fn read_blob(&mut self) -> IonResult<Blob>;
436            fn read_clob(&mut self) -> IonResult<Clob>;
437            fn read_timestamp(&mut self) -> IonResult<Timestamp>;
438            fn step_in(&mut self) -> IonResult<()>;
439            fn step_out(&mut self) -> IonResult<()>;
440            fn parent_type(&self) -> Option<IonType>;
441            fn depth(&self) -> usize;
442        }
443    }
444}
445
/// Functionality that is only available if the data source we're reading from is in-memory, like
/// a `Vec<u8>` or `&[u8]`.
///
/// Every method below is forwarded unchanged to the wrapped [BlockingRawBinaryReader].
impl<T: AsRef<[u8]>> UserReader<BlockingRawBinaryReader<io::Cursor<T>>> {
    delegate! {
        to self.raw_reader {
            // Byte-slice accessors. NOTE(review): presumably these describe the value the
            // reader is currently positioned on and return `None` when the requested
            // component is absent — confirm against `BlockingRawBinaryReader`.
            pub fn raw_bytes(&self) -> Option<&[u8]>;
            pub fn raw_field_id_bytes(&self) -> Option<&[u8]>;
            pub fn raw_header_bytes(&self) -> Option<&[u8]>;
            pub fn raw_value_bytes(&self) -> Option<&[u8]>;
            pub fn raw_annotations_bytes(&self) -> Option<&[u8]>;

            // Length/offset/range of the field ID; optional.
            pub fn field_id_length(&self) -> Option<usize>;
            pub fn field_id_offset(&self) -> Option<usize>;
            pub fn field_id_range(&self) -> Option<Range<usize>>;

            // Length/offset/range of the annotations; optional.
            pub fn annotations_length(&self) -> Option<usize>;
            pub fn annotations_offset(&self) -> Option<usize>;
            pub fn annotations_range(&self) -> Option<Range<usize>>;

            // Length/offset/range of the header; not optional.
            pub fn header_length(&self) -> usize;
            pub fn header_offset(&self) -> usize;
            pub fn header_range(&self) -> Range<usize>;

            // Length/offset/range of the value; not optional.
            pub fn value_length(&self) -> usize;
            pub fn value_offset(&self) -> usize;
            pub fn value_range(&self) -> Range<usize>;
        }
    }
}
475
#[cfg(test)]
mod tests {
    use std::io;

    use super::*;
    use crate::binary::constants::v1_0::IVM;
    use crate::BlockingRawBinaryReader;

    use crate::result::IonResult;
    use crate::types::IonType;
    use crate::StreamItem::Value;

    type TestDataSource = io::Cursor<Vec<u8>>;

    // Create a growable byte vector that starts with the Ion 1.0 version marker
    fn ion_data(bytes: &[u8]) -> Vec<u8> {
        let mut data = Vec::new();
        data.extend_from_slice(&IVM);
        data.extend_from_slice(bytes);
        data
    }

    // Prepends an Ion 1.0 IVM to the provided data and then creates an io::Cursor over it
    fn data_source_for(bytes: &[u8]) -> TestDataSource {
        let data = ion_data(bytes);
        io::Cursor::new(data)
    }

    // Prepends an Ion 1.0 IVM to the provided data, creates a BlockingRawBinaryReader over it,
    // and advances the reader past the IVM before returning it
    fn raw_binary_reader_for(bytes: &[u8]) -> BlockingRawBinaryReader<TestDataSource> {
        use RawStreamItem::*;
        let mut raw_reader =
            BlockingRawBinaryReader::new(data_source_for(bytes)).expect("unable to create reader");
        assert_eq!(raw_reader.ion_type(), None);
        assert_eq!(raw_reader.next(), Ok(VersionMarker(1, 0)));
        assert_eq!(raw_reader.ion_version(), (1u8, 0u8));
        raw_reader
    }

    // Prepends an Ion 1.0 IVM to the provided data and builds a Reader over it via ReaderBuilder
    fn ion_reader_for(bytes: &[u8]) -> Reader {
        ReaderBuilder::new().build(ion_data(bytes)).unwrap()
    }

    const EXAMPLE_STREAM: &[u8] = &[
        // $ion_symbol_table::{symbols: ["foo", "bar", "baz"]}
        0xEE, // Var len annotations
        0x92, // Annotations + Value length: 18 bytes
        0x81, // Annotations length: 1
        0x83, // Annotation 3 ('$ion_symbol_table')
        0xDE, // Var len struct
        0x8E, // Length: 14 bytes
        0x87, // Field ID 7 ('symbols')
        0xBC, // 12-byte List
        0x83, 0x66, 0x6f, 0x6f, // "foo"
        0x83, 0x62, 0x61, 0x72, // "bar"
        0x83, 0x62, 0x61, 0x7a, // "baz"
        // System: {$10: 1, $11: 2, $12: 3}
        // User: {foo: 1, bar: 2, baz: 3}
        0xD9, // 9-byte struct
        0x8A, // Field ID 10
        0x21, 0x01, // Integer 1
        0x8B, // Field ID 11
        0x21, 0x02, // Integer 2
        0x8C, // Field ID 12
        0x21, 0x03, // Integer 3
    ];

    // The symbol table struct at the start of EXAMPLE_STREAM must be consumed silently, and
    // the field IDs of the following struct must resolve to the text it defined.
    #[test]
    fn test_read_struct() -> IonResult<()> {
        let mut reader = ion_reader_for(EXAMPLE_STREAM);

        assert_eq!(Value(IonType::Struct), reader.next()?);
        reader.step_in()?;

        assert_eq!(reader.next()?, Value(IonType::Int));
        assert_eq!(reader.field_name()?, "foo");

        assert_eq!(reader.next()?, Value(IonType::Int));
        assert_eq!(reader.field_name()?, "bar");

        assert_eq!(reader.next()?, Value(IonType::Int));
        assert_eq!(reader.field_name()?, "baz");

        Ok(())
    }
}
561}