apache_avro/lib.rs
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! A library for working with [Apache Avro](https://avro.apache.org/) in Rust.
//!
//! Please check our [documentation](https://docs.rs/apache-avro) for examples, tutorials and API reference.
//!
//! **[Apache Avro](https://avro.apache.org/)** is a data serialization system which provides rich
//! data structures and a compact, fast, binary data format.
//!
//! All data in Avro is schematized, as in the following example:
//!
//! ```json
//! {
//!     "type": "record",
//!     "name": "test",
//!     "fields": [
//!         {"name": "a", "type": "long", "default": 42},
//!         {"name": "b", "type": "string"}
//!     ]
//! }
//! ```
//!
//! There are basically two ways of handling Avro data in Rust:
//!
//! * **as Avro-specialized data types** based on an Avro schema;
//! * **as generic Rust serde-compatible types** implementing/deriving `Serialize` and
//!   `Deserialize`;
//!
//! **apache-avro** provides a way to read and write both these data representations easily and
//! efficiently.
//!
//! # Installing the library
//!
//! Add to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! apache-avro = "x.y"
//! ```
//!
//! Or in case you want to leverage the **Snappy** codec:
//!
//! ```toml
//! [dependencies.apache-avro]
//! version = "x.y"
//! features = ["snappy"]
//! ```
//!
//! Or in case you want to leverage the **Zstandard** codec:
//!
//! ```toml
//! [dependencies.apache-avro]
//! version = "x.y"
//! features = ["zstandard"]
//! ```
//!
//! Or in case you want to leverage the **Bzip2** codec:
//!
//! ```toml
//! [dependencies.apache-avro]
//! version = "x.y"
//! features = ["bzip"]
//! ```
//!
//! Or in case you want to leverage the **Xz** codec:
//!
//! ```toml
//! [dependencies.apache-avro]
//! version = "x.y"
//! features = ["xz"]
//! ```
//!
//! # Upgrading to a newer minor version
//!
//! The library is still in beta, so there might be backward-incompatible changes between minor
//! versions. If you have trouble upgrading, check the [version upgrade guide](https://github.com/apache/avro/blob/main/lang/rust/migration_guide.md).
//!
//! # Defining a schema
//!
//! Avro data cannot exist without an Avro schema. Schemas **must** be used while writing and
//! **can** be used while reading; they carry the information regarding the type of data we are
//! handling. Avro schemas are used for both schema validation and resolution of Avro data.
//!
//! Avro schemas are defined in **JSON** format and can just be parsed out of a raw string:
//!
//! ```
//! use apache_avro::Schema;
//!
//! let raw_schema = r#"
//!     {
//!         "type": "record",
//!         "name": "test",
//!         "fields": [
//!             {"name": "a", "type": "long", "default": 42},
//!             {"name": "b", "type": "string"}
//!         ]
//!     }
//! "#;
//!
//! // if the schema is not valid, this function will return an error
//! let schema = Schema::parse_str(raw_schema).unwrap();
//!
//! // schemas can be printed for debugging
//! println!("{:?}", schema);
//! ```
//!
//! Additionally, a list of definitions (which may depend on each other) can be given and all of
//! them will be parsed into the corresponding schemas.
//!
//! ```
//! use apache_avro::Schema;
//!
//! let raw_schema_1 = r#"{
//!     "name": "A",
//!     "type": "record",
//!     "fields": [
//!         {"name": "field_one", "type": "float"}
//!     ]
//! }"#;
//!
//! // This definition depends on the definition of A above
//! let raw_schema_2 = r#"{
//!     "name": "B",
//!     "type": "record",
//!     "fields": [
//!         {"name": "field_one", "type": "A"}
//!     ]
//! }"#;
//!
//! // if the schemas are not valid, this function will return an error
//! let schemas = Schema::parse_list(&[raw_schema_1, raw_schema_2]).unwrap();
//!
//! // schemas can be printed for debugging
//! println!("{:?}", schemas);
//! ```
//!
//! *N.B.* It is important to note that the composition of schema definitions requires schemas with names.
//! For this reason, only schemas of type Record, Enum, and Fixed should be input into this function.
//!
//! The library also provides a programmatic interface to define schemas without encoding them in
//! JSON (for advanced use), but we highly recommend the JSON interface. Please read the API
//! reference in case you are interested.
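//!
//! For instance, primitive schemas are plain variants of the `Schema` enum; a minimal sketch
//! (see the `schema` module for the composite variants):
//!
//! ```
//! use apache_avro::Schema;
//!
//! // equivalent to parsing the JSON document `"boolean"`
//! let schema = Schema::Boolean;
//! assert_eq!(schema.canonical_form(), r#""boolean""#);
//! ```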
//!
//! For more information about schemas and what kind of information you can encapsulate in them,
//! please refer to the appropriate section of the
//! [Avro Specification](https://avro.apache.org/docs/current/spec.html#schemas).
//!
//! # Writing data
//!
//! Once we have defined a schema, we are ready to serialize data in Avro, validating it against
//! the provided schema in the process. As mentioned before, there are two ways of handling Avro
//! data in Rust.
//!
//! **NOTE:** The library also provides a low-level interface for encoding a single datum in Avro
//! bytecode without generating markers and headers (for advanced use), but we highly recommend
//! using the `Writer` interface to remain fully Avro-compatible. Please read the API reference in
//! case you are interested.
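//!
//! For example, a single datum can be encoded to raw Avro bytes with `to_avro_datum`
//! (re-exported at the crate root); a minimal sketch:
//!
//! ```
//! use apache_avro::{to_avro_datum, types::Record, Schema};
//! #
//! # let raw_schema = r#"
//! # {
//! #     "type": "record",
//! #     "name": "test",
//! #     "fields": [
//! #         {"name": "a", "type": "long", "default": 42},
//! #         {"name": "b", "type": "string"}
//! #     ]
//! # }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! let mut record = Record::new(&schema).unwrap();
//! record.put("a", 27i64);
//! record.put("b", "foo");
//!
//! // encodes just the datum: no object container header and no sync markers
//! let bytes = to_avro_datum(&schema, record).unwrap();
//! ```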
//!
//! ## The avro way
//!
//! Given that the schema we defined above is that of an Avro *Record*, we are going to use the
//! associated type provided by the library to specify the data we want to serialize:
//!
//! ```
//! # use apache_avro::Schema;
//! use apache_avro::types::Record;
//! use apache_avro::Writer;
//! #
//! # let raw_schema = r#"
//! # {
//! #     "type": "record",
//! #     "name": "test",
//! #     "fields": [
//! #         {"name": "a", "type": "long", "default": 42},
//! #         {"name": "b", "type": "string"}
//! #     ]
//! # }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! // a writer needs a schema and something to write to
//! let mut writer = Writer::new(&schema, Vec::new());
//!
//! // the Record type models our Record schema
//! let mut record = Record::new(writer.schema()).unwrap();
//! record.put("a", 27i64);
//! record.put("b", "foo");
//!
//! // schema validation happens here
//! writer.append(record).unwrap();
//!
//! // this is how to get back the resulting avro bytecode
//! // this performs a flush operation to make sure data has been written, so it can fail
//! // you can also call `writer.flush()` yourself without consuming the writer
//! let encoded = writer.into_inner().unwrap();
//! ```
//!
//! Most of the time, schemas tend to define a record as a top-level container
//! encapsulating all the values to convert as fields and providing documentation for them, but in
//! case we want to directly define an Avro value, the library offers that capability via the
//! `Value` interface.
//!
//! ```
//! use apache_avro::types::Value;
//!
//! let mut value = Value::String("foo".to_string());
//! ```
//!
//! ## The serde way
//!
//! Given that the schema we defined above is an Avro *Record*, we can directly use a Rust struct
//! deriving `Serialize` to model our data:
//!
//! ```
//! # use apache_avro::Schema;
//! # use serde::Serialize;
//! use apache_avro::Writer;
//!
//! #[derive(Debug, Serialize)]
//! struct Test {
//!     a: i64,
//!     b: String,
//! }
//!
//! # let raw_schema = r#"
//! # {
//! #     "type": "record",
//! #     "name": "test",
//! #     "fields": [
//! #         {"name": "a", "type": "long", "default": 42},
//! #         {"name": "b", "type": "string"}
//! #     ]
//! # }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! // a writer needs a schema and something to write to
//! let mut writer = Writer::new(&schema, Vec::new());
//!
//! // the structure models our Record schema
//! let test = Test {
//!     a: 27,
//!     b: "foo".to_owned(),
//! };
//!
//! // schema validation happens here
//! writer.append_ser(test).unwrap();
//!
//! // this is how to get back the resulting avro bytecode
//! // this performs a flush operation to make sure data has been written, so it can fail
//! // you can also call `writer.flush()` yourself without consuming the writer
//! let encoded = writer.into_inner().unwrap();
//! ```
//!
//! Most of the time, schemas tend to define a record as a top-level container
//! encapsulating all the values to convert as fields and providing documentation for them, but in
//! case we want to directly define an Avro value, any type implementing `Serialize` should work.
//!
//! ```
//! let mut value = "foo".to_string();
//! ```
//!
//! ## Using codecs to compress data
//!
//! Avro supports the following compression codecs when encoding data:
//!
//! * **Null**: leaves data uncompressed;
//! * **Deflate**: writes the data block using the deflate algorithm as specified in RFC 1951, and
//!   typically implemented using the zlib library. Note that this format (unlike the "zlib format" in
//!   RFC 1950) does not have a checksum.
//! * **Snappy**: uses Google's [Snappy](http://google.github.io/snappy/) compression library. Each
//!   compressed block is followed by the 4-byte, big-endian CRC32 checksum of the uncompressed data in
//!   the block. You must enable the `snappy` feature to use this codec.
//! * **Zstandard**: uses Facebook's [Zstandard](https://facebook.github.io/zstd/) compression library.
//!   You must enable the `zstandard` feature to use this codec.
//! * **Bzip2**: uses the [BZip2](https://sourceware.org/bzip2/) compression library.
//!   You must enable the `bzip` feature to use this codec.
//! * **Xz**: uses the [xz2](https://github.com/alexcrichton/xz2-rs) compression library.
//!   You must enable the `xz` feature to use this codec.
//!
//! To specify a codec to use to compress data, just specify it while creating a `Writer`:
//!
//! ```
//! # use apache_avro::Schema;
//! use apache_avro::Writer;
//! use apache_avro::Codec;
//! #
//! # let raw_schema = r#"
//! # {
//! #     "type": "record",
//! #     "name": "test",
//! #     "fields": [
//! #         {"name": "a", "type": "long", "default": 42},
//! #         {"name": "b", "type": "string"}
//! #     ]
//! # }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Deflate);
//! ```
//!
//! # Reading data
//!
//! As far as reading Avro encoded data goes, we can just use the schema encoded with the data to
//! read it. The library will do this automatically for us, as it already does for the compression
//! codec:
//!
//! ```
//! use apache_avro::Reader;
//! # use apache_avro::Schema;
//! # use apache_avro::types::Record;
//! # use apache_avro::Writer;
//! #
//! # let raw_schema = r#"
//! # {
//! #     "type": "record",
//! #     "name": "test",
//! #     "fields": [
//! #         {"name": "a", "type": "long", "default": 42},
//! #         {"name": "b", "type": "string"}
//! #     ]
//! # }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! # let mut writer = Writer::new(&schema, Vec::new());
//! # let mut record = Record::new(writer.schema()).unwrap();
//! # record.put("a", 27i64);
//! # record.put("b", "foo");
//! # writer.append(record).unwrap();
//! # let input = writer.into_inner().unwrap();
//! // reader creation can fail in case the input to read from is not Avro-compatible or malformed
//! let reader = Reader::new(&input[..]).unwrap();
//! ```
//!
//! If, instead, we want to specify a different (but compatible) reader schema from the one
//! the data has been written with, we can do the following:
//!
//! ```
//! use apache_avro::Schema;
//! use apache_avro::Reader;
//! # use apache_avro::types::Record;
//! # use apache_avro::Writer;
//! #
//! # let writer_raw_schema = r#"
//! # {
//! #     "type": "record",
//! #     "name": "test",
//! #     "fields": [
//! #         {"name": "a", "type": "long", "default": 42},
//! #         {"name": "b", "type": "string"}
//! #     ]
//! # }
//! # "#;
//! # let writer_schema = Schema::parse_str(writer_raw_schema).unwrap();
//! # let mut writer = Writer::new(&writer_schema, Vec::new());
//! # let mut record = Record::new(writer.schema()).unwrap();
//! # record.put("a", 27i64);
//! # record.put("b", "foo");
//! # writer.append(record).unwrap();
//! # let input = writer.into_inner().unwrap();
//!
//! let reader_raw_schema = r#"
//!     {
//!         "type": "record",
//!         "name": "test",
//!         "fields": [
//!             {"name": "a", "type": "long", "default": 42},
//!             {"name": "b", "type": "string"},
//!             {"name": "c", "type": "long", "default": 43}
//!         ]
//!     }
//! "#;
//!
//! let reader_schema = Schema::parse_str(reader_raw_schema).unwrap();
//!
//! // reader creation can fail in case the input to read from is not Avro-compatible or malformed
//! let reader = Reader::with_schema(&reader_schema, &input[..]).unwrap();
//! ```
//!
//! The library will also automatically perform schema resolution while reading the data.
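//!
//! Concretely, with the reader schema above, resolution fills in the extra field `c` with its
//! default; a minimal sketch:
//!
//! ```
//! # use apache_avro::{types::{Record, Value}, Reader, Schema, Writer};
//! # let writer_schema = Schema::parse_str(r#"
//! # {
//! #     "type": "record",
//! #     "name": "test",
//! #     "fields": [
//! #         {"name": "a", "type": "long", "default": 42},
//! #         {"name": "b", "type": "string"}
//! #     ]
//! # }
//! # "#).unwrap();
//! # let reader_schema = Schema::parse_str(r#"
//! # {
//! #     "type": "record",
//! #     "name": "test",
//! #     "fields": [
//! #         {"name": "a", "type": "long", "default": 42},
//! #         {"name": "b", "type": "string"},
//! #         {"name": "c", "type": "long", "default": 43}
//! #     ]
//! # }
//! # "#).unwrap();
//! # let mut writer = Writer::new(&writer_schema, Vec::new());
//! # let mut record = Record::new(writer.schema()).unwrap();
//! # record.put("a", 27i64);
//! # record.put("b", "foo");
//! # writer.append(record).unwrap();
//! # let input = writer.into_inner().unwrap();
//! let mut reader = Reader::with_schema(&reader_schema, &input[..]).unwrap();
//!
//! // the field "c" was never written, but comes back with its default value
//! assert_eq!(
//!     reader.next().unwrap().unwrap(),
//!     Value::Record(vec![
//!         ("a".to_string(), Value::Long(27)),
//!         ("b".to_string(), Value::String("foo".to_string())),
//!         ("c".to_string(), Value::Long(43)),
//!     ])
//! );
//! ```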
//!
//! For more information about schema compatibility and resolution, please refer to the
//! [Avro Specification](https://avro.apache.org/docs/current/spec.html#schemas).
//!
//! As usual, there are two ways to handle Avro data in Rust, as you can see below.
//!
//! **NOTE:** The library also provides a low-level interface for decoding a single datum in Avro
//! bytecode without markers and headers (for advanced use), but we highly recommend the `Reader`
//! interface to leverage all Avro features. Please read the API reference in case you are
//! interested.
//!
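//! For example, a single datum written by `to_avro_datum` can be read back with
//! `from_avro_datum`; a minimal sketch:
//!
//! ```
//! use apache_avro::{from_avro_datum, to_avro_datum, types::{Record, Value}, Schema};
//!
//! # let raw_schema = r#"
//! # {
//! #     "type": "record",
//! #     "name": "test",
//! #     "fields": [
//! #         {"name": "a", "type": "long", "default": 42},
//! #         {"name": "b", "type": "string"}
//! #     ]
//! # }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! # let mut record = Record::new(&schema).unwrap();
//! # record.put("a", 27i64);
//! # record.put("b", "foo");
//! # let bytes = to_avro_datum(&schema, record).unwrap();
//! // the writer schema is mandatory; pass `Some(&reader_schema)` to resolve to a different one
//! let value: Value = from_avro_datum(&schema, &mut &bytes[..], None).unwrap();
//! ```
//!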
//! ## The avro way
//!
//! We can just read instances of `Value` directly out of the `Reader` iterator:
//!
//! ```
//! # use apache_avro::Schema;
//! # use apache_avro::types::Record;
//! # use apache_avro::Writer;
//! use apache_avro::Reader;
//! #
//! # let raw_schema = r#"
//! # {
//! #     "type": "record",
//! #     "name": "test",
//! #     "fields": [
//! #         {"name": "a", "type": "long", "default": 42},
//! #         {"name": "b", "type": "string"}
//! #     ]
//! # }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! # let mut writer = Writer::new(&schema, Vec::new());
//! # let mut record = Record::new(writer.schema()).unwrap();
//! # record.put("a", 27i64);
//! # record.put("b", "foo");
//! # writer.append(record).unwrap();
//! # let input = writer.into_inner().unwrap();
//! let reader = Reader::new(&input[..]).unwrap();
//!
//! // value is a Result of an Avro Value in case the read operation fails
//! for value in reader {
//!     println!("{:?}", value.unwrap());
//! }
//! ```
//!
//! ## The serde way
//!
//! Alternatively, we can use a Rust type implementing `Deserialize` and representing our schema to
//! read the data into:
//!
//! ```
//! # use apache_avro::Schema;
//! # use apache_avro::Writer;
//! # use serde::{Deserialize, Serialize};
//! use apache_avro::Reader;
//! use apache_avro::from_value;
//!
//! # #[derive(Serialize)]
//! #[derive(Debug, Deserialize)]
//! struct Test {
//!     a: i64,
//!     b: String,
//! }
//!
//! # let raw_schema = r#"
//! # {
//! #     "type": "record",
//! #     "name": "test",
//! #     "fields": [
//! #         {"name": "a", "type": "long", "default": 42},
//! #         {"name": "b", "type": "string"}
//! #     ]
//! # }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! # let mut writer = Writer::new(&schema, Vec::new());
//! # let test = Test {
//! #     a: 27,
//! #     b: "foo".to_owned(),
//! # };
//! # writer.append_ser(test).unwrap();
//! # let input = writer.into_inner().unwrap();
//! let reader = Reader::new(&input[..]).unwrap();
//!
//! // value is a Result in case the read operation fails
//! for value in reader {
//!     println!("{:?}", from_value::<Test>(&value.unwrap()));
//! }
//! ```
//!
//! # Putting everything together
//!
//! The following is an example of how to combine everything shown so far; it is meant to be a
//! quick reference for the library interface:
//!
//! ```
//! use apache_avro::{Codec, Reader, Schema, Writer, from_value, types::Record, Error};
//! use serde::{Deserialize, Serialize};
//!
//! #[derive(Debug, Deserialize, Serialize)]
//! struct Test {
//!     a: i64,
//!     b: String,
//! }
//!
//! fn main() -> Result<(), Error> {
//!     let raw_schema = r#"
//!         {
//!             "type": "record",
//!             "name": "test",
//!             "fields": [
//!                 {"name": "a", "type": "long", "default": 42},
//!                 {"name": "b", "type": "string"}
//!             ]
//!         }
//!     "#;
//!
//!     let schema = Schema::parse_str(raw_schema)?;
//!
//!     println!("{:?}", schema);
//!
//!     let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Deflate);
//!
//!     let mut record = Record::new(writer.schema()).unwrap();
//!     record.put("a", 27i64);
//!     record.put("b", "foo");
//!
//!     writer.append(record)?;
//!
//!     let test = Test {
//!         a: 27,
//!         b: "foo".to_owned(),
//!     };
//!
//!     writer.append_ser(test)?;
//!
//!     let input = writer.into_inner()?;
//!     let reader = Reader::with_schema(&schema, &input[..])?;
//!
//!     for record in reader {
//!         println!("{:?}", from_value::<Test>(&record?));
//!     }
//!     Ok(())
//! }
//! ```
//!
//! `apache-avro` also supports the logical types listed in the [Avro specification](https://avro.apache.org/docs/current/spec.html#Logical+Types):
//!
//! 1. `Decimal` using the [`num_bigint`](https://docs.rs/num-bigint/latest/num_bigint) crate
//! 1. UUID using the [`uuid`](https://docs.rs/uuid/latest/uuid) crate
//! 1. Date, Time (milli) as `i32` and Time (micro) as `i64`
//! 1. Timestamp (milli and micro) as `i64`
//! 1. Local timestamp (milli and micro) as `i64`
//! 1. Duration as a custom type with `months`, `days` and `millis` accessor methods each of which returns an `i32`
//!
//! Note that the on-disk representation is identical to the underlying primitive/complex type.
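//!
//! For instance, a `date` value is written as a plain `int`; a minimal sketch of verifying that
//! with the low-level `to_avro_datum`:
//!
//! ```
//! use apache_avro::{to_avro_datum, types::Value, Schema};
//!
//! let date_schema = Schema::parse_str(r#"{"type": "int", "logicalType": "date"}"#).unwrap();
//! let int_schema = Schema::parse_str(r#""int""#).unwrap();
//!
//! // both encode to the same zig-zag varint bytes
//! assert_eq!(
//!     to_avro_datum(&date_schema, Value::Date(1)).unwrap(),
//!     to_avro_datum(&int_schema, Value::Int(1)).unwrap(),
//! );
//! ```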
//!
//! ### Read and write logical types
//!
//! ```rust
//! use apache_avro::{
//!     types::Record, types::Value, Codec, Days, Decimal, Duration, Millis, Months, Reader, Schema,
//!     Writer, Error,
//! };
//! use num_bigint::ToBigInt;
//!
//! fn main() -> Result<(), Error> {
//!     let raw_schema = r#"
//!     {
//!       "type": "record",
//!       "name": "test",
//!       "fields": [
//!         {
//!           "name": "decimal_fixed",
//!           "type": {
//!             "type": "fixed",
//!             "size": 2,
//!             "name": "decimal"
//!           },
//!           "logicalType": "decimal",
//!           "precision": 4,
//!           "scale": 2
//!         },
//!         {
//!           "name": "decimal_var",
//!           "type": "bytes",
//!           "logicalType": "decimal",
//!           "precision": 10,
//!           "scale": 3
//!         },
//!         {
//!           "name": "uuid",
//!           "type": "string",
//!           "logicalType": "uuid"
//!         },
//!         {
//!           "name": "date",
//!           "type": "int",
//!           "logicalType": "date"
//!         },
//!         {
//!           "name": "time_millis",
//!           "type": "int",
//!           "logicalType": "time-millis"
//!         },
//!         {
//!           "name": "time_micros",
//!           "type": "long",
//!           "logicalType": "time-micros"
//!         },
//!         {
//!           "name": "timestamp_millis",
//!           "type": "long",
//!           "logicalType": "timestamp-millis"
//!         },
//!         {
//!           "name": "timestamp_micros",
//!           "type": "long",
//!           "logicalType": "timestamp-micros"
//!         },
//!         {
//!           "name": "local_timestamp_millis",
//!           "type": "long",
//!           "logicalType": "local-timestamp-millis"
//!         },
//!         {
//!           "name": "local_timestamp_micros",
//!           "type": "long",
//!           "logicalType": "local-timestamp-micros"
//!         },
//!         {
//!           "name": "duration",
//!           "type": {
//!             "type": "fixed",
//!             "size": 12,
//!             "name": "duration"
//!           },
//!           "logicalType": "duration"
//!         }
//!       ]
//!     }
//!     "#;
//!
//!     let schema = Schema::parse_str(raw_schema)?;
//!
//!     println!("{:?}", schema);
//!
//!     let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Deflate);
//!
//!     let mut record = Record::new(writer.schema()).unwrap();
//!     record.put("decimal_fixed", Decimal::from(9936.to_bigint().unwrap().to_signed_bytes_be()));
//!     record.put("decimal_var", Decimal::from((-32442.to_bigint().unwrap()).to_signed_bytes_be()));
//!     record.put("uuid", uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000").unwrap());
//!     record.put("date", Value::Date(1));
//!     record.put("time_millis", Value::TimeMillis(2));
//!     record.put("time_micros", Value::TimeMicros(3));
//!     record.put("timestamp_millis", Value::TimestampMillis(4));
//!     record.put("timestamp_micros", Value::TimestampMicros(5));
//!     record.put("local_timestamp_millis", Value::LocalTimestampMillis(4));
//!     record.put("local_timestamp_micros", Value::LocalTimestampMicros(5));
//!     record.put("duration", Duration::new(Months::new(6), Days::new(7), Millis::new(8)));
//!
//!     writer.append(record)?;
//!
//!     let input = writer.into_inner()?;
//!     let reader = Reader::with_schema(&schema, &input[..])?;
//!
//!     for record in reader {
//!         println!("{:?}", record?);
//!     }
//!     Ok(())
//! }
//! ```
//!
//! ## Calculate Avro schema fingerprint
//!
//! This library supports calculating the following fingerprints:
//!
//! - SHA-256
//! - MD5
//! - Rabin
//!
//! An example of fingerprinting with each of the supported algorithms:
//!
//! ```rust
//! use apache_avro::rabin::Rabin;
//! use apache_avro::{Schema, Error};
//! use md5::Md5;
//! use sha2::Sha256;
//!
//! fn main() -> Result<(), Error> {
//!     let raw_schema = r#"
//!         {
//!             "type": "record",
//!             "name": "test",
//!             "fields": [
//!                 {"name": "a", "type": "long", "default": 42},
//!                 {"name": "b", "type": "string"}
//!             ]
//!         }
//!     "#;
//!     let schema = Schema::parse_str(raw_schema)?;
//!     println!("{}", schema.fingerprint::<Sha256>());
//!     println!("{}", schema.fingerprint::<Md5>());
//!     println!("{}", schema.fingerprint::<Rabin>());
//!     Ok(())
//! }
//! ```
//!
//! ## Ill-formed data
//!
//! In order to ease decoding, the Binary Encoding specification of Avro data
//! requires some fields to have their length encoded alongside the data.
//!
//! If encoded data passed to a `Reader` is ill-formed, it can happen that
//! the bytes meant to contain the length of data are bogus and could result
//! in extravagant memory allocation.
//!
//! To shield users from ill-formed data, `apache-avro` sets a limit (default: 512MB)
//! on any allocation it will perform when decoding data.
//!
//! If you expect some of your data fields to be larger than this limit, be sure
//! to make use of the `max_allocation_bytes` function before reading **any** data
//! (we leverage Rust's [`std::sync::Once`](https://doc.rust-lang.org/std/sync/struct.Once.html)
//! mechanism to initialize this value; if any call to decode is made before a call to
//! `max_allocation_bytes`, the limit will be 512MB throughout the lifetime of the program).
//!
//! ```rust
//! use apache_avro::max_allocation_bytes;
//!
//! max_allocation_bytes(2 * 1024 * 1024 * 1024); // 2GB
//!
//! // ... happily decode large data
//! ```
//!
//! ## Check schema compatibility
//!
//! This library supports checking for schema compatibility.
//!
//! Examples of checking for compatibility:
//!
//! 1. Compatible schemas
//!
//! Explanation: an int array schema can be read by a long array schema: an int
//! (32-bit signed integer) fits into a long (64-bit signed integer)
//!
//! ```rust
//! use apache_avro::{Schema, schema_compatibility::SchemaCompatibility};
//!
//! let writers_schema = Schema::parse_str(r#"{"type": "array", "items":"int"}"#).unwrap();
//! let readers_schema = Schema::parse_str(r#"{"type": "array", "items":"long"}"#).unwrap();
//! assert!(SchemaCompatibility::can_read(&writers_schema, &readers_schema).is_ok());
//! ```
//!
//! 2. Incompatible schemas (a long array schema cannot be read by an int array schema)
//!
//! Explanation: a long array schema cannot be read by an int array schema: a
//! long (64-bit signed integer) does not fit into an int (32-bit signed integer)
//!
//! ```rust
//! use apache_avro::{Schema, schema_compatibility::SchemaCompatibility};
//!
//! let writers_schema = Schema::parse_str(r#"{"type": "array", "items":"long"}"#).unwrap();
//! let readers_schema = Schema::parse_str(r#"{"type": "array", "items":"int"}"#).unwrap();
//! assert!(SchemaCompatibility::can_read(&writers_schema, &readers_schema).is_err());
//! ```
//!
//! ## Custom name validators
//!
//! By default the library follows the naming rules of the
//! [Avro specification](https://avro.apache.org/docs/1.11.1/specification/#names)!
//!
//! Some of the other Apache Avro language SDKs are not that strict and allow more
//! characters in names. For interoperability with those SDKs, the library provides
//! a way to customize the name validation.
//!
//! ```rust
//! use apache_avro::AvroResult;
//! use apache_avro::schema::Namespace;
//! use apache_avro::validator::{SchemaNameValidator, set_schema_name_validator};
//!
//! struct MyCustomValidator;
//!
//! impl SchemaNameValidator for MyCustomValidator {
//!     fn validate(&self, name: &str) -> AvroResult<(String, Namespace)> {
//!         todo!()
//!     }
//! }
//!
//! // don't parse any schema before registering the custom validator(s)!
//!
//! set_schema_name_validator(Box::new(MyCustomValidator));
//!
//! // ... use the library
//! ```
//!
//! Similar logic can be applied to the validation of schema namespaces, enum symbols and field names.
//!
//! **Note**: the library allows setting a validator only once per application lifetime!
//! If the application parses schemas before setting a validator, the default validator will be
//! registered and used!
//!
//! ## Custom schema equality comparators
//!
//! The library provides two implementations of schema equality comparators:
//! 1. `SpecificationEq` - a comparator that serializes the schemas to their
//!    canonical forms (i.e. JSON) and compares them as strings. It was the only implementation
//!    until apache_avro 0.16.0.
//!    See the [Avro specification](https://avro.apache.org/docs/1.11.1/specification/#parsing-canonical-form-for-schemas)
//!    for more information!
//! 2. `StructFieldEq` - a comparator that compares the schemas structurally.
//!    It is faster than `SpecificationEq` because it returns `false` as soon as a difference
//!    is found, and is recommended for use!
//!    It has been the default comparator since apache_avro 0.17.0.
//!
//! To use a custom comparator, you need to implement the `SchemataEq` trait and set it using the
//! `set_schemata_equality_comparator` function:
//!
//! ```rust
//! use apache_avro::{AvroResult, Schema};
//! use apache_avro::schema_equality::{SchemataEq, set_schemata_equality_comparator};
//!
//! #[derive(Debug)]
//! struct MyCustomSchemataEq;
//!
//! impl SchemataEq for MyCustomSchemataEq {
//!     fn compare(&self, schema_one: &Schema, schema_two: &Schema) -> bool {
//!         todo!()
//!     }
//! }
//!
//! // don't parse any schema before registering the custom comparator!
//!
//! set_schemata_equality_comparator(Box::new(MyCustomSchemataEq));
//!
//! // ... use the library
//! ```
//!
//! **Note**: the library allows setting a comparator only once per application lifetime!
//! If the application parses schemas before setting a comparator, the default comparator will be
//! registered and used!
//!

mod bigdecimal;
mod bytes;
mod codec;
mod de;
mod decimal;
mod decode;
mod duration;
mod encode;
mod error;
mod reader;
mod ser;
mod util;
mod writer;

pub mod rabin;
pub mod schema;
pub mod schema_compatibility;
pub mod schema_equality;
pub mod types;
pub mod validator;

pub use crate::{
    bigdecimal::BigDecimal,
    bytes::{
        serde_avro_bytes, serde_avro_bytes_opt, serde_avro_fixed, serde_avro_fixed_opt,
        serde_avro_slice, serde_avro_slice_opt,
    },
};
pub use codec::Codec;
pub use de::from_value;
pub use decimal::Decimal;
pub use duration::{Days, Duration, Millis, Months};
pub use error::Error;
pub use reader::{
    from_avro_datum, from_avro_datum_schemata, read_marker, GenericSingleObjectReader, Reader,
    SpecificSingleObjectReader,
};
pub use schema::{AvroSchema, Schema};
pub use ser::to_value;
pub use util::{max_allocation_bytes, set_serde_human_readable};
pub use uuid::Uuid;
pub use writer::{
    to_avro_datum, to_avro_datum_schemata, GenericSingleObjectWriter, SpecificSingleObjectWriter,
    Writer,
};

#[cfg(feature = "derive")]
pub use apache_avro_derive::*;

#[macro_use]
extern crate log;

/// A convenience type alias for `Result`s with `Error`s.
pub type AvroResult<T> = Result<T, Error>;

#[cfg(test)]
mod tests {
    use crate::{
        from_avro_datum,
        types::{Record, Value},
        Codec, Reader, Schema, Writer,
    };
    use pretty_assertions::assert_eq;

    //TODO: move where it fits better
    #[test]
    fn test_enum_default() {
        let writer_raw_schema = r#"
            {
                "type": "record",
                "name": "test",
                "fields": [
                    {"name": "a", "type": "long", "default": 42},
                    {"name": "b", "type": "string"}
                ]
            }
        "#;
        let reader_raw_schema = r#"
            {
                "type": "record",
                "name": "test",
                "fields": [
                    {"name": "a", "type": "long", "default": 42},
                    {"name": "b", "type": "string"},
                    {
                        "name": "c",
                        "type": {
                            "type": "enum",
                            "name": "suit",
                            "symbols": ["diamonds", "spades", "clubs", "hearts"]
                        },
                        "default": "spades"
                    }
                ]
            }
        "#;
        let writer_schema = Schema::parse_str(writer_raw_schema).unwrap();
        let reader_schema = Schema::parse_str(reader_raw_schema).unwrap();
        let mut writer = Writer::with_codec(&writer_schema, Vec::new(), Codec::Null);
        let mut record = Record::new(writer.schema()).unwrap();
        record.put("a", 27i64);
        record.put("b", "foo");
        writer.append(record).unwrap();
        let input = writer.into_inner().unwrap();
        let mut reader = Reader::with_schema(&reader_schema, &input[..]).unwrap();
        assert_eq!(
            reader.next().unwrap().unwrap(),
            Value::Record(vec![
                ("a".to_string(), Value::Long(27)),
                ("b".to_string(), Value::String("foo".to_string())),
                ("c".to_string(), Value::Enum(1, "spades".to_string())),
            ])
        );
        assert!(reader.next().is_none());
    }

    //TODO: move where it fits better
    #[test]
    fn test_enum_string_value() {
        let raw_schema = r#"
            {
                "type": "record",
                "name": "test",
                "fields": [
                    {"name": "a", "type": "long", "default": 42},
                    {"name": "b", "type": "string"},
                    {
                        "name": "c",
                        "type": {
                            "type": "enum",
                            "name": "suit",
                            "symbols": ["diamonds", "spades", "clubs", "hearts"]
                        },
                        "default": "spades"
                    }
                ]
            }
        "#;
        let schema = Schema::parse_str(raw_schema).unwrap();
        let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Null);
        let mut record = Record::new(writer.schema()).unwrap();
        record.put("a", 27i64);
        record.put("b", "foo");
        record.put("c", "clubs");
        writer.append(record).unwrap();
        let input = writer.into_inner().unwrap();
        let mut reader = Reader::with_schema(&schema, &input[..]).unwrap();
        assert_eq!(
            reader.next().unwrap().unwrap(),
            Value::Record(vec![
                ("a".to_string(), Value::Long(27)),
                ("b".to_string(), Value::String("foo".to_string())),
                ("c".to_string(), Value::Enum(2, "clubs".to_string())),
            ])
        );
        assert!(reader.next().is_none());
    }

    //TODO: move where it fits better
    #[test]
    fn test_enum_no_reader_schema() {
        let writer_raw_schema = r#"
            {
                "type": "record",
                "name": "test",
                "fields": [
                    {"name": "a", "type": "long", "default": 42},
                    {"name": "b", "type": "string"},
                    {
                        "name": "c",
                        "type": {
                            "type": "enum",
                            "name": "suit",
                            "symbols": ["diamonds", "spades", "clubs", "hearts"]
                        },
                        "default": "spades"
                    }
                ]
            }
        "#;
        let writer_schema = Schema::parse_str(writer_raw_schema).unwrap();
        let mut writer = Writer::with_codec(&writer_schema, Vec::new(), Codec::Null);
        let mut record = Record::new(writer.schema()).unwrap();
        record.put("a", 27i64);
        record.put("b", "foo");
        record.put("c", "clubs");
        writer.append(record).unwrap();
        let input = writer.into_inner().unwrap();
        let mut reader = Reader::new(&input[..]).unwrap();
        assert_eq!(
            reader.next().unwrap().unwrap(),
            Value::Record(vec![
                ("a".to_string(), Value::Long(27)),
                ("b".to_string(), Value::String("foo".to_string())),
                ("c".to_string(), Value::Enum(2, "clubs".to_string())),
            ])
        );
    }

    #[test]
    fn test_illformed_length() {
        let raw_schema = r#"
            {
                "type": "record",
                "name": "test",
                "fields": [
                    {"name": "a", "type": "long", "default": 42},
                    {"name": "b", "type": "string"}
                ]
            }
        "#;

        let schema = Schema::parse_str(raw_schema).unwrap();

        // Would allocate 18446744073709551605 bytes
        let illformed: &[u8] = &[0x3e, 0x15, 0xff, 0x1f, 0x15, 0xff];

        let value = from_avro_datum(&schema, &mut &*illformed, None);
        assert!(value.is_err());
    }
}
1063}