apache_avro/
lib.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! A library for working with [Apache Avro](https://avro.apache.org/) in Rust.
//!
//! Please check our [documentation](https://docs.rs/apache-avro) for examples, tutorials and API reference.
//!
//! **[Apache Avro](https://avro.apache.org/)** is a data serialization system which provides rich
//! data structures and a compact, fast, binary data format.
//!
//! All data in Avro is schematized, as in the following example:
//!
//! ```json
//! {
//!     "type": "record",
//!     "name": "test",
//!     "fields": [
//!         {"name": "a", "type": "long", "default": 42},
//!         {"name": "b", "type": "string"}
//!     ]
//! }
//! ```
//!
//! There are two ways of handling Avro data in Rust:
//!
//! * **as Avro-specialized data types** based on an Avro schema;
//! * **as generic Rust serde-compatible types** implementing/deriving `Serialize` and
//!   `Deserialize`;
//!
//! **apache-avro** provides a way to read and write both these data representations easily and
//! efficiently.
//!
//! # Installing the library
//!
//! Add to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! apache-avro = "x.y"
//! ```
//!
//! Or in case you want to leverage the **Snappy** codec:
//!
//! ```toml
//! [dependencies.apache-avro]
//! version = "x.y"
//! features = ["snappy"]
//! ```
//!
//! Or in case you want to leverage the **Zstandard** codec:
//!
//! ```toml
//! [dependencies.apache-avro]
//! version = "x.y"
//! features = ["zstandard"]
//! ```
//!
//! Or in case you want to leverage the **Bzip2** codec:
//!
//! ```toml
//! [dependencies.apache-avro]
//! version = "x.y"
//! features = ["bzip"]
//! ```
//!
//! Or in case you want to leverage the **Xz** codec:
//!
//! ```toml
//! [dependencies.apache-avro]
//! version = "x.y"
//! features = ["xz"]
//! ```
//!
//!
//! # Upgrading to a newer minor version
//!
//! The library is still in beta, so there might be backward-incompatible changes between minor
//! versions. If you have trouble upgrading, check the [version upgrade guide](https://github.com/apache/avro/blob/main/lang/rust/migration_guide.md).
//!
//! # Defining a schema
//!
//! Avro data cannot exist without an Avro schema. Schemas **must** be used while writing and
//! **can** be used while reading; they carry the information regarding the type of data we are
//! handling. Avro schemas are used for both schema validation and resolution of Avro data.
//!
//! Avro schemas are defined in **JSON** format and can just be parsed out of a raw string:
//!
//! ```
//! use apache_avro::Schema;
//!
//! let raw_schema = r#"
//!     {
//!         "type": "record",
//!         "name": "test",
//!         "fields": [
//!             {"name": "a", "type": "long", "default": 42},
//!             {"name": "b", "type": "string"}
//!         ]
//!     }
//! "#;
//!
//! // if the schema is not valid, this function will return an error
//! let schema = Schema::parse_str(raw_schema).unwrap();
//!
//! // schemas can be printed for debugging
//! println!("{:?}", schema);
//! ```
//!
//! Additionally, a list of definitions (which may depend on each other) can be given and all of
//! them will be parsed into the corresponding schemas.
//!
//! ```
//! use apache_avro::Schema;
//!
//! let raw_schema_1 = r#"{
//!         "name": "A",
//!         "type": "record",
//!         "fields": [
//!             {"name": "field_one", "type": "float"}
//!         ]
//!     }"#;
//!
//! // This definition depends on the definition of A above
//! let raw_schema_2 = r#"{
//!         "name": "B",
//!         "type": "record",
//!         "fields": [
//!             {"name": "field_one", "type": "A"}
//!         ]
//!     }"#;
//!
//! // if the schemas are not valid, this function will return an error
//! let schemas = Schema::parse_list(&[raw_schema_1, raw_schema_2]).unwrap();
//!
//! // schemas can be printed for debugging
//! println!("{:?}", schemas);
//! ```
//!
//! *N.B.* Composing schema definitions requires named schemas. For this reason, only schemas of
//! type Record, Enum, and Fixed should be input into this function.
//!
//! The library also provides a programmatic interface to define schemas without encoding them in
//! JSON (for advanced use), but we highly recommend the JSON interface. Please read the API
//! reference in case you are interested.
//!
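//! As a quick illustration, a minimal sketch of that programmatic interface, sticking to the
//! primitive `Schema` variants and the `Value::validate` helper (named schemas such as records
//! carry more data; see the API reference):
//!
//! ```
//! use apache_avro::{types::Value, Schema};
//!
//! // primitive schemas are plain variants of the `Schema` enum
//! let schema = Schema::Long;
//!
//! // `validate` reports whether a value conforms to a schema
//! assert!(Value::Long(42).validate(&schema));
//! assert!(!Value::String("foo".to_string()).validate(&schema));
//! ```
//!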
//! For more information about schemas and what kind of information you can encapsulate in them,
//! please refer to the appropriate section of the
//! [Avro Specification](https://avro.apache.org/docs/current/spec.html#schemas).
//!
//! # Writing data
//!
//! Once we have defined a schema, we are ready to serialize data in Avro, validating it against
//! the provided schema in the process. As mentioned before, there are two ways of handling Avro
//! data in Rust.
//!
//! **NOTE:** The library also provides a low-level interface for encoding a single datum in Avro
//! bytecode without generating markers and headers (for advanced use), but we highly recommend the
//! `Writer` interface in order to stay fully Avro-compatible. Please read the API reference in
//! case you are interested.
//!
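//! For reference, a minimal sketch of that low-level interface, using the re-exported
//! `to_avro_datum` helper, which emits just the datum bytes:
//!
//! ```
//! # use apache_avro::{to_avro_datum, types::Record, Schema};
//! # let schema = Schema::parse_str(r#"
//! #     {"type": "record", "name": "test", "fields": [
//! #         {"name": "a", "type": "long", "default": 42},
//! #         {"name": "b", "type": "string"}
//! #     ]}
//! # "#).unwrap();
//! let mut record = Record::new(&schema).unwrap();
//! record.put("a", 27i64);
//! record.put("b", "foo");
//!
//! // just the encoded datum: no magic bytes, schema header or sync markers
//! let bytes = to_avro_datum(&schema, record).unwrap();
//! ```
//!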
//! ## The Avro way
//!
//! Given that the schema we defined above is that of an Avro *Record*, we are going to use the
//! associated type provided by the library to specify the data we want to serialize:
//!
//! ```
//! # use apache_avro::Schema;
//! use apache_avro::types::Record;
//! use apache_avro::Writer;
//! #
//! # let raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! // a writer needs a schema and something to write to
//! let mut writer = Writer::new(&schema, Vec::new());
//!
//! // the Record type models our Record schema
//! let mut record = Record::new(writer.schema()).unwrap();
//! record.put("a", 27i64);
//! record.put("b", "foo");
//!
//! // schema validation happens here
//! writer.append(record).unwrap();
//!
//! // this is how to get back the resulting avro bytecode
//! // this performs a flush operation to make sure data has been written, so it can fail
//! // you can also call `writer.flush()` yourself without consuming the writer
//! let encoded = writer.into_inner().unwrap();
//! ```
//!
//! Most of the time, schemas define a record as a top-level container encapsulating all the
//! values to convert as fields and providing documentation for them, but in case we want to
//! directly define an Avro value, the library offers that capability via the `Value` interface.
//!
//! ```
//! use apache_avro::types::Value;
//!
//! let value = Value::String("foo".to_string());
//! ```
//!
//! ## The serde way
//!
//! Given that the schema we defined above is an Avro *Record*, we can directly use a Rust struct
//! deriving `Serialize` to model our data:
//!
//! ```
//! # use apache_avro::Schema;
//! # use serde::Serialize;
//! use apache_avro::Writer;
//!
//! #[derive(Debug, Serialize)]
//! struct Test {
//!     a: i64,
//!     b: String,
//! }
//!
//! # let raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! // a writer needs a schema and something to write to
//! let mut writer = Writer::new(&schema, Vec::new());
//!
//! // the structure models our Record schema
//! let test = Test {
//!     a: 27,
//!     b: "foo".to_owned(),
//! };
//!
//! // schema validation happens here
//! writer.append_ser(test).unwrap();
//!
//! // this is how to get back the resulting avro bytecode
//! // this performs a flush operation to make sure data is written, so it can fail
//! // you can also call `writer.flush()` yourself without consuming the writer
//! let encoded = writer.into_inner().unwrap();
//! ```
//!
//! Most of the time, schemas define a record as a top-level container encapsulating all the
//! values to convert as fields and providing documentation for them, but in case we want to
//! directly define an Avro value, any type implementing `Serialize` should work.
//!
//! ```
//! let value = "foo".to_string();
//! ```
//!
//! ## Using codecs to compress data
//!
//! Avro supports the following compression codecs when encoding data:
//!
//! * **Null**: leaves data uncompressed;
//! * **Deflate**: writes the data block using the deflate algorithm as specified in RFC 1951, and
//!   typically implemented using the zlib library. Note that this format (unlike the "zlib format"
//!   in RFC 1950) does not have a checksum.
//! * **Snappy**: uses Google's [Snappy](http://google.github.io/snappy/) compression library. Each
//!   compressed block is followed by the 4-byte, big-endian CRC32 checksum of the uncompressed
//!   data in the block. You must enable the `snappy` feature to use this codec.
//! * **Zstandard**: uses Facebook's [Zstandard](https://facebook.github.io/zstd/) compression
//!   library. You must enable the `zstandard` feature to use this codec.
//! * **Bzip2**: uses the [BZip2](https://sourceware.org/bzip2/) compression library.
//!   You must enable the `bzip` feature to use this codec.
//! * **Xz**: uses the [xz2](https://github.com/alexcrichton/xz2-rs) compression library.
//!   You must enable the `xz` feature to use this codec.
//!
//! To compress data, just specify the codec while creating a `Writer`:
//!
//! ```
//! # use apache_avro::Schema;
//! use apache_avro::Writer;
//! use apache_avro::Codec;
//! #
//! # let raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Deflate);
//! ```
//!
//! # Reading data
//!
//! As far as reading Avro encoded data goes, we can just use the schema encoded with the data to
//! read it. The library will do this automatically for us, as it already does for the compression
//! codec:
//!
//! ```
//! use apache_avro::Reader;
//! # use apache_avro::Schema;
//! # use apache_avro::types::Record;
//! # use apache_avro::Writer;
//! #
//! # let raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! # let mut writer = Writer::new(&schema, Vec::new());
//! # let mut record = Record::new(writer.schema()).unwrap();
//! # record.put("a", 27i64);
//! # record.put("b", "foo");
//! # writer.append(record).unwrap();
//! # let input = writer.into_inner().unwrap();
//! // reader creation can fail in case the input to read from is not Avro-compatible or malformed
//! let reader = Reader::new(&input[..]).unwrap();
//! ```
//!
//! If, instead, we want to specify a reader schema that is different from (but compatible with)
//! the schema the data has been written with, we can just do the following:
//!
//! ```
//! use apache_avro::Schema;
//! use apache_avro::Reader;
//! # use apache_avro::types::Record;
//! # use apache_avro::Writer;
//! #
//! # let writer_raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let writer_schema = Schema::parse_str(writer_raw_schema).unwrap();
//! # let mut writer = Writer::new(&writer_schema, Vec::new());
//! # let mut record = Record::new(writer.schema()).unwrap();
//! # record.put("a", 27i64);
//! # record.put("b", "foo");
//! # writer.append(record).unwrap();
//! # let input = writer.into_inner().unwrap();
//!
//! let reader_raw_schema = r#"
//!     {
//!         "type": "record",
//!         "name": "test",
//!         "fields": [
//!             {"name": "a", "type": "long", "default": 42},
//!             {"name": "b", "type": "string"},
//!             {"name": "c", "type": "long", "default": 43}
//!         ]
//!     }
//! "#;
//!
//! let reader_schema = Schema::parse_str(reader_raw_schema).unwrap();
//!
//! // reader creation can fail in case the input to read from is not Avro-compatible or malformed
//! let reader = Reader::with_schema(&reader_schema, &input[..]).unwrap();
//! ```
//!
//! The library will also automatically perform schema resolution while reading the data.
//!
//! For more information about schema compatibility and resolution, please refer to the
//! [Avro Specification](https://avro.apache.org/docs/current/spec.html#schemas).
//!
//! As usual, there are two ways to handle Avro data in Rust, as you can see below.
//!
//! **NOTE:** The library also provides a low-level interface for decoding a single datum in Avro
//! bytecode without markers and headers (for advanced use), but we highly recommend the `Reader`
//! interface to leverage all Avro features. Please read the API reference in case you are
//! interested.
//!
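//! For reference, a minimal sketch of that low-level interface, using the re-exported
//! `from_avro_datum` helper to decode datum bytes produced by `to_avro_datum`:
//!
//! ```
//! # use apache_avro::{from_avro_datum, to_avro_datum, types::Record, Schema};
//! # let schema = Schema::parse_str(r#"
//! #     {"type": "record", "name": "test", "fields": [
//! #         {"name": "a", "type": "long", "default": 42},
//! #         {"name": "b", "type": "string"}
//! #     ]}
//! # "#).unwrap();
//! # let mut record = Record::new(&schema).unwrap();
//! # record.put("a", 27i64);
//! # record.put("b", "foo");
//! # let bytes = to_avro_datum(&schema, record).unwrap();
//! // decode a single datum; pass a reader schema instead of `None` to resolve
//! let value = from_avro_datum(&schema, &mut bytes.as_slice(), None).unwrap();
//! ```
//!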
//! ## The Avro way
//!
//! We can just read instances of `Value` directly out of the `Reader` iterator:
//!
//! ```
//! # use apache_avro::Schema;
//! # use apache_avro::types::Record;
//! # use apache_avro::Writer;
//! use apache_avro::Reader;
//! #
//! # let raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! # let mut writer = Writer::new(&schema, Vec::new());
//! # let mut record = Record::new(writer.schema()).unwrap();
//! # record.put("a", 27i64);
//! # record.put("b", "foo");
//! # writer.append(record).unwrap();
//! # let input = writer.into_inner().unwrap();
//! let reader = Reader::new(&input[..]).unwrap();
//!
//! // value is a Result of an Avro Value in case the read operation fails
//! for value in reader {
//!     println!("{:?}", value.unwrap());
//! }
//! ```
//!
//! ## The serde way
//!
//! Alternatively, we can use a Rust type implementing `Deserialize` and representing our schema to
//! read the data into:
//!
//! ```
//! # use apache_avro::Schema;
//! # use apache_avro::Writer;
//! # use serde::{Deserialize, Serialize};
//! use apache_avro::Reader;
//! use apache_avro::from_value;
//!
//! # #[derive(Serialize)]
//! #[derive(Debug, Deserialize)]
//! struct Test {
//!     a: i64,
//!     b: String,
//! }
//!
//! # let raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! # let mut writer = Writer::new(&schema, Vec::new());
//! # let test = Test {
//! #     a: 27,
//! #     b: "foo".to_owned(),
//! # };
//! # writer.append_ser(test).unwrap();
//! # let input = writer.into_inner().unwrap();
//! let reader = Reader::new(&input[..]).unwrap();
//!
//! // value is a Result in case the read operation fails
//! for value in reader {
//!     println!("{:?}", from_value::<Test>(&value.unwrap()));
//! }
//! ```
//!
//! # Putting everything together
//!
//! The following is an example of how to combine everything shown so far; it is meant to be a
//! quick reference for the library interface:
//!
//! ```
//! use apache_avro::{Codec, Reader, Schema, Writer, from_value, types::Record, Error};
//! use serde::{Deserialize, Serialize};
//!
//! #[derive(Debug, Deserialize, Serialize)]
//! struct Test {
//!     a: i64,
//!     b: String,
//! }
//!
//! fn main() -> Result<(), Error> {
//!     let raw_schema = r#"
//!         {
//!             "type": "record",
//!             "name": "test",
//!             "fields": [
//!                 {"name": "a", "type": "long", "default": 42},
//!                 {"name": "b", "type": "string"}
//!             ]
//!         }
//!     "#;
//!
//!     let schema = Schema::parse_str(raw_schema)?;
//!
//!     println!("{:?}", schema);
//!
//!     let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Deflate);
//!
//!     let mut record = Record::new(writer.schema()).unwrap();
//!     record.put("a", 27i64);
//!     record.put("b", "foo");
//!
//!     writer.append(record)?;
//!
//!     let test = Test {
//!         a: 27,
//!         b: "foo".to_owned(),
//!     };
//!
//!     writer.append_ser(test)?;
//!
//!     let input = writer.into_inner()?;
//!     let reader = Reader::with_schema(&schema, &input[..])?;
//!
//!     for record in reader {
//!         println!("{:?}", from_value::<Test>(&record?));
//!     }
//!     Ok(())
//! }
//! ```
//!
//! `apache-avro` also supports the logical types listed in the [Avro specification](https://avro.apache.org/docs/current/spec.html#Logical+Types):
//!
//! 1. `Decimal` using the [`num_bigint`](https://docs.rs/num-bigint/latest/num_bigint) crate
//! 1. UUID using the [`uuid`](https://docs.rs/uuid/latest/uuid) crate
//! 1. Date, Time (milli) as `i32` and Time (micro) as `i64`
//! 1. Timestamp (milli and micro) as `i64`
//! 1. Local timestamp (milli and micro) as `i64`
//! 1. Duration as a custom type with `months`, `days` and `millis` accessor methods each of which returns an `i32`
//!
//! Note that the on-disk representation is identical to the underlying primitive/complex type.
//!
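//! For instance, a `date` value encodes to exactly the same bytes as its underlying `int`. A
//! quick check of this claim with the single-datum helper (a sketch; the logical type only
//! annotates the schema):
//!
//! ```
//! use apache_avro::{to_avro_datum, types::Value, Schema};
//!
//! let date_schema = Schema::parse_str(r#"{"type": "int", "logicalType": "date"}"#).unwrap();
//!
//! // the logical type does not change the wire format
//! let as_date = to_avro_datum(&date_schema, Value::Date(3)).unwrap();
//! let as_int = to_avro_datum(&Schema::Int, Value::Int(3)).unwrap();
//! assert_eq!(as_date, as_int);
//! ```
//!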
//! ### Read and write logical types
//!
//! ```rust
//! use apache_avro::{
//!     types::Record, types::Value, Codec, Days, Decimal, Duration, Millis, Months, Reader, Schema,
//!     Writer, Error,
//! };
//! use num_bigint::ToBigInt;
//!
//! fn main() -> Result<(), Error> {
//!     let raw_schema = r#"
//!     {
//!       "type": "record",
//!       "name": "test",
//!       "fields": [
//!         {
//!           "name": "decimal_fixed",
//!           "type": {
//!             "type": "fixed",
//!             "size": 2,
//!             "name": "decimal"
//!           },
//!           "logicalType": "decimal",
//!           "precision": 4,
//!           "scale": 2
//!         },
//!         {
//!           "name": "decimal_var",
//!           "type": "bytes",
//!           "logicalType": "decimal",
//!           "precision": 10,
//!           "scale": 3
//!         },
//!         {
//!           "name": "uuid",
//!           "type": "string",
//!           "logicalType": "uuid"
//!         },
//!         {
//!           "name": "date",
//!           "type": "int",
//!           "logicalType": "date"
//!         },
//!         {
//!           "name": "time_millis",
//!           "type": "int",
//!           "logicalType": "time-millis"
//!         },
//!         {
//!           "name": "time_micros",
//!           "type": "long",
//!           "logicalType": "time-micros"
//!         },
//!         {
//!           "name": "timestamp_millis",
//!           "type": "long",
//!           "logicalType": "timestamp-millis"
//!         },
//!         {
//!           "name": "timestamp_micros",
//!           "type": "long",
//!           "logicalType": "timestamp-micros"
//!         },
//!         {
//!           "name": "local_timestamp_millis",
//!           "type": "long",
//!           "logicalType": "local-timestamp-millis"
//!         },
//!         {
//!           "name": "local_timestamp_micros",
//!           "type": "long",
//!           "logicalType": "local-timestamp-micros"
//!         },
//!         {
//!           "name": "duration",
//!           "type": {
//!             "type": "fixed",
//!             "size": 12,
//!             "name": "duration"
//!           },
//!           "logicalType": "duration"
//!         }
//!       ]
//!     }
//!     "#;
//!
//!     let schema = Schema::parse_str(raw_schema)?;
//!
//!     println!("{:?}", schema);
//!
//!     let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Deflate);
//!
//!     let mut record = Record::new(writer.schema()).unwrap();
//!     record.put("decimal_fixed", Decimal::from(9936.to_bigint().unwrap().to_signed_bytes_be()));
//!     record.put("decimal_var", Decimal::from((-32442.to_bigint().unwrap()).to_signed_bytes_be()));
//!     record.put("uuid", uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000").unwrap());
//!     record.put("date", Value::Date(1));
//!     record.put("time_millis", Value::TimeMillis(2));
//!     record.put("time_micros", Value::TimeMicros(3));
//!     record.put("timestamp_millis", Value::TimestampMillis(4));
//!     record.put("timestamp_micros", Value::TimestampMicros(5));
//!     record.put("local_timestamp_millis", Value::LocalTimestampMillis(4));
//!     record.put("local_timestamp_micros", Value::LocalTimestampMicros(5));
//!     record.put("duration", Duration::new(Months::new(6), Days::new(7), Millis::new(8)));
//!
//!     writer.append(record)?;
//!
//!     let input = writer.into_inner()?;
//!     let reader = Reader::with_schema(&schema, &input[..])?;
//!
//!     for record in reader {
//!         println!("{:?}", record?);
//!     }
//!     Ok(())
//! }
//! ```
//!
//! ## Calculate Avro schema fingerprint
//!
//! This library supports calculating the following fingerprints:
//!
//!  - SHA-256
//!  - MD5
//!  - Rabin
//!
//! An example of computing each of the supported fingerprints:
//!
//! ```rust
//! use apache_avro::rabin::Rabin;
//! use apache_avro::{Schema, Error};
//! use md5::Md5;
//! use sha2::Sha256;
//!
//! fn main() -> Result<(), Error> {
//!     let raw_schema = r#"
//!         {
//!             "type": "record",
//!             "name": "test",
//!             "fields": [
//!                 {"name": "a", "type": "long", "default": 42},
//!                 {"name": "b", "type": "string"}
//!             ]
//!         }
//!     "#;
//!     let schema = Schema::parse_str(raw_schema)?;
//!     println!("{}", schema.fingerprint::<Sha256>());
//!     println!("{}", schema.fingerprint::<Md5>());
//!     println!("{}", schema.fingerprint::<Rabin>());
//!     Ok(())
//! }
//! ```
//!
//! ## Ill-formed data
//!
//! In order to ease decoding, the Binary Encoding specification of Avro data
//! requires some fields to have their length encoded alongside the data.
//!
//! If encoded data passed to a `Reader` is ill-formed, it can happen that
//! the bytes meant to contain the length of data are bogus and could result
//! in extravagant memory allocation.
//!
//! To shield users from ill-formed data, `apache-avro` sets a limit (default: 512MB)
//! on any allocation it will perform when decoding data.
//!
//! If you expect some of your data fields to be larger than this limit, be sure
//! to make use of the `max_allocation_bytes` function before reading **any** data
//! (we leverage Rust's [`std::sync::Once`](https://doc.rust-lang.org/std/sync/struct.Once.html)
//! mechanism to initialize this value; if
//! any call to decode is made before a call to `max_allocation_bytes`, the limit
//! will be 512MB throughout the lifetime of the program).
//!
//! ```rust
//! use apache_avro::max_allocation_bytes;
//!
//! max_allocation_bytes(2 * 1024 * 1024 * 1024);  // 2GB
//!
//! // ... happily decode large data
//! ```
//!
//! ## Check schema compatibility
//!
//! This library supports checking for schema compatibility.
//!
//! Examples of checking for compatibility:
//!
//! 1. Compatible schemas
//!
//! Explanation: an int array schema can be read by a long array schema: an int
//! (32-bit signed integer) fits into a long (64-bit signed integer)
//!
//! ```rust
//! use apache_avro::{Schema, schema_compatibility::SchemaCompatibility};
//!
//! let writers_schema = Schema::parse_str(r#"{"type": "array", "items":"int"}"#).unwrap();
//! let readers_schema = Schema::parse_str(r#"{"type": "array", "items":"long"}"#).unwrap();
//! assert!(SchemaCompatibility::can_read(&writers_schema, &readers_schema).is_ok());
//! ```
//!
//! 2. Incompatible schemas
//!
//! Explanation: a long array schema cannot be read by an int array schema: a
//! long (64-bit signed integer) does not fit into an int (32-bit signed integer)
//!
//! ```rust
//! use apache_avro::{Schema, schema_compatibility::SchemaCompatibility};
//!
//! let writers_schema = Schema::parse_str(r#"{"type": "array", "items":"long"}"#).unwrap();
//! let readers_schema = Schema::parse_str(r#"{"type": "array", "items":"int"}"#).unwrap();
//! assert!(SchemaCompatibility::can_read(&writers_schema, &readers_schema).is_err());
//! ```
//!
//! ## Custom name validators
//!
//! By default the library follows the rules defined by the
//! [Avro specification](https://avro.apache.org/docs/1.11.1/specification/#names)!
//!
//! Some of the other Apache Avro language SDKs are not that strict and allow more
//! characters in names. For interoperability with those SDKs, the library provides
//! a way to customize the names validation.
//!
//! ```rust
//! use apache_avro::AvroResult;
//! use apache_avro::schema::Namespace;
//! use apache_avro::validator::{SchemaNameValidator, set_schema_name_validator};
//!
//! struct MyCustomValidator;
//!
//! impl SchemaNameValidator for MyCustomValidator {
//!     fn validate(&self, name: &str) -> AvroResult<(String, Namespace)> {
//!         todo!()
//!     }
//! }
//!
//! // don't parse any schema before registering the custom validator(s)!
//!
//! set_schema_name_validator(Box::new(MyCustomValidator));
//!
//! // ... use the library
//! ```
//!
//! Similar logic can be applied to the validation of schema namespaces, enum symbols and field
//! names.
//!
//! **Note**: the library allows setting a validator only once per application lifetime!
//! If the application parses schemas before setting a validator, the default validator will be
//! registered and used!
//!
//! ## Custom schema equality comparators
//!
//! The library provides two implementations of schema equality comparators:
//! 1. `SpecificationEq` - a comparator that serializes the schemas to their
//!    canonical forms (i.e. JSON) and compares them as strings. It was the only implementation
//!    until apache_avro 0.16.0.
//!    See the [Avro specification](https://avro.apache.org/docs/1.11.1/specification/#parsing-canonical-form-for-schemas)
//!    for more information!
//! 2. `StructFieldEq` - a comparator that compares the schemas structurally.
//!    It is faster than `SpecificationEq` because it returns `false` as soon as a difference
//!    is found, and it is the recommended one!
//!    It has been the default comparator since apache_avro 0.17.0.
//!
//! To use a custom comparator, you need to implement the `SchemataEq` trait and set it using the
//! `set_schemata_equality_comparator` function:
//!
//! ```rust
//! use apache_avro::{AvroResult, Schema};
//! use apache_avro::schema_equality::{SchemataEq, set_schemata_equality_comparator};
//!
//! #[derive(Debug)]
//! struct MyCustomSchemataEq;
//!
//! impl SchemataEq for MyCustomSchemataEq {
//!     fn compare(&self, schema_one: &Schema, schema_two: &Schema) -> bool {
//!         todo!()
//!     }
//! }
//!
//! // don't parse any schema before registering the custom comparator!
//!
//! set_schemata_equality_comparator(Box::new(MyCustomSchemataEq));
//!
//! // ... use the library
//! ```
//!
//! **Note**: the library allows setting a comparator only once per application lifetime!
//! If the application parses schemas before setting a comparator, the default comparator will be
//! registered and used!
//!

mod bigdecimal;
mod bytes;
mod codec;
mod de;
mod decimal;
mod decode;
mod duration;
mod encode;
mod error;
mod reader;
mod ser;
mod util;
mod writer;

pub mod rabin;
pub mod schema;
pub mod schema_compatibility;
pub mod schema_equality;
pub mod types;
pub mod validator;

pub use crate::{
    bigdecimal::BigDecimal,
    bytes::{
        serde_avro_bytes, serde_avro_bytes_opt, serde_avro_fixed, serde_avro_fixed_opt,
        serde_avro_slice, serde_avro_slice_opt,
    },
};
pub use codec::Codec;
pub use de::from_value;
pub use decimal::Decimal;
pub use duration::{Days, Duration, Millis, Months};
pub use error::Error;
pub use reader::{
    from_avro_datum, from_avro_datum_schemata, read_marker, GenericSingleObjectReader, Reader,
    SpecificSingleObjectReader,
};
pub use schema::{AvroSchema, Schema};
pub use ser::to_value;
pub use util::{max_allocation_bytes, set_serde_human_readable};
pub use uuid::Uuid;
pub use writer::{
    to_avro_datum, to_avro_datum_schemata, GenericSingleObjectWriter, SpecificSingleObjectWriter,
    Writer,
};

#[cfg(feature = "derive")]
pub use apache_avro_derive::*;

#[macro_use]
extern crate log;

/// A convenience type alias for `Result`s with `Error`s.
pub type AvroResult<T> = Result<T, Error>;

#[cfg(test)]
mod tests {
    use crate::{
        from_avro_datum,
        types::{Record, Value},
        Codec, Reader, Schema, Writer,
    };
    use pretty_assertions::assert_eq;

    //TODO: move where it fits better
    #[test]
    fn test_enum_default() {
        let writer_raw_schema = r#"
            {
                "type": "record",
                "name": "test",
                "fields": [
                    {"name": "a", "type": "long", "default": 42},
                    {"name": "b", "type": "string"}
                ]
            }
        "#;
        let reader_raw_schema = r#"
            {
                "type": "record",
                "name": "test",
                "fields": [
                    {"name": "a", "type": "long", "default": 42},
                    {"name": "b", "type": "string"},
                    {
                        "name": "c",
                        "type": {
                            "type": "enum",
                            "name": "suit",
                            "symbols": ["diamonds", "spades", "clubs", "hearts"]
                        },
                        "default": "spades"
                    }
                ]
            }
        "#;
        let writer_schema = Schema::parse_str(writer_raw_schema).unwrap();
        let reader_schema = Schema::parse_str(reader_raw_schema).unwrap();
        let mut writer = Writer::with_codec(&writer_schema, Vec::new(), Codec::Null);
        let mut record = Record::new(writer.schema()).unwrap();
        record.put("a", 27i64);
        record.put("b", "foo");
        writer.append(record).unwrap();
        let input = writer.into_inner().unwrap();
        let mut reader = Reader::with_schema(&reader_schema, &input[..]).unwrap();
        assert_eq!(
            reader.next().unwrap().unwrap(),
            Value::Record(vec![
                ("a".to_string(), Value::Long(27)),
                ("b".to_string(), Value::String("foo".to_string())),
                ("c".to_string(), Value::Enum(1, "spades".to_string())),
            ])
        );
        assert!(reader.next().is_none());
    }

    //TODO: move where it fits better
    #[test]
    fn test_enum_string_value() {
        let raw_schema = r#"
            {
                "type": "record",
                "name": "test",
                "fields": [
                    {"name": "a", "type": "long", "default": 42},
                    {"name": "b", "type": "string"},
                    {
                        "name": "c",
                        "type": {
                            "type": "enum",
                            "name": "suit",
                            "symbols": ["diamonds", "spades", "clubs", "hearts"]
                        },
                        "default": "spades"
                    }
                ]
            }
        "#;
        let schema = Schema::parse_str(raw_schema).unwrap();
        let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Null);
        let mut record = Record::new(writer.schema()).unwrap();
        record.put("a", 27i64);
        record.put("b", "foo");
        record.put("c", "clubs");
        writer.append(record).unwrap();
        let input = writer.into_inner().unwrap();
        let mut reader = Reader::with_schema(&schema, &input[..]).unwrap();
        assert_eq!(
            reader.next().unwrap().unwrap(),
            Value::Record(vec![
                ("a".to_string(), Value::Long(27)),
                ("b".to_string(), Value::String("foo".to_string())),
                ("c".to_string(), Value::Enum(2, "clubs".to_string())),
            ])
        );
        assert!(reader.next().is_none());
    }

    //TODO: move where it fits better
    #[test]
    fn test_enum_no_reader_schema() {
        let writer_raw_schema = r#"
            {
                "type": "record",
                "name": "test",
                "fields": [
                    {"name": "a", "type": "long", "default": 42},
                    {"name": "b", "type": "string"},
                    {
                        "name": "c",
                        "type": {
                            "type": "enum",
                            "name": "suit",
                            "symbols": ["diamonds", "spades", "clubs", "hearts"]
                        },
                        "default": "spades"
                    }
                ]
            }
        "#;
        let writer_schema = Schema::parse_str(writer_raw_schema).unwrap();
        let mut writer = Writer::with_codec(&writer_schema, Vec::new(), Codec::Null);
        let mut record = Record::new(writer.schema()).unwrap();
        record.put("a", 27i64);
        record.put("b", "foo");
        record.put("c", "clubs");
        writer.append(record).unwrap();
        let input = writer.into_inner().unwrap();
        let mut reader = Reader::new(&input[..]).unwrap();
        assert_eq!(
            reader.next().unwrap().unwrap(),
            Value::Record(vec![
                ("a".to_string(), Value::Long(27)),
                ("b".to_string(), Value::String("foo".to_string())),
                ("c".to_string(), Value::Enum(2, "clubs".to_string())),
            ])
        );
    }

    #[test]
    fn test_illformed_length() {
        let raw_schema = r#"
            {
                "type": "record",
                "name": "test",
                "fields": [
                    {"name": "a", "type": "long", "default": 42},
                    {"name": "b", "type": "string"}
                ]
            }
        "#;

        let schema = Schema::parse_str(raw_schema).unwrap();

        // Would allocate 18446744073709551605 bytes
        let illformed: &[u8] = &[0x3e, 0x15, 0xff, 0x1f, 0x15, 0xff];

        let value = from_avro_datum(&schema, &mut &*illformed, None);
        assert!(value.is_err());
    }
}