arroy/lib.rs
1//! Arroy ([Approximate Rearest Reighbors][1] Oh Yeah) is a Rust library with the interface of the [Annoy Python library][2]
2//! to search for vectors in space that are close to a given query vector. It is based on LMDB, a memory-mapped key-value store,
3//! so many processes may share the same data and atomically modify the vectors.
4//!
5//! [1]: https://en.wikipedia.org/wiki/Nearest_neighbor_search#Approximate_nearest_neighbor
6//! [2]: https://github.com/spotify/annoy/#full-python-api
7//!
8//! # Examples
9//!
10//! Open an LMDB database, store some vectors in it and query the top 20 nearest items from the first vector. This is the most
11//! trivial way to use arroy and it's fairly easy. Just do not forget to [`ArroyBuilder::build`] and [`heed::RwTxn::commit`]
12//! when you are done inserting your items.
13//!
14//! ```
15//! use std::num::NonZeroUsize;
16//!
17//! use arroy::distances::Euclidean;
18//! use arroy::{Database as ArroyDatabase, Writer, Reader};
19//! use rand::rngs::StdRng;
20//! use rand::{Rng, SeedableRng};
21//!
//! /// That's the 2GiB size limit we allow LMDB to grow.
23//! const TWENTY_HUNDRED_MIB: usize = 2 * 1024 * 1024 * 1024;
24//!
25//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
26//! let dir = tempfile::tempdir()?;
27//! let env = unsafe { heed::EnvOpenOptions::new().map_size(TWENTY_HUNDRED_MIB).open(dir.path()) }?;
28//!
29//! // we will open the default LMDB unnamed database
30//! let mut wtxn = env.write_txn()?;
31//! let db: ArroyDatabase<Euclidean> = env.create_database(&mut wtxn, None)?;
32//!
33//! // Now we can give it to our arroy writer
34//! let index = 0;
35//! let dimensions = 5;
36//! let writer = Writer::<Euclidean>::new(db, index, dimensions);
37//!
38//! // let's write some vectors
39//! writer.add_item(&mut wtxn, 0, &[0.8, 0.49, 0.27, 0.76, 0.94])?;
40//! writer.add_item(&mut wtxn, 1, &[0.66, 0.86, 0.42, 0.4, 0.31])?;
41//! writer.add_item(&mut wtxn, 2, &[0.5, 0.95, 0.7, 0.51, 0.03])?;
42//! writer.add_item(&mut wtxn, 100, &[0.52, 0.33, 0.65, 0.23, 0.44])?;
43//! writer.add_item(&mut wtxn, 1000, &[0.18, 0.43, 0.48, 0.81, 0.29])?;
44//!
//! // Build the trees. The number of trees is optional: set it on the builder or leave it unspecified, as done here.
46//! let mut rng = StdRng::seed_from_u64(42);
47//! writer.builder(&mut rng).build(&mut wtxn)?;
48//!
49//! // By committing, other readers can query the database in parallel.
50//! wtxn.commit()?;
51//!
52//! let mut rtxn = env.read_txn()?;
53//! let reader = Reader::<Euclidean>::open(&rtxn, index, db)?;
54//! let n_results = 20;
55//!
56//! let mut query = reader.nns(n_results);
57//!
58//! // You can increase the quality of the results by forcing arroy to search into more nodes.
59//! // This multiplier is arbitrary but basically the higher, the better the results, the slower the query.
60//! let is_precise = true;
61//! if is_precise {
62//! query.search_k(NonZeroUsize::new(n_results * reader.n_trees() * 15).unwrap());
63//! }
64//!
65//! // Similar searching can be achieved by requesting the nearest neighbors of a given item.
66//! let item_id = 0;
67//! let arroy_results = query.by_item(&rtxn, item_id)?.unwrap();
68//! # Ok(()) }
69//! ```
70
71#![warn(missing_docs)]
72#![doc(
73 html_favicon_url = "https://raw.githubusercontent.com/meilisearch/arroy/main/assets/arroy-electric-clusters.ico?raw=true"
74)]
75#![doc(
76 html_logo_url = "https://raw.githubusercontent.com/meilisearch/arroy/main/assets/arroy-electric-clusters-logo.png?raw=true"
77)]
78
79mod distance;
80mod error;
81mod item_iter;
82mod key;
83mod metadata;
84mod node;
85mod node_id;
86mod parallel;
87mod reader;
88mod roaring;
89mod spaces;
90mod stats;
91pub mod upgrade;
92mod version;
93mod writer;
94
95#[cfg(test)]
96mod tests;
97mod unaligned_vector;
98
99pub use distance::Distance;
100pub use error::Error;
101
102use key::{Key, Prefix, PrefixCodec};
103use metadata::{Metadata, MetadataCodec};
104use node::{Node, NodeCodec};
105use node_id::{NodeId, NodeMode};
106pub use reader::{QueryBuilder, Reader};
107pub use stats::{Stats, TreeStats};
108pub use writer::{ArroyBuilder, MainStep, SubStep, Writer, WriterProgress};
109
110/// The set of types used by the [`Distance`] trait.
111pub mod internals {
112 use rand::Rng;
113
114 pub use crate::distance::{
115 NodeHeaderBinaryQuantizedCosine, NodeHeaderBinaryQuantizedEuclidean,
116 NodeHeaderBinaryQuantizedManhattan, NodeHeaderCosine, NodeHeaderDotProduct,
117 NodeHeaderEuclidean, NodeHeaderManhattan,
118 };
119 pub use crate::key::KeyCodec;
120 pub use crate::node::{Leaf, NodeCodec};
121 pub use crate::unaligned_vector::{SizeMismatch, UnalignedVector, UnalignedVectorCodec};
122
123 /// A type that is used to decide on
124 /// which side of a plane we move an item.
125 #[derive(Debug, Copy, Clone)]
126 pub enum Side {
127 /// The left side.
128 Left,
129 /// The right side.
130 Right,
131 }
132
133 impl Side {
134 pub(crate) fn random<R: Rng>(rng: &mut R) -> Side {
135 if rng.gen() {
136 Side::Left
137 } else {
138 Side::Right
139 }
140 }
141 }
142}
143
144/// The set of distances implementing the [`Distance`] and supported by arroy.
145pub mod distances {
146 pub use crate::distance::{
147 BinaryQuantizedCosine, BinaryQuantizedEuclidean, BinaryQuantizedManhattan, Cosine,
148 DotProduct, Euclidean, Manhattan,
149 };
150}
151
152/// A custom Result type that is returning an arroy error by default.
153pub type Result<T, E = Error> = std::result::Result<T, E>;
154
155/// The database required by arroy for reading or writing operations.
156pub type Database<D> = heed::Database<internals::KeyCodec, NodeCodec<D>>;
157
158/// An identifier for the items stored in the database.
159pub type ItemId = u32;