rust_bert/pipelines/sentence_embeddings/mod.rs
1//! # Sentence Embeddings pipeline
2//!
3//! Compute sentence/text embeddings that can be compared (e.g. with
4//! cosine-similarity) to find sentences with a similar meaning. This can be useful for
5//! semantic textual similarity, semantic search, or paraphrase mining.
6//!
7//! The implementation is based on [Sentence-Transformers][sbert], and pretrained models
8//! available on the [Hugging Face Hub][sbert-hub] can be used. It is, however, necessary to
9//! convert them beforehand using the script `utils/convert_model.py`; see
10//! `tests/sentence_embeddings.rs` for examples.
11//!
12//! [sbert]: https://sbert.net/
13//! [sbert-hub]: https://huggingface.co/sentence-transformers/
14//!
15//! Basic usage is as follows:
16//!
17//! ```no_run
18//! use rust_bert::pipelines::sentence_embeddings::SentenceEmbeddingsBuilder;
19//!
20//! # fn main() -> anyhow::Result<()> {
21//! let model = SentenceEmbeddingsBuilder::local("local/path/to/distiluse-base-multilingual-cased")
22//! .with_device(tch::Device::cuda_if_available())
23//! .create_model()?;
24//!
25//! let sentences = ["This is an example sentence", "Each sentence is converted"];
26//! let embeddings = model.encode(&sentences)?;
27//! # Ok(())
28//! # }
29//! ```
30
31pub mod builder;
32mod config;
33pub mod layers;
34mod pipeline;
35mod resources;
36
37pub use builder::SentenceEmbeddingsBuilder;
38pub use config::{
39 SentenceEmbeddingsConfig, SentenceEmbeddingsModuleConfig, SentenceEmbeddingsModuleType,
40 SentenceEmbeddingsModulesConfig, SentenceEmbeddingsSentenceBertConfig,
41 SentenceEmbeddingsTokenizerConfig,
42};
43pub use pipeline::{
44 SentenceEmbeddingsModel, SentenceEmbeddingsModelOutput, SentenceEmbeddingsOption,
45 SentenceEmbeddingsTokenizerOutput,
46};
47
48pub use resources::{
49 SentenceEmbeddingsConfigResources, SentenceEmbeddingsDenseConfigResources,
50 SentenceEmbeddingsDenseResources, SentenceEmbeddingsModelType,
51 SentenceEmbeddingsModulesConfigResources, SentenceEmbeddingsPoolingConfigResources,
52 SentenceEmbeddingsTokenizerConfigResources,
53};
54
55/// One vector of `f32` attention values. Length = sequence length
56pub type Attention = Vec<f32>;
57/// The [`Attention`] vectors of one attention head. Length = sequence length
58pub type AttentionHead = Vec<Attention>;
59/// The [`AttentionHead`]s of one attention layer. Length = number of heads per attention layer
60pub type AttentionLayer = Vec<AttentionHead>;
61/// All [`AttentionLayer`]s collected together. Length = number of attention layers
62pub type AttentionOutput = Vec<AttentionLayer>;
63/// A single embedding, stored as a vector of `f32` components
64pub type Embedding = Vec<f32>;