1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
// tokenizers.rs
//
// Copyright (c) 2023-2024 Junpei Kawamoto
//
// This software is released under the MIT License.
//
// http://opensource.org/licenses/mit-license.php
//! This module provides tokenizers.
//!
//! Currently, this module implements four tokenizers:
//! * [`auto`] module provides a tokenizer that automatically determines the appropriate tokenizer.
//! * [`hf`] module provides the tokenizer from Hugging Face's
//!   [`tokenizers` crate](https://docs.rs/tokenizers/).
//! * [`sentencepiece`] module provides a tokenizer based on the
//!   [sentencepiece crate](https://docs.rs/sentencepiece/).
//! * [`bpe`] module provides a tokenizer based on the Byte Pair Encoding (BPE) model.
//!
//! ## Examples:
//! Here is an example of using [`auto::Tokenizer`] to build a Translator and translate a string:
//!
//! ```no_run
//! # use anyhow::Result;
//! #
//! use ct2rs::{Config, Translator};
//!
//! # fn main() -> Result<()> {
//! // Translator::new creates a translator instance with auto::Tokenizer.
//! let t = Translator::new("/path/to/model", &Config::default())?;
//! let res = t.translate_batch(
//! &vec!["Hello World!"],
//! &Default::default(),
//! None,
//! )?;
//! for r in res {
//! println!("{:?}", r);
//! }
//! # Ok(())
//! # }
//! ```
//!
//! The following example translates English to German and Japanese using the tokenizer provided by
//! Hugging Face's [`tokenizers` crate](https://docs.rs/tokenizers/).
//! ```no_run
//! # use anyhow::Result;
//!
//! use ct2rs::{Config, TranslationOptions, Translator};
//! use ct2rs::tokenizers::hf::Tokenizer;
//!
//! # fn main() -> Result<()> {
//! let path = "/path/to/model";
//! let t = Translator::with_tokenizer(&path, Tokenizer::new(&path)?, &Config::default())?;
//! let res = t.translate_batch_with_target_prefix(
//! &vec![
//! "Hello world!",
//! "This library provides Rust bindings for CTranslate2.",
//! ],
//! &vec![vec!["deu_Latn"], vec!["jpn_Jpan"]],
//! &TranslationOptions {
//! return_scores: true,
//! ..Default::default()
//! },
//! None
//! )?;
//! for r in res {
//! println!("{}, (score: {:?})", r.0, r.1);
//! }
//! # Ok(())
//! # }
//! ```
//!
//! The following example generates text using the tokenizer provided by the
//! [sentencepiece crate](https://docs.rs/sentencepiece/).
//! ```no_run
//! # use anyhow::Result;
//! use ct2rs::{Config, Device, Generator, GenerationOptions};
//! use ct2rs::tokenizers::sentencepiece::Tokenizer;
//!
//! # fn main() -> Result<()> {
//! let path = "/path/to/model";
//! let g = Generator::with_tokenizer(&path, Tokenizer::new(&path)?, &Config::default())?;
//! let res = g.generate_batch(
//! &vec!["prompt"],
//! &GenerationOptions::default(),
//! None,
//! )?;
//! for r in res {
//! println!("{:?}", r.0);
//! }
//! # Ok(())
//! # }
//! ```