pub struct Detector {
pub word_lang_prob_map: HashMap<String, Vec<f64>>,
pub langlist: Vec<String>,
pub seed: Option<u64>,
pub text: String,
pub langprob: Option<Vec<f64>>,
pub alpha: f64,
pub n_trial: usize,
pub max_text_length: usize,
pub prior_map: Option<Vec<f64>>,
pub verbose: bool,
}
Core language detection engine.
The Detector performs the actual language identification using n-gram analysis and Bayesian probability estimation. It uses an iterative expectation-maximization algorithm to determine the most likely language for a given text.
§Algorithm Overview
- Extract n-grams (1-3 characters) from the input text
- Look up probabilities for each n-gram across all languages
- Use iterative EM algorithm to estimate language probabilities
- Return the language with highest probability
§Examples
use langdetect_rs::detector_factory::DetectorFactory;
let factory = DetectorFactory::default().build();
let mut detector = factory.create(None);
detector.append("Hello world!");
let language = detector.detect().unwrap();
Fields§
word_lang_prob_map: HashMap<String, Vec<f64>> - Word-to-language probability mapping.
langlist: Vec<String> - List of language identifiers.
seed: Option<u64> - Optional seed for reproducible randomization.
text: String - Accumulated text for analysis.
langprob: Option<Vec<f64>> - Current language probability estimates.
alpha: f64 - Alpha smoothing parameter for probability estimation.
n_trial: usize - Number of trials for the EM algorithm.
max_text_length: usize - Maximum text length to process.
prior_map: Option<Vec<f64>> - Prior probabilities for languages (optional).
verbose: bool - Whether to enable verbose logging.
Implementations§
Source§impl Detector
impl Detector
Sourcepub const ALPHA_DEFAULT: f64 = 0.5f64
pub const ALPHA_DEFAULT: f64 = 0.5f64
Default alpha smoothing parameter.
Sourcepub const ALPHA_WIDTH: f64 = 0.050000000000000003f64
pub const ALPHA_WIDTH: f64 = 0.050000000000000003f64
Width of alpha variation during randomization.
Sourcepub const ITERATION_LIMIT: usize = 1_000usize
pub const ITERATION_LIMIT: usize = 1_000usize
Maximum iterations for the EM algorithm.
Sourcepub const PROB_THRESHOLD: f64 = 0.10000000000000001f64
pub const PROB_THRESHOLD: f64 = 0.10000000000000001f64
Minimum probability threshold for reporting languages.
Sourcepub const CONV_THRESHOLD: f64 = 0.99999000000000004f64
pub const CONV_THRESHOLD: f64 = 0.99999000000000004f64
Convergence threshold for the EM algorithm.
Sourcepub const UNKNOWN_LANG: &'static str = "unknown"
pub const UNKNOWN_LANG: &'static str = "unknown"
Language identifier for unknown/undetected languages.
Sourcepub fn new(
word_lang_prob_map: HashMap<String, Vec<f64>>,
langlist: Vec<String>,
seed: Option<u64>,
) -> Self
pub fn new( word_lang_prob_map: HashMap<String, Vec<f64>>, langlist: Vec<String>, seed: Option<u64>, ) -> Self
Creates a new Detector with the given language profiles.
§Arguments
word_lang_prob_map - Pre-computed word-to-language probability mapping.
langlist - List of language identifiers.
seed - Optional seed for reproducible randomization.
Sourcepub fn append(&mut self, text: &str)
pub fn append(&mut self, text: &str)
Appends text to the detector for analysis.
The text is preprocessed to remove URLs, emails, and normalize whitespace. Vietnamese text is also normalized for better detection.
§Arguments
text - The text to append for language detection.
§Examples
use langdetect_rs::detector_factory::DetectorFactory;
let factory = DetectorFactory::default().build();
let mut detector = factory.create(None);
detector.append("Hello world!");
Examples found in repository?
3 fn main() {
4 let factory = DetectorFactory::default().build();
5
6 // let mut detector = factory.create(None);
7 match factory.detect("War doesn't show who's right, just who's left.", None) {
8 Ok(lang) => println!("Detected language: {}", lang),
9 Err(e) => println!("Detection error: {:?}", e),
10 }
11
12 // let mut detector = factory.create(None);
13 match factory.detect("Ein, zwei, drei, vier", None) {
14 Ok(lang) => println!("Detected language: {}", lang),
15 Err(e) => println!("Detection error: {:?}", e),
16 }
17
18 match factory.get_probabilities("Otec matka syn.", None) {
19 Ok(probs) => println!("Language probabilities: {:?}", probs),
20 Err(e) => println!("Detection error: {:?}", e),
21 }
22
23 // For reproducibility use a fixed seed within explicitly defined detector
24 let mut detector = factory.create(None);
25 detector.seed = Some(42);
26 detector.append("Otec matka syn.");
27 match detector.get_probabilities() {
28 Ok(probs) => println!("Language probabilities with seed: {:?}", probs),
29 Err(e) => println!("Detection error: {:?}", e),
30 }
31
32 // Or you can set the seed for the factory itself and it will be inherited by detectors
33 let factory_with_seed = DetectorFactory::default()
34 .with_seed(Some(43))
35 .build();
36 match factory_with_seed.get_probabilities("Otec matka syn.", None) {
37 Ok(probs) => println!("Language probabilities with seed: {:?}", probs),
38 Err(e) => println!("Detection error: {:?}", e),
39 }
40 }
Sourcepub fn detect(&mut self) -> Result<String, DetectorError>
pub fn detect(&mut self) -> Result<String, DetectorError>
Performs language detection on the accumulated text.
§Returns
The detected language code, or “unknown” if detection fails.
§Errors
Returns DetectorError::NoFeatures if no detectable n-grams are found.
§Examples
use langdetect_rs::detector_factory::DetectorFactory;
let factory = DetectorFactory::default().build();
let mut detector = factory.create(None);
detector.append("Bonjour le monde!");
let language = detector.detect().unwrap();
assert_eq!(language, "fr");
Sourcepub fn get_probabilities(&mut self) -> Result<Vec<Language>, DetectorError>
pub fn get_probabilities(&mut self) -> Result<Vec<Language>, DetectorError>
Gets detailed language probabilities for the accumulated text.
Returns all languages with probability above the threshold, sorted by probability descending.
§Returns
A vector of Language structs with language codes and probabilities.
§Errors
Returns DetectorError::NoFeatures if no detectable n-grams are found.
§Examples
use langdetect_rs::detector_factory::DetectorFactory;
let factory = DetectorFactory::default().build();
let mut detector = factory.create(None);
detector.append("Hello world!");
let probabilities = detector.get_probabilities().unwrap();
for lang in probabilities {
println!("{}: {:.3}", lang.lang.unwrap_or_default(), lang.prob);
}
Examples found in repository?
3 fn main() {
4 let factory = DetectorFactory::default().build();
5
6 // let mut detector = factory.create(None);
7 match factory.detect("War doesn't show who's right, just who's left.", None) {
8 Ok(lang) => println!("Detected language: {}", lang),
9 Err(e) => println!("Detection error: {:?}", e),
10 }
11
12 // let mut detector = factory.create(None);
13 match factory.detect("Ein, zwei, drei, vier", None) {
14 Ok(lang) => println!("Detected language: {}", lang),
15 Err(e) => println!("Detection error: {:?}", e),
16 }
17
18 match factory.get_probabilities("Otec matka syn.", None) {
19 Ok(probs) => println!("Language probabilities: {:?}", probs),
20 Err(e) => println!("Detection error: {:?}", e),
21 }
22
23 // For reproducibility use a fixed seed within explicitly defined detector
24 let mut detector = factory.create(None);
25 detector.seed = Some(42);
26 detector.append("Otec matka syn.");
27 match detector.get_probabilities() {
28 Ok(probs) => println!("Language probabilities with seed: {:?}", probs),
29 Err(e) => println!("Detection error: {:?}", e),
30 }
31
32 // Or you can set the seed for the factory itself and it will be inherited by detectors
33 let factory_with_seed = DetectorFactory::default()
34 .with_seed(Some(43))
35 .build();
36 match factory_with_seed.get_probabilities("Otec matka syn.", None) {
37 Ok(probs) => println!("Language probabilities with seed: {:?}", probs),
38 Err(e) => println!("Detection error: {:?}", e),
39 }
40 }