Detector

Struct Detector 

Source
/// Core language detection engine.
///
/// Performs language identification on accumulated text using n-gram
/// analysis and Bayesian probability estimation.
pub struct Detector {
    /// Word-to-language probability mapping.
    pub word_lang_prob_map: HashMap<String, Vec<f64>>,
    /// List of language identifiers.
    pub langlist: Vec<String>,
    /// Optional seed for reproducible randomization.
    pub seed: Option<u64>,
    /// Accumulated text for analysis.
    pub text: String,
    /// Current language probability estimates.
    pub langprob: Option<Vec<f64>>,
    /// Alpha smoothing parameter for probability estimation.
    pub alpha: f64,
    /// Number of trials for the EM algorithm.
    pub n_trial: usize,
    /// Maximum text length to process.
    pub max_text_length: usize,
    /// Prior probabilities for languages (optional).
    pub prior_map: Option<Vec<f64>>,
    /// Whether to enable verbose logging.
    pub verbose: bool,
}
Expand description

Core language detection engine.

The Detector performs the actual language identification using n-gram analysis and Bayesian probability estimation. It uses an iterative expectation-maximization algorithm to determine the most likely language for a given text.

§Algorithm Overview

  1. Extract n-grams (1-3 characters) from the input text
  2. Look up probabilities for each n-gram across all languages
  3. Use an iterative EM algorithm to estimate language probabilities
  4. Return the language with the highest probability

§Examples

use langdetect_rs::detector_factory::DetectorFactory;

let factory = DetectorFactory::default().build();
let mut detector = factory.create(None);
detector.append("Hello world!");
let language = detector.detect().unwrap();

Fields§

§word_lang_prob_map: HashMap<String, Vec<f64>>

Mapping from each n-gram ("word") to its probabilities across the languages.

§langlist: Vec<String>

List of language identifiers.

§seed: Option<u64>

Optional seed for reproducible randomization.

§text: String

Accumulated text for analysis.

§langprob: Option<Vec<f64>>

Current language probability estimates.

§alpha: f64

Alpha smoothing parameter for probability estimation.

§n_trial: usize

Number of trials for the EM algorithm.

§max_text_length: usize

Maximum text length to process.

§prior_map: Option<Vec<f64>>

Prior probabilities for languages (optional).

§verbose: bool

Whether to enable verbose logging.

Implementations§

Source§

impl Detector

Source

pub const ALPHA_DEFAULT: f64 = 0.5f64

Default alpha smoothing parameter.

Source

pub const ALPHA_WIDTH: f64 = 0.050000000000000003f64

Width of alpha variation during randomization.

Source

pub const ITERATION_LIMIT: usize = 1_000usize

Maximum iterations for the EM algorithm.

Source

pub const PROB_THRESHOLD: f64 = 0.10000000000000001f64

Minimum probability threshold for reporting languages.

Source

pub const CONV_THRESHOLD: f64 = 0.99999000000000004f64

Convergence threshold for the EM algorithm.

Source

pub const BASE_FREQ: f64 = 1.0E+4f64

Base frequency for probability calculations.

Source

pub const UNKNOWN_LANG: &'static str = "unknown"

Language identifier for unknown/undetected languages.

Source

pub fn new( word_lang_prob_map: HashMap<String, Vec<f64>>, langlist: Vec<String>, seed: Option<u64>, ) -> Self

Creates a new Detector with the given language profiles.

§Arguments
  • word_lang_prob_map - Pre-computed word-to-language probability mapping.
  • langlist - List of language identifiers.
  • seed - Optional seed for reproducible randomization.
Source

pub fn append(&mut self, text: &str)

Appends text to the detector for analysis.

The text is preprocessed to remove URLs and emails and to normalize whitespace. Vietnamese text is also normalized for better detection.

§Arguments
  • text - The text to append for language detection.
§Examples
use langdetect_rs::detector_factory::DetectorFactory;

let factory = DetectorFactory::default().build();
let mut detector = factory.create(None);
detector.append("Hello world!");
Examples found in repository?
examples/simple/main.rs (line 26)
3fn main() {
4    let factory = DetectorFactory::default().build();
5
6    // let mut detector = factory.create(None);
7    match factory.detect("War doesn't show who's right, just who's left.", None) {
8        Ok(lang) => println!("Detected language: {}", lang),
9        Err(e) => println!("Detection error: {:?}", e),
10    }
11
12    // let mut detector = factory.create(None);
13    match factory.detect("Ein, zwei, drei, vier", None) {
14        Ok(lang) => println!("Detected language: {}", lang),
15        Err(e) => println!("Detection error: {:?}", e),
16    }
17
18    match factory.get_probabilities("Otec matka syn.", None) {
19        Ok(probs) => println!("Language probabilities: {:?}", probs),
20        Err(e) => println!("Detection error: {:?}", e),
21    }
22
23    // For reproducibility use a fixed seed within explicitly defined detector
24    let mut detector = factory.create(None);
25    detector.seed = Some(42);
26    detector.append("Otec matka syn.");
27    match detector.get_probabilities() {
28        Ok(probs) => println!("Language probabilities with seed: {:?}", probs),
29        Err(e) => println!("Detection error: {:?}", e),
30    }
31
32    // Or you can set the seed for the factory itself and it will be inherited by detectors
33    let factory_with_seed = DetectorFactory::default()
34        .with_seed(Some(43))
35        .build();
36    match factory_with_seed.get_probabilities("Otec matka syn.", None) {
37        Ok(probs) => println!("Language probabilities with seed: {:?}", probs),
38        Err(e) => println!("Detection error: {:?}", e),
39    }
40}
Source

pub fn detect(&mut self) -> Result<String, DetectorError>

Performs language detection on the accumulated text.

§Returns

The detected language code, or “unknown” (the value of Detector::UNKNOWN_LANG) if no language is detected.

§Errors

Returns DetectorError::NoFeatures if no detectable n-grams are found.

§Examples
use langdetect_rs::detector_factory::DetectorFactory;

let factory = DetectorFactory::default().build();
let mut detector = factory.create(None);
detector.append("Bonjour le monde!");
let language = detector.detect().unwrap();
assert_eq!(language, "fr");
Source

pub fn get_probabilities(&mut self) -> Result<Vec<Language>, DetectorError>

Gets detailed language probabilities for the accumulated text.

Returns all languages with probability above the reporting threshold (see PROB_THRESHOLD), sorted by probability in descending order.

§Returns

A vector of Language structs with language codes and probabilities.

§Errors

Returns DetectorError::NoFeatures if no detectable n-grams are found.

§Examples
use langdetect_rs::detector_factory::DetectorFactory;

let factory = DetectorFactory::default().build();
let mut detector = factory.create(None);
detector.append("Hello world!");
let probabilities = detector.get_probabilities().unwrap();
for lang in probabilities {
    println!("{}: {:.3}", lang.lang.unwrap_or_default(), lang.prob);
}
Examples found in repository?
examples/simple/main.rs (line 27)
3fn main() {
4    let factory = DetectorFactory::default().build();
5
6    // let mut detector = factory.create(None);
7    match factory.detect("War doesn't show who's right, just who's left.", None) {
8        Ok(lang) => println!("Detected language: {}", lang),
9        Err(e) => println!("Detection error: {:?}", e),
10    }
11
12    // let mut detector = factory.create(None);
13    match factory.detect("Ein, zwei, drei, vier", None) {
14        Ok(lang) => println!("Detected language: {}", lang),
15        Err(e) => println!("Detection error: {:?}", e),
16    }
17
18    match factory.get_probabilities("Otec matka syn.", None) {
19        Ok(probs) => println!("Language probabilities: {:?}", probs),
20        Err(e) => println!("Detection error: {:?}", e),
21    }
22
23    // For reproducibility use a fixed seed within explicitly defined detector
24    let mut detector = factory.create(None);
25    detector.seed = Some(42);
26    detector.append("Otec matka syn.");
27    match detector.get_probabilities() {
28        Ok(probs) => println!("Language probabilities with seed: {:?}", probs),
29        Err(e) => println!("Detection error: {:?}", e),
30    }
31
32    // Or you can set the seed for the factory itself and it will be inherited by detectors
33    let factory_with_seed = DetectorFactory::default()
34        .with_seed(Some(43))
35        .build();
36    match factory_with_seed.get_probabilities("Otec matka syn.", None) {
37        Ok(probs) => println!("Language probabilities with seed: {:?}", probs),
38        Err(e) => println!("Detection error: {:?}", e),
39    }
40}

Auto Trait Implementations§

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
Source§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

Source§

fn vzip(self) -> V