langdetect_rs/
detector_factory.rs

1use std::fs;
2use std::path::Path;
3use serde_json;
4use std::collections::HashMap;
5use crate::utils::lang_profile::LangProfile;
6use crate::detector::{Detector, DetectorError};
7use crate::language::Language;
8use crate::utils::lang_profile::LangProfileJson;
9
/// Errors that can occur when working with `DetectorFactory`.
///
/// The type implements [`std::error::Error`], so it can be boxed as
/// `Box<dyn std::error::Error>` and propagated with `?` alongside other errors.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DetectorFactoryError {
    /// Attempted to add a language profile that already exists.
    ///
    /// Carries the name of the offending language. `delete_profile` also
    /// returns this variant when the requested language is not loaded.
    DuplicatedLanguage(String),
    /// At least 2 languages are required for detection.
    ///
    /// `load_json_profile` also returns this variant when a profile cannot be
    /// parsed as JSON.
    NotEnoughProfiles,
}

impl std::fmt::Display for DetectorFactoryError {
    /// Writes the human-readable message for each variant.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            DetectorFactoryError::DuplicatedLanguage(lang) => {
                write!(f, "Duplicated language profile: {}", lang)
            }
            DetectorFactoryError::NotEnoughProfiles => {
                write!(f, "Two languages at least are required")
            }
        }
    }
}

// No underlying cause is wrapped, so the default `source()` (None) is correct.
impl std::error::Error for DetectorFactoryError {}
31
/// Factory for creating language detectors with pre-loaded language profiles.
///
/// The DetectorFactory manages a collection of language profiles and provides
/// methods to create Detector instances for language identification.
///
/// The type is `Clone` and `Debug`; cloning copies the whole probability map.
///
/// # Examples
///
/// ```rust
/// use langdetect_rs::detector_factory::DetectorFactory;
///
/// // Create factory with built-in profiles
/// let factory = DetectorFactory::default().build();
///
/// // Create a detector
/// let detector = factory.create(None);
/// ```
#[derive(Debug, Clone)]
pub struct DetectorFactory {
    /// Word-to-language probability mapping for all loaded languages.
    ///
    /// Each value holds one probability slot per language; slot `i` belongs to
    /// the language that was loaded with index `i`.
    pub word_lang_prob_map: HashMap<String, Vec<f64>>,
    /// List of language identifiers in the same order as probability vectors.
    pub langlist: Vec<String>,
    /// Optional seed for reproducible randomization.
    ///
    /// Forwarded unchanged to every detector built by `create`.
    pub seed: Option<u64>,
}
57
58impl DetectorFactory {
59    /// Creates a new DetectorFactory builder.
60    ///
61    /// Use the builder pattern to configure the factory before calling `build()`.
62    ///
63    /// # Examples
64    ///
65    /// ```rust
66    /// use langdetect_rs::detector_factory::DetectorFactory;
67    ///
68    /// let factory = DetectorFactory::new()
69    ///     .with_seed(Some(42))
70    ///     .build();
71    /// ```
72    pub fn new() -> DetectorFactoryBuilder {
73        DetectorFactoryBuilder {
74            factory: DetectorFactory {
75                word_lang_prob_map: HashMap::new(),
76                langlist: Vec::new(),
77                seed: None,
78            },
79        }
80    }
81
82    /// Creates a DetectorFactoryBuilder with all built-in language profiles loaded.
83    ///
84    /// This method loads the 55 built-in language profiles from the crate's
85    /// profiles directory and returns a builder that can be further re-configured.
86    /// The profiles are cached for performance.
87    ///
88    /// # Example
89    ///
90    /// ```rust
91    /// use langdetect_rs::detector_factory::DetectorFactory;
92    ///
93    /// let factory = DetectorFactory::default()
94    ///     .with_seed(Some(42))
95    ///     .build();
96    /// ```
97    pub fn default() -> DetectorFactoryBuilder {
98        use std::sync::Mutex;
99        use lazy_static::lazy_static;
100        lazy_static! {
101            static ref DEFAULT_FACTORY: Mutex<Option<DetectorFactory>> = Mutex::new(None);
102        }
103        {
104            let factory_guard = DEFAULT_FACTORY.lock().unwrap();
105            if let Some(factory) = &*factory_guard {
106                return DetectorFactoryBuilder { factory: factory.clone() };
107            }
108        }
109        let mut factory = DetectorFactory::new().build();
110        // Try to load profiles from crate-level "profiles" folder
111        let crate_profiles = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("profiles");
112
113        let _ = factory.load_profile(&crate_profiles);
114        // Cache the factory for future use
115        let mut factory_guard = DEFAULT_FACTORY.lock().unwrap();
116        *factory_guard = Some(factory.clone());
117        DetectorFactoryBuilder { factory }
118    }
119
120    /// Returns the path to the default language profiles directory.
121    ///
122    /// This method provides the path to the built-in language profile files that ship
123    /// with the crate. End-users can use this path to load default profiles when
124    /// extending or customizing the factory.
125    ///
126    /// Note: This path is only accessible when the crate is used as a source dependency
127    /// or when running from the crate's directory. When used as a published dependency,
128    /// the profiles may not be available as filesystem files.
129    ///
130    /// # Returns
131    /// A PathBuf pointing to the default profiles directory.
132    ///
133    /// # Example
134    ///
135    /// ```rust
136    /// use langdetect_rs::detector_factory::DetectorFactory;
137    /// use langdetect_rs::utils::lang_profile::{LangProfileJson, LangProfile};
138    ///
139    /// // Get path to default profiles
140    /// let profiles_path = DetectorFactory::get_default_profiles_path();
141    /// println!("Default profiles are located at: {:?}", profiles_path);
142    ///
143    /// // Load a specific profile
144    /// let en_profile = LangProfileJson::new_from_file(profiles_path.join("en")).unwrap();
145    /// let profile = LangProfile::from_json(en_profile).unwrap();
146    ///
147    /// // Add to custom factory
148    /// let mut factory = DetectorFactory::new().build();
149    /// factory.add_profile(profile, 0, 1).unwrap();
150    /// ```
151    pub fn get_default_profiles_path() -> std::path::PathBuf {
152        std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("profiles")
153    }
154
155    /// Clears all loaded language profiles and mappings.
156    pub fn clear(&mut self) {
157        self.langlist.clear();
158        self.word_lang_prob_map.clear();
159    }
160
161    /// Sets the randomization seed for reproducible results.
162    ///
163    /// # Arguments
164    /// * `seed` - The seed value to use for randomization.
165    pub fn set_seed(&mut self, seed: u64) {
166        self.seed = Some(seed);
167    }
168
169    /// Returns a list of all loaded language identifiers.
170    ///
171    /// # Returns
172    /// A vector of language codes (ISO 639-1) in the order they were loaded.
173    pub fn get_lang_list(&self) -> Vec<String> {
174        self.langlist.clone()
175    }
176
177    /// Creates a new Detector instance with the current profiles.
178    ///
179    /// # Arguments
180    /// * `alpha` - Optional alpha smoothing parameter (default: 0.5).
181    ///
182    /// # Returns
183    /// A configured Detector ready for language detection.
184    pub fn create(&self, alpha: Option<f64>) -> Detector {
185        let mut detector = Detector::new(
186            self.word_lang_prob_map.clone(),
187            self.langlist.clone(),
188            self.seed,
189        );
190        if let Some(a) = alpha {
191            detector.alpha = a;
192        }
193        detector
194    }
195
196    /// Overrides an existing language profile at the specified index.
197    ///
198    /// This is an internal method used during profile loading.
199    ///
200    /// # Arguments
201    /// * `profile` - The language profile to add.
202    /// * `index` - The index in the language list.
203    /// * `langsize` - Total number of languages.
204    pub fn override_profile(&mut self, profile: LangProfile, index: usize, langsize: usize) -> Result<(), DetectorFactoryError> {
205        let lang = profile.name.clone().unwrap();
206        self.langlist.push(lang.clone());
207        for (word, &count) in profile.freq.iter() {
208            if !self.word_lang_prob_map.contains_key(word) {
209                self.word_lang_prob_map.insert(word.clone(), vec![0.0; langsize]);
210            }
211            let length = word.chars().count();
212            if length >= 1 && length <= 3 {
213                let prob = count as f64 / profile.n_words[length - 1] as f64;
214                if let Some(vec) = self.word_lang_prob_map.get_mut(word) {
215                    vec[index] = prob;
216                }
217            }
218        }
219        Ok(())
220    }
221
222    /// Adds a new language profile to the factory.
223    ///
224    /// # Arguments
225    /// * `profile` - The language profile to add.
226    /// * `index` - The index position for this language.
227    /// * `langsize` - Total number of languages in the profile set.
228    ///
229    /// # Errors
230    /// Returns `DetectorFactoryError::DuplicatedLanguage` if the language already exists.
231    pub fn add_profile(&mut self, profile: LangProfile, index: usize, langsize: usize) -> Result<(), DetectorFactoryError> {
232        let lang = profile.name.clone().unwrap();
233        if self.langlist.contains(&lang) {
234            return Err(DetectorFactoryError::DuplicatedLanguage(lang));
235        }
236        self.override_profile(profile, index, langsize)
237    }
238
239    /// Removes a language profile from the factory.
240    ///
241    /// # Arguments
242    /// * `lang` - The language code to remove.
243    ///
244    /// # Errors
245    /// Returns `DetectorFactoryError::DuplicatedLanguage` if the language doesn't exist.
246    pub fn delete_profile(&mut self, lang: &str) -> Result<(), DetectorFactoryError> {
247        let pos = self.langlist.iter().position(|l| l == lang);
248        if let Some(index) = pos {
249            self.langlist.remove(index);
250            // Remove the language's probabilities from word_lang_prob_map
251            for vec in self.word_lang_prob_map.values_mut() {
252                if vec.len() > index {
253                    vec.remove(index);
254                }
255            }
256            Ok(())
257        } else {
258            Err(DetectorFactoryError::DuplicatedLanguage(lang.to_string()))
259        }
260    }
261
262    /// Loads language profiles from JSON strings.
263    ///
264    /// # Arguments
265    /// * `json_profiles` - Array of JSON strings representing language profiles.
266    ///
267    /// # Errors
268    /// Returns `DetectorFactoryError::NotEnoughProfiles` if fewer than 2 profiles provided.
269    pub fn load_json_profile(&mut self, json_profiles: &[&str]) -> Result<(), DetectorFactoryError> {
270        let langsize = json_profiles.len();
271        if langsize < 2 {
272            return Err(DetectorFactoryError::NotEnoughProfiles);
273        }
274        let mut index = 0;
275        for json_profile in json_profiles {
276            let json_data: LangProfileJson = serde_json::from_str(json_profile)
277                .map_err(|_| DetectorFactoryError::NotEnoughProfiles)?;
278            let profile = LangProfile {
279                name: Some(json_data.name),
280                freq: json_data.freq,
281                n_words: {
282                    let mut arr = [0; 3];
283                    for (i, v) in json_data.n_words.iter().enumerate().take(3) {
284                        arr[i] = *v;
285                    }
286                    arr
287                },
288            };
289            self.add_profile(profile, index, langsize)?;
290            index += 1;
291        }
292        Ok(())
293    }
294
295    /// Shortcut method to detect language from text in one call.
296    ///
297    /// # Arguments
298    /// * `text` - The text to analyze.
299    /// * `alpha` - Optional alpha smoothing parameter.
300    ///
301    /// # Returns
302    /// The detected language code or an error.
303    ///
304    /// # Example
305    ///
306    /// ```rust
307    /// use langdetect_rs::detector_factory::DetectorFactory;
308    ///
309    /// let factory = DetectorFactory::default().build();
310    /// let result = factory.detect("Hello world!", None);
311    /// ```
312    pub fn detect(&self, text: &str, alpha: Option<f64>) -> Result<String, DetectorError> {
313        let mut detector = self.create(alpha);
314        detector.append(text);
315        detector.detect()
316    }
317
318    /// Shortcut method to get language probabilities from text in one call.
319    ///
320    /// # Arguments
321    /// * `text` - The text to analyze.
322    /// * `alpha` - Optional alpha smoothing parameter.
323    ///
324    /// # Returns
325    /// A vector of languages with their probabilities, sorted by probability descending.
326    ///
327    /// # Example
328    ///
329    /// ```rust
330    /// use langdetect_rs::detector_factory::DetectorFactory;
331    ///
332    /// let factory = DetectorFactory::default().build();
333    /// let result = factory.get_probabilities("Hello world!", None);
334    /// ```
335    pub fn get_probabilities(&self, text: &str, alpha: Option<f64>) -> Result<Vec<Language>, DetectorError> {
336        let mut detector = self.create(alpha);
337        detector.append(text);
338        detector.get_probabilities()
339    }
340
341    /// Loads all language profiles from a directory of JSON files.
342    ///
343    /// # Arguments
344    /// * `profile_directory` - Path to directory containing JSON profile files.
345    ///
346    /// # Returns
347    /// Ok(()) on success, or an error string on failure.
348    ///
349    /// # Example
350    ///
351    /// ```rust
352    /// use langdetect_rs::detector_factory::DetectorFactory;
353    ///
354    /// let mut factory = DetectorFactory::new().build();
355    /// factory.load_profile("profiles/").unwrap();
356    /// ```
357    pub fn load_profile<P: AsRef<Path>>(&mut self, profile_directory: P) -> Result<(), String> {
358        let dir = profile_directory.as_ref();
359        let entries = fs::read_dir(dir).map_err(|e| format!("Failed to read profile directory: {}", e))?;
360        let mut json_profiles = Vec::new();
361        for entry in entries {
362            let entry = entry.map_err(|e| format!("Failed to read entry: {}", e))?;
363            let path = entry.path();
364            if path.is_file() {
365                let content = fs::read_to_string(&path)
366                    .map_err(|e| format!("Failed to read file {:?}: {}", path, e))?;
367                json_profiles.push(content);
368            }
369        }
370        let json_refs: Vec<&str> = json_profiles.iter().map(|s| s.as_str()).collect();
371        self.load_json_profile(&json_refs)
372            .map_err(|e| format!("Failed to parse JSON profiles: {:?}", e))?;
373        Ok(())
374    }
375}
376
377/// Builder for `DetectorFactory` with fluent setters.
378///
379/// Provides a convenient way to configure a DetectorFactory before building it.
380///
381/// # Examples
382///
383/// ```rust
384/// use langdetect_rs::detector_factory::DetectorFactory;
385/// use std::collections::HashMap;
386///
387/// let factory = DetectorFactory::new()
388///     .with_langlist(vec!["en".to_string(), "fr".to_string()])
389///     .with_seed(Some(42))
390///     .build();
391/// ```
392pub struct DetectorFactoryBuilder {
393    factory: DetectorFactory,
394}
395
396impl DetectorFactoryBuilder {
397    /// Set the word language probability map.
398    ///
399    /// # Arguments
400    /// * `word_lang_prob_map` - A HashMap of word to language probabilities.
401    ///
402    /// # Example
403    /// ```
404    /// use std::collections::HashMap;
405    /// use langdetect_rs::detector_factory::DetectorFactory;
406    /// let mut word_lang_prob_map = HashMap::new();
407    /// word_lang_prob_map.insert("hello".to_string(), vec![0.5, 0.3]);
408    /// let builder = DetectorFactory::new().with_word_lang_prob_map(word_lang_prob_map);
409    /// ```
410    pub fn with_word_lang_prob_map(mut self, word_lang_prob_map: HashMap<String, Vec<f64>>) -> Self {
411        self.factory.word_lang_prob_map = word_lang_prob_map;
412        self
413    }
414
415    /// Set the language list.
416    ///
417    /// # Arguments
418    /// * `langlist` - A vector of language names.
419    ///
420    /// # Example
421    /// ```
422    /// use langdetect_rs::detector_factory::DetectorFactory;
423    /// let builder = DetectorFactory::new().with_langlist(vec!["en".to_string(), "fr".to_string()]);
424    /// ```
425    pub fn with_langlist(mut self, langlist: Vec<String>) -> Self {
426        self.factory.langlist = langlist;
427        self
428    }
429
430    /// Set the seed for randomization.
431    ///
432    /// # Arguments
433    /// * `seed` - An optional u64 seed value.
434    ///
435    /// # Example
436    /// ```
437    /// use langdetect_rs::detector_factory::DetectorFactory;
438    /// let builder = DetectorFactory::new().with_seed(Some(42));
439    /// ```
440    pub fn with_seed(mut self, seed: Option<u64>) -> Self {
441        self.factory.seed = seed;
442        self
443    }
444
445    /// Builds the final `DetectorFactory` object with the configured properties.
446    ///
447    /// # Returns
448    /// The fully constructed `DetectorFactory` object.
449    ///
450    /// # Example
451    /// ```
452    /// use langdetect_rs::detector_factory::DetectorFactory;
453    /// let factory = DetectorFactory::new().with_seed(Some(123)).build();
454    /// ```
455    pub fn build(self) -> DetectorFactory {
456        self.factory
457    }
458}