langdetect_rs/detector_factory.rs
1use std::fs;
2use std::path::Path;
3use serde_json;
4use std::collections::HashMap;
5use crate::utils::lang_profile::LangProfile;
6use crate::detector::{Detector, DetectorError};
7use crate::language::Language;
8use crate::utils::lang_profile::LangProfileJson;
9
/// Errors that can occur when working with `DetectorFactory`.
///
/// The type implements [`std::error::Error`], so it can be propagated with `?`
/// into `Box<dyn std::error::Error>` or wrapped by error-handling crates.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DetectorFactoryError {
    /// Attempted to add a language profile that already exists.
    /// Carries the offending language identifier.
    DuplicatedLanguage(String),
    /// At least 2 languages are required for detection.
    NotEnoughProfiles,
}

impl std::fmt::Display for DetectorFactoryError {
    /// Writes the human-readable message for the error variant.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            DetectorFactoryError::DuplicatedLanguage(lang) => {
                write!(f, "Duplicated language profile: {}", lang)
            }
            DetectorFactoryError::NotEnoughProfiles => {
                write!(f, "Two languages at least are required")
            }
        }
    }
}

// No underlying cause is wrapped, so the default `source()` (None) is correct.
impl std::error::Error for DetectorFactoryError {}
31
/// Factory for creating language detectors with pre-loaded language profiles.
///
/// The `DetectorFactory` owns the n-gram probability table and the list of
/// languages it covers, and hands out `Detector` instances built from copies
/// of that data.
///
/// # Examples
///
/// ```rust
/// use langdetect_rs::detector_factory::DetectorFactory;
///
/// // Create factory with built-in profiles
/// let factory = DetectorFactory::default().build();
///
/// // Create a detector
/// let detector = factory.create(None);
/// ```
#[derive(Debug, Clone)]
pub struct DetectorFactory {
    /// Word-to-language probability mapping for all loaded languages.
    /// Each vector holds one probability slot per language.
    pub word_lang_prob_map: HashMap<String, Vec<f64>>,
    /// List of language identifiers in the same order as probability vectors.
    pub langlist: Vec<String>,
    /// Optional seed for reproducible randomization.
    pub seed: Option<u64>,
}
57
58impl DetectorFactory {
59 /// Creates a new DetectorFactory builder.
60 ///
61 /// Use the builder pattern to configure the factory before calling `build()`.
62 ///
63 /// # Examples
64 ///
65 /// ```rust
66 /// use langdetect_rs::detector_factory::DetectorFactory;
67 ///
68 /// let factory = DetectorFactory::new()
69 /// .with_seed(Some(42))
70 /// .build();
71 /// ```
72 pub fn new() -> DetectorFactoryBuilder {
73 DetectorFactoryBuilder {
74 factory: DetectorFactory {
75 word_lang_prob_map: HashMap::new(),
76 langlist: Vec::new(),
77 seed: None,
78 },
79 }
80 }
81
82 /// Creates a DetectorFactoryBuilder with all built-in language profiles loaded.
83 ///
84 /// This method loads the 55 built-in language profiles from the crate's
85 /// profiles directory and returns a builder that can be further re-configured.
86 /// The profiles are cached for performance.
87 ///
88 /// # Example
89 ///
90 /// ```rust
91 /// use langdetect_rs::detector_factory::DetectorFactory;
92 ///
93 /// let factory = DetectorFactory::default()
94 /// .with_seed(Some(42))
95 /// .build();
96 /// ```
97 pub fn default() -> DetectorFactoryBuilder {
98 use std::sync::Mutex;
99 use lazy_static::lazy_static;
100 lazy_static! {
101 static ref DEFAULT_FACTORY: Mutex<Option<DetectorFactory>> = Mutex::new(None);
102 }
103 {
104 let factory_guard = DEFAULT_FACTORY.lock().unwrap();
105 if let Some(factory) = &*factory_guard {
106 return DetectorFactoryBuilder { factory: factory.clone() };
107 }
108 }
109 let mut factory = DetectorFactory::new().build();
110 // Try to load profiles from crate-level "profiles" folder
111 let crate_profiles = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("profiles");
112
113 let _ = factory.load_profile(&crate_profiles);
114 // Cache the factory for future use
115 let mut factory_guard = DEFAULT_FACTORY.lock().unwrap();
116 *factory_guard = Some(factory.clone());
117 DetectorFactoryBuilder { factory }
118 }
119
120 /// Returns the path to the default language profiles directory.
121 ///
122 /// This method provides the path to the built-in language profile files that ship
123 /// with the crate. End-users can use this path to load default profiles when
124 /// extending or customizing the factory.
125 ///
126 /// Note: This path is only accessible when the crate is used as a source dependency
127 /// or when running from the crate's directory. When used as a published dependency,
128 /// the profiles may not be available as filesystem files.
129 ///
130 /// # Returns
131 /// A PathBuf pointing to the default profiles directory.
132 ///
133 /// # Example
134 ///
135 /// ```rust
136 /// use langdetect_rs::detector_factory::DetectorFactory;
137 /// use langdetect_rs::utils::lang_profile::{LangProfileJson, LangProfile};
138 ///
139 /// // Get path to default profiles
140 /// let profiles_path = DetectorFactory::get_default_profiles_path();
141 /// println!("Default profiles are located at: {:?}", profiles_path);
142 ///
143 /// // Load a specific profile
144 /// let en_profile = LangProfileJson::new_from_file(profiles_path.join("en")).unwrap();
145 /// let profile = LangProfile::from_json(en_profile).unwrap();
146 ///
147 /// // Add to custom factory
148 /// let mut factory = DetectorFactory::new().build();
149 /// factory.add_profile(profile, 0, 1).unwrap();
150 /// ```
151 pub fn get_default_profiles_path() -> std::path::PathBuf {
152 std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("profiles")
153 }
154
155 /// Clears all loaded language profiles and mappings.
156 pub fn clear(&mut self) {
157 self.langlist.clear();
158 self.word_lang_prob_map.clear();
159 }
160
161 /// Sets the randomization seed for reproducible results.
162 ///
163 /// # Arguments
164 /// * `seed` - The seed value to use for randomization.
165 pub fn set_seed(&mut self, seed: u64) {
166 self.seed = Some(seed);
167 }
168
169 /// Returns a list of all loaded language identifiers.
170 ///
171 /// # Returns
172 /// A vector of language codes (ISO 639-1) in the order they were loaded.
173 pub fn get_lang_list(&self) -> Vec<String> {
174 self.langlist.clone()
175 }
176
177 /// Creates a new Detector instance with the current profiles.
178 ///
179 /// # Arguments
180 /// * `alpha` - Optional alpha smoothing parameter (default: 0.5).
181 ///
182 /// # Returns
183 /// A configured Detector ready for language detection.
184 pub fn create(&self, alpha: Option<f64>) -> Detector {
185 let mut detector = Detector::new(
186 self.word_lang_prob_map.clone(),
187 self.langlist.clone(),
188 self.seed,
189 );
190 if let Some(a) = alpha {
191 detector.alpha = a;
192 }
193 detector
194 }
195
196 /// Overrides an existing language profile at the specified index.
197 ///
198 /// This is an internal method used during profile loading.
199 ///
200 /// # Arguments
201 /// * `profile` - The language profile to add.
202 /// * `index` - The index in the language list.
203 /// * `langsize` - Total number of languages.
204 pub fn override_profile(&mut self, profile: LangProfile, index: usize, langsize: usize) -> Result<(), DetectorFactoryError> {
205 let lang = profile.name.clone().unwrap();
206 self.langlist.push(lang.clone());
207 for (word, &count) in profile.freq.iter() {
208 if !self.word_lang_prob_map.contains_key(word) {
209 self.word_lang_prob_map.insert(word.clone(), vec![0.0; langsize]);
210 }
211 let length = word.chars().count();
212 if length >= 1 && length <= 3 {
213 let prob = count as f64 / profile.n_words[length - 1] as f64;
214 if let Some(vec) = self.word_lang_prob_map.get_mut(word) {
215 vec[index] = prob;
216 }
217 }
218 }
219 Ok(())
220 }
221
222 /// Adds a new language profile to the factory.
223 ///
224 /// # Arguments
225 /// * `profile` - The language profile to add.
226 /// * `index` - The index position for this language.
227 /// * `langsize` - Total number of languages in the profile set.
228 ///
229 /// # Errors
230 /// Returns `DetectorFactoryError::DuplicatedLanguage` if the language already exists.
231 pub fn add_profile(&mut self, profile: LangProfile, index: usize, langsize: usize) -> Result<(), DetectorFactoryError> {
232 let lang = profile.name.clone().unwrap();
233 if self.langlist.contains(&lang) {
234 return Err(DetectorFactoryError::DuplicatedLanguage(lang));
235 }
236 self.override_profile(profile, index, langsize)
237 }
238
239 /// Removes a language profile from the factory.
240 ///
241 /// # Arguments
242 /// * `lang` - The language code to remove.
243 ///
244 /// # Errors
245 /// Returns `DetectorFactoryError::DuplicatedLanguage` if the language doesn't exist.
246 pub fn delete_profile(&mut self, lang: &str) -> Result<(), DetectorFactoryError> {
247 let pos = self.langlist.iter().position(|l| l == lang);
248 if let Some(index) = pos {
249 self.langlist.remove(index);
250 // Remove the language's probabilities from word_lang_prob_map
251 for vec in self.word_lang_prob_map.values_mut() {
252 if vec.len() > index {
253 vec.remove(index);
254 }
255 }
256 Ok(())
257 } else {
258 Err(DetectorFactoryError::DuplicatedLanguage(lang.to_string()))
259 }
260 }
261
262 /// Loads language profiles from JSON strings.
263 ///
264 /// # Arguments
265 /// * `json_profiles` - Array of JSON strings representing language profiles.
266 ///
267 /// # Errors
268 /// Returns `DetectorFactoryError::NotEnoughProfiles` if fewer than 2 profiles provided.
269 pub fn load_json_profile(&mut self, json_profiles: &[&str]) -> Result<(), DetectorFactoryError> {
270 let langsize = json_profiles.len();
271 if langsize < 2 {
272 return Err(DetectorFactoryError::NotEnoughProfiles);
273 }
274 let mut index = 0;
275 for json_profile in json_profiles {
276 let json_data: LangProfileJson = serde_json::from_str(json_profile)
277 .map_err(|_| DetectorFactoryError::NotEnoughProfiles)?;
278 let profile = LangProfile {
279 name: Some(json_data.name),
280 freq: json_data.freq,
281 n_words: {
282 let mut arr = [0; 3];
283 for (i, v) in json_data.n_words.iter().enumerate().take(3) {
284 arr[i] = *v;
285 }
286 arr
287 },
288 };
289 self.add_profile(profile, index, langsize)?;
290 index += 1;
291 }
292 Ok(())
293 }
294
295 /// Shortcut method to detect language from text in one call.
296 ///
297 /// # Arguments
298 /// * `text` - The text to analyze.
299 /// * `alpha` - Optional alpha smoothing parameter.
300 ///
301 /// # Returns
302 /// The detected language code or an error.
303 ///
304 /// # Example
305 ///
306 /// ```rust
307 /// use langdetect_rs::detector_factory::DetectorFactory;
308 ///
309 /// let factory = DetectorFactory::default().build();
310 /// let result = factory.detect("Hello world!", None);
311 /// ```
312 pub fn detect(&self, text: &str, alpha: Option<f64>) -> Result<String, DetectorError> {
313 let mut detector = self.create(alpha);
314 detector.append(text);
315 detector.detect()
316 }
317
318 /// Shortcut method to get language probabilities from text in one call.
319 ///
320 /// # Arguments
321 /// * `text` - The text to analyze.
322 /// * `alpha` - Optional alpha smoothing parameter.
323 ///
324 /// # Returns
325 /// A vector of languages with their probabilities, sorted by probability descending.
326 ///
327 /// # Example
328 ///
329 /// ```rust
330 /// use langdetect_rs::detector_factory::DetectorFactory;
331 ///
332 /// let factory = DetectorFactory::default().build();
333 /// let result = factory.get_probabilities("Hello world!", None);
334 /// ```
335 pub fn get_probabilities(&self, text: &str, alpha: Option<f64>) -> Result<Vec<Language>, DetectorError> {
336 let mut detector = self.create(alpha);
337 detector.append(text);
338 detector.get_probabilities()
339 }
340
341 /// Loads all language profiles from a directory of JSON files.
342 ///
343 /// # Arguments
344 /// * `profile_directory` - Path to directory containing JSON profile files.
345 ///
346 /// # Returns
347 /// Ok(()) on success, or an error string on failure.
348 ///
349 /// # Example
350 ///
351 /// ```rust
352 /// use langdetect_rs::detector_factory::DetectorFactory;
353 ///
354 /// let mut factory = DetectorFactory::new().build();
355 /// factory.load_profile("profiles/").unwrap();
356 /// ```
357 pub fn load_profile<P: AsRef<Path>>(&mut self, profile_directory: P) -> Result<(), String> {
358 let dir = profile_directory.as_ref();
359 let entries = fs::read_dir(dir).map_err(|e| format!("Failed to read profile directory: {}", e))?;
360 let mut json_profiles = Vec::new();
361 for entry in entries {
362 let entry = entry.map_err(|e| format!("Failed to read entry: {}", e))?;
363 let path = entry.path();
364 if path.is_file() {
365 let content = fs::read_to_string(&path)
366 .map_err(|e| format!("Failed to read file {:?}: {}", path, e))?;
367 json_profiles.push(content);
368 }
369 }
370 let json_refs: Vec<&str> = json_profiles.iter().map(|s| s.as_str()).collect();
371 self.load_json_profile(&json_refs)
372 .map_err(|e| format!("Failed to parse JSON profiles: {:?}", e))?;
373 Ok(())
374 }
375}
376
/// Builder for `DetectorFactory` with fluent setters.
///
/// A builder is obtained from `DetectorFactory::new()` (empty factory) or
/// `DetectorFactory::default()` (factory with the built-in profiles). Each
/// `with_*` setter replaces one field of the wrapped factory and returns the
/// builder, and `build()` hands the factory back.
///
/// # Examples
///
/// ```rust
/// use langdetect_rs::detector_factory::DetectorFactory;
///
/// let factory = DetectorFactory::new()
///     .with_langlist(vec!["en".to_string(), "fr".to_string()])
///     .with_seed(Some(42))
///     .build();
/// ```
pub struct DetectorFactoryBuilder {
    // The factory under construction; returned as-is by `build()`.
    factory: DetectorFactory,
}
395
396impl DetectorFactoryBuilder {
397 /// Set the word language probability map.
398 ///
399 /// # Arguments
400 /// * `word_lang_prob_map` - A HashMap of word to language probabilities.
401 ///
402 /// # Example
403 /// ```
404 /// use std::collections::HashMap;
405 /// use langdetect_rs::detector_factory::DetectorFactory;
406 /// let mut word_lang_prob_map = HashMap::new();
407 /// word_lang_prob_map.insert("hello".to_string(), vec![0.5, 0.3]);
408 /// let builder = DetectorFactory::new().with_word_lang_prob_map(word_lang_prob_map);
409 /// ```
410 pub fn with_word_lang_prob_map(mut self, word_lang_prob_map: HashMap<String, Vec<f64>>) -> Self {
411 self.factory.word_lang_prob_map = word_lang_prob_map;
412 self
413 }
414
415 /// Set the language list.
416 ///
417 /// # Arguments
418 /// * `langlist` - A vector of language names.
419 ///
420 /// # Example
421 /// ```
422 /// use langdetect_rs::detector_factory::DetectorFactory;
423 /// let builder = DetectorFactory::new().with_langlist(vec!["en".to_string(), "fr".to_string()]);
424 /// ```
425 pub fn with_langlist(mut self, langlist: Vec<String>) -> Self {
426 self.factory.langlist = langlist;
427 self
428 }
429
430 /// Set the seed for randomization.
431 ///
432 /// # Arguments
433 /// * `seed` - An optional u64 seed value.
434 ///
435 /// # Example
436 /// ```
437 /// use langdetect_rs::detector_factory::DetectorFactory;
438 /// let builder = DetectorFactory::new().with_seed(Some(42));
439 /// ```
440 pub fn with_seed(mut self, seed: Option<u64>) -> Self {
441 self.factory.seed = seed;
442 self
443 }
444
445 /// Builds the final `DetectorFactory` object with the configured properties.
446 ///
447 /// # Returns
448 /// The fully constructed `DetectorFactory` object.
449 ///
450 /// # Example
451 /// ```
452 /// use langdetect_rs::detector_factory::DetectorFactory;
453 /// let factory = DetectorFactory::new().with_seed(Some(123)).build();
454 /// ```
455 pub fn build(self) -> DetectorFactory {
456 self.factory
457 }
458}