Skip to main content

jpx_core/extensions/
language.rs

1//! Language detection functions.
2
3use std::collections::HashSet;
4
5use serde_json::{Number, Value};
6
7use crate::functions::{Function, number_value};
8use crate::interpreter::SearchResult;
9use crate::registry::register_if_enabled;
10use crate::{Context, Runtime, arg, defn};
11
// =============================================================================
// detect_language(text) -> string (language name as whatlang renders it,
// i.e. the native name: "English", "Deutsch", ...)
// =============================================================================
15
16defn!(DetectLanguageFn, vec![arg!(string)], None);
17
18impl Function for DetectLanguageFn {
19    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
20        self.signature.validate(args, ctx)?;
21        let text = args[0].as_str().unwrap();
22
23        match whatlang::detect(text) {
24            Some(info) => {
25                let name = info.lang().to_string();
26                Ok(Value::String(name))
27            }
28            None => Ok(Value::Null),
29        }
30    }
31}
32
33// =============================================================================
34// detect_language_iso(text) -> string (ISO 639-3 code like "eng")
35// =============================================================================
36
37defn!(DetectLanguageIsoFn, vec![arg!(string)], None);
38
39impl Function for DetectLanguageIsoFn {
40    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
41        self.signature.validate(args, ctx)?;
42        let text = args[0].as_str().unwrap();
43
44        match whatlang::detect(text) {
45            Some(info) => {
46                let code = info.lang().code();
47                Ok(Value::String(code.to_string()))
48            }
49            None => Ok(Value::Null),
50        }
51    }
52}
53
54// =============================================================================
55// detect_script(text) -> string (script name like "Latin")
56// =============================================================================
57
58defn!(DetectScriptFn, vec![arg!(string)], None);
59
60impl Function for DetectScriptFn {
61    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
62        self.signature.validate(args, ctx)?;
63        let text = args[0].as_str().unwrap();
64
65        match whatlang::detect(text) {
66            Some(info) => {
67                let script = format!("{:?}", info.script());
68                Ok(Value::String(script))
69            }
70            None => Ok(Value::Null),
71        }
72    }
73}
74
75// =============================================================================
76// detect_language_confidence(text) -> number (0.0-1.0)
77// =============================================================================
78
79defn!(DetectLanguageConfidenceFn, vec![arg!(string)], None);
80
81impl Function for DetectLanguageConfidenceFn {
82    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
83        self.signature.validate(args, ctx)?;
84        let text = args[0].as_str().unwrap();
85
86        match whatlang::detect(text) {
87            Some(info) => Ok(number_value(info.confidence())),
88            None => Ok(Value::Null),
89        }
90    }
91}
92
93// =============================================================================
94// detect_language_info(text) -> object with full detection info
95// =============================================================================
96
97defn!(DetectLanguageInfoFn, vec![arg!(string)], None);
98
99impl Function for DetectLanguageInfoFn {
100    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
101        self.signature.validate(args, ctx)?;
102        let text = args[0].as_str().unwrap();
103
104        match whatlang::detect(text) {
105            Some(info) => {
106                let mut result = serde_json::Map::new();
107
108                result.insert(
109                    "language".to_string(),
110                    Value::String(info.lang().to_string()),
111                );
112                result.insert(
113                    "code".to_string(),
114                    Value::String(info.lang().code().to_string()),
115                );
116                result.insert(
117                    "script".to_string(),
118                    Value::String(format!("{:?}", info.script())),
119                );
120                result.insert(
121                    "confidence".to_string(),
122                    Number::from_f64(info.confidence()).map_or(Value::Null, Value::Number),
123                );
124                result.insert("reliable".to_string(), Value::Bool(info.is_reliable()));
125
126                Ok(Value::Object(result))
127            }
128            None => Ok(Value::Null),
129        }
130    }
131}
132
133/// Register language detection functions filtered by the enabled set.
134pub fn register_filtered(runtime: &mut Runtime, enabled: &HashSet<&str>) {
135    register_if_enabled(
136        runtime,
137        "detect_language",
138        enabled,
139        Box::new(DetectLanguageFn::new()),
140    );
141    register_if_enabled(
142        runtime,
143        "detect_language_iso",
144        enabled,
145        Box::new(DetectLanguageIsoFn::new()),
146    );
147    register_if_enabled(
148        runtime,
149        "detect_script",
150        enabled,
151        Box::new(DetectScriptFn::new()),
152    );
153    register_if_enabled(
154        runtime,
155        "detect_language_confidence",
156        enabled,
157        Box::new(DetectLanguageConfidenceFn::new()),
158    );
159    register_if_enabled(
160        runtime,
161        "detect_language_info",
162        enabled,
163        Box::new(DetectLanguageInfoFn::new()),
164    );
165}
166
#[cfg(test)]
mod tests {
    use crate::Runtime;
    use serde_json::json;

    /// Builds a runtime with the standard functions and all extensions.
    fn setup_runtime() -> Runtime {
        let builder = Runtime::builder().with_standard().with_all_extensions();
        builder.build()
    }

    /// Compiles the given expression on a fresh runtime and evaluates it
    /// against JSON `null`, panicking if either step fails.
    macro_rules! eval {
        ($expression:expr) => {{
            let runtime = setup_runtime();
            let compiled = runtime.compile($expression).unwrap();
            compiled.search(&json!(null)).unwrap()
        }};
    }

    #[test]
    fn test_detect_language_english() {
        let detected =
            eval!("detect_language('This is a test of the language detection system.')");
        assert_eq!(detected.as_str(), Some("English"));
    }

    #[test]
    fn test_detect_language_spanish() {
        let detected =
            eval!("detect_language('Esto es una prueba del sistema de deteccion de idiomas.')");
        // The language name comes back in its native form.
        assert_eq!(detected.as_str(), Some("Espa\u{f1}ol"));
    }

    #[test]
    fn test_detect_language_french() {
        let detected =
            eval!("detect_language('Ceci est un test du systeme de detection de langue.')");
        // The language name comes back in its native form.
        assert_eq!(detected.as_str(), Some("Fran\u{e7}ais"));
    }

    #[test]
    fn test_detect_language_german() {
        let detected = eval!("detect_language('Dies ist ein Test des Spracherkennungssystems.')");
        // The language name comes back in its native form.
        assert_eq!(detected.as_str(), Some("Deutsch"));
    }

    #[test]
    fn test_detect_language_iso_english() {
        let code = eval!("detect_language_iso('This is English text.')");
        assert_eq!(code.as_str(), Some("eng"));
    }

    #[test]
    fn test_detect_language_iso_spanish() {
        let code = eval!("detect_language_iso('Este es un texto en espanol.')");
        assert_eq!(code.as_str(), Some("spa"));
    }

    #[test]
    fn test_detect_script_latin() {
        let script = eval!("detect_script('Hello world')");
        assert_eq!(script.as_str(), Some("Latin"));
    }

    #[test]
    fn test_detect_script_cyrillic() {
        let script = eval!(
            "detect_script('\u{41f}\u{440}\u{438}\u{432}\u{435}\u{442} \u{43c}\u{438}\u{440}')"
        );
        assert_eq!(script.as_str(), Some("Cyrillic"));
    }

    #[test]
    fn test_detect_script_arabic() {
        let script = eval!("detect_script('\u{645}\u{631}\u{62d}\u{628}\u{627} \u{628}\u{627}\u{644}\u{639}\u{627}\u{644}\u{645}')");
        assert_eq!(script.as_str(), Some("Arabic"));
    }

    #[test]
    fn test_detect_language_confidence() {
        let outcome =
            eval!("detect_language_confidence('This is definitely English text for testing.')");
        let confidence = outcome.as_f64().unwrap();
        assert!(confidence > 0.5);
    }

    #[test]
    fn test_detect_language_info() {
        let outcome = eval!("detect_language_info('This is a test.')");
        let fields = outcome.as_object().unwrap();

        // Every documented key must be present.
        for key in ["language", "code", "script", "confidence", "reliable"] {
            assert!(fields.contains_key(key));
        }

        assert_eq!(fields.get("language").and_then(|v| v.as_str()), Some("English"));
        assert_eq!(fields.get("code").and_then(|v| v.as_str()), Some("eng"));
        assert_eq!(fields.get("script").and_then(|v| v.as_str()), Some("Latin"));
    }

    #[test]
    fn test_detect_language_empty_string() {
        let outcome = eval!("detect_language('')");
        // An empty string may produce null or a guess; the point of this test
        // is only that evaluation completes with one of those two shapes.
        assert!(outcome.is_null() || outcome.as_str().is_some());
    }

    #[test]
    fn test_detect_language_short_text() {
        let outcome = eval!("detect_language('Hi')");
        // Very short input may be low-confidence, but must still evaluate to
        // null or a string.
        assert!(outcome.is_null() || outcome.as_str().is_some());
    }
}