Skip to main content

udpipe_rs/
lib.rs

1//! Rust bindings for `UDPipe` - Universal Dependencies Pipeline.
2//!
3//! `UDPipe` is a trainable pipeline for tokenization, tagging, lemmatization,
4//! and dependency parsing of CoNLL-U files.
5//!
6//! # Example
7//!
8//! ```no_run
9//! use udpipe_rs::Model;
10//!
11//! // Download a model by language (one-time setup)
12//! let model_path =
13//!     udpipe_rs::download_model("english-ewt", ".").expect("Failed to download model");
14//!
15//! // Load and use the model
16//! let model = Model::load(&model_path).expect("Failed to load model");
17//! let words = model.parse("Hello world!").expect("Failed to parse");
18//!
19//! for word in words {
20//!     println!("{}: {} ({})", word.form, word.upostag, word.deprel);
21//! }
22//! ```
23
24use std::ffi::{CStr, CString};
25use std::fs::File;
26use std::io::BufWriter;
27use std::path::Path;
28
/// Base URL for the LINDAT/CLARIAH-CZ model repository (UD 2.5).
///
/// [`download_model`] appends `/<model filename>` to this prefix to build the
/// download URL of a pre-trained model.
const MODEL_BASE_URL: &str =
    "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3131";
32
/// Error returned by the fallible operations of this crate.
///
/// The error is a thin wrapper around a human-readable description; it does
/// not distinguish between failure categories.
#[derive(Debug, Clone)]
pub struct UdpipeError {
    /// Human-readable description of what went wrong.
    pub message: String,
}

impl UdpipeError {
    /// Build an error from anything that converts into a `String`.
    pub fn new(message: impl Into<String>) -> Self {
        let message = message.into();
        Self { message }
    }
}

impl std::fmt::Display for UdpipeError {
    /// Renders the error as `UDPipe error: <message>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { message } = self;
        write!(f, "UDPipe error: {message}")
    }
}

impl std::error::Error for UdpipeError {}

impl From<std::io::Error> for UdpipeError {
    /// Wraps an I/O error, keeping only its display text.
    fn from(err: std::io::Error) -> Self {
        Self::new(err.to_string())
    }
}
64
/// A single word produced by `UDPipe`, together with its Universal
/// Dependencies annotations.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Word {
    /// The surface form (the text as it appeared in the input).
    pub form: String,
    /// The lemma (dictionary form).
    pub lemma: String,
    /// Universal POS tag (NOUN, VERB, ADJ, etc.).
    pub upostag: String,
    /// Language-specific POS tag.
    pub xpostag: String,
    /// Morphological features as `Key=Value` pairs joined by `|`
    /// (e.g., "VerbForm=Inf|Mood=Imp").
    pub feats: String,
    /// Dependency relation to the head (root, nsubj, obj, etc.).
    pub deprel: String,
    /// Miscellaneous annotations joined by `|` (e.g., "SpaceAfter=No").
    pub misc: String,
    /// 1-based index of this word within its sentence.
    pub id: i32,
    /// Index of the head word (0 = root).
    pub head: i32,
    /// 0-based index of the sentence this word belongs to.
    pub sentence_id: i32,
}

impl Word {
    /// Returns true if the morphological feature `key` is present with
    /// exactly the given `value`.
    ///
    /// # Example
    /// ```
    /// # use udpipe_rs::Word;
    /// # let word = Word {
    /// #     form: "run".to_string(),
    /// #     lemma: "run".to_string(),
    /// #     upostag: "VERB".to_string(),
    /// #     xpostag: String::new(),
    /// #     feats: "Mood=Imp|VerbForm=Fin".to_string(),
    /// #     deprel: "root".to_string(),
    /// #     misc: String::new(),
    /// #     id: 1,
    /// #     head: 0,
    /// #     sentence_id: 0,
    /// # };
    /// assert!(word.has_feature("Mood", "Imp"));
    /// ```
    #[must_use]
    pub fn has_feature(&self, key: &str, value: &str) -> bool {
        self.get_feature(key) == Some(value)
    }

    /// Returns the value of the morphological feature `key`, if present.
    ///
    /// The key must match a whole feature name: looking up `"Mo"` does not
    /// find `Mood=Imp`. If the key occurs more than once, the first value
    /// wins.
    ///
    /// # Example
    /// ```
    /// # use udpipe_rs::Word;
    /// # let word = Word {
    /// #     form: "run".to_string(),
    /// #     lemma: "run".to_string(),
    /// #     upostag: "VERB".to_string(),
    /// #     xpostag: String::new(),
    /// #     feats: "Mood=Imp|VerbForm=Fin".to_string(),
    /// #     deprel: "root".to_string(),
    /// #     misc: String::new(),
    /// #     id: 1,
    /// #     head: 0,
    /// #     sentence_id: 0,
    /// # };
    /// assert_eq!(word.get_feature("Mood"), Some("Imp"));
    /// ```
    #[must_use]
    pub fn get_feature(&self, key: &str) -> Option<&str> {
        self.feats.split('|').find_map(|pair| {
            // Requiring the `=` right after the key rejects keys that are
            // merely a prefix of a longer feature name.
            pair.strip_prefix(key)?.strip_prefix('=')
        })
    }

    /// Returns true if this word is a verb (VERB or AUX).
    #[must_use]
    pub fn is_verb(&self) -> bool {
        matches!(self.upostag.as_str(), "VERB" | "AUX")
    }

    /// Returns true if this word is a noun (NOUN or PROPN).
    #[must_use]
    pub fn is_noun(&self) -> bool {
        matches!(self.upostag.as_str(), "NOUN" | "PROPN")
    }

    /// Returns true if this word is an adjective (ADJ).
    #[must_use]
    pub fn is_adjective(&self) -> bool {
        self.upostag == "ADJ"
    }

    /// Returns true if this word is punctuation (PUNCT).
    #[must_use]
    pub fn is_punct(&self) -> bool {
        self.upostag == "PUNCT"
    }

    /// Returns true if this word is the root of its sentence.
    #[must_use]
    pub fn is_root(&self) -> bool {
        self.deprel == "root"
    }

    /// Returns true if there's a space after this word.
    ///
    /// In CoNLL-U format, `SpaceAfter=No` is only present when there's no
    /// space. This returns `true` (the default) when that annotation is
    /// absent.
    ///
    /// The annotation must be a complete `|`-separated item of `misc`; text
    /// that merely contains `SpaceAfter=No` as a substring (for example
    /// `SpaceAfter=None`) is not treated as the annotation.
    #[must_use]
    pub fn has_space_after(&self) -> bool {
        !self.misc.split('|').any(|item| item == "SpaceAfter=No")
    }
}
181
/// Raw bindings to the C interface exposed by the `UDPipe` C++ wrapper.
///
/// Everything in here mirrors the C side one-to-one; the safe API of the
/// crate is built on top of these declarations.
mod ffi {
    use std::os::raw::c_char;

    /// Opaque model handle; only ever used behind a raw pointer.
    #[repr(C)]
    pub struct UdpipeModel {
        /// Zero-sized marker that keeps the type from being constructed.
        _private: [u8; 0],
    }

    /// Opaque parse-result handle; only ever used behind a raw pointer.
    #[repr(C)]
    pub struct UdpipeParseResult {
        /// Zero-sized marker that keeps the type from being constructed.
        _private: [u8; 0],
    }

    /// One word of a parse result, as laid out by the C side.
    ///
    /// The field order is part of the ABI and must not be changed.
    #[repr(C)]
    pub struct UdpipeWord {
        /// Surface text of the word.
        pub form: *const c_char,
        /// Lemma of the word.
        pub lemma: *const c_char,
        /// Universal POS tag.
        pub upostag: *const c_char,
        /// Language-specific POS tag.
        pub xpostag: *const c_char,
        /// Morphological features.
        pub feats: *const c_char,
        /// Dependency relation.
        pub deprel: *const c_char,
        /// Miscellaneous annotations.
        pub misc: *const c_char,
        /// Word ID (1-indexed within its sentence).
        pub id: i32,
        /// Head word ID (0 for root).
        pub head: i32,
        /// Sentence ID (0-indexed).
        pub sentence_id: i32,
    }

    unsafe extern "C" {
        // --- model lifecycle ---

        /// Load a model from the file at `model_path`.
        pub fn udpipe_model_load(model_path: *const c_char) -> *mut UdpipeModel;
        /// Load a model from `len` bytes starting at `data`.
        pub fn udpipe_model_load_from_memory(data: *const u8, len: usize) -> *mut UdpipeModel;
        /// Release a model obtained from one of the load functions.
        pub fn udpipe_model_free(model: *mut UdpipeModel);

        // --- parsing ---

        /// Parse `text` with `model` and return a handle to the result.
        pub fn udpipe_parse(
            model: *mut UdpipeModel,
            text: *const c_char,
        ) -> *mut UdpipeParseResult;
        /// Number of words stored in a parse result.
        pub fn udpipe_result_word_count(result: *mut UdpipeParseResult) -> i32;
        /// Fetch the word at `index` from a parse result.
        pub fn udpipe_result_get_word(result: *mut UdpipeParseResult, index: i32) -> UdpipeWord;
        /// Release a parse result returned by `udpipe_parse`.
        pub fn udpipe_result_free(result: *mut UdpipeParseResult);

        // --- diagnostics ---

        /// Message describing the most recent error.
        pub fn udpipe_get_error() -> *const c_char;
    }
}
245
246/// Get the last error from the FFI layer.
247fn get_ffi_error() -> String {
248    // SAFETY: `udpipe_get_error` returns a pointer to a static thread-local buffer.
249    let err_ptr = unsafe { ffi::udpipe_get_error() };
250    assert!(!err_ptr.is_null(), "UDPipe returned null error pointer");
251    // SAFETY: The pointer is valid and points to a null-terminated C string.
252    unsafe { CStr::from_ptr(err_ptr) }
253        .to_string_lossy()
254        .into_owned()
255}
256
257/// `UDPipe` model wrapper.
258///
259/// This is the main type for loading and using `UDPipe` models.
260/// Models can be loaded from files or from memory.
261///
262/// # Thread Safety
263///
264/// `Model` is [`Send`] but not [`Sync`]. You can transfer a model to another
265/// thread, but you cannot share references to it across threads. If you need
266/// concurrent access from multiple threads, wrap the model in
267/// `Arc<Mutex<Model>>`.
268pub struct Model {
269    /// Raw pointer to the underlying `UDPipe` model.
270    inner: *mut ffi::UdpipeModel,
271}
272
273impl std::fmt::Debug for Model {
274    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
275        f.debug_struct("Model")
276            .field("inner", &(!self.inner.is_null()))
277            .finish()
278    }
279}
280
281// SAFETY: Transferring ownership of a Model to another thread is safe.
282//
283// Verified by auditing vendor/udpipe/src:
284// - No `thread_local` storage in UDPipe, MorphoDiTa, or Parsito
285// - Model data is owned via unique_ptr (no shared ownership)
286// - Internal caches use atomic spin-locks (threadsafe_stack with atomic_flag)
287// - Global statics (ragel_map, lzma allocators) are read-only after init
288// - Our C++ wrapper uses thread_local only for error messages, which are
289//   captured immediately after each FFI call on the calling thread
290unsafe impl Send for Model {}
291
292// NOTE: Model is intentionally !Sync because the underlying UDPipe C++ library
293// is not thread-safe for concurrent access.
294//
295// Evidence from vendor/udpipe/src:
296// - tag() and parse() methods mutate internal workspace caches
297// - While caches use threadsafe_stack for pool management, concurrent parse
298//   operations on the same Model would race on workspace contents
299// - TSAN confirms data races in std::string operations during concurrent access
300//
301// Use Arc<Mutex<Model>> or create separate Model instances per thread.
302
303impl Model {
304    /// Load a model from a file path.
305    ///
306    /// # Errors
307    ///
308    /// Returns an error if the path contains a null byte or if the model cannot
309    /// be loaded.
310    ///
311    /// # Example
312    /// ```no_run
313    /// use udpipe_rs::Model;
314    /// let model = Model::load("english-ewt-ud-2.5-191206.udpipe").expect("Failed to load model");
315    /// ```
316    pub fn load(path: impl AsRef<Path>) -> Result<Self, UdpipeError> {
317        let path_str = path.as_ref().to_string_lossy();
318        let c_path = CString::new(path_str.as_bytes()).map_err(|_| UdpipeError {
319            message: "Invalid path (contains null byte)".to_owned(),
320        })?;
321
322        // SAFETY: `c_path` is a valid null-terminated C string.
323        let model = unsafe { ffi::udpipe_model_load(c_path.as_ptr()) };
324
325        if model.is_null() {
326            return Err(UdpipeError {
327                message: get_ffi_error(),
328            });
329        }
330
331        Ok(Self { inner: model })
332    }
333
334    /// Load a model from a byte slice.
335    ///
336    /// This is useful for loading models from network sources or embedded data.
337    ///
338    /// # Errors
339    ///
340    /// Returns an error if the data is empty or not a valid `UDPipe` model.
341    ///
342    /// # Example
343    /// ```no_run
344    /// use udpipe_rs::Model;
345    /// let model_data =
346    ///     std::fs::read("english-ewt-ud-2.5-191206.udpipe").expect("Failed to read model");
347    /// let model = Model::load_from_memory(&model_data).expect("Failed to load model");
348    /// ```
349    pub fn load_from_memory(data: &[u8]) -> Result<Self, UdpipeError> {
350        // SAFETY: `data` is a valid slice; pointer and length are derived from it.
351        let model = unsafe { ffi::udpipe_model_load_from_memory(data.as_ptr(), data.len()) };
352
353        if model.is_null() {
354            return Err(UdpipeError {
355                message: get_ffi_error(),
356            });
357        }
358
359        Ok(Self { inner: model })
360    }
361
362    /// Parse text and return all words with their UD annotations.
363    ///
364    /// The text is tokenized, tagged, lemmatized, and parsed for dependencies.
365    ///
366    /// # Errors
367    ///
368    /// Returns an error if the text contains a null byte or if parsing fails.
369    ///
370    /// # Example
371    /// ```no_run
372    /// use udpipe_rs::Model;
373    /// let model = Model::load("english-ewt-ud-2.5-191206.udpipe").expect("Failed to load");
374    /// let words = model
375    ///     .parse("The quick brown fox.")
376    ///     .expect("Failed to parse");
377    /// for word in words {
378    ///     println!("{} -> {} ({})", word.form, word.lemma, word.upostag);
379    /// }
380    /// ```
381    pub fn parse(&self, text: &str) -> Result<Vec<Word>, UdpipeError> {
382        let c_text = CString::new(text).map_err(|_| UdpipeError {
383            message: "Invalid text (contains null byte)".to_owned(),
384        })?;
385
386        // SAFETY: `self.inner` is valid and `c_text` is a valid null-terminated C
387        // string.
388        let result = unsafe { ffi::udpipe_parse(self.inner, c_text.as_ptr()) };
389        if result.is_null() {
390            return Err(UdpipeError {
391                message: get_ffi_error(),
392            });
393        }
394
395        // SAFETY: `result` is a valid parse result pointer.
396        let word_count = unsafe { ffi::udpipe_result_word_count(result) };
397        let capacity = usize::try_from(word_count).unwrap_or(0);
398        let mut words = Vec::with_capacity(capacity);
399
400        for i in 0..word_count {
401            // SAFETY: `result` is valid and `i` is within bounds.
402            let word = unsafe { ffi::udpipe_result_get_word(result, i) };
403            words.push(Word {
404                form: ptr_to_string(word.form),
405                lemma: ptr_to_string(word.lemma),
406                upostag: ptr_to_string(word.upostag),
407                xpostag: ptr_to_string(word.xpostag),
408                feats: ptr_to_string(word.feats),
409                deprel: ptr_to_string(word.deprel),
410                misc: ptr_to_string(word.misc),
411                id: word.id,
412                head: word.head,
413                sentence_id: word.sentence_id,
414            });
415        }
416
417        // SAFETY: `result` is a valid pointer that we own.
418        unsafe { ffi::udpipe_result_free(result) };
419
420        Ok(words)
421    }
422}
423
/// Convert a C string pointer to an owned `String`.
///
/// A null pointer yields an empty string instead of being dereferenced, so a
/// zeroed word returned by the wrapper cannot trigger undefined behaviour.
/// Bytes that are not valid UTF-8 are replaced lossily.
///
/// # Safety
/// A non-null pointer must be valid and point to a null-terminated C string.
fn ptr_to_string(ptr: *const std::os::raw::c_char) -> String {
    if ptr.is_null() {
        return String::new();
    }

    // SAFETY: `ptr` is non-null (checked above); FFI guarantees a non-null
    // pointer is valid and null-terminated.
    unsafe { CStr::from_ptr(ptr) }
        .to_string_lossy()
        .into_owned()
}
434
435impl Drop for Model {
436    fn drop(&mut self) {
437        if !self.inner.is_null() {
438            // SAFETY: `self.inner` is valid and we have exclusive ownership.
439            unsafe { ffi::udpipe_model_free(self.inner) };
440        }
441    }
442}
443
/// Available pre-trained models from Universal Dependencies 2.5.
///
/// These models are hosted at the [LINDAT/CLARIAH-CZ repository](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3131).
/// Use [`download_model`] to fetch them.
///
/// Each entry is a language identifier accepted by [`download_model`] and
/// [`model_filename`]. The list is kept in sorted order (a unit test checks
/// this), so new entries must be inserted at their sorted position.
pub const AVAILABLE_MODELS: &[&str] = &[
    "afrikaans-afribooms",
    "ancient_greek-perseus",
    "ancient_greek-proiel",
    "arabic-padt",
    "armenian-armtdp",
    "basque-bdt",
    "belarusian-hse",
    "bulgarian-btb",
    "buryat-bdt",
    "catalan-ancora",
    "chinese-gsd",
    "chinese-gsdsimp",
    "classical_chinese-kyoto",
    "coptic-scriptorium",
    "croatian-set",
    "czech-cac",
    "czech-cltt",
    "czech-fictree",
    "czech-pdt",
    "danish-ddt",
    "dutch-alpino",
    "dutch-lassysmall",
    "english-ewt",
    "english-gum",
    "english-lines",
    "english-partut",
    "estonian-edt",
    "estonian-ewt",
    "finnish-ftb",
    "finnish-tdt",
    "french-gsd",
    "french-partut",
    "french-sequoia",
    "french-spoken",
    "galician-ctg",
    "galician-treegal",
    "german-gsd",
    "german-hdt",
    "gothic-proiel",
    "greek-gdt",
    "hebrew-htb",
    "hindi-hdtb",
    "hungarian-szeged",
    "indonesian-gsd",
    "irish-idt",
    "italian-isdt",
    "italian-partut",
    "italian-postwita",
    "italian-twittiro",
    "italian-vit",
    "japanese-gsd",
    "kazakh-ktb",
    "korean-gsd",
    "korean-kaist",
    "kurmanji-mg",
    "latin-ittb",
    "latin-perseus",
    "latin-proiel",
    "latvian-lvtb",
    "lithuanian-alksnis",
    "lithuanian-hse",
    "maltese-mudt",
    "marathi-ufal",
    "north_sami-giella",
    "norwegian-bokmaal",
    "norwegian-nynorsk",
    "norwegian-nynorsklia",
    "old_church_slavonic-proiel",
    "old_french-srcmf",
    "old_russian-torot",
    "persian-seraji",
    "polish-lfg",
    "polish-pdb",
    "polish-sz",
    "portuguese-bosque",
    "portuguese-br",
    "portuguese-gsd",
    "romanian-nonstandard",
    "romanian-rrt",
    "russian-gsd",
    "russian-syntagrus",
    "russian-taiga",
    "sanskrit-ufal",
    "scottish_gaelic-arcosg",
    "serbian-set",
    "slovak-snk",
    "slovenian-ssj",
    "slovenian-sst",
    "spanish-ancora",
    "spanish-gsd",
    "swedish-lines",
    "swedish-talbanken",
    "tamil-ttb",
    "telugu-mtg",
    "turkish-imst",
    "ukrainian-iu",
    "upper_sorbian-ufal",
    "urdu-udtb",
    "uyghur-udt",
    "vietnamese-vtb",
    "wolof-wtb",
];
551
552/// Download a pre-trained model by language identifier.
553///
554/// Downloads a model from the [LINDAT/CLARIAH-CZ repository](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3131)
555/// to the specified destination directory. Returns the path to the downloaded
556/// model file.
557///
558/// # Arguments
559///
560/// * `language` - Language identifier (e.g., "english-ewt", "dutch-alpino",
561///   "german-gsd"). See [`AVAILABLE_MODELS`] for the full list.
562/// * `dest_dir` - Directory where the model will be saved.
563///
564/// # Errors
565///
566/// Returns an error if the language is not in [`AVAILABLE_MODELS`] or if the
567/// download fails.
568///
569/// # Example
570///
571/// ```no_run
572/// use udpipe_rs::{Model, download_model};
573///
574/// // Download English model to current directory
575/// let model_path = download_model("english-ewt", ".").expect("Failed to download");
576/// println!("Model saved to: {}", model_path);
577///
578/// // Load and use
579/// let model = Model::load(&model_path).expect("Failed to load");
580/// ```
581pub fn download_model(language: &str, dest_dir: impl AsRef<Path>) -> Result<String, UdpipeError> {
582    let dest_dir = dest_dir.as_ref();
583
584    // Validate the language
585    if !AVAILABLE_MODELS.contains(&language) {
586        return Err(UdpipeError {
587            message: format!(
588                "Unknown language '{}'. Use one of: {}",
589                language,
590                AVAILABLE_MODELS[..5].join(", ") + ", ..."
591            ),
592        });
593    }
594
595    // Construct filename and URL
596    let filename = model_filename(language);
597    let dest_path = dest_dir.join(&filename);
598    let url = format!("{MODEL_BASE_URL}/{filename}");
599
600    // Download using the generic download function
601    download_model_from_url(&url, &dest_path)?;
602
603    Ok(dest_path.to_string_lossy().into_owned())
604}
605
606/// Download a model from a custom URL to a local file path.
607///
608/// Use this if you need to download models from a different source or version.
609/// For standard models, prefer [`download_model`].
610///
611/// # Errors
612///
613/// Returns an error if the download fails, the response is empty, or the file
614/// cannot be written.
615///
616/// # Example
617///
618/// ```no_run
619/// use udpipe_rs::download_model_from_url;
620///
621/// download_model_from_url(
622///     "https://example.com/custom-model.udpipe",
623///     "custom-model.udpipe",
624/// )
625/// .expect("Failed to download");
626/// ```
627pub fn download_model_from_url(url: &str, path: impl AsRef<Path>) -> Result<(), UdpipeError> {
628    let path = path.as_ref();
629
630    // Download using ureq
631    let response = ureq::get(url).call().map_err(|e| UdpipeError {
632        message: format!("Failed to download: {e}"),
633    })?;
634
635    // Stream response directly to file
636    let file = File::create(path)?;
637    let mut writer = BufWriter::new(file);
638    let bytes_written = std::io::copy(&mut response.into_body().into_reader(), &mut writer)?;
639
640    if bytes_written == 0 {
641        return Err(UdpipeError {
642            message: "Downloaded file is empty".to_owned(),
643        });
644    }
645
646    Ok(())
647}
648
/// Builds the file name under which the model for `language` is published.
///
/// The name is the language identifier followed by the fixed UD 2.5 release
/// suffix. The identifier is not validated here.
///
/// # Example
///
/// ```
/// assert_eq!(
///     udpipe_rs::model_filename("english-ewt"),
///     "english-ewt-ud-2.5-191206.udpipe"
/// );
/// ```
#[must_use]
pub fn model_filename(language: &str) -> String {
    [language, "-ud-2.5-191206.udpipe"].concat()
}
663
// Unit tests for the pure-Rust helpers, the error type, the download
// functions, and the defensive behaviour of the FFI wrapper.
#[cfg(test)]
mod tests {
    use super::*;

    // Build a NOUN/root word whose only varying part is the feature string.
    fn make_word(feats: &str) -> Word {
        Word {
            form: "test".to_owned(),
            lemma: "test".to_owned(),
            upostag: "NOUN".to_owned(),
            xpostag: String::new(),
            feats: feats.to_owned(),
            deprel: "root".to_owned(),
            misc: String::new(),
            id: 1,
            head: 0,
            sentence_id: 0,
        }
    }

    // `has_feature` matches on both key and value.
    #[test]
    fn test_word_has_feature() {
        let word = make_word("Mood=Imp|VerbForm=Fin");

        assert!(word.has_feature("Mood", "Imp"));
        assert!(word.has_feature("VerbForm", "Fin"));
        assert!(!word.has_feature("Mood", "Ind"));
        assert!(!word.has_feature("Tense", "Past"));
    }

    // An empty feature string has no features.
    #[test]
    fn test_word_has_feature_empty() {
        let word = make_word("");
        assert!(!word.has_feature("Mood", "Imp"));
    }

    // A feature string without any `|` separator still works.
    #[test]
    fn test_word_has_feature_single() {
        let word = make_word("Mood=Imp");
        assert!(word.has_feature("Mood", "Imp"));
        assert!(!word.has_feature("VerbForm", "Fin"));
    }

    // `get_feature` returns the value for a present key and `None` otherwise.
    #[test]
    fn test_word_get_feature() {
        let word = make_word("Tense=Pres|VerbForm=Part");

        assert_eq!(word.get_feature("Tense"), Some("Pres"));
        assert_eq!(word.get_feature("VerbForm"), Some("Part"));
        assert_eq!(word.get_feature("Mood"), None);
    }

    #[test]
    fn test_word_get_feature_empty() {
        let word = make_word("");
        assert_eq!(word.get_feature("Mood"), None);
    }

    #[test]
    fn test_word_get_feature_single() {
        let word = make_word("Mood=Imp");
        assert_eq!(word.get_feature("Mood"), Some("Imp"));
        assert_eq!(word.get_feature("VerbForm"), None);
    }

    // Both VERB and AUX count as verbs.
    #[test]
    fn test_word_is_verb() {
        let mut word = make_word("");
        word.upostag = "VERB".to_owned();
        assert!(word.is_verb());

        word.upostag = "AUX".to_owned();
        assert!(word.is_verb());

        word.upostag = "NOUN".to_owned();
        assert!(!word.is_verb());
    }

    // Both NOUN and PROPN count as nouns.
    #[test]
    fn test_word_is_noun() {
        let mut word = make_word("");
        word.upostag = "NOUN".to_owned();
        assert!(word.is_noun());

        word.upostag = "PROPN".to_owned();
        assert!(word.is_noun());

        word.upostag = "VERB".to_owned();
        assert!(!word.is_noun());
    }

    #[test]
    fn test_word_is_root() {
        let mut word = make_word("");
        word.deprel = "root".to_owned();
        assert!(word.is_root());

        word.deprel = "nsubj".to_owned();
        assert!(!word.is_root());
    }

    #[test]
    fn test_word_is_adjective() {
        let mut word = make_word("");
        word.upostag = "ADJ".to_owned();
        assert!(word.is_adjective());

        word.upostag = "NOUN".to_owned();
        assert!(!word.is_adjective());
    }

    #[test]
    fn test_word_is_punct() {
        let mut word = make_word("");
        word.upostag = "PUNCT".to_owned();
        assert!(word.is_punct());

        word.upostag = "NOUN".to_owned();
        assert!(!word.is_punct());
    }

    // Equal words hash equally, so `Word` is usable as a set/map key.
    #[test]
    fn test_word_hash() {
        use std::collections::HashSet;

        let word1 = make_word("Mood=Imp");
        let word2 = make_word("Mood=Imp");
        let mut set = HashSet::new();
        set.insert(word1);
        assert!(set.contains(&word2));
    }

    #[test]
    fn test_model_filename() {
        assert_eq!(
            model_filename("english-ewt"),
            "english-ewt-ud-2.5-191206.udpipe"
        );
        assert_eq!(
            model_filename("dutch-alpino"),
            "dutch-alpino-ud-2.5-191206.udpipe"
        );
    }

    #[test]
    fn test_available_models_contains_common_languages() {
        assert!(AVAILABLE_MODELS.contains(&"english-ewt"));
        assert!(AVAILABLE_MODELS.contains(&"german-gsd"));
        assert!(AVAILABLE_MODELS.contains(&"french-gsd"));
        assert!(AVAILABLE_MODELS.contains(&"spanish-ancora"));
    }

    #[test]
    fn test_available_models_sorted() {
        // Verify the list is sorted for binary search if needed later
        let mut sorted = AVAILABLE_MODELS.to_vec();
        sorted.sort_unstable();
        assert_eq!(AVAILABLE_MODELS, sorted.as_slice());
    }

    // Unknown identifiers are rejected by the up-front validation.
    #[test]
    fn test_download_model_invalid_language() {
        let result = download_model("invalid-language-xyz", ".");
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.message.contains("Unknown language"));
    }

    #[test]
    fn test_udpipe_error_display() {
        let err = UdpipeError::new("test error");
        assert_eq!(format!("{err}"), "UDPipe error: test error");
    }

    // Converting an I/O error keeps its message text.
    #[test]
    fn test_udpipe_error_from_io() {
        let io_err = std::io::Error::new(std::io::ErrorKind::NotFound, "file not found");
        let err: UdpipeError = io_err.into();
        assert!(err.message.contains("not found"));
    }

    #[test]
    fn test_has_space_after() {
        let mut word = make_word("");
        word.misc = String::new();
        assert!(word.has_space_after()); // default: has space

        word.misc = "SpaceAfter=No".to_owned();
        assert!(!word.has_space_after());

        word.misc = "SpaceAfter=No|Other=Value".to_owned();
        assert!(!word.has_space_after());
    }

    #[test]
    fn test_model_load_nonexistent_file() {
        let result = Model::load("/nonexistent/path/to/model.udpipe");
        assert!(result.is_err());
    }

    // Interior null bytes are caught on the Rust side when building the C string.
    #[test]
    fn test_model_load_path_with_null_byte() {
        let result = Model::load("path\0with\0nulls.udpipe");
        let err = result.expect_err("expected error");
        assert!(err.message.contains("null byte"));
    }

    #[test]
    fn test_model_load_from_memory_empty() {
        let result = Model::load_from_memory(&[]);
        assert!(result.is_err());
    }

    #[test]
    fn test_model_load_from_memory_invalid() {
        let garbage = b"this is not a valid udpipe model";
        let result = Model::load_from_memory(garbage);
        assert!(result.is_err());
    }

    #[test]
    fn test_parse_with_null_model() {
        // Create a Model with a null inner pointer to test the error path
        let model = Model {
            inner: std::ptr::null_mut(),
        };
        let result = model.parse("test");
        let err = result.unwrap_err();
        assert!(err.message.contains("Invalid arguments"));
    }

    // The `Debug` output names the type and its field.
    #[test]
    fn test_model_debug() {
        let model = Model {
            inner: std::ptr::null_mut(),
        };
        let debug_str = format!("{model:?}");
        assert!(debug_str.contains("Model"));
        assert!(debug_str.contains("inner"));
    }

    // A request that cannot be completed surfaces as a "Failed to download" error.
    #[test]
    fn test_download_model_from_url_invalid_url() {
        let temp_dir = tempfile::tempdir().unwrap();
        let path = temp_dir.path().join("model.udpipe");
        let result = download_model_from_url("http://invalid.invalid/no-such-model", &path);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.message.contains("Failed to download"));
    }

    #[test]
    fn test_download_model_from_url_nonexistent_dir() {
        let temp_dir = tempfile::tempdir().unwrap();
        let path = temp_dir.path().join("nonexistent/model.udpipe");
        // Dummy URL that is not expected to serve anything.
        let url = "http://localhost:1/model.udpipe";

        let result = download_model_from_url(url, &path);
        // The request is issued before the file is created, so this most
        // likely fails on the network error and never reaches the missing
        // directory. Either way an error must be returned.
        assert!(result.is_err());
    }

    // A successful response with an empty body is reported as an error.
    #[test]
    fn test_download_model_from_url_empty_response() {
        let temp_dir = tempfile::tempdir().unwrap();
        let path = temp_dir.path().join("model.udpipe");

        let mut server = mockito::Server::new();
        let mock = server
            .mock("GET", "/empty-model.udpipe")
            .with_status(200)
            .with_body("")
            .create();
        let full_url = format!("{}/empty-model.udpipe", server.url());

        let result = download_model_from_url(&full_url, &path);
        mock.assert();
        drop(server);

        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.message.contains("empty"));
    }

    #[test]
    fn test_ffi_null_result_word_count() {
        // SAFETY: Testing that null pointer returns 0 (defensive C++ code)
        let count = unsafe { ffi::udpipe_result_word_count(std::ptr::null_mut()) };
        assert_eq!(count, 0);
    }

    #[test]
    fn test_ffi_null_result_get_word() {
        // SAFETY: Testing that null pointer returns zeroed word (defensive C++ code)
        let word = unsafe { ffi::udpipe_result_get_word(std::ptr::null_mut(), 0) };
        assert!(word.form.is_null());
        assert!(word.lemma.is_null());
        assert!(word.upostag.is_null());
    }

    #[test]
    fn test_ffi_invalid_index() {
        // SAFETY: Testing bounds checking with negative index
        let word = unsafe { ffi::udpipe_result_get_word(std::ptr::null_mut(), -1) };
        assert!(word.form.is_null());
    }
}
971}