//! `NLTokenizer` bindings — rust_macios/natural_language/nl_tokenizer.rs

use block::{ConcreteBlock, IntoConcreteBlock};
use objc::{msg_send, sel, sel_impl};

use crate::{
    foundation::{NSArray, NSRange, NSString, UInt},
    object,
    objective_c_runtime::{
        macros::interface_impl,
        traits::{FromId, PNSObject},
        INSValue,
    },
};

use super::{NLLanguage, NLTokenUnit};
15
/// Hints about the contents of the string for the tokenizer.
///
/// These are bit-flag values (`1 << n`) mirroring the Objective-C
/// `NLTokenizerAttributes` option set, so the Objective-C side may in
/// principle combine them — NOTE(review): only single-flag values are
/// representable by this Rust enum; confirm combined masks never cross
/// the FFI boundary.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u64)]
pub enum NLTokenizerAttributes {
    /// Doesn't contain any special attributes.
    None = 0,
    /// The string contains numbers.
    Numeric = 1 << 0,
    /// The string contains symbols.
    Symbolic = 1 << 1,
    /// The string contains emoji.
    Emoji = 1 << 2,
}
29
object! {
    /// A tokenizer that segments natural language text into semantic units.
    ///
    /// Wrapper over the Objective-C `NLTokenizer` class. NOTE(review): the
    /// `object!` macro presumably generates the backing object pointer and
    /// `PNSObject` plumbing — confirm against the macro's definition.
    unsafe pub struct NLTokenizer;
}
34
#[interface_impl(NSObject)]
impl NLTokenizer {
    /* Creating a Tokenizer
     */

    /// Creates a tokenizer with the specified unit.
    ///
    /// Sends `initWithUnit:` to the receiver and wraps the returned object
    /// id in a fresh `Self`. Call this on a newly allocated tokenizer
    /// (e.g. one obtained from `NLTokenizer::default()`), as the tests in
    /// this file do.
    #[method]
    pub fn init_with_unit(&mut self, unit: NLTokenUnit) -> Self
    where
        Self: Sized + FromId,
    {
        unsafe { Self::from_id(msg_send![self.m_self(), initWithUnit: unit]) }
    }

    /* Configuring a Tokenizer
     */

    /// The text to be tokenized.
    #[property]
    pub fn string(&self) -> NSString {
        unsafe { NSString::from_id(msg_send![self.m_self(), string]) }
    }

    /// Sets the text to be tokenized.
    #[property]
    pub fn set_string(&mut self, string: NSString) {
        unsafe { msg_send![self.m_self(), setString: string] }
    }

    /// Sets the language of the text to be tokenized.
    ///
    /// NOTE(review): presumably used when the language is known up front
    /// (otherwise the tokenizer detects it) — confirm against the
    /// Objective-C `setLanguage:` documentation.
    #[method]
    pub fn set_language(&mut self, language: NLLanguage) {
        unsafe { msg_send![self.m_self(), setLanguage: language] }
    }

    /// The linguistic unit that this tokenizer uses.
    #[property]
    pub fn unit(&self) -> NLTokenUnit {
        unsafe { msg_send![self.m_self(), unit] }
    }

    /* Enumerating the Tokens
     */

    /// Enumerates over a given range of the string and calls the specified block for each token.
    ///
    /// The closure receives the token's range, its attributes, and a
    /// `*mut bool` stop flag (per the Objective-C `usingBlock:` convention,
    /// writing `true` through it should halt enumeration).
    #[method]
    pub fn enumerate_tokens_in_range_using_block<F>(&self, range: NSRange, block: F)
    where
        F: IntoConcreteBlock<(NSRange, NLTokenizerAttributes, *mut bool), Ret = ()> + 'static,
    {
        let block = ConcreteBlock::new(block);
        // `copy()` moves the block to the heap so the Objective-C runtime
        // can retain it beyond this stack frame.
        let block = block.copy();
        unsafe {
            msg_send![
                self.m_self(),
                enumerateTokensInRange: range
                usingBlock: block
            ]
        }
    }

    /// Tokenizes the string within the provided range.
    #[method]
    pub fn tokens_for_range<T>(&self, range: NSRange) -> NSArray<T>
    where
        T: INSValue,
    {
        unsafe { NSArray::from_id(msg_send![self.m_self(), tokensForRange: range]) }
    }

    /// Finds the range of the token at the given index.
    #[method]
    pub fn token_range_at_index(&self, character_index: UInt) -> NSRange {
        unsafe { msg_send![self.m_self(), tokenRangeAtIndex: character_index] }
    }

    /// Finds the entire range of all tokens contained completely or partially within the specified range.
    #[method]
    pub fn token_range_for_range(&self, range: NSRange) -> NSRange {
        unsafe { msg_send![self.m_self(), tokenRangeForRange: range] }
    }
}
117
118impl Default for NLTokenizer {
119    fn default() -> Self {
120        Self::m_new()
121    }
122}
123
#[cfg(test)]
mod tests {

    use crate::natural_language::English;

    use super::*;

    /// Builds a word-unit tokenizer; every test below needs one.
    fn word_tokenizer() -> NLTokenizer {
        let mut tokenizer = NLTokenizer::default();
        tokenizer = tokenizer.init_with_unit(NLTokenUnit::Word);
        tokenizer
    }

    #[test]
    fn test_init() {
        let tokenizer = word_tokenizer();
        assert_ne!(tokenizer.unit(), NLTokenUnit::Sentence);
        assert_eq!(tokenizer.unit(), NLTokenUnit::Word);
    }

    #[test]
    fn test_string() {
        let mut tokenizer = word_tokenizer();
        tokenizer.set_string("Hello, world!".into());
        assert_ne!(tokenizer.string(), "Goodbye, world!");
        assert_eq!(tokenizer.string(), "Hello, world!");
    }

    #[test]
    fn test_set_string() {
        // Unlike `test_string`, this re-sets the string to verify the
        // setter replaces (rather than keeps) the previous value.
        let mut tokenizer = word_tokenizer();
        tokenizer.set_string("Hello, world!".into());
        tokenizer.set_string("Goodbye, world!".into());
        assert_eq!(tokenizer.string(), "Goodbye, world!");
    }

    #[test]
    fn test_set_language() {
        let mut tokenizer = word_tokenizer();
        // `English` is a static NLLanguage; reading it requires `unsafe`.
        tokenizer.set_language(unsafe { English.clone() });
    }

    #[test]
    fn test_unit() {
        let tokenizer = word_tokenizer();
        assert_ne!(tokenizer.unit(), NLTokenUnit::Sentence);
        assert_eq!(tokenizer.unit(), NLTokenUnit::Word);
    }

    #[test]
    fn test_token_range_at_index() {
        let mut tokenizer = word_tokenizer();
        tokenizer.set_string("Hello, world!".into());
        // "Hello" occupies character positions 0..5.
        assert_eq!(tokenizer.token_range_at_index(0), (0..5).into());
    }

    #[test]
    fn test_token_range_for_range() {
        let mut tokenizer = word_tokenizer();
        tokenizer.set_string("Hello, world!".into());
        assert_eq!(
            tokenizer.token_range_for_range((0..5).into()),
            (0..5).into()
        );
    }

    #[test]
    fn test_enumerate_tokens_in_range() {
        let mut tokenizer = word_tokenizer();
        let text = "Hello";
        tokenizer.set_string(text.into());
        tokenizer.enumerate_tokens_in_range_using_block((0..text.len()).into(), |_, attr, _| {
            assert_eq!(attr, NLTokenizerAttributes::None);
        });
    }
}
201}