udgraph/
token.rs

1//! Tokens in the dependency graph.
2
3use std::collections::BTreeMap;
4use std::iter::FromIterator;
5use std::mem;
6use std::ops::{Deref, DerefMut};
7
8use crate::graph::{Iter, IterMut, Node, Sentence};
9
10/// A builder for [`Token`]s.
11///
12/// The `Token` type stores a CoNLL-U token. However, since this format
13/// permits a large number of fields, construction of a token can get
14/// tedious. This builder provides a fluent interface for creating [`Token`]
15/// instances.
16pub struct TokenBuilder {
17    token: Token,
18}
19
20impl TokenBuilder {
21    /// Create a [`TokenBuilder`] with all non-form fields set to absent.
22    pub fn new(form: impl Into<String>) -> TokenBuilder {
23        TokenBuilder {
24            token: Token::new(form),
25        }
26    }
27
28    /// Set the word form or punctuation symbol.
29    pub fn form(mut self, form: impl Into<String>) -> TokenBuilder {
30        self.token.set_form(form);
31        self
32    }
33
34    /// Set the lemma or stem of the word form.
35    pub fn lemma(mut self, lemma: impl Into<String>) -> TokenBuilder {
36        self.token.set_lemma(Some(lemma));
37        self
38    }
39
40    /// Set the universal part-of-speech tag.
41    pub fn upos(mut self, upos: impl Into<String>) -> TokenBuilder {
42        self.token.set_upos(Some(upos));
43        self
44    }
45
46    /// Set the language-specific part-of-speech tag.
47    pub fn xpos(mut self, xpos: impl Into<String>) -> TokenBuilder {
48        self.token.set_xpos(Some(xpos));
49        self
50    }
51
52    /// Set the syntactic and/or morphological features of the token.
53    pub fn features(mut self, features: Features) -> TokenBuilder {
54        self.token.set_features(features);
55        self
56    }
57
58    /// Set UD enhanced dependencies.
59    ///
60    /// **Warning:** this method will be removed once proper support for enhanced
61    /// dependencies is added.
62    pub fn deps(mut self, deps: impl Into<String>) -> TokenBuilder {
63        self.token.set_deps(Some(deps.into()));
64        self
65    }
66
67    /// Set miscellaneous token features.
68    pub fn misc(mut self, misc: Misc) -> TokenBuilder {
69        self.token.set_misc(misc);
70        self
71    }
72}
73
74impl From<Token> for TokenBuilder {
75    fn from(token: Token) -> Self {
76        TokenBuilder { token }
77    }
78}
79
80impl From<TokenBuilder> for Token {
81    fn from(builder: TokenBuilder) -> Self {
82        builder.token
83    }
84}
85
86/// Token
87///
88/// The `Token` type stores a CoNLL-U token. Various information can be
89/// associated with a token, such as its lemma, (universal) part-of-speech,
90/// morphological features, and other (miscellaneous) features.
91///
92/// `Token`s are typically stored as vertices in a
93/// [`DepGraph`](crate::graph::DepGraph), so that their dependency heads
94/// and dependents can be queried.
95#[derive(Clone, Debug, Eq, PartialEq)]
96pub struct Token {
97    form: String,
98    lemma: Option<String>,
99    upos: Option<String>,
100    xpos: Option<String>,
101    features: Features,
102    misc: Misc,
103
104    // Currently not exposed, but stored to preserve existing
105    // field on read -> write round trips.
106    deps: Option<String>,
107}
108
109impl Token {
110    /// Create a new token where all the non-form fields are absent.
111    pub fn new(form: impl Into<String>) -> Token {
112        Token {
113            form: form.into(),
114            lemma: None,
115            upos: None,
116            xpos: None,
117            features: Features::new(),
118            misc: Misc::new(),
119            deps: None,
120        }
121    }
122
123    /// Get the word form or punctuation symbol.
124    pub fn form(&self) -> &str {
125        self.form.as_ref()
126    }
127
128    /// Get the lemma or stem of the word form.
129    pub fn lemma(&self) -> Option<&str> {
130        self.lemma.as_ref().map(String::as_ref)
131    }
132
133    /// Get the universal part-of-speech tag.
134    pub fn upos(&self) -> Option<&str> {
135        self.upos.as_ref().map(String::as_ref)
136    }
137
138    /// Get the language-specific part-of-speech tag.
139    pub fn xpos(&self) -> Option<&str> {
140        self.xpos.as_ref().map(String::as_ref)
141    }
142
143    /// Get the syntactic and/or morphological features of the token.
144    pub fn features(&self) -> &Features {
145        &self.features
146    }
147
148    /// Get the syntactic and/or morphological features of the token.
149    ///
150    /// Returns a mutable reference, so that the features can be updated.
151    pub fn features_mut(&mut self) -> &mut Features {
152        &mut self.features
153    }
154
155    /// Get enhanced dependencies.
156    ///
157    /// **Warning:** this method will be removed once proper support for enhanced
158    /// dependencies is added.
159    pub fn deps(&self) -> Option<&str> {
160        self.deps.as_deref()
161    }
162
163    /// Get miscellaneous token features.
164    pub fn misc(&self) -> &Misc {
165        &self.misc
166    }
167
168    /// Get miscellaneous token features.
169    ///
170    /// Returns a mutable reference, so that the information can be updated.
171    pub fn misc_mut(&mut self) -> &mut Misc {
172        &mut self.misc
173    }
174
175    /// Set the word form or punctuation symbol.
176    ///
177    /// Returns the form that is replaced.
178    pub fn set_form(&mut self, form: impl Into<String>) -> String {
179        mem::replace(&mut self.form, form.into())
180    }
181
182    /// Set the lemma or stem of the word form.
183    ///
184    /// Returns the lemma that is replaced.
185    pub fn set_lemma<S>(&mut self, lemma: Option<S>) -> Option<String>
186    where
187        S: Into<String>,
188    {
189        mem::replace(&mut self.lemma, lemma.map(Into::into))
190    }
191
192    /// Set the universal part-of-speech tag.
193    ///
194    /// Returns the universal part-of-speech tag that is replaced.
195    pub fn set_upos<S>(&mut self, upos: Option<S>) -> Option<String>
196    where
197        S: Into<String>,
198    {
199        mem::replace(&mut self.upos, upos.map(Into::into))
200    }
201
202    /// Set the language-specific part-of-speech tag.
203    ///
204    /// Returns the language-specific part-of-speech tag that is replaced.
205    pub fn set_xpos<S>(&mut self, xpos: Option<S>) -> Option<String>
206    where
207        S: Into<String>,
208    {
209        mem::replace(&mut self.xpos, xpos.map(Into::into))
210    }
211
212    /// Set the syntactic and/or morphological features of the token.
213    ///
214    /// Returns the features that are replaced.
215    pub fn set_features(&mut self, features: Features) -> Features {
216        mem::replace(&mut self.features, features)
217    }
218
219    /// Set UD enhanced dependencies.
220    ///
221    /// **Warning:** this method will be removed once proper support for enhanced
222    /// dependencies is added.
223    pub fn set_deps(&mut self, deps: Option<impl Into<String>>) -> Option<String> {
224        mem::replace(&mut self.deps, deps.map(Into::into))
225    }
226
227    /// Set miscellaneous token features.
228    ///
229    /// Returns the features that are replaced.
230    pub fn set_misc(&mut self, misc: Misc) -> Misc {
231        mem::replace(&mut self.misc, misc)
232    }
233}
234
/// Token features.
///
/// In the CoNLL-U specification, these are the morphological features of
/// a token; typically they form a list or a key-value mapping.
///
/// The features are kept in a `BTreeMap` from feature name to feature
/// value. `Features` dereferences to that map, so the map's methods can
/// be called on it directly.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct Features {
    inner: BTreeMap<String, String>,
}

impl Features {
    /// Construct an empty set of features.
    pub fn new() -> Self {
        Self::default()
    }

    /// Unwrap the contained feature map.
    pub fn into_inner(self) -> BTreeMap<String, String> {
        let Features { inner } = self;
        inner
    }
}

/// Read access to the underlying feature map.
impl Deref for Features {
    type Target = BTreeMap<String, String>;

    fn deref(&self) -> &Self::Target {
        let Features { inner } = self;
        inner
    }
}

/// Write access to the underlying feature map.
impl DerefMut for Features {
    fn deref_mut(&mut self) -> &mut Self::Target {
        let Features { inner } = self;
        inner
    }
}

/// Wrap an existing feature map without copying it.
impl From<BTreeMap<String, String>> for Features {
    fn from(inner: BTreeMap<String, String>) -> Self {
        Features { inner }
    }
}

/// Collect `(name, value)` pairs into a set of features.
impl<S, T> FromIterator<(S, T)> for Features
where
    S: Into<String>,
    T: Into<String>,
{
    fn from_iter<I>(iter: I) -> Self
    where
        I: IntoIterator<Item = (S, T)>,
    {
        // Convert both halves of every pair to owned strings, gather them
        // in a map and wrap that map.
        Features::from(
            iter.into_iter()
                .map(|(name, value)| (name.into(), value.into()))
                .collect::<BTreeMap<String, String>>(),
        )
    }
}
301
/// Miscellaneous features.
///
/// The features are kept in a `BTreeMap` from feature name to an optional
/// feature value. `Misc` dereferences to that map, so the map's methods
/// can be called on it directly.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct Misc {
    inner: BTreeMap<String, Option<String>>,
}

impl Misc {
    /// Construct an empty set of features.
    pub fn new() -> Self {
        Self::default()
    }

    /// Unwrap the contained feature map.
    pub fn into_inner(self) -> BTreeMap<String, Option<String>> {
        let Misc { inner } = self;
        inner
    }
}

/// Read access to the underlying feature map.
impl Deref for Misc {
    type Target = BTreeMap<String, Option<String>>;

    fn deref(&self) -> &Self::Target {
        let Misc { inner } = self;
        inner
    }
}

/// Write access to the underlying feature map.
impl DerefMut for Misc {
    fn deref_mut(&mut self) -> &mut Self::Target {
        let Misc { inner } = self;
        inner
    }
}

/// Wrap an existing feature map without copying it.
impl From<BTreeMap<String, Option<String>>> for Misc {
    fn from(inner: BTreeMap<String, Option<String>>) -> Self {
        Misc { inner }
    }
}

/// Collect `(name, optional value)` pairs into miscellaneous features.
impl<S, T> FromIterator<(S, Option<T>)> for Misc
where
    S: Into<String>,
    T: Into<String>,
{
    fn from_iter<I>(iter: I) -> Self
    where
        I: IntoIterator<Item = (S, Option<T>)>,
    {
        // Convert every name, and every value that is present, to an
        // owned string; gather the pairs in a map and wrap that map.
        Misc::from(
            iter.into_iter()
                .map(|(name, value)| (name.into(), value.map(Into::into)))
                .collect::<BTreeMap<String, Option<String>>>(),
        )
    }
}
365
366/// Get tokens of a sentence.
367pub trait Tokens {
368    /// Get an iterator over the tokens in a sentence.
369    fn tokens(&self) -> TokenIter;
370
371    /// Get the tokens in a sentence mutably.
372    fn tokens_mut(&mut self) -> TokenIterMut;
373}
374
375impl Tokens for Sentence {
376    fn tokens(&self) -> TokenIter {
377        TokenIter { inner: self.iter() }
378    }
379
380    fn tokens_mut(&mut self) -> TokenIterMut {
381        TokenIterMut {
382            inner: self.iter_mut(),
383        }
384    }
385}
386
387/// Token iterator.
388pub struct TokenIter<'a> {
389    inner: Iter<'a>,
390}
391
392impl<'a> Iterator for TokenIter<'a> {
393    type Item = &'a Token;
394
395    fn next(&mut self) -> Option<Self::Item> {
396        for node in self.inner.by_ref() {
397            if let Node::Token(token) = node {
398                return Some(token);
399            }
400        }
401
402        None
403    }
404}
405
406/// Mutable token iterator.
407pub struct TokenIterMut<'a> {
408    inner: IterMut<'a>,
409}
410
411impl<'a> Iterator for TokenIterMut<'a> {
412    type Item = &'a mut Token;
413
414    fn next(&mut self) -> Option<Self::Item> {
415        for node in self.inner.by_ref() {
416            if let Node::Token(token) = node {
417                return Some(token);
418            }
419        }
420
421        None
422    }
423}
424
#[cfg(test)]
mod tests {
    use std::iter::FromIterator;

    use maplit::btreemap;

    use super::{Features, Tokens};
    use crate::tests::TEST_SENTENCES;

    /// Building `Features` from the pairs of a map reproduces that map.
    #[test]
    fn features_from_iter() {
        let expected = btreemap! {
            "feature2".to_string() => "y".to_string(),
            "feature1".to_string() => "x".to_string(),
        };

        let features = Features::from_iter(expected.clone());

        assert_eq!(expected, *features);
    }

    /// `tokens` yields the tokens at indices 1 and 2 of the first test
    /// sentence, in order, and nothing else.
    #[test]
    fn tokens() {
        let sentence = &TEST_SENTENCES[0];
        let mut tokens = sentence.tokens();

        for idx in 1..=2 {
            assert_eq!(tokens.next(), sentence[idx].token());
        }

        assert_eq!(tokens.next(), None);
    }

    /// `tokens_mut` yields the same tokens as `tokens`, and changes made
    /// through the yielded references end up in the sentence.
    #[test]
    fn tokens_mut() {
        let reference = &TEST_SENTENCES[0];
        let mut sentence = reference.clone();

        {
            let mut tokens = sentence.tokens_mut();

            let first = tokens.next().unwrap();
            assert_eq!(&*first, reference[1].token().unwrap());
            first.set_upos(Some("mutable"));

            assert_eq!(tokens.next().map(|token| &*token), reference[2].token());
            assert_eq!(tokens.next(), None);
        }

        assert_eq!(sentence[1].token().unwrap().upos(), Some("mutable"));
    }
}
467}