rs_conllu/
token.rs

//! The basic token element, its building blocks and builder.
2
3use std::{collections::HashMap, fmt::Display};
4
5use crate::UPOS;
6
7#[cfg(feature = "serde")]
8use serde::{Deserialize, Serialize};
9
/// The id of a [`Token`].
///
/// CoNLL-U knows three kinds of ids, one per variant below. The [`Display`]
/// implementation renders each of them in its CoNLL-U text notation.
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum TokenID {
    /// The standard, single index (e.g. `1`).
    Single(usize),
    /// A range of tokens that form an ID. Denoted by a hyphen
    /// in CoNLL-U format (e.g. `1-3`).
    Range(usize, usize),
    /// To represent ellipses, CoNLL-U allows creating sub-indices of the preceding
    /// regular node (or 0 if it is at the beginning of a sentence). They are separated
    /// by a decimal point (e.g. `1.2`) and represent an "empty" node.
    Empty(usize, usize),
}

impl Display for TokenID {
    /// Writes the id in its text notation: `id` for [`TokenID::Single`],
    /// `start-end` for [`TokenID::Range`] and `parent.id` for [`TokenID::Empty`].
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            TokenID::Single(id) => write!(f, "{id}"),
            TokenID::Range(start, end) => write!(f, "{start}-{end}"),
            TokenID::Empty(parent, id) => write!(f, "{parent}.{id}"),
        }
    }
}
34
/// Key-value pairs as stored in the `features` field of a token.
type Features = HashMap<String, String>;
36
37/// A `Token` is the basic unit of what is defined on a (non-comment) line in CoNLL-U format.
38/// The ConLL-U specification uses the terms _word_, _node_ and _multi-word token_ while this crate
39/// decided to use the general notion of _Token_ to subsume all of the above.
40///
41/// The fields of a `Token` are the ten fields that are defined in the CoNLL-U specification.
42/// The only mandatory fields are [id](Token::id) and [form](Token::form). The remaining ones are optional (absence denoted
43/// by an underscore in the text format) and represented as [Option] types.
44///
45/// A [TokenBuilder] type is available for more convenient creation of [Token] structs,
46/// which can be instantiated via the [builder](Token::builder) method.
47///
48#[derive(Debug, Clone, PartialEq, Eq)]
49#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
50pub struct Token {
51    /// The id of the token within the sentence.
52    pub id: TokenID,
53    /// The surface form of the token as it appears in the sentence.
54    pub form: String,
55    /// The lemma or lexical form of the token.
56    pub lemma: Option<String>,
57    /// The universal POS tag of the token.
58    pub upos: Option<UPOS>,
59    /// Language-specific POS tag for the token.
60    pub xpos: Option<String>,
61    /// Morphological features of the token as key-value pairs.
62    pub features: Option<Features>,
63    /// The head of the current token.
64    pub head: Option<TokenID>,
65    /// The dependency relation fo the token.
66    pub deprel: Option<String>,
67    /// Enhanced dependency graph information.
68    pub deps: Option<Vec<Dep>>,
69    /// Other types of annotation.
70    pub misc: Option<String>,
71}
72
73impl Token {
74    /// Return a new [TokenBuilder].
75    pub fn builder(id: TokenID, form: String) -> TokenBuilder {
76        TokenBuilder::new(id, form)
77    }
78}
79
80/// A builder for Tokens to allow for more convenient manual creation if necessary.
81///
82/// ```rust
83/// use rs_conllu::{Token, TokenID};
84///
85/// // Get a new builder from Token
86/// let token = Token::builder(TokenID::Single(1), "Hello".to_string())
87///     .lemma("Hello".to_string())
88///     .build();
89///
90/// ```
91pub struct TokenBuilder {
92    id: TokenID,
93    form: String,
94    lemma: Option<String>,
95    upos: Option<UPOS>,
96    xpos: Option<String>,
97    features: Option<Features>,
98    head: Option<TokenID>,
99    deprel: Option<String>,
100    deps: Option<Vec<Dep>>,
101    misc: Option<String>,
102}
103
104impl TokenBuilder {
105    /// Constructor for [TokenBuilder]. Both `id` and `form` are mandatory
106    /// fields and thus required when instantiating.
107    pub fn new(id: TokenID, form: String) -> TokenBuilder {
108        TokenBuilder {
109            id,
110            form,
111            lemma: None,
112            upos: None,
113            xpos: None,
114            features: None,
115            head: None,
116            deprel: None,
117            deps: None,
118            misc: None,
119        }
120    }
121
122    /// Set the lemma field.
123    pub fn lemma(mut self, lemma: String) -> TokenBuilder {
124        self.lemma = Some(lemma);
125        self
126    }
127
128    /// Set the universal POS tag field.
129    pub fn upos(mut self, upos: UPOS) -> TokenBuilder {
130        self.upos = Some(upos);
131        self
132    }
133
134    /// Set the xpos field.
135    pub fn xpos(mut self, xpos: String) -> TokenBuilder {
136        self.xpos = Some(xpos);
137        self
138    }
139
140    /// Set the features field.
141    pub fn features(mut self, features: Features) -> TokenBuilder {
142        self.features = Some(features);
143        self
144    }
145
146    /// Set the head field.
147    pub fn head(mut self, head: TokenID) -> TokenBuilder {
148        self.head = Some(head);
149        self
150    }
151
152    /// Set the deprel field.
153    pub fn deprel(mut self, deprel: String) -> TokenBuilder {
154        self.deprel = Some(deprel);
155        self
156    }
157
158    /// Set the deps field.
159    pub fn deps(mut self, dep: Vec<Dep>) -> TokenBuilder {
160        self.deps = Some(dep);
161        self
162    }
163
164    /// Set the misc field.
165    pub fn misc(mut self, misc: String) -> TokenBuilder {
166        self.misc = Some(misc);
167        self
168    }
169
170    /// Build the token.
171    pub fn build(self) -> Token {
172        Token {
173            id: self.id,
174            form: self.form,
175            lemma: self.lemma,
176            upos: self.upos,
177            xpos: self.xpos,
178            features: self.features,
179            head: self.head,
180            deprel: self.deprel,
181            deps: self.deps,
182            misc: self.misc,
183        }
184    }
185}
186
187/// A head-relation pair, used in the `deps` field of [Token]
188#[derive(Debug, Clone, PartialEq, Eq)]
189#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
190pub struct Dep {
191    /// The head of the relation.
192    pub head: TokenID,
193    /// The type of the relation.
194    pub rel: String,
195}