rs_conllu/token.rs
1//! The basic token element, its building blocks and builder.
2
3use std::{collections::HashMap, fmt::Display};
4
5use crate::UPOS;
6
7#[cfg(feature = "serde")]
8use serde::{Deserialize, Serialize};
9
/// The id of a [`Token`].
///
/// Its textual form is produced by the [`Display`] implementation:
/// `n` for a single index, `start-end` for a range and `parent.sub`
/// for an empty node.
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum TokenID {
    /// The standard, single index.
    Single(usize),
    /// A range of tokens that form an ID. Denoted by a hyphen
    /// in CoNLL-U format (e.g. 1-3).
    Range(usize, usize),
    /// To represent ellipses, CoNLL-U allows creating sub-indices of the preceding
    /// regular node (or 0 if it is at the beginning of a sentence). They are separated
    /// by a decimal point and represent an "empty" node.
    Empty(usize, usize),
}
24
25impl Display for TokenID {
26 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
27 match self {
28 TokenID::Single(id) => write!(f, "{id}"),
29 TokenID::Range(start, end) => write!(f, "{start}-{end}"),
30 TokenID::Empty(parent, id) => write!(f, "{parent}.{id}"),
31 }
32 }
33}
34
/// Morphological features of a token, stored as key-value pairs.
type Features = HashMap<String, String>;
36
/// A `Token` is the basic unit of what is defined on a (non-comment) line in CoNLL-U format.
/// The CoNLL-U specification uses the terms _word_, _node_ and _multi-word token_ while this crate
/// decided to use the general notion of _Token_ to subsume all of the above.
///
/// The fields of a `Token` are the ten fields that are defined in the CoNLL-U specification.
/// The only mandatory fields are [id](Token::id) and [form](Token::form). The remaining ones are optional (absence denoted
/// by an underscore in the text format) and represented as [Option] types.
///
/// A [TokenBuilder] type is available for more convenient creation of [Token] structs,
/// which can be instantiated via the [builder](Token::builder) method.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Token {
    /// The id of the token within the sentence.
    pub id: TokenID,
    /// The surface form of the token as it appears in the sentence.
    pub form: String,
    /// The lemma or lexical form of the token.
    pub lemma: Option<String>,
    /// The universal POS tag of the token.
    pub upos: Option<UPOS>,
    /// Language-specific POS tag for the token.
    pub xpos: Option<String>,
    /// Morphological features of the token as key-value pairs.
    pub features: Option<Features>,
    /// The head of the current token.
    pub head: Option<TokenID>,
    /// The dependency relation of the token.
    pub deprel: Option<String>,
    /// Enhanced dependency graph information.
    pub deps: Option<Vec<Dep>>,
    /// Other types of annotation.
    pub misc: Option<String>,
}
72
impl Token {
    /// Return a new [`TokenBuilder`] seeded with the two mandatory fields.
    ///
    /// This forwards `id` and `form` to [`TokenBuilder::new`]; all optional
    /// fields start out as `None` and can be set through the builder's
    /// methods before calling [`TokenBuilder::build`].
    pub fn builder(id: TokenID, form: String) -> TokenBuilder {
        TokenBuilder::new(id, form)
    }
}
79
80/// A builder for Tokens to allow for more convenient manual creation if necessary.
81///
82/// ```rust
83/// use rs_conllu::{Token, TokenID};
84///
85/// // Get a new builder from Token
86/// let token = Token::builder(TokenID::Single(1), "Hello".to_string())
87/// .lemma("Hello".to_string())
88/// .build();
89///
90/// ```
91pub struct TokenBuilder {
92 id: TokenID,
93 form: String,
94 lemma: Option<String>,
95 upos: Option<UPOS>,
96 xpos: Option<String>,
97 features: Option<Features>,
98 head: Option<TokenID>,
99 deprel: Option<String>,
100 deps: Option<Vec<Dep>>,
101 misc: Option<String>,
102}
103
104impl TokenBuilder {
105 /// Constructor for [TokenBuilder]. Both `id` and `form` are mandatory
106 /// fields and thus required when instantiating.
107 pub fn new(id: TokenID, form: String) -> TokenBuilder {
108 TokenBuilder {
109 id,
110 form,
111 lemma: None,
112 upos: None,
113 xpos: None,
114 features: None,
115 head: None,
116 deprel: None,
117 deps: None,
118 misc: None,
119 }
120 }
121
122 /// Set the lemma field.
123 pub fn lemma(mut self, lemma: String) -> TokenBuilder {
124 self.lemma = Some(lemma);
125 self
126 }
127
128 /// Set the universal POS tag field.
129 pub fn upos(mut self, upos: UPOS) -> TokenBuilder {
130 self.upos = Some(upos);
131 self
132 }
133
134 /// Set the xpos field.
135 pub fn xpos(mut self, xpos: String) -> TokenBuilder {
136 self.xpos = Some(xpos);
137 self
138 }
139
140 /// Set the features field.
141 pub fn features(mut self, features: Features) -> TokenBuilder {
142 self.features = Some(features);
143 self
144 }
145
146 /// Set the head field.
147 pub fn head(mut self, head: TokenID) -> TokenBuilder {
148 self.head = Some(head);
149 self
150 }
151
152 /// Set the deprel field.
153 pub fn deprel(mut self, deprel: String) -> TokenBuilder {
154 self.deprel = Some(deprel);
155 self
156 }
157
158 /// Set the deps field.
159 pub fn deps(mut self, dep: Vec<Dep>) -> TokenBuilder {
160 self.deps = Some(dep);
161 self
162 }
163
164 /// Set the misc field.
165 pub fn misc(mut self, misc: String) -> TokenBuilder {
166 self.misc = Some(misc);
167 self
168 }
169
170 /// Build the token.
171 pub fn build(self) -> Token {
172 Token {
173 id: self.id,
174 form: self.form,
175 lemma: self.lemma,
176 upos: self.upos,
177 xpos: self.xpos,
178 features: self.features,
179 head: self.head,
180 deprel: self.deprel,
181 deps: self.deps,
182 misc: self.misc,
183 }
184 }
185}
186
/// A head-relation pair, used in the `deps` field of [`Token`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Dep {
    /// The head of the relation.
    pub head: TokenID,
    /// The type of the relation.
    pub rel: String,
}