rs_conllu/lib.rs
1#![warn(missing_docs)]
//! A library for parsing the CoNLL-U format.
3//!
4//! ## Basic Usage
5//!
6//! ```
7//! use rs_conllu::parse_file;
8//! use std::fs::File;
9//!
10//! # use std::error::Error;
11//! # fn main() -> Result<(), Box<dyn Error>> {
12//! let file = File::open("tests/example.conllu")?;
13//!
14//! let parsed = parse_file(file)?;
15//!
16//! // parse_file returns a `ParsedDoc`, which allows iteration
17//! // over the contained sentences.
18//! for sentence in parsed {
19//! // We can also iterate over the tokens in the sentence.
20//! for token in sentence {
21//! // Process token, e.g. access individual fields.
22//! println!("{}", token.form)
23//! }
24//! }
25//! # Ok(())
26//! # }
27//!
28//! ```
29//! ## Modifying
30//!
31//! If manipulation is necessary, sentences can be iterated
32//! mutably. The example below shows how we can change the
33//! `form` and `lemma` of a particular token.
34//!
35//!
36//! ```
37//! use rs_conllu::{parse_file, Sentence, TokenID};
38//! use std::fs::File;
39//!
40//! # use std::error::Error;
41//! # fn main() -> Result<(), Box<dyn Error>> {
42//! let file = File::open("tests/example.conllu")?;
43//!
44//! let mut parsed = parse_file(file)?;
45//!
46//! if let Some(s) = parsed.iter_mut().nth(0) {
47//! if let Some(token) = s.get_token_mut(TokenID::Single(6)) {
48//! token.form = "crabs".to_string();
49//! token.lemma = Some("crab".to_string());
50//! }
51//! }
52//!
53//! # Ok(())
54//! # }
55//! ```
56
57#![allow(clippy::tabs_in_doc_comments)]
58
59use std::{error::Error, fmt, str::FromStr};
60
61pub mod parsers;
62pub mod sentence;
63pub mod token;
64
65pub use sentence::Sentence;
66pub use token::{Dep, Token, TokenID};
67
68pub use parsers::{parse_file, parse_sentence, parse_token};
69
70#[cfg(feature = "serde")]
71pub use serde::{Deserialize, Serialize};
72
/// Error used when a Universal POS tag could not be parsed.
///
/// This is the error type of the [`FromStr`] implementation for [`UPOS`].
/// It is a unit struct and therefore carries no details about the rejected
/// input; its [`Display`](fmt::Display) output is a fixed message.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ParseUposError;

impl fmt::Display for ParseUposError {
    /// Writes the fixed, human-readable description of this error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The message contains no placeholders, so it is written directly
        // instead of going through the formatting machinery.
        f.write_str("Error while parsing UPOS.")
    }
}

// The default `Error` methods suffice: there is no underlying source error.
impl Error for ParseUposError {}
84
85/// The set of Universal POS tags according
86/// to [UD version 2](https://universaldependencies.org/u/pos/index.html).
87#[derive(Debug, Clone, Copy, PartialEq, Eq, derive_more::Display)]
88#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
89pub enum UPOS {
90 /// adjective
91 ADJ,
92 /// adposition
93 ADP,
94 /// adverb
95 ADV,
96 /// auxiliary
97 AUX,
98 /// coordinating conjunction
99 CCONJ,
100 /// determiner
101 DET,
102 /// interjection
103 INTJ,
104 /// noun
105 NOUN,
106 /// numeral
107 NUM,
108 /// particle
109 PART,
110 /// pronoun
111 PRON,
112 /// proper noun
113 PROPN,
114 /// punctuation
115 PUNCT,
116 /// subordinating conjunction
117 SCONJ,
118 /// symbol
119 SYM,
120 /// verb
121 VERB,
122 /// other
123 X,
124}
125
126impl FromStr for UPOS {
127 type Err = ParseUposError;
128
129 fn from_str(value: &str) -> Result<Self, Self::Err> {
130 use UPOS::*;
131 match value {
132 "ADJ" => Ok(ADJ),
133 "ADP" => Ok(ADP),
134 "ADV" => Ok(ADV),
135 "AUX" => Ok(AUX),
136 "CCONJ" => Ok(CCONJ),
137 "DET" => Ok(DET),
138 "INTJ" => Ok(INTJ),
139 "NOUN" => Ok(NOUN),
140 "NUM" => Ok(NUM),
141 "PART" => Ok(PART),
142 "PRON" => Ok(PRON),
143 "PROPN" => Ok(PROPN),
144 "PUNCT" => Ok(PUNCT),
145 "SCONJ" => Ok(SCONJ),
146 "SYM" => Ok(SYM),
147 "VERB" => Ok(VERB),
148 "X" => Ok(X),
149 _ => Err(ParseUposError),
150 }
151 }
152}