rs_conllu/
lib.rs

1#![warn(missing_docs)]
//! A library for parsing the CoNLL-U format.
3//!
4//! ## Basic Usage
5//!
6//! ```
7//! use rs_conllu::parse_file;
8//! use std::fs::File;
9//!
10//! # use std::error::Error;
11//! # fn main() -> Result<(), Box<dyn Error>> {
12//! let file = File::open("tests/example.conllu")?;
13//!
14//! let parsed = parse_file(file)?;
15//!
16//! // parse_file returns a `ParsedDoc`, which allows iteration
17//! // over the contained sentences.
18//! for sentence in parsed {
19//!     // We can also iterate over the tokens in the sentence.
20//!     for token in sentence {
21//!         // Process token, e.g. access individual fields.
22//!         println!("{}", token.form)
23//!     }
24//! }
25//! # Ok(())
26//! # }
27//!
28//! ```
29//! ## Modifying
30//!
31//! If manipulation is necessary, sentences can be iterated
32//! mutably. The example below shows how we can change the
33//! `form` and `lemma` of a particular token.
34//!
35//!
36//! ```
37//! use rs_conllu::{parse_file, Sentence, TokenID};
38//! use std::fs::File;
39//!
40//! # use std::error::Error;
41//! # fn main() -> Result<(), Box<dyn Error>> {
42//! let file = File::open("tests/example.conllu")?;
43//!
44//! let mut parsed = parse_file(file)?;
45//!
46//! if let Some(s) = parsed.iter_mut().nth(0) {
47//!     if let Some(token) = s.get_token_mut(TokenID::Single(6)) {
48//!         token.form = "crabs".to_string();
49//!         token.lemma = Some("crab".to_string());
50//!     }
51//! }
52//!
53//! # Ok(())
54//! # }
55//! ```
56
57#![allow(clippy::tabs_in_doc_comments)]
58
59use std::{error::Error, fmt, str::FromStr};
60
61pub mod parsers;
62pub mod sentence;
63pub mod token;
64
65pub use sentence::Sentence;
66pub use token::{Dep, Token, TokenID};
67
68pub use parsers::{parse_file, parse_sentence, parse_token};
69
70#[cfg(feature = "serde")]
71pub use serde::{Deserialize, Serialize};
72
/// Error returned when a string could not be parsed as a Universal POS tag.
///
/// The type carries no payload: it only signals that the input did not
/// match any known tag. It is `Clone` so that results and wrapping error
/// types containing it can be duplicated freely.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseUposError;

impl fmt::Display for ParseUposError {
    /// Writes the fixed, human-readable description of the error.
    fn fmt(&self, out: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The message has no interpolated parts, so it is written directly.
        out.write_str("Error while parsing UPOS.")
    }
}

// No underlying cause is tracked, so the default `Error` methods suffice.
impl Error for ParseUposError {}
84
85/// The set of Universal POS tags according
86/// to [UD version 2](https://universaldependencies.org/u/pos/index.html).
87#[derive(Debug, Clone, Copy, PartialEq, Eq, derive_more::Display)]
88#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
89pub enum UPOS {
90    /// adjective
91    ADJ,
92    /// adposition
93    ADP,
94    /// adverb
95    ADV,
96    /// auxiliary
97    AUX,
98    /// coordinating conjunction
99    CCONJ,
100    /// determiner
101    DET,
102    /// interjection
103    INTJ,
104    /// noun
105    NOUN,
106    /// numeral
107    NUM,
108    /// particle
109    PART,
110    /// pronoun
111    PRON,
112    /// proper noun
113    PROPN,
114    /// punctuation
115    PUNCT,
116    /// subordinating conjunction
117    SCONJ,
118    /// symbol
119    SYM,
120    /// verb
121    VERB,
122    /// other
123    X,
124}
125
126impl FromStr for UPOS {
127    type Err = ParseUposError;
128
129    fn from_str(value: &str) -> Result<Self, Self::Err> {
130        use UPOS::*;
131        match value {
132            "ADJ" => Ok(ADJ),
133            "ADP" => Ok(ADP),
134            "ADV" => Ok(ADV),
135            "AUX" => Ok(AUX),
136            "CCONJ" => Ok(CCONJ),
137            "DET" => Ok(DET),
138            "INTJ" => Ok(INTJ),
139            "NOUN" => Ok(NOUN),
140            "NUM" => Ok(NUM),
141            "PART" => Ok(PART),
142            "PRON" => Ok(PRON),
143            "PROPN" => Ok(PROPN),
144            "PUNCT" => Ok(PUNCT),
145            "SCONJ" => Ok(SCONJ),
146            "SYM" => Ok(SYM),
147            "VERB" => Ok(VERB),
148            "X" => Ok(X),
149            _ => Err(ParseUposError),
150        }
151    }
152}