jsonnlp/
lib.rs

1#![crate_name = "jsonnlp"]
2
3
4//! This is an implementation of [JSON-NLP](https://github.com/SemiringInc/JSON-NLP) in Rust.
5//! [JSON-NLP](https://github.com/SemiringInc/JSON-NLP) provides the data structures for
6//! detailed Natural Language Processing (NLP) annotations of speech and text.
7//! 
8//! (C) 2021 by [Semiring Inc.](https://semiring.com/), [Damir Cavar](http://damir.cavar.me/) <damir@semiring.com>
9//! 
10//! Version 0.0.4
11//! 
//! For more details, see:
13//! 
14//! - [GitHub repo](https://github.com/SemiringInc/RustJSONNLP)
15//! 
16
17
18//use serde_json::json;
19use serde_json;
20use serde;
21use serde::{Serialize, Deserialize};
22use std::error::Error;
23use std::fs::File;
24use std::io::BufReader;
25use std::path::Path;
26
27/// contains the metadata for the JSON-NLP and individual documents.
28/// The metadata is using Dublin Core (DC) terms.
29#[derive(Serialize, Deserialize)]
30pub struct Meta {
31    #[serde(rename = "DC.conformsTo")]
32    #[serde(skip_serializing_if = "String::is_empty")]
33	conforms_to: String,
34    #[serde(rename = "DC.author")]
35    #[serde(skip_serializing_if = "String::is_empty")]
36	author:      String,
37    #[serde(rename = "DC.created")]
38    #[serde(skip_serializing_if = "String::is_empty")]
39	created:     String,
40    #[serde(rename = "DC.date")]
41    #[serde(skip_serializing_if = "String::is_empty")]
42	date:        String,
43    #[serde(rename = "DC.source")]
44    #[serde(skip_serializing_if = "String::is_empty")]
45	source:      String,
46    #[serde(rename = "DC.language")]
47    #[serde(skip_serializing_if = "String::is_empty")]
48	language:    String,
49    #[serde(rename = "DC.creator")]
50    #[serde(skip_serializing_if = "String::is_empty")]
51	creator:     String,
52    #[serde(rename = "DC.publisher")]
53    #[serde(skip_serializing_if = "String::is_empty")]
54	publisher:   String,
55    #[serde(rename = "DC.title")]
56    #[serde(skip_serializing_if = "String::is_empty")]
57	title:       String,
58    #[serde(rename = "DC.description")]
59    #[serde(skip_serializing_if = "String::is_empty")]
60	description: String,
61    #[serde(rename = "DC.identifier")]
62    #[serde(skip_serializing_if = "String::is_empty")]
63	identifier:  String,
64}
65
66///  contains different morpho-syntactic, semantic, or orthographic token features.
67#[derive(Serialize, Deserialize)]
68pub struct TokenFeatures {
69	overt:          bool,
70	stop:           bool,
71	alpha:          bool,
72	number:         u8,
73    #[serde(skip_serializing_if = "String::is_empty")]
74	gender:         String,
75	person:         u8,
76    #[serde(skip_serializing_if = "String::is_empty")]
77	tense:          String,
78	perfect:        bool,
79	continuous:     bool,
80	progressive:    bool,
81    #[serde(skip_serializing_if = "String::is_empty")]
82	case:           String,
83	human:          bool,
84	animate:        bool,
85	negated:        bool,
86	countable:      bool,
87	factive:        bool,
88	counterfactive: bool,
89	irregular:      bool,
90    #[serde(rename = "phrasalVerb")]
91	phrasalverb:    bool,
92    #[serde(skip_serializing_if = "String::is_empty")]
93	mood:           String,
94	foreign:        bool,
95    #[serde(rename = "spaceAfter")]
96	spaceafter:     bool,
97}
98
99/// contains the token information.
100#[derive(Serialize, Deserialize)]
101pub struct Token {
102	id:                u64,
103	sentence_id:       u64,
104	text:              String,
105	lemma:             String,
106    #[serde(skip_serializing_if = "String::is_empty")]
107	xpos:              String,
108	xpos_prob:         f64,
109    #[serde(skip_serializing_if = "String::is_empty")]
110	upos:              String,
111	upos_prob:         f64,
112    #[serde(skip_serializing_if = "String::is_empty")]
113	entity_iob:        String,
114    #[serde(rename = "characterOffsetBegin")]
115	char_offset_begin: u64,
116    #[serde(rename = "characterOffsetEnd")]
117	char_offset_end:   u64,
118    #[serde(skip_serializing_if = "String::is_empty")]
119    #[serde(rename = "propID")]
120	prop_id:           String,
121    #[serde(rename = "propIDProbability")]
122	prop_id_prob:      f64,
123    #[serde(rename = "frameID")]
124	frame_id:          u64,
125    #[serde(rename = "frameIDProb")]
126	frame_id_prob:     f64,
127    #[serde(rename = "wordNetID")]
128	wordnet_id:        u64,
129    #[serde(rename = "wordNetIDProb")]
130	wordnet_id_prob:   f64,
131    #[serde(rename = "verbNetID")]
132	verbnet_id:        u64,
133    #[serde(rename = "verbNetIDProb")]
134	verbnet_id_prob:   f64,
135    #[serde(skip_serializing_if = "String::is_empty")]
136	lang:              String,
137	features:          TokenFeatures,
138    #[serde(skip_serializing_if = "String::is_empty")]
139	shape:             String,
140    #[serde(skip_serializing_if = "String::is_empty")]
141	entity:            String,
142}
143
144/// contains sentence information.
145#[derive(Serialize, Deserialize)]
146pub struct Sentence {
147	id:             u64,
148    #[serde(rename = "tokenFrom")]
149	token_from:     u64,
150    #[serde(rename = "tokenTo")]
151	token_to:       u64,
152	tokens:         Vec<u64>,
153	clauses:        Vec<u64>,
154    #[serde(rename = "type")]
155    #[serde(skip_serializing_if = "String::is_empty")]
156	stype:          String,
157    #[serde(skip_serializing_if = "String::is_empty")]
158	sentiment:      String,
159    #[serde(rename = "sentimentProb")]
160	sentiment_prob: f64,
161}
162
163/// contains clause information, assuming that sentences contain one or more clauses.
164#[derive(Serialize, Deserialize)]
165pub struct Clause {
166	id:             u64,
167    #[serde(rename = "sentenceId")]
168	sentence_id:    u64,
169    #[serde(rename = "tokenFrom")]
170	token_from:     u64,
171    #[serde(rename = "tokenTo")]
172	token_to:       u64,
173	tokens:         Vec<u64>,
174	main:           bool,
175	gov:            u64,
176	head:           u64,
177	neg:            bool,
178    #[serde(skip_serializing_if = "String::is_empty")]
179	tense:          String,
180    #[serde(skip_serializing_if = "String::is_empty")]
181	mood:           String,
182	perfect:        bool,
183	continuous:     bool,
184    #[serde(skip_serializing_if = "String::is_empty")]
185	aspect:         String,
186    #[serde(skip_serializing_if = "String::is_empty")]
187	voice:          String,
188    #[serde(skip_serializing_if = "String::is_empty")]
189	sentiment:      String,
190    #[serde(rename = "sentimentProb")]
191	sentiment_prob: f64,
192}
193
194/// contains dependency information as part of dependency trees.
195/// A dependency is a tuple that contains a governor token ID, a dependent token ID, and a dependency label.
196/// In addition, each dependency can provide probability information about the confidence or another likelihood property.
197#[derive(Serialize, Deserialize)]
198pub struct Dependency {
199	lab:  String,
200	gov:  u64,
201	dep:  u64,
202	prob: f64,
203}
204
205/// This struct contains information about a dependency tree.
206/// A dependency tree is a set of dependency triples.
207/// In addition a tree provides the possibility to encode a probability score for the dependency tree.
208#[derive(Serialize, Deserialize)]
209pub struct DependencyTree {
210    #[serde(rename = "sentenceId")]
211	sentence_id:  u64,
212    #[serde(skip_serializing_if = "String::is_empty")]
213	style:        String,
214	dependencies: Vec<Dependency>,
215	prob:         f64,
216}
217
218/// This struct contains information about a representative phrase or token for coreference.
219#[derive(Serialize, Deserialize)]
220pub struct CoreferenceRepresentantive {
221	tokens: Vec<u64>,
222	head:   u64,
223}
224
225/// This struct contains information about a referent or anaphoric expression that refers to some referent.
226#[derive(Serialize, Deserialize)]
227pub struct CoreferenceReferents {
228	tokens: Vec<u64>,
229	head:   u64,
230	prob:   f64,
231}
232
233/// This struct contains information about a coreference relation between one referent and a list of refering expressions.
234#[derive(Serialize, Deserialize)]
235pub struct Coreference {
236	id:             u64,
237	representative: CoreferenceRepresentantive,
238	referents:      Vec<CoreferenceReferents>,
239}
240
241/// This struct contains information about scope relations between tokens or phrases in a sentence.
242#[derive(Serialize, Deserialize)]
243pub struct Scope {
244	id:        u64,
245	gov:       Vec<u64>,
246	dep:       Vec<u64>,
247	terminals: Vec<u64>,
248}
249
250/// This struct contains information about the constituent parse tree for a sentence.
251#[derive(Serialize, Deserialize)]
252pub struct ConstituentParse {
253    #[serde(rename = "sentenceId")]
254	sentence_id:        u64,
255    #[serde(rename = "type")]
256    #[serde(skip_serializing_if = "String::is_empty")]
257	ctype:              String,
258    #[serde(rename = "labeledBracketing")]
259    #[serde(skip_serializing_if = "String::is_empty")]
260	labeled_bracketing: String,
261	prob:               f64,
262	scopes:             Vec<Scope>,
263}
264
265/// This struct provides information about expressions or chunks in the text.
266#[derive(Serialize, Deserialize)]
267pub struct Expression {
268	id:         u64,
269    #[serde(rename = "type")]
270    #[serde(skip_serializing_if = "String::is_empty")]
271	etype:      String,
272	head:       u64,
273    #[serde(skip_serializing_if = "String::is_empty")]
274	dependency: String,
275    #[serde(rename = "tokenFrom")]
276	token_from: u64,
277    #[serde(rename = "tokenTo")]
278	token_to:   u64,
279	tokens:     Vec<u64>,
280	prob:       f64,
281}
282
283/// This struct contains information about paragraph properties in the text.
284#[derive(Serialize, Deserialize)]
285pub struct Paragraph {
286	id:         u64,
287    #[serde(rename = "tokenFrom")]
288	token_from: u64,
289    #[serde(rename = "tokenTo")]
290	token_to:   u64,
291	tokens:     Vec<u64>,
292	sentences:  Vec<u64>,
293}
294
295/// This struct encodes generic attribute value tuples for Attribute Value Matrix (AVM) based encoding of properties.
296#[derive(Serialize, Deserialize)]
297pub struct Attribute {
298	lab: String,
299	val: String,
300}
301
302/// This struct encodes entity properties.
303#[derive(Serialize, Deserialize)]
304pub struct Entity {
305	id:             u64,
306    #[serde(skip_serializing_if = "String::is_empty")]
307	label:          String,
308    #[serde(rename = "type")]
309    #[serde(skip_serializing_if = "String::is_empty")]
310	etype:          String,
311    #[serde(skip_serializing_if = "String::is_empty")]
312	url:            String,
313	head:           u64,
314    #[serde(rename = "tokenFrom")]
315	token_from:      u64,
316    #[serde(rename = "tokenTo")]
317	token_to:        u64,
318	tokens:         Vec<u64>,
319    #[serde(rename = "tripleID")]
320	triple_id:       u64,
321    #[serde(skip_serializing_if = "String::is_empty")]
322	sentiment:      String,
323    #[serde(rename = "sentimentProb")]
324	sentiment_prob: f64,
325	count:          u64,
326	attributes:     Vec<Attribute>,
327}
328
329/// This struct encodes relations and properties in a graph for entity, cocept, or knowledge graphs.
330#[derive(Serialize, Deserialize)]
331pub struct Relation {
332	id:             u64,
333    #[serde(skip_serializing_if = "String::is_empty")]
334	label:          String,
335    #[serde(rename = "type")]
336    #[serde(skip_serializing_if = "String::is_empty")]
337	rtype:          String,
338    #[serde(skip_serializing_if = "String::is_empty")]
339	url:            String,
340	head:           u64,
341    #[serde(rename = "tokenFrom")]
342	token_from:      u64,
343    #[serde(rename = "tokenTo")]
344	token_to:        u64,
345	tokens:         Vec<u64>,
346    #[serde(skip_serializing_if = "String::is_empty")]
347	sentiment:      String,
348    #[serde(rename = "sentimentProb")]
349	sentiment_prob: f64,
350	count:          u64,
351	attributes:     Vec<Attribute>,
352}
353
354/// This struct encodes triples for RDF, JSON-LD, or general Knowledge Graph encoding.
355#[derive(Serialize, Deserialize)]
356pub struct Triple {
357	id:           u64,
358    #[serde(rename = "fromEntity")]
359	from_entity:  u64,
360    #[serde(rename = "toEntity")]
361	to_entity:    u64,
362	rel:          u64,
363    #[serde(rename = "clauseID")]
364	clause_id:    Vec<u64>,
365    #[serde(rename = "sentenceID")]
366	sentence_id:  Vec<u64>,
367	directional:  bool,
368    #[serde(rename = "eventID")]
369	event_id:     u64,
370    #[serde(rename = "tempSeq")]
371	temp_seq:     u64,
372	prob:         f64,
373	syntactic:    bool,
374	implied:      bool,
375	presupposed:  bool,
376	count:        u64,
377}
378
379/// This struct contains all the information for one particular document.
380#[derive(Serialize, Deserialize)]
381pub struct Document {
382	meta:             Meta,
383	id:               u64,
384    #[serde(rename = "tokenList")]
385	token_list:       Vec<Token>,
386	clauses:          Vec<Clause>,
387	sentences:        Vec<Sentence>,
388	paragraphs:       Vec<Paragraph>,
389    #[serde(rename = "dependencyTrees")]
390	dependency_trees: Vec<DependencyTree>,
391	coreferences:     Vec<Coreference>,
392	constituents:     Vec<ConstituentParse>,
393	expressions:      Vec<Expression>,
394	entities:         Vec<Entity>,
395	relations:        Vec<Relation>,
396	triples:          Vec<Triple>,
397}
398
399/// This struct contains general elements of a JSON-NLP document.
400#[derive(Serialize, Deserialize)]
401pub struct JSONNLP {
402	meta: Meta,
403	docs: Vec<Document>,
404}
405
406/// This function converts a string containing JSON-NLP, returning a JSONNLP struct.
407pub fn from_string(json: &str) -> Result<JSONNLP, Box<dyn Error>> {
408	let r = serde_json::from_str::<JSONNLP>(json).unwrap();
409	Ok(r)
410}
411
412/// This function reads a JSON-NLP document from a file and returns a JSONNLP struct.
413pub fn from_file<P: AsRef<Path>>(path: P) -> Result<JSONNLP, Box<dyn Error>> {
414    let file = File::open(path)?;
415    let reader = BufReader::new(file);
416    let u = serde_json::from_reader(reader)?;
417	Ok(u)
418}
419
420/// This function returns a string representation of a JSONNLP struct/object.
421pub fn get_json(j: &JSONNLP) -> Result<String, Box<dyn Error>> {
422	let r = serde_json::to_string(j).unwrap();
423	Ok(r)
424}