1#[macro_use]
2extern crate pest;
3use pest::prelude::*;
4
5extern crate glob;
6use glob::glob;
7
8extern crate simple_error;
9use simple_error::SimpleError;
10
11use std::fs::File;
12use std::io::prelude::*;
13
14#[derive(Debug, Clone, PartialEq, Eq)]
16pub enum PTBTree {
17 InnerNode {
18 label: String,
19 children: Vec<PTBTree>
20 },
21 TerminalNode {
22 label: String
23 }
24}
25
26impl PTBTree {
27 fn render_simple_brackets(&self) -> String {
29 match *self {
30 PTBTree::InnerNode { ref label, ref children } => {
31 format!("({} {})", label, children.iter().map(|c| c.render_simple_brackets()).collect::<Vec<_>>().join(" "))
32 }
33 PTBTree::TerminalNode { ref label } => {
34 label.to_string()
35 }
36 }
37 }
38
39 pub fn render(&self) -> String {
47 format!("({})", self.render_simple_brackets())
48 }
49
50 pub fn front(&self) -> String {
58 match *self {
59 PTBTree::TerminalNode { ref label } => label.clone(),
60 PTBTree::InnerNode { ref children, .. } => {
61 children.iter().map(|c| c.front()).collect::<Vec<String>>().join(" ")
62 }
63 }
64 }
65
66 pub fn pos_front(&self) -> String {
74 match *self {
75 PTBTree::TerminalNode { .. } => panic!("Encountered unexpected terminal while getting the POS front!"),
76 PTBTree::InnerNode { ref children, ref label } => {
77 if children.len() == 1 {
79 if let PTBTree::TerminalNode { .. } = children[0] {
80 return label.clone()
81 }
82 }
83
84 children.iter().map(|c| c.pos_front()).collect::<Vec<String>>().join(" ")
85 }
86 }
87 }
88
89 pub fn front_length(&self) -> usize {
91 match *self {
92 PTBTree::TerminalNode { .. } => 1,
93 PTBTree::InnerNode { ref children, .. } => {
94 children.iter().map(|c| c.front_length()).sum()
95 }
96 }
97 }
98
99 pub fn strip_all_predicate_annotations(&mut self) {
103 match *self {
104 PTBTree::InnerNode { ref mut label, ref mut children } => {
105 fn strip_one_label_suffix(s: &mut String) -> bool {
107 let functional_suffixes = ["-HLN", "-LST", "-TTL", "-CLF", "-NOM", "-ADV", "-LGS", "-PRD", "-SBJ", "-TPC", "-CLR", "-VOC", "-DIR", "-LOC", "-MNR", "-PRP", "-TMP"];
109 if let Some((i, _)) = s.char_indices().rev().nth(3) {
110 if functional_suffixes.contains(&&s[i..]) {
111 s.truncate(i);
112 return true
113 }
114 }
115
116 let mut gotdigit = false;
118 let mut truncation_index = None;
119 for (i, c) in s.char_indices().rev() {
120 if c.is_digit(10) {
121 gotdigit = true
122 } else if (c == '-' || c == '=') && gotdigit {
123 truncation_index = Some(i);
124 break
125 } else {
126 return false
127 }
128 }
129 match truncation_index {
130 None => false,
131 Some(i) => {s.truncate(i); true}
132 }
133 }
134 while strip_one_label_suffix(label) {}
135
136 let mut keeps_children = false;
138 let mut remove_me_indices: Vec<usize> = Vec::new();
139 for (i, c) in children.iter_mut().enumerate() {
140 c.strip_all_predicate_annotations();
141 let childlabel = match *c {
142 PTBTree::InnerNode { ref label, .. } | PTBTree::TerminalNode { ref label } => label
143 };
144 if !childlabel.is_empty() {
145 keeps_children = true
146 } else {
147 remove_me_indices.push(i)
148 }
149 }
150
151
152 if !keeps_children || label == "-NONE-" {
154 *label = "".to_string();
155 }
156 else {
158 for &i in remove_me_indices.iter().rev() {
159 children.remove(i);
160 }
161 }
162 }
163 PTBTree::TerminalNode { ref mut label } => {
164 let other_leafs = ["*", "*?*", "*NOT*", "*U*"];
166 if other_leafs.contains(&&label[..]) {
167 *label = "".to_string();
168 return
169 }
170
171 for pref in ["*-", "*T*-", "*ICH*-", "*PPA*-", "*RNR*-", "*EXP*-"].into_iter() {
173 if label.starts_with(pref) {
174 let mut gotdigit = false;
175 for c in label[pref.len()..].chars() {
176 if c.is_digit(10) {
177 gotdigit = true
178 } else {
179 break
180 }
181 }
182 if gotdigit {
183 *label = "".to_string();
184 return
185 }
186 }
187 }
188 }
189 }
190 }
191}
192
193impl std::fmt::Display for PTBTree {
194 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
196 write!(f, "{}", self.render())
197 }
198}
199
200mod myparser {
202 use super::PTBTree;
203 use pest::prelude::*;
204
205 impl_rdp! {
206 grammar! {
207 wholefile = { myws* ~ singletree+ }
208 singletree = { ["("] ~ myws* ~ node ~ myws* ~ endmarker ~ myws* }
209 node = _{ headed | terminal }
210 headed = { ["("] ~ nt ~ (!closing ~ myws+ ~ node)+ ~ myws* ~ endmarker }
211 closing = _{ myws* ~ endmarker }
212 endmarker = { [")"] }
213 terminal = { basechar+ }
214 nt = { basechar+ }
215 basechar = _{ !([" "] | ["("] | [")"]) ~ any }
216 myws = _{ [" "] | ["\n"] | ["\r"] }
217 }
218
219 process! {
220 get_all_trees(&self) -> Vec<PTBTree> {
221 (_: wholefile, mut ts: _gatherfile()) => {
222 ts.reverse();
223 ts
224 }
225 }
226 _gatherfile(&self) -> Vec<PTBTree> {
227 (_: singletree, t: _consume_until_endmarker(), mut trees: _gatherfile()) => {
228 assert!(t.len() == 1);
229 trees.push(t[0].clone());
230 trees
231 },
232 () => { Vec::new() },
233 }
234 _consume_until_endmarker(&self) -> Vec<PTBTree> {
235 (_: headed, &head: nt, mut innernodes: _consume_until_endmarker(), mut follow: _consume_until_endmarker()) => {
236 innernodes.reverse();
237 let newnode = PTBTree::InnerNode{ label: head.to_string(), children: innernodes };
238 follow.push(newnode);
239 follow
240 },
241 (&val: terminal, mut follow: _consume_until_endmarker()) => {
242 let newnode = PTBTree::TerminalNode{ label: val.to_string() };
243 follow.push(newnode);
244 follow
245 },
246 (_: endmarker) => {
247 Vec::new()
248 }
249 }
250 }
251 }
252}
253
254pub fn parse_ptbtree(s: &str) -> Option<PTBTree> {
264 let parsed = match parse_ptbtrees(s) {
265 None => return None,
266 Some(p) => p
267 };
268 if parsed.len() != 1 {
269 None
270 } else {
271 Some(parsed[0].clone())
272 }
273}
274
275pub fn parse_ptbtrees(s: &str) -> Option<Vec<PTBTree>> {
285 let mut parser = myparser::Rdp::new(StringInput::new(s));
286
287 if !parser.wholefile() || !parser.end() {
288 None
289 } else {
290 Some(parser.get_all_trees())
291 }
292}
293
294pub fn parse_ptb_file(f: &str) -> Option<Vec<PTBTree>> {
298 let mut contents = String::new();
299 match File::open(f) {
300 Err(_) => None,
301 Ok(mut fh) => match fh.read_to_string(&mut contents) {
302 Err(_) => None,
303 Ok(_) => parse_ptbtrees(&contents)
304 }
305 }
306}
307
308pub fn parse_spmrl_ptb_file(f: &str, singlebracketed: bool, remove_after_dash: bool) -> Result<Vec<PTBTree>, Box<std::error::Error>> {
315 fn remove_morphtags(t: PTBTree, remove_after_dash: bool) -> PTBTree {
316 match t {
317 PTBTree::InnerNode{label,children} => {
318 let newlabel = label.split("##").next().unwrap().to_string();
319 let newlabel = if remove_after_dash {newlabel.split("-").next().unwrap().to_string()} else {newlabel};
320 let newlabel = newlabel.replace("_", "---");
321 let newchildren = children.into_iter().map(|c| remove_morphtags(c, remove_after_dash)).collect::<Vec<_>>();
322 PTBTree::InnerNode { label: newlabel, children: newchildren }
323 }
324 n@PTBTree::TerminalNode{..} => n
325 }
326 }
327
328 let mut contents = String::new();
329 match File::open(f)?.read_to_string(&mut contents) {
330 Err(e) => Err(Box::new(e)),
331 Ok(_) => {
332 let lines = contents
333 .split('\n')
334 .filter(|s| *s != "")
335 .collect::<Vec<&str>>();
336 let parses: Vec<Option<PTBTree>> = if singlebracketed {
337 lines.into_iter().map(|s| parse_ptbtree(&format!("({})", s))).collect()
338 } else {
339 lines.into_iter().map(parse_ptbtree).collect()
340 };
341 if parses.iter().any(|o| o.is_none()) {
342 Err(Box::new(SimpleError::new("Some parses were none!")))
343 } else {
344 Ok(parses
345 .into_iter()
346 .map(|s| remove_morphtags(s.unwrap(), remove_after_dash))
347 .collect::<Vec<PTBTree>>())
348 }
349 }
350 }
351}
352
353pub fn parse_ptb_sample_dir(mergeddir: &str) -> Vec<PTBTree> {
359 let mut result = Vec::new();
360 for num in 1..200 {
361 let filename = mergeddir.to_string() + &format!("wsj_{:04}.mrg", num);
362 result.extend(parse_ptb_file(&filename).unwrap())
363 }
364 result
365}
366
367pub fn parse_ptb_sections(mergeddir: &str, sections: Vec<usize>) -> Vec<PTBTree> {
373 let mut result = Vec::new();
374 for section in sections {
375 for entry in glob(&format!("{}/{:02}/*.mrg", mergeddir, section)).unwrap() {
377 if let Ok(path) = entry {
378 result.extend(parse_ptb_file(path.to_str().unwrap()).unwrap())
379 }
380 }
381 }
382 result
383}
384
385pub fn parse_ptb_dir(mergeddir: &str) -> Vec<PTBTree> {
391 let mut result = Vec::new();
392 for entry in glob(&format!("{}/**/*.mrg", mergeddir)).unwrap() {
393 if let Ok(path) = entry {
394 result.extend(parse_ptb_file(path.to_str().unwrap()).unwrap())
395 }
396 }
397 result
398}
399
400#[cfg(test)]
401mod tests {
402 use super::*;
403 use std::collections::HashSet;
404
405 fn sample_tree() -> PTBTree {
406 PTBTree::InnerNode{ label: "ROOT".to_string(), children: vec![
407 PTBTree::InnerNode{ label: "A".to_string(), children: vec![
408 PTBTree::TerminalNode{ label: "2".to_string() }
409 ] },
410 PTBTree::InnerNode{ label: "X".to_string(), children: vec![
411 PTBTree::InnerNode{ label: "B".to_string(), children: vec![
412 PTBTree::TerminalNode{ label: "1".to_string() }
413 ] },
414 PTBTree::InnerNode{ label: "C".to_string(), children: vec![
415 PTBTree::TerminalNode{ label: "1".to_string() }
416 ] },
417 PTBTree::InnerNode{ label: "D".to_string(), children: vec![
418 PTBTree::TerminalNode{ label: "2".to_string() }
419 ] }
420 ] }
421 ] }
422 }
423
424 fn sample_trees(level: usize) -> Vec<PTBTree> {
425 let mut result;
426 if level == 0 {
427 result = Vec::new();
428 } else if level == 1 {
429 result = sample_trees(level - 1);
430 result.push(PTBTree::TerminalNode { label: "012".to_string() });
431 result.push(PTBTree::TerminalNode { label: "abc".to_string() });
432 result.push(PTBTree::TerminalNode { label: "-LRB-".to_string() });
433 result.push(PTBTree::TerminalNode { label: "_".to_string() });
434 result.push(PTBTree::TerminalNode { label: "-RRB-".to_string() });
435 } else {
436 result = sample_trees(level - 1);
437 result.push(PTBTree::InnerNode { label: "ABC".to_string(), children: sample_trees(level - 1) });
438 result.push(PTBTree::InnerNode { label: "B".to_string(), children: sample_trees(level - 1) });
439 result.push(PTBTree::InnerNode { label: "CBA".to_string(), children: sample_trees(level - 1) });
440 }
441 result
442 }
443
444 #[test]
445 fn front_of_ptb_tree() {
446 let s: String = sample_tree().front();
447 assert_eq!(s, "2 1 1 2")
448 }
449
450 #[test]
451 fn pos_front_of_ptb_tree() {
452 let s: String = sample_tree().pos_front();
453 assert_eq!(s, "A B C D")
454 }
455
456 #[test]
457 fn test_ptbtree_display() {
458 let s = "((ROOT (A 2) (X (B 1) (C 1) (D 2))))";
459 assert_eq!(format!("{}", sample_tree()), s)
460 }
461
462 #[test]
463 fn test_parser() {
464 let puretree = parse_ptbtree("((ROOT (A x) (C 1)))\n").unwrap();
465 assert_eq!(puretree, parse_ptbtree("((ROOT (A x) (C 1)))\n").unwrap());
466 assert_eq!(puretree, parse_ptbtree("((ROOT (A x) (C 1) ))\n").unwrap());
467 assert_eq!(puretree, parse_ptbtree("((ROOT (A x) \n (C \n 1) ))\n").unwrap());
468
469 for t in sample_trees(4) {
470 assert!(parse_ptbtree(&format!("{}\n", t)).unwrap() == t.clone());
471 }
472 }
473
474 #[test]
475 fn parse_actual_ptb_sample() {
477 let t1 = parse_ptb_sample_dir("/home/sjm/documents/Uni/FuzzySP/penn-treebank-sample/treebank/combined/");
478 let t2 = parse_ptb_dir("/home/sjm/documents/Uni/FuzzySP/penn-treebank-sample/treebank/combined/");
479
480 assert_eq!(3914, t1.len());
481 assert_eq!(3914, t2.len());
482 assert_eq!(t1, t2)
483 }
484
485 #[test]
486 #[ignore]
487 fn extract_trainset_yield() {
488 for (range, name) in vec![((2..22), "2-21"), ((22..23), "22")] {
489 let train_trees = parse_ptb_sections("/home/sjm/documents/Uni/FuzzySP/treebank-3_LDC99T42/treebank_3/parsed/mrg/wsj", range.collect());
490 let mut yields: Vec<String> = Vec::new();
491 let mut pos_yields: Vec<String> = Vec::new();
492
493 for mut t in train_trees {
494 t.strip_all_predicate_annotations();
495 yields.push(t.front() + "\n");
496 pos_yields.push(t.pos_front() + "\n")
497 }
498
499 let mut f = File::create(format!("/home/sjm/documents/Uni/FuzzySP/pcfg-experiments/pos-tagging/data/ptb.{}.lex", name)).expect("Unable to create file");
500 for y in &yields {
501 f.write_all(y.as_bytes()).expect("Unable to write data")
502 }
503 let mut f = File::create(format!("/home/sjm/documents/Uni/FuzzySP/pcfg-experiments/pos-tagging/data/ptb.{}.tag", name)).expect("Unable to create file");
504 for y in &pos_yields {
505 f.write_all(y.as_bytes()).expect("Unable to write data")
506 }
507 }
508 }
509
510 #[test]
511 fn readme_example() {
512 let s: &str = "((S (NNP John) (VP (VBD saw) (NNP Mary))))";
513 let t: PTBTree =
514 PTBTree::InnerNode{ label: "S".to_string(), children: vec![
515 PTBTree::InnerNode{ label: "NNP".to_string(), children: vec![
516 PTBTree::TerminalNode{ label: "John".to_string() }
517 ] },
518 PTBTree::InnerNode{ label: "VP".to_string(), children: vec![
519 PTBTree::InnerNode{ label: "VBD".to_string(), children: vec![
520 PTBTree::TerminalNode{ label: "saw".to_string() }
521 ] },
522 PTBTree::InnerNode{ label: "NNP".to_string(), children: vec![
523 PTBTree::TerminalNode{ label: "Mary".to_string() }
524 ] }
525 ] }
526 ] }
527 ;
528 assert_eq!(parse_ptbtree(s).unwrap(), t);
529 assert_eq!(t.render(), s);
530 assert_eq!(t.front(), "John saw Mary");
531
532 let s: &str = "((S (NNP John) (VP (VBD saw) (NNP Mary) )))";
533 let s_pred: &str = "((S (NNP-SBJ John) (VP (NP *T*-1) (VBD saw) (NNP Mary) (-NONE- nada))))";
534
535 let mut t = parse_ptbtree(s_pred).unwrap();
536 t.strip_all_predicate_annotations();
537 assert_eq!(t, parse_ptbtree(s).unwrap())
538 }
539
540 #[test]
541 fn test_predicate_annotations_removal() {
542 let mut full_t = parse_ptbtree("((S (NNP-NOM-SBJ (N John) (* K.) (-NONE- nay)) (VP-PRD (VBD-12-15 verbed) (*T*-42 something)) (nuh-uh *T*-42) ($ *) (. .) (! !)))").unwrap();
559 let stripped_t = parse_ptbtree("((S (NNP (N John) (* K.)) (VP (VBD verbed) (*T* something)) (. .) (! !)))").unwrap();
560
561 full_t.strip_all_predicate_annotations();
562
563 assert_eq!(full_t, stripped_t);
564 }
565
566 #[test]
567 #[ignore]
568 fn words_in_testset() {
569 let mut tokens = 0;
570 let mut types: HashSet<String> = HashSet::new();
571 let mut sentlengths: Vec<usize> = Vec::new();
572 for mut t in parse_ptb_sections("/home/sjm/documents/Uni/FuzzySP/treebank-3_LDC99T42/treebank_3/parsed/mrg/wsj", vec![22]) {
573 t.strip_all_predicate_annotations();
574 let sentlen = t.front_length();
575 tokens += sentlen;
576 sentlengths.push(sentlen);
577 for w in t.front().split(' ') {
578 types.insert(w.to_string());
579 }
580 }
581 println!("Tokens in test: {}, Types in test: {}", tokens, types.len());
582 println!("Sentence lengths:");
583 sentlengths.sort();
584 for v in sentlengths {
585 print!("{} ", v)
586 }
587 }
588
589 #[test]
590 fn parse_spmrl_ptb() {
591 let lang1 = "KOREAN";
592 let lang2 = "Korean";
593 let path = format!("/home/sjm/documents/Uni/FuzzySP/spmrl-2014/data/{}_SPMRL/gold/ptb/train5k/train5k.{}.gold.ptb", lang1, lang2);
594 match parse_spmrl_ptb_file(&path, true, false) {
595 Err(e) => println!("\nError!\n{}", e),
596 Ok(p) => println!("\nSuccess!\n{}", p[0])
597 }
598 }
599}