ptb_reader/
lib.rs

1#[macro_use]
2extern crate pest;
3use pest::prelude::*;
4
5extern crate glob;
6use glob::glob;
7
8extern crate simple_error;
9use simple_error::SimpleError;
10
11use std::fs::File;
12use std::io::prelude::*;
13
14/// Arbitrarily wide recursive trees of `String`.
15#[derive(Debug, Clone, PartialEq, Eq)]
16pub enum PTBTree {
17    InnerNode {
18        label: String,
19        children: Vec<PTBTree>
20    },
21    TerminalNode {
22        label: String
23    }
24}
25
26impl PTBTree {
27    /// Return bracketed (but not outside-bracketed!) PTB notation.
28    fn render_simple_brackets(&self) -> String {
29        match *self {
30            PTBTree::InnerNode { ref label, ref children } => {
31                format!("({} {})", label, children.iter().map(|c| c.render_simple_brackets()).collect::<Vec<_>>().join(" "))
32            }
33            PTBTree::TerminalNode { ref label } => {
34                label.to_string()
35            }
36        }
37    }
38    
39    /// Return bracketed (and outside-bracketed!) PTB notation:
40    ///
41    /// ```rust
42    /// use ptb_reader::PTBTree;
43    /// let tree = PTBTree::InnerNode{ label: "NT".to_string(), children: vec![PTBTree::TerminalNode{ label: "t".to_string() }] };
44    /// assert_eq!(tree.render(), "((NT t))")
45    /// ```
46    pub fn render(&self) -> String {
47        format!("({})", self.render_simple_brackets())
48    }
49    
50    /// Return `String` from joined terminals at the leaves (i.e, *front*, *yield*).
51    /// 
52    /// ```rust
53    /// use ptb_reader::PTBTree;
54    /// let tree = PTBTree::InnerNode{ label: "NT".to_string(), children: vec![PTBTree::TerminalNode{ label: "t".to_string() }] };
55    /// assert_eq!(tree.front(), "t")
56    /// ```
57    pub fn front(&self) -> String {
58        match *self {
59            PTBTree::TerminalNode { ref label } => label.clone(),
60            PTBTree::InnerNode { ref children, .. } => {
61                children.iter().map(|c| c.front()).collect::<Vec<String>>().join(" ")
62            }
63        }
64    }
65    
66    /// Return `String` from joined pre-terminals (i.e., POS tags).
67    /// 
68    /// ```rust
69    /// use ptb_reader::PTBTree;
70    /// let tree = PTBTree::InnerNode{ label: "NT".to_string(), children: vec![PTBTree::TerminalNode{ label: "t".to_string() }] };
71    /// assert_eq!(tree.pos_front(), "NT")
72    /// ```
73    pub fn pos_front(&self) -> String {
74        match *self {
75            PTBTree::TerminalNode { .. } => panic!("Encountered unexpected terminal while getting the POS front!"),
76            PTBTree::InnerNode { ref children, ref label } => {
77                //if let [PTBTree::TerminalNode { label } ] = children {
78                if children.len() == 1 {
79                    if let PTBTree::TerminalNode { .. } = children[0] {
80                        return label.clone()
81                    }
82                }
83                
84                children.iter().map(|c| c.pos_front()).collect::<Vec<String>>().join(" ")
85            }
86        }
87    }
88    
89    /// Return number of terminal words, i.e. length of front.
90    pub fn front_length(&self) -> usize {
91        match *self {
92            PTBTree::TerminalNode { .. } => 1,
93            PTBTree::InnerNode { ref children, .. } => {
94                children.iter().map(|c| c.front_length()).sum()
95            }
96        }
97    }
98    
99    /// Remove predicate-argument annotations and trace elements.
100    /// See <http://dx.doi.org/10.3115/1075812.1075835 The Penn Treebank: annotating predicate argument structure>.
101    /// Thanks to Toni Dietze for formalizing the necessary stripping :)
102    pub fn strip_all_predicate_annotations(&mut self) {
103        match *self {
104            PTBTree::InnerNode { ref mut label, ref mut children } => {
105                // Iterate suffix removal
106                fn strip_one_label_suffix(s: &mut String) -> bool {
107                    // Any of these in-/suffixes shall be removed from inner nodes (e.g., NNP-NOM-SBJ => NNP).
108                    let functional_suffixes = ["-HLN", "-LST", "-TTL", "-CLF", "-NOM", "-ADV", "-LGS", "-PRD", "-SBJ", "-TPC", "-CLR", "-VOC", "-DIR", "-LOC", "-MNR", "-PRP", "-TMP"];
109                    if let Some((i, _)) = s.char_indices().rev().nth(3) {
110                        if functional_suffixes.contains(&&s[i..]) {
111                            s.truncate(i);
112                            return true
113                        }
114                    }
115                    
116                    // Any dash-prefixed or equals-prefixed numbers shall be removed from inner nodes (e.g., NNP-1-12 => NNP).
117                    let mut gotdigit = false;
118                    let mut truncation_index = None;
119                    for (i, c) in s.char_indices().rev() {
120                        if c.is_digit(10) {
121                            gotdigit = true
122                        } else if (c == '-' || c == '=') && gotdigit {
123                            truncation_index = Some(i);
124                            break
125                        } else {
126                            return false
127                        }
128                    }
129                    match truncation_index {
130                        None => false,
131                        Some(i) => {s.truncate(i); true}
132                    }
133                }
134                while strip_one_label_suffix(label) {}
135                
136                // Recurse over children
137                let mut keeps_children = false;
138                let mut remove_me_indices: Vec<usize> = Vec::new();
139                for (i, c) in children.iter_mut().enumerate() {
140                    c.strip_all_predicate_annotations();
141                    let childlabel = match *c {
142                        PTBTree::InnerNode { ref label, .. } | PTBTree::TerminalNode { ref label } => label
143                    };
144                    if !childlabel.is_empty() {
145                        keeps_children = true
146                    } else {
147                        remove_me_indices.push(i)
148                    }
149                }
150                
151                
152                // Mark inner node itself for deletion
153                if !keeps_children || label == "-NONE-" {
154                    *label = "".to_string();
155                }
156                // Remove from back to front
157                else {
158                    for &i in remove_me_indices.iter().rev() {
159                        children.remove(i);
160                    }
161                }
162            }
163            PTBTree::TerminalNode { ref mut label } => {
164                // Any of these are invalid leaves.
165                let other_leafs = ["*", "*?*", "*NOT*", "*U*"];
166                if other_leafs.contains(&&label[..]) {
167                    *label = "".to_string();
168                    return
169                }
170                
171                // Any of these plus any digits are invalid leaves.
172                for pref in ["*-", "*T*-", "*ICH*-", "*PPA*-", "*RNR*-", "*EXP*-"].into_iter() {
173                    if label.starts_with(pref) {
174                        let mut gotdigit = false;
175                        for c in label[pref.len()..].chars() {
176                            if c.is_digit(10) {
177                                gotdigit = true
178                            } else {
179                                break
180                            }
181                        }
182                        if gotdigit {
183                            *label = "".to_string();
184                            return
185                        }
186                    }
187                }
188            }
189        }
190    }
191}
192
193impl std::fmt::Display for PTBTree {
194    /// Calls `self.render()`.
195    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
196        write!(f, "{}", self.render())
197    }
198}
199
200/// Internal PTB parser, using pest
201mod myparser {
202    use super::PTBTree;
203    use pest::prelude::*;
204    
205    impl_rdp! {
206        grammar! {
207            wholefile  =  { myws* ~ singletree+ }
208            singletree =  { ["("] ~ myws* ~ node ~ myws* ~ endmarker ~ myws* }
209            node       = _{ headed | terminal }
210            headed     =  { ["("] ~ nt ~ (!closing ~ myws+ ~ node)+ ~ myws* ~ endmarker }
211            closing    = _{ myws* ~ endmarker }
212            endmarker  =  { [")"] }
213            terminal   =  { basechar+ }
214            nt         =  { basechar+ }
215            basechar   = _{ !([" "] | ["("] | [")"]) ~ any }
216            myws       = _{ [" "] | ["\n"] | ["\r"] }
217        }
218        
219        process! {
220            get_all_trees(&self) -> Vec<PTBTree> {
221                (_: wholefile, mut ts: _gatherfile()) => {
222                    ts.reverse();
223                    ts
224                }
225            }
226            _gatherfile(&self) -> Vec<PTBTree> {
227                (_: singletree, t: _consume_until_endmarker(), mut trees: _gatherfile()) => {
228                    assert!(t.len() == 1);
229                    trees.push(t[0].clone());
230                    trees
231                },
232                () => { Vec::new() },
233            }
234            _consume_until_endmarker(&self) -> Vec<PTBTree> {
235                (_: headed, &head: nt, mut innernodes: _consume_until_endmarker(), mut follow: _consume_until_endmarker()) => {
236                    innernodes.reverse();
237                    let newnode = PTBTree::InnerNode{ label: head.to_string(), children: innernodes };
238                    follow.push(newnode);
239                    follow
240                },
241                (&val: terminal, mut follow: _consume_until_endmarker()) => {
242                    let newnode = PTBTree::TerminalNode{ label: val.to_string() };
243                    follow.push(newnode);
244                    follow
245                },
246                (_: endmarker) => {
247                    Vec::new()
248                }
249            }
250        }
251    }
252}
253
254/// Parse a single tree.
255///
256/// Wrapper around `parse_ptbtrees`, returns `None` if string contains not exactly one tree.
257/// 
258/// ```rust
259/// use ptb_reader::*;
260/// let tree = PTBTree::InnerNode{ label: "NT".to_string(), children: vec![PTBTree::TerminalNode{ label: "t".to_string() }] };
261/// assert_eq!(tree, parse_ptbtree("((NT t))").unwrap())
262/// ```
263pub fn parse_ptbtree(s: &str) -> Option<PTBTree> {
264    let parsed = match parse_ptbtrees(s) {
265        None => return None,
266        Some(p) => p
267    };
268    if parsed.len() != 1 {
269        None
270    } else {
271        Some(parsed[0].clone())
272    }
273}
274
275/// Parse a string of multiple trees.
276/// 
277/// Returns `None` if string contains anything unparseable.
278/// 
279/// ```rust
280/// use ptb_reader::*;
281/// let tree = PTBTree::InnerNode{ label: "NT".to_string(), children: vec![PTBTree::TerminalNode{ label: "t".to_string() }] };
282/// assert_eq!(vec![tree.clone(), tree], parse_ptbtrees("((NT t)) ((NT t))").unwrap())
283/// ```
284pub fn parse_ptbtrees(s: &str) -> Option<Vec<PTBTree>> {
285    let mut parser = myparser::Rdp::new(StringInput::new(s));
286    
287    if !parser.wholefile() || !parser.end() {
288        None
289    } else {
290        Some(parser.get_all_trees())
291    }
292}
293
294/// Parse a PTB file.
295/// 
296/// Wrapper for reading in a file and feeding it to `parse_ptbtrees`.
297pub fn parse_ptb_file(f: &str) -> Option<Vec<PTBTree>> {
298    let mut contents = String::new();
299    match File::open(f) {
300        Err(_) => None,
301        Ok(mut fh) => match fh.read_to_string(&mut contents) {
302            Err(_) => None,
303            Ok(_) => parse_ptbtrees(&contents)
304        }
305    }
306}
307
308/// Parse a 1-per-line PTB file (SPMRL file format) (single-bracketed possible),
309/// removing morph tags on non-terminals and replacing underscores with --- in NTs!
310/// If `remove_after_dash` is set to True, it will even remove everything after a dash
311/// (this is applied before the underscore-replacing, of course).
312///
313/// Just reads line per line, prepends '(' and appends ')' and calls `parse_ptbtrees`.
314pub fn parse_spmrl_ptb_file(f: &str, singlebracketed: bool, remove_after_dash: bool) -> Result<Vec<PTBTree>, Box<std::error::Error>> {
315    fn remove_morphtags(t: PTBTree, remove_after_dash: bool) -> PTBTree {
316        match t {
317            PTBTree::InnerNode{label,children} => {
318                let newlabel = label.split("##").next().unwrap().to_string();
319                let newlabel = if remove_after_dash {newlabel.split("-").next().unwrap().to_string()} else {newlabel};
320                let newlabel = newlabel.replace("_", "---");
321                let newchildren = children.into_iter().map(|c| remove_morphtags(c, remove_after_dash)).collect::<Vec<_>>();
322                PTBTree::InnerNode { label: newlabel, children: newchildren }
323            }
324            n@PTBTree::TerminalNode{..} => n
325        }
326    }
327    
328    let mut contents = String::new();
329    match File::open(f)?.read_to_string(&mut contents) {
330        Err(e) => Err(Box::new(e)),
331        Ok(_) => {
332            let lines = contents
333                .split('\n')
334                .filter(|s| *s != "")
335                .collect::<Vec<&str>>();
336            let parses: Vec<Option<PTBTree>> = if singlebracketed {
337                lines.into_iter().map(|s| parse_ptbtree(&format!("({})", s))).collect()
338            } else {
339                lines.into_iter().map(parse_ptbtree).collect()
340            };
341            if parses.iter().any(|o| o.is_none()) {
342                Err(Box::new(SimpleError::new("Some parses were none!")))
343            } else {
344                Ok(parses
345                    .into_iter()
346                    .map(|s| remove_morphtags(s.unwrap(), remove_after_dash))
347                    .collect::<Vec<PTBTree>>())
348            }
349        }
350    }
351}
352
353/// Parse the free PTB sample files (`wsj_0001.mrg` to `wsj_0199.mrg`).
354/// 
355/// Will `panic` if anything goes wrong.
356/// 
357/// Wrapper around `parse_ptb_file`.
358pub fn parse_ptb_sample_dir(mergeddir: &str) -> Vec<PTBTree> {
359    let mut result = Vec::new();
360    for num in 1..200 {
361        let filename = mergeddir.to_string() + &format!("wsj_{:04}.mrg", num);
362        result.extend(parse_ptb_file(&filename).unwrap())
363    }
364    result
365}
366
367/// Parse `{mergeddir}/{section}/*.mrg` files.
368/// 
369/// Will `panic` if anything goes wrong.
370/// 
371/// Wrapper around `parse_ptb_file`.
372pub fn parse_ptb_sections(mergeddir: &str, sections: Vec<usize>) -> Vec<PTBTree> {
373    let mut result = Vec::new();
374    for section in sections {
375        //println!("Reading section {}", section);
376        for entry in glob(&format!("{}/{:02}/*.mrg", mergeddir, section)).unwrap() {
377            if let Ok(path) = entry {
378                result.extend(parse_ptb_file(path.to_str().unwrap()).unwrap())
379            }
380        }
381    }
382    result
383}
384
385/// Parse any directory containing `*.mrg` files.
386/// 
387/// Will `panic` if anything goes wrong.
388/// 
389/// Wrapper around `parse_ptb_file`.
390pub fn parse_ptb_dir(mergeddir: &str) -> Vec<PTBTree> {
391    let mut result = Vec::new();
392    for entry in glob(&format!("{}/**/*.mrg", mergeddir)).unwrap() {
393        if let Ok(path) = entry {
394            result.extend(parse_ptb_file(path.to_str().unwrap()).unwrap())
395        }
396    }
397    result
398}
399
400#[cfg(test)]
401mod tests {
402    use super::*;
403    use std::collections::HashSet;
404    
405    fn sample_tree() -> PTBTree {
406        PTBTree::InnerNode{ label: "ROOT".to_string(), children: vec![
407            PTBTree::InnerNode{ label: "A".to_string(), children: vec![
408                PTBTree::TerminalNode{ label: "2".to_string() }
409            ] },
410            PTBTree::InnerNode{ label: "X".to_string(), children: vec![
411                PTBTree::InnerNode{ label: "B".to_string(), children: vec![
412                    PTBTree::TerminalNode{ label: "1".to_string() }
413                ] },
414                PTBTree::InnerNode{ label: "C".to_string(), children: vec![
415                    PTBTree::TerminalNode{ label: "1".to_string() }
416                ] },
417                PTBTree::InnerNode{ label: "D".to_string(), children: vec![
418                    PTBTree::TerminalNode{ label: "2".to_string() }
419                ] }
420            ] }
421        ] }
422    }
423    
424    fn sample_trees(level: usize) -> Vec<PTBTree> {
425        let mut result;
426        if level == 0 {
427            result = Vec::new();
428        } else if level == 1 {
429            result = sample_trees(level - 1);
430            result.push(PTBTree::TerminalNode { label: "012".to_string() });
431            result.push(PTBTree::TerminalNode { label: "abc".to_string() });
432            result.push(PTBTree::TerminalNode { label: "-LRB-".to_string() });
433            result.push(PTBTree::TerminalNode { label: "_".to_string() });
434            result.push(PTBTree::TerminalNode { label: "-RRB-".to_string() });
435        } else {
436            result = sample_trees(level - 1);
437            result.push(PTBTree::InnerNode { label: "ABC".to_string(), children: sample_trees(level - 1) });
438            result.push(PTBTree::InnerNode { label: "B".to_string(), children: sample_trees(level - 1) });
439            result.push(PTBTree::InnerNode { label: "CBA".to_string(), children: sample_trees(level - 1) });
440        }
441        result
442    }
443    
444    #[test]
445    fn front_of_ptb_tree() {
446        let s: String = sample_tree().front();
447        assert_eq!(s, "2 1 1 2")
448    }
449    
450    #[test]
451    fn pos_front_of_ptb_tree() {
452        let s: String = sample_tree().pos_front();
453        assert_eq!(s, "A B C D")
454    }
455    
456    #[test]
457    fn test_ptbtree_display() {
458        let s = "((ROOT (A 2) (X (B 1) (C 1) (D 2))))";
459        assert_eq!(format!("{}", sample_tree()), s)
460    }
461    
462    #[test]
463    fn test_parser() {
464        let puretree = parse_ptbtree("((ROOT (A x) (C 1)))\n").unwrap();
465        assert_eq!(puretree, parse_ptbtree("((ROOT (A   x)  (C  1)))\n").unwrap());
466        assert_eq!(puretree, parse_ptbtree("((ROOT (A    x)  (C  1) ))\n").unwrap());
467        assert_eq!(puretree, parse_ptbtree("((ROOT (A    x) \n (C \n 1) ))\n").unwrap());
468        
469        for t in sample_trees(4) {
470            assert!(parse_ptbtree(&format!("{}\n", t)).unwrap() == t.clone());
471        }
472    }
473    
474    #[test]
475    //#[ignore]
476    fn parse_actual_ptb_sample() {
477        let t1 = parse_ptb_sample_dir("/home/sjm/documents/Uni/FuzzySP/penn-treebank-sample/treebank/combined/");
478        let t2 = parse_ptb_dir("/home/sjm/documents/Uni/FuzzySP/penn-treebank-sample/treebank/combined/");
479        
480        assert_eq!(3914, t1.len());
481        assert_eq!(3914, t2.len());
482        assert_eq!(t1, t2)
483    }
484    
485    #[test]
486    #[ignore]
487    fn extract_trainset_yield() {
488        for (range, name) in vec![((2..22), "2-21"), ((22..23), "22")] {
489            let train_trees = parse_ptb_sections("/home/sjm/documents/Uni/FuzzySP/treebank-3_LDC99T42/treebank_3/parsed/mrg/wsj", range.collect());
490            let mut yields: Vec<String> = Vec::new();
491            let mut pos_yields: Vec<String> = Vec::new();
492            
493            for mut t in train_trees {
494                t.strip_all_predicate_annotations();
495                yields.push(t.front() + "\n");
496                pos_yields.push(t.pos_front() + "\n")
497            }
498            
499            let mut f = File::create(format!("/home/sjm/documents/Uni/FuzzySP/pcfg-experiments/pos-tagging/data/ptb.{}.lex", name)).expect("Unable to create file");
500            for y in &yields {
501                f.write_all(y.as_bytes()).expect("Unable to write data")
502            }
503            let mut f = File::create(format!("/home/sjm/documents/Uni/FuzzySP/pcfg-experiments/pos-tagging/data/ptb.{}.tag", name)).expect("Unable to create file");
504            for y in &pos_yields {
505                f.write_all(y.as_bytes()).expect("Unable to write data")
506            }
507        }
508    }
509    
510    #[test]
511    fn readme_example() {
512        let s: &str = "((S (NNP John) (VP (VBD saw) (NNP Mary))))";
513        let t: PTBTree = 
514            PTBTree::InnerNode{ label: "S".to_string(), children: vec![
515                PTBTree::InnerNode{ label: "NNP".to_string(), children: vec![
516                    PTBTree::TerminalNode{ label: "John".to_string() }
517                ] },
518                PTBTree::InnerNode{ label: "VP".to_string(), children: vec![
519                    PTBTree::InnerNode{ label: "VBD".to_string(), children: vec![
520                        PTBTree::TerminalNode{ label: "saw".to_string() }
521                    ] },
522                    PTBTree::InnerNode{ label: "NNP".to_string(), children: vec![
523                        PTBTree::TerminalNode{ label: "Mary".to_string() }
524                    ] }
525                ] }
526            ] }
527        ;
528        assert_eq!(parse_ptbtree(s).unwrap(), t);
529        assert_eq!(t.render(), s);
530        assert_eq!(t.front(), "John saw Mary");
531        
532        let s: &str      = "((S (NNP     John) (VP            (VBD saw) (NNP Mary)              )))";
533        let s_pred: &str = "((S (NNP-SBJ John) (VP (NP *T*-1) (VBD saw) (NNP Mary) (-NONE- nada))))";
534
535        let mut t = parse_ptbtree(s_pred).unwrap();
536        t.strip_all_predicate_annotations();
537        assert_eq!(t, parse_ptbtree(s).unwrap())
538    }
539    
540    #[test]
541    fn test_predicate_annotations_removal() {
542        // ((S
543        //         (NNP-NOM-SBJ
544        //             (N John)
545        //             (* K.)
546        //             (-NONE- nay)
547        //         )
548        //         (VP-PRD
549        //             (VBD-12-15 verbed)
550        //             (*T*-42 something)
551        //         )
552        //         (nuh-uh *T*-42)
553        //         ($ *)
554        //         (. .)
555        //         (! !)
556        // ))
557        
558        let mut full_t = parse_ptbtree("((S (NNP-NOM-SBJ (N John) (* K.) (-NONE- nay)) (VP-PRD (VBD-12-15 verbed) (*T*-42 something)) (nuh-uh *T*-42) ($ *) (. .) (! !)))").unwrap();
559        let stripped_t = parse_ptbtree("((S (NNP (N John) (* K.)) (VP (VBD verbed) (*T* something)) (. .) (! !)))").unwrap();
560        
561        full_t.strip_all_predicate_annotations();
562        
563        assert_eq!(full_t, stripped_t);
564    }
565    
566    #[test]
567    #[ignore]
568    fn words_in_testset() {
569        let mut tokens = 0;
570        let mut types: HashSet<String> = HashSet::new();
571        let mut sentlengths: Vec<usize> = Vec::new();
572        for mut t in parse_ptb_sections("/home/sjm/documents/Uni/FuzzySP/treebank-3_LDC99T42/treebank_3/parsed/mrg/wsj", vec![22]) {
573            t.strip_all_predicate_annotations();
574            let sentlen = t.front_length();
575            tokens += sentlen;
576            sentlengths.push(sentlen);
577            for w in t.front().split(' ') {
578                types.insert(w.to_string());
579            }
580        }
581        println!("Tokens in test: {}, Types in test: {}", tokens, types.len());
582        println!("Sentence lengths:");
583        sentlengths.sort();
584        for v in sentlengths {
585            print!("{} ", v)
586        }
587    }
588    
589    #[test]
590    fn parse_spmrl_ptb() {
591        let lang1 = "KOREAN";
592        let lang2 = "Korean";
593        let path = format!("/home/sjm/documents/Uni/FuzzySP/spmrl-2014/data/{}_SPMRL/gold/ptb/train5k/train5k.{}.gold.ptb", lang1, lang2);
594        match parse_spmrl_ptb_file(&path, true, false) {
595            Err(e) => println!("\nError!\n{}", e),
596            Ok(p) => println!("\nSuccess!\n{}", p[0])
597        }
598    }
599}