use std::io;
use failure::Error;
use crate::error::ReadError;
use crate::graph::{DepTriple, Sentence};
use crate::token::{Features, Token, EMPTY_TOKEN};
/// A source from which sentences can be read one at a time.
pub trait ReadSentence {
/// Reads the next sentence.
///
/// `Ok(None)` signals that no further sentence is available; the
/// `Sentences` iterator ends when it sees this value. Failures are
/// reported through the `Err` variant.
fn read_sentence(&mut self) -> Result<Option<Sentence>, Error>;
/// Consumes the reader and wraps it in a `Sentences` iterator.
fn sentences(self) -> Sentences<Self>
where
Self: Sized,
{
Sentences { reader: self }
}
}
/// Sentence reader backed by a buffered input source.
///
/// The `ReadSentence` implementation for this type pulls tab-separated
/// token lines from the wrapped source.
pub struct Reader<R> {
    // The buffered source that lines are read from.
    read: R,
}

impl<R: io::BufRead> Reader<R> {
    /// Creates a reader that takes ownership of the buffered source `read`.
    pub fn new(read: R) -> Self {
        Self { read }
    }
}
impl<R: io::BufRead> IntoIterator for Reader<R> {
type Item = Result<Sentence, Error>;
type IntoIter = Sentences<Reader<R>>;
fn into_iter(self) -> Self::IntoIter {
self.sentences()
}
}
impl<R: io::BufRead> ReadSentence for Reader<R> {
    /// Reads the next sentence from the underlying buffered source.
    ///
    /// Every non-blank line is trimmed, split on tabs and interpreted as one
    /// token with the columns, in order: identifier, form, lemma, coarse POS
    /// tag, POS tag, features, head, head relation, projective head and
    /// projective head relation. Trailing columns may be absent. A blank
    /// line or the end of the input terminates the sentence; blank lines
    /// that precede the first token are skipped.
    ///
    /// Returns `Ok(None)` when the input ends before any token was read.
    ///
    /// # Errors
    ///
    /// Fails when the underlying read fails, when the identifier or form
    /// column is rejected by its parser, or when a head column is neither
    /// the empty marker nor an unsigned integer.
    fn read_sentence(&mut self) -> Result<Option<Sentence>, Error> {
        let mut line = String::new();
        let mut sentence = Sentence::new();
        let mut edges = Vec::new();
        let mut proj_edges = Vec::new();

        loop {
            line.clear();

            // A length of 1 is treated as "no tokens read yet".
            // NOTE(review): presumably `Sentence::new()` starts with a single
            // root node — confirm in `crate::graph`.
            if self.read.read_line(&mut line)? == 0 {
                // End of input: hand out what was collected, if anything.
                if sentence.len() == 1 {
                    return Ok(None);
                }

                add_edges(&mut sentence, edges, proj_edges);
                return Ok(Some(sentence));
            }

            let trimmed = line.trim();
            if trimmed.is_empty() {
                // Blank lines before the first token are ignored; afterwards
                // a blank line closes the sentence.
                if sentence.len() == 1 {
                    continue;
                }

                add_edges(&mut sentence, edges, proj_edges);
                return Ok(Some(sentence));
            }

            let mut iter = trimmed.split_terminator('\t');

            // The identifier is validated but its value is not used.
            parse_identifier_field(iter.next())?;

            let mut token = Token::new(parse_form_field(iter.next())?);
            token.set_lemma(parse_string_field(iter.next()));
            token.set_cpos(parse_string_field(iter.next()));
            token.set_pos(parse_string_field(iter.next()));
            token.set_features(parse_string_field(iter.next()).map(|s| Features::from(s.as_str())));

            // Always consume BOTH columns of a head/relation pair. Skipping
            // the relation column when the head is empty would shift every
            // later column by one, so the relation would be parsed as the
            // projective head and the real projective head would be lost.
            let head = parse_numeric_field(iter.next())?;
            let head_rel = parse_string_field(iter.next());
            if let Some(head) = head {
                // `sentence.len()` is taken before the token is pushed.
                edges.push(DepTriple::new(head, head_rel, sentence.len()));
            }

            let proj_head = parse_numeric_field(iter.next())?;
            let proj_head_rel = parse_string_field(iter.next());
            if let Some(proj_head) = proj_head {
                proj_edges.push(DepTriple::new(proj_head, proj_head_rel, sentence.len()));
            }

            sentence.push(token);
        }
    }
}
/// Moves the collected dependency triples into `sentence`.
///
/// Triples from `edges` are added through `dep_graph_mut()`, triples from
/// `proj_edges` through `proj_dep_graph_mut()`, each in the given order.
fn add_edges(
    sentence: &mut Sentence,
    edges: Vec<DepTriple<String>>,
    proj_edges: Vec<DepTriple<String>>,
) {
    edges.into_iter().for_each(|triple| {
        sentence.dep_graph_mut().add_deprel(triple);
    });

    proj_edges.into_iter().for_each(|triple| {
        sentence.proj_dep_graph_mut().add_deprel(triple);
    });
}
/// Iterator over the sentences produced by a `ReadSentence` implementation.
///
/// Each item is the outcome of one `read_sentence` call; iteration stops
/// when the reader returns `Ok(None)`.
pub struct Sentences<R>
where
R: ReadSentence,
{
// The reader that sentences are pulled from.
reader: R,
}
impl<R> Iterator for Sentences<R>
where
    R: ReadSentence,
{
    type Item = Result<Sentence, Error>;

    /// Pulls the next sentence from the wrapped reader.
    ///
    /// A read error is yielded as `Some(Err(_))`; `Ok(None)` from the reader
    /// ends the iteration.
    fn next(&mut self) -> Option<Self::Item> {
        match self.reader.read_sentence() {
            Err(err) => Some(Err(err)),
            Ok(maybe_sentence) => maybe_sentence.map(Ok),
        }
    }
}
/// Returns an owned copy of the form column.
///
/// A missing column is reported as `ReadError::MissingFormField`; any
/// present value, including the empty string, is accepted unchanged.
fn parse_form_field(field: Option<&str>) -> Result<String, ReadError> {
    match field {
        Some(form) => Ok(form.to_owned()),
        None => Err(ReadError::MissingFormField),
    }
}
/// Converts an optional string column into an owned value.
///
/// Yields `None` when the column is absent or equal to `EMPTY_TOKEN`;
/// every other value is copied as-is.
fn parse_string_field(field: Option<&str>) -> Option<String> {
    field
        .filter(|&value| value != EMPTY_TOKEN)
        .map(str::to_owned)
}
/// Parses the mandatory token identifier column.
///
/// On success the result is always `Ok(Some(id))`; this function never
/// returns `Ok(None)`.
///
/// # Errors
///
/// * `ReadError::ParseIdentifierField` when the column is absent or equal
///   to `EMPTY_TOKEN`.
/// * `ReadError::ParseIntField` when the value is not an unsigned integer.
fn parse_identifier_field(field: Option<&str>) -> Result<Option<usize>, ReadError> {
    let raw = field.ok_or_else(|| ReadError::ParseIdentifierField {
        value: "A token identifier should be present".to_owned(),
    })?;

    if raw == EMPTY_TOKEN {
        return Err(ReadError::ParseIdentifierField {
            value: raw.to_owned(),
        });
    }

    match raw.parse::<usize>() {
        Ok(id) => Ok(Some(id)),
        Err(_) => Err(ReadError::ParseIntField {
            value: raw.to_owned(),
        }),
    }
}
/// Parses an optional unsigned integer column.
///
/// Yields `Ok(None)` when the column is absent or equal to `EMPTY_TOKEN`.
///
/// # Errors
///
/// Returns `ReadError::ParseIntField` carrying the offending text when the
/// value cannot be parsed as `usize`.
fn parse_numeric_field(field: Option<&str>) -> Result<Option<usize>, ReadError> {
    let raw = match field {
        Some(text) if text != EMPTY_TOKEN => text,
        _ => return Ok(None),
    };

    raw.parse::<usize>()
        .map(Some)
        .map_err(|_| ReadError::ParseIntField {
            value: raw.to_owned(),
        })
}
/// A sink that sentences can be written to one at a time.
pub trait WriteSentence {
/// Writes `sentence` to this sink.
///
/// Implementations report failure through the `Err` variant.
fn write_sentence(&mut self, sentence: &Sentence) -> Result<(), Error>;
}
/// Sentence writer backed by an `io::Write` sink.
pub struct Writer<W> {
    // The sink that receives the formatted sentences.
    write: W,
    // `true` until the first sentence has been written; the
    // `WriteSentence` implementation uses it to decide whether a
    // separating newline is needed.
    first: bool,
}

impl<W: io::Write> Writer<W> {
    /// Creates a writer that takes ownership of the sink `write`.
    pub fn new(write: W) -> Self {
        Self { first: true, write }
    }

    /// Borrows the underlying sink.
    pub fn get_ref(&self) -> &W {
        &self.write
    }
}
impl<W: io::Write> WriteSentence for Writer<W> {
    /// Writes `sentence` using its `Display` representation.
    ///
    /// The first sentence is written as-is; every later sentence is
    /// preceded by a newline so that consecutive sentences are separated.
    ///
    /// # Errors
    ///
    /// Propagates any error reported by the underlying sink.
    fn write_sentence(&mut self, sentence: &Sentence) -> Result<(), Error> {
        // Read and clear the flag in one step; it stays `false` from now on.
        let is_first = std::mem::replace(&mut self.first, false);

        if is_first {
            write!(self.write, "{}", sentence)?;
        } else {
            write!(self.write, "\n{}", sentence)?;
        }

        Ok(())
    }
}
/// Writer that distributes sentences over several inner writers in turn.
pub struct PartitioningWriter<W>
where
W: WriteSentence,
{
// The inner writers that take turns receiving sentences.
writers: Vec<W>,
// Index into `writers` of the writer that receives the next sentence.
fold: usize,
}
impl<W> PartitioningWriter<W>
where
    W: WriteSentence,
{
    /// Creates a partitioning writer over `writers`, starting with the
    /// first writer in the vector.
    pub fn new(writers: Vec<W>) -> Self {
        Self { fold: 0, writers }
    }
}
impl<W> WriteSentence for PartitioningWriter<W>
where
W: WriteSentence,
{
fn write_sentence(&mut self, sentence: &Sentence) -> Result<(), Error> {
if self.fold == self.writers.len() {
self.fold = 0
}
self.writers[self.fold].write_sentence(sentence)?;
self.fold += 1;
Ok(())
}
}
#[cfg(test)]
mod tests {
    use std::fs;
    use std::io::{BufRead, Cursor};
    use std::str;

    use failure::Error;

    use super::{ReadSentence, WriteSentence, Writer};
    use crate::graph::Sentence;
    use crate::tests::{read_sentences, TEST_SENTENCES};

    // Fixture files, relative to the directory the tests are run from.
    static BASIC: &str = "testdata/basic.conll";
    static DOUBLE_NEWLINE: &str = "testdata/double-newline.conll";
    static EMPTY: &str = "testdata/empty.conll";

    /// Loads the complete contents of `filename` into a string.
    fn read_file(filename: &str) -> Result<String, Error> {
        let contents = fs::read_to_string(filename)?;
        Ok(contents)
    }

    /// Wraps a copy of `s` in an in-memory buffered reader.
    fn string_reader(s: &str) -> Box<dyn BufRead> {
        let bytes = s.as_bytes().to_owned();
        Box::new(Cursor::new(bytes))
    }

    /// Asserts that `read_sentences(fragment)` yields exactly `correct`.
    fn test_parsing(correct: &[Sentence], fragment: &str) {
        let sentences = read_sentences(fragment);
        assert_eq!(correct, sentences.as_slice());
    }

    #[test]
    fn reader() {
        test_parsing(&*TEST_SENTENCES, BASIC);
    }

    #[test]
    fn reader_robust() {
        test_parsing(&*TEST_SENTENCES, DOUBLE_NEWLINE);
    }

    #[test]
    fn reader_marked_empty() {
        test_parsing(&*TEST_SENTENCES, EMPTY);
    }

    // A non-numeric identifier is reported as an integer parse failure.
    #[test]
    #[should_panic(expected = "ParseIntField")]
    fn reader_rejects_non_numeric_id() {
        let mut reader = super::Reader::new(string_reader("test"));
        reader.read_sentence().unwrap();
    }

    // The empty marker is not accepted in the identifier column.
    #[test]
    #[should_panic(expected = "ParseIdentifierField")]
    fn reader_rejects_underscore_id() {
        let mut reader = super::Reader::new(string_reader("_"));
        reader.read_sentence().unwrap();
    }

    // Writing the reference sentences must reproduce the EMPTY fixture.
    #[test]
    fn writer() {
        let output = Vec::new();
        let mut writer = Writer::new(Box::new(output));

        for sentence in TEST_SENTENCES.iter() {
            writer.write_sentence(sentence).unwrap();
        }

        let expected = read_file(EMPTY).unwrap();
        let written = str::from_utf8(writer.get_ref()).unwrap();
        assert_eq!(expected, written);
    }
}