use std::io::{BufRead, Lines, Seek, SeekFrom};
use syntaxdot_tokenizers::{SentenceWithPieces, Tokenize};
use udgraph::graph::Sentence;
use udgraph::token::Token;
use crate::dataset::DataSet;
use crate::error::SyntaxDotError;
/// A data set backed by a plain-text reader.
///
/// The wrapped reader is stored as-is; no data is read at construction.
pub struct PlainTextDataSet<R>(R);

impl<R> PlainTextDataSet<R> {
    /// Wrap `read` in a plain-text data set.
    pub fn new(read: R) -> Self {
        PlainTextDataSet(read)
    }
}
impl<'a, R> DataSet<'a> for &'a mut PlainTextDataSet<R>
where
    R: BufRead + Seek,
{
    type Iter = PlainTextIter<'a, &'a mut R>;

    /// Create an iterator over the sentences of this data set.
    ///
    /// The underlying reader is first repositioned to its start, so every
    /// call iterates from the beginning of the data. The returned iterator
    /// borrows the reader mutably and applies `tokenizer` to each sentence.
    ///
    /// # Errors
    ///
    /// Returns an error when seeking to the start of the reader fails.
    fn sentences(self, tokenizer: &'a dyn Tokenize) -> Result<Self::Iter, SyntaxDotError> {
        let reader: &'a mut R = &mut self.0;

        // Rewind before handing the reader to the line iterator.
        reader.seek(SeekFrom::Start(0))?;

        let lines = reader.lines();
        Ok(PlainTextIter { lines, tokenizer })
    }
}
/// Iterator over the sentences of a plain-text data set.
///
/// Each non-blank line read from the underlying reader is turned into one
/// sentence, which is run through the tokenizer before being yielded.
pub struct PlainTextIter<'a, R> {
/// Line iterator over the underlying reader.
lines: Lines<R>,
/// Tokenizer applied to every sentence before it is yielded.
tokenizer: &'a dyn Tokenize,
}
impl<'a, R> Iterator for PlainTextIter<'a, R>
where
    R: BufRead,
{
    type Item = Result<SentenceWithPieces, SyntaxDotError>;

    /// Return the next tokenized sentence.
    ///
    /// Lines are read until one is found that is non-empty after trimming
    /// surrounding whitespace. That line is split into tokens on the space
    /// character, the tokens are collected into a sentence, and the sentence
    /// is passed through the tokenizer.
    ///
    /// Returns `None` once the reader has no more lines. A failure to read a
    /// line is yielded as `Some(Err(SyntaxDotError::IoError(_)))`.
    fn next(&mut self) -> Option<Self::Item> {
        for line in &mut self.lines {
            let line = match line {
                Ok(line) => line,
                Err(err) => return Some(Err(SyntaxDotError::IoError(err))),
            };

            // Blank (or whitespace-only) lines do not form a sentence.
            let line = line.trim();
            if line.is_empty() {
                continue;
            }

            // Tokens are separated by the space character only. Runs of
            // consecutive spaces produce empty fragments when splitting;
            // those are dropped so that no token with an empty form reaches
            // the tokenizer. A non-empty trimmed line always contains at
            // least one non-empty fragment, so the sentence is never empty.
            let sentence = line
                .split(' ')
                .filter(|form| !form.is_empty())
                .map(ToString::to_string)
                .map(Token::new)
                .collect::<Sentence>();

            return Some(Ok(self.tokenizer.tokenize(sentence)));
        }

        None
    }
}
#[cfg(test)]
mod tests {
    use std::io::{BufReader, Cursor};

    use crate::dataset::tests::{dataset_to_pieces, wordpiece_tokenizer, CORRECT_PIECE_IDS};
    use crate::dataset::PlainTextDataSet;

    // Fixture with one sentence per line; the raw string starts with a
    // newline, so the first line of the input is blank.
    const SENTENCES: &str = r#"
Dit is de eerste zin .
Dit de tweede zin .
En nu de laatste zin ."#;

    #[test]
    fn plain_text_dataset_works() {
        let tokenizer = wordpiece_tokenizer();
        let mut cursor = Cursor::new(SENTENCES);
        let mut dataset = PlainTextDataSet::new(BufReader::new(&mut cursor));

        // Convert the same data set twice: the second pass must produce the
        // same pieces as the first one.
        for _ in 0..2 {
            let pieces = dataset_to_pieces(&mut dataset, &tokenizer).unwrap();
            assert_eq!(pieces, *CORRECT_PIECE_IDS);
        }
    }
}