use std::io::{BufRead, BufReader, Read};
use anyhow::Result;
pub struct Word {
surface: String,
feature: String,
}
impl Word {
pub fn new(surface: &str, feature: &str) -> Self {
Self {
surface: surface.to_string(),
feature: feature.to_string(),
}
}
pub fn surface(&self) -> &str {
&self.surface
}
pub fn feature(&self) -> &str {
&self.feature
}
}
pub struct Example {
pub(crate) sentence: String,
pub(crate) tokens: Vec<Word>,
}
impl Example {
pub fn new(tokens: Vec<Word>) -> Self {
let sentence = tokens.iter().map(|t| t.surface()).collect::<String>();
Self { sentence, tokens }
}
}
pub struct Corpus {
pub(crate) examples: Vec<Example>,
}
impl Default for Corpus {
fn default() -> Self {
Self::new()
}
}
impl Corpus {
pub fn new() -> Self {
Self {
examples: Vec::new(),
}
}
pub fn from_reader<R: Read>(reader: R) -> Result<Self> {
let reader = BufReader::new(reader);
let mut corpus = Self::new();
let mut current_tokens = Vec::new();
for line in reader.lines() {
let line = line?;
if line.trim().is_empty() || line == "EOS" {
if !current_tokens.is_empty() {
corpus.examples.push(Example::new(current_tokens));
current_tokens = Vec::new();
}
} else {
let parts: Vec<&str> = line.split('\t').collect();
if parts.len() >= 2 {
let surface = parts[0];
let features = parts[1];
current_tokens.push(Word::new(surface, features));
}
}
}
if !current_tokens.is_empty() {
corpus.examples.push(Example::new(current_tokens));
}
Ok(corpus)
}
pub fn len(&self) -> usize {
self.examples.len()
}
pub fn is_empty(&self) -> bool {
self.examples.is_empty()
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Cursor;
#[test]
fn test_corpus_from_reader() {
let corpus_data = r#"外国 名詞,一般,*,*,*,*,外国,ガイコク,ガイコク
人 名詞,接尾,一般,*,*,*,人,ジン,ジン
EOS
これ 連体詞,*,*,*,*,*,これ,コレ,コレ
は 助詞,係助詞,*,*,*,*,は,ハ,ワ
EOS
"#;
let cursor = Cursor::new(corpus_data.as_bytes());
let corpus = Corpus::from_reader(cursor).unwrap();
assert_eq!(corpus.len(), 2);
}
}