#![deny(missing_docs)]
#![doc = include_str!("../README.md")]
#![doc = document_features::document_features!()]
#[doc = include_str!("../DATA.md")]
pub mod data {
/// Contents of the `examples.utf` file found in the build's `OUT_DIR`, embedded at compile time.
///
/// Only available when the `include` feature is enabled.
#[cfg(feature = "include")]
pub static EXAMPLES: &str = include_str!(concat!(env!("OUT_DIR"), "/examples.utf"));
/// Contents of the `examples_subset.utf` file found in the build's `OUT_DIR`, embedded at
/// compile time.
///
/// Only available when the `include_subset` feature is enabled.
#[cfg(feature = "include_subset")]
pub static EXAMPLES_SUBSET: &str =
include_str!(concat!(env!("OUT_DIR"), "/examples_subset.utf"));
}
use std::{fmt::Debug, str};
use lazy_regex::regex;
use miette::{miette, IntoDiagnostic, Result};
/// A parsed collection of examples.
///
/// All string data is borrowed from the text handed to [`Corpus::parse`], so a corpus cannot
/// outlive that text.
#[derive(Debug)]
pub struct Corpus<'a> {
/// The parsed examples, in the order in which they appeared in the input text.
pub examples: Vec<Example<'a>>,
}
/// A single example, built from one `A:` line and the `B:` line that follows it.
#[derive(Debug, PartialEq)]
pub struct Example<'a> {
/// The Japanese sentence (the text between `A: ` and the tab on the `A:` line).
pub ja: &'a str,
/// The English sentence (the text between the tab and `#ID=` on the `A:` line).
pub en: &'a str,
/// The identifier following `#ID=` on the `A:` line; consists of digits and underscores.
pub seq: &'a str,
/// The whitespace-separated entries of the `B:` line, in order.
pub words: Vec<Word<'a>>,
}
/// One whitespace-separated entry of a `B:` line.
///
/// An entry has the shape `dictionary(reading)[sense]{form}~`, where everything after the
/// dictionary part is optional.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Word<'a> {
    /// The leading part of the entry, up to the first `(`, `[`, `{` or `~`.
    pub dictionary: &'a str,
    /// The text inside `(...)`, if present. A parenthesised value that starts with `#`
    /// (for example `(#2028980)`) is not treated as a reading.
    pub reading: Option<&'a str>,
    /// The number inside `[...]`, if present (for example `[01]` becomes `1`).
    pub sense: Option<u32>,
    /// The text inside `{...}`, if present.
    pub form: Option<&'a str>,
    /// Whether the entry carries the `~` marker.
    pub checked: bool,
}
impl<'a> Corpus<'a> {
    /// Parses the full corpus that is embedded in the crate as [`data::EXAMPLES`].
    ///
    /// Only available when the `include` feature is enabled.
    ///
    /// # Panics
    ///
    /// Panics if the embedded text cannot be parsed.
    #[cfg(feature = "include")]
    pub fn examples() -> Corpus<'static> {
        Corpus::parse(data::EXAMPLES).expect("embedded examples should parse")
    }

    /// Parses the corpus subset that is embedded in the crate as [`data::EXAMPLES_SUBSET`].
    ///
    /// Only available when the `include_subset` feature is enabled.
    ///
    /// # Panics
    ///
    /// Panics if the embedded text cannot be parsed.
    #[cfg(feature = "include_subset")]
    pub fn examples_subset() -> Corpus<'static> {
        Corpus::parse(data::EXAMPLES_SUBSET).expect("embedded examples subset should parse")
    }

    /// Parses `text` as a sequence of line pairs: an `A:` line followed by its `B:` line.
    ///
    /// The returned corpus borrows every string from `text`.
    ///
    /// # Errors
    ///
    /// Returns an error if an `A:` line is not followed by another line, or if either line
    /// of a pair does not have the expected format.
    pub fn parse(text: &'a str) -> Result<Corpus<'a>> {
        let mut lines = text.lines();
        let mut examples = Vec::new();
        // Lines are consumed two at a time, so the iterator is advanced by hand.
        while let Some(line_a) = lines.next() {
            // Build the error lazily: constructing a report on every successful pair is wasted work.
            let line_b = lines
                .next()
                .ok_or_else(|| miette!("no B line for A line"))?;
            examples.push(Example::parse(line_a, line_b)?);
        }
        Ok(Corpus { examples })
    }
}
impl<'a> Example<'a> {
    /// Parses one example from an `A:` line and the `B:` line that belongs to it.
    ///
    /// `line_a` must look like `A: <japanese>\t<english>#ID=<id>`; `line_b` must start with
    /// the `B:` marker followed by whitespace-separated word entries.
    ///
    /// # Errors
    ///
    /// Returns an error if `line_b` is empty or does not start with `B:`, if `line_a` does
    /// not match the expected format, or if any word entry fails to parse.
    fn parse(line_a: &'a str, line_b: &'a str) -> Result<Self> {
        let mut parts = line_b.split_whitespace();
        // An empty or whitespace-only line yields no parts at all; report it instead of panicking.
        match parts.next() {
            Some("B:") => {}
            Some(other) => return Err(miette!("no 'B:' marker found: {}", other)),
            None => return Err(miette!("no 'B:' marker found: line is empty")),
        }

        // The Japanese sentence comes first on the line, the English one after the tab.
        let re = regex!(r"^A: (?<ja>.+)\t(?<en>.+)#ID=(?<id>[0-9_]+)$");
        let caps = re.captures(line_a).ok_or_else(|| {
            miette!(
                "line did not match expected format for 'A' line: {}",
                line_a
            )
        })?;

        let words = parts.map(Word::parse).collect::<Result<Vec<_>>>()?;

        // None of the three groups is optional, so they are always present after a match.
        Ok(Example {
            ja: caps.name("ja").expect("group `ja` is mandatory").as_str(),
            en: caps.name("en").expect("group `en` is mandatory").as_str(),
            seq: caps.name("id").expect("group `id` is mandatory").as_str(),
            words,
        })
    }
}
impl<'a> Word<'a> {
    /// Parses a single whitespace-free entry of a `B:` line.
    ///
    /// The entry has the shape `dictionary(reading)[sense]{form}~`; every part after the
    /// dictionary part is optional. The pattern is not anchored at the end, so anything
    /// that does not fit this shape after the dictionary part (for example `(#2028980)`)
    /// is left unparsed and ignored.
    ///
    /// # Errors
    ///
    /// Returns an error if the entry does not start with a dictionary part, or if the
    /// bracketed sense is not a valid `u32`.
    fn parse(part: &'a str) -> Result<Word<'a>> {
        let re = regex!(
            r"(?x) # x = verbose mode
            (?<kanji>[^\(\[\{\~]+)
            (\((?<reading>[^\#].*?)\))?
            (\[(?<sense>[^\#].*?)\])?
            (\{(?<form>[^\#].*?)\})?
            (?<checked>~)?"
        );
        // Build the error lazily: formatting a message for every successfully parsed word
        // is wasted work.
        let caps = re.captures(part).ok_or_else(|| {
            miette!(
                "text did not match expected format for word on 'B' line: {}",
                part
            )
        })?;

        let sense = caps
            .name("sense")
            .map(|m| m.as_str().parse::<u32>().into_diagnostic())
            .transpose()?;

        Ok(Word {
            // The `kanji` group is mandatory, so it is always present after a match.
            dictionary: caps
                .name("kanji")
                .expect("group `kanji` is mandatory")
                .as_str(),
            reading: caps.name("reading").map(|m| m.as_str()),
            sense,
            form: caps.name("form").map(|m| m.as_str()),
            checked: caps.name("checked").is_some(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    static LINE_A: &str = "A: 彼は忙しい生活の中で家族と会うことがない。\
        \t\
        He doesn't see his family in his busy life.#ID=303697_100000";
    static LINE_B: &str = "B: \n\
        彼(かれ)[01] \n\
        は \n\
        忙しい(いそがしい) \n\
        生活~ \n\
        の \n\
        中(なか)~ \n\
        で(#2028980) \n\
        家族 \n\
        と \n\
        会う[01] \n\
        事(こと){こと} \n\
        が \n\
        無い{ない}";

    /// Shorthand for building an expected [`Word`].
    fn word<'a>(
        dictionary: &'a str,
        reading: Option<&'a str>,
        sense: Option<u32>,
        form: Option<&'a str>,
        checked: bool,
    ) -> Word<'a> {
        Word {
            dictionary,
            reading,
            sense,
            form,
            checked,
        }
    }

    #[test]
    fn parse_pair() -> Result<()> {
        let sample = Example::parse(LINE_A, LINE_B)?;
        assert_eq!(sample.ja, "彼は忙しい生活の中で家族と会うことがない。");
        assert_eq!(sample.en, "He doesn't see his family in his busy life.");
        assert_eq!(sample.seq, "303697_100000");

        // Comparing the whole list also pins the number of parsed words.
        let expected = vec![
            word("彼", Some("かれ"), Some(1), None, false),
            word("は", None, None, None, false),
            word("忙しい", Some("いそがしい"), None, None, false),
            word("生活", None, None, None, true),
            word("の", None, None, None, false),
            word("中", Some("なか"), None, None, true),
            // `(#2028980)` starts with `#`, so it is not captured as a reading.
            word("で", None, None, None, false),
            word("家族", None, None, None, false),
            word("と", None, None, None, false),
            word("会う", None, Some(1), None, false),
            word("事", Some("こと"), None, Some("こと"), false),
            word("が", None, None, None, false),
            word("無い", None, None, Some("ない"), false),
        ];
        assert_eq!(sample.words, expected);
        Ok(())
    }

    // `data::EXAMPLES_SUBSET` only exists when the `include_subset` feature is enabled.
    #[cfg(feature = "include_subset")]
    #[test]
    fn parse_subset() -> Result<()> {
        let test_corpus_len = 25854;
        let corpus = Corpus::parse(data::EXAMPLES_SUBSET)?;
        assert_eq!(corpus.examples.len(), test_corpus_len);

        let last_sample = &corpus.examples[test_corpus_len - 1];
        assert_eq!(last_sample.ja, "彼は暴言罪で告発された。");
        assert_eq!(
            last_sample.en,
            "He was charged with the crime of abusive language."
        );
        assert_eq!(last_sample.seq, "7015381_99995");
        assert_eq!(
            last_sample.words[0],
            word("彼", Some("かれ"), Some(1), None, false)
        );
        assert_eq!(
            last_sample.words[6],
            word("為れる", None, None, Some("された"), false)
        );
        Ok(())
    }
}