// tanaka 0.1.0 - Docs.rs

// Every public item must carry documentation; a missing doc comment is a
// hard compile error rather than a warning.
#![deny(missing_docs)]
// The README is used verbatim as the crate-level documentation.
#![doc = include_str!("../README.md")]
// This heading provides the `#feature-flags` anchor that item docs in this
// file link to via `crate#feature-flags`.
//! ## Feature flags
// Appends the output of `document_features!()` under the heading above
// (presumably generated from the feature comments in Cargo.toml — confirm).
#![doc = document_features::document_features!()]

#[doc = include_str!("../DATA.md")]
pub mod data {
    /// The latest UTF-8 encoded corpus, as of December 2023.
    ///
    /// Originally downloaded from [here][1].
    ///
    /// The text is embedded at compile time from `examples.utf` in the
    /// build's `OUT_DIR`.
    ///
    /// # Feature
    ///
    /// Requires the `include` [feature](crate#feature-flags).
    ///
    /// # License
    ///
    /// See [here](crate::data#license).
    ///
    /// [1]: ftp://ftp.edrdg.org/pub/Nihongo/examples.utf.gz
    #[cfg(feature = "include")]
    pub static EXAMPLES: &str = include_str!(concat!(env!("OUT_DIR"), "/examples.utf"));

    /// Like `EXAMPLES`, but only the entries that contain at least
    /// one checked word.
    ///
    /// Originally downloaded from [here][1], and re-encoded into UTF-8
    /// with `iconv`.
    ///
    /// The text is embedded at compile time from `examples_subset.utf` in
    /// the build's `OUT_DIR`.
    ///
    /// # Feature
    ///
    /// Requires the `include_subset` [feature](crate#feature-flags).
    ///
    /// # License
    ///
    /// See [here](crate::data#license).
    ///
    /// [1]: ftp://ftp.edrdg.org/pub/Nihongo/examples_s.gz
    #[cfg(feature = "include_subset")]
    pub static EXAMPLES_SUBSET: &str =
        include_str!(concat!(env!("OUT_DIR"), "/examples_subset.utf"));
}

use std::{fmt::Debug, str};

use lazy_regex::regex;
use miette::{miette, IntoDiagnostic, Result};

/// A parsed, in-memory corpus.
///
/// All string data is borrowed from the text that was parsed, so a
/// `Corpus<'a>` cannot outlive that text.
#[derive(Debug)]
pub struct Corpus<'a> {
    /// The examples in the corpus, in the order they appear in the
    /// source text.
    pub examples: Vec<Example<'a>>,
}

/// An example sentence.
///
/// Each example is built from a pair of lines in the corpus: an `A:`
/// line carrying the sentence, its translation and the sequence
/// number, and a `B:` line listing the words of the sentence.
#[derive(Debug, PartialEq)]
pub struct Example<'a> {
    /// The Japanese sentence.
    ///
    /// This is the text between `A: ` and the tab on the `A:` line.
    pub ja: &'a str,

    /// The English translation.
    ///
    /// This is the text between the tab and the `#ID=` marker on the
    /// `A:` line.
    pub en: &'a str,

    /// A sequence number.
    ///
    /// Used to identify the pair uniquely across several projects
    /// using the file. It is the text after `#ID=` (digits and
    /// underscores), kept as a string slice rather than parsed.
    pub seq: &'a str,

    /// The Japanese words found in the sentence, in the order they are
    /// listed on the `B:` line.
    pub words: Vec<Word<'a>>,
}

/// Information about a Japanese word found in an [Example].
///
/// Each word corresponds to one whitespace-separated entry on the
/// `B:` line of a sentence pair.
#[derive(Debug, PartialEq)]
pub struct Word<'a> {
    /// The dictionary form of the word.
    ///
    /// This is the leading part of the entry, up to the first `(`,
    /// `[`, `{` or `~`.
    pub dictionary: &'a str,

    /// A reading in hiragana.
    ///
    /// This is to resolve cases where the word can be read in different
    /// ways. WWWJDIC uses this to ensure that only the appropriate
    /// sentences are linked. The reading is in "round" parentheses.
    ///
    /// A parenthesised group that starts with `#` is not treated as a
    /// reading and leaves this field as `None`.
    pub reading: Option<&'a str>,

    /// A sense number.
    ///
    /// This occurs when the word has multiple senses in the EDICT
    /// file, and indicates which sense applies in the
    /// sentence. WWWJDIC displays these numbers. The sense number is
    /// in "square" parentheses, and is parsed as an integer (so `[01]`
    /// becomes `1`).
    pub sense: Option<u32>,

    /// The form in which the word appears in the sentence.
    ///
    /// This will differ from the indexing word if it has been
    /// inflected, for example. This field is in "curly" parentheses.
    pub form: Option<&'a str>,

    /// Indicates that the sentence pair is a good and checked example
    /// of the usage of the word.
    ///
    /// Words are marked to enable appropriate sentences to be
    /// selected by dictionary software. Typically only one instance
    /// per sense of a word will be marked. The WWWJDIC server
    /// displays these sentences below the display of the related
    /// dictionary entry.
    ///
    /// In the corpus text, a checked word is marked with a trailing `~`.
    pub checked: bool,
}

impl<'a> Corpus<'a> {
    /// Load [the built-in corpus](data::EXAMPLES).
    ///
    /// # Feature
    ///
    /// Requires the `include` [feature](crate#feature-flags).
    ///
    /// # Panics
    ///
    /// Panics if the bundled text fails to parse.
    #[cfg(feature = "include")]
    pub fn examples() -> Corpus<'static> {
        Corpus::parse(data::EXAMPLES).expect("bundled corpus should parse")
    }

    /// Load [the built-in subset corpus](data::EXAMPLES_SUBSET).
    ///
    /// # Feature
    ///
    /// Requires the `include_subset` [feature](crate#feature-flags).
    ///
    /// # Panics
    ///
    /// Panics if the bundled text fails to parse.
    #[cfg(feature = "include_subset")]
    pub fn examples_subset() -> Corpus<'static> {
        Corpus::parse(data::EXAMPLES_SUBSET).expect("bundled subset corpus should parse")
    }

    /// Parse the text of a corpus.
    ///
    /// This must be in the format described in [data]: the text is read
    /// two lines at a time, an `A:` line followed by its `B:` line.
    ///
    /// The returned corpus borrows from `text`.
    ///
    /// # Errors
    ///
    /// Returns an error if an `A:` line is not followed by another line,
    /// or if either line of a pair cannot be parsed.
    pub fn parse(text: &'a str) -> Result<Corpus<'a>> {
        let mut examples = Vec::new();
        let mut lines = text.lines();

        // Lines come in A/B pairs, so two are pulled per iteration; a
        // plain `for` loop cannot be used because of the inner `next()`.
        while let Some(line_a) = lines.next() {
            // Build the error lazily: the success path runs once per pair
            // and should not pay for constructing a report.
            let line_b = lines
                .next()
                .ok_or_else(|| miette!("no B line for A line"))?;

            examples.push(Example::parse(line_a, line_b)?);
        }

        Ok(Corpus { examples })
    }
}

impl<'a> Example<'a> {
    /// Parse one sentence pair from its `A:` and `B:` lines.
    ///
    /// `line_a` must look like `A: <japanese>\t<english>#ID=<seq>`, and
    /// `line_b` must start with the `B:` marker followed by
    /// whitespace-separated word entries (see [`Word`]).
    ///
    /// The returned example borrows from both input lines.
    ///
    /// # Errors
    ///
    /// Fails if `line_b` is empty or does not start with `B:`, if
    /// `line_a` does not match the expected layout, or if any word
    /// entry cannot be parsed.
    fn parse(line_a: &'a str, line_b: &'a str) -> Result<Self> {
        let mut parts = line_b.split_whitespace();

        // An empty or whitespace-only line yields no marker at all; treat
        // that as a missing marker instead of indexing out of bounds.
        let marker = parts.next().unwrap_or("");
        if marker != "B:" {
            return Err(miette!("no 'B:' marker found: {}", marker));
        }

        // The Japanese sentence comes first, then a tab, then the English
        // translation, then the `#ID=` sequence marker.
        let re = regex!(r"^A: (?<ja>.+)\t(?<en>.+)#ID=(?<id>[0-9_]+)$");

        // The error is built lazily so that well-formed lines do not pay
        // for formatting a message that is never used.
        let matches = re.captures(line_a).ok_or_else(|| {
            miette!(
                "line did not match expected format for 'A' line: {}",
                line_a
            )
        })?;

        // We can't use matches[] here, as that gives a reference
        // within the match object. `name(..).as_str()` borrows from
        // `line_a` instead. The three groups are mandatory in the
        // pattern, so they are always present once the regex matched.
        Ok(Example {
            ja: matches.name("ja").unwrap().as_str(),
            en: matches.name("en").unwrap().as_str(),
            seq: matches.name("id").unwrap().as_str(),
            // Everything after the marker is a word entry.
            words: parts.map(Word::parse).collect::<Result<Vec<_>>>()?,
        })
    }
}

impl<'a> Word<'a> {
    /// Parse a single word entry from a `B:` line.
    ///
    /// An entry is a dictionary form optionally followed, in this order,
    /// by `(reading)`, `[sense]`, `{form}` and a `~` "checked" marker.
    ///
    /// The pattern is not anchored, so trailing text that does not fit
    /// (for instance a parenthesised group beginning with `#`) is ignored
    /// rather than rejected.
    ///
    /// # Errors
    ///
    /// Fails if `part` contains no dictionary form, or if the text in
    /// square brackets is not a valid `u32`.
    fn parse(part: &'a str) -> Result<Word<'a>> {
        let re = regex!(
            r"(?x) # x = verbose mode
              (?<kanji>[^\(\[\{\~]+)
              (\((?<reading>[^\#].*?)\))?
              (\[(?<sense>[^\#].*?)\])?
              (\{(?<form>[^\#].*?)\})?
              (?<checked>~)?"
        );

        // Build the error lazily: this runs for every word in the corpus
        // and the message is only needed on failure.
        let matches = re.captures(part).ok_or_else(|| {
            miette!(
                "text did not match expected format for word on 'B' line: {}",
                part
            )
        })?;

        let sense = matches
            .name("sense")
            .map(|m| m.as_str().parse::<u32>().into_diagnostic())
            .transpose()?;

        // We can't use matches[] here, as that gives a reference
        // within the match object. `name(..).as_str()` borrows from
        // `part` instead. The `kanji` group is mandatory in the pattern,
        // so it is always present once the regex matched.
        Ok(Word {
            dictionary: matches.name("kanji").unwrap().as_str(),
            reading: matches.name("reading").map(|m| m.as_str()),
            sense,
            form: matches.name("form").map(|m| m.as_str()),
            checked: matches.name("checked").is_some(),
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// From the first sample from the corpus.
    ///
    /// The layout is the Japanese sentence, a tab, the English
    /// translation, then the `#ID=` sequence marker. The trailing `\`
    /// continuations join the pieces into a single line without adding
    /// any whitespace.
    static LINE_A: &str = "A: 彼は忙しい生活の中で家族と会うことがない。\
                           \t\
                           He doesn't see his family in his busy life.#ID=303697_100000";

    /// From the first sample from the corpus, but with a tilde added
    /// to a couple of words for testing purposes.
    ///
    /// The entries are separated by a space and a newline here purely
    /// for readability; `Example::parse` splits on any whitespace. The
    /// `で(#2028980)` entry covers a parenthesised group that starts
    /// with `#`, which must not be reported as a reading.
    static LINE_B: &str = "B: \n\
                           彼(かれ)[01] \n\
                           は \n\
                           忙しい(いそがしい) \n\
                           生活~ \n\
                           の \n\
                           中(なか)~ \n\
                           で(#2028980) \n\
                           家族 \n\
                           と \n\
                           会う[01] \n\
                           事(こと){こと} \n\
                           が \n\
                           無い{ない}";

    /// Parse a single A/B pair and check every field of the result,
    /// including each of the thirteen words on the B line.
    #[test]
    fn parse_pair() -> Result<()> {
        // Positional shorthand so the expectation table below stays
        // compact: (dictionary, reading, sense, form, checked).
        fn word<'a>(
            dictionary: &'a str,
            reading: Option<&'a str>,
            sense: Option<u32>,
            form: Option<&'a str>,
            checked: bool,
        ) -> Word<'a> {
            Word {
                dictionary,
                reading,
                sense,
                form,
                checked,
            }
        }

        let parsed = Example::parse(LINE_A, LINE_B)?;

        assert_eq!(parsed.ja, "彼は忙しい生活の中で家族と会うことがない。");
        assert_eq!(parsed.en, "He doesn't see his family in his busy life.");
        assert_eq!(parsed.seq, "303697_100000");

        let expected = [
            word("彼", Some("かれ"), Some(1), None, false),
            word("は", None, None, None, false),
            word("忙しい", Some("いそがしい"), None, None, false),
            word("生活", None, None, None, true),
            word("の", None, None, None, false),
            word("中", Some("なか"), None, None, true),
            word("で", None, None, None, false),
            word("家族", None, None, None, false),
            word("と", None, None, None, false),
            word("会う", None, Some(1), None, false),
            word("事", Some("こと"), None, Some("こと"), false),
            word("が", None, None, None, false),
            word("無い", None, None, Some("ない"), false),
        ];

        // Compare position by position so a failure names the offending
        // word index.
        for (index, want) in expected.iter().enumerate() {
            assert_eq!(&parsed.words[index], want, "word at index {}", index);
        }

        Ok(())
    }

    /// Test parsing a subset of the corpus. We test loading the whole
    /// corpus in the integration tests.
    ///
    /// The subset text only exists when the `include_subset` feature is
    /// enabled, so the test is gated on the same feature; without the
    /// gate the test module would fail to compile when the feature is
    /// off.
    #[cfg(feature = "include_subset")]
    #[test]
    fn parse_subset() -> Result<()> {
        let test_corpus_len = 25854;

        let corpus = Corpus::parse(data::EXAMPLES_SUBSET)?;

        assert_eq!(corpus.examples.len(), test_corpus_len);

        // Check the last sample, to be sure we got to the end.
        let last_sample = &corpus.examples[test_corpus_len - 1];
        assert_eq!(last_sample.ja, "彼は暴言罪で告発された。");
        assert_eq!(
            last_sample.en,
            "He was charged with the crime of abusive language."
        );
        assert_eq!(last_sample.seq, "7015381_99995");

        // Check first and last word.
        assert_eq!(
            &last_sample.words[0],
            &Word {
                dictionary: "彼",
                reading: Some("かれ"),
                form: None,
                sense: Some(1),
                checked: false,
            }
        );
        assert_eq!(
            &last_sample.words[6],
            &Word {
                dictionary: "為れる",
                reading: None,
                form: Some("された"),
                sense: None,
                checked: false,
            }
        );

        Ok(())
    }
}