1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
//! Naming and identifying languages.  We use ISO 639 language codes.

use std::{collections::HashMap, fmt, result, str::from_utf8};

use anyhow::anyhow;
use lazy_static::lazy_static;
use log::debug;
use serde::{Serialize, Serializer};
use whatlang;

use crate::Result;

/// External CSV data from the LoC, embedded into the program at compile
/// time with `include_str!`.
///
/// This is a CSV file which looks like:
///
/// ```csv
/// alpha3-b,alpha3-t,alpha2,English,French
/// aar,null,aa,Afar,afar
/// ```
///
/// A code that does not exist for a given language is written as the
/// literal string `null`.
static ISO_639_CODES: &str = include_str!("data/language-codes-full.csv");

/// Maps related to ISO 639 language codes.
struct LangMaps {
    /// Maps each three-letter code (the `alpha3-b` and `alpha3-t` columns)
    /// to the two-letter `alpha2` code of the same row.  Rows without a
    /// two-letter code contribute no entries.
    canonical_codes: HashMap<String, String>,
    /// Maps a code to the row's `English` field.  The key is the two-letter
    /// code when the row has one; otherwise each three-letter code of the
    /// row is used as a key.
    names: HashMap<String, String>,
}

/// Parse the embedded ISO 639 table into the lookup maps used by `Lang`.
///
/// For every row that has a two-letter code, each three-letter code of the
/// row is mapped to it in `canonical_codes`, and the English name is stored
/// in `names` under the two-letter code.  For rows without a two-letter
/// code, the English name is stored under each three-letter code instead.
///
/// (The `689` in the function name is a typo for 639; the name is kept
/// because the lazily-initialized static refers to it.)
///
/// # Panics
///
/// Panics if the embedded CSV cannot be read, or if a row has fewer than
/// five fields.
fn iso_689_canonical_codes_and_names() -> LangMaps {
    // The table spells a missing code as this literal string.
    const MISSING: &str = "null";

    let mut maps = LangMaps {
        canonical_codes: HashMap::new(),
        names: HashMap::new(),
    };

    // Parse using `csv` crate.
    let mut reader = csv::Reader::from_reader(ISO_639_CODES.as_bytes());
    for row in reader.records() {
        let row = row.expect("error reading embedded CSV");
        let alpha3_b = &row[0];
        let alpha3_t = &row[1];
        let alpha2 = &row[2];
        let english = &row[3];
        // The French name is unused, but the column is still indexed so
        // that a short row fails loudly instead of being accepted.
        let _french = &row[4];

        // The three-letter codes that are actually present, -2/B first.
        let alpha3_codes = [alpha3_b, alpha3_t];
        let present = alpha3_codes
            .iter()
            .copied()
            .filter(|code| *code != MISSING);

        if alpha2 == MISSING {
            // No two-letter code: the three-letter codes name the language.
            for code in present {
                maps.names.insert(code.to_owned(), english.to_owned());
            }
        } else {
            // The two-letter code is canonical for this language.
            for code in present {
                maps.canonical_codes
                    .insert(code.to_owned(), alpha2.to_owned());
            }
            maps.names.insert(alpha2.to_owned(), english.to_owned());
        }
    }
    maps
}

// Use the third-party `lazy_static!` macro to declare variables that will
// be initialized the first time we use them.
lazy_static! {
    // Language maps built from the embedded ISO 639 CSV data.
    static ref LANG_MAPS: LangMaps = iso_689_canonical_codes_and_names();
}

/// A language identifier.
///
/// Two values compare equal when their stored codes are byte-for-byte
/// equal.
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct Lang {
    // ASCII bytes of the language code.  A two-letter code is stored in the
    // first two bytes and padded with a trailing `b' '`, which `as_str`
    // strips again.
    code: [u8; 3],
}

impl Lang {
    /// Specify a language using an ISO 639-1, -2/T or -2/B code.  We know
    /// that the same language is sometimes represented by more than one
    /// code, and we do our best to treat equivalent codes as the same
    /// language.
    ///
    /// # Errors
    ///
    /// Returns an error unless the code (after mapping three-letter codes
    /// to their two-letter equivalent, where one exists) consists of exactly
    /// two or three ASCII letters.
    ///
    /// ```
    /// use substudy::lang::Lang;
    /// assert_eq!(Lang::iso639("en").unwrap(), Lang::iso639("eng").unwrap());
    /// assert!(Lang::iso639("en").unwrap() != Lang::iso639("fr").unwrap());
    /// assert!(Lang::iso639("abcd").is_err());
    /// assert!(Lang::iso639("en ").is_err());
    /// ```
    pub fn iso639(code: &str) -> Result<Lang> {
        // Borrow the canonical code instead of cloning it; unknown codes
        // are passed through unchanged.
        let canon = LANG_MAPS
            .canonical_codes
            .get(code)
            .map(String::as_str)
            .unwrap_or(code);
        let bytes = canon.as_bytes();

        // Only letters are allowed.  In particular this rejects a trailing
        // space, which would otherwise be indistinguishable from the padding
        // byte we use internally for two-letter codes.
        let all_letters = bytes.iter().all(u8::is_ascii_alphabetic);
        match (all_letters, bytes) {
            (true, &[a, b]) => Ok(Lang {
                code: [a, b, b' '],
            }),
            (true, &[a, b, c]) => Ok(Lang { code: [a, b, c] }),
            _ => Err(anyhow!("Unsupported language code: {}", code)),
        }
    }

    /// Get the normalized language code as a `&str`.  This is the ISO 639-1
    /// (two-letter) code when we know one, and otherwise the three-letter
    /// code that was originally supplied.
    ///
    /// ```
    /// use substudy::lang::Lang;
    /// assert_eq!("en", Lang::iso639("en").unwrap().as_str());
    /// assert_eq!("en", Lang::iso639("eng").unwrap().as_str());
    /// ```
    pub fn as_str(&self) -> &str {
        // A trailing space is the padding byte of a two-letter code.
        let len = if self.code[2] == b' ' { 2 } else { 3 };
        // We could actually use the unsafe from_utf8_unchecked here, since
        // `iso639` only ever stores ASCII bytes.
        from_utf8(&self.code[..len]).expect("language codes are always ASCII")
    }

    /// Try to determine the language of `text`.  We return `None` unless
    /// we're pretty sure, or if the detected language has a code that
    /// `iso639` does not accept.
    ///
    /// ```
    /// use substudy::lang::Lang;
    /// let text = "Pour que le caractère d’un être humain dévoile des qualités";
    /// assert_eq!(Lang::for_text(text).unwrap(), Lang::iso639("fr").unwrap());
    /// ```
    pub fn for_text(text: &str) -> Option<Lang> {
        let info = whatlang::detect(text)?;
        debug!("detected language: {:?}", info);
        if info.is_reliable() {
            Lang::iso639(info.lang().code()).ok()
        } else {
            None
        }
    }

    /// Names of the language (or related languages) in English.  The
    /// underlying data separates alternative names with `"; "`; each
    /// alternative is returned as its own element.
    ///
    /// # Errors
    ///
    /// Returns an error if we have no English name for this language code.
    ///
    /// ```
    /// use substudy::lang::Lang;
    /// assert_eq!(
    ///     vec!["English".to_owned()],
    ///     Lang::iso639("en").unwrap().english_names().unwrap(),
    /// );
    /// ```
    pub fn english_names(&self) -> Result<Vec<&'static str>> {
        let name_str = LANG_MAPS
            .names
            .get(self.as_str())
            .map(|s| s.as_str())
            .ok_or_else(|| {
                anyhow!("No English name for language code: {:?}", self.as_str())
            })?;
        Ok(name_str.split("; ").collect())
    }
}

/// Debug output is the bare normalized language code (e.g. `en`), without
/// quotes or struct syntax.
impl fmt::Debug for Lang {
    fn fmt(&self, f: &mut fmt::Formatter) -> result::Result<(), fmt::Error> {
        // Emit the code verbatim; formatter flags such as width are not
        // applied.
        f.write_str(self.as_str())
    }
}

/// Displays the normalized language code (e.g. `en`).
///
/// Width, fill, alignment and precision given in the format string are
/// honored, so `format!("{:>5}", lang)` pads the code as it would any other
/// string.
impl fmt::Display for Lang {
    fn fmt(&self, f: &mut fmt::Formatter) -> result::Result<(), fmt::Error> {
        // `write!(f, "{}", ..)` would start from a fresh format spec and
        // silently drop the caller's padding flags; `pad` applies them.
        f.pad(self.as_str())
    }
}

impl Serialize for Lang {
    fn serialize<S>(&self, serializer: S) -> result::Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        self.as_str().serialize(serializer)
    }
}