substudy/
lang.rs

//! Naming and identifying languages.  We use ISO 639 language codes.
2
3use std::{collections::HashMap, fmt, result, str::from_utf8};
4
5use anyhow::anyhow;
6use lazy_static::lazy_static;
7use log::debug;
8use serde::{Serialize, Serializer};
9use whatlang;
10
11use crate::Result;
12
/// External CSV data from the LoC, embedded in the binary at compile time
/// with `include_str!`.
///
/// This is a CSV file which looks like:
///
/// ```csv
/// alpha3-b,alpha3-t,alpha2,English,French
/// aar,null,aa,Afar,afar
/// ```
///
/// A code that does not exist for a language is written as the literal
/// string `null`, as in the `alpha3-t` column of the example row above.
static ISO_639_CODES: &str = include_str!("data/language-codes-full.csv");
22
/// Maps related to ISO 639 language codes.
struct LangMaps {
    /// Maps each three-letter code (alpha3-b or alpha3-t) to the two-letter
    /// alpha2 code of the same language, for languages that have an alpha2
    /// code.
    canonical_codes: HashMap<String, String>,
    /// Maps a language code to its English name string.  The key is the
    /// alpha2 code when the language has one, and otherwise each of its
    /// three-letter codes.
    names: HashMap<String, String>,
}
28
29/// Helper function called to build language maps.
30fn iso_689_canonical_codes_and_names() -> LangMaps {
31    let mut canonical_codes = HashMap::new();
32    let mut names = HashMap::new();
33
34    // Parse using `csv` crate.
35    let mut rdr = csv::Reader::from_reader(ISO_639_CODES.as_bytes());
36    let mut r = csv::StringRecord::new();
37    while rdr.read_record(&mut r).expect("error reading embedded CSV") {
38        let (a3b, a3t, a2, en, _fr) = (&r[0], &r[1], &r[2], &r[3], &r[4]);
39        if a2 != "null" {
40            if a3b != "null" {
41                canonical_codes.insert(a3b.to_owned(), a2.to_owned());
42            }
43            if a3t != "null" {
44                canonical_codes.insert(a3t.to_owned(), a2.to_owned());
45            }
46            names.insert(a2.to_owned(), en.to_owned());
47        } else {
48            if a3b != "null" {
49                names.insert(a3b.to_owned(), en.to_owned());
50            }
51            if a3t != "null" {
52                names.insert(a3t.to_owned(), en.to_owned());
53            }
54        }
55    }
56    LangMaps {
57        canonical_codes,
58        names,
59    }
60}
61
// Use the third-party `lazy_static!` macro to declare variables that will be
// initialized the first time we use them.
lazy_static! {
    static ref LANG_MAPS: LangMaps = iso_689_canonical_codes_and_names();
}
67
/// A language identifier.
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct Lang {
    /// The language code as ASCII bytes.  A two-letter code is stored in the
    /// first two bytes, with the third byte set to a space (`b' '`) as
    /// padding.
    code: [u8; 3],
}
73
74impl Lang {
75    /// Specify a language using an ISO 639-1, -2/T or -2/B code.  We know
76    /// that the same language is sometimes represented by more than one
77    /// code, and we do our best to treat equivalent codes as the same
78    /// language.
79    ///
80    /// ```
81    /// use substudy::lang::Lang;
82    /// assert_eq!(Lang::iso639("en").unwrap(), Lang::iso639("eng").unwrap());
83    /// assert!(Lang::iso639("en").unwrap() != Lang::iso639("fr").unwrap());
84    /// assert!(Lang::iso639("abcd").is_err());
85    /// ```
86    pub fn iso639(code: &str) -> Result<Lang> {
87        let canon = LANG_MAPS
88            .canonical_codes
89            .get(code)
90            .cloned()
91            .unwrap_or_else(|| code.to_owned());
92        let c = canon.as_bytes();
93        match (canon.is_ascii(), c.len()) {
94            (true, 2) => Ok(Lang {
95                code: [c[0], c[1], b' '],
96            }),
97            (true, 3) => Ok(Lang {
98                code: [c[0], c[1], c[2]],
99            }),
100            _ => Err(anyhow!("Unsupported language code: {}", code)),
101        }
102    }
103
104    /// Get the normalized language code as a `&str`.  Prefers ISO 639-1
105    /// codes when possible, and -2/T if that's the best it can do.
106    ///
107    /// ```
108    /// use substudy::lang::Lang;
109    /// assert_eq!("en", Lang::iso639("en").unwrap().as_str());
110    /// assert_eq!("en", Lang::iso639("eng").unwrap().as_str());
111    /// ```
112    pub fn as_str(&self) -> &str {
113        // We could actually use the unsafe from_utf8_unchecked here.
114        if self.code[2] == b' ' {
115            from_utf8(&self.code[..2]).unwrap()
116        } else {
117            from_utf8(&self.code).unwrap()
118        }
119    }
120
121    /// Try to determine the language of `text`.  We return `None` unless
122    /// we're pretty sure.
123    ///
124    /// ```
125    /// use substudy::lang::Lang;
126    /// let text = "Pour que le caractère d’un être humain dévoile des qualités";
127    /// assert_eq!(Lang::for_text(text).unwrap(), Lang::iso639("fr").unwrap());
128    /// ```
129    pub fn for_text(text: &str) -> Option<Lang> {
130        if let Some(info) = whatlang::detect(text) {
131            debug!("detected language: {:?}", info);
132            if info.is_reliable() {
133                return Lang::iso639(info.lang().code()).ok();
134            }
135        }
136        None
137    }
138
139    /// Names of the language (or related languages) in English. These
140    /// may be separated by semi-colons.
141    ///
142    /// ```
143    /// use substudy::lang::Lang;
144    /// assert_eq!(
145    ///     vec!["English".to_owned()],
146    ///     Lang::iso639("en").unwrap().english_names().unwrap(),
147    /// );
148    /// ```
149    pub fn english_names(&self) -> Result<Vec<&'static str>> {
150        let name_str = LANG_MAPS
151            .names
152            .get(self.as_str())
153            .map(|s| s.as_str())
154            .ok_or_else(|| {
155                anyhow!("No English name for language code: {:?}", self.as_str())
156            })?;
157        Ok(name_str.split("; ").collect())
158    }
159}
160
161impl fmt::Debug for Lang {
162    fn fmt(&self, f: &mut fmt::Formatter) -> result::Result<(), fmt::Error> {
163        write!(f, "{}", self.as_str())
164    }
165}
166
167impl fmt::Display for Lang {
168    fn fmt(&self, f: &mut fmt::Formatter) -> result::Result<(), fmt::Error> {
169        write!(f, "{}", self.as_str())
170    }
171}
172
173impl Serialize for Lang {
174    fn serialize<S>(&self, serializer: S) -> result::Result<S::Ok, S::Error>
175    where
176        S: Serializer,
177    {
178        self.as_str().serialize(serializer)
179    }
180}