pub mod numerals;
pub mod transliteration;
use std::borrow::Cow;
use serde::{Deserialize, Serialize};
/// Structural classification of a writing system, stored in `Script::script_type`.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate has to keep a
/// wildcard arm when matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum ScriptType {
/// Assigned in this file to Latin, Cyrillic, Hangul and Greek.
Alphabet,
/// Assigned in this file to Devanagari.
Abugida,
/// Assigned in this file to Arabic.
Abjad,
/// Assigned in this file to Kana.
Syllabary,
/// Assigned in this file to CJK ideographs, Cuneiform and Egyptian hieroglyphs.
Logographic,
/// Not assigned by any constructor visible in this file.
Mixed,
}
/// Writing direction of a script, stored in `Script::direction`.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate has to keep a
/// wildcard arm when matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum Direction {
/// Used by every constructor in this file except `arabic()` and `egyptian()`.
LeftToRight,
/// Used in this file by `arabic()` and `egyptian()`.
RightToLeft,
/// Not assigned by any constructor visible in this file.
TopToBottom,
/// Not assigned by any constructor visible in this file.
Bidirectional,
}
/// Usage status of a script, stored in `Script::status`.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate has to keep a
/// wildcard arm when matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum ScriptStatus {
/// Assigned in this file to every script except Cuneiform and Egyptian hieroglyphs.
Living,
/// Not assigned by any constructor visible in this file.
Limited,
/// Assigned in this file to Cuneiform and Egyptian hieroglyphs, which also carry an
/// `attestation` period.
Historical,
}
/// Descriptor for one writing system: identity, classification and code-point coverage.
///
/// Values are produced by the constructor functions in this module (for example
/// `latin()`), or looked up by tag through `by_code`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Script {
/// Four-letter script tag such as `"Latn"` or `"Arab"`; this is the key `by_code` matches on.
/// NOTE(review): the tags look like ISO 15924 codes — confirm.
pub code: Cow<'static, str>,
/// Human-readable script name, e.g. `"Latin"`.
pub name: Cow<'static, str>,
/// Structural classification of the script.
pub script_type: ScriptType,
/// Writing direction recorded for the script.
pub direction: Direction,
/// Usage status recorded for the script.
pub status: ScriptStatus,
/// Period of use as free text (e.g. `"3400 BCE – 75 CE"`). The constructors in this
/// file set it only for scripts whose status is `Historical`, and use `None` otherwise.
pub attestation: Option<Cow<'static, str>>,
/// Code-point ranges as `(lo, hi)` pairs; `contains_codepoint` treats both ends as inclusive.
pub unicode_ranges: Vec<(u32, u32)>,
/// Short language tags such as `"en"` or `"sux"`.
/// NOTE(review): presumably ISO 639 codes of languages written in this script — confirm.
pub languages: Vec<Cow<'static, str>>,
}
impl Script {
    /// Reports whether `cp` lies inside any of this script's `unicode_ranges`.
    ///
    /// Every `(lo, hi)` pair is treated as a closed interval, so both endpoints
    /// count as members. A pair with `lo > hi` matches nothing, and an empty
    /// range list always yields `false`.
    #[must_use]
    pub fn contains_codepoint(&self, cp: u32) -> bool {
        for &(first, last) in &self.unicode_ranges {
            if (first..=last).contains(&cp) {
                return true;
            }
        }
        false
    }
}
#[must_use]
pub fn by_code(code: &str) -> Option<Script> {
tracing::trace!(code, "script lookup");
match code {
"Latn" => Some(latin()),
"Arab" => Some(arabic()),
"Deva" => Some(devanagari()),
"Hani" => Some(cjk()),
"Cyrl" => Some(cyrillic()),
"Hang" => Some(hangul()),
"Kana" => Some(kana()),
"Grek" => Some(greek()),
"Xsux" => Some(cuneiform()),
"Egyp" => Some(egyptian()),
_ => None,
}
}
/// Returns the tag of every script that `by_code` can resolve.
///
/// The slice is static and lists ten tags in a fixed order.
#[must_use]
pub fn all_codes() -> &'static [&'static str] {
    static CODES: [&str; 10] = [
        "Latn", "Arab", "Deva", "Hani", "Cyrl", "Hang", "Kana", "Grek", "Xsux", "Egyp",
    ];
    &CODES
}
/// Builds the descriptor for the Latin script (`"Latn"`).
///
/// The result is a left-to-right, living alphabet with no attestation period
/// and the language tags `en`, `fr`, `es`, `de`, `pt` and `it`.
///
/// The code-point ranges cover letters only: ASCII digits and punctuation are
/// left out, and so are U+00D7 MULTIPLICATION SIGN and U+00F7 DIVISION SIGN,
/// which sit among the accented letters but are not Latin letters.
#[must_use]
pub fn latin() -> Script {
    Script {
        code: Cow::Borrowed("Latn"),
        name: Cow::Borrowed("Latin"),
        script_type: ScriptType::Alphabet,
        direction: Direction::LeftToRight,
        status: ScriptStatus::Living,
        attestation: None,
        unicode_ranges: vec![
            (0x0041, 0x005A), // A–Z
            (0x0061, 0x007A), // a–z
            (0x00C0, 0x00D6), // À–Ö, stopping before U+00D7 (×)
            (0x00D8, 0x00F6), // Ø–ö, stopping before U+00F7 (÷)
            (0x00F8, 0x00FF), // ø–ÿ
            (0x0100, 0x024F), // Latin Extended-A and Latin Extended-B
        ],
        languages: vec![
            Cow::Borrowed("en"),
            Cow::Borrowed("fr"),
            Cow::Borrowed("es"),
            Cow::Borrowed("de"),
            Cow::Borrowed("pt"),
            Cow::Borrowed("it"),
        ],
    }
}
/// Builds the descriptor for the Arabic script (`"Arab"`).
///
/// The result is a right-to-left, living abjad with no attestation period.
/// It spans U+0600–U+06FF, U+0750–U+077F, U+FB50–U+FDFF and U+FE70–U+FEFF,
/// and lists the language tags `ar`, `fa` and `ur`.
#[must_use]
pub fn arabic() -> Script {
    let unicode_ranges = vec![
        (0x0600, 0x06FF),
        (0x0750, 0x077F),
        (0xFB50, 0xFDFF),
        (0xFE70, 0xFEFF),
    ];
    let languages: Vec<Cow<'static, str>> = ["ar", "fa", "ur"]
        .iter()
        .copied()
        .map(Cow::Borrowed)
        .collect();

    Script {
        code: Cow::from("Arab"),
        name: Cow::from("Arabic"),
        script_type: ScriptType::Abjad,
        direction: Direction::RightToLeft,
        status: ScriptStatus::Living,
        attestation: None,
        unicode_ranges,
        languages,
    }
}
/// Builds the descriptor for the Devanagari script (`"Deva"`).
///
/// The result is a left-to-right, living abugida with no attestation period.
/// It spans U+0900–U+097F and U+A8E0–U+A8FF, and lists the language tags
/// `hi`, `sa`, `mr` and `ne`.
#[must_use]
pub fn devanagari() -> Script {
    let unicode_ranges = vec![(0x0900, 0x097F), (0xA8E0, 0xA8FF)];
    let languages: Vec<Cow<'static, str>> = ["hi", "sa", "mr", "ne"]
        .iter()
        .copied()
        .map(Cow::Borrowed)
        .collect();

    Script {
        code: Cow::from("Deva"),
        name: Cow::from("Devanagari"),
        script_type: ScriptType::Abugida,
        direction: Direction::LeftToRight,
        status: ScriptStatus::Living,
        attestation: None,
        unicode_ranges,
        languages,
    }
}
/// Builds the descriptor for CJK ideographs (`"Hani"`).
///
/// The result is a living logographic script recorded as left-to-right, with
/// no attestation period. It spans U+4E00–U+9FFF, U+3400–U+4DBF and
/// U+20000–U+2A6DF (in that order), and lists the language tags `zh`, `ja`
/// and `ko`.
#[must_use]
pub fn cjk() -> Script {
    let unicode_ranges = vec![
        (0x4E00, 0x9FFF),
        (0x3400, 0x4DBF),
        (0x20000, 0x2A6DF),
    ];
    let languages: Vec<Cow<'static, str>> = ["zh", "ja", "ko"]
        .iter()
        .copied()
        .map(Cow::Borrowed)
        .collect();

    Script {
        code: Cow::from("Hani"),
        name: Cow::from("CJK Unified Ideographs"),
        script_type: ScriptType::Logographic,
        direction: Direction::LeftToRight,
        status: ScriptStatus::Living,
        attestation: None,
        unicode_ranges,
        languages,
    }
}
/// Builds the descriptor for the Cyrillic script (`"Cyrl"`).
///
/// The result is a left-to-right, living alphabet with no attestation period.
/// It spans U+0400–U+04FF and U+0500–U+052F, and lists the language tags
/// `ru`, `uk`, `bg` and `sr`.
#[must_use]
pub fn cyrillic() -> Script {
    let unicode_ranges = vec![(0x0400, 0x04FF), (0x0500, 0x052F)];
    let languages: Vec<Cow<'static, str>> = ["ru", "uk", "bg", "sr"]
        .iter()
        .copied()
        .map(Cow::Borrowed)
        .collect();

    Script {
        code: Cow::from("Cyrl"),
        name: Cow::from("Cyrillic"),
        script_type: ScriptType::Alphabet,
        direction: Direction::LeftToRight,
        status: ScriptStatus::Living,
        attestation: None,
        unicode_ranges,
        languages,
    }
}
/// Builds the descriptor for the Hangul script (`"Hang"`).
///
/// The result is classified as a left-to-right, living alphabet with no
/// attestation period. It spans U+AC00–U+D7AF, U+1100–U+11FF and
/// U+3130–U+318F (in that order), and lists the single language tag `ko`.
#[must_use]
pub fn hangul() -> Script {
    let unicode_ranges = vec![
        (0xAC00, 0xD7AF),
        (0x1100, 0x11FF),
        (0x3130, 0x318F),
    ];

    Script {
        code: Cow::from("Hang"),
        name: Cow::from("Hangul"),
        script_type: ScriptType::Alphabet,
        direction: Direction::LeftToRight,
        status: ScriptStatus::Living,
        attestation: None,
        unicode_ranges,
        languages: vec![Cow::from("ko")],
    }
}
/// Builds the descriptor registered under the tag `"Kana"`, named
/// "Kana (Hiragana + Katakana)".
///
/// The result is a left-to-right, living syllabary with no attestation period.
/// It spans U+3040–U+309F, U+30A0–U+30FF and U+31F0–U+31FF, and lists the
/// single language tag `ja`.
#[must_use]
pub fn kana() -> Script {
    let unicode_ranges = vec![
        (0x3040, 0x309F),
        (0x30A0, 0x30FF),
        (0x31F0, 0x31FF),
    ];

    Script {
        code: Cow::from("Kana"),
        name: Cow::from("Kana (Hiragana + Katakana)"),
        script_type: ScriptType::Syllabary,
        direction: Direction::LeftToRight,
        status: ScriptStatus::Living,
        attestation: None,
        unicode_ranges,
        languages: vec![Cow::from("ja")],
    }
}
/// Builds the descriptor for the Greek script (`"Grek"`).
///
/// The result is a left-to-right, living alphabet with no attestation period.
/// It spans U+0370–U+03FF and U+1F00–U+1FFF, and lists the single language
/// tag `el`.
#[must_use]
pub fn greek() -> Script {
    let unicode_ranges = vec![(0x0370, 0x03FF), (0x1F00, 0x1FFF)];

    Script {
        code: Cow::from("Grek"),
        name: Cow::from("Greek"),
        script_type: ScriptType::Alphabet,
        direction: Direction::LeftToRight,
        status: ScriptStatus::Living,
        attestation: None,
        unicode_ranges,
        languages: vec![Cow::from("el")],
    }
}
/// Builds the descriptor for the Cuneiform script (`"Xsux"`).
///
/// The result is a historical logographic script recorded as left-to-right,
/// attested "3400 BCE – 75 CE", with the language tags `sux` and `akk`.
///
/// The first code-point range covers the whole Cuneiform block through
/// U+123FF, so signs above U+1237F (for example U+12399) are recognised.
#[must_use]
pub fn cuneiform() -> Script {
    Script {
        code: Cow::Borrowed("Xsux"),
        name: Cow::Borrowed("Cuneiform"),
        script_type: ScriptType::Logographic,
        direction: Direction::LeftToRight,
        status: ScriptStatus::Historical,
        attestation: Some(Cow::Borrowed("3400 BCE – 75 CE")),
        unicode_ranges: vec![
            (0x12000, 0x123FF), // Cuneiform (full block)
            (0x12400, 0x1247F), // Cuneiform Numbers and Punctuation
            (0x12480, 0x1254F), // Early Dynastic Cuneiform
        ],
        languages: vec![Cow::Borrowed("sux"), Cow::Borrowed("akk")],
    }
}
/// Builds the descriptor for Egyptian hieroglyphs (`"Egyp"`).
///
/// The result is a historical logographic script recorded as right-to-left,
/// attested "3200 BCE – 400 CE". It spans U+13000–U+1342F and
/// U+13430–U+1345F, and lists the single language tag `egy`.
#[must_use]
pub fn egyptian() -> Script {
    let unicode_ranges = vec![(0x13000, 0x1342F), (0x13430, 0x1345F)];
    let attestation = Some(Cow::from("3200 BCE – 400 CE"));

    Script {
        code: Cow::from("Egyp"),
        name: Cow::from("Egyptian Hieroglyphs"),
        script_type: ScriptType::Logographic,
        direction: Direction::RightToLeft,
        status: ScriptStatus::Historical,
        attestation,
        unicode_ranges,
        languages: vec![Cow::from("egy")],
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Latin: identity, classification, and one member / one non-member code point.
    #[test]
    fn test_latin_script() {
        let latn = latin();
        assert_eq!(latn.code, "Latn");
        assert_eq!(latn.script_type, ScriptType::Alphabet);
        assert_eq!(latn.direction, Direction::LeftToRight);
        assert!(latn.contains_codepoint(0x0041));
        assert!(!latn.contains_codepoint(0x0600));
    }

    // Arabic: identity, classification, direction, and one member code point.
    #[test]
    fn test_arabic_script() {
        let arab = arabic();
        assert_eq!(arab.code, "Arab");
        assert_eq!(arab.script_type, ScriptType::Abjad);
        assert_eq!(arab.direction, Direction::RightToLeft);
        assert!(arab.contains_codepoint(0x0627));
    }

    // Devanagari: identity, classification, and the `sa` language tag.
    #[test]
    fn test_devanagari_script() {
        let deva = devanagari();
        assert_eq!(deva.code, "Deva");
        assert_eq!(deva.script_type, ScriptType::Abugida);
        assert!(deva.languages.iter().any(|lang| lang == "sa"));
    }

    // CJK: identity, classification, and the first code point of its first range.
    #[test]
    fn test_cjk_script() {
        let hani = cjk();
        assert_eq!(hani.code, "Hani");
        assert_eq!(hani.script_type, ScriptType::Logographic);
        assert!(hani.contains_codepoint(0x4E00));
    }

    // Cyrillic: identity, classification, and one member code point.
    #[test]
    fn test_cyrillic_script() {
        let cyrl = cyrillic();
        assert_eq!(cyrl.code, "Cyrl");
        assert_eq!(cyrl.script_type, ScriptType::Alphabet);
        assert!(cyrl.contains_codepoint(0x0410));
    }

    // Hangul: identity and the first code point of its first range.
    #[test]
    fn test_hangul_script() {
        let hang = hangul();
        assert_eq!(hang.code, "Hang");
        assert!(hang.contains_codepoint(0xAC00));
    }

    // Kana: identity, classification, and one code point from each of the first two ranges.
    #[test]
    fn test_kana_script() {
        let kana_script = kana();
        assert_eq!(kana_script.code, "Kana");
        assert_eq!(kana_script.script_type, ScriptType::Syllabary);
        assert!(kana_script.contains_codepoint(0x3042));
        assert!(kana_script.contains_codepoint(0x30A2));
    }

    // Greek: identity, classification, and two member code points.
    #[test]
    fn test_greek_script() {
        let grek = greek();
        assert_eq!(grek.code, "Grek");
        assert_eq!(grek.script_type, ScriptType::Alphabet);
        assert!(grek.contains_codepoint(0x03B1));
        assert!(grek.contains_codepoint(0x03C0));
    }

    // Known tags resolve; an unknown tag does not.
    #[test]
    fn test_by_code_lookup() {
        assert!(by_code("Latn").is_some());
        assert!(by_code("Grek").is_some());
        assert!(by_code("XXXX").is_none());
    }

    // Every tag advertised by `all_codes` must be resolvable through `by_code`.
    #[test]
    fn test_all_codes_match_by_code() {
        for code in all_codes() {
            assert!(by_code(code).is_some(), "by_code failed for {code}");
        }
    }

    // Both ends of a range are members; the code points just outside are not.
    #[test]
    fn test_contains_codepoint_boundary() {
        let latn = latin();
        assert!(latn.contains_codepoint(0x0041));
        assert!(latn.contains_codepoint(0x005A));
        assert!(!latn.contains_codepoint(0x0040));
        assert!(!latn.contains_codepoint(0x005B));
    }

    // Cuneiform: identity, classification, first code point, and the `sux` language tag.
    #[test]
    fn test_cuneiform_script() {
        let xsux = cuneiform();
        assert_eq!(xsux.code, "Xsux");
        assert_eq!(xsux.script_type, ScriptType::Logographic);
        assert!(xsux.contains_codepoint(0x12000));
        assert!(xsux.languages.iter().any(|lang| lang == "sux"));
    }

    // Egyptian hieroglyphs: identity, classification, and first code point.
    #[test]
    fn test_egyptian_script() {
        let egyp = egyptian();
        assert_eq!(egyp.code, "Egyp");
        assert_eq!(egyp.script_type, ScriptType::Logographic);
        assert!(egyp.contains_codepoint(0x13000));
    }
}