tran_term/
lib.rs

1#![cfg_attr(docsrs, feature(doc_cfg))]
2
/// Optional submodule, compiled only when the `yml` cargo feature is enabled.
#[cfg(feature = "yml")]
pub mod yml;
5
6use std::{
7  borrow::Borrow,
8  collections::{HashMap, HashSet},
9};
10
11use daachorse::{CharwiseDoubleArrayAhoCorasick, CharwiseDoubleArrayAhoCorasickBuilder, MatchKind};
12use htmlize::{escape_attribute, unescape_attribute};
13use tag_replace::{TagReplace, word_push};
14use thiserror::Error;
15use unicode_categories::UnicodeCategories;
16use unicode_segmentation::UnicodeSegmentation;
17
/// Shared [`TagReplace`] instance, constructed with the arguments `"code"` and `"v"`.
///
/// `Term::replace` emits `<code v="…">…</code>` markup and `Term::restore` runs this
/// instance over such text.
/// NOTE(review): presumably `"code"` is the tag name and `"v"` the attribute name that
/// `TagReplace` looks for — confirm against the `tag_replace` crate.
#[static_init::dynamic]
pub static CODE: TagReplace = TagReplace::new("code", "v");
20
/// Error type for the fallible operations in this module (see `Term::load`).
#[derive(Error, Debug)]
pub enum Error {
  /// Wraps an error reported by `daachorse`; `Term::load` produces it when building
  /// the automaton fails.
  #[error("daachorse: {0}")]
  Daachorse(daachorse::errors::DaachorseError),
}
26
/// Returns `s` with its first character upper-cased and the remainder untouched.
///
/// An empty input yields an empty `String`. The first character may expand to
/// several characters when upper-cased (e.g. `ß` becomes `SS`).
fn capitalize_first_letter(s: impl AsRef<str>) -> String {
  let text = s.as_ref();
  let mut rest = text.chars();
  let Some(head) = rest.next() else {
    return String::new();
  };

  let mut out = String::with_capacity(text.len());
  out.extend(head.to_uppercase());
  out.push_str(rest.as_str());
  out
}
34
/// A terminology table together with a matcher for its keys.
pub struct Term {
  /// Lower-cased source term mapped to its replacement text, as filled in by `Term::load`.
  pub map: HashMap<String, String>,
  /// Aho-Corasick automaton built by `Term::load` from the keys of `map`, using
  /// leftmost-longest matching.
  pub ac: CharwiseDoubleArrayAhoCorasick<usize>,
}
39
40impl Term {
41  pub fn load<'a, S1: AsRef<str> + 'a, S2: AsRef<str> + 'a, T: Borrow<(S1, S2)>>(
42    from_to_iter: impl IntoIterator<Item = T>,
43  ) -> Result<Self, Error> {
44    let mut map = HashMap::new();
45
46    let mut exist = HashSet::new();
47
48    for i in from_to_iter {
49      let (key, value) = i.borrow();
50      let lower_key = key.as_ref().to_lowercase();
51      if lower_key.is_empty() || exist.contains(&lower_key) {
52        continue;
53      }
54      exist.insert(lower_key.clone());
55
56      let value = value.as_ref().into();
57      map.insert(lower_key, value);
58    }
59
60    match CharwiseDoubleArrayAhoCorasickBuilder::new()
61      .match_kind(MatchKind::LeftmostLongest)
62      .build(map.keys())
63    {
64      Ok(ac) => Ok(Term { map, ac }),
65      Err(err) => Err(Error::Daachorse(err)),
66    }
67  }
68
69  pub fn restore(&self, txt: impl AsRef<str>) -> String {
70    let txt = txt.as_ref();
71    CODE.replace(txt, |li, _origin, val: &str| {
72      word_push(li, unescape_attribute(val));
73    })
74  }
75
76  pub fn replace(&mut self, txt: impl AsRef<str>) -> Option<String> {
77    let txt = txt.as_ref();
78    let txt_lower = txt.to_lowercase();
79    let mut li = vec![];
80    let mut pos = 0;
81    for mat in self.ac.leftmost_find_iter(&txt_lower) {
82      let start = mat.start();
83      let end = mat.end();
84      let matched = &txt_lower[start..end];
85
86      macro_rules! is_word {
87        ($prev:expr, $next:expr) => {
88          if let Some(last) = $prev.chars().last()
89            && let Some(first) = $next.chars().next()
90          {
91            let t = last.to_string() + &first.to_string();
92            if !t.contains('_') {
93              let mut t = t.split_word_bounds();
94              match (t.next(), t.next()) {
95                (Some(_), None) => continue,
96                _ => {}
97              };
98            }
99          }
100        };
101      }
102
103      if let Some(val) = self.map.get(matched) {
104        let mut val = val.to_owned();
105
106        // 如果的一个单词的内部, 不替换
107        is_word!(&txt[..start], matched);
108        is_word!(matched, &txt[end..]);
109
110        if start > pos {
111          li.push(txt[pos..start].to_owned());
112        }
113
114        let org = &txt[start..end];
115        if let Some(c) = org.chars().next()
116          && (
117            c.is_uppercase() || start == 0
118            // 行首大写
119          )
120        {
121          let pos = c.to_string().len();
122          if org.len() > 1 && org[pos..].chars().all(char::is_uppercase) {
123            val = val.to_uppercase();
124          } else {
125            val = capitalize_first_letter(val);
126          }
127        } else if let Some(last) = txt[..start].trim_end().chars().last()
128          && !"{}()_*[]~".contains(last)
129          && last.is_punctuation()
130        {
131          val = capitalize_first_letter(val)
132        }
133
134        let val = escape_attribute(val);
135        li.push(format!(r#"<code v="{val}">{matched}</code>"#));
136      }
137      pos = end;
138    }
139    if !li.is_empty() {
140      if pos < txt.len() {
141        li.push(txt[pos..].into());
142      }
143      let r = li.concat();
144      return Some(r);
145    }
146    None
147  }
148}
149
150impl PartialEq for Term {
151  fn eq(&self, other: &Self) -> bool {
152    self.map == other.map
153  }
154}