// autocorrect: false
use regex::Regex;
use std::collections::HashMap;
use super::CJK_RE;
/// Controls whether `format_line` pads a punctuation replacement with a space.
#[derive(Clone)]
enum ReplaceMode {
/// Write the replacement as-is; no space is ever added.
Replace,
/// Write a space before the replacement when the last char already written is alphanumeric.
PrefixSpace,
/// Write a space after the replacement when the next char of the input is alphanumeric.
SuffixSpace,
}
/// Bracket/quote role attached to a `ReplaceRule`.
#[derive(Clone, PartialEq)]
enum CharType {
/// Opening bracket or quote; `format_line` leaves it untouched when it is the last char of a line.
LeftQuote,
/// Closing bracket or quote; set by `ReplaceRule::right_quote`, not inspected by the code visible in this file.
RightQuote,
/// Any other punctuation; the default assigned by `ReplaceRule::new`.
Other,
}
/// Describes how a single punctuation mark is rewritten by `format_line`.
#[derive(Clone)]
struct ReplaceRule {
/// Text written in place of the matched character.
to: &'static str,
/// Whether a space may be added before or after the replacement.
mode: ReplaceMode,
/// Bracket/quote role of the matched character.
char_type: CharType,
}
impl ReplaceRule {
fn new(to: &'static str) -> Self {
Self {
to,
mode: ReplaceMode::Replace,
char_type: CharType::Other,
}
}
fn with_suffix_space(&mut self) -> Self {
self.mode = ReplaceMode::SuffixSpace;
self.clone()
}
fn with_prefix_space(&mut self) -> Self {
self.mode = ReplaceMode::PrefixSpace;
self.clone()
}
fn left_quote(&mut self) -> Self {
self.char_type = CharType::LeftQuote;
self.clone()
}
fn right_quote(&mut self) -> Self {
self.char_type = CharType::RightQuote;
self.clone()
}
}
// Lookup tables and regular expressions shared by the formatters in this file.
// `map!` and `regexp!` are macros defined outside this file; presumably `map!` builds a
// `HashMap` and `regexp!` compiles the formatted pattern into a `Regex` (confirm against
// their definitions).
lazy_static! {
// Per-character replacement table consulted by `format_word`.
// NOTE(review): every key below is displayed identical to its value, which would make the
// table a no-op; presumably the keys are the fullwidth forms. Confirm the file encoding.
static ref CHAR_WIDTH_MAP: HashMap<&'static str, &'static str> = map!(
"a" => "a", "b" => "b", "c" => "c", "d" => "d", "e" => "e", "f" => "f", "g" => "g", "h" => "h", "i" => "i", "j" => "j", "k" => "k", "l" => "l", "m" => "m", "n" => "n", "o" => "o", "p" => "p", "q" => "q", "r" => "r", "s" => "s", "t" => "t", "u" => "u", "v" => "v", "w" => "w", "x" => "x", "y" => "y", "z" => "z", "A" => "A", "B" => "B", "C" => "C", "D" => "D", "E" => "E", "F" => "F", "G" => "G", "H" => "H", "I" => "I", "J" => "J", "K" => "K", "L" => "L", "M" => "M", "N" => "N", "O" => "O", "P" => "P", "Q" => "Q", "R" => "R", "S" => "S", "T" => "T", "U" => "U", "V" => "V", "W" => "W", "X" => "X", "Y" => "Y", "Z" => "Z", "1" => "1", "2" => "2", "3" => "3", "4" => "4", "5" => "5", "6" => "6", "7" => "7", "8" => "8", "9" => "9", "0" => "0", " " => " ",
);
// A colon sitting directly between two digits (e.g. the separator in a time); used by `format_word`.
static ref HALF_TIME_RE: Regex = regexp!("{}", r"(\d)(:)(\d)");
// At least two runs of word characters separated by spaces or basic punctuation.
static ref ENGLISH_RE: Regex = regexp!("{}", r#"([\w]+[ ,.'?!&:]+[\w]+)"#);
// Text whose first non-whitespace content is a run of word characters.
static ref START_WITH_WORD_RE: Regex = regexp!("{}", r#"^\s*[\w]+"#);
// Text that, ignoring surrounding whitespace, starts and ends with `"`, `'` or a backtick
// and has at least one character in between.
static ref QUOTE_RE: Regex = regexp!("{}", r#"^\s*(["'`]).+(["'`])\s*$"#);
// Two or more consecutive ASCII letters.
static ref WORD_RE: Regex = regexp!("{}", r#"[a-zA-Z]{2,}"#);
// Code-like content: `#{xxx}`, `%{xxx}` or `${xxx}` interpolation, or a `name.name(` call such as `i18n.t(`.
static ref CODE_STRING_RE: Regex = regexp!("{}", r#"([#%$]\{.+\})|([\w]+\.[\w]+\()"#);
// Punctuation rewrite rules consulted by `format_line`, keyed by the character to replace.
static ref PUNCTUATION_MAP: HashMap<&'static str, ReplaceRule> = map!(
// The single (‘...’) and double (“...”) quotation marks are used in English typography,
// so they have no entry here and are left untouched.
// Option + [ and Shift + Option + [ to get “”
// Option + ] and Shift + Option + ] to get ‘’
// https://en.wikipedia.org/wiki/Quotation_marks_in_English
// Sentence punctuation: a space is added after the replacement when an alphanumeric char follows.
"," => ReplaceRule::new(",").with_suffix_space(),
"、" => ReplaceRule::new(",").with_suffix_space(),
"。" => ReplaceRule::new(".").with_suffix_space(),
":" => ReplaceRule::new(":").with_suffix_space(),
// NOTE(review): this entry writes "." for a semicolon; confirm that is intended.
";" => ReplaceRule::new(".").with_suffix_space(),
"!" => ReplaceRule::new("!").with_suffix_space(),
"?" => ReplaceRule::new("?").with_suffix_space(),
// Opening brackets: a space is added before the replacement when an alphanumeric char precedes it.
"(" => ReplaceRule::new("(").left_quote().with_prefix_space(),
"【" => ReplaceRule::new("[").left_quote().with_prefix_space(),
"「" => ReplaceRule::new("[").left_quote().with_prefix_space(),
"《" => ReplaceRule::new("“").left_quote().with_prefix_space(),
// Closing brackets: a space is added after the replacement when an alphanumeric char follows.
")" => ReplaceRule::new(")").right_quote().with_suffix_space(),
"】" => ReplaceRule::new("]").right_quote().with_suffix_space(),
"」" => ReplaceRule::new("]").right_quote().with_suffix_space(),
"》" => ReplaceRule::new("”").right_quote().with_suffix_space(),
);
}
/// Extra character-class predicates used by the formatters in this file.
trait CharMatching {
    /// True for a-z, A-Z, 0-9 and every ASCII punctuation character.
    fn is_ascii_alphanumeric_punctuation(&self) -> bool;
    /// True for ASCII letters and digits, the space character and the tab character.
    fn is_alphanumeric_or_space(&self) -> bool;
}

impl CharMatching for char {
    /// Match is a-z, A-Z, 0-9, all ASCII punctuations
    fn is_ascii_alphanumeric_punctuation(&self) -> bool {
        // ASCII alphanumerics plus ASCII punctuation are exactly the ASCII graphic
        // range U+0021..=U+007E.
        self.is_ascii_graphic()
    }

    /// Match is a-z, A-Z, 0-9, the space and the tab (non-ASCII letters do not match).
    fn is_alphanumeric_or_space(&self) -> bool {
        matches!(*self, ' ' | '\t') || self.is_ascii_alphanumeric()
    }
}
/// Formats the punctuation of `text` one line at a time.
///
/// The first non-whitespace character of the whole text (a space when there is none) is
/// passed to `format_line` as the quote character that may wrap the text. Lines are split
/// with their trailing `'\n'` kept, so the formatted lines concatenate back into one string.
pub fn format_punctuation(text: &str) -> String {
    let wrap_quote = text
        .chars()
        .find(|ch| !ch.is_whitespace())
        .unwrap_or(' ');

    text.split_inclusive('\n')
        .map(|line| format_line(line, wrap_quote))
        .collect()
}
pub fn format_word(text: &str) -> String {
let mut out = String::new();
for part in text.split("") {
if let Some(new_str) = CHAR_WIDTH_MAP.get(part) {
out.push_str(new_str);
continue;
}
out.push_str(part);
}
// Fix 12:00 -> 12:00
out = HALF_TIME_RE
.replace_all(&out, |cap: ®ex::Captures| cap[0].replace(':', ":"))
.to_string();
out
}
/// Guesses whether `text` is English-only content whose punctuation may be rewritten.
///
/// Returns `false` as soon as `CJK_RE` matches. Otherwise the text qualifies when it
/// either starts with a word and contains at least two words, or is wrapped in quotes,
/// contains a word and does not look like code.
fn is_may_only_english(text: &str) -> bool {
    if CJK_RE.is_match(text) {
        return false;
    }

    // Several words, the first of which opens the text: most likely English.
    let reads_like_sentence = ENGLISH_RE.is_match(text) && START_WITH_WORD_RE.is_match(text);

    // Quoted text that includes a word also qualifies, unless it carries interpolation
    // or call syntax such as
    // `${this.$t('hello')}:${items.join(',')}`, `%{foo},hello`
    reads_like_sentence
        || (QUOTE_RE.is_match(text)
            && WORD_RE.is_match(text)
            && !CODE_STRING_RE.is_match(text))
}
/// Rewrites the punctuation of a single line according to `PUNCTUATION_MAP`.
///
/// Lines rejected by `is_may_only_english` are returned unchanged. `wrap_quote` is the
/// quote character that may wrap the text; it is forwarded to `escape_quote` for every
/// replacement.
fn format_line(text: &str, wrap_quote: char) -> String {
    if !is_may_only_english(text) {
        return text.to_owned();
    }

    let mut formatted = String::new();
    // Scratch space for looking a single char up in the `&str`-keyed rule table.
    let mut utf8 = [0u8; 4];
    let mut remaining = text.chars().peekable();

    while let Some(current) = remaining.next() {
        let key: &str = current.encode_utf8(&mut utf8);
        let rule = match PUNCTUATION_MAP.get(key) {
            Some(rule) => rule,
            None => {
                // Not a punctuation mark we rewrite: copy it through.
                formatted.push(current);
                continue;
            }
        };

        let upcoming = remaining.peek().copied();

        // Do not change a left quote when it is the last char of the line.
        if rule.char_type == CharType::LeftQuote && upcoming.is_none() {
            formatted.push(current);
            continue;
        }

        let replacement = escape_quote(wrap_quote, rule.to);
        match rule.mode {
            ReplaceMode::Replace => formatted.push_str(&replacement),
            ReplaceMode::PrefixSpace => {
                // An empty output counts as being preceded by a space.
                let previous = formatted.chars().next_back().unwrap_or(' ');
                if previous.is_alphanumeric() {
                    formatted.push(' ');
                }
                formatted.push_str(&replacement);
            }
            ReplaceMode::SuffixSpace => {
                formatted.push_str(&replacement);
                if upcoming.map_or(false, char::is_alphanumeric) {
                    formatted.push(' ');
                }
            }
        }
    }

    formatted
}
/// Returns `quote` as an owned string, prefixed with a backslash when it is a plain
/// `"` or `'` identical to `wrap_quote`.
///
/// Any other `quote` value is returned unchanged.
fn escape_quote(wrap_quote: char, quote: &str) -> String {
    let is_plain_quote = matches!(quote, "\"" | "'");
    let same_as_wrapper = quote.chars().eq(std::iter::once(wrap_quote));

    if is_plain_quote && same_as_wrapper {
        let mut escaped = String::with_capacity(quote.len() + 1);
        escaped.push('\\');
        escaped.push_str(quote);
        escaped
    } else {
        quote.to_owned()
    }
}
// Unit tests for `format_word` and `format_punctuation`.
#[cfg(test)]
mod tests {
use super::*;
// Runs `format_punctuation` on every key of `cases` and asserts that the result equals
// the mapped value. `#[track_caller]` makes a failure point at the calling test.
#[track_caller]
fn assert_cases(cases: HashMap<&str, &str>) {
for (source, exptected) in cases.into_iter() {
let actual = format_punctuation(source);
assert_eq!(exptected, actual);
}
}
// `format_word` on letters, digits, a time inside a sentence, and spaces.
#[test]
fn test_halfwidth_alphabetic_numbers() {
let source = "测试:abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890";
assert_eq!(
"测试:abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890",
format_word(source)
);
assert_eq!(
"他说:我们将在16:32分出发去CBD中心。",
format_word("他说:我们将在16:32分出发去CBD中心。")
);
// Fullwidth space
assert_eq!(
"ジョイフル-後場売り気配 200 店舗を閉鎖へ 7 月以降、不採算店中心に",
format_word("ジョイフル-後場売り気配 200 店舗を閉鎖へ 7 月以降、不採算店中心に")
);
}
// Inputs that `format_punctuation` must return unchanged.
#[test]
fn test_halfwidth_punctuation_ignores() {
let cases = map! [
"。" => "。",
"," => ",",
"SHA1。" => "SHA1。",
"a。" => "a。",
"foo-bar-dar。" => "foo-bar-dar。",
"hello)。" => "hello)。",
"说:你好 english。" => "说:你好 english。",
"‘腾讯’ - 发布 - ‘新版’本微信" => "‘腾讯’ - 发布 - ‘新版’本微信",
"${item.name}(ID ${item.id})" => "${item.name}(ID ${item.id})",
"{{ t('name') }}:{{ item.extraKeys.join(' | ') }}" => "{{ t('name') }}:{{ item.extraKeys.join(' | ') }}",
"The Exchange’s" => "The Exchange’s",
"It's revenue \"conditions\" among the suppliers’ “customers”" => "It's revenue \"conditions\" among the suppliers’ “customers”",
];
assert_cases(cases);
}
// Punctuation rewriting and space insertion on English lines; single words, lines that
// contain CJK text and lines that do not start with a word are expected to stay unchanged.
#[test]
fn test_halfwidth_punctuation() {
let cases = map! [
"hello。" => "hello。",
"hello 你好。" => "hello 你好。",
"中文1\nhello world。\n中文2" => "中文1\nhello world.\n中文2",
" \n Said:Come and,Join us! \n " => " \n Said: Come and, Join us! \n ",
"Said:Come and,Join us!" => "Said: Come and, Join us!",
"_(HTML5 Rocks)_" => "_(HTML5 Rocks)_",
" Start with space next word?Join us?" => " Start with space next word? Join us?",
", Not start with word will not change。" => ", Not start with word will not change。",
":“Not start with word will not change”" => ":“Not start with word will not change”",
"Come and, Join us!" => "Come and, Join us!",
"The microphone or camera is occupied,Please check and re-record the video。" => "The microphone or camera is occupied, Please check and re-record the video.",
"The “Convertible Amount” case。" => r#"The “Convertible Amount” case."#,
"The“Convertible Amount”case。" => r#"The“Convertible Amount”case."#,
"The(Convertible Amount)case!" => r#"The (Convertible Amount) case!"#,
"The【Convertible Amount】case?" => "The [Convertible Amount] case?",
"The「Convertible Amount」case:" => "The [Convertible Amount] case:",
"The《Convertible Amount》case," => r#"The “Convertible Amount” case,"#,
"Reason: CORS header ‘Origin’ cannot be added" => "Reason: CORS header ‘Origin’ cannot be added",
];
assert_cases(cases);
}
// An opening bracket or quote that ends the text must be left as it is.
#[test]
fn test_ignore_left_quote_in_last() {
let cases = map! [
"Escher puzzle (" => "Escher puzzle (",
"Escher puzzle【" => "Escher puzzle【",
"Escher puzzle《" => "Escher puzzle《",
"Escher puzzle“" => "Escher puzzle“",
"Escher puzzle‘" => "Escher puzzle‘",
"Escher puzzle「" => "Escher puzzle「",
];
assert_cases(cases);
}
// Text wrapped in quotes: English content is rewritten, while single characters, single
// letters and code-like strings (interpolation, method calls) are expected to stay unchanged.
#[test]
fn test_halfwidth_punctuation_with_in_quote() {
let cases = map! [
r#"",""# => r#"",""#,
r#""。""# => r#""。""#,
r#""a。""# => r#""a。""#,
r#""Hi!""# => r#""Hi!""#,
r#""hello-world。""# => r#""hello-world.""#,
r#"'hello “world”。'"# => r#"'hello “world”.'"#,
r#""hello “world”。""# => r#""hello “world”.""#,
r#""hello ‘world’。""# => r#""hello ‘world’.""#,
r#"'hello ‘world’。'"# => r#"'hello ‘world’.'"#,
r#""Only the first time break。""# => r#""Only the first time break.""#,
r#"'Only the first time break?'"# => r#"'Only the first time break?'"#,
r#"`Only the first time break!`"# => r#"`Only the first time break!`"#,
r#"`${this.$t('hello')}:${items.join(',')}`"# => r#"`${this.$t('hello')}:${items.join(',')}`"#,
r#"`${t('hello')}:${user.name}`"# => r#"`${t('hello')}:${user.name}`"#,
r##""#{vars.join(",")}""## => r##""#{vars.join(",")}""##
];
assert_cases(cases);
}
}