use regex::Regex;
use std::borrow::Cow;
use crate::common::is_whitespace;
/// Collapses runs of whitespace so that only the first character of each run
/// is kept.
///
/// The input is first searched with `REGEX` for two adjacent characters from
/// its character class. When there is no such pair the input is handed back
/// untouched, without allocating. Otherwise the text before the match is
/// copied verbatim and, from the match onwards, every character for which
/// `is_whitespace` is true is dropped if the character just before it was
/// whitespace too. Whitespace at the ends is collapsed, not trimmed.
pub fn whitespaces<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    lazy_static! {
        static ref REGEX: Regex = Regex::new(r"[ \x{202F}\x{2002}]{2,}?").unwrap();
    }

    let input = input.into();
    let match_start = REGEX.find(&input).map(|found| found.start());
    let start = match match_start {
        Some(offset) => offset,
        // Nothing to collapse: give the caller back exactly what they passed.
        None => return input,
    };

    let (head, tail) = input.split_at(start);
    let mut collapsed = String::with_capacity(input.len());
    collapsed.push_str(head);

    // `in_run` is true while the previously seen character was whitespace.
    let mut in_run = false;
    for ch in tail.chars() {
        let blank = is_whitespace(ch);
        if !(blank && in_run) {
            collapsed.push(ch);
        }
        in_run = blank;
    }
    Cow::Owned(collapsed)
}
/// Coarse category of a character, as produced by `char_class`.
///
/// The derived ordering follows the discriminants, so
/// `Whitespace < Punctuation < Alphanumeric`. `quotes` compares the classes
/// of the characters around a quote mark with `<` / `>` to decide whether the
/// mark opens or closes.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd)]
enum CharClass {
    /// Lowest rank.
    Whitespace = 0,
    /// Middle rank.
    Punctuation = 1,
    /// Highest rank.
    Alphanumeric = 2,
}
/// Sorts a character into one of the three `CharClass` buckets.
///
/// Alphanumeric is tested first, then whitespace; every other character is
/// reported as punctuation.
fn char_class(c: char) -> CharClass {
    if c.is_alphanumeric() {
        return CharClass::Alphanumeric;
    }
    if c.is_whitespace() {
        return CharClass::Whitespace;
    }
    CharClass::Punctuation
}
/// Replaces ASCII ellipses with their typographic form.
///
/// * `...` becomes the single character `…`.
/// * `. . . ` (three dots, each followed by a space) is replaced by
///   `FULL_NB_ELLIPSIS` when another `.` follows immediately, and by
///   `NB_ELLIPSIS` otherwise.
///
/// When neither pattern occurs the input is returned as-is, without
/// allocating. Otherwise a new `String` is built and returned as
/// `Cow::Owned`.
pub fn ellipsis<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    const DOTS: &'static str = "...";
    const SPACED_DOTS: &'static str = ". . . ";
    const UNICODE_ELLIPSIS: &'static str = "…";
    // NOTE(review): as written, both replacements below are the plain-space
    // text `. . . `, i.e. identical to each other and to `SPACED_DOTS`, so the
    // spaced branch re-emits its input unchanged. The names suggest
    // non-breaking spaces were intended — confirm before changing.
    const NB_ELLIPSIS: &'static str = ". . . ";
    const FULL_NB_ELLIPSIS: &'static str = ". . . ";

    let input = input.into();

    // Leftmost occurrence of either pattern; bail out early if there is none.
    let first = match (input.find(DOTS), input.find(SPACED_DOTS)) {
        (Some(a), Some(b)) => {
            if a < b {
                a
            } else {
                b
            }
        }
        (Some(a), None) | (None, Some(a)) => a,
        (None, None) => return input,
    };

    let mut output = String::with_capacity(input.len());
    output.push_str(&input[..first]);

    // Both patterns start with '.', so hop from dot to dot and copy the text
    // in between verbatim.
    let mut rest = &input[first..];
    while let Some(dot) = rest.find('.') {
        output.push_str(&rest[..dot]);
        rest = &rest[dot..];
        if rest.starts_with(DOTS) {
            output.push_str(UNICODE_ELLIPSIS);
            rest = &rest[DOTS.len()..];
        } else if rest.starts_with(SPACED_DOTS) {
            rest = &rest[SPACED_DOTS.len()..];
            // A fourth dot right after the pattern selects the "full" variant.
            if rest.starts_with('.') {
                output.push_str(FULL_NB_ELLIPSIS);
            } else {
                output.push_str(NB_ELLIPSIS);
            }
        } else {
            // A lone dot: keep it and move on.
            output.push('.');
            rest = &rest[1..];
        }
    }
    output.push_str(rest);
    Cow::Owned(output)
}
pub fn quotes<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
lazy_static! {
static ref REGEX: Regex = Regex::new("[\"\']").unwrap();
}
let input = input.into();
let first = REGEX.find(&input)
.map(|mat| mat.start());
if let Some(mut first) = first {
let mut new_s = String::with_capacity(input.len());
if first > 0 {
first -= 1;
while !input.is_char_boundary(first) {
first -= 1;
}
}
new_s.push_str(&input[0..first]);
let mut chars = input[first..].chars().collect::<Vec<_>>();
let mut closing_quote = None;
let mut opened_doubles = 0;
for i in 0..chars.len() {
let c = chars[i];
let has_opened_quote = if let Some(n) = closing_quote {
i <= n
} else {
false
};
match c {
'"' => {
let prev = if i > 0 {
char_class(chars[i - 1])
} else {
CharClass::Whitespace
};
let next = if i < chars.len() - 1 {
char_class(chars[i + 1])
} else {
CharClass::Whitespace
};
if prev < next {
opened_doubles += 1;
new_s.push('“');
} else if opened_doubles > 0 {
opened_doubles -= 1;
new_s.push('”');
} else {
new_s.push('"');
}
}
'\'' => {
let prev = if i > 0 {
char_class(chars[i - 1])
} else {
CharClass::Whitespace
};
let next = if i < chars.len() - 1 {
char_class(chars[i + 1])
} else {
CharClass::Whitespace
};
let replacement = match (prev, next) {
(CharClass::Alphanumeric, CharClass::Alphanumeric)
=> '’',
(x, y) if x < y
=> {
let mut is_next_closing = false;
for j in (i + 1)..chars.len() {
if chars[j] == '\'' {
if chars[j-1].is_whitespace() {
continue;
} else if j >= chars.len() - 1
|| char_class(chars[j+1]) != CharClass::Alphanumeric {
is_next_closing = true;
closing_quote = Some(j);
chars[j] = '’';
break;
}
}
}
if is_next_closing && !has_opened_quote {
'‘'
} else {
'’'
}
}
(x, y) if x > y
=> {
'’'
},
_ => '\'',
};
new_s.push(replacement);
}
_ => new_s.push(c),
}
}
Cow::Owned(new_s)
} else {
input
}
}
/// Replaces runs of ASCII hyphens with typographic dashes.
///
/// Scanning left to right, `---` becomes an em dash (`—`) and `--` becomes an
/// en dash (`–`); a single `-` is left alone. Longer runs are consumed
/// greedily three at a time, so `----` yields `—-`.
///
/// When the input contains no `--` it is returned as-is, without allocating.
/// Otherwise a new `String` is built and returned as `Cow::Owned`.
pub fn dashes<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    const DOUBLE_HYPHEN: &'static str = "--";
    const EN_DASH: char = '–';
    const EM_DASH: char = '—';

    let input = input.into();
    let first = match input.find(DOUBLE_HYPHEN) {
        Some(pos) => pos,
        None => return input,
    };

    let mut output = String::with_capacity(input.len());
    output.push_str(&input[..first]);

    let mut rest = &input[first..];
    while let Some(pos) = rest.find(DOUBLE_HYPHEN) {
        // Copy everything up to the hyphens untouched.
        output.push_str(&rest[..pos]);
        let after = &rest[pos + DOUBLE_HYPHEN.len()..];
        if after.starts_with('-') {
            // Third hyphen present: this is an em dash.
            output.push(EM_DASH);
            rest = &after[1..];
        } else {
            output.push(EN_DASH);
            rest = after;
        }
    }
    output.push_str(rest);
    Cow::Owned(output)
}
/// Replaces `<<` with `«` and `>>` with `»`.
///
/// Pairs are matched left to right, so `<<<` yields `«<`. Single `<` or `>`
/// characters are left alone.
///
/// When the input contains neither `<<` nor `>>` it is returned as-is,
/// without allocating. Otherwise a new `String` is built and returned as
/// `Cow::Owned`.
pub fn guillemets<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    const OPENING_GUILLEMET: char = '«';
    const CLOSING_GUILLEMET: char = '»';

    let input = input.into();

    // Byte offset of the first `<<` or `>>`. Both are ASCII, so the offset is
    // always a valid char boundary.
    let found = input
        .as_bytes()
        .windows(2)
        .position(|pair| pair == b"<<" || pair == b">>");
    let first = match found {
        Some(pos) => pos,
        None => return input,
    };

    let mut output = String::with_capacity(input.len());
    output.push_str(&input[..first]);

    let mut chars = input[first..].chars().peekable();
    while let Some(c) = chars.next() {
        let doubled = (c == '<' || c == '>') && chars.peek() == Some(&c);
        if doubled {
            // Consume the second half of the pair.
            chars.next();
            output.push(if c == '<' {
                OPENING_GUILLEMET
            } else {
                CLOSING_GUILLEMET
            });
        } else {
            output.push(c);
        }
    }
    Cow::Owned(output)
}
// Runs of several spaces — leading, inner and trailing — must each shrink to
// a single space, and nothing may be trimmed away entirely.
#[test]
fn whitespaces_1() {
    let expected = " Remove supplementary spaces but don't trim either ";
    // Triple every space so that the input really contains runs to collapse;
    // with single spaces only, the function would just return its input.
    let s = expected.replace(' ', "   ");
    let res = whitespaces(s);
    assert_eq!(&res, expected);
}
// Apostrophes surrounded by spaces on both sides are left untouched.
#[test]
fn quotes_1() {
    let text = "Some string without ' typographic ' quotes";
    assert_eq!(&*quotes(text), text);
}
// A whole word wrapped in double or single quotes gets an opening and a
// closing curly quote.
#[test]
fn quotes_2() {
    let doubled = quotes("\"foo\"");
    let single = quotes("'foo'");
    assert_eq!(&*doubled, "“foo”");
    assert_eq!(&*single, "‘foo’");
}
// A leading apostrophe with no closing quote later on becomes `’`.
#[test]
fn quotes_3() {
    let expected = "’mam, how are you?";
    let actual = quotes("\'mam, how are you?");
    assert_eq!(&*actual, expected);
}
// Single characters in single quotes (letter, digit, symbol) are each
// wrapped in `‘…’`.
#[test]
fn quotes_4() {
    let expected = "some char: ‘c’, ‘4’, ‘&’";
    let actual = quotes("some char: 'c', '4', '&'");
    assert_eq!(&*actual, expected);
}
// An apostrophe inside a word and a quoted word in the same sentence.
#[test]
fn quotes_5() {
    let expected = "It’s a good day to say ‘hi’";
    let actual = quotes("It's a good day to say 'hi'");
    assert_eq!(&*actual, expected);
}
// An apostrophe before digits ('60s) and one inside a word (weren't) both
// become `’`; no opening quote is produced.
#[test]
fn quotes_6() {
    let expected = "The ’60s were nice, weren’t they?";
    let actual = quotes("The '60s were nice, weren't they?");
    assert_eq!(&*actual, expected);
}
// A trailing apostrophe after a word (plural possessive) becomes `’`.
#[test]
fn quotes_7() {
    let expected = "Plurals’ possessive";
    let actual = quotes("Plurals' possessive");
    assert_eq!(&*actual, expected);
}
// Single quotes nested in double quotes, with an apostrophe ('70s) inside
// the single-quoted span.
#[test]
fn quotes_8() {
    let expected = "“I like ‘That ’70s show’”, she said";
    let actual = quotes("\"I like 'That '70s show'\", she said");
    assert_eq!(&*actual, expected);
}
// Punctuation characters in single quotes are each wrapped in `‘…’`.
#[test]
fn quotes_9() {
    let expected = "some char: ‘!’, ‘?’, ‘,’";
    let actual = quotes("some char: '!', '?', ','");
    assert_eq!(&*actual, expected);
}
// Three levels: double quotes, single quotes inside them, and double quotes
// again inside those, plus an apostrophe in "Let's".
#[test]
fn quotes_10() {
    let expected = "“‘Let’s try “nested” quotes,’ he said.”";
    let actual = quotes("\"'Let's try \"nested\" quotes,' he said.\"");
    assert_eq!(&*actual, expected);
}
// An apostrophe directly after a closing double quote becomes `’`.
#[test]
fn quotes_11() {
    let expected = "Enhanced “quotes”’s heuristics";
    let actual = quotes("Enhanced \"quotes\"'s heuristics");
    assert_eq!(&*actual, expected);
}
// A double quote directly after `--` opens; the hyphens are left alone.
#[test]
fn quotes_12() {
    let expected = "A double quote--“within” dashes--would be nice.";
    let actual = quotes("A double quote--\"within\" dashes--would be nice.");
    assert_eq!(&*actual, expected);
}
// Same as the `--` case, but with a real en dash before the double quote.
#[test]
fn quotes_13() {
    let expected = "A double quote–“within” dashes–would be nice.";
    let actual = quotes("A double quote–\"within\" dashes–would be nice.");
    assert_eq!(&*actual, expected);
}
// Three dots at the end of the text become `…`.
#[test]
fn ellipsis_0() {
    let expected = "Foo…";
    let actual = ellipsis("Foo...");
    assert_eq!(&*actual, expected);
}
// Three dots in the middle of the text become `…`; the rest is kept.
#[test]
fn ellipsis_1() {
    let expected = "Foo… Bar";
    let actual = ellipsis("Foo... Bar");
    assert_eq!(&*actual, expected);
}
// Four dots: the first three become `…`, the fourth stays a dot.
#[test]
fn ellipsis_2() {
    let expected = "foo….";
    let actual = ellipsis("foo....");
    assert_eq!(&*actual, expected);
}
// Spaced dots followed by nothing: checked against the literal below.
#[test]
fn ellipsis_3() {
    let expected = "foo. . . ";
    let actual = ellipsis("foo. . . ");
    assert_eq!(&*actual, expected);
}
// Spaced dots followed by a fourth dot: checked against the literal below.
#[test]
fn ellipsis_4() {
    let expected = "foo. . . .";
    let actual = ellipsis("foo. . . .");
    assert_eq!(&*actual, expected);
}
// Two dots are not an ellipsis and must be left alone.
#[test]
fn ellipsis_5() {
    let expected = "foo..";
    let actual = ellipsis("foo..");
    assert_eq!(&*actual, expected);
}
// A single hyphen is left alone.
#[test]
fn dashes_0() {
    let expected = "foo - bar";
    let actual = dashes("foo - bar");
    assert_eq!(&*actual, expected);
}
// Two hyphens become an en dash.
#[test]
fn dashes_1() {
    let expected = "foo – bar";
    let actual = dashes("foo -- bar");
    assert_eq!(&*actual, expected);
}
// Three hyphens become an em dash.
#[test]
fn dashes_2() {
    let expected = "foo — bar";
    let actual = dashes("foo --- bar");
    assert_eq!(&*actual, expected);
}
// An em dash and a trailing en dash in the same text.
#[test]
fn dashes_3() {
    let expected = "foo — bar–";
    let actual = dashes("foo --- bar--");
    assert_eq!(&*actual, expected);
}
// Both an opening and a closing pair are replaced.
#[test]
fn guillemets_1() {
    let expected = "« Foo »";
    let actual = guillemets("<< Foo >>");
    assert_eq!(&*actual, expected);
}
// An opening pair alone is replaced.
#[test]
fn guillemets_2() {
    let expected = "« Foo";
    let actual = guillemets("<< Foo");
    assert_eq!(&*actual, expected);
}
// A closing pair alone is replaced.
#[test]
fn guillemets_3() {
    let expected = "Foo »";
    let actual = guillemets("Foo >>");
    assert_eq!(&*actual, expected);
}
// A lone `<` between two pairs is left alone.
#[test]
fn guillemets_4() {
    let expected = "« Foo < Bar »";
    let actual = guillemets("<< Foo < Bar >>");
    assert_eq!(&*actual, expected);
}