use std::collections::BTreeSet;
use std::ffi::OsStr;
use chrono::DateTime;
use chrono::Utc;
use deunicode::deunicode;
use once_cell::sync::Lazy;
use regex::Regex;
use serde::Serialize;
use super::result::Result;
use crate::render::engine::RenderEngine;
/// Matches a single `#tag`: a `#`, then an ASCII letter, then one or more characters that are
/// neither whitespace nor `#`, optionally followed by one trailing whitespace character.
///
/// Because a letter plus at least one more character is required, `#1` and a bare `#a` do not
/// match.
static RE_TAG: Lazy<Regex> = Lazy::new(|| Regex::new(r"#[a-zA-Z][^\s#]+\s?").unwrap());
/// Matches a run of three or more consecutive newline characters.
static RE_BLOCKS: Lazy<Regex> = Lazy::new(|| Regex::new(r"\n{3,}").unwrap());
/// Returns a copy of `string` with every character that occurs in `chars` removed.
///
/// # Arguments
///
/// * `string` - The string to strip characters from.
/// * `chars` - The set of characters to remove, given as a string.
#[must_use]
pub fn strip(string: &str, chars: &str) -> String {
    string
        .chars()
        .filter(|&candidate| !chars.contains(candidate))
        .collect()
}
/// Returns a sanitized copy of `string`.
///
/// Newlines (`\n`), carriage returns (`\r`) and NUL characters (`\0`) are dropped, while `/` and
/// `:` are each replaced with `_`. A warning is logged whenever the result differs from the
/// input.
///
/// # Arguments
///
/// * `string` - The string to sanitize.
#[must_use]
pub fn sanitize(string: &str) -> String {
    let cleaned: String = string
        .chars()
        .filter_map(|c| match c {
            '\n' | '\r' | '\0' => None,
            '/' | ':' => Some('_'),
            other => Some(other),
        })
        .collect();

    // Round-trip through `OsStr`, exactly as the original implementation did.
    let cleaned = OsStr::new(&cleaned).to_string_lossy().into_owned();

    if cleaned != string {
        log::warn!("the string '{}' contained invalid characters", string);
    }

    cleaned
}
/// Converts `string` into a dash-separated slug.
///
/// ASCII letters and digits are kept; every other character acts as a separator, and
/// consecutive separators collapse into a single `-`. Leading and trailing dashes are never
/// emitted. Non-ASCII characters are first transliterated with `deunicode::deunicode_char`
/// (falling back to `-` when no transliteration exists) and the resulting bytes go through the
/// same rules.
///
/// # Arguments
///
/// * `string` - The string to slugify.
/// * `lowercase` - When `true`, ASCII uppercase letters are lowercased; otherwise the original
///   casing is preserved.
#[must_use]
pub fn to_slug(string: &str, lowercase: bool) -> String {
    let mut slug = String::with_capacity(string.len());
    // Starts out `true` so that separators at the very beginning are swallowed.
    let mut last_was_dash = true;

    let mut emit = |byte: u8| {
        if byte.is_ascii_alphanumeric() {
            last_was_dash = false;
            let byte = if lowercase {
                byte.to_ascii_lowercase()
            } else {
                byte
            };
            slug.push(char::from(byte));
        } else if !last_was_dash {
            slug.push('-');
            last_was_dash = true;
        }
    };

    for ch in string.chars() {
        if ch.is_ascii() {
            emit(ch as u8);
        } else {
            for byte in deunicode::deunicode_char(ch).unwrap_or("-").bytes() {
                emit(byte);
            }
        }
    }

    // At most one trailing dash can exist because separators are collapsed above.
    if slug.ends_with('-') {
        slug.pop();
    }

    slug.shrink_to_fit();
    slug
}
/// Formats `date` as a slug-friendly string using `crate::defaults::DATE_FORMAT_SLUG`.
///
/// # Arguments
///
/// * `date` - The UTC timestamp to format.
#[must_use]
pub fn to_slug_date(date: &DateTime<Utc>) -> String {
    let formatted = date.format(crate::defaults::DATE_FORMAT_SLUG);
    formatted.to_string()
}
/// Renders a one-off template string with `context` and sanitizes the rendered output.
///
/// # Arguments
///
/// * `template` - The template source to render.
/// * `context` - The serializable context handed to the template.
///
/// # Errors
///
/// Returns an error if the default `RenderEngine` fails to render the template.
pub fn render_and_sanitize<C>(template: &str, context: C) -> Result<String>
where
    C: Serialize,
{
    let engine = RenderEngine::default();
    let rendered = engine.render_str(template, context)?;
    let sanitized = sanitize(&rendered);

    Ok(sanitized)
}
/// Joins `file_stem` and `extension` with a `.` and sanitizes the resulting filename.
///
/// # Arguments
///
/// * `file_stem` - The filename without its extension.
/// * `extension` - The file extension, without a leading dot.
#[must_use]
pub fn build_filename_and_sanitize(file_stem: &str, extension: &str) -> String {
    let filename = [file_stem, extension].join(".");
    sanitize(&filename)
}
/// Normalizes the whitespace of a multi-line string.
///
/// Every line is trimmed of leading and trailing whitespace, lines that are empty after
/// trimming are dropped, and the remaining lines are joined with exactly one blank line
/// (`"\n\n"`) between them.
///
/// # Arguments
///
/// * `string` - The string to normalize.
#[must_use]
pub fn normalize_whitespace(string: &str) -> String {
    string
        .lines()
        .map(str::trim)
        // Filter *after* trimming: otherwise whitespace-only lines would survive as empty
        // strings and show up as extra blank lines in the output.
        .filter(|line| !line.is_empty())
        .collect::<Vec<_>>()
        .join("\n\n")
}
/// Collects every `#tag` found in `string` into a sorted, de-duplicated set.
///
/// Each `RE_TAG` match is trimmed of surrounding whitespace before being stored, so the set
/// contains entries such as `"#tag01"`.
///
/// # Arguments
///
/// * `string` - The string to extract tags from.
#[must_use]
pub fn extract_tags(string: &str) -> BTreeSet<String> {
    // `BTreeSet` orders and de-duplicates its elements on insertion, so collecting straight
    // into it makes an intermediate `Vec` and an explicit sort unnecessary.
    RE_TAG
        .find_iter(string)
        .map(|tag| tag.as_str().trim().to_owned())
        .collect()
}
/// Returns `string` with every `RE_TAG` match removed and the result trimmed of leading and
/// trailing whitespace.
///
/// # Arguments
///
/// * `string` - The string to remove tags from.
#[must_use]
pub fn remove_tags(string: &str) -> String {
    let without_tags = RE_TAG.replace_all(string, "");
    without_tags.trim().to_owned()
}
/// Converts `string` to an ASCII representation.
///
/// Thin wrapper that passes the whole string to the `deunicode` function of the `deunicode`
/// crate and returns its result unchanged.
///
/// # Arguments
///
/// * `string` - The string to convert.
#[must_use]
pub fn convert_all_to_ascii(string: &str) -> String {
deunicode(string)
}
/// Replaces every symbol listed in `crate::defaults::UNICODE_TO_ASCII_SYMBOLS` with its paired
/// replacement.
///
/// The `(from, to)` pairs are applied one after another, in the table's iteration order, each
/// replacing all occurrences of `from` in the string produced so far.
///
/// # Arguments
///
/// * `string` - The string to convert.
#[must_use]
pub fn convert_symbols_to_ascii(string: &str) -> String {
    (&*crate::defaults::UNICODE_TO_ASCII_SYMBOLS)
        .into_iter()
        .fold(string.to_owned(), |converted, (from, to)| {
            converted.replace(*from, to)
        })
}
/// Collapses runs of three or more newlines into exactly two and makes the string end with a
/// single trailing newline.
///
/// Any trailing whitespace is removed before the final `\n` is appended.
///
/// # Arguments
///
/// * `string` - The string to trim.
#[must_use]
pub fn trim_blocks(string: &str) -> String {
    let collapsed = RE_BLOCKS.replace_all(string, "\n\n");
    format!("{}\n", collapsed.trim_end())
}
// Unit tests for the string helpers defined in the parent module.
#[cfg(test)]
mod test {
use super::*;
// `strip` removes every listed character (spaces, punctuation, newlines, dashes) and leaves
// all other characters, including non-ASCII ones, untouched.
#[test]
fn strip() {
assert_eq!(
super::strip("Lorem ipsum. Aedipisicing culpa!?", " .!?"),
"LoremipsumAedipisicingculpa"
);
assert_eq!(
super::strip("Lorem ipsum.\n Aedipisicing culpa!?", " .!?\n"),
"LoremipsumAedipisicingculpa"
);
assert_eq!(
super::strip("--Lorem--ipsum. Aedipisicing -culpa-", " .-"),
"LoremipsumAedipisicingculpa"
);
assert_eq!(
super::strip("Lorem & Ipsúm. Ædipisicing culpa!?", " &.!?"),
"LoremIpsúmÆdipisicingculpa"
);
}
// Slugs generated with `lowercase = true`: separators collapse into single dashes, leading
// and trailing dashes are dropped, and non-ASCII letters are transliterated.
#[test]
fn slugify_original() {
assert_eq!(
super::to_slug("Lorem ipsum. Aedipisicing culpa!?", true),
"lorem-ipsum-aedipisicing-culpa"
);
assert_eq!(
super::to_slug("Lorem ipsum.\n Aedipisicing culpa!?", true),
"lorem-ipsum-aedipisicing-culpa"
);
assert_eq!(
super::to_slug("--Lorem--ipsum. Aedipisicing -culpa-", true),
"lorem-ipsum-aedipisicing-culpa"
);
assert_eq!(
super::to_slug("Lorem & Ipsúm. Ædipisicing culpa!?", true),
"lorem-ipsum-aedipisicing-culpa"
);
}
// NOTE(review): despite its name, this test passes `lowercase = false` and asserts that the
// original casing is preserved. Consider renaming it.
#[test]
fn slugify_with_lowercase() {
assert_eq!(
super::to_slug("Lorem ipsum. Aedipisicing culpa!?", false),
"Lorem-ipsum-Aedipisicing-culpa"
);
assert_eq!(
super::to_slug("Lorem ipsum.\n Aedipisicing culpa!?", false),
"Lorem-ipsum-Aedipisicing-culpa"
);
assert_eq!(
super::to_slug("--Lorem--ipsum. Aedipisicing -culpa-", false),
"Lorem-ipsum-Aedipisicing-culpa"
);
assert_eq!(
super::to_slug("Lorem & Ipsúm. Ædipisicing culpa!?", false),
"Lorem-Ipsum-AEdipisicing-culpa"
);
}
// Generates one `#[test]` function per entry. Each entry has the shape
// `name: (input, expected text after tag removal, expected tags)` and checks both
// `extract_tags` and `remove_tags` against the same input.
macro_rules! remove_and_extract_tags {
($($name:ident: ($input:tt, $tags_removed_expected:tt, $tags_expected:tt),)*) => {
$(
#[test]
fn $name() {
let tags_extracted = super::extract_tags($input);
let tags_expected: BTreeSet<String> = $tags_expected
.into_iter()
.map(|t: &str| t.to_string())
.collect();
let tags_removed = super::remove_tags($input);
assert_eq!(tags_extracted, tags_expected);
assert_eq!(tags_removed, $tags_removed_expected.to_string());
}
)*
}
}
remove_and_extract_tags! {
// No tags at all.
process_tags_00: (
"Lorem ipsum.",
"Lorem ipsum.",
[]
),
// Tags at the end of the text.
process_tags_01: (
"Lorem ipsum. #tag01 #tag02",
"Lorem ipsum.",
["#tag01", "#tag02"]
),
// Tags in the middle of the text.
process_tags_02: (
"Lorem ipsum. #tag01 #tag02 Adipisicing culpa.",
"Lorem ipsum. Adipisicing culpa.",
["#tag01", "#tag02"]
),
// Tags at the start of the text.
process_tags_03: (
"#tag01 #tag02 Lorem ipsum. Adipisicing culpa.",
"Lorem ipsum. Adipisicing culpa.",
["#tag01", "#tag02"]
),
// Trailing whitespace after the last tag.
process_tags_04: (
"Lorem ipsum. #tag01 #tag02 ",
"Lorem ipsum.",
["#tag01", "#tag02"]
),
// Tags with no whitespace separating them from the text or each other.
process_tags_05: (
"Lorem ipsum.#tag01#tag02",
"Lorem ipsum.",
["#tag01", "#tag02"]
),
// Tags are case-sensitive and must start with a letter, so `#1` and `#999` stay in the text.
process_tags_06: (
"#tag01 #TAG01 #Tag01 #1 #999",
"#1 #999",
["#tag01", "#TAG01", "#Tag01"]
),
// Input consisting only of tags.
process_tags_07: (
"#tag01 #tag02",
"",
["#tag01", "#tag02"]
),
// Duplicate tags collapse into a single entry.
process_tags_08: (
"#tag01 #tag01 #tag01",
"",
["#tag01"]
),
// Extra `#` characters in front of a tag are not part of it and remain in the text.
process_tags_09: (
"###tag01##tag02",
"###",
["#tag01", "#tag02"]
),
}
}