lib/
strings.rs

1//! Defines functions for string creation/manipulation.
2
3use std::collections::BTreeSet;
4use std::ffi::OsStr;
5
6use chrono::DateTime;
7use chrono::Utc;
8use deunicode::deunicode;
9use once_cell::sync::Lazy;
10use regex::Regex;
11use serde::Serialize;
12
13use super::result::Result;
14use crate::render::engine::RenderEngine;
15
16/// Captures a `#tag`. Tags *must* start with a hash symbol `#` followed by a letter in `[a-zA-Z]`
17/// and then a series of any characters. A tag ends when a space or another `#` is encountered.
18static RE_TAG: Lazy<Regex> = Lazy::new(|| Regex::new(r"#[a-zA-Z][^\s#]+\s?").unwrap());
19
20/// Captures three or more consecutive linebreaks.
21static RE_BLOCKS: Lazy<Regex> = Lazy::new(|| Regex::new(r"\n{3,}").unwrap());
22
/// Returns a copy of a string with every occurrence of the given characters removed.
///
/// # Arguments
///
/// * `string` - The input string.
/// * `chars` - The set of characters to drop. Each character in this string is removed wherever
///   it appears in `string`.
#[must_use]
pub fn strip(string: &str, chars: &str) -> String {
    string
        .chars()
        .filter(|character| !chars.contains(*character))
        .collect()
}
37
38/// Removes/replaces problematic characters from a string.
39///
40/// # Arguments
41///
42/// * `string` - The string to sanitize.
43#[must_use]
44pub fn sanitize(string: &str) -> String {
45    // These characters can potentially cause problems in filenames.
46    let remove = &['\n', '\r', '\0'];
47    let replace = &['/', ':'];
48
49    let sanitized: String = string
50        .chars()
51        .filter(|c| !remove.contains(c))
52        .map(|c| if replace.contains(&c) { '_' } else { c })
53        .collect();
54
55    let sanitized = OsStr::new(&sanitized);
56    let sanitized = sanitized.to_string_lossy().to_string();
57
58    if sanitized != string {
59        log::warn!("the string '{}' contained invalid characters", string);
60    };
61
62    sanitized
63}
64
65/// Slugifies a string.
66///
67/// Re-implementation of: <https://github.com/Stebalien/slug-rs/> but with an additional argument to
68/// toggle whether or not to drop the case of the slugified string.
69///
70/// # Arguments
71///
72/// * `string` - The input string.
73/// * `lowercase` - Toggle dropping the case of the string.
74#[must_use]
75pub fn to_slug(string: &str, lowercase: bool) -> String {
76    let mut slug = String::with_capacity(string.len());
77
78    // Start `true` to avoid any leading dashes.
79    let mut prev_is_dash = true;
80
81    {
82        let mut push_char = |mut char: u8| match char {
83            b'a'..=b'z' | b'0'..=b'9' => {
84                prev_is_dash = false;
85                slug.push(char.into());
86            }
87            b'A'..=b'Z' => {
88                prev_is_dash = false;
89
90                char = if lowercase { char - b'A' + b'a' } else { char };
91
92                slug.push(char.into());
93            }
94            _ => {
95                if !prev_is_dash {
96                    slug.push('-');
97                    prev_is_dash = true;
98                }
99            }
100        };
101
102        for char in string.chars() {
103            if char.is_ascii() {
104                (push_char)(char as u8);
105            } else {
106                for &byte in deunicode::deunicode_char(char).unwrap_or("-").as_bytes() {
107                    (push_char)(byte);
108                }
109            }
110        }
111    }
112
113    if slug.ends_with('-') {
114        slug.pop();
115    }
116
117    slug.shrink_to_fit();
118
119    slug
120}
121
122/// Slugifies a date.
123///
124/// # Arguments
125///
126/// * `date` - The date to slugify.
127#[must_use]
128pub fn to_slug_date(date: &DateTime<Utc>) -> String {
129    date.format(crate::defaults::DATE_FORMAT_SLUG).to_string()
130}
131
132/// Renders a one-off template string with a context and sanitizes the output string.
133///
134/// # Errors
135///
136/// Will return `Err` if the render engine encounters any errors.
137pub fn render_and_sanitize<C>(template: &str, context: C) -> Result<String>
138where
139    C: Serialize,
140{
141    let string = RenderEngine::default().render_str(template, context)?;
142
143    Ok(sanitize(&string))
144}
145
146/// Builds a filename from a file stem and extension and sanitizes the output string.
147///
148/// This is a helper method to replace `PathBuf::set_extension()` as some file stems might include
149/// a period `.`. If we used `PathBuf::set_extension()`, the text after the last period would be
150/// replaced with the extension.
151///
152/// # Arguments
153///
154/// * `file_stem` - The file stem.
155/// * `extension` - The file extension.
156#[must_use]
157pub fn build_filename_and_sanitize(file_stem: &str, extension: &str) -> String {
158    let filename = format!("{file_stem}.{extension}");
159
160    sanitize(&filename)
161}
162
/// Trims the whitespace around every line and separates the remaining lines with: `\n\n`.
///
/// Lines that are empty, or that contain only whitespace, are dropped. The output has no leading
/// or trailing linebreaks, and an input without any content results in an empty string.
///
/// # Arguments
///
/// * `string` - The string to normalize.
#[must_use]
pub fn normalize_whitespace(string: &str) -> String {
    string
        .lines()
        // Trim *before* filtering so that whitespace-only lines are discarded too. Filtering
        // first would let them through as empty entries and produce extra linebreaks.
        .map(str::trim)
        .filter(|line| !line.is_empty())
        .collect::<Vec<_>>()
        .join("\n\n")
}
178
179/// Extracts all `#tags` from a string.
180///
181/// # Arguments
182///
183/// * `string` - The string to extract from.
184#[must_use]
185pub fn extract_tags(string: &str) -> BTreeSet<String> {
186    let mut tags = RE_TAG
187        .find_iter(string)
188        .map(|t| t.as_str())
189        .map(str::trim)
190        .map(ToOwned::to_owned)
191        .collect::<Vec<String>>();
192
193    tags.sort();
194
195    BTreeSet::from_iter(tags)
196}
197
198/// Removes all `#tags` from a string.
199///
200/// # Arguments
201///
202/// * `string` - The string to remove from.
203#[must_use]
204pub fn remove_tags(string: &str) -> String {
205    RE_TAG.replace_all(string, "").trim().to_owned()
206}
207
/// Converts all Unicode characters to their ASCII equivalent.
///
/// This is a thin wrapper that passes `string` to `deunicode::deunicode` and returns its output
/// unchanged.
///
/// # Arguments
///
/// * `string` - The string to convert.
#[must_use]
pub fn convert_all_to_ascii(string: &str) -> String {
    deunicode(string)
}
217
218/// Converts a subset of "smart" Unicode symbols to their ASCII equivalents.
219///
220/// See [`UNICODE_TO_ASCII_SYMBOLS`][symbols] for list of symbols and their ASCII equivalents.
221///
222/// # Arguments
223///
224/// * `string` - The string to convert.
225///
226/// [symbols]: crate::defaults::UNICODE_TO_ASCII_SYMBOLS
227#[must_use]
228pub fn convert_symbols_to_ascii(string: &str) -> String {
229    let mut string = string.to_owned();
230
231    for (from, to) in &*crate::defaults::UNICODE_TO_ASCII_SYMBOLS {
232        string = string.replace(*from, to);
233    }
234
235    string
236}
237
238/// Normalizes linebreaks by replacing three or more consecutive linebreaks with two consecutive
239/// linebreaks while leaving a single trailing linebreak.
240///
241/// NOTE: This is a temporary solution that naively mimicks what [`tera`][tera] would do if/when it
242/// adds [`trim_blocks`][github-tera]. It is by no means smart and will just normalize whitespace
243/// regardless of what the template requested.
244///
245/// # Arguments
246///
247/// * `string` - The string to normalize.
248///
249/// [github-tera]: https://github.com/Keats/tera/issues/637
250/// [tera]: https://docs.rs/tera/latest/tera/
251#[must_use]
252pub fn trim_blocks(string: &str) -> String {
253    let string = RE_BLOCKS.replace_all(string, "\n\n");
254    let mut string = string.trim_end().to_string();
255
256    string.push('\n');
257
258    string
259}
260
// TODO: Add tests for other functions.
#[cfg(test)]
mod test {

    use super::*;

    // Tests that `strip` removes every listed character, including linebreaks, while leaving
    // non-ASCII characters that are not listed untouched.
    #[test]
    fn strip() {
        assert_eq!(
            super::strip("Lorem ipsum. Aedipisicing culpa!?", " .!?"),
            "LoremipsumAedipisicingculpa"
        );
        assert_eq!(
            super::strip("Lorem ipsum.\n   Aedipisicing culpa!?", " .!?\n"),
            "LoremipsumAedipisicingculpa"
        );
        assert_eq!(
            super::strip("--Lorem--ipsum. Aedipisicing   -culpa-", " .-"),
            "LoremipsumAedipisicingculpa"
        );
        assert_eq!(
            super::strip("Lorem & Ipsúm. Ædipisicing culpa!?", " &.!?"),
            "LoremIpsúmÆdipisicingculpa"
        );
    }

    // Tests `to_slug` with `lowercase = true`: the slug is fully lowercased, separators collapse
    // into single dashes and non-ASCII characters are transliterated.
    #[test]
    fn slugify_original() {
        assert_eq!(
            super::to_slug("Lorem ipsum. Aedipisicing culpa!?", true),
            "lorem-ipsum-aedipisicing-culpa"
        );
        assert_eq!(
            super::to_slug("Lorem ipsum.\n   Aedipisicing culpa!?", true),
            "lorem-ipsum-aedipisicing-culpa"
        );
        assert_eq!(
            super::to_slug("--Lorem--ipsum. Aedipisicing   -culpa-", true),
            "lorem-ipsum-aedipisicing-culpa"
        );
        assert_eq!(
            super::to_slug("Lorem & Ipsúm. Ædipisicing culpa!?", true),
            "lorem-ipsum-aedipisicing-culpa"
        );
    }

    // Tests `to_slug` with `lowercase = false`: the original case is preserved in the slug.
    //
    // NOTE(review): Despite its name, this test exercises the `lowercase` argument set to
    // `false`.
    #[test]
    fn slugify_with_lowercase() {
        assert_eq!(
            super::to_slug("Lorem ipsum. Aedipisicing culpa!?", false),
            "Lorem-ipsum-Aedipisicing-culpa"
        );
        assert_eq!(
            super::to_slug("Lorem ipsum.\n   Aedipisicing culpa!?", false),
            "Lorem-ipsum-Aedipisicing-culpa"
        );
        assert_eq!(
            super::to_slug("--Lorem--ipsum. Aedipisicing   -culpa-", false),
            "Lorem-ipsum-Aedipisicing-culpa"
        );
        assert_eq!(
            super::to_slug("Lorem & Ipsúm. Ædipisicing culpa!?", false),
            "Lorem-Ipsum-AEdipisicing-culpa"
        );
    }

    // Generates one `#[test]` function per entry. Each entry is a name followed by a tuple of:
    // the input string, the expected string after removing tags and the expected extracted tags.
    //
    // https://stackoverflow.com/a/34666891/16968574
    macro_rules! remove_and_extract_tags {
        ($($name:ident: ($input:tt, $tags_removed_expected:tt, $tags_expected:tt),)*) => {
            $(
                #[test]
                fn $name() {
                    let tags_extracted = super::extract_tags($input);
                    let tags_expected: BTreeSet<String> = $tags_expected
                        .into_iter()
                        .map(|t: &str| t.to_string())
                        .collect();

                    let tags_removed = super::remove_tags($input);

                    assert_eq!(tags_extracted, tags_expected);
                    assert_eq!(tags_removed, $tags_removed_expected.to_string());
                }
            )*
        }
    }

    // Tests that extracting and removing tags from a string produces the expected results. Only
    // tags, i.e. contiguous strings starting with a hash symbol followed by a letter, should be
    // extracted and removed from the original string.
    //
    // "Lorem ipsum. #tag",  // Input string
    // "Lorem ipsum.",       // Expected: tags removed
    // ["#tag"]              // Expected: tags extracted
    remove_and_extract_tags! {
        // Tests no tags in string.
        process_tags_00: (
            "Lorem ipsum.",
            "Lorem ipsum.",
            []
        ),
        // Tests tags at end of a string.
        process_tags_01: (
            "Lorem ipsum. #tag01 #tag02",
            "Lorem ipsum.",
            ["#tag01", "#tag02"]
        ),
        // Tests tags in the middle of a string.
        process_tags_02: (
            "Lorem ipsum. #tag01 #tag02 Adipisicing culpa.",
            "Lorem ipsum. Adipisicing culpa.",
            ["#tag01", "#tag02"]
        ),
        // Tests tags at beginning of a string.
        process_tags_03: (
            "#tag01 #tag02 Lorem ipsum. Adipisicing culpa.",
            "Lorem ipsum. Adipisicing culpa.",
            ["#tag01", "#tag02"]
        ),
        // Tests tags with extra whitespace.
        process_tags_04: (
            "Lorem ipsum.  #tag01  #tag02  ",
            "Lorem ipsum.",
            ["#tag01", "#tag02"]
        ),
        // Tests tags without spacing.
        process_tags_05: (
            "Lorem ipsum.#tag01#tag02",
            "Lorem ipsum.",
            ["#tag01", "#tag02"]
         ),
        // Tests that tags must start with letter.
        process_tags_06: (
            "#tag01 #TAG01 #Tag01 #1 #999",
            "#1 #999",
            ["#tag01", "#TAG01", "#Tag01"]
        ),
        // Tests that a string with only tags ends up empty.
        process_tags_07: (
            "#tag01 #tag02",
            "",
            ["#tag01", "#tag02"]
        ),
        // Tests that tags are deduped.
        process_tags_08: (
            "#tag01 #tag01 #tag01",
            "",
            ["#tag01"]
        ),
        // Tests that extra hashtags are ignored.
        process_tags_09: (
            "###tag01##tag02",
            "###",
            ["#tag01", "#tag02"]
        ),
    }
}