deltachat_message_parser 0.14.1

email, link, hashtag, md and more - parsing for deltachat messages
Documentation
// thanks to https://medium.com/reactnative/emojis-in-javascript-f693d0eb79fb for figuring the main details and ranges.

use nom::{
    branch::alt,
    bytes::complete::tag,
    character::complete::{self, satisfy},
    combinator::{opt, recognize},
    multi::{many1, many_m_n},
    sequence::tuple,
    IResult,
};

fn variant_selector(c: char) -> bool {
    matches!(c, '\u{fe00}'..='\u{fe0f}')
}

fn zero_width_joiner(c: char) -> bool {
    c == '\u{200d}'
}

fn single_char_emoji_core(c: char) -> bool {
    matches!(c,
        // Dingbats
        | '\u{2700}'..='\u{27bf}'
        // miscSymbols
        | '\u{2600}'..='\u{26ff}'
        // cjkLettersAndMonths
        | '\u{3299}' | '\u{3297}'
        // cjkSymbolsAndPunctuation
        | '\u{303d}' | '\u{3030}'
        // enclosedAlphanumerics
        | '\u{24c2}'
        // generalPunctuation
        | '\u{203c}' | '\u{2049}'
        // geometricShapes
        | '\u{25aa}'..='\u{25ab}' | '\u{25b6}' | '\u{25c0}' | '\u{25fb}'..='\u{25fe}'
        // latin1Supplement
        | '\u{00a9}' | '\u{00ae}'
        // letterLikeSymbols
        | '\u{2122}' | '\u{2139}'
        // miscSymbolsAndArrows
        | '\u{2b05}' | '\u{2b06}' | '\u{2b07}' | '\u{2b1b}' | '\u{2b1c}' | '\u{2b50}' | '\u{2b55}'
        // miscTechnical
        | '\u{231a}' | '\u{231b}' | '\u{2328}' | '\u{23cf}' | '\u{23e9}'..='\u{23f3}' | '\u{23f8}'..='\u{23fa}'
        // supplementalArrows
        | '\u{2934}' | '\u{2935}'
        // arrows
        | '\u{2190}'..='\u{2199}'
        // Unicode Block β€œEnclosed Alphanumeric Supplement”
        | 'πŸ…°' | 'πŸ…±' | 'πŸ…Ύ'| 'πŸ…Ώ' | 'πŸ†Ž' | 'πŸ†‘'..='πŸ†š'
        // Unicode Block β€œEnclosed Ideographic Supplement” https://www.compart.com/en/unicode/block/U+1F200
        | '🈁' | '🈚'| '🈯' | '🈲'..='🈢' | '🈸'..='🈺' | 'πŸ‰' | 'πŸ‰‘'
        // Unicode Block β€œMiscellaneous Symbols and Pictographs” https://www.compart.com/en/unicode/block/U+1F300
        | 'πŸŒ€'..='🌑' | '🌀'..='πŸŽ“' | 'πŸŽ–'..='πŸŽ—'| 'πŸŽ™'..='πŸŽ›' | '🎞'..='🏰' | '🏳'..='🏡' | '🏷'..='πŸ“½' | 'πŸ“Ώ'..='πŸ”½'
        | 'πŸ•‰'..='πŸ•Ž' | 'πŸ•'..='πŸ•§'| 'πŸ•―' | 'πŸ•°' | 'πŸ•³'..='πŸ•Ί' | 'πŸ–‡' | 'πŸ–Š'..='πŸ–' | 'πŸ–' | 'πŸ–•' | 'πŸ––' | 'πŸ–€' | 'πŸ–₯'
        | 'πŸ–¨' | 'πŸ–±' | 'πŸ–²' | 'πŸ–Ό' | 'πŸ—‚'..='πŸ—„' | 'πŸ—‘'..='πŸ—“' | 'πŸ—œ' | 'πŸ—ž' | 'πŸ—‘' |'πŸ—£' | 'πŸ—¨' | 'πŸ—―' | 'πŸ—³' | 'πŸ—Ί'..='πŸ—Ώ'
        // Unicode Block β€œEmoticons” https://www.compart.com/en/unicode/block/U+1F600
        | 'πŸ˜€'..='πŸ™'
        // Unicode Block β€œTransport and Map Symbols” https://www.compart.com/en/unicode/block/U+1F680
        | 'πŸš€'..='πŸ›…' | 'πŸ›‹'..='πŸ›’' | 'πŸ›•'..='πŸ›₯' | 'πŸ›©' | 'πŸ›«'..='πŸ›°' | 'πŸ›³'..='πŸ›Ό'
        // Unicode Block β€œGeometric Shapes Extended” https://www.compart.com/en/unicode/block/U+1F780
        | '🟠'..='🟫'
        // Unicode Block β€œSupplemental Symbols and Pictographs” https://www.compart.com/en/unicode/block/U+1F900
        | '🀌'..='🀺' | '🀼'..='πŸ₯…' | 'πŸ₯‡'..='🧿'
        // Unicode Block β€œSymbols and Pictographs Extended-A” https://www.compart.com/en/unicode/block/U+1FA70
        | '🩰'..='🫸'
        // other
        | 'πŸ—' | '🟰'
    )
}

fn emoji_core(input: &str) -> IResult<&str, &str> {
    alt((
        // region flags
        recognize(tuple((
            complete::char('🏴'),
            many1(satisfy(|c| matches!(c, '\u{e0061}'..='\u{e007a}'))),
            complete::char('\u{e007f}'),
        ))),
        // Regional -> Flags
        recognize(tuple((
            satisfy(|c| matches!(c, 'πŸ‡¦'..='πŸ‡Ώ')),
            satisfy(|c| matches!(c, 'πŸ‡¦'..='πŸ‡Ώ')),
        ))),
        // standard emoji chars
        recognize(satisfy(single_char_emoji_core)),
        // keycap
        recognize(tuple((
            satisfy(|c| ('\u{0023}'..='\u{0039}').contains(&c)),
            opt(complete::char('\u{fe0f}')),
            complete::char('\u{20e3}'),
        ))),
        // mahjongTile
        tag("πŸ€„"),
        // playingCard
        tag("πŸƒ"),
        // other
        tag("πŸˆ‚οΈ"),
        tag("🈷️"),
        tag("↩️"),
        tag("β†ͺ️"),
    ))(input)
}

fn emoji_modifier(c: char) -> bool {
    matches!(c, '🏻' | '🏼' | '🏽' | '🏾' | '🏿')
}

const USIZE_MAX_COMPOSITE_LEN: usize = 10;

macro_rules! emoji_with_variant {
    () => {
        tuple((
            emoji_core,
            opt(satisfy(variant_selector)),
            opt(satisfy(emoji_modifier)),
        ))
    };
}

// nom parser that eats one emoji
pub fn emoji(input: &str) -> IResult<&str, &str> {
    recognize(tuple((
        emoji_with_variant!(),
        many_m_n(
            0,
            USIZE_MAX_COMPOSITE_LEN,
            tuple((satisfy(zero_width_joiner), emoji_with_variant!())),
        ),
    )))(input)
}

/// returns first emoji from text if text begins with an emoji
pub fn get_first_emoji(text: &str) -> Option<&str> {
    if let Ok((_, emoji)) = emoji(text) {
        Some(emoji)
    } else {
        None
    }
}

/// If string contains only emojis count the emojis otherwise retuns None
pub fn count_emojis_if_only_contains_emoji(input: &str) -> Option<u32> {
    let mut remainder = input;
    let mut count: u32 = 0;

    while let Ok((new_remainder, _)) = emoji(remainder) {
        remainder = new_remainder;
        count = count.saturating_add(1);
    }

    if !remainder.is_empty() {
        // string contains not only emojis
        return None;
    }

    if count == 0 {
        None
    } else {
        Some(count)
    }
}

#[cfg(test)]
mod emoji_test {
    mod emoji_char {
        use crate::parser::is_emoji::emoji;

        #[test]
        fn some_emojis() {
            assert!(emoji("πŸ”₯").is_ok());
        }

        #[test]
        fn not_emoji() {
            // println!("{:?}", emoji("A"));
            assert!(emoji("A").is_err());
        }

        #[test]
        fn keycap() {
            // keycap emojis
            assert!(emoji("#️⃣").is_ok());
        }

        #[test]
        fn flag() {
            // flag emojis
            assert!(emoji("πŸ‡¦πŸ‡¨").is_ok());
        }

        #[test]
        fn mahjong() {
            // mahjongTiles
            assert!(emoji("πŸ€„").is_ok());
        }

        #[test]
        fn playing_card() {
            // playingCard
            assert!(emoji("πŸƒ").is_ok());
        }

        #[test]
        fn supplemental_arrows() {
            // supplementalArrows
            assert!(emoji("‴").is_ok());
            assert!(emoji("‡").is_ok());
        }

        #[test]
        fn test_variant_emoji() {
            assert!(emoji("πŸ‹οΈβ€β™€οΈ").is_ok());
            assert!(emoji("🀹🏽").is_ok());
            assert!(emoji("πŸ›ŒπŸΏ").is_ok());
        }
        // composite with zero width joiner
        #[test]
        fn test_composite_emoji() {
            assert!(emoji("❀️‍πŸ”₯").is_ok());
            assert!(emoji("πŸ•β€πŸ¦Ί").is_ok());
            assert!(emoji("πŸ‘©β€πŸ‘©β€πŸ‘§").is_ok());
            assert!(emoji("πŸ§‘πŸΏβ€πŸ€β€πŸ§‘πŸΏ").is_ok());
            assert!(emoji("πŸ‘©πŸ½β€β€οΈβ€πŸ‘¨πŸ½").is_ok());
        }
    }

    mod exported_methods {
        use crate::parser::is_emoji::{count_emojis_if_only_contains_emoji, get_first_emoji};

        #[test]
        fn test_get_first_emoji() {
            assert_eq!(get_first_emoji("#️⃣ Hashtag"), Some("#️⃣"));
            assert_eq!(get_first_emoji("#️⃣Hashtag"), Some("#️⃣"));
            assert_eq!(get_first_emoji("#οΈβƒ£πŸƒHashtag"), Some("#️⃣"));
            assert_eq!(get_first_emoji("Hashtag #️⃣"), None);
            assert_eq!(get_first_emoji("'#️⃣"), None);
            assert_eq!(get_first_emoji("❀️‍πŸ”₯Hashtag"), Some("❀️‍πŸ”₯"));
            assert_eq!(get_first_emoji("πŸ‘©πŸ½β€β€οΈβ€πŸ‘¨πŸ½Hashtag"), Some("πŸ‘©πŸ½β€β€οΈβ€πŸ‘¨πŸ½"));
            assert_eq!(get_first_emoji("πŸ‡ͺπŸ‡ΈπŸš§"), Some("πŸ‡ͺπŸ‡Έ"));
        }

        #[test]
        fn test_string_contains_only_emojis_and_count() {
            assert_eq!(count_emojis_if_only_contains_emoji("#️⃣"), Some(1));
            assert_eq!(
                count_emojis_if_only_contains_emoji("πŸ‘©πŸ½β€β€οΈβ€πŸ‘¨πŸ½Hashtag"),
                None
            );
            assert_eq!(count_emojis_if_only_contains_emoji("❀️‍πŸ”₯"), Some(1));
            assert_eq!(count_emojis_if_only_contains_emoji("πŸ‘©πŸ½β€β€οΈβ€πŸ‘¨πŸ½"), Some(1));
            assert_eq!(
                count_emojis_if_only_contains_emoji("πŸ‘©πŸ½β€β€οΈβ€πŸ‘¨πŸ½πŸ‘©πŸ½β€β€οΈβ€πŸ‘¨πŸ½"),
                Some(2)
            );
            assert_eq!(
                count_emojis_if_only_contains_emoji("πŸ‘©πŸ½β€β€οΈβ€πŸ‘¨πŸ½β€οΈβ€πŸ”₯πŸ‘©πŸ½β€β€οΈβ€πŸ‘¨πŸ½"),
                Some(3)
            );
            // hair color
            assert_eq!(count_emojis_if_only_contains_emoji("πŸ‘¨β€πŸ¦°"), Some(1));
            assert_eq!(count_emojis_if_only_contains_emoji("πŸ‘¨β€πŸ¦³"), Some(1));
            assert_eq!(
                count_emojis_if_only_contains_emoji("πŸ‡ͺπŸ‡ΈπŸš§πŸš§πŸš§πŸš§πŸš§πŸš§πŸš§"),
                Some(8)
            );
        }
    }
}