use crate::parser::{Bound, Error, Parser, Result};
use log::debug;
/// A `(pos, len)` window into a byte buffer that is stored elsewhere.
///
/// Only the offsets are kept here; the buffer itself is passed in by the caller
/// whenever the bytes are needed.
#[derive(Debug, PartialEq, Eq)]
pub struct ByteSlice {
    /// Offset of the first byte of the window.
    pos: u32,
    /// Number of bytes in the window.
    len: u32,
}

impl ByteSlice {
    /// Builds a slice covering `len` bytes starting at offset `pos`.
    pub fn new(pos: u32, len: u32) -> ByteSlice {
        Self { pos, len }
    }

    /// Start offset widened to `usize` so it can be used for indexing.
    #[inline(always)]
    fn pos_usize(&self) -> usize {
        self.pos as usize
    }

    /// Length widened to `usize` so it can be used for indexing.
    #[inline(always)]
    fn len_usize(&self) -> usize {
        self.len as usize
    }

    /// Borrows the part of `data` that this slice covers.
    ///
    /// # Panics
    ///
    /// Panics if the window reaches past the end of `data`.
    pub fn bytes<'a>(&self, data: &'a [u8]) -> &'a [u8] {
        let begin = self.pos_usize();
        let end = begin + self.len_usize();
        &data[begin..end]
    }

    /// Borrows the covered part of `data` as UTF-8 text.
    ///
    /// # Errors
    ///
    /// Returns the `Utf8Error` from `std::str::from_utf8` when the covered bytes
    /// are not valid UTF-8.
    ///
    /// # Panics
    ///
    /// Panics if the window reaches past the end of `data`.
    pub fn str<'a>(&self, data: &'a [u8]) -> std::result::Result<&'a str, std::str::Utf8Error> {
        let raw = self.bytes(data);
        std::str::from_utf8(raw)
    }
}
/// A mention recognised inside parsed content.
#[derive(Debug, PartialEq, Eq)]
pub enum Mention {
/// Numeric index written in the content as `#[N]`.
Index(u16),
/// NOTE(review): presumably the span of a bech32-encoded reference; no code
/// visible in this file constructs this variant — confirm against callers.
Bech32(ByteSlice),
}
/// One piece of parsed content.
#[derive(Debug, PartialEq, Eq)]
pub enum Shard {
/// A run of content not recognised as anything more specific.
Text(ByteSlice),
/// A mention; see `Mention` for the supported forms.
Mention(Mention),
/// A hashtag; the slice covers the tag without its leading `#`.
Hashtag(ByteSlice),
/// NOTE(review): presumably the span of a URL; no code visible in this file
/// constructs this variant — confirm against callers.
Url(ByteSlice),
}
/// The list of shards that a piece of content was split into.
#[derive(Debug)]
pub struct Shards {
// Shards in the order they were pushed while scanning the content.
shards: Vec<Shard>,
}
impl Shards {
    /// Creates an empty shard list with room for 32 shards reserved up front.
    pub fn new() -> Shards {
        Shards {
            shards: Vec::with_capacity(32),
        }
    }

    /// Parses the `[<digits>]` part of an indexed mention such as `#[19]`.
    ///
    /// `parse` calls this with the cursor just past the `#`. On success the
    /// cursor is left wherever the last parsing step stopped and the index is
    /// returned.
    ///
    /// # Errors
    ///
    /// Propagates the first parser error. In that case the cursor is restored
    /// to where it was on entry, so the caller can keep scanning from there.
    fn parse_indexed_mention(parser: &mut Parser) -> Result<u16> {
        let start = parser.pos();
        // The fallible steps live in their own function on purpose: a `?` used
        // directly in this body would return early and skip the rewind below.
        let res = Shards::parse_bracketed_index(parser);
        if res.is_err() {
            parser.set_pos(start);
        }
        res
    }

    /// Consumes `[`, then whatever `Parser::parse_digits` accepts, then `]`.
    ///
    /// Does not restore the cursor on failure.
    fn parse_bracketed_index(parser: &mut Parser) -> Result<u16> {
        parser.parse_char('[')?;
        let ind = parser.parse_digits()?;
        parser.parse_char(']')?;
        Ok(ind)
    }

    /// Parses the body of a hashtag; the cursor is expected just past the `#`.
    ///
    /// The tag runs until the first boundary character (see
    /// `is_boundary_char`) or until the parser reports the end of the input.
    ///
    /// # Errors
    ///
    /// Returns `Error::NotFound` when the tag would be empty, and passes on any
    /// other parser error unchanged.
    fn parse_hashtag(parser: &mut Parser) -> Result<ByteSlice> {
        let start = parser.pos();
        match parser.parse_until(is_boundary_char) {
            // Running into the end of the input simply ends the tag.
            Ok(()) | Err(Error::OutOfBounds(Bound::End)) => {
                let len = parser.pos() - start;
                if len == 0 {
                    return Err(Error::NotFound);
                }
                // `ByteSlice` stores 32-bit offsets, so these casts truncate
                // for inputs larger than `u32::MAX` bytes.
                Ok(ByteSlice::new(start as u32, len as u32))
            }
            Err(err) => Err(err),
        }
    }

    /// Pushes the bytes in `start..upto` as a text shard; empty ranges are skipped.
    fn push_txt(&mut self, start: usize, upto: usize) {
        let len = upto - start;
        if len == 0 {
            return;
        }
        // Same 32-bit offset limitation as everywhere `ByteSlice` is built.
        let txt_slice = ByteSlice::new(start as u32, len as u32);
        self.shards.push(Shard::Text(txt_slice));
    }

    /// Splits `content` into text, hashtag and indexed-mention shards.
    ///
    /// The input is scanned one byte at a time. A `#` is only considered when
    /// the previous byte is a left boundary, or when the parser reports that
    /// there is no previous byte. Such a `#` is tried as a hashtag first and as
    /// an indexed mention (`#[N]`) second. Everything between recognised items
    /// is emitted as `Shard::Text`. The bytes of a recognised `#`, and of a
    /// whole `#[N]` mention, are not covered by any shard's slice.
    ///
    /// # Errors
    ///
    /// As written this always returns `Ok`; failed hashtag or mention attempts
    /// just leave the bytes in the surrounding text.
    pub fn parse(content: &str) -> Result<Shards> {
        let mut parser = Parser::from_str(content);
        let len = parser.len();
        let mut shards = Shards::new();
        // Start of the pending plain-text run that has not been pushed yet.
        let mut start = parser.pos();

        while parser.pos() < len {
            let before_parse = parser.pos();
            // Look behind before the cursor moves past the current byte.
            let prev_boundary = is_left_boundary(&parser.peek_prev_byte());
            let c1 = parser.data()[parser.pos()] as char;
            parser.set_pos(parser.pos() + 1);

            if c1 == '#' && prev_boundary {
                if let Ok(ht) = Shards::parse_hashtag(&mut parser) {
                    // Flush the text before the '#', then restart the text
                    // run after the recognised item.
                    shards.push_txt(start, before_parse);
                    start = parser.pos();
                    debug!("pushing hashtag {:?}", ht);
                    shards.shards.push(Shard::Hashtag(ht));
                } else if let Ok(ind) = Shards::parse_indexed_mention(&mut parser) {
                    shards.push_txt(start, before_parse);
                    start = parser.pos();
                    debug!("pushing indexed mention {:?}", ind);
                    shards.shards.push(Shard::Mention(Mention::Index(ind)));
                }
            }
        }

        // Whatever is left after the last recognised item is plain text.
        shards.push_txt(start, parser.pos());
        Ok(shards)
    }
}
fn is_left_boundary(r: &Result<u8>) -> bool {
match r {
Err(Error::OutOfBounds(_)) => true,
Err(_) => false,
Ok(c) => is_left_boundary_char(*c),
}
}
/// Returns `true` for ASCII whitespace and ASCII punctuation.
///
/// Non-ASCII characters are never boundaries here.
fn is_boundary_char(c: char) -> bool {
    let whitespace = c.is_ascii_whitespace();
    let punctuation = c.is_ascii_punctuation();
    whitespace || punctuation
}
/// Returns `true` when the byte `c` may directly precede a `#` token.
///
/// That is the case for every boundary character (see `is_boundary_char`) and
/// for every byte with its high bit set, i.e. any non-ASCII byte.
fn is_left_boundary_char(c: u8) -> bool {
    let high_bit_set = c >= 0x80;
    is_boundary_char(char::from(c)) || high_bit_set
}
#[cfg(test)]
mod test {
    use super::*;
    use std::sync::Once;

    /// Makes sure the logger is installed at most once across all tests.
    static INIT: Once = Once::new();

    /// Char-level boundary check on a look-behind result: an out-of-bounds
    /// error counts as a boundary, any other error does not.
    fn is_boundary(r: &Result<char>) -> bool {
        match r {
            Ok(c) => is_boundary_char(*c),
            Err(err) => matches!(err, Error::OutOfBounds(_)),
        }
    }

    /// Installs `env_logger` the first time any test calls it.
    fn setup() {
        INIT.call_once(env_logger::init);
    }

    /// Parses `content` and returns the resulting shards, panicking on error.
    fn parse_shards(content: &str) -> Vec<Shard> {
        Shards::parse(content).unwrap().shards
    }

    /// Looking behind a freshly created parser is treated as a boundary.
    #[test]
    fn test_is_boundary() {
        setup();
        let parser = Parser::from_str("a");
        assert!(is_boundary(&parser.peek_prev_char()));
    }

    /// A hashtag may consist of a multi-byte character.
    #[test]
    fn test_parse_hashtag_basic() {
        setup();
        let content = "abc #😎";
        debug!("hashtag_basic content '{}'", content);
        assert_eq!(
            parse_shards(content),
            vec![
                Shard::Text(ByteSlice::new(0, 4)),
                Shard::Hashtag(ByteSlice::new(5, 4)),
            ]
        );
    }

    /// A `#` glued to preceding letters is plain text.
    #[test]
    fn test_parse_hashtag_adjacent() {
        setup();
        assert_eq!(
            parse_shards("aa#abc"),
            vec![Shard::Text(ByteSlice::new(0, 6))]
        );
    }

    /// A hashtag at the very start ends at the following punctuation.
    #[test]
    fn test_parse_hashtag_start() {
        setup();
        let content = "#abc.";
        debug!("test_parse_hashtag_start '{}'", content);
        assert_eq!(
            parse_shards(content),
            vec![
                Shard::Hashtag(ByteSlice::new(1, 3)),
                Shard::Text(ByteSlice::new(4, 1)),
            ]
        );
    }

    /// A hashtag may run up to the end of the input.
    #[test]
    fn test_parse_hashtag_end() {
        setup();
        let content = "#abc";
        debug!("test_parse_hashtag_end '{}'", content);
        assert_eq!(
            parse_shards(content),
            vec![Shard::Hashtag(ByteSlice::new(1, 3))]
        );
    }

    /// Punctuation right before the `#` is a valid left boundary.
    #[test]
    fn test_parse_hashtag_punc_before() {
        setup();
        assert_eq!(
            parse_shards(".#abc"),
            vec![
                Shard::Text(ByteSlice::new(0, 1)),
                Shard::Hashtag(ByteSlice::new(2, 3)),
            ]
        );
    }

    /// A complete `#[N]` becomes a mention; an unterminated `#[1` stays text.
    #[test]
    fn test_indexed_mention() {
        setup();
        let content = "this is #[19] #[1 a mention";
        debug!("test_indexed_mention '{}'", content);
        assert_eq!(
            parse_shards(content),
            vec![
                Shard::Text(ByteSlice::new(0, 8)),
                Shard::Mention(Mention::Index(19)),
                Shard::Text(ByteSlice::new(13, 14)),
            ]
        );
    }

    /// Several hashtags, each introduced by punctuation.
    #[test]
    fn test_multiple_hashtags() {
        setup();
        assert_eq!(
            parse_shards(".#alice.#bob"),
            vec![
                Shard::Text(ByteSlice::new(0, 1)),
                Shard::Hashtag(ByteSlice::new(2, 5)),
                Shard::Text(ByteSlice::new(7, 1)),
                Shard::Hashtag(ByteSlice::new(9, 3)),
            ]
        );
    }

    /// Two hashtags with nothing in between.
    ///
    /// NOTE(review): the second `#` directly follows the letter `e`. Whether
    /// that counts as a left boundary depends on `Parser::peek_prev_byte`,
    /// which is not visible from this file — confirm this expectation holds.
    #[test]
    fn test_multiple_adjacent_hashtags() {
        setup();
        let content = "#alice#bob";
        debug!("test_multiple_adjacent_hashtags '{}'", content);
        assert_eq!(
            parse_shards(content),
            vec![
                Shard::Hashtag(ByteSlice::new(1, 5)),
                Shard::Hashtag(ByteSlice::new(7, 3)),
            ]
        );
    }

    /// A multi-byte character right before the `#` is a valid left boundary.
    #[test]
    fn test_parse_hashtag_emoji_before() {
        setup();
        assert_eq!(
            parse_shards("😤#abc"),
            vec![
                Shard::Text(ByteSlice::new(0, 4)),
                Shard::Hashtag(ByteSlice::new(5, 3)),
            ]
        );
    }
}