use twitter_text_config::Configuration;
use idna::uts46;
use entity::Entity;
use entity::Type;
use idna::uts46::Flags;
use unicode_normalization::UnicodeNormalization;
use TwitterTextParseResults;
use std::str::CharIndices;
use std::iter::Peekable;
use pest::Parser;
use twitter_text_parser::twitter_text::TwitterTextParser;
use twitter_text_parser::twitter_text::Rule;
use twitter_text_config::Range;
/// Predicate over grammar rules; the extraction entry points pass one to
/// select which parsed rules are turned into entities.
type RuleMatch = fn(Rule) -> bool;
/// A pest parse-tree node specialised to this crate's grammar `Rule`.
type Pair<'a> = pest::iterators::Pair<'a, Rule>;
/// Shared extraction logic for the extractors in this file.
///
/// Implementors supply the result types and the code that turns the scanned
/// parse pairs into a result (`create_result`, `mention_result`,
/// `empty_result`); the trait provides the parsing driver and the
/// per-entity-kind convenience methods on top of that.
pub trait Extract<'a> {
    /// Result type produced by the multi-entity extraction methods.
    type T;
    /// Result type produced by reply-username extraction.
    type Mention;

    /// Returns whether `extract_urls_with_indices` also accepts
    /// `Rule::url_without_protocol` matches.
    fn get_extract_url_without_protocol(&self) -> bool;

    /// Sets whether `extract_urls_with_indices` also accepts
    /// `Rule::url_without_protocol` matches.
    fn set_extract_url_without_protocol(&mut self, extract_url_without_protocol: bool);

    /// Extracts from `s` every entity whose rule satisfies `r_match`.
    fn extract(&self, s: &'a str, r_match: RuleMatch) -> Self::T;

    /// Builds the result for text `s` from the scanned items in `pairs`.
    ///
    /// `extract_impl` hands over `pairs` arranged so that popping from the
    /// back yields the items in parse order; `entity_count` is the number of
    /// items that were selected as entities (a capacity hint).
    fn create_result(&self, s: &'a str, entity_count:usize, pairs: &mut Vec<UnprocessedEntity<'a>>) -> Self::T;

    /// Extracts the username a reply is addressed to, if any.
    fn extract_reply_username(&self, s: &'a str) -> Self::Mention;

    /// Builds the mention result for text `s` from an optional parsed pair.
    fn mention_result(&self, s: &'a str, pairs: Option<Pair<'a>>) -> Self::Mention;

    /// Returns the result used for empty or unparseable input.
    fn empty_result(&self) -> Self::T;

    /// Parses `s` with `Rule::tweet` and collects the pairs of interest.
    ///
    /// `invalid_char` and `emoji` pairs are always collected (without being
    /// counted as entities); any other pair is collected only if `r_match`
    /// accepts its rule, and URL pairs additionally have to pass
    /// `validate_url`. Empty input and parse failures yield `empty_result()`.
    fn extract_impl(&self, s: &'a str, r_match: RuleMatch) -> Self::T {
        if s.is_empty() {
            return self.empty_result();
        }
        match TwitterTextParser::parse(Rule::tweet, s) {
            Ok(p) => {
                let mut scanned = Vec::new();
                let mut entity_count = 0;
                for pair in p.flatten() {
                    let r = pair.as_rule();
                    if r == Rule::invalid_char || r == Rule::emoji {
                        scanned.push(UnprocessedEntity::Pair(pair));
                    } else if r_match(r) {
                        if r == Rule::url || r == Rule::url_without_protocol {
                            // `validate_url` consumes the pair, so keep the span first.
                            let span = pair.as_span();
                            if validate_url(pair) {
                                entity_count += 1;
                                scanned.push(UnprocessedEntity::UrlSpan(span));
                            }
                        } else {
                            entity_count += 1;
                            scanned.push(UnprocessedEntity::Pair(pair));
                        }
                    }
                }
                // `create_result` pops from the back; reversing once keeps the
                // pop order equal to parse order without the quadratic cost of
                // inserting every item at index 0.
                scanned.reverse();
                self.create_result(s, entity_count, &mut scanned)
            }
            Err(_) => self.empty_result(),
        }
    }

    /// Extracts URLs; URLs without a protocol are included only when
    /// `get_extract_url_without_protocol()` is true.
    fn extract_urls_with_indices(&self, s: &'a str) -> Self::T {
        if self.get_extract_url_without_protocol() {
            self.extract(s, |r| { r == Rule::url || r == Rule::url_without_protocol })
        } else {
            self.extract(s, |r| { r == Rule::url })
        }
    }

    /// Extracts `Rule::hashtag` matches.
    fn extract_hashtags(&self, s: &'a str) -> Self::T {
        self.extract(s, |r| { r == Rule::hashtag })
    }

    /// Extracts `Rule::cashtag` matches.
    fn extract_cashtags(&self, s: &'a str) -> Self::T {
        self.extract(s, |r| { r == Rule::cashtag })
    }

    /// Same as `extract_mentioned_screennames_with_indices`.
    fn extract_mentioned_screennames(&self, s: &'a str) -> Self::T {
        self.extract_mentioned_screennames_with_indices(s)
    }

    /// Extracts `Rule::username` matches.
    fn extract_mentioned_screennames_with_indices(&self, s: &'a str) -> Self::T {
        self.extract(s, |r| { r == Rule::username })
    }

    /// Extracts `Rule::username` and `Rule::list` matches.
    fn extract_mentions_or_lists_with_indices(&self, s: &'a str) -> Self::T {
        self.extract(s, |r| { r == Rule::username || r == Rule::list })
    }

    /// Parses `s` with `Rule::reply` and passes the first pair of the
    /// flattened parse (or `None` on failure or an empty parse) to
    /// `mention_result`.
    fn extract_reply_username_impl(&self, s: &'a str) -> Self::Mention {
        match TwitterTextParser::parse(Rule::reply, s) {
            Ok(pairs) => self.mention_result(s, pairs.flatten().next()),
            Err(_) => self.mention_result(s, None),
        }
    }

    /// Extracts URLs (with protocol), hashtags, cashtags, lists and usernames.
    fn extract_entities_with_indices(&self, s: &'a str) -> Self::T {
        self.extract(s, |r| {
            r == Rule::url || r == Rule::hashtag || r == Rule::cashtag ||
                r == Rule::list || r == Rule::username
        })
    }

    /// Runs the parser without selecting any entity rule, so only the
    /// always-collected `invalid_char` and `emoji` pairs reach `create_result`.
    fn extract_scan(&self, s: &'a str) -> Self::T {
        self.extract(s, |_r| { false })
    }

    /// Converts a scanned item into an `Entity` spanning `start..end`.
    ///
    /// Hashtag, cashtag and username values drop the first character of the
    /// matched text (presumably the leading sigil). A list needs both a
    /// `listname` and a `list_slug` child, in that order; otherwise, and for
    /// any other rule, `None` is returned.
    fn entity_from_pair(&self, ue: UnprocessedEntity<'a>, start: i32, end: i32) -> Option<Entity<'a>> {
        match ue {
            UnprocessedEntity::UrlSpan(url) => {
                Some(Entity::new(Type::URL, url.as_str(), start, end))
            },
            UnprocessedEntity::Pair(pair) => {
                let s = pair.as_str();
                match pair.as_rule() {
                    Rule::hashtag => {
                        Some(Entity::new(Type::HASHTAG, &s[calculate_offset(s)..], start, end))
                    },
                    Rule::cashtag => {
                        Some(Entity::new(Type::CASHTAG, &s[calculate_offset(s)..], start, end))
                    },
                    Rule::username => {
                        Some(Entity::new(Type::MENTION, &s[calculate_offset(s)..], start, end))
                    },
                    Rule::list => {
                        // Both lookups share one iterator, so the slug is only
                        // searched for after the list name.
                        let mut list_iter = pair.into_inner();
                        let listname = list_iter.find(|p| { p.as_rule() == Rule::listname });
                        let list_slug = list_iter.find(|p| { p.as_rule() == Rule::list_slug });
                        match (listname, list_slug) {
                            (Some(ln), Some(ls)) => {
                                let name = ln.as_str();
                                Some(Entity::new_list(Type::MENTION, &name[calculate_offset(name)..],
                                                      &ls.as_str(), start, end))
                            },
                            _ => {
                                None
                            }
                        }
                    }
                    _ => None
                }
            }
        }
    }
}
/// Entity extractor whose results are plain `Vec<Entity>` / `Option<Entity>`
/// values (see its `Extract` implementation).
pub struct Extractor {
    /// When true, URL extraction also accepts `Rule::url_without_protocol`.
    extract_url_without_protocol: bool,
}
impl Extractor {
    /// Creates an extractor with protocol-less URL extraction enabled.
    pub fn new() -> Extractor {
        Extractor { extract_url_without_protocol: true }
    }

    /// Returns the value of every URL entity found in `s` as an owned string.
    pub fn extract_urls(&self, s: &str) -> Vec<String> {
        let entities = self.extract_urls_with_indices(s);
        let mut urls = Vec::with_capacity(entities.len());
        for entity in &entities {
            urls.push(String::from(entity.get_value()));
        }
        urls
    }

    /// Advances `iter` until its next item sits at byte position `limit` or
    /// later (or the iterator ends) and returns how many chars were consumed.
    fn scan(&self, iter: &mut Peekable<CharIndices>, limit: usize) -> i32 {
        let mut consumed = 0;
        while iter.peek().map_or(false, |&(position, _)| position < limit) {
            if iter.next().is_some() {
                consumed += 1;
            }
        }
        consumed
    }
}
/// `Extract` implementation producing bare entity lists.
impl<'a> Extract<'a> for Extractor {
    type T = Vec<Entity<'a>>;
    type Mention = Option<Entity<'a>>;

    /// Returns the protocol-less URL extraction flag.
    fn get_extract_url_without_protocol(&self) -> bool {
        self.extract_url_without_protocol
    }

    /// Updates the protocol-less URL extraction flag.
    fn set_extract_url_without_protocol(&mut self, extract_url_without_protocol: bool) {
        self.extract_url_without_protocol = extract_url_without_protocol;
    }

    /// Delegates to the shared `extract_impl` driver.
    fn extract(&self, s: &'a str, r_match: RuleMatch) -> Vec<Entity<'a>> {
        self.extract_impl(s, r_match)
    }

    /// Pops the scanned items off `scanned` and converts each to an entity.
    ///
    /// Entity indices are counted in chars: the byte positions reported by
    /// each item are translated by walking `s` once with a char iterator.
    fn create_result(&self, s: &'a str, count: usize, scanned: &mut Vec<UnprocessedEntity<'a>>) -> Vec<Entity<'a>> {
        let mut found = Vec::with_capacity(count);
        let mut chars = s.char_indices().peekable();
        let mut cursor = 0;
        while let Some(unprocessed) = scanned.pop() {
            let begin = cursor + self.scan(&mut chars, unprocessed.start());
            let finish = begin + self.scan(&mut chars, unprocessed.end());
            // Items that do not map to an entity contribute nothing.
            found.extend(self.entity_from_pair(unprocessed, begin, finish));
            cursor = finish;
        }
        found
    }

    /// Delegates to the shared `extract_reply_username_impl` driver.
    fn extract_reply_username(&self, s: &'a str) -> Option<Entity<'a>> {
        self.extract_reply_username_impl(s)
    }

    /// Converts the optional pair into an entity by running it through
    /// `create_result` as a one-element list.
    fn mention_result(&self, s: &'a str, entity: Option<Pair<'a>>) -> Option<Entity<'a>> {
        entity.and_then(|pair| {
            let mut pending = vec![UnprocessedEntity::Pair(pair)];
            self.create_result(s, 1, &mut pending).pop()
        })
    }

    /// An empty entity list.
    fn empty_result(&self) -> Vec<Entity<'a>> {
        Vec::new()
    }
}
/// Entity extractor that also computes weighted-length / validity results
/// against a `Configuration` (see its `Extract` implementation).
pub struct ValidatingExtractor<'a> {
    /// When true, URL extraction also accepts `Rule::url_without_protocol`.
    extract_url_without_protocol: bool,
    /// Weighting configuration consulted while building results.
    config: &'a Configuration,
    /// Lengths of the input recorded by `prep_input` / `new_with_nfc_input`.
    ld: LengthData,
}
impl<'a> ValidatingExtractor<'a> {
    /// Creates an extractor for `configuration` with protocol-less URL
    /// extraction enabled and all recorded lengths set to zero.
    ///
    /// Call `prep_input` before extracting so the lengths describe the text.
    pub fn new(configuration: &Configuration) -> ValidatingExtractor {
        ValidatingExtractor {
            extract_url_without_protocol: true,
            config: configuration,
            ld: LengthData::empty(),
        }
    }

    /// NFC-normalizes `s`, records the lengths of both the original and the
    /// normalized text on `self`, and returns the normalized string.
    pub fn prep_input(&mut self, s: &str) -> String {
        let nfc: String = s.nfc().collect();
        let (nfc_length, nfc_length_utf8) = calculate_length(nfc.as_str());
        let (original_length, original_length_utf8) = calculate_length(s);
        self.ld = LengthData {
            normalized_length: nfc_length,
            normalized_length_utf8: nfc_length_utf8,
            original_length,
            original_length_utf8,
        };
        nfc
    }

    /// Creates an extractor for input `s` that the caller has already
    /// NFC-normalized: the original and normalized lengths are recorded as
    /// identical, so the text is measured only once.
    pub fn new_with_nfc_input(configuration: &'a Configuration, s: &str) -> ValidatingExtractor<'a> {
        let (length, length_utf8) = calculate_length(s);
        ValidatingExtractor {
            extract_url_without_protocol: true,
            config: configuration,
            ld: LengthData {
                normalized_length: length,
                normalized_length_utf8: length_utf8,
                original_length: length,
                original_length_utf8: length_utf8,
            },
        }
    }
}
/// Measures `text` in a single pass.
///
/// Returns `(utf16_units, chars)`: the number of UTF-16 code units needed to
/// encode `text`, and the number of `char`s it contains. Callers store the
/// second value in `*_utf8` fields, but it is a count of chars, not bytes.
fn calculate_length(text: &str) -> (i32, i32) {
    text.chars().fold((0i32, 0i32), |(utf16_units, char_count), c| {
        (utf16_units + as_i32(c.len_utf16()), char_count + 1)
    })
}
impl<'a> Extract<'a> for ValidatingExtractor<'a> {
type T = ExtractResult<'a>;
type Mention = MentionResult<'a>;
fn get_extract_url_without_protocol(&self) -> bool {
self.extract_url_without_protocol
}
fn set_extract_url_without_protocol(&mut self, extract_url_without_protocol: bool) {
self.extract_url_without_protocol = extract_url_without_protocol;
}
fn extract(&self, s: &'a str, r_match: RuleMatch) -> Self::T {
self.extract_impl(s, r_match)
}
fn create_result(&self, s: &'a str, count: usize, scanned: &mut Vec<UnprocessedEntity<'a>>) -> ExtractResult<'a> {
let mut iter = s.char_indices().peekable();
let mut metrics = TextMetrics::new(self.config, self.ld.normalized_length);
let mut entities = Vec::with_capacity(count);
let mut start_index = 0;
while let Some(entity) = scanned.pop() {
start_index += metrics.scan(iter.by_ref(), entity.start(), TrackAction::Text);
let r = entity.as_rule();
if r == Rule::invalid_char {
metrics.is_valid = false;
} else if r == Rule::emoji && self.config.emoji_parsing_enabled {
metrics.weighted_count += self.config.default_weight;
start_index += metrics.scan(iter.by_ref(), entity.end(), TrackAction::Emoji);
} else {
let action = if r == Rule::url {
TrackAction::Url
} else {
TrackAction::Text
};
let end_index = start_index + metrics.scan(iter.by_ref(), entity.end(), action);
if let Some(e) = self.entity_from_pair(entity, start_index, end_index) {
entities.push(e);
}
start_index = end_index;
}
}
metrics.scan(iter.by_ref(), s.len(), TrackAction::Text);
let normalized_tweet_offset: i32 = self.ld.original_length - self.ld.normalized_length;
let scaled_weighted_length = metrics.weighted_count / self.config.scale;
let is_valid = metrics.is_valid && scaled_weighted_length <= self.config.max_weighted_tweet_length;
let permillage = scaled_weighted_length * 1000 / self.config.max_weighted_tweet_length;
let results = TwitterTextParseResults::new(
scaled_weighted_length,
permillage,
is_valid,
Range::new(0, metrics.offset + normalized_tweet_offset - 1),
Range::new(0, metrics.valid_offset + normalized_tweet_offset - 1),
);
ExtractResult::new(results, entities)
}
fn extract_reply_username(&self, s: &'a str) -> MentionResult<'a> {
self.extract_reply_username_impl(s)
}
fn mention_result(&self, s: &'a str, pair: Option<Pair<'a>>)
-> MentionResult<'a> {
MentionResult::new(TwitterTextParseResults::empty(), None)
}
fn empty_result(&self) -> ExtractResult<'a> {
ExtractResult::new(TwitterTextParseResults::empty(), Vec::new())
}
}
/// Entities found in a text together with the parse results for that text.
pub struct ExtractResult<'a> {
    /// Parse results computed for the text.
    pub parse_results: TwitterTextParseResults,
    /// Entities extracted from the text.
    pub entities: Vec<Entity<'a>>,
}

impl<'a> ExtractResult<'a> {
    /// Bundles parse `results` with the extracted entities `e`.
    pub fn new(results: TwitterTextParseResults, e: Vec<Entity<'a>>) -> ExtractResult<'a> {
        Self { entities: e, parse_results: results }
    }
}
/// An optional mention entity together with parse results.
pub struct MentionResult<'a> {
    /// Parse results accompanying the mention.
    pub parse_results: TwitterTextParseResults,
    /// The mention, if one was found.
    pub mention: Option<Entity<'a>>,
}

impl<'a> MentionResult<'a> {
    /// Bundles parse `results` with the optional mention `e`.
    pub fn new(results: TwitterTextParseResults, e: Option<Entity<'a>>) -> MentionResult<'a> {
        Self { mention: e, parse_results: results }
    }
}
/// Running totals accumulated while a text is scanned character by character.
struct TextMetrics<'a> {
    /// Cleared by the caller when an invalid character is encountered.
    is_valid: bool,
    /// Sum of the weights charged so far (not yet divided by `config.scale`).
    weighted_count: i32,
    /// Amount of text accounted for so far (see `add_offset`).
    offset: i32,
    /// Like `offset`, but only advanced while the text is valid and
    /// `weighted_count` has not exceeded `scaled_max_weighted_tweet_length`.
    valid_offset: i32,
    /// Length of the normalized text; `track_text` ignores characters once
    /// `offset` has reached it.
    normalized_length: i32,
    /// `config.max_weighted_tweet_length * config.scale`, computed once.
    scaled_max_weighted_tweet_length: i32,
    /// Weighting configuration.
    config: &'a Configuration,
}
impl<'a> TextMetrics<'a> {
    /// Creates zeroed metrics for a text of `normalized_length`, valid until
    /// proven otherwise.
    fn new(config: &Configuration, normalized_length: i32) -> TextMetrics {
        let scaled_max_weighted_tweet_length = config.max_weighted_tweet_length * config.scale;
        TextMetrics {
            config,
            normalized_length,
            scaled_max_weighted_tweet_length,
            is_valid: true,
            weighted_count: 0,
            offset: 0,
            valid_offset: 0,
        }
    }

    /// Advances the offsets by the UTF-16 length of `c`.
    fn add_char(&mut self, c: char) {
        self.add_offset(as_i32(c.len_utf16()));
    }

    /// Advances `offset` by `offset`, and `valid_offset` too while the text
    /// is still valid and within the scaled weight limit.
    fn add_offset(&mut self, offset: i32) {
        self.offset += offset;
        let within_limit = self.weighted_count <= self.scaled_max_weighted_tweet_length;
        if self.is_valid && within_limit {
            self.valid_offset += offset;
        }
    }

    /// Accounts for one character of an emoji: offsets only, no weight (the
    /// caller charges the emoji's weight once for the whole sequence).
    fn track_emoji(&mut self, c: char) {
        self.add_char(c);
    }

    /// Accounts for a whole URL of `count` scanned chars: charges the
    /// configured transformed URL length and advances the offsets by `count`.
    fn track_url(&mut self, count: i32) {
        let url_weight = self.config.transformed_url_length * self.config.scale;
        self.weighted_count += url_weight;
        self.add_offset(count);
    }

    /// Accounts for one plain-text character: charges the weight of the
    /// first configured range containing its code point (or the default
    /// weight) and advances the offsets. Does nothing once `offset` has
    /// reached the normalized length.
    fn track_text(&mut self, c: char) {
        if self.offset >= self.normalized_length {
            return;
        }
        let code_point: i32 = c as i32;
        let weight = self.config.ranges.iter()
            .find(|range| range.contains(code_point))
            .map_or(self.config.default_weight, |range| range.weight);
        self.weighted_count += weight;
        self.add_char(c);
    }

    /// Consumes chars from `iter` until its next item sits at byte position
    /// `limit` or later (or the iterator ends), tracking each according to
    /// `action`, and returns the number of chars consumed.
    ///
    /// URL chars are not tracked individually; the URL is accounted for once
    /// after the loop.
    fn scan(&mut self, iter: &mut Peekable<CharIndices>, limit: usize, action: TrackAction) -> i32 {
        let mut consumed = 0;
        while iter.peek().map_or(false, |&(position, _)| position < limit) {
            if let Some((_, c)) = iter.next() {
                consumed += 1;
                match action {
                    TrackAction::Url => {}
                    TrackAction::Emoji => self.track_emoji(c),
                    TrackAction::Text => self.track_text(c),
                }
            }
        }
        if let TrackAction::Url = action {
            self.track_url(consumed);
        }
        consumed
    }
}
/// Selects how `TextMetrics::scan` accounts for the characters it consumes.
enum TrackAction {
    /// Each character is passed to `track_text`.
    Text,
    /// Each character is passed to `track_emoji`.
    Emoji,
    /// Characters are only counted; `track_url` is called once with the total.
    Url
}
/// A parsed item collected by `extract_impl`, awaiting conversion to an entity.
pub enum UnprocessedEntity<'a> {
    /// Span of a URL that passed `validate_url` (the pair itself is consumed
    /// by that validation).
    UrlSpan(pest::Span<'a>),
    /// Any other collected parse pair.
    Pair(Pair<'a>)
}
impl<'a> UnprocessedEntity<'a> {
    /// Start position of the item's span, as reported by pest.
    fn start(&self) -> usize {
        match *self {
            UnprocessedEntity::Pair(ref pair) => pair.as_span().start(),
            UnprocessedEntity::UrlSpan(ref span) => span.start(),
        }
    }

    /// End position of the item's span, as reported by pest.
    fn end(&self) -> usize {
        match *self {
            UnprocessedEntity::Pair(ref pair) => pair.as_span().end(),
            UnprocessedEntity::UrlSpan(ref span) => span.end(),
        }
    }

    /// Grammar rule of the item; a bare URL span always reports `Rule::url`.
    fn as_rule(&self) -> Rule {
        match *self {
            UnprocessedEntity::Pair(ref pair) => pair.as_rule(),
            UnprocessedEntity::UrlSpan(_) => Rule::url,
        }
    }
}
/// Returns the UTF-8 byte length of the first character of `s`, i.e. the
/// index at which the rest of the string starts.
///
/// Returns 0 for an empty string so that `&s[calculate_offset(s)..]` is
/// always a valid slice.
fn calculate_offset(s: &str) -> usize {
    s.chars().next().map_or(0, |first| first.len_utf8())
}
/// Validates a parsed URL pair.
///
/// Looks for the first inner `host`, `tco_domain` or `uwp_domain` pair and
/// checks it with `valid_punycode`; a URL without such a pair is rejected.
fn validate_url(p: Pair) -> bool {
    let original = p.as_str();
    let domain = p.into_inner().find(|inner| {
        let rule = inner.as_rule();
        rule == Rule::host || rule == Rule::tco_domain || rule == Rule::uwp_domain
    });
    domain.map_or(false, |pair| valid_punycode(original, &pair))
}
/// Checks that `domain` converts to ASCII via `uts46::to_ascii` and that the
/// URL `original`, with the converted domain substituted, passes
/// `length_check`. A conversion error rejects the URL.
///
/// The URL is treated as having a scheme unless the domain pair is a
/// `uwp_domain`.
fn valid_punycode(original: &str, domain: &pest::iterators::Pair<Rule>) -> bool {
    let host = domain.as_span().as_str();
    let has_scheme = domain.as_rule() != Rule::uwp_domain;
    let flags = Flags {
        use_std3_ascii_rules: false,
        transitional_processing: true,
        verify_dns_length: true,
    };
    uts46::to_ascii(host, flags)
        .map(|ascii| length_check(original, host, &ascii, has_scheme))
        .unwrap_or(false)
}
/// Returns true when the URL stays below `MAX_URL_LENGTH` bytes after
/// `original_domain` is replaced by `punycode_domain`.
///
/// When `has_scheme` is false the length of `"https://"` is added to the
/// total.
fn length_check(original: &str, original_domain: &str,
                punycode_domain: &str, has_scheme: bool) -> bool {
    let scheme_len = if !has_scheme { "https://".len() } else { 0 };
    let total = scheme_len + original.len() - original_domain.len() + punycode_domain.len();
    total < MAX_URL_LENGTH
}
/// Exclusive upper bound, in bytes, that `length_check` applies to a URL.
pub const MAX_URL_LENGTH: usize = 4096;
/// Converts `us` to `i32`.
///
/// # Panics
///
/// Panics if `us` is larger than `i32::MAX`.
fn as_i32(us: usize) -> i32 {
    let limit = std::i32::MAX as usize;
    let narrowed = if us <= limit { Some(us as i32) } else { None };
    narrowed.unwrap()
}
/// Lengths recorded for an input text before and after normalization.
///
/// The plain fields hold counts of UTF-16 code units and the `_utf8` fields
/// hold whatever second value `calculate_length` returns for the same text.
#[derive(PartialEq, Eq, Hash, Debug, Clone, Copy)]
struct LengthData {
    normalized_length: i32,
    normalized_length_utf8: i32,
    original_length: i32,
    original_length_utf8: i32,
}

impl LengthData {
    /// Lengths of an empty text: every field is zero.
    fn empty() -> LengthData {
        Self {
            original_length: 0,
            original_length_utf8: 0,
            normalized_length: 0,
            normalized_length_utf8: 0,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input yields no mentions.
    #[test]
    fn test_extract_empty_string_mentions() {
        let extractor = Extractor::new();
        let found = extractor.extract_mentioned_screennames("");
        assert!(found.is_empty());
    }

    /// A lone `@hi` yields exactly one mention.
    #[test]
    fn test_extract_single_mention() {
        let extractor = Extractor::new();
        let found = extractor.extract_mentioned_screennames("@hi");
        assert_eq!(found.len(), 1);
    }

    /// The protocol-less URL flag round-trips through its setter and getter.
    #[test]
    fn test_extract_setting() {
        let mut extractor = Extractor::new();
        extractor.set_extract_url_without_protocol(false);
        assert!(!extractor.get_extract_url_without_protocol());
        extractor.set_extract_url_without_protocol(true);
        assert!(extractor.get_extract_url_without_protocol());
    }
}