#[macro_use] extern crate lazy_static;
extern crate regex;
mod spam_phrases;
use regex::{Regex, Captures};
/// Verdict assigned to a comment once all scoring checks have run.
///
/// `Snooker::new` derives it from the final score: `Valid` for a score of 1
/// or more, `Moderate` for exactly 0 and `Spam` for anything negative.
#[derive(Debug, PartialEq, Clone)]
pub enum Status {
/// The final score was 1 or higher.
Valid,
/// The final score was exactly 0; also the value a `Snooker` starts with
/// before the score has been computed.
Moderate,
/// The final score was negative.
Spam,
}
/// Input data for a single comment that is to be scored.
#[derive(Debug, Clone)]
pub struct Comment {
/// Author name, if any. Penalised when it contains `http://` or `https://`.
pub author: Option<String>,
/// URL supplied alongside the comment, if any. Scored with the same link
/// heuristics that are applied to links found in the body.
pub url: Option<String>,
/// The comment body. HTML tags are stripped before the length and
/// first-word checks; `<a href=...>` tags are scanned for links.
pub body: String,
/// Added to the score when present.
pub previously_accepted_for_email: Option<isize>,
/// Subtracted from the score when present.
pub previously_rejected_for_email: Option<isize>,
/// Earlier comment bodies; each one equal to `body` (after trimming and
/// lowercasing both sides) costs one point.
pub previous_comment_bodies: Option<Vec<String>>,
}
/// Result of scoring a `Comment`; built with `Snooker::new`.
#[derive(Debug, Clone)]
pub struct Snooker {
/// Accumulated score: starts at 0, each check adds or subtracts points.
pub score: isize,
/// Verdict derived from `score` at the end of `Snooker::new`.
pub status: Status,
/// The comment that was scored.
pub comment: Comment,
}
// Regexes shared by the scoring checks; `lazy_static!` compiles each one
// lazily, on first use, and `unwrap()` panics there if a pattern is invalid.
lazy_static! {
// An opening `<a ...>` tag with a quoted `href`. Capture group 1 is the whole
// URL and group 4 is the TLD; `process_single_link` reads exactly those two.
static ref A_TAG_RE: Regex = Regex::new(r#"<a[^>]*href=["']((https?://)?([\da-zA-Z.-]+)\.([a-zA-Z]{2,10})[/]?([?]?[\S]*))["'][^>]*>"#).unwrap();
// The same URL pattern without the surrounding tag (same group numbering).
// The pattern is unanchored, so it matches anywhere inside the input.
static ref URL_RE: Regex = Regex::new(r#"((https?://)?([\da-zA-Z.-]+)\.([a-zA-Z]{2,10})[/]?([?]?[\S]*))"#).unwrap();
// Case-insensitive run of five or more characters from `b`-`z` with
// `e`, `i`, `o`, `u` removed. NOTE(review): relies on the regex crate's `&&`
// class-intersection syntax - confirm the pinned regex version supports it.
static ref CONSONANTS_RE: Regex = Regex::new(r#"(?i)[b-z&&[^eiou]]{5,}"#).unwrap();
// Anything between `<` and the next `>`; used to strip markup from the body.
static ref HTML_TAGS_RE: Regex = Regex::new(r#"<[^>]*>"#).unwrap();
}
/// Top-level domains that cost a link one point in `process_single_link`.
static SPAM_TLDS: [&str; 3] = ["de", "pl", "cn"];
/// Substrings that each cost a link one point when found in its lowercased URL.
static URL_SPAM_WORDS: [&str; 5] = [".html", ".info", "?", "&", "free"];
/// Substrings that each cost ten points when found in the lowercased first
/// word of the tag-stripped body.
static BODY_SPAM_FIRST_WORDS: [&str; 4] = ["interesting", "sorry", "nice", "cool"];
#[doc(hidden)]
impl Snooker {
    /// Scores `comment` with every heuristic and derives the final status.
    ///
    /// The checks run in a fixed order (body links, body length, spam
    /// phrases, first word, previous bodies, URL, author, e-mail history)
    /// and each one adjusts `score`. The status is then `Valid` for a score
    /// of 1 or more, `Moderate` for 0 and `Spam` for a negative score.
    pub fn new(comment: Comment) -> Self {
        let mut snooker = Snooker {
            score: 0,
            status: Status::Moderate,
            comment: comment,
        };

        let link_count = snooker.check_body_links();
        snooker.check_body_length(link_count);
        snooker.check_body_for_spam_phrases();
        snooker.check_body_first_word();
        snooker.check_body_of_previous_for_matches();
        snooker.check_url();
        snooker.check_author_for_http();
        snooker.count_emails_previous_statuses();

        snooker.status = if snooker.score >= 1 {
            Status::Valid
        } else if snooker.score == 0 {
            Status::Moderate
        } else {
            Status::Spam
        };
        snooker
    }

    /// Scores every `<a href=...>` link in the body and returns the link count.
    ///
    /// Each link is passed to `process_single_link`. Afterwards, fewer than
    /// two links earns +2, otherwise the link count is subtracted from the
    /// score. The returned count saturates at `i8::MAX` rather than
    /// overflowing on link-stuffed bodies.
    pub fn check_body_links(&mut self) -> i8 {
        let mut link_count: i8 = 0;
        // The captures borrow the text they were found in, and
        // `process_single_link` needs `&mut self`, so iterate over a copy.
        let body = self.comment.body.clone();
        for link in A_TAG_RE.captures_iter(&body) {
            link_count = link_count.saturating_add(1);
            process_single_link(link, self);
        }

        if link_count < 2 {
            self.score += 2;
        } else {
            self.score -= link_count as isize;
        }
        link_count
    }

    /// Applies the link heuristics to the comment's `url`, if it has one and
    /// it matches `URL_RE`.
    pub fn check_url(&mut self) {
        // Clone only the URL (not the whole comment): the captures borrow it
        // while `process_single_link` mutates `self`.
        let url = match self.comment.url {
            Some(ref url) => url.clone(),
            None => return,
        };
        if let Some(link) = URL_RE.captures(&url) {
            process_single_link(link, self);
        }
    }

    /// Rewards long bodies and penalises short ones.
    ///
    /// The length is the byte length of the body after stripping HTML tags
    /// and trimming whitespace. More than 20 bytes earns +2 when
    /// `link_count` is 0 and +1 otherwise; anything shorter costs 1.
    pub fn check_body_length(&mut self, link_count: i8) {
        let stripped = HTML_TAGS_RE.replace_all(&self.comment.body, "");
        let is_long = stripped.trim().len() > 20;
        self.score += match (is_long, link_count == 0) {
            (true, true) => 2,
            (true, false) => 1,
            (false, _) => -1,
        };
    }

    /// Subtracts one point for every known spam phrase contained in the
    /// lowercased body.
    pub fn check_body_for_spam_phrases(&mut self) {
        // Lowercase once instead of once per phrase.
        let lowercase_body = self.comment.body.to_lowercase();
        // Counted as `isize` so a long phrase list cannot overflow the
        // counter and flip the penalty into a bonus.
        let mut spam_phrase_count: isize = 0;
        for p in spam_phrases::SPAM_PHRASES.iter() {
            if lowercase_body.contains(p) {
                spam_phrase_count += 1;
            }
        }
        self.score -= spam_phrase_count;
    }

    /// Subtracts ten points for each entry of `BODY_SPAM_FIRST_WORDS` that
    /// occurs in the lowercased first word of the tag-stripped body.
    ///
    /// A body with no words at all (empty, whitespace-only or tags-only) is
    /// left unscored by this check instead of panicking.
    pub fn check_body_first_word(&mut self) {
        let stripped = HTML_TAGS_RE.replace_all(&self.comment.body, "");
        let first_word = match stripped.split_whitespace().next() {
            Some(word) => word.to_lowercase(),
            None => return,
        };
        for w in BODY_SPAM_FIRST_WORDS.iter() {
            if first_word.contains(w) {
                self.score -= 10;
            }
        }
    }

    /// Subtracts one point for every previous comment body that equals the
    /// current body, comparing trimmed, lowercased text.
    pub fn check_body_of_previous_for_matches(&mut self) {
        if let Some(ref previous_comments) = self.comment.previous_comment_bodies {
            let lowercase_body = self.comment.body.trim().to_lowercase();
            let duplicates = previous_comments
                .iter()
                .filter(|pc| pc.trim().to_lowercase() == lowercase_body)
                .count();
            self.score -= duplicates as isize;
        }
    }

    /// Subtracts two points when the author name contains `http://` or
    /// `https://` (case-insensitively).
    pub fn check_author_for_http(&mut self) {
        if let Some(ref a) = self.comment.author {
            let lowercase_author = a.to_lowercase();
            if lowercase_author.contains("http://") || lowercase_author.contains("https://") {
                self.score -= 2;
            }
        }
    }

    /// Adds `previously_accepted_for_email` to the score and subtracts
    /// `previously_rejected_for_email` from it, when present.
    pub fn count_emails_previous_statuses(&mut self) {
        if let Some(c) = self.comment.previously_accepted_for_email {
            self.score += c;
        }
        if let Some(c) = self.comment.previously_rejected_for_email {
            self.score -= c;
        }
    }
}
/// Counts the runs of five or more consecutive consonants in `s`, as
/// matched by `CONSONANTS_RE`.
///
/// A run that is exactly a URL scheme (`http` / `https`, in any letter
/// case, since the regex itself is case-insensitive) is not counted. The
/// result saturates at `u8::MAX` instead of overflowing on very long input.
#[doc(hidden)]
pub fn count_consonant_collections(s: &str) -> u8 {
    let mut count: u8 = 0;
    for c in CONSONANTS_RE.captures_iter(s) {
        let run = &c[0];
        let is_scheme = run.eq_ignore_ascii_case("http") || run.eq_ignore_ascii_case("https");
        if !is_scheme {
            count = count.saturating_add(1);
        }
    }
    count
}
/// Applies the per-link penalties to `snooker.score`.
///
/// `c` must come from `A_TAG_RE` or `URL_RE`: capture group 1 is read as the
/// full URL and group 4 as its TLD. Penalties, one point each:
///
/// * the TLD is listed in `SPAM_TLDS` (compared case-insensitively, since
///   domain names are case-insensitive),
/// * each entry of `URL_SPAM_WORDS` found in the lowercased URL,
/// * the URL is longer than 30 bytes,
/// * each consonant run reported by `count_consonant_collections`.
#[doc(hidden)]
fn process_single_link(c: Captures, snooker: &mut Snooker) {
    let tld = c[4].to_lowercase();
    if SPAM_TLDS.iter().any(|spam_tld| tld == *spam_tld) {
        snooker.score -= 1;
    }

    let url = &c[1];
    // Lowercase once rather than once per spam word.
    let lowercase_url = url.to_lowercase();
    for word in URL_SPAM_WORDS.iter() {
        if lowercase_url.contains(word) {
            snooker.score -= 1;
        }
    }

    if url.len() > 30 {
        snooker.score -= 1;
    }

    snooker.score -= count_consonant_collections(url) as isize;
}
// End-to-end scoring tests: each builds a `Comment`, runs `Snooker::new` and
// pins both the final score and the resulting status.
#[cfg(test)]
mod tests {
use super::*;
// Author containing `https://`, a body starting with "Cool," and two links,
// one of them on a `.de` domain.
#[test]
fn spam_1() {
let comment = Comment {
author: Some("https://elliotekj.com".to_string()),
url: None,
// NOTE: the trailing `\` continuations drop the newline and the leading
// whitespace of the next line, so the second tag reads `<ahref=...>`;
// `A_TAG_RE` still matches it because it only requires `<a[^>]*href=`.
body: String::from("
<p>Cool, this <a href=\"https://elliotekj.com\">comment</a> has more <a\
href=\"https://elliotekj.de\">than</a> 20 characters in it but contains\
2 links.</p>
"),
previously_accepted_for_email: None,
previously_rejected_for_email: None,
previous_comment_bodies: None,
};
let snooker_result = Snooker::new(comment);
assert_eq!(snooker_result.score, -14);
assert_eq!(snooker_result.status, Status::Spam);
}
// Body identical to two previous comment bodies, plus a URL that contains
// `?` and `free` and is longer than 30 bytes.
#[test]
fn spam_2() {
let previous_comment_bodies = vec![
String::from("
<p>Have you been turned down? Get our special promotion</p>
"),
String::from("
<p>Have you been turned down? Get our special promotion</p>
"),
];
let comment = Comment {
author: Some("Elliot Jackson".to_string()),
url: Some("http://someexample.com?getit=free".to_string()),
body: String::from("
<p>Have you been turned down? Get our special promotion</p>
"),
previously_accepted_for_email: None,
previously_rejected_for_email: None,
previous_comment_bodies: Some(previous_comment_bodies),
};
let snooker_result = Snooker::new(comment);
// NOTE(review): the expected total also depends on the entries of
// `spam_phrases::SPAM_PHRASES`, which are defined in another module.
assert_eq!(snooker_result.score, -3);
assert_eq!(snooker_result.status, Status::Spam);
}
}