1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
//! This crate provides a pure-Rust implementation of Jonathan Snook's [spam detection
//! algorithm](https://snook.ca/archives/other/effective_blog_comment_spam_blocker) for blog
//! comments.
//!
//! As described in the afore-linked post, it works on a points system. Points are awarded and
//! deducted based on a variety of rules. If a comment's final score is greater than or equal to
//! 1, the comment is considered valid. If the comment's final score is 0 then it's considered
//! to be worthy of moderation. If the comment's final score is below 0 then it's considered to
//! be spam. Each comment starts with a score of 0.
//!
//! ## Installation
//!
//! If you're using Cargo, just add Snooker to your `Cargo.toml`:
//!
//! ```toml,no_run
//! [dependencies]
//! snooker = "0.1.0"
//! ```
//!
//! ## Example
//!
//! Snooker gives the example comment below a score of **-10** based off of the following patterns:
//!
//! - The `body` has fewer than 2 links in it: **+2 points**
//! - The `body` is more than 20 characters long but contains 1 link: **+1 point**
//! - The link in the `body` contains one keyword considered spammy ("free"): **-1 point**
//! - The `body` contains one phrase considered spammy ("limited time only"): **-1 point**
//! - The `body` starts with a word considered spammy when it's the first word of the comment
//! ("nice"): **-10 points**
//! - The `author` field doesn't contain `http://` or `https://`: **+0 points** (unchanged)
//! - The `url` field contains a keyword considered spammy ("free"): **-1 point**
//! - None of the URLs use a TLD considered spammy: **+0 points** (unchanged)
//! - None of the URLs are longer than 30 characters: **+0 points** (unchanged)
//! - No consonant groups were found: **+0 points** (unchanged)
//! - No data was provided about the comments previously submitted with this email address: **+0
//! points** (unchanged)
//!
//! ```rust
//! use snooker::{Comment, Snooker, Status};
//!
//! let comment = Comment {
//!     author: Some("Johnny B. Goode".to_string()),
//!     url: Some("http://my-free-ebook.com".to_string()),
//!     body: String::from("
//!         <p>Nice post! Check out our free (for a limited time only) eBook
//!         <a href=\"http://my-free-ebook.com\">here</a> that's totally relevant</p>
//!     "),
//!     previously_accepted_for_email: None,
//!     previously_rejected_for_email: None,
//!     previous_comment_bodies: None,
//! };
//!
//! let snooker_result = Snooker::new(comment);
//! assert_eq!(snooker_result.score, -10);
//! assert_eq!(snooker_result.status, Status::Spam);
//! ```

#[macro_use] extern crate lazy_static;
extern crate regex;

mod spam_phrases;

use regex::{Regex, Captures};

/// The status Snooker assigns to a comment, derived from the comment's final score.

#[derive(Debug, PartialEq, Clone)]
pub enum Status {
    /// The final score was 1 or higher.
    Valid,
    /// The final score was exactly 0; the comment is worth moderating.
    Moderate,
    /// The final score was below 0.
    Spam,
}

/// Snooker's representation of a comment.
///
/// `body` is the only field that's required. It is highly recommended that you provide the
/// `author` and `url` fields too.
///
/// If you want to go the whole hog, you can also provide data about the comments previously
/// submitted with this email address.

#[derive(Debug, Clone)]
pub struct Comment {
    /// The name the user provided when submitting the comment.
    pub author: Option<String>,

    /// The URL the user provided when submitting the comment.
    pub url: Option<String>,

    /// The body of the comment the user submitted. HTML tags are stripped before the body's
    /// length and first word are examined.
    pub body: String,

    /// The number of comments from this email address that Snooker has previously deemed valid.
    /// This number is added to the score.
    ///
    /// Note: Snooker does not store any data about the comments it processes. If you want to use
    /// this feature, you'll need to keep your own database.
    pub previously_accepted_for_email: Option<isize>,

    /// The number of comments from this email address that Snooker has previously deemed spam.
    /// This number is subtracted from the score.
    ///
    /// Note: Snooker does not store any data about the comments it processes. If you want to use
    /// this feature, you'll need to keep your own database.
    pub previously_rejected_for_email: Option<isize>,

    /// The bodies of the comments previously submitted with this email address. Each one that
    /// equals `body` (ignoring case and surrounding whitespace) costs one point.
    ///
    /// Note: Snooker does not store any data about the comments it processes. If you want to use
    /// this feature, you'll need to keep your own database.
    pub previous_comment_bodies: Option<Vec<String>>,
}

/// The struct returned by `Snooker::new` when it has finished processing a comment.
///
/// It carries the final score, the status derived from that score and the comment that was
/// scored.

#[derive(Debug, Clone)]
pub struct Snooker {
    /// The final score the passed comment was given. Scoring starts at 0.
    pub score: isize,

    /// The status assigned to this comment based off of its `score`. If the score was greater than
    /// or equal to 1, the status is `Status::Valid`. If the score is 0, the status is
    /// `Status::Moderate`. If score is below 0, the status is `Status::Spam`.
    pub status: Status,

    /// The original comment struct passed to Snooker.
    pub comment: Comment,
}

lazy_static! {
    // Matches an opening `<a ...>` tag whose `href` value is wrapped in single or double quotes.
    // Capture groups (the same layout is shared by `URL_RE`):
    //   1 - the whole URL
    //   2 - the optional `http://` / `https://` scheme
    //   3 - the host, up to (not including) the dot before the TLD
    //   4 - the TLD (2 to 10 ASCII letters)
    //   5 - whatever follows the TLD and an optional `/`, up to the next whitespace
    static ref A_TAG_RE: Regex = Regex::new(r#"<a[^>]*href=["']((https?://)?([\da-zA-Z.-]+)\.([a-zA-Z]{2,10})[/]?([?]?[\S]*))["'][^>]*>"#).unwrap();

    // Matches a bare URL (no surrounding tag). The pattern is not anchored, so it finds the
    // first URL-shaped substring of the input.
    static ref URL_RE: Regex = Regex::new(r#"((https?://)?([\da-zA-Z.-]+)\.([a-zA-Z]{2,10})[/]?([?]?[\S]*))"#).unwrap();

    // Matches 5 or more consonants in a row, ignoring case. The class is `b` to `z` minus
    // `e`, `i`, `o` and `u` (`a` lies outside the range), so `y` counts as a consonant:
    static ref CONSONANTS_RE: Regex = Regex::new(r#"(?i)[b-z&&[^eiou]]{5,}"#).unwrap();

    // Matches all HTML tags, i.e. everything from a `<` up to and including the next `>`:
    static ref HTML_TAGS_RE: Regex = Regex::new(r#"<[^>]*>"#).unwrap();
}

// TLDs considered spammy; a link using one of them loses a point.
static SPAM_TLDS: [&str; 3] = ["de", "pl", "cn"];
// Substrings considered spammy in a URL; each one found in a link's lowercased URL costs a point.
static URL_SPAM_WORDS: [&str; 5] = [".html", ".info", "?", "&", "free"];
// Words considered spammy at the very start of a comment; a match costs ten points.
static BODY_SPAM_FIRST_WORDS: [&str; 4] = ["interesting", "sorry", "nice", "cool"];

#[doc(hidden)]
impl Snooker {
    /// Scores `comment` against every rule and derives its final status.
    ///
    /// The score starts at 0 and each `check_*` / `count_*` method below adjusts it. A final
    /// score of 1 or more yields `Status::Valid`, exactly 0 yields `Status::Moderate` and
    /// anything below 0 yields `Status::Spam`.
    pub fn new(comment: Comment) -> Self {
        let mut snooker = Snooker {
            score: 0,
            status: Status::Moderate,
            comment,
        };

        let link_count = snooker.check_body_links();
        snooker.check_body_length(link_count);
        snooker.check_body_for_spam_phrases();
        snooker.check_body_first_word();
        snooker.check_body_of_previous_for_matches();
        snooker.check_url();
        snooker.check_author_for_http();
        snooker.count_emails_previous_statuses();

        snooker.status = if snooker.score >= 1 {
            Status::Valid
        } else if snooker.score == 0 {
            Status::Moderate
        } else {
            Status::Spam
        };

        snooker
    }

    /// Scores every `<a href="...">` link found in the body and returns how many were found.
    ///
    /// Each link is passed through `process_single_link`. Afterwards, fewer than 2 links earns
    /// 2 points; otherwise one point is deducted per link.
    ///
    /// The returned count saturates at `i8::MAX`; the deduction always uses the real count.
    pub fn check_body_links(&mut self) -> i8 {
        // The captures borrow the string they were found in while `process_single_link` needs
        // `&mut self`, so the scan runs over a copy of the body.
        let body = self.comment.body.clone();

        // Counted in `usize`: an `i8` counter overflows on a comment with 128 or more links.
        let mut link_count: usize = 0;

        for c in A_TAG_RE.captures_iter(&body) {
            link_count += 1;
            process_single_link(c, self);
        }

        if link_count < 2 {
            self.score += 2;
        } else {
            self.score -= link_count as isize;
        }

        link_count.min(i8::MAX as usize) as i8
    }

    /// Scores the comment's `url` field, if one was provided, as a single link.
    ///
    /// Only the first URL-shaped substring of the field is examined; a field in which
    /// `URL_RE` finds nothing leaves the score unchanged.
    pub fn check_url(&mut self) {
        // Clone just the URL rather than the whole comment: the captures borrow this string
        // while `process_single_link` mutates `self`.
        if let Some(url) = self.comment.url.clone() {
            if let Some(c) = URL_RE.captures(&url) {
                process_single_link(c, self);
            }
        }
    }

    /// Scores the length of the body once HTML tags and surrounding whitespace are removed.
    ///
    /// - longer than 20 and no links: +2
    /// - longer than 20 with at least one link: +1
    /// - 20 or shorter: -1
    ///
    /// The length is measured with `str::len`, i.e. in bytes. `link_count` is the value
    /// returned by `check_body_links`.
    pub fn check_body_length(&mut self, link_count: i8) {
        let stripped = HTML_TAGS_RE.replace_all(&self.comment.body, "");
        let is_long = stripped.trim().len() > 20;

        self.score += if !is_long {
            -1
        } else if link_count == 0 {
            2
        } else {
            1
        };
    }

    /// Deducts one point for every entry of `spam_phrases::SPAM_PHRASES` found in the body.
    ///
    /// The body is lowercased before matching; the phrases are used as they are.
    pub fn check_body_for_spam_phrases(&mut self) {
        // Lowercase once instead of once per phrase.
        let lowercase_body = self.comment.body.to_lowercase();

        for p in spam_phrases::SPAM_PHRASES.iter() {
            if lowercase_body.contains(p) {
                self.score -= 1;
            }
        }
    }

    /// Deducts 10 points for each entry of `BODY_SPAM_FIRST_WORDS` contained in the first word
    /// of the body (HTML tags stripped, compared in lowercase).
    ///
    /// The test is a substring match, so trailing punctuation ("Nice!") does not prevent a hit.
    /// A body without any word (empty, whitespace only or tags only) leaves the score unchanged.
    pub fn check_body_first_word(&mut self) {
        let stripped = HTML_TAGS_RE.replace_all(&self.comment.body, "");

        let first_word = match stripped.split_whitespace().next() {
            Some(word) => word.to_lowercase(),
            // Nothing to judge; unwrapping here would panic on an empty body.
            None => return,
        };

        for w in BODY_SPAM_FIRST_WORDS.iter() {
            if first_word.contains(w) {
                self.score -= 10;
            }
        }
    }

    /// Deducts one point for every previous comment body that equals the current body.
    ///
    /// Both sides are trimmed and lowercased before they are compared. Does nothing when
    /// `previous_comment_bodies` is `None`.
    pub fn check_body_of_previous_for_matches(&mut self) {
        if let Some(ref previous_comments) = self.comment.previous_comment_bodies {
            let lowercase_body = self.comment.body.trim().to_lowercase();

            let duplicates = previous_comments
                .iter()
                .filter(|pc| pc.trim().to_lowercase() == lowercase_body)
                .count();

            self.score -= duplicates as isize;
        }
    }

    /// Deducts 2 points when the author name contains `http://` or `https://` (any case).
    pub fn check_author_for_http(&mut self) {
        if let Some(ref a) = self.comment.author {
            // Lowercase once and reuse it for both checks.
            let lowercase_author = a.to_lowercase();

            if lowercase_author.contains("http://") || lowercase_author.contains("https://") {
                self.score -= 2;
            }
        }
    }

    /// Adds the number of previously accepted comments for this email address to the score and
    /// subtracts the number of previously rejected ones. Missing counts are ignored.
    pub fn count_emails_previous_statuses(&mut self) {
        if let Some(accepted) = self.comment.previously_accepted_for_email {
            self.score += accepted;
        }

        if let Some(rejected) = self.comment.previously_rejected_for_email {
            self.score -= rejected;
        }
    }
}

/// Counts the runs of 5 or more consecutive consonants in `s`, as matched by `CONSONANTS_RE`.
///
/// A run that is exactly `https` (in any case) is not counted, so that a URL's scheme is not
/// held against it. The result saturates at `u8::MAX` instead of overflowing.
#[doc(hidden)]
pub fn count_consonant_collections(s: &str) -> u8 {
    let mut count: u8 = 0;

    for c in CONSONANTS_RE.captures_iter(s) {
        // `http` never needs excluding: it is only 4 letters long and so can't be a whole match
        // of a pattern that requires at least 5. The regex is case-insensitive, so the
        // exclusion of `https` has to be as well.
        if !c[0].eq_ignore_ascii_case("https") {
            // The input can be an arbitrarily long, user-supplied URL; don't let more than 255
            // runs overflow the counter.
            count = count.saturating_add(1);
        }
    }

    count
}

/// Applies the per-link rules to one URL match and deducts the penalties from `snooker.score`.
///
/// `c` must come from `A_TAG_RE` or `URL_RE`: capture group 1 is read as the whole URL and
/// capture group 4 as its TLD.
///
/// Penalties:
/// - the TLD is listed in `SPAM_TLDS` (compared ignoring ASCII case): -1
/// - each entry of `URL_SPAM_WORDS` contained in the lowercased URL: -1
/// - the URL is longer than 30 bytes: -1
/// - each run of 5 or more consonants reported by `count_consonant_collections`: -1
#[doc(hidden)]
fn process_single_link(c: Captures, snooker: &mut Snooker) {
    // Check for certain TLDs. The regex accepts upper-case letters in the TLD, so the
    // comparison must not depend on case.
    let tld = &c[4];

    if SPAM_TLDS.iter().any(|spam_tld| tld.eq_ignore_ascii_case(spam_tld)) {
        snooker.score -= 1;
    }

    // Check for certain words & characters. Lowercase once rather than once per word.
    let url = &c[1];
    let lowercase_url = url.to_lowercase();

    for word in URL_SPAM_WORDS.iter() {
        if lowercase_url.contains(word) {
            snooker.score -= 1;
        }
    }

    // Check the length of the URL:
    if url.len() > 30 {
        snooker.score -= 1;
    }

    // Check for 5 consonants or more in a row:
    snooker.score -= count_consonant_collections(url) as isize;
}

// End-to-end tests: each one builds a `Comment`, runs it through `Snooker::new` and checks the
// resulting score and status. The comment at the top of each test itemises the expected points.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn spam_1() {
        // Author contains "https://" → -2
        // Body contains 2 links → -2
        // Body is over 20 chars with 2 links → +1
        // Body starts with "Cool" → -10
        // One of the body URLs has a spammy TLD → -1
        //
        // Expected: -14

        // Note: the trailing `\` in the body literal is a line continuation, so the second tag
        // ends up as `<ahref=...>`. `A_TAG_RE` still matches it because `[^>]*` may match
        // nothing.
        let comment = Comment {
            author: Some("https://elliotekj.com".to_string()),
            url: None,
            body: String::from("
                <p>Cool, this <a href=\"https://elliotekj.com\">comment</a> has more <a\
                href=\"https://elliotekj.de\">than</a> 20 characters in it but contains\
                2 links.</p>
            "),
            previously_accepted_for_email: None,
            previously_rejected_for_email: None,
            previous_comment_bodies: None,
        };

        let snooker_result = Snooker::new(comment);
        assert_eq!(snooker_result.score, -14);
        assert_eq!(snooker_result.status, Status::Spam);
    }

    #[test]
    fn spam_2() {
        // Body is over 20 chars and contains no links → +2
        // Body has less than 2 links → +2
        // Body contains 2 spam phrases → -2
        // URL has "free" and one param in it → -2
        // URL is over 30 characters → -1
        // 2 previous comments by this email address have the same body → -2
        //
        // Expected: -3

        // Both previous bodies are identical to the body submitted below.
        let previous_comment_bodies = vec![
            String::from("
                <p>Have you been turned down? Get our special promotion</p>
            "),
            String::from("
                <p>Have you been turned down? Get our special promotion</p>
            "),
        ];

        let comment = Comment {
            author: Some("Elliot Jackson".to_string()),
            url: Some("http://someexample.com?getit=free".to_string()),
            body: String::from("
                <p>Have you been turned down? Get our special promotion</p>
            "),
            previously_accepted_for_email: None,
            previously_rejected_for_email: None,
            previous_comment_bodies: Some(previous_comment_bodies),
        };

        let snooker_result = Snooker::new(comment);
        assert_eq!(snooker_result.score, -3);
        assert_eq!(snooker_result.status, Status::Spam);
    }
}