#![doc = include_str!("../README.md")]
use aho_corasick::AhoCorasick;
use regex::Regex;
use thiserror::Error;
use url::Url;
mod constants {
pub const BEGIN_OF_SCHEME: u8 = 0x01;
pub const BEGIN_OF_HOST: u8 = 0x02;
pub const BEGIN_OF_PATH: u8 = 0x03;
pub const HOST_DELIMITER: u8 = b'.';
pub const PATH_DELIMITER: u8 = b'/';
}
#[derive(Debug, Error)]
pub enum BuildError {
#[error("syntax error at line {0}: {1}")]
Syntax(usize, SyntaxError),
#[error("error building Aho-Corasick: {0}")]
AhoCorasick(#[from] aho_corasick::BuildError),
}
#[derive(Debug, Error)]
pub enum SyntaxError {
#[error("invalid rule syntax")]
Rule,
#[error("error parsing regular expression: {0}")]
Regex(regex::Error),
#[error("error parsing URL: {0}")]
Url(url::ParseError),
}
#[derive(Debug)]
pub struct GfwList {
positive_ac: AhoCorasick,
negative_ac: AhoCorasick,
positive_rules: Vec<String>,
regex_patterns: Vec<(Regex, String)>,
}
fn append_host(acc: &mut Vec<u8>, host: &[u8]) {
if !host.starts_with(&[constants::HOST_DELIMITER]) {
acc.push(constants::HOST_DELIMITER);
}
acc.extend(host);
}
fn append_path(acc: &mut Vec<u8>, path: &[u8]) {
acc.extend(path);
if !path.ends_with(&[constants::PATH_DELIMITER]) {
acc.push(constants::PATH_DELIMITER);
}
}
fn append_host_path(acc: &mut Vec<u8>, input: &[u8]) {
let pos = input
.iter()
.position(|&b| b == constants::PATH_DELIMITER)
.unwrap_or(input.len());
append_host(acc, &input[..pos]);
acc.push(constants::BEGIN_OF_PATH);
append_path(acc, &input[pos..]);
}
fn append_url<const FULL_MODE: bool>(acc: &mut Vec<u8>, input: &str) -> Result<(), url::ParseError> {
let url = Url::parse(input)?;
let host_str = url.host_str().ok_or(url::ParseError::EmptyHost)?;
acc.push(constants::BEGIN_OF_SCHEME);
acc.extend(url.scheme().as_bytes());
acc.push(constants::BEGIN_OF_HOST);
append_host(acc, host_str.as_bytes());
let path = url.path();
if FULL_MODE || path != "/" || input.ends_with('/') {
acc.push(constants::BEGIN_OF_PATH);
append_path(acc, url.path().as_bytes());
}
Ok(())
}
impl GfwList {
pub fn from(input: &str) -> Result<Self, BuildError> {
let mut positive_rules: Vec<String> = vec![];
let mut negative_rules: Vec<String> = vec![];
let mut positive_patterns: Vec<Vec<u8>> = vec![];
let mut negative_patterns: Vec<Vec<u8>> = vec![];
let mut regex_patterns: Vec<(Regex, String)> = vec![];
for (line_index, mut line_str) in input.lines().enumerate() {
if line_str.is_empty() || line_str.starts_with('!') {
continue;
}
if line_str.starts_with('/') {
if line_str.len() == 1 || !line_str.ends_with('/') {
return Err(BuildError::Syntax(line_index, SyntaxError::Rule));
}
let regex = Regex::new(&line_str[1..line_str.len() - 1])
.map_err(|e| BuildError::Syntax(line_index, SyntaxError::Regex(e)))?;
regex_patterns.push((regex, line_str.to_string()));
continue;
}
let (patterns, rules) = if line_str.starts_with("@@") {
line_str = &line_str[2..];
(&mut negative_patterns, &mut negative_rules)
} else {
(&mut positive_patterns, &mut positive_rules)
};
let line = line_str.as_bytes();
let mut needle: Vec<u8> = vec![];
if line[0] == b'.' {
append_host_path(&mut needle, &line[1..]);
} else if line[0] != b'|' {
needle.push(constants::BEGIN_OF_HOST);
append_host_path(&mut needle, line);
} else if line.get(1) == Some(&b'|') {
append_host_path(&mut needle, &line[2..]);
} else {
append_url::<false>(&mut needle, &line_str[1..])
.map_err(|e| BuildError::Syntax(line_index, SyntaxError::Url(e)))?;
}
patterns.push(needle);
rules.push(line_str.to_string());
}
Ok(GfwList {
positive_ac: AhoCorasick::new(positive_patterns).map_err(BuildError::AhoCorasick)?,
negative_ac: AhoCorasick::new(negative_patterns).map_err(BuildError::AhoCorasick)?,
positive_rules,
regex_patterns,
})
}
pub fn test(&self, input: &str) -> Result<Option<&str>, url::ParseError> {
for (regex, rule) in &self.regex_patterns {
if regex.is_match(input) {
return Ok(Some(rule));
}
}
let mut haystack: Vec<u8> = vec![];
append_url::<true>(&mut haystack, input)?;
if self.negative_ac.find(&haystack).is_some() {
return Ok(None);
}
if let Some(match_) = self.positive_ac.find(&haystack) {
return Ok(Some(&self.positive_rules[match_.pattern().as_usize()]));
}
Ok(None)
}
pub fn len(&self) -> usize {
self.positive_ac.patterns_len() + self.negative_ac.patterns_len() + self.regex_patterns.len()
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_1() {
let gfw = GfwList::from("|http://example.com").unwrap();
assert!(gfw.test("http://example.com").unwrap().is_some());
assert!(gfw.test("http://example.com/page").unwrap().is_some());
assert!(gfw.test("http://example.com.co").unwrap().is_some());
assert!(gfw.test("http://www.example.com").unwrap().is_none());
assert!(gfw.test("https://example.com/page").unwrap().is_none());
assert!(gfw.test("https://example.com").unwrap().is_none());
assert!(gfw.test("https://www.example.com").unwrap().is_none());
assert!(gfw.test("https://example.com.co").unwrap().is_none());
}
#[test]
fn test_2() {
let gfw = GfwList::from("||example.com").unwrap();
assert!(gfw.test("http://example.com").unwrap().is_some());
assert!(gfw.test("http://www.example.com").unwrap().is_some());
assert!(gfw.test("https://example.com").unwrap().is_some());
assert!(gfw.test("https://www.example.com").unwrap().is_some());
assert!(gfw.test("http://anotherexample.com").unwrap().is_none());
assert!(gfw.test("https://anotherexample.com").unwrap().is_none());
assert!(gfw.test("http://example.com.co").unwrap().is_none());
assert!(gfw.test("https://example.com.co").unwrap().is_none());
}
#[test]
fn test_3() {
let gfw = GfwList::from(".example.com\n@@|http://sub.example.com").unwrap();
assert!(gfw.test("http://example.com").unwrap().is_some());
assert!(gfw.test("http://www.sub.example.com").unwrap().is_some());
assert!(gfw.test("https://sub.example.com").unwrap().is_some());
assert!(gfw.test("https://www.example.com").unwrap().is_some());
assert!(gfw.test("http://sub.example.com").unwrap().is_none());
assert!(gfw.test("http://sub.example.com/page").unwrap().is_none());
}
}