use std::collections::HashSet;
use crate::{codec::name::Name, storage::blocklists::BlocklistFormat};
/// Common interface for turning the raw text of a blocklist into a set of names.
pub trait BlocklistParser {
/// Parses `text` and returns the set of [`Name`]s extracted from it.
///
/// The return type carries no error channel, so implementations have no way
/// to report malformed input to the caller.
fn parse(&self, text: &str) -> HashSet<Name>;
}
/// Strips comments and surrounding whitespace from a single blocklist line.
///
/// Everything from the first `#` onward is discarded and the remainder is
/// trimmed. Returns `None` when nothing is left, or when the remainder starts
/// with `!` (treated as a whole-line comment); otherwise returns the trimmed
/// sub-slice of `line`.
#[inline]
fn preprocess(line: &str) -> Option<&str> {
    let before_hash = match line.split_once('#') {
        Some((head, _comment)) => head,
        None => line,
    };
    let content = before_hash.trim();
    let is_entry = !content.is_empty() && !content.starts_with('!');
    is_entry.then_some(content)
}
#[derive(Debug, Clone, Copy, Default)]
pub struct HostsParser;
impl BlocklistParser for HostsParser {
fn parse(&self, text: &str) -> HashSet<Name> {
let mut names = HashSet::new();
for line in text.lines() {
let Some(content) = preprocess(line) else {
continue;
};
let mut fields = content.split_ascii_whitespace();
fields.next();
for field in fields {
if let Ok(name) = field.parse::<Name>() {
names.insert(name);
}
}
}
names
}
}
#[derive(Debug, Clone, Copy, Default)]
pub struct DomainListParser;
impl BlocklistParser for DomainListParser {
fn parse(&self, text: &str) -> HashSet<Name> {
let mut names = HashSet::new();
for line in text.lines() {
let Some(content) = preprocess(line) else {
continue;
};
if content.contains(char::is_whitespace) {
continue;
}
if let Ok(name) = content.parse::<Name>() {
names.insert(name);
}
}
names
}
}
/// A parser selected at runtime; each variant wraps one concrete parser type.
#[derive(Debug, Clone, Copy)]
pub enum Parser {
/// Wraps a [`HostsParser`].
Hosts(HostsParser),
/// Wraps a [`DomainListParser`].
DomainList(DomainListParser),
}
impl From<BlocklistFormat> for Parser {
fn from(format: BlocklistFormat) -> Self {
match format {
BlocklistFormat::Hosts => Self::Hosts(HostsParser),
BlocklistFormat::DomainList => Self::DomainList(DomainListParser),
}
}
}
impl BlocklistParser for Parser {
fn parse(&self, text: &str) -> HashSet<Name> {
match self {
Self::Hosts(p) => p.parse(text),
Self::DomainList(p) => p.parse(text),
}
}
}
// Unit tests for the hosts and domain-list parsers and for the `Parser` dispatcher.
#[cfg(test)]
mod tests {
use super::*;
use crate::storage::blocklists::BlocklistFormat;
// Test helper: parses `s` into a `Name`, panicking if the literal is not a valid name.
fn name(s: &str) -> Name {
s.parse().expect("valid name in test helper")
}
// ---- HostsParser: basic entries ----
#[test]
fn hosts_basic_0000_entry() {
let set = HostsParser.parse("0.0.0.0 ads.example.com\n");
assert!(
set.contains(&name("ads.example.com")),
"expected ads.example.com. in set; got {set:?}"
);
assert_eq!(set.len(), 1);
}
#[test]
fn hosts_basic_loopback_entry() {
let set = HostsParser.parse("127.0.0.1 tracker.example.org\n");
assert!(set.contains(&name("tracker.example.org")));
assert_eq!(set.len(), 1);
}
// Differently-cased input is expected to compare equal to the lowercase name.
#[test]
fn hosts_mixed_case_normalizes() {
let set = HostsParser.parse("0.0.0.0 ADS.Example.COM\n");
assert!(set.contains(&name("ads.example.com")));
assert_eq!(set.len(), 1);
}
// Every field after the first one on a line is treated as a hostname.
#[test]
fn hosts_multiple_hostnames_per_line() {
let set = HostsParser.parse("0.0.0.0 a.example.com b.example.com\n");
assert!(set.contains(&name("a.example.com")));
assert!(set.contains(&name("b.example.com")));
assert_eq!(set.len(), 2);
}
// ---- HostsParser: comments and blank lines ----
#[test]
fn hosts_hash_comment_line_ignored() {
let set = HostsParser.parse("# This is a comment\n0.0.0.0 ads.example.com\n");
assert_eq!(set.len(), 1);
assert!(set.contains(&name("ads.example.com")));
}
#[test]
fn hosts_exclamation_comment_line_ignored() {
let set = HostsParser.parse("! This is also a comment\n0.0.0.0 ads.example.com\n");
assert_eq!(set.len(), 1);
assert!(set.contains(&name("ads.example.com")));
}
#[test]
fn hosts_blank_line_ignored() {
let set = HostsParser.parse("\n\n0.0.0.0 ads.example.com\n\n");
assert_eq!(set.len(), 1);
}
#[test]
fn hosts_whitespace_only_line_ignored() {
let set = HostsParser.parse(" \t \n0.0.0.0 ads.example.com\n");
assert_eq!(set.len(), 1);
}
// A trailing `# ...` comment must not prevent the entry before it from being parsed.
#[test]
fn hosts_inline_comment_stripped() {
let set = HostsParser.parse("0.0.0.0 ads.example.com # this is tracked\n");
assert!(
set.contains(&name("ads.example.com")),
"domain before inline comment must still be parsed"
);
assert_eq!(set.len(), 1);
}
// ---- HostsParser: invalid entries ----
// `foo..bar` contains an empty label and is expected to be rejected.
#[test]
fn hosts_empty_label_skipped() {
let set = HostsParser.parse("0.0.0.0 foo..bar\n0.0.0.0 valid.example.com\n");
assert!(
!set.contains(&name("foo.")),
"invalid domain must not appear"
);
assert!(set.contains(&name("valid.example.com")));
assert_eq!(set.len(), 1, "only the valid entry should be present");
}
// A 64-character label is expected to be rejected, leaving only the valid entry.
#[test]
fn hosts_overlength_label_skipped() {
let long = "a".repeat(64);
let input = format!("0.0.0.0 {long}.example.com\n0.0.0.0 valid.example.com\n");
let set = HostsParser.parse(&input);
assert!(set.contains(&name("valid.example.com")));
assert_eq!(set.len(), 1);
}
// The first field is always discarded, so a line with only one field yields nothing.
#[test]
fn hosts_ip_only_line_yields_no_entry() {
let set = HostsParser.parse("0.0.0.0\n");
assert!(set.is_empty(), "IP-only line must yield no entries");
}
// An invalid field does not discard the valid fields on the same line.
#[test]
fn hosts_mixed_valid_invalid_fields() {
let set = HostsParser.parse("0.0.0.0 good.example.com bad..domain\n");
assert!(set.contains(&name("good.example.com")));
assert_eq!(set.len(), 1);
}
// ---- DomainListParser ----
#[test]
fn domain_list_basic_entries() {
let text = "ads.example.com\ntracker.example.org\n";
let set = DomainListParser.parse(text);
assert!(set.contains(&name("ads.example.com")));
assert!(set.contains(&name("tracker.example.org")));
assert_eq!(set.len(), 2);
}
#[test]
fn domain_list_mixed_case_normalizes() {
let set = DomainListParser.parse("Tracker.Example.Org\n");
assert!(set.contains(&name("tracker.example.org")));
assert_eq!(set.len(), 1);
}
#[test]
fn domain_list_hash_comment_ignored() {
let set = DomainListParser.parse("# comment\nads.example.com\n");
assert_eq!(set.len(), 1);
assert!(set.contains(&name("ads.example.com")));
}
#[test]
fn domain_list_exclamation_comment_ignored() {
let set = DomainListParser.parse("! comment\nads.example.com\n");
assert_eq!(set.len(), 1);
assert!(set.contains(&name("ads.example.com")));
}
#[test]
fn domain_list_blank_line_ignored() {
let set = DomainListParser.parse("\nads.example.com\n\n");
assert_eq!(set.len(), 1);
}
#[test]
fn domain_list_inline_comment_stripped() {
let set = DomainListParser.parse("ads.example.com # this domain is blocked\n");
assert!(set.contains(&name("ads.example.com")));
assert_eq!(set.len(), 1);
}
// A line that still contains whitespace after preprocessing is skipped entirely.
#[test]
fn domain_list_embedded_space_skipped() {
let set = DomainListParser.parse("foo bar\nads.example.com\n");
assert!(set.contains(&name("ads.example.com")));
assert_eq!(set.len(), 1, "line with spaces must be skipped");
}
#[test]
fn domain_list_empty_label_skipped() {
let set = DomainListParser.parse("foo..bar\nads.example.com\n");
assert!(set.contains(&name("ads.example.com")));
assert_eq!(set.len(), 1);
}
#[test]
fn domain_list_overlength_label_skipped() {
let long = "a".repeat(64);
let input = format!("{long}.example.com\nads.example.com\n");
let set = DomainListParser.parse(&input);
assert!(set.contains(&name("ads.example.com")));
assert_eq!(set.len(), 1);
}
// ---- Cross-format equivalence and de-duplication ----
#[test]
fn hosts_and_domain_list_equivalent_content_match() {
let hosts_text = "0.0.0.0 ads.example.com\n";
let domain_list_text = "ads.example.com\n";
let hosts_set = HostsParser.parse(hosts_text);
let dl_set = DomainListParser.parse(domain_list_text);
assert_eq!(
hosts_set, dl_set,
"equivalent content must produce identical sets"
);
}
#[test]
fn hosts_deduplicates_same_domain() {
let set = HostsParser.parse("0.0.0.0 ads.example.com\n0.0.0.0 ads.example.com\n");
assert_eq!(set.len(), 1);
}
#[test]
fn domain_list_deduplicates_case_insensitive() {
let set = DomainListParser.parse("ADS.Example.com\nads.example.com\n");
assert_eq!(set.len(), 1);
}
#[test]
fn hosts_deduplicates_case_insensitive() {
let set = HostsParser.parse("0.0.0.0 ADS.EXAMPLE.COM\n0.0.0.0 ads.example.com\n");
assert_eq!(set.len(), 1);
}
// ---- CRLF line endings ----
#[test]
fn hosts_crlf_line_endings() {
let set = HostsParser.parse("0.0.0.0 ads.example.com\r\n0.0.0.0 tracker.example.org\r\n");
assert!(set.contains(&name("ads.example.com")));
assert!(set.contains(&name("tracker.example.org")));
assert_eq!(set.len(), 2);
}
#[test]
fn domain_list_crlf_line_endings() {
let set = DomainListParser.parse("ads.example.com\r\ntracker.example.org\r\n");
assert!(set.contains(&name("ads.example.com")));
assert!(set.contains(&name("tracker.example.org")));
assert_eq!(set.len(), 2);
}
// ---- `Parser` dispatch built from a `BlocklistFormat` ----
#[test]
fn parser_dispatch_hosts_format() {
let parser = Parser::from(BlocklistFormat::Hosts);
let set = parser.parse("0.0.0.0 ads.example.com\n");
assert!(set.contains(&name("ads.example.com")));
assert_eq!(set.len(), 1);
}
#[test]
fn parser_dispatch_domain_list_format() {
let parser = Parser::from(BlocklistFormat::DomainList);
let set = parser.parse("ads.example.com\ntracker.example.org\n");
assert!(set.contains(&name("ads.example.com")));
assert!(set.contains(&name("tracker.example.org")));
assert_eq!(set.len(), 2);
}
#[test]
fn parser_dispatch_hosts_discards_ip_field() {
let parser = Parser::from(BlocklistFormat::Hosts);
let set = parser.parse("0.0.0.0 ads.example.com\n");
assert!(set.contains(&name("ads.example.com")));
assert_eq!(set.len(), 1, "the IP field must be discarded, not included");
}
// ---- Larger input: 1,000 distinct hosts must all be retained ----
#[test]
fn hosts_large_input_parses_correctly() {
let mut text = String::new();
for i in 0..1_000u32 {
text.push_str(&format!("0.0.0.0 host{i}.example.com\n"));
}
let set = HostsParser.parse(&text);
assert_eq!(set.len(), 1_000);
assert!(set.contains(&name("host0.example.com")));
assert!(set.contains(&name("host999.example.com")));
}
}