use hashbrown::HashSet;
use reqwest::blocking::Client;
use serde::Deserialize;
use std::env;
use std::fs;
use std::path::PathBuf;
/// Minimal projection of one item returned by the GitHub "repository
/// contents" API — just enough to identify the entry and to tell files
/// apart from directories.
#[derive(Debug, Deserialize)]
struct GithubContent {
// Entry name, e.g. "Ads" or "domains.txt".
name: String,
// Path relative to the repository root; used to build raw.githubusercontent.com URLs.
path: String,
// "file" or "dir". The API field is `type`, which is a Rust keyword,
// hence the serde rename.
#[serde(rename = "type")]
content_type: String,
}
/// Domains that appear in upstream ad/блock lists but that we deliberately
/// allow (CDNs, security vendors, dev tooling, major platforms). Entries in
/// this set are excluded when the generated blocklists are built below.
// NOTE(review): as written, `main` only consults this set when building the
// generic bad-websites set — confirm whether it should also apply to the
// ads/tracking sets, as the name suggests.
static WHITE_LIST_AD_DOMAINS: phf::Set<&'static str> = phf::phf_set! {
"anydesk.com",
"firstaidbeauty.com",
"teads.com",
"appchair.com",
"ninjacat.io",
"oceango.net",
"center.io",
"bing.com",
"unity3d.com",
"adguard.com",
"bitdefender.com",
"blogspot.com",
"bytedance.com",
"comcast.net",
"duckdns.org",
"dyndns.org",
"fontawesome.com",
"grammarly.com",
"onenote.com",
"opendns.com",
"surfshark.com",
"teamviewer.com",
"tencent.com",
"tiktok.com",
"yandex.net",
"zoho.com",
"tiktokcdn-us.com",
"tiktokcdn.com",
"tiktokv.com",
"tiktokrow-cdn.com",
"tiktokv.us",
"wpengine.com"
};
fn main() -> std::io::Result<()> {
let client = Client::new();
let mut unique_entries = HashSet::new();
let mut unique_ads_entries = HashSet::new();
let mut unique_tracking_entries = HashSet::new();
let mut unique_gambling_entries = HashSet::new();
let base_url = "https://api.github.com/repos/ShadowWhisperer/BlockLists/contents/RAW";
let response = client
.get(base_url)
.header("User-Agent", ua_generator::ua::spoof_ua())
.send()
.expect("Failed to fetch directory listing");
let contents: Vec<GithubContent> = response.json().expect("Failed to parse JSON response");
let skip_list = vec![
"Cryptocurrency",
"Dating",
"Fonts",
"Microsoft",
"Marketing",
"Wild_Tracking",
];
for item in contents {
if skip_list.contains(&item.name.as_str()) {
continue;
}
if item.content_type == "file" {
let file_url = format!(
"https://raw.githubusercontent.com/ShadowWhisperer/BlockLists/master/{}",
item.path
);
let file_response = client
.get(&file_url)
.send()
.expect("Failed to fetch file content");
let file_content = file_response.text().expect("Failed to read file content");
if item.name == "Wild_Tracking" || item.name == "Tracking" {
for line in file_content.lines() {
if !line.is_empty() {
unique_tracking_entries.insert(line.to_string());
}
}
} else if item.name == "Wild_Ads" || item.name == "Ads" {
for line in file_content.lines() {
if !line.is_empty() {
unique_ads_entries.insert(line.to_string());
}
}
} else if item.name == "Gambling" {
for line in file_content.lines() {
if !line.is_empty() {
unique_gambling_entries.insert(line.to_string());
}
}
} else {
for line in file_content.lines() {
if !line.is_empty() {
unique_entries.insert(line.to_string());
}
}
}
}
}
let base_url = "https://api.github.com/repos/badmojr/1Hosts/contents/Lite/";
let response = client
.get(base_url)
.header("User-Agent", ua_generator::ua::spoof_ua())
.send()
.expect("Failed to fetch directory listing");
let contents: Vec<GithubContent> = response.json().expect("Failed to parse JSON response");
let skip_list = vec!["rpz", "domains.wildcards", "wildcards", "unbound.conf"];
for item in contents {
if skip_list.contains(&item.name.as_str()) {
continue;
}
if item.content_type == "file" {
let file_url = format!(
"https://raw.githubusercontent.com/badmojr/1Hosts/master/{}",
item.path
);
let file_response = client
.get(&file_url)
.send()
.expect("Failed to fetch file content");
let file_content = file_response.text().expect("Failed to read file content");
if item.name == "domains.txt" {
for line in file_content.lines().skip(15) {
if !line.is_empty() {
unique_tracking_entries.insert(line.to_string());
}
}
} else if item.name == "adblock.txt" {
for line in file_content.lines().skip(15) {
if !line.is_empty() {
let mut ad_url = line.replacen("||", "", 1);
ad_url.pop();
unique_ads_entries.insert(ad_url);
}
}
}
}
}
let additional_url =
"https://raw.githubusercontent.com/spider-rs/bad_websites/main/websites.txt";
let response = client
.get(additional_url)
.send()
.expect("Failed to fetch additional file");
let additional_content = response
.text()
.expect("Failed to read additional file content");
for line in additional_content.lines() {
let entry = line.trim_matches(|c| c == '"' || c == ',').to_owned();
if !entry.is_empty() {
unique_entries.insert(entry);
}
}
let mut set = phf_codegen::Set::new();
for entry in unique_entries {
if !WHITE_LIST_AD_DOMAINS.contains(&entry) {
set.entry(entry);
}
}
let mut ads_set = phf_codegen::Set::new();
for entry in unique_ads_entries {
ads_set.entry(entry);
}
let mut tracking_set = phf_codegen::Set::new();
for entry in unique_tracking_entries {
tracking_set.entry(entry);
}
let mut gambling_set = phf_codegen::Set::new();
for entry in unique_gambling_entries {
gambling_set.entry(entry);
}
let out_dir = env::var("OUT_DIR").unwrap();
let dest_path = PathBuf::from(out_dir).join("bad_websites.rs");
fs::write(
&dest_path,
format!(
"/// Bad websites that we should not connect to.\n\
static BAD_WEBSITES: phf::Set<&'static str> = {};\n
/// Ads websites that we should not connect to.\n\
static ADS_WEBSITES: phf::Set<&'static str> = {};\n
/// Tracking websites that we should not connect to.\n\
static TRACKING_WEBSITES: phf::Set<&'static str> = {};\n
/// Gambling websites that we should not connect to.\n\
static GAMBLING_WEBSITES: phf::Set<&'static str> = {};",
set.build(),
ads_set.build(),
tracking_set.build(),
gambling_set.build()
),
)?;
Ok(())
}