/// Static description of one known web crawler.
///
/// Every field is either `'static` borrowed data or a plain `bool`, so values can be
/// written directly inside a `static` table without any allocation.
#[derive(Clone, Debug)]
pub struct CrawlerDefinition {
    /// Human-readable crawler name, e.g. `"Googlebot"`.
    pub name: &'static str,

    /// Pattern text describing the crawler's `User-Agent` header.
    ///
    /// NOTE(review): presumably compiled as a regular expression by the consumer;
    /// the compile/match site is not part of this definition — confirm.
    pub user_agent_pattern: &'static str,

    /// Pattern text describing the host names expected from a reverse-DNS lookup
    /// of the client address.
    ///
    /// NOTE(review): presumably a regular expression as well — confirm at the use site.
    pub reverse_dns_pattern: &'static str,

    /// Optional list of address ranges attributed to the crawler; `None` when no
    /// list is recorded.
    ///
    /// NOTE(review): the expected textual format (CIDR, single address, ...) is not
    /// established here — confirm with the code that parses it.
    pub ip_ranges: Option<&'static [&'static str]>,

    /// Flag stating whether this crawler's identity has to be verified.
    ///
    /// NOTE(review): how callers act on this flag is not visible in this definition.
    pub verification_required: bool,
}
/// Built-in table of recognised crawlers.
///
/// Entry order is part of the data: a consumer scanning the slice front to back
/// sees the entries in exactly the order written here.
///
/// Conventions used by the entries below:
/// * every pattern starts with the inline `(?i)` flag, except the catch-all `.*`;
/// * host patterns begin with an escaped dot and end with `$`, tying them to the
///   end of the host name at a label boundary;
/// * no entry records explicit address ranges (`ip_ranges` is always `None`).
///
/// NOTE(review): the reverse-DNS suffixes were not re-validated in this pass;
/// confirm each against the operator's published verification guidance
/// (in particular the Facebook, Twitter and DuckDuckGo entries).
pub static KNOWN_CRAWLERS: &[CrawlerDefinition] = &[
    // ---- Entries flagged `verification_required: true` ----------------------
    CrawlerDefinition {
        name: "Googlebot",
        user_agent_pattern: r"(?i)googlebot|google-inspectiontool|storebot-google",
        reverse_dns_pattern: r"(?i)\.(googlebot|google)\.com$",
        ip_ranges: None,
        verification_required: true,
    },
    CrawlerDefinition {
        name: "Bingbot",
        user_agent_pattern: r"(?i)bingbot|msnbot|bingpreview",
        reverse_dns_pattern: r"(?i)\.search\.msn\.com$",
        ip_ranges: None,
        verification_required: true,
    },
    CrawlerDefinition {
        name: "Baiduspider",
        user_agent_pattern: r"(?i)baiduspider",
        reverse_dns_pattern: r"(?i)\.crawl\.baidu\.(com|jp)$",
        ip_ranges: None,
        verification_required: true,
    },
    CrawlerDefinition {
        name: "YandexBot",
        user_agent_pattern: r"(?i)yandexbot",
        reverse_dns_pattern: r"(?i)\.yandex\.(com|ru|net)$",
        ip_ranges: None,
        verification_required: true,
    },
    CrawlerDefinition {
        name: "DuckDuckBot",
        user_agent_pattern: r"(?i)duckduckbot",
        reverse_dns_pattern: r"(?i)\.duckduckgo\.com$",
        ip_ranges: None,
        verification_required: true,
    },
    CrawlerDefinition {
        name: "Slurp",
        user_agent_pattern: r"(?i)slurp",
        reverse_dns_pattern: r"(?i)\.crawl\.yahoo\.net$",
        ip_ranges: None,
        verification_required: true,
    },
    CrawlerDefinition {
        name: "Facebookbot",
        user_agent_pattern: r"(?i)facebookexternalhit",
        reverse_dns_pattern: r"(?i)\.(facebook|fbsv)\.com$",
        ip_ranges: None,
        verification_required: true,
    },
    CrawlerDefinition {
        name: "Twitterbot",
        user_agent_pattern: r"(?i)twitterbot",
        reverse_dns_pattern: r"(?i)\.twitter\.com$",
        ip_ranges: None,
        verification_required: true,
    },
    CrawlerDefinition {
        name: "LinkedInBot",
        user_agent_pattern: r"(?i)linkedinbot",
        reverse_dns_pattern: r"(?i)\.linkedin\.com$",
        ip_ranges: None,
        verification_required: true,
    },
    CrawlerDefinition {
        name: "Applebot",
        user_agent_pattern: r"(?i)applebot",
        reverse_dns_pattern: r"(?i)\.applebot\.apple\.com$",
        ip_ranges: None,
        verification_required: true,
    },
    // ---- Entries flagged `verification_required: false` ---------------------
    CrawlerDefinition {
        name: "AhrefsBot",
        user_agent_pattern: r"(?i)ahrefsbot",
        reverse_dns_pattern: r"(?i)\.ahrefs\.com$",
        ip_ranges: None,
        verification_required: false,
    },
    CrawlerDefinition {
        name: "SemrushBot",
        user_agent_pattern: r"(?i)semrushbot",
        reverse_dns_pattern: r"(?i)\.semrush\.com$",
        ip_ranges: None,
        verification_required: false,
    },
    CrawlerDefinition {
        name: "MJ12bot",
        user_agent_pattern: r"(?i)mj12bot",
        reverse_dns_pattern: r"(?i)\.majestic12\.co\.uk$",
        ip_ranges: None,
        verification_required: false,
    },
    CrawlerDefinition {
        name: "DotBot",
        user_agent_pattern: r"(?i)dotbot",
        reverse_dns_pattern: r"(?i)\.opensiteexplorer\.com$",
        ip_ranges: None,
        verification_required: false,
    },
    CrawlerDefinition {
        name: "ScreamingFrog",
        user_agent_pattern: r"(?i)screaming frog seo spider",
        // `.*` accepts every host name, so this entry places no constraint on
        // the reverse-DNS result.
        reverse_dns_pattern: r".*",
        ip_ranges: None,
        verification_required: false,
    },
    CrawlerDefinition {
        name: "Pinterestbot",
        // Unanchored: any User-Agent containing "pinterest" (in any letter case)
        // satisfies this pattern.
        user_agent_pattern: r"(?i)pinterest",
        reverse_dns_pattern: r"(?i)\.pinterest\.com$",
        ip_ranges: None,
        verification_required: false,
    },
    CrawlerDefinition {
        name: "Slackbot",
        // The second alternative is already covered by the first (`slackbot` is
        // a prefix of `slackbot-linkexpanding`); kept verbatim.
        user_agent_pattern: r"(?i)slackbot|slackbot-linkexpanding",
        reverse_dns_pattern: r"(?i)\.slack\.com$",
        ip_ranges: None,
        verification_required: false,
    },
    CrawlerDefinition {
        name: "Discordbot",
        user_agent_pattern: r"(?i)discordbot",
        reverse_dns_pattern: r"(?i)\.discord\.com$",
        ip_ranges: None,
        verification_required: false,
    },
];