# robots.txt — HyperI AI Training Crawler Blocklist
#
# This file blocks known AI model training crawlers while permitting
# standard search engine indexing (Googlebot, Bingbot, etc.).
#
# Based on: https://github.com/ai-robots-txt/ai.robots.txt
# Policy: https://github.com/hyperi-io/licensing/blob/main/AI-TRAINING-POLICY.md
#
# Last updated: 2026-03-31
# --- AI Model Training Crawlers ---
User-agent: AI2Bot
User-agent: AI2Bot-DeepResearchEval
User-agent: Ai2Bot-Dolma
User-agent: anthropic-ai
User-agent: Applebot-Extended
User-agent: Bytespider
User-agent: CCBot
User-agent: ChatGLM-Spider
User-agent: ClaudeBot
User-agent: Claude-Web
User-agent: cohere-ai
User-agent: cohere-training-data-crawler
User-agent: Crawl4AI
User-agent: Crawlspace
User-agent: Diffbot
User-agent: FacebookBot
User-agent: facebookexternalhit
User-agent: FirecrawlAgent
User-agent: Google-Extended
User-agent: GoogleOther
User-agent: GoogleOther-Image
User-agent: GoogleOther-Video
User-agent: GPTBot
User-agent: img2dataset
User-agent: laion-huggingface-processor
User-agent: LAIONDownloader
User-agent: meta-externalagent
User-agent: Meta-ExternalAgent
User-agent: meta-externalfetcher
User-agent: Meta-ExternalFetcher
User-agent: meta-webindexer
User-agent: MistralAI-User
User-agent: MistralAI-User/1.0
User-agent: Omgilibot
User-agent: omgili
User-agent: OpenAI
User-agent: PanguBot
User-agent: Panscient
User-agent: panscient.com
User-agent: PetalBot
User-agent: Scrapy
User-agent: SBIntuitionsBot
User-agent: Timpibot
User-agent: VelenPublicWebCrawler
User-agent: Webzio-Extended
User-agent: webzio-extended
Disallow: /

# --- AI Data Collection / Research Crawlers ---
User-agent: AmazonBuyForMe
User-agent: Amazonbot
User-agent: ApifyBot
User-agent: ApifyWebsiteContentCrawler
User-agent: bedrockbot
User-agent: Cloudflare-AutoRAG
User-agent: CloudVertexBot
User-agent: DeepSeekBot
User-agent: Devin
User-agent: Google-CloudVertexBot
User-agent: Google-Firebase
User-agent: Google-NotebookLM
User-agent: NotebookLM
User-agent: ISSCyberRiskCrawler
User-agent: KunatoCrawler
User-agent: NovaAct
User-agent: Operator
User-agent: TikTokSpider
User-agent: WRTNBot
Disallow: /

# --- AI Search / Chat Agents (block to prevent content ingestion) ---
# These are user-initiated AI search agents. Blocking prevents
# content from being retrieved and potentially cached/indexed by AI systems.
User-agent: ChatGPT-User
User-agent: Claude-SearchBot
User-agent: Claude-User
User-agent: DuckAssistBot
User-agent: Google-Agent
User-agent: GoogleAgent-Mariner
User-agent: Gemini-Deep-Research
User-agent: iAskBot
User-agent: iaskspider
User-agent: iaskspider/2.0
User-agent: OAI-SearchBot
User-agent: PerplexityBot
User-agent: Perplexity-User
User-agent: PhindBot
User-agent: YouBot
Disallow: /

# --- Standard Search Engines (ALLOWED) ---
# These crawlers are permitted to index content for search results.
User-agent: Googlebot
Allow: /

User-agent: Bingbot
Allow: /

User-agent: Slurp
Allow: /

User-agent: DuckDuckBot
Allow: /

User-agent: Baiduspider
Allow: /

User-agent: YandexBot
Allow: /

# --- Default: Allow all other crawlers ---
# Unknown crawlers are allowed by default. The AI training restriction
# in the LICENSE file applies regardless of robots.txt compliance.
User-agent: *
Allow: /