hyperi-rustlib 2.2.1

Opinionated Rust framework for high-throughput data pipelines at PB scale. Auto-wires configuration, logging, metrics, tracing, health checks, and graceful shutdown — distilled from many years of production infrastructure experience.
Documentation
# robots.txt — HyperI AI Training Crawler Blocklist
#
# This file blocks known AI model training crawlers while permitting
# standard search engine indexing (Googlebot, Bingbot, etc.).
#
# Based on: https://github.com/ai-robots-txt/ai.robots.txt
# Policy: https://github.com/hyperi-io/licensing/blob/main/AI-TRAINING-POLICY.md
#
# Last updated: 2026-03-31

# --- AI Model Training Crawlers ---

User-agent: AI2Bot
User-agent: AI2Bot-DeepResearchEval
User-agent: Ai2Bot-Dolma
User-agent: anthropic-ai
User-agent: Applebot-Extended
User-agent: Bytespider
User-agent: CCBot
User-agent: ChatGLM-Spider
User-agent: ClaudeBot
User-agent: Claude-Web
User-agent: cohere-ai
User-agent: cohere-training-data-crawler
User-agent: Crawl4AI
User-agent: Crawlspace
User-agent: Diffbot
User-agent: FacebookBot
User-agent: facebookexternalhit
User-agent: FirecrawlAgent
User-agent: Google-Extended
User-agent: GoogleOther
User-agent: GoogleOther-Image
User-agent: GoogleOther-Video
User-agent: GPTBot
User-agent: img2dataset
User-agent: laion-huggingface-processor
User-agent: LAIONDownloader
User-agent: meta-externalagent
User-agent: Meta-ExternalAgent
User-agent: meta-externalfetcher
User-agent: Meta-ExternalFetcher
User-agent: meta-webindexer
User-agent: MistralAI-User
User-agent: MistralAI-User/1.0
User-agent: Omgilibot
User-agent: omgili
User-agent: OpenAI
User-agent: PanguBot
User-agent: Panscient
User-agent: panscient.com
User-agent: PetalBot
User-agent: Scrapy
User-agent: SBIntuitionsBot
User-agent: Timpibot
User-agent: VelenPublicWebCrawler
User-agent: Webzio-Extended
User-agent: webzio-extended
Disallow: /

# --- AI Data Collection / Research Crawlers ---

User-agent: AmazonBuyForMe
User-agent: Amazonbot
User-agent: ApifyBot
User-agent: ApifyWebsiteContentCrawler
User-agent: bedrockbot
User-agent: Cloudflare-AutoRAG
User-agent: CloudVertexBot
User-agent: DeepSeekBot
User-agent: Devin
User-agent: Google-CloudVertexBot
User-agent: Google-Firebase
User-agent: Google-NotebookLM
User-agent: NotebookLM
User-agent: ISSCyberRiskCrawler
User-agent: KunatoCrawler
User-agent: NovaAct
User-agent: Operator
User-agent: TikTokSpider
User-agent: WRTNBot
Disallow: /

# --- AI Search / Chat Agents (block to prevent content ingestion) ---
# These are user-initiated AI search agents. Blocking them prevents content
# from being retrieved, and from potentially being cached or indexed by AI systems.

User-agent: ChatGPT-User
User-agent: Claude-SearchBot
User-agent: Claude-User
User-agent: DuckAssistBot
User-agent: Google-Agent
User-agent: GoogleAgent-Mariner
User-agent: Gemini-Deep-Research
User-agent: iAskBot
User-agent: iaskspider
User-agent: iaskspider/2.0
User-agent: OAI-SearchBot
User-agent: PerplexityBot
User-agent: Perplexity-User
User-agent: PhindBot
User-agent: YouBot
Disallow: /

# --- Standard Search Engines (ALLOWED) ---
# These crawlers are permitted to index content for search results.

User-agent: Googlebot
Allow: /

User-agent: Bingbot
Allow: /

User-agent: Slurp
Allow: /

User-agent: DuckDuckBot
Allow: /

User-agent: Baiduspider
Allow: /

User-agent: YandexBot
Allow: /

# --- Default: Allow all other crawlers ---
# Unknown crawlers are allowed by default. The AI training restriction
# in the LICENSE file applies regardless of robots.txt compliance.

User-agent: *
Allow: /