# Default robots.txt — Conexcol cPanel servers
# Served automatically for sites without their own robots.txt
# Customers can override by placing their own robots.txt in public_html
# Doc: docs/BOT-CRAWLER-POLICY-2026-03-03.md
# Updated: 2026-03-04 — Allow retrieval/search bots, block only training crawlers

# --- AI Training Crawlers (BLOCKED) ---
# These bots absorb content into model weights permanently.
# Industry default: block (Cloudflare, 79% of major publishers)

User-agent: GPTBot
Disallow: /

User-agent: CCBot
Disallow: /

User-agent: ClaudeBot
Disallow: /

User-agent: anthropic-ai
Disallow: /

User-agent: Bytespider
Disallow: /

User-agent: PetalBot
Disallow: /

User-agent: meta-externalagent
Disallow: /

User-agent: Applebot-Extended
Disallow: /

User-agent: Google-Extended
Disallow: /

User-agent: Amazonbot
Disallow: /

User-agent: FacebookBot
Disallow: /

User-agent: cohere-ai
Disallow: /

User-agent: PerplexityBot
Disallow: /

# --- SEO Scrapers (BLOCKED) ---
# Aggressive scrapers that generate excessive load with no benefit to sites

User-agent: AhrefsBot
Disallow: /

User-agent: SemrushBot
Disallow: /

User-agent: MJ12bot
Disallow: /

User-agent: DotBot
Disallow: /

User-agent: BLEXBot
Disallow: /

User-agent: DataForSeoBot
Disallow: /

# --- AI Search/Retrieval Bots (ALLOWED) ---
# These fetch individual pages on-demand, cite with attribution and links.
# Low volume (~2% of AI traffic). Benefit: sites appear in AI search answers.

User-agent: ChatGPT-User
Allow: /

User-agent: OAI-SearchBot
Allow: /

User-agent: Claude-User
Allow: /

User-agent: Claude-SearchBot
Allow: /

# --- Default for all other bots ---

User-agent: *
Allow: /
Crawl-delay: 10
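# Note: Crawl-delay is a non-standard directive. Bing and some other
# crawlers honor it, but Googlebot ignores it and paces itself automatically.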
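# Example override (hypothetical, for illustration only): a customer who
# wants to re-allow GPTBot on their own site would upload a
# public_html/robots.txt such as:
#
#   User-agent: GPTBot
#   Allow: /
#
#   User-agent: *
#   Allow: /
#
# A customer file replaces this default entirely; none of the rules above
# carry over. One way to confirm which file a site is actually serving:
#
#   curl -s https://example.com/robots.txt | head -n 5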