# JediPic — robots.txt
# Crawler instructions. Sitemap location is the canonical XML sitemap.
# Assistant inventory: https://jedipic.com/llms.txt
#
# Strategy:
#   - General web search crawlers (Google, Bing, DuckDuckGo, Yandex, Apple): allow
#     everything except admin / API / private uploads.
#   - LLM crawlers that drive live AI answers (ChatGPT-User, OAI-SearchBot,
#     PerplexityBot, ClaudeBot, anthropic-ai, Google-Extended,
#     Applebot-Extended): ALLOW so JediPic appears in AI answers when shoppers
#     ask "gifts for X".
#   - LLM training crawlers without product value (GPTBot, CCBot, Bytespider,
#     Meta-ExternalAgent, Amazonbot): Disallow to avoid free training-set
#     ingestion.
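#   - "User-agent: *" is intentionally permissive and acts as the fallback
#     group. Per the spec (RFC 9309) a crawler obeys only the most specific
#     group that names it, so each named group below fully replaces (it does
#     not extend) the "*" rules for that bot.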

# Fallback group: applies to every crawler that is not named in a more
# specific group further down this file.
#
# The Disallow rules come BEFORE the catch-all "Allow: /". Under RFC 9309
# (longest match wins) the order is irrelevant, but parsers that evaluate
# rules first-match-wins would otherwise hit "Allow: /" first and treat
# every private path as crawlable.
User-agent: *
# Prefix match: "/admin" already covers /admin/ and the /admin-*.html pages;
# the explicit entries are kept so each protected page stays visible here.
Disallow: /admin
Disallow: /admin/
Disallow: /admin-console-v2.html
Disallow: /admin-dashboard.html
Disallow: /admin-login.html
Disallow: /api/
Disallow: /components/
Disallow: /pages/auth/
Disallow: /pages/onboarding/
Disallow: /shop-manager
Disallow: /shop-manager/
Disallow: /templates/
Disallow: /legacy/
Disallow: /uploads/private/
Disallow: /data/
Disallow: /scripts/
Disallow: /docs/
# Everything not listed above is crawlable.
Allow: /
# Non-standard directive (not part of RFC 9309); crawlers that do not
# support it, such as Googlebot, ignore it.
Crawl-delay: 1

# ───── AI / LLM crawlers ─────
#
# These directives are advisory; not all crawlers honour them. The intent is
# explicit and matches the public guidance each vendor publishes.

# Allowed AI crawlers — one shared group.
#
# A crawler that matches a named group ignores the "User-agent: *" group
# entirely, so a bare "Allow: /" here would open /admin, /api/,
# /uploads/private/ etc. to these bots. The private-path Disallow list is
# therefore repeated for this group; keep it in sync with "User-agent: *".
# Do not insert blank lines inside the group: some parsers treat a blank
# line as a group boundary.
#
# Anthropic Claude (live retrieval for Claude answers + training).
User-agent: ClaudeBot
User-agent: anthropic-ai
User-agent: Claude-Web
# Perplexity AI (drives live AI search results).
User-agent: PerplexityBot
# Google-Extended is a control token for Google's generative-AI products
# (Gemini training and grounding). Per Google's crawler documentation it does
# not affect inclusion or ranking in Google Search. Googlebot has no group of
# its own in this file and follows the "User-agent: *" rules.
# NOTE(review): AI Overviews are a Search feature and presumably follow the
# Googlebot rules rather than this token — confirm against Google's docs.
User-agent: Google-Extended
# Apple Intelligence + Siri. Applebot-Extended is Apple's AI-specific token.
User-agent: Applebot-Extended
# OpenAI live retrieval — used when a ChatGPT user clicks "Search the web"
# or asks a real-time question. Distinct from training crawlers.
User-agent: ChatGPT-User
# OpenAI search index crawler (Search inside ChatGPT product).
User-agent: OAI-SearchBot
# Cohere live retrieval.
User-agent: cohere-ai
# You.com search.
User-agent: YouBot
# Diffbot — used by many LLM aggregators.
User-agent: Diffbot
# Same private paths as the "User-agent: *" group; Disallow rules first so
# first-match parsers also honour them.
Disallow: /admin
Disallow: /admin/
Disallow: /admin-console-v2.html
Disallow: /admin-dashboard.html
Disallow: /admin-login.html
Disallow: /api/
Disallow: /components/
Disallow: /pages/auth/
Disallow: /pages/onboarding/
Disallow: /shop-manager
Disallow: /shop-manager/
Disallow: /templates/
Disallow: /legacy/
Disallow: /uploads/private/
Disallow: /data/
Disallow: /scripts/
Disallow: /docs/
# All public content is open to these crawlers.
Allow: /

# ───── Blocked: training-only crawlers, low-value scrapers ─────

# Fully blocked crawlers — one shared group, a single site-wide Disallow.
# No blank lines inside the group: some parsers treat one as a group boundary.
#
# GPTBot: OpenAI training crawler (GPT models). Not the same agent as
# ChatGPT-User, which performs live retrieval. Training-set ingestion stays
# blocked until we have a partnership.
User-agent: GPTBot
# CCBot: Common Crawl, a training corpus for many LLM vendors. Blocking it
# keeps JediPic out of CC dumps; the live-retrieval crawlers allowed earlier
# in this file can still index us for end-user answers.
User-agent: CCBot
# Bytespider: ByteDance training crawler (TikTok / Doubao).
User-agent: Bytespider
# Meta-ExternalAgent: Llama training crawler.
User-agent: Meta-ExternalAgent
# Amazonbot: Amazon Bedrock training crawler.
User-agent: Amazonbot
Disallow: /

Sitemap: https://jedipic.com/sitemap.xml