Skip to content

Instantly share code, notes, and snippets.

@sergiospagnuolo
Created October 29, 2024 16:09
Show Gist options
  • Save sergiospagnuolo/63d666f6e4871fdc5c25991338da9c5e to your computer and use it in GitHub Desktop.
Save sergiospagnuolo/63d666f6e4871fdc5c25991338da9c5e to your computer and use it in GitHub Desktop.
Identifica presença de parâmetros de agentes de IA em arquivos robots.txt
library(httr)
library(stringr)
# User-agent tokens of AI crawlers, taken from nytimes.com/robots.txt.
# Matching against robots.txt is done in lower case by the caller.
ai_keywords <- c(
  "GPTBot",
  "ChatGPT-User",
  "PerplexityBot",
  "Amazonbot",
  "ClaudeBot",
  "Omgilibot",
  "FacebookBot",
  "Applebot",
  "Applebot-Extended",
  "anthropic-ai",
  "Bytespider",
  "Claude-Web",
  "YouBot",
  "CCBot",
  "Google-Extended",
  "Quora-Bot",
  "Meta-ExternalAgent"
)
#' Decide whether robots.txt text blocks any of the given crawlers site-wide
#'
#' Walks the file group by group. A group is one or more consecutive
#' `User-agent` lines followed by `Allow`/`Disallow` rules; a `User-agent`
#' line that appears after rules starts a new group. The result is `TRUE`
#' only when a group that names one of `keywords` also contains a site-wide
#' `Disallow: /` (or `Disallow: /*`). A blanket `User-agent: *` group is not
#' counted, because it does not single out AI crawlers.
#'
#' @param robots_text Single string holding the robots.txt body.
#' @param keywords Character vector of crawler names, matched
#'   case-insensitively as literal substrings of the `User-agent` value.
#' @return `TRUE` if some keyword-matching group disallows the whole site,
#'   otherwise `FALSE`.
.robots_blocks_ai <- function(robots_text, keywords) {
  # Accept Unix, Windows and classic-Mac line endings.
  lines <- str_split(robots_text, "\r\n|\r|\n")[[1]]
  # Strip trailing comments and surrounding whitespace; compare in lower case.
  lines <- tolower(str_trim(str_replace(lines, "#.*$", "")))
  lines <- lines[!is.na(lines) & lines != ""]
  # Literal matching: crawler names are not meant to be regular expressions.
  agent_patterns <- fixed(tolower(keywords))

  group_has_ai <- FALSE    # current group names at least one AI crawler
  group_has_rules <- FALSE # current group has already seen Allow/Disallow
  for (line in lines) {
    agent_name <- str_match(line, "^user-agent\\s*:\\s*(.*)$")[1, 2]
    if (!is.na(agent_name)) {
      # A User-agent line after rules opens a new group, so forget the
      # agents of the previous one instead of carrying them forward.
      if (group_has_rules) {
        group_has_ai <- FALSE
        group_has_rules <- FALSE
      }
      if (any(str_detect(agent_name, agent_patterns))) {
        group_has_ai <- TRUE
      }
      next
    }
    if (str_detect(line, "^(allow|disallow)\\s*:")) {
      group_has_rules <- TRUE
      # Only a rule covering the entire site counts as blocking; partial
      # paths such as "/admin" do not.
      if (group_has_ai && str_detect(line, "^disallow\\s*:\\s*/\\*?$")) {
        return(TRUE)
      }
    }
  }
  FALSE
}

#' Check whether a site's robots.txt blocks known AI crawlers
#'
#' Requests `<url>/robots.txt` and reports whether a group addressed to one
#' of the crawlers listed in the file-level `ai_keywords` vector disallows
#' the whole site.
#'
#' @param url Base URL of the site, e.g. `"https://example.com"`. Trailing
#'   slashes are ignored.
#' @return A list with three elements: `url` (the input, unchanged),
#'   `robots_exists` (`TRUE` when the request returned HTTP 200, `FALSE` on a
#'   request error or any other status) and `ai_blocked` (`TRUE`/`FALSE`, or
#'   `NA` when `robots_exists` is `FALSE`).
check_ai_block <- function(url) {
  # Drop trailing slashes so "https://site/" does not yield "//robots.txt".
  robots_url <- paste0(sub("/+$", "", url), "/robots.txt")
  res <- tryCatch(GET(robots_url), error = function(e) NULL)
  # A failed request or a non-200 status is reported as "no robots.txt".
  if (is.null(res) || status_code(res) != 200) {
    return(list(url = url, robots_exists = FALSE, ai_blocked = NA))
  }
  robots_content <- content(res, as = "text", encoding = "UTF-8")
  # Guard against a zero-length or NA body (empty file, undecodable bytes):
  # the file exists but carries no usable rules.
  if (length(robots_content) == 0 || is.na(robots_content[1])) {
    return(list(url = url, robots_exists = TRUE, ai_blocked = FALSE))
  }
  list(
    url = url,
    robots_exists = TRUE,
    ai_blocked = .robots_blocks_ai(robots_content[1], ai_keywords)
  )
}
# Sites to check (replace with your own list).
websites <- c(
  "https://g1.globo.com",
  "https://nucleo.jor.br",
  "https://aosfatos.org"
)
# Run the check once per site; each element of `results` is the list
# returned by check_ai_block().
results <- lapply(websites, function(site) check_ai_block(site))
# Convert every result to a one-row data.frame and stack the rows.
results_df <- do.call(
  rbind,
  lapply(results, function(entry) as.data.frame(entry))
)
# Show the final table.
print(results_df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment