Created
October 29, 2024 16:09
-
-
Save sergiospagnuolo/63d666f6e4871fdc5c25991338da9c5e to your computer and use it in GitHub Desktop.
Identifica presença de parâmetros de agentes de IA em arquivos robots.txt
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(httr) | |
library(stringr) | |
# User-agent tokens of AI crawlers, taken from nytimes.com/robots.txt.
# check_ai_block() compares them case-insensitively against the
# "User-agent:" lines of each robots.txt file.
ai_keywords <- c(
  "GPTBot", "ChatGPT-User", "PerplexityBot", "Amazonbot", "ClaudeBot",
  "Omgilibot", "FacebookBot", "Applebot", "Applebot-Extended",
  "anthropic-ai", "Bytespider", "Claude-Web", "YouBot", "CCBot",
  "Google-Extended", "Quora-Bot", "Meta-ExternalAgent"
)
# Downloads <url>/robots.txt and reports whether it blocks AI crawlers.
#
# A site counts as blocking when a rule group whose "User-agent:" lines name
# at least one entry of `ai_keywords` (case-insensitive substring match)
# contains a root-level "Disallow: /" (or "Disallow: /*") rule. "Allow:"
# overrides inside the same group are not evaluated.
#
# Args:
#   url: Base URL of the site (e.g. "https://example.com"); trailing slashes
#     are tolerated.
#
# Returns:
#   A list with three elements:
#     url           - the `url` argument, unchanged;
#     robots_exists - TRUE when the request succeeded with HTTP status 200;
#     ai_blocked    - TRUE/FALSE as described above, or NA when robots.txt
#                     could not be retrieved.
check_ai_block <- function(url) {
  # Drop trailing slashes so the request never targets "//robots.txt".
  robots_url <- paste0(str_replace(url, "/+$", ""), "/robots.txt")
  res <- try(GET(robots_url), silent = TRUE)

  # A failed request or a non-200 status means there is no usable robots.txt.
  if (inherits(res, "try-error") || status_code(res) != 200) {
    return(list(url = url, robots_exists = FALSE, ai_blocked = NA))
  }

  robots_content <- content(res, as = "text", encoding = "UTF-8")
  # Guard against an empty or undecodable body, which would otherwise make
  # the split below fail or yield NA.
  if (length(robots_content) != 1L || is.na(robots_content)) {
    robots_content <- ""
  }

  # Normalise lines: accept any line ending, drop inline comments and
  # surrounding whitespace, and lower-case for case-insensitive matching.
  lines <- str_split(robots_content, "\r\n|\r|\n")[[1]]
  lines <- tolower(str_trim(str_replace(lines, "#.*$", "")))

  keywords <- tolower(ai_keywords)
  group_has_ai <- FALSE # current group names at least one AI agent
  in_rules <- FALSE # current group has already listed an allow/disallow rule
  ai_blocked <- FALSE

  for (line in lines) {
    if (is.na(line) || line == "") next

    agent_match <- str_match(line, "^user-agent\\s*:\\s*(.*)$")
    if (!is.na(agent_match[1, 1])) {
      # A user-agent line that follows rules opens a new group, so agents
      # remembered from the previous group no longer apply.
      if (in_rules) {
        group_has_ai <- FALSE
        in_rules <- FALSE
      }
      # fixed() treats the keywords as literal text rather than regex.
      if (any(str_detect(agent_match[1, 2], fixed(keywords)))) {
        group_has_ai <- TRUE
      }
      next
    }

    # Only allow/disallow lines are rules; other records (sitemap,
    # crawl-delay, ...) do not close the run of user-agent lines.
    if (!str_detect(line, "^(allow|disallow)\\s*:")) next
    in_rules <- TRUE

    # Only a root-level disallow inside an AI group counts as a block;
    # path-specific rules such as "Disallow: /admin" do not.
    if (group_has_ai && str_detect(line, "^disallow\\s*:\\s*/\\*?$")) {
      ai_blocked <- TRUE
      break # one blocking rule is enough
    }
  }

  list(url = url, robots_exists = TRUE, ai_blocked = ai_blocked)
}
# Sites to check (replace with your own list of base URLs).
websites <- c(
  "https://g1.globo.com",
  "https://nucleo.jor.br",
  "https://aosfatos.org"
)

# Run the check for every site; each call returns a
# list(url, robots_exists, ai_blocked).
results <- lapply(websites, check_ai_block)

# Stack the per-site lists into a single data frame, one row per site.
results_df <- do.call(rbind, lapply(results, as.data.frame))

# Print the results.
print(results_df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment