Last active
April 24, 2025 20:19
-
-
Save lpereira/2d256ed9b05bae5247457c99a2fb6b53 to your computer and use it in GitHub Desktop.
Block pesky crawlers automatically
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# nobots: block stupid crawler bots | |
# | |
# Block list maintained by L. Pereira. Free for personal use; commercial
# use is not allowed. No warranty whatsoever.
# | |
# Stick this in an hourly cron job and that's it. (The list is only updated | |
# every hour, so running it more frequently doesn't help. In fact, you may | |
# run this script once a day and it'll most likely be fine.) | |
# | |
# How is this block list generated? Who gets there? How to remove addresses? | |
# It's magic. Don't worry about it. | |
# | |
# If you find this useful, consider sponsoring me: | |
# https://github.com/sponsors/lpereira/ | |
# | |
# FAQ | |
# --- | |
# | |
# 1) Why use DROP and not REJECT? | |
# So that clients waste their time and resources. Feel free to change this script | |
# to do what you want, though! | |
# | |
# 2) How is the list generated, really? Where do these addresses come from? | |
# Honeypots, and a bit of scripting. Fishy requests come to one of my servers, they | |
# get blocked and added to the list. It's stupid and won't detect more sophisticated | |
# bots that use things similar to curl-impersonate (unless they're requesting things | |
# they shouldn't!), but it is good enough for now. | |
# | |
# 3) Can you share the code for the honeypot? | |
# Maybe some day. | |
# | |
# 4) Can my IP address be removed from your list? | |
# There's only one way: cut the crap and wait a few days. It'll sort itself out. There's | |
# no need to bother me. | |
# | |
# 5) I want to use the list commercially. What do I do? | |
# If you really, really want to use this commercially, we can talk. | |
# | |
# 6) Do you accept PRs to this gist? | |
# Yes. This has been written while I was drunk and is a bunch of duct tape. Improvements | |
# are welcome! | |
# | |
# 7) Is blocked-ips.php really written in PHP? | |
# No. It's an implementation detail for the honeypot. | |
# | |
import collections | |
import subprocess | |
import requests | |
import sys | |
# Download the published block list.  The timeout keeps a stalled server from
# hanging the cron job indefinitely, and connection-level failures are reported
# the same way as a non-200 answer instead of ending in a traceback.
try:
    refreshed = requests.get('https://tia.mat.br/blocked-ips.php', timeout=30)
except requests.RequestException:
    print("Couldn't get the Blocked IP list")
    sys.exit(1)
if refreshed.status_code != 200:
    print("Couldn't get the Blocked IP list")
    sys.exit(1)

# One entry per line.  Strip surrounding whitespace and drop blank lines (for
# example the empty string a trailing newline would otherwise leave behind) so
# that no empty "address" is ever counted or handed to iptables.
refreshed = {entry.strip() for entry in refreshed.text.splitlines() if entry.strip()}
# Count how many listed IPv4 addresses share each "a.b.c." prefix; prefixes
# with at least 10 listed addresses are treated as banned as a whole.
counter4 = collections.Counter()
for addr in refreshed:
    # Entries containing ':' (IPv6) are never aggregated.  Entries without a
    # dot (blank or malformed lines) have no prefix at all and must not be
    # bucketed under the empty prefix "", which would match every address if
    # it ever reached the threshold.
    if ':' in addr or '.' not in addr:
        continue
    prefix, _, _ = addr.rpartition(".")
    counter4[prefix + "."] += 1

# Each entry keeps its trailing dot, e.g. "192.0.2.".
banned_subnets = [subnet for subnet, count in counter4.items() if count >= 10]
def banned_subnet(addr):
    """Return the first entry of banned_subnets that addr starts with.

    Returns None when no entry of the module-level banned_subnets list is a
    prefix of addr.
    """
    matches = (prefix for prefix in banned_subnets if addr.startswith(prefix))
    return next(matches, None)
def addr_or_subnet(addr):
    """Return "<prefix>0/24" when banned_subnet() finds a prefix for addr.

    When banned_subnet() returns nothing (None or an empty value), addr is
    returned unchanged.
    """
    prefix = banned_subnet(addr)
    if not prefix:
        return addr
    return f"{prefix}0/24"
def addr_from_iptables_line(line):
    """Extract the address from one whitespace-split iptables listing row.

    line is a sequence of fields.  If field 3 contains a ':' it is returned
    as-is; otherwise field 4 is passed through addr_or_subnet() and that
    result is returned.
    """
    candidate = line[3]
    if ":" in candidate:
        return candidate
    return addr_or_subnet(line[4])
def _nobots_rules(table):
    """Yield (line_number, source) for each nobots DROP rule in INPUT.

    table is the command to run ("iptables" or "ip6tables").  Only the INPUT
    chain is listed: that is the chain rules are inserted into and deleted
    from below, so the yielded line numbers are directly usable with -D.
    line_number is an int so that callers can order rules numerically.
    """
    listing = subprocess.check_output([table, "-L", "INPUT", "--line-numbers", "-n"])
    # Unrelated rules may carry non-ASCII comments; do not crash on them.
    for row in listing.decode("ascii", errors="replace").split("\n"):
        if "/* nobots */" not in row:
            continue
        fields = row.split()
        if fields[1] != "DROP":
            continue
        # Same column rule as addr_from_iptables_line: a field 3 containing
        # ':' is the (IPv6) source, otherwise the source is field 4.
        source = fields[3] if ":" in fields[3] else fields[4]
        yield int(fields[0]), source


# Existing rules keyed by their source exactly as listed (deliberately NOT
# collapsed through addr_or_subnet), each with every (line_number, table) at
# which it appears.  Keeping the raw source lets per-address rules that are
# now covered by a /24 rule, and duplicate rules, be recognised and removed.
current = {}
for table in ("iptables", "ip6tables"):
    for number, source in _nobots_rules(table):
        current.setdefault(source, []).append((number, table))
current_key_set = set(current)

# What the firewall should contain: every listed address, collapsed to its
# /24 when the subnet is banned.  Blank entries are ignored.
wanted = {
    addr_or_subnet(addr)
    for addr in (raw.strip() for raw in refreshed)
    if addr
}

# Delete stale rules BEFORE inserting new ones: "-I INPUT" puts a rule at the
# top of the chain and would shift every line number collected above.  Rules
# are removed from the highest line number down (numeric order, not string
# order) so that a deletion never renumbers a rule still waiting to be deleted.
to_unblock = current_key_set - wanted
stale_rules = sorted(
    ((number, table, source)
     for source in to_unblock
     for number, table in current[source]),
    reverse=True,
)
for number, table, source in stale_rules:
    print(f"Unblocking {source}")
    subprocess.check_output([table, "-D", "INPUT", str(number)])

# Insert only what is wanted and not already present (plain set difference;
# a symmetric difference would also re-block the entries just removed).
to_block = wanted - current_key_set
for new in sorted(to_block):
    print(f"Blocking {new}")
    table = "ip6tables" if ":" in new else "iptables"
    subprocess.check_output([table, "-I", "INPUT", "-s", new,
                             "-j", "DROP", "-m", "comment", "--comment", "nobots"])

print(f"Blocked {len(to_block)} IPs or subnets, unblocked {len(to_unblock)}, total {len(refreshed)}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment