Skip to content

Instantly share code, notes, and snippets.

@b0tting
Last active March 13, 2025 10:00
Show Gist options
  • Save b0tting/a092220f0665b8054fa1f893b3927d2c to your computer and use it in GitHub Desktop.
Save b0tting/a092220f0665b8054fa1f893b3927d2c to your computer and use it in GitHub Desktop.
Python script that extracts a flat list of sites from a MirrorManager website URL. The use case is to generate a whitelist for everything that can be touched from a yum.repos.d directory. In the case of Rocky Linux, note that you'll need both the Rocky Linux and the Fedora site mirrors!
# This will use Beautiful Soup to get the list of Linux mirror sites from a Linux mirror site. The use case
# for me was to generate a whitelist for a proxy server.
import re
import sys
try:
import argparse
import requests
from bs4 import BeautifulSoup
except ImportError as e:
print(e)
print("Please install the required modules, for example with 'pip install requests beautifulsoup4'")
sys.exit(1)
# Matches an absolute http(s) URL: the scheme followed by at least one host
# character. The host is everything up to the next "/" or whitespace.
http_regex = re.compile(r"https?://[^/\s]+", re.IGNORECASE)
# Matches any single-line string that contains a slash.
slashes_regex = re.compile(r".*/.*")


def extract_site_from_url(url):
    """Return the site (host) part of *url*.

    Surrounding whitespace is removed first, because the value usually comes
    straight from the text of an HTML table cell.

    Args:
        url: An absolute http(s) URL, a scheme-less "host/path" string, or a
            bare host name.

    Returns:
        The text between "//" and the next "/" for http(s) URLs, the text
        before the first "/" for other strings containing a slash, and the
        stripped input otherwise.
    """
    url = url.strip()
    if http_regex.match(url):
        # "https://host/path".split("/") -> ["https:", "", "host", "path"]
        return url.split("/")[2]
    # Sometimes the value is a scheme-less host followed by a path or a
    # trailing slash.
    if slashes_regex.match(url):
        return url.split("/")[0]
    return url
# Matches a host name containing more than one period (for example
# "mirror.example.org"), optionally preceded by an http(s) scheme.
_mirror_host_regex = re.compile(r"(?:https?://)?[\w-]+(?:\.[\w-]+){2,}")


def _find_mirror_column(rows):
    """Return the index of the first table cell that looks like a host name, or None."""
    for row in rows:
        for index, cell in enumerate(row.find_all("td")):
            if _mirror_host_regex.match(cell.text.strip()):
                return index
    return None


def get_list_of_linux_mirror_sites(url):
    """Fetch the mirror list page at *url* and return the set of mirror sites.

    The column holding the mirror host names is detected rather than assumed:
    the first cell whose text looks like a host name with more than one
    period decides which column is read from every row.

    Args:
        url: Address of the page containing the table of mirrors.

    Returns:
        A set of site strings, one per distinct non-empty cell in the
        detected column. The set is empty when the page contains no cell
        that looks like a host name.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled server would block the script forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    rows = soup.find_all("tr")

    resultset = set()
    column_index = _find_mirror_column(rows)
    if column_index is None:
        return resultset

    for row in rows:
        cells = row.find_all("td")
        # Rows without enough <td> cells (e.g. <th> header rows) are skipped.
        if column_index >= len(cells):
            continue
        text = cells[column_index].text.strip()
        if text:
            resultset.add(extract_site_from_url(text))
    return resultset
def add_additional_domains(resultset, additional_domains):
    """Add every domain from a comma-separated string to *resultset* in place.

    Whitespace around each entry is removed. Empty entries, such as those
    produced by a trailing or doubled comma, are ignored so that no blank
    line ends up in the generated list.

    Args:
        resultset: The set of sites to extend; it is modified in place.
        additional_domains: Comma-separated domain names.
    """
    for domain in additional_domains.split(","):
        domain = domain.strip()
        if domain:
            resultset.add(domain)
if __name__ == "__main__":
    # Usage examples, shown verbatim after the option list in --help.
    description = """
This script will generate a flat list of sites from a linux mirror site webpage.
For example:
python extract_mirrorlist --url https://mirrormanager.fedoraproject.org/mirrors/EPEL/9/x86_64 --export-file fedora_mirror_list_20122024.txt
python extract_mirrorlist --url https://mirror.rockylinux.org/mirrormanager/mirrors/Rocky/8.10/x86_64 --additional-domains postgres.org,veeam.com --export-file rockylinux_mirror_list_20122024.txt
"""
    parser = argparse.ArgumentParser(
        description="Get a list of linux mirror sites",
        epilog=description,
        # Keep the line breaks of the examples instead of re-wrapping them.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("-u", "--url", required=True, help="The URL of the linux mirror site")
    parser.add_argument("-f", "--export-file", help="If you want to export the list to a file, add the filename here")
    parser.add_argument("-p", "--append-to-file", help="Append to the file instead of overwriting it", action="store_true")
    parser.add_argument("-a", "--additional-domains", help="If you want to add additional domains to the list, add them here, comma separated")
    args = parser.parse_args()

    resultset = get_list_of_linux_mirror_sites(args.url)

    # This is hacky, but works for CentOS, Fedora and Rocky Linux: also list
    # the mirror manager host itself, plus the same host with its first label
    # swapped for "mirrors".
    mirrormanager_site = extract_site_from_url(args.url)
    resultset.add(mirrormanager_site)
    first_bit = mirrormanager_site.split(".")[0]
    # count=1 limits the swap to the leading label; without it a later label
    # with the same text would be rewritten as well.
    resultset.add(mirrormanager_site.replace(first_bit, "mirrors", 1))

    if args.additional_domains:
        add_additional_domains(resultset, args.additional_domains)

    resultlist = sorted(resultset)
    if args.export_file:
        # Append and overwrite differ only in the file mode.
        mode = "a" if args.append_to_file else "w"
        with open(args.export_file, mode) as f:
            f.writelines("%s\n" % item for item in resultlist)
    else:
        for item in resultlist:
            print(item)
@marcel-drost
Copy link

marcel-drost commented Mar 12, 2025

Thank you, it looks very promising!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment