Last active
March 13, 2025 10:00
-
-
Save b0tting/a092220f0665b8054fa1f893b3927d2c to your computer and use it in GitHub Desktop.
Python script that extracts a flat list of sites from a MirrorManager website URL. The use case is to generate a whitelist for everything that can be reached from a yum.repos.d directory. For Rocky Linux, note that you'll need both the Rocky Linux and the Fedora site mirrors!
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This uses Beautiful Soup to get the list of Linux mirror sites from a Linux mirror site page. My use case
# was to generate a whitelist for a proxy server.
import re | |
import sys | |
try: | |
import argparse | |
import requests | |
from bs4 import BeautifulSoup | |
except ImportError as e: | |
print(e) | |
print("Please install the required modules, for example with 'pip install requests beautifulsoup4'") | |
sys.exit(1) | |
# Matches a leading URL scheme such as "https://", "ftp://" or "rsync://".
# The name is kept for backward compatibility; it is no longer limited to http(s).
http_regex = re.compile(r"[A-Za-z][A-Za-z0-9+.\-]*://")
# Matches any string containing a slash. Kept for backward compatibility only;
# extract_site_from_url no longer needs it.
slashes_regex = re.compile(r".*/.*")


def extract_site_from_url(url):
    """Return the site (host) part of a URL or of a bare host name.

    Surrounding whitespace is ignored, an optional ``scheme://`` prefix is
    removed and everything from the first slash onwards is dropped, so all of
    ``"https://example.org/pub"``, ``"example.org/"`` and ``" example.org "``
    yield ``"example.org"``.

    Args:
        url: A URL, a host name followed by a path, or a bare host name.

    Returns:
        The text between the optional scheme and the first slash. A port or
        user-info part, if present, is returned unchanged.
    """
    # Text scraped from table cells is usually padded with spaces and newlines.
    site = url.strip()
    scheme = http_regex.match(site)
    if scheme:
        site = site[scheme.end():]
    # Also covers scheme-less values with a trailing slash or a path.
    return site.split("/", 1)[0]
def get_list_of_linux_mirror_sites(url):
    """Download a mirror list page and return the mirror sites found in its table.

    The column holding the mirror host names is not assumed to be at a fixed
    position: the first table cell whose text looks like a host name decides
    which column is read from every row.

    Args:
        url: Address of the web page that lists the mirrors in an HTML table.

    Returns:
        A set with the site of every non-empty cell in the detected column.
        The set is empty when no cell on the page looks like a host name.

    Raises:
        requests.RequestException: If the page cannot be fetched, the request
            times out or the server answers with an HTTP error status.
    """
    # requests has no default timeout; without one a stalled server hangs the script.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of scraping an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # A host name is a run of labels joined by more than one period. We could
    # assume it's the third column in the table but let's just.. be sure.
    mirror_regex = re.compile(r"[\w-]+(?:\.[\w-]+){2,}")

    # Header rows normally use <th>, so they simply yield an empty cell list.
    rows = [tr.find_all("td") for tr in soup.find_all("tr")]

    column_index = None
    for cells in rows:
        for index, td in enumerate(cells):
            # Reduce a possible URL to its site first so "https://a.b.c/x" is recognised too.
            candidate = extract_site_from_url(td.text.strip()).strip()
            if mirror_regex.match(candidate):
                column_index = index
                break
        if column_index is not None:
            break
    if column_index is None:
        return set()

    resultset = set()
    for cells in rows:
        # Rows with fewer cells (headers, separators) have nothing in that column.
        if column_index >= len(cells):
            continue
        site = extract_site_from_url(cells[column_index].text.strip()).strip()
        if site:
            resultset.add(site)
    return resultset
def add_additional_domains(resultset, additional_domains):
    """Add the domains of a comma separated string to ``resultset`` in place.

    Whitespace around each domain is removed. Empty items, as produced by a
    trailing or doubled comma, are skipped so they do not end up as blank
    entries in the list.

    Args:
        resultset: The set that receives the domains; it is modified in place.
        additional_domains: Comma separated domain names, e.g. ``"a.org, b.com"``.
    """
    for domain in additional_domains.split(","):
        domain = domain.strip()
        if domain:
            resultset.add(domain)
def main():
    """Run the command line interface.

    Collects the mirror sites from the page given with ``--url``, adds the
    mirror manager's own host plus its "mirrors" sibling and any
    ``--additional-domains``, then prints the sorted list or writes it to
    ``--export-file``.
    """
    # Shown verbatim below the option list in --help.
    examples = "\n".join(
        (
            "This script will generate a flat list of sites from a linux mirror site webpage.",
            "For example:",
            "python extract_mirrorlist --url https://mirrormanager.fedoraproject.org/mirrors/EPEL/9/x86_64 --export-file fedora_mirror_list_20122024.txt",
            "python extract_mirrorlist --url https://mirror.rockylinux.org/mirrormanager/mirrors/Rocky/8.10/x86_64 --additional-domains postgres.org,veeam.com --export-file rockylinux_mirror_list_20122024.txt",
        )
    )
    parser = argparse.ArgumentParser(
        description="Get a list of linux mirror sites",
        epilog=examples,
        # Keep the line breaks of the examples instead of re-wrapping them.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("-u", "--url", required=True, help="The URL of the linux mirror site")
    parser.add_argument("-f", "--export-file", help="If you want to export the list to a file, add the filename here")
    parser.add_argument("-p", "--append-to-file", help="Append to the file instead of overwriting it", action="store_true")
    parser.add_argument("-a", "--additional-domains", help="If you want to add additional domains to the list, add them here, comma separated")
    args = parser.parse_args()

    resultset = get_list_of_linux_mirror_sites(args.url)

    # This is hacky, but works for centos, fedora and rocky linux: besides the
    # mirror manager host itself, also allow the same domain with its first
    # label swapped for "mirrors".
    mirrormanager_site = extract_site_from_url(args.url)
    resultset.add(mirrormanager_site)
    # partition() touches only the leading label; str.replace() would also
    # rewrite later occurrences of the same text inside the host name.
    _, dot, parent_domain = mirrormanager_site.partition(".")
    resultset.add("mirrors" + dot + parent_domain)

    if args.additional_domains:
        add_additional_domains(resultset, args.additional_domains)

    resultlist = sorted(resultset)
    if args.export_file:
        mode = "a" if args.append_to_file else "w"
        with open(args.export_file, mode, encoding="utf-8") as f:
            f.writelines("%s\n" % item for item in resultlist)
    else:
        for item in resultlist:
            print(item)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you, it looks very promising!