Last active
July 13, 2023 09:45
-
-
Save stain/9bbc97c2388eaa178296a2afc4f5a277 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Decoder for Proofpoint URL Defense rewritten URLs (v1, v2 and v3 formats).
__author__ = 'Eric Van Cleve, Stian Soiland-Reyes'
__copyright__ = 'Copyright 2019, Proofpoint Inc, 2023 The Universit of Manchester'
__license__ = 'GPL v.3'
__version__ = '3.1-rewriter'
__email__ = '[email protected]'
__status__ = 'Production'

## PROMINENT NOTICE:
# This script was modified 2023-07-13 by Stian Soiland-Reyes, The University of Manchester, UK
# - Added URLDefenseDecoder.v3_matcher pattern
# - Added option to work as stdin/stdout filter
#
# Upstream documentation and the original Proofpoint script:
## https://help.proofpoint.com/Threat_Insight_Dashboard/Concepts/How_do_I_decode_a_rewritten_URL%3F
# https://help.proofpoint.com/@api/deki/files/2775/urldecoder.py?revision=1
import sys
import re
import string
from argparse import ArgumentParser
from base64 import urlsafe_b64decode

# Python 2/3 compatibility shim: expose `unquote`, `unescape` and `maketrans`
# under the same module-level names on both interpreter major versions, so the
# decoder methods below can call them without version checks.
if sys.version_info[0] < 3:
    from urllib import unquote
    import HTMLParser
    htmlparser = HTMLParser.HTMLParser()
    unescape = htmlparser.unescape
    from string import maketrans
else:
    from urllib.parse import unquote
    from html import unescape
    maketrans = str.maketrans
class URLDefenseDecoder(object):
    """Decode URLs rewritten by Proofpoint URL Defense (v1, v2 and v3).

    All patterns are class attributes, built once at class-creation time.
    (The original defined them inside a ``@staticmethod __init__``, which
    rebuilt every pattern on each instantiation and hid them until the first
    instance was created.)
    """

    # Any URL Defense URL; group 1 captures the version tag ('v1'/'v2'/'v3').
    ud_pattern = re.compile(r'https://urldefense(?:\.proofpoint)?\.com/(v[0-9])/')
    # v1: target URL is percent-encoded in the 'u=' parameter, ended by '&k='.
    v1_pattern = re.compile(r'u=(?P<url>.+?)&k=')
    # v2: as v1, but '-'/'_' stand in for '%'/'/', ended by '&d=' or '&c='.
    v2_pattern = re.compile(r'u=(?P<url>.+?)&[dc]=')
    # v3: target URL sits between '__' markers; the base64url blob between ';'
    # and the first '!' holds the characters that replace '*' tokens.
    v3_pattern = re.compile(r'v3/__(?P<url>.+?)__;(?P<enc_bytes>.*?)!')
    # Example v3 URLs:
    # https://urldefense.com/v3/__https://github.com/taviso/wpunix__;!!PDiH4ENfjr2_Jw!HYwm...IGmM$[github[.]com]
    # https://urldefense.com/v3/__https://groups.google.com/d/msgid/...*2BL3vf93F1Nk90cm*2BOOmCH2p*3DebPNg*40mail.gmail.com__;JSUlJQ!!PDiH4ENfjr2_Jw!HYwm...u5-LAcA$
    # Finds whole v3 URLs (optionally followed by a defanged-host suffix such
    # as " [example[.]com]") inside free text, for stdin/stdout filter mode.
    v3_matcher = re.compile(r'https://urldefense\.com/v3/__.[^;!$]+__;[^!]*![^!]*![^!]*![^$]*\$(?: ?\[(?:(?:\[\.\])?[^][]*)*\])?')
    # '*'   -> the next single decoded character
    # '**X' -> a run of decoded characters whose length is keyed by X
    v3_token_pattern = re.compile(r"\*(\*.)?")
    # 'scheme:/rest' with a single slash: rewriting collapsed the '//'.
    v3_single_slash = re.compile(r"^([a-z0-9+.-]+:/)([^/].+)", re.IGNORECASE)
    # Map a run-length key character to the run length it denotes (2..65),
    # in the order A-Z, a-z, 0-9, '-', '_'.
    v3_run_mapping = {
        value: index + 2
        for index, value in enumerate(
            string.ascii_uppercase + string.ascii_lowercase + string.digits + '-_')
    }

    def decode(self, rewritten_url):
        """Dispatch *rewritten_url* to the decoder for its version tag.

        Returns the decoded target URL as a string.
        Raises ValueError if the URL is not a URL Defense URL, the version is
        unknown, or the URL cannot be parsed.
        """
        match = self.ud_pattern.search(rewritten_url)
        if not match:
            raise ValueError('Does not appear to be a URL Defense URL')
        version = match.group(1)
        if version == 'v1':
            return self.decode_v1(rewritten_url)
        if version == 'v2':
            return self.decode_v2(rewritten_url)
        if version == 'v3':
            return self.decode_v3(rewritten_url)
        # BUG FIX: the original passed two arguments to ValueError via a stray
        # comma, producing a tuple-looking message; format one message instead.
        raise ValueError('Unrecognized version in: %s' % rewritten_url)

    def decode_v1(self, rewritten_url):
        """Decode a v1 URL: percent-decode then HTML-unescape the 'u=' value."""
        match = self.v1_pattern.search(rewritten_url)
        if not match:
            raise ValueError('Error parsing URL')
        url_encoded_url = match.group('url')
        html_encoded_url = unquote(url_encoded_url)
        return unescape(html_encoded_url)

    def decode_v2(self, rewritten_url):
        """Decode a v2 URL: map '-'->'%' and '_'->'/', then decode as v1."""
        match = self.v2_pattern.search(rewritten_url)
        if not match:
            raise ValueError('Error parsing URL')
        trans = maketrans('-_', '%/')
        url_encoded_url = match.group('url').translate(trans)
        html_encoded_url = unquote(url_encoded_url)
        return unescape(html_encoded_url)

    def decode_v3(self, rewritten_url):
        """Decode a v3 URL.

        The embedded URL may contain '*' / '**X' placeholder tokens; the
        replacement characters come, in order, from the base64url-decoded
        trailer captured as 'enc_bytes'.
        """
        match = self.v3_pattern.search(rewritten_url)
        if not match:
            raise ValueError('Error parsing URL')
        url = match.group('url')
        # Restore the second slash after the scheme if rewriting dropped it.
        single_slash = self.v3_single_slash.findall(url)
        if single_slash and len(single_slash[0]) == 2:
            url = single_slash[0][0] + "/" + single_slash[0][1]
        encoded_url = unquote(url)
        # Appending '==' guarantees sufficient base64 padding; excess padding
        # is ignored by urlsafe_b64decode.
        dec_bytes = urlsafe_b64decode(match.group('enc_bytes') + '==').decode('utf-8')
        # Mutable cursor into dec_bytes, shared with the re.sub callback
        # (a one-element list instead of `nonlocal` to keep Python 2 support).
        marker = [0]

        def replace_token(token_match):
            # One replacement per token match, consuming dec_bytes in order.
            # (The original recursed once per token, risking RecursionError
            # on token-dense URLs; re.sub iterates instead.)
            token = token_match.group(0)
            if token.startswith('**'):
                run_length = self.v3_run_mapping[token[-1]]
                run = dec_bytes[marker[0]:marker[0] + run_length]
                marker[0] += run_length
                return run
            # Plain '*': a single substituted character.
            character = dec_bytes[marker[0]]
            marker[0] += 1
            return character

        return self.v3_token_pattern.sub(replace_token, encoded_url)
def main():
    """CLI entry point.

    With positional arguments: decode each rewritten URL and print the result
    (parse errors go to stderr). With no arguments: act as a stdin/stdout
    filter, replacing every v3 URL found in the input text in place.
    """
    parser = ArgumentParser(prog='URLDefenseDecode', description='Decode URLs rewritten by URL Defense. Supports v1, v2, and v3 URLs.')
    parser.add_argument('rewritten_urls', nargs='*')
    args = parser.parse_args()
    decoder = URLDefenseDecoder()
    if not args.rewritten_urls:
        # Filter mode: rewrite each matched v3 URL, pass everything else through.
        text = sys.stdin.read()
        for found in decoder.v3_matcher.findall(text):
            try:
                text = text.replace(found, decoder.decode(found))
            except ValueError as err:
                print(err, file=sys.stderr)
        sys.stdout.write(text)
        return
    for url in args.rewritten_urls:
        try:
            print(decoder.decode(url))
        except ValueError as err:
            print(err, file=sys.stderr)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment