Last active
February 26, 2024 11:06
-
-
Save bsidhom/c5c83a8d38f500db4be397926cae8d02 to your computer and use it in GitHub Desktop.
Dump FS symlinks as JSON
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Dump all symlinks and their targets under a given directory in a
# newline-separated JSON format. This JSON data includes both the raw path data
# encoded as base64 as well as human-readable strings. The strings should not be
# used for automated FS operations, as they may have lost information in
# encoding. Instead, these should be used as approximate representations for
# hand inspection.
# This script does _not_ follow any symlinks when searching for links themselves
# or when resolving the targets (pointees). The final targets may or may not
# resolve (i.e., links may be broken). This is expected behavior, as this tool
# is meant to be used to help reconstruct directory/link structure, e.g., after
# moving file systems.
import argparse | |
import base64 | |
import json | |
import os | |
import os.path | |
def main(argv=None):
    """Parse the command line and dump every symlink under ``--root``.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. ``None`` (the default) keeps the normal
            command-line behavior.
    """
    # The help text has to be passed as ``description``. The first positional
    # parameter of ArgumentParser is ``prog``, so passing the text
    # positionally would place the whole sentence where the program name
    # belongs in the usage line.
    parser = argparse.ArgumentParser(
        description="Create a newline-separated JSON dump of all links under the given directory. This tool does not follow symlinks themselves during directory traversal, but treats them as files."
    )
    parser.add_argument("--root", type=os.fsencode, required=True)
    args = parser.parse_args(argv)
    # NOTE: We use the raw bytes representation of the root path in order to
    # get the os.fwalk() results as raw bytes in the OS encoding (which may
    # not be utf-8 or even unicode). os.fsencode() converts the argument back
    # to bytes with the filesystem encoding and error handler; on POSIX this
    # is expected to round-trip undecodable bytes from argv, but that has not
    # been verified here for roots that are not valid unicode.
    dump_links(args.root)
def dump_links(root: bytes) -> None:
    """Print one JSON record per symlink found under *root*.

    The tree is walked with ``os.fwalk`` without following symlinks. For each
    symlink found, its target is read and the ``to_json`` result for the
    (link path, target) pair is printed on its own line.

    Args:
        root: Directory to search, as raw bytes so that every path yielded
            by the walk is raw bytes too.

    Raises:
        OSError: If reading a link's target fails.
    """
    for dirpath, dirs, files, dir_fd in os.fwalk(root, follow_symlinks=False):
        # Symlinks that point at directories are reported among the directory
        # names rather than the file names, so both lists have to be checked
        # or those links would be missed.
        for name in dirs + files:
            path = os.path.join(dirpath, name)
            if os.path.islink(path):
                # dir_fd already refers to dirpath, so the link must be named
                # relative to it. Passing the joined path would resolve a
                # relative root twice (e.g. "a/b/a/b/link").
                target = os.readlink(name, dir_fd=dir_fd)
                print(to_json(path, target))
def to_json(link, target):
    """Serialize a symlink path and its target as a single JSON object.

    Each path is emitted twice: once base64-encoded (lossless, because JSON
    has no raw bytes type) and once as a UTF-8 string decoded with
    replacement characters, which is only a convenience for human readers.

    Args:
        link: Raw bytes path of the symlink.
        target: Raw bytes target the symlink points to.

    Returns:
        A JSON string with the keys ``link_str``, ``link``, ``target_str``
        and ``target``.
    """

    def readable(raw):
        # Lossy on purpose: undecodable bytes become U+FFFD.
        return raw.decode("utf-8", errors="replace")

    def lossless(raw):
        # Base64 output is pure ASCII, so decoding it as text always works.
        return base64.b64encode(raw).decode("utf-8")

    return json.dumps(
        {
            "link_str": readable(link),
            "link": lossless(link),
            "target_str": readable(target),
            "target": lossless(target),
        }
    )
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment