Created
July 29, 2024 01:14
-
-
Save kism/cf018aa87e429fe4796201502ca6e8af to your computer and use it in GitHub Desktop.
Replicate/replace file modified times
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
./date_replicate.py my_folder/original my_folder/copied
Finds files with the same size and hash (first 10MB) but different modified times,
then updates the second folder's matching files with the modified times of the first folder's files.
""" | |
import os | |
import sys | |
import datetime | |
import hashlib | |
import json | |
def get_files_with_size_and_mtime(directory):
    """Index every file under ``directory`` by its size in bytes.

    Walks the tree with ``os.walk`` and returns a dict that maps each file
    size to a list of ``(path, mtime)`` tuples for the files of that size.
    Files whose size or modification time cannot be read are reported on
    stdout and left out of the result.
    """
    by_size = {}
    for parent, _, names in os.walk(directory):
        for name in names:
            path = os.path.join(parent, name)
            try:
                size = os.path.getsize(path)
                mtime = os.path.getmtime(path)
            except OSError as e:
                print(f"Error accessing file {path}: {e}")
                continue
            by_size.setdefault(size, []).append((path, mtime))
    return by_size
def get_file_checksum(file_path):
    """Return the SHA-256 hex digest of the leading part of ``file_path``.

    At most the first 10 MB are hashed; files of 10 MB or less are hashed in
    full. The data is read in 4 KB pieces so a large file is never held in
    memory all at once.
    """
    piece_size = 4096  # 4KB per read
    limit = 10 * 1024 * 1024  # never hash more than the leading 10MB
    digest = hashlib.sha256()

    with open(file_path, "rb") as stream:
        total = stream.seek(0, 2)  # whence=2 seeks to the end; the result is the file size
        stream.seek(0)  # rewind before reading
        # min() alone also covers small files: anything under the limit is read whole.
        remaining = min(total, limit)
        while remaining > 0:
            piece = stream.read(min(piece_size, remaining))
            if not piece:
                break
            digest.update(piece)
            remaining -= len(piece)

    return digest.hexdigest()
def compare_checksums(file_path1, file_path2):
    """Compare the SHA-256 checksums of two files.

    Returns a ``(matched, message)`` tuple. ``matched`` is True only when both
    files could be read and ``get_file_checksum`` produced the same digest for
    each; ``message`` is a short human-readable description of the outcome.
    A file that cannot be opened or read yields ``(False, ...)`` instead of
    raising.
    """
    try:
        checksum1 = get_file_checksum(file_path1)
        checksum2 = get_file_checksum(file_path2)
    except OSError as e:
        # A file that vanished or became unreadable after the directory scan
        # must not abort the whole comparison run; report it as a non-match.
        return False, f"One or both files could not be read: {e}"

    # Defensive: kept so a checksum helper that signals failure with None is
    # still treated as a non-match rather than as two equal values.
    if checksum1 is None or checksum2 is None:
        return False, "One or both files do not exist."

    if checksum1 == checksum2:
        return True, "The checksums match."
    return False, "The checksums do not match."
def compare_files_in_directories(dir1, dir2):
    """Queue files in ``dir2`` whose modified time should be copied from ``dir1``.

    Files from the two trees are paired by identical size. For every pair
    whose modified times differ, the base names are printed and the checksums
    are compared. When the checksums match and the ``dir2`` file is newer than
    the ``dir1`` file, a dict with the keys ``path`` (the ``dir2`` file),
    ``time_new`` (the ``dir1`` mtime) and ``time_original`` (the ``dir2``
    mtime) is appended to the module-level ``out_list``.

    Nothing is returned. Ctrl-C stops the scan early and keeps whatever has
    been collected so far.
    """
    dir1_files = get_files_with_size_and_mtime(dir1)
    dir2_files = get_files_with_size_and_mtime(dir2)

    print("Files with same size but different modified time:")

    try:
        for size, files in dir1_files.items():
            # Sizes that only exist in dir1 have no candidates in dir2.
            candidates = dir2_files.get(size, ())
            for file1, mtime1 in files:
                for file2, mtime2 in candidates:
                    if mtime1 == mtime2:
                        continue

                    name1 = os.path.basename(file1)
                    name2 = os.path.basename(file2)
                    print("")
                    print(name1)
                    if name1 != name2:
                        print(name2)

                    matched, _ = compare_checksums(file1, file2)
                    if not matched:
                        continue

                    # Subtract the raw epoch seconds. Going through naive
                    # local datetimes gives a wrong (even wrongly signed)
                    # difference across a DST change and rounds away
                    # sub-microsecond differences.
                    time_diff = mtime2 - mtime1
                    if time_diff > 0:
                        print(
                            f"Checksums match, time diff: {time_diff} appending to list."
                        )
                        out_list.append(
                            {
                                "path": file2,
                                "time_new": mtime1,
                                "time_original": mtime2,
                            }
                        )
    except KeyboardInterrupt:
        # Deliberate: an interrupted scan still lets the caller process the
        # matches gathered up to this point.
        pass
def get_next_filename(base_filename):
    """Return ``base_filename``, or a numbered variant if that path exists.

    When the path is taken, ``_1``, ``_2``, ... is inserted before the file
    extension until a path that does not exist yet is found.
    """
    stem, extension = os.path.splitext(base_filename)
    candidate = base_filename
    counter = 0
    while os.path.exists(candidate):
        counter += 1
        candidate = f"{stem}_{counter}{extension}"
    return candidate
# Filled in by compare_files_in_directories(); it has to live at module level
# because that function appends to it as a global.
out_list = []


def main():
    """Copy modified times from the first folder onto matching files in the second.

    Expects two command-line arguments: the folder with the original files and
    the folder with the copies. Before any file is touched, the planned
    changes (path, new time and original time) are written to a new
    ``backup_date_replicate*.json`` file in the current directory.
    """
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <original_dir> <copied_dir>")
    dir1 = sys.argv[1]
    dir2 = sys.argv[2]

    compare_files_in_directories(dir1, dir2)

    # Record the original times first so the change can be undone by hand.
    filename = get_next_filename("backup_date_replicate.json")
    with open(filename, "w") as f:
        json.dump(out_list, f)

    for entry in out_list:
        target_time = datetime.datetime.fromtimestamp(entry["time_new"])
        print(f'{entry["path"]}. New time: {target_time}')
        try:
            # Both the access and the modified time are set to the new value.
            os.utime(entry["path"], (entry["time_new"], entry["time_new"]))
        except OSError as e:
            # One unwritable file should not stop the remaining updates.
            print(f"Error updating file {entry['path']}: {e}")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
./finddate.py my_folder
Searches for files modified between two dates and sets their modification time to TARGET_DATE_STR.
See the START_DATE and END_DATE constants for the date range to search.
""" | |
import sys | |
import os | |
import random | |
import json | |
from datetime import datetime, timedelta | |
# Bounds of the modification-time window to search for, as "%Y-%m-%d" strings.
# Each one is parsed as midnight at the start of that day.
START_DATE = "2024-01-01"
END_DATE = "2024-01-02"
# Date (also "%Y-%m-%d") that the matching files' modification time is reset to.
TARGET_DATE_STR = "2021-10-22"
def get_next_filename(base_filename):
    """Return ``base_filename``, or a numbered variant if that path exists.

    When the path is taken, ``_1``, ``_2``, ... is inserted before the file
    extension until a path that does not exist yet is found.
    """
    stem, extension = os.path.splitext(base_filename)
    candidate = base_filename
    counter = 0
    while os.path.exists(candidate):
        counter += 1
        candidate = f"{stem}_{counter}{extension}"
    return candidate
def find_files_within_dates(directory, START_DATE, END_DATE):
    """Return the files under ``directory`` modified between two dates.

    ``START_DATE`` and ``END_DATE`` are ``"%Y-%m-%d"`` strings. Each is parsed
    as midnight at the start of that day and compared, inclusively, with the
    file's modification time in local time. A file modified later on the
    ``END_DATE`` day itself is therefore NOT matched.

    Returns a list of ``{"file": path, "mod_time": mtime}`` dicts, where
    ``mod_time`` is the modification time in seconds since the epoch. Files
    whose modification time cannot be read are reported on stdout and skipped.

    Raises ``ValueError`` if either date string does not match ``"%Y-%m-%d"``.
    """
    # Parse into new names instead of rebinding the string parameters.
    range_start = datetime.strptime(START_DATE, "%Y-%m-%d")
    range_end = datetime.strptime(END_DATE, "%Y-%m-%d")

    matching_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                mtime = os.path.getmtime(file_path)
            except OSError as e:
                # e.g. a dangling symlink or a file removed during the walk;
                # skip it rather than abort the whole search.
                print(f"Error accessing file {file_path}: {e}")
                continue
            if range_start <= datetime.fromtimestamp(mtime) <= range_end:
                # Store the raw mtime so the backup holds the exact original
                # value, not one rounded by a trip through datetime.
                matching_files.append({"file": file_path, "mod_time": mtime})
    return matching_files
def main():
    """Reset the modification time of files modified inside the configured window.

    Expects one command-line argument: the directory to search. Files whose
    modification time lies between START_DATE and END_DATE are found, their
    original times are written to a new ``backup_find_date*.json`` file in the
    current directory, and then each file's access and modification time is
    set to TARGET_DATE_STR plus a random offset of up to one second.
    """
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <directory>")
    directory_to_search = sys.argv[1]
    target_date = datetime.strptime(TARGET_DATE_STR, "%Y-%m-%d")

    files = find_files_within_dates(directory_to_search, START_DATE, END_DATE)

    # Record the original times first so the change can be undone by hand.
    filename = get_next_filename("backup_find_date.json")
    with open(filename, "w") as f:
        json.dump(files, f)

    for file in files:
        original_time = datetime.fromtimestamp(file["mod_time"])
        # Random 0..1,000,000 microsecond offset (both ends inclusive);
        # presumably so the files do not all get an identical timestamp.
        new_time = target_date + timedelta(microseconds=random.randint(0, 1000000))
        new_time_epoch = new_time.timestamp()

        print()
        print(
            f"{file['file']} was modified on {original_time.strftime('%Y-%m-%d %H:%M:%S.%f')}"
        )
        print(f"new time: {new_time.strftime('%Y-%m-%d %H:%M:%S.%f')}")
        try:
            os.utime(file["file"], times=(new_time_epoch, new_time_epoch))
        except OSError as e:
            # One unwritable file should not stop the remaining updates.
            print(f"Error updating file {file['file']}: {e}")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
# Generate test dirs and files for the two scripts | |
# Create the directory tree; -p makes re-runs succeed when it already exists.
mkdir -p my_folder/original my_folder/copied

# Remove files left over from a previous run; -f keeps rm quiet when a folder is empty.
rm -f my_folder/original/* my_folder/copied/*

# Make ten random files of 1..10 blocks of 1M each, copy every one of them,
# and stamp both the original and the copy with 2024-01-01 00:00.
for i in {1..10}
do
    dd if=/dev/urandom of="my_folder/original/file$i.bin" bs=1M count="$i"
    touch -m -t 202401010000 "my_folder/original/file$i.bin"
    cp "my_folder/original/file$i.bin" "my_folder/copied/file$i.bin"
    touch -m -t 202401010000 "my_folder/copied/file$i.bin"
done

# Backdate three of the originals to 2020-01-01 00:00 so they are older than their copies.
for i in 1 2 3
do
    touch -m -t 202001010000 "my_folder/original/file$i.bin"
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment