Created
March 21, 2024 06:31
-
-
Save RuizSerra/d729a96b9e0e699daa42555138ad212c to your computer and use it in GitHub Desktop.
To redact student names from `.html` comparison files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import re | |
import os | |
import zipfile | |
def _write_redacted_copy(soup, owner_header, html_filename, output_dir):
    """Write the current state of *soup* into the folder of one student.

    Args:
        soup: Parsed document, already redacted for the other student.
        owner_header: Original header text of the student who receives this
            copy; the text between '- ' and '@' is used as their unikey.
        html_filename: Path of the input file; only its base name is used.
        output_dir: Directory under which the per-unikey folder is created.
    """
    unikey = re.sub(r'^.*- (.*)@.*', r'\g<1>', owner_header)

    # Create the folder where the file is actually written (under output_dir),
    # not relative to the current working directory.
    os.makedirs(os.path.join(output_dir, unikey), exist_ok=True)

    # Work on the base name only: if the '.q?.html' suffix is missing, the
    # substitution leaves its input unchanged, and a full input path here
    # could make the output path resolve back onto the input file.
    # A callable replacement keeps the unikey from being read as a template.
    out_filename = re.sub(
        r'.*(\.q.\.html)',
        lambda match: unikey + match.group(1),
        os.path.basename(html_filename),
    )

    out_path = f'{output_dir}/{unikey}/{out_filename}'
    with open(out_path, 'w') as f:
        f.write(str(soup))
    print('Written: ', out_path)


def redact_names(html_filename, OUTPUT_DIR='.'):
    """Write two copies of a comparison page, each hiding the other student.

    The page is parsed, references starting with '../assets/' are rewritten
    to './assets/', and the '<a> x <b> : ' prefix is stripped from the title.
    Then, for each of the two headers ('.split .header' and
    '.split-right .header'), a copy is saved in which the *other* header reads
    'OTHER STUDENT'. Each copy goes to ``OUTPUT_DIR/<unikey>/``, where the
    unikey is taken from the header that was left visible.

    Args:
        html_filename: Path to the comparison ``.html`` file.
        OUTPUT_DIR: Base directory for the per-student output folders.

    Raises:
        ValueError: If either student header is missing or is not plain text.
    """
    ## Load html file
    with open(html_filename, 'r') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # Change path references: '../assets/...' -> './assets/...'
    for attr in ('href', 'src'):
        for tag in soup.find_all(attrs={attr: True}):
            if tag[attr].startswith('../assets/'):
                tag[attr] = './' + tag[attr][3:]

    ## Redact student names from title
    title = soup.find('title')
    if title is not None:
        title.string = re.sub(r'^.* x .* : ', '', title.text)

    ## Get student names
    student_left = soup.select_one('.split .header')
    student_right = soup.select_one('.split-right .header')
    if student_left is None or student_right is None:
        raise ValueError(f'Student header not found in {html_filename}')
    student_left_original = student_left.string
    student_right_original = student_right.string
    if student_left_original is None or student_right_original is None:
        raise ValueError(f'Student header is not plain text in {html_filename}')

    ## Redact student left; this copy belongs to the student on the right
    student_left.string = 'OTHER STUDENT'
    _write_redacted_copy(soup, student_right_original, html_filename, OUTPUT_DIR)

    ## Redact student right; this copy belongs to the student on the left
    student_left.string = student_left_original
    student_right.string = 'OTHER STUDENT'
    _write_redacted_copy(soup, student_left_original, html_filename, OUTPUT_DIR)
# ---------------------------------------------------------------------- | |
# Folder that holds one sub-folder per student.
INPUT_BASE_DIR = '../path-of-dir'

# Input folders are the entries whose names start with four lowercase
# letters, four digits and a hyphen. The pattern is a raw string so that
# '\d' is a regex digit class rather than an invalid string escape.
_input_dir_pattern = re.compile(r'^[a-z]{4}\d{4}-')
input_dirs = [
    os.path.join(INPUT_BASE_DIR, d) for d in os.listdir(INPUT_BASE_DIR)
    if _input_dir_pattern.match(d)
]

# Produce the redacted copies for every .html file in every input folder.
for input_dir in input_dirs:
    for filename in os.listdir(input_dir):
        if filename.endswith('.html'):
            html_filename = os.path.join(input_dir, filename)
            print('Input: ', html_filename)
            redact_names(html_filename)
def _add_tree_to_zip(archive, top, relative_to):
    """Add every file below *top* to *archive*, named relative to *relative_to*."""
    for parent, _subdirs, names in os.walk(top):
        for name in names:
            file_path = os.path.join(parent, name)
            print(file_path)
            archive.write(file_path, os.path.relpath(file_path, relative_to))


# Entries of the current directory named exactly four lowercase letters
# followed by four digits.
filtered_list = []
for entry in os.listdir('.'):
    if re.match(r'^[a-z]{4}[\d]{4}$', entry):
        filtered_list.append(entry)

# One archive per matching folder: its files at the archive root, plus the
# 'assets' tree stored under its own 'assets/...' paths.
for directory in filtered_list:
    zip_filename = f'{directory}-files.zip'
    with zipfile.ZipFile(zip_filename, 'w') as zipf:
        # Copy HTML files into zip file
        _add_tree_to_zip(zipf, directory, directory)
        # Copy assets into zip file
        _add_tree_to_zip(zipf, 'assets', '.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment