Last active
May 22, 2019 21:21
-
-
Save christopher-b/b63ebb335fe95aa8cd667cacb52e5ba5 to your computer and use it in GitHub Desktop.
Canvas FileZapper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Canvas FileZapper. Zap yer files.
# Monkey patch the File class.
#
# This works around a bug in gems/attachment_fu/lib/attachment_fu#detect_mimetype.
# During att.make_childless, Canvas calls attachment.uploaded_data = data, with data being a File
# instance. Attachment#uploaded_data= passes data to detect_mimetype, which fails if data does
# not respond to #content_type. So we add #content_type, using the same code that
# detect_mimetype would use anyway.
#
# The method is only added when File does not already provide one, so a definition supplied by
# Ruby, Rails or a fixed attachment_fu is never overwritten. A refinement cannot be used here
# because the caller lives in another file.
class File
  unless method_defined?(:content_type)
    # @return the result of File.mime_type? for this file
    def content_type
      File.mime_type?(self)
    end
  end
end
# This class deletes user-uploaded and system-generated files, to free up space on disk. It can be
# used to comply with your institutional data retention policies, and to remove old cruft.
#
# USE WITH CAUTION. Files are DELETED FROM DISK and cannot be retrieved.
#
# Attachment records are not removed. The underlying files are deleted, and Canvas' native de-dup
# behaviour is used to replace the file with a placeholder. A new placeholder attachment record
# will be created and set as the root attachment for all deleted attachments.
#
# For some fully disposable files like system-generated reports and exports, the files are deleted
# altogether, and not replaced with placeholders.
#
# Only tested with local storage. Behaviour with S3 is unclear.
#
# To Do:
# - Files in account-level groups
# - Disposable files, like ePub exports, SIS imports, reports etc.
# - Clear out failed uploads?
class FileZapper
  # Content types that are replaced with the image placeholder; everything else gets the PDF.
  IMAGE_CONTENT_TYPES = %w[
    image/gif image/jpeg image/pjpeg image/png image/x-png image/bmp
  ].freeze

  # Attributes copied from the placeholder root onto each replaced attachment.
  PLACEHOLDER_ATTRIBUTES = %i[filename md5 size content_type].freeze

  # Number of attachment IDs loaded per query while replacing files.
  BATCH_SIZE = 500

  # @param options [Hash] overrides for the defaults below
  # @option options [Time] :cutoff_deleted deleted files older than this are removed from disk
  # @option options [Time] :cutoff_content_export content exports older than this are removed
  # @option options [Time] :cutoff_epubs stored but not read by any method in this class
  # @option options [String] :placeholder_filename base name (no extension) of the placeholders
  def initialize(options = {})
    # Built per instance so the cutoffs are relative to the moment of instantiation.
    defaults = {
      cutoff_deleted: 1.year.ago,
      cutoff_content_export: 1.year.ago,
      cutoff_epubs: 1.year.ago,
      placeholder_filename: 'OCADU_file_removed_2019'
    }
    @options = defaults.merge(options)
  end

  # Replace every available file in the term's courses, and in those courses' groups, with a
  # placeholder.
  #
  # @param term [EnrollmentTerm, String] a term, or the SIS ID of one
  # @raise [ArgumentError] if no term matches the given SIS ID
  def replace_course_files(term)
    courses = verify_term(term).courses

    att_ids = Attachment.where(context: courses, file_state: 'available').pluck(:id)

    # Get files from course groups
    att_ids.concat(
      Attachment.where(context: Group.where(context: courses), file_state: 'available').pluck(:id)
    )

    replace_files(att_ids.uniq)
  end

  # Remove student assignment submissions for the given term. Optionally also delete files
  # attached to submission comments and quiz submission attachments.
  #
  # @param term [EnrollmentTerm, String] a term, or the SIS ID of one
  # @param also [Array<Symbol>, Symbol] extra sources to include: :comments and/or :quizzes
  # @raise [ArgumentError] if no term matches the given SIS ID
  def replace_submissions(term, also = %i[comments quizzes])
    courses = verify_term(term).courses
    assignments = Assignment.where(context: courses)
    extras = Array(also)

    # Find ALL submissions with attachments for the given term. attachment_ids is a
    # comma-delimited string, so split it and normalise to integers to match pluck(:id) below.
    att_ids = Submission
              .where(assignment: assignments)
              .where.not(attachment_ids: '')
              .pluck(:attachment_ids)
              .flat_map { |ids| ids.split(',') }
              .map(&:to_i)

    # Submission comment attachments
    if extras.include?(:comments)
      att_ids.concat(
        Attachment
          .where(context: assignments)
          .where.not(workflow_state: 'zipped') # Exclude submission exports
          .pluck(:id)
      )
    end

    # Files attached to quiz submissions
    if extras.include?(:quizzes)
      quiz_submissions = Quizzes::QuizSubmission.where(quiz: Quizzes::Quiz.where(context: courses))
      att_ids.concat(Attachment.where(context: quiz_submissions).pluck(:id))
    end

    # The same attachment can be referenced from more than one place; process it once.
    replace_files(att_ids.uniq)
  end

  # Delete the files of content exports created before `cutoff_content_export` and mark the
  # exports as deleted.
  def delete_content_exports
    # find_each loads records in batches instead of holding the whole table in memory.
    ContentExport.where('created_at < ?', @options[:cutoff_content_export]).find_each do |ce|
      log("Deleting ContentExport #{ce.id}")

      # ContentExport#destroy is broken: PG throws a FK violation when trying to delete the
      # attachment row. So we manually delete the content and destroy, rather than delete, the
      # attachment.
      if (att = ce.attachment)
        log("Deleting Attachment #{att.id}")
        destroy_attachment(att)
      end

      ce.workflow_state = 'deleted'
      ce.save!
    end
  end

  # Remove files that have been manually deleted. Any file deleted before `cutoff_deleted` will be
  # removed from disk. We don't need to replace these, because they're not referenced anywhere.
  def delete_deleted_files
    Attachment
      .where(file_state: 'deleted')
      .where('deleted_at < ?', @options[:cutoff_deleted])
      .find_each { |att| destroy_attachment(att) }
  end

  private

  # Delete the original files from disk and replace them with a handy placeholder.
  # Adapted from Attachment#destroy_content_and_replace and Attachments::GarbageCollector.
  #
  # @param att_ids [Array<Integer, String>] IDs of the attachments to replace
  def replace_files(att_ids)
    att_ids.each_slice(BATCH_SIZE) do |ids_batch|
      Attachment.where(id: ids_batch).each { |att| replace_file(att) }
    end
  end

  # Replace a single attachment's content with the matching placeholder root.
  def replace_file(att)
    log("Deleting attachment #{att.id}")

    # Find the appropriate placeholder root attachment
    new_root = image?(att) ? root_image : root_pdf

    # Never zap the placeholder itself.
    return if att.id == new_root.id

    if att.root_attachment_id
      # Skip files we've already processed
      return if att.root_attachment_id == new_root.id

      # Don't delete content from child items. Just set the new root, and save the old root
      # for later reloading
      old_root = att.root_attachment
    else
      old_root = nil

      # This will copy the file to a child and make it the new root
      att.make_childless

      # Delete original file. DANGER!
      begin
        att.destroy_content
        att.thumbnail&.destroy
      rescue Errno::ENOENT
        # The file was already gone, which is the state we wanted anyway.
        log("File for attachment #{att.id} was not found on disk")
      end
    end

    att.root_attachment = new_root
    PLACEHOLDER_ATTRIBUTES.each do |key|
      att.public_send("#{key}=", new_root.public_send(key))
    end

    # Fix file extension, so the file will open properly
    unless File.extname(att.display_name) == new_root.extension
      att.display_name = att.display_name + new_root.extension
    end

    att.save!

    # Make sure to update associations on the old root_attachment
    old_root&.reload
  end

  # Remove the file from disk and mark the attachment as deleted.
  def destroy_attachment(att)
    unless att.root_attachment_id
      att.make_childless
      begin
        att.destroy_content
      rescue Errno::ENOENT
        # Same best-effort handling as replace_file: a missing file is not an error here.
        log("File for attachment #{att.id} was not found on disk")
      end
    end
    att.destroy
  end

  # The placeholder root used for non-image files; looked up once, created if absent.
  def root_pdf
    @root_pdf ||= find_placeholder(placeholder_pdf_filename) || create_root_pdf
  end

  # The placeholder root used for image files; looked up once, created if absent.
  def root_image
    @root_image ||= find_placeholder(placeholder_image_filename) || create_root_image
  end

  # Look up an existing root placeholder attachment in the default account.
  def find_placeholder(filename)
    Attachment.find_by(
      filename: filename,
      context: Account.default,
      root_attachment_id: nil
    )
  end

  def create_root_pdf
    create_placeholder('file_removed.pdf', placeholder_pdf_filename, 'application/pdf')
  end

  def create_root_image
    create_placeholder('file_removed.png', placeholder_image_filename, 'image/png')
  end

  # Create a placeholder attachment from a template file in Rails.root/tmp/files.
  #
  # The template is opened in block form so the handle is closed once the attachment has been
  # saved, and save! is used so a failed save raises instead of leaving an unsaved root.
  #
  # @return [Attachment] the newly created placeholder
  def create_placeholder(template_name, filename, content_type)
    File.open(Rails.root.join('tmp', 'files', template_name)) do |template|
      Attachment.new do |att|
        att.context = Account.default
        att.filename = filename
        att.uploaded_data = template
        att.content_type = content_type
        att.save!
      end
    end
  end

  # Whether the attachment's content type is one of IMAGE_CONTENT_TYPES.
  def image?(att)
    IMAGE_CONTENT_TYPES.include?(att.content_type)
  end

  # Resolve a term argument to an EnrollmentTerm.
  #
  # @param term [EnrollmentTerm, String] a term, or the SIS ID of one
  # @return [EnrollmentTerm]
  # @raise [ArgumentError] if no term matches the given SIS ID
  def verify_term(term)
    return term if term.is_a?(EnrollmentTerm)

    EnrollmentTerm.find_by(sis_source_id: term) ||
      raise(ArgumentError, "No enrollment term found with SIS ID #{term.inspect}")
  end

  def log(message)
    Rails.logger.info { "---#{message}" }
  end

  def placeholder_pdf_filename
    "#{@options[:placeholder_filename]}.pdf"
  end

  def placeholder_image_filename
    "#{@options[:placeholder_filename]}.png"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment