Created
November 28, 2024 22:40
-
-
Save romiras/71b2980815bdda5fc5151f95f6865814 to your computer and use it in GitHub Desktop.
Naïve data deduplication in Ruby, variant 2 (2018)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'optparse' | |
require 'digest/sha1' | |
require 'data_mapper' | |
FILE_STDIN = '/dev/stdin'.freeze | |
# Writes a diagnostic message to standard output, followed by a newline.
#
# @param msg [#to_s] the text to print
# @return [nil]
def debug_log(msg)
  $stdout.puts(msg)
end
# DataMapper model for one ingested input; one row is created per processed
# file.
class Document
  include DataMapper::Resource

  # Auto-incrementing primary key.
  property :id, Serial
  # Name under which the input was submitted (at most 255 characters).
  property :path, String, :length => 255
end
# DataMapper model linking a Document to one of its chunks. The chunk content
# itself lives in the blob store, addressed by +hash_sum+.
class DataChunk
  include DataMapper::Resource

  # Auto-incrementing primary key.
  property :id, Serial
  # 1-based position of the chunk within its document.
  property :seq, Integer
  # Hex digest of the chunk content (40 characters, i.e. a SHA-1 hex string).
  property :hash_sum, String, :length => 40
  # Id of the owning Document row (plain integer, no declared association).
  property :document_id, Integer
end
# Content-addressed blob store on the local filesystem.
#
# Blobs live under "<path>/blobs/<first two key characters>/<rest of key>",
# so a blob whose key is already present is never written twice.
class Storage
  BLOB_DIR = 'blobs'.freeze

  # Prepares the blob directory below +path+, creating it (mode 0700) when it
  # does not exist yet.
  #
  # @param path [String] existing directory that will contain "blobs"
  # @param options [Hash] accepted for compatibility; currently unused
  def initialize(path, options = {})
    @path = File.join(path, BLOB_DIR)
    # Dir.exists? was removed in Ruby 3.2; Dir.exist? works on every version.
    Dir.mkdir(@path, 0o700) unless Dir.exist?(@path)
  end

  # Splits a storage key into its fan-out directory and file name.
  #
  # @param storage_key [String] key of the blob (e.g. a hex digest)
  # @return [Array(String, String)] the sub-directory path and the file name
  def get_path_for(storage_key)
    [File.join(@path, storage_key[0, 2]), storage_key[2..-1]]
  end

  # Tells whether a blob file already exists. This is a pure query: it does
  # not create +sub_path+ (put_object takes care of that before writing).
  #
  # @param sub_path [String] directory as returned by get_path_for
  # @param filename [String] file name as returned by get_path_for
  # @return [Boolean]
  def has_object?(sub_path, filename)
    File.exist?(File.join(sub_path, filename))
  end

  # Persists +copy_length+ bytes read from +io+ under +storage_key+.
  # Nothing is written for an empty payload or when the key is already stored.
  #
  # @param io [IO, StringIO] source positioned at the start of the payload
  # @param copy_length [Integer] number of bytes to copy
  # @param storage_key [String] key identifying the content
  # @return [nil]
  def put_object(io, copy_length, storage_key)
    return if copy_length.zero?

    sub_path, filename = get_path_for(storage_key)
    return if has_object?(sub_path, filename) # already stored: deduplicated

    Dir.mkdir(sub_path) unless Dir.exist?(sub_path)
    File.open(File.join(sub_path, filename), 'wb') do |output_stream|
      IO.copy_stream(io, output_stream, copy_length)
    end
    debug_log "Saved BLOB to #{storage_key} (#{copy_length} bytes)"
  end
end
# Splits input streams into fixed-size chunks, stores every distinct chunk once
# in a Storage and records the chunk sequence of each document in the index
# database.
class Chunker
  CHUNK_SIZE = 1 << 13 # 8192 bytes per chunk

  # Sets up the index database and the blob store below +storage_path+.
  #
  # @param storage_path [String] directory holding "index.db" and the blobs
  def initialize(storage_path)
    @storage_path = storage_path
    init_storage
    @storage = Storage.new(storage_path)
    @chunk_size = CHUNK_SIZE
  end

  # Connects DataMapper to the SQLite index file "index.db" inside the storage
  # path and brings the schema up to date with the declared models.
  #
  # Debugging aids: DataMapper::Logger.new($stdout, :debug) enables logging,
  # and 'sqlite::memory:' can be used as a throw-away database URI.
  def init_storage
    db_location = File.join(@storage_path, 'index.db')
    DataMapper.setup(:default, "sqlite://#{db_location}")
    DataMapper.finalize
    DataMapper.auto_upgrade!
  end

  # Reads +inp_file+ chunk by chunk, stores each chunk under its SHA-1 hex
  # digest and records a DataChunk row (1-based +seq+) for it.
  #
  # The Document row is created inside the same transaction as its chunks so
  # that a failed import does not leave a document without chunks behind.
  #
  # @param inp_file [#read] readable stream; read(n) must return nil at EOF
  # @param storage_key [String] name recorded as the document path
  def process(inp_file, storage_key)
    seq = 0
    DataChunk.transaction do
      doc = Document.create(path: storage_key)
      while (chunk = inp_file.read(@chunk_size))
        # Hash once per chunk; the same value names the blob and the index row.
        hash_sum = Digest::SHA1.hexdigest(chunk)
        # bytesize, not size: the copy length is a byte count.
        @storage.put_object(StringIO.new(chunk), chunk.bytesize, hash_sum)
        seq += 1
        DataChunk.create(seq: seq, hash_sum: hash_sum, document_id: doc.id)
      end
    end
  end
end
# Parses the command-line switches of the chunker script.
#
# Defaults and the "-" => stdin substitution are applied after parsing;
# applying them inside the OptionParser block would run before any switch has
# been read and therefore never take effect.
#
# @param argv [Array<String>] arguments to parse; recognised switches are
#   removed from it (defaults to ARGV)
# @return [Hash] :storage_path (always set), plus :files_list and/or :file
#   when the corresponding switch was given
def parse_options(argv = ARGV)
  options = {}

  OptionParser.new do |opts|
    # banner and separator are the usage description shown with '--help' or '-h'
    opts.banner = "Deduplicates and stores blobs in data storage"
    opts.separator "Usage: chunker.rb [-p] [-l | -f]"
    opts.separator "Options:"

    opts.on("-p", "--storage_path DIR", "Path to store directory 'blobs'. Default is current directory.") do |dir|
      options[:storage_path] = dir
    end

    opts.on("-l", "--list-from FILE", "Get names to process from FILE") do |list|
      options[:files_list] = list
    end

    # the block argument is the filename given after -f or --file
    opts.on("-f", "--file FILE", "File to process") do |file|
      options[:file] = file
    end
  end.parse!(argv)

  options[:storage_path] ||= Dir.pwd
  # A lone dash means "read from standard input".
  %i[files_list file].each do |key|
    options[key] = FILE_STDIN if options[key] == '-'
  end
  options
end
# Script entry point: store either the single file given with -f or every file
# named (one per line) in the list given with -l. -f takes precedence.
options = parse_options

begin
  abort "Missing arguments! Run with argument -h for help." unless options[:file] || options[:files_list]

  chunker = Chunker.new(options[:storage_path])
  # Opens one input in binary mode and feeds it to the chunker.
  store_file = lambda do |filename|
    File.open(filename, "rb") { |file| chunker.process(file, filename) }
  end

  if options[:file]
    filename = options[:file]
    # A lone dash means standard input, mirroring the handling of -l below.
    filename = FILE_STDIN if filename == '-'
    store_file.call(filename)
  else
    files_list = options[:files_list]
    files_list = FILE_STDIN if files_list == '-'
    File.foreach(files_list) do |line|
      filename = line.strip
      # Skip blank lines instead of aborting on an empty file name.
      next if filename.empty?

      store_file.call(filename)
    end
  end
rescue Errno::ENOENT, Errno::EACCES, Errno::EISDIR => e
  # Report unusable inputs as a short message instead of a backtrace.
  abort e.message
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment