@romiras
Created November 28, 2024 22:40
Naïve data deduplication in Ruby, variant 2 (2018)
#!/usr/bin/env ruby
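# Splits input files into fixed-size chunks, stores each chunk as a
# content-addressed blob (keyed by its SHA-1), and records chunk metadata in a
# SQLite index via DataMapper. Identical chunks are written to disk only once.
#
# Example invocations (hypothetical paths, assuming the script is saved as chunker.rb):
#   ruby chunker.rb -p /tmp/dedup-store -f big_file.bin
#   find . -type f | ruby chunker.rb -p /tmp/dedup-store -l -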
require 'optparse'
require 'digest/sha1'
require 'stringio' # StringIO is used in Chunker#process
require 'data_mapper'

FILE_STDIN = '/dev/stdin'.freeze

def debug_log(msg)
  puts msg
end
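
# Each Document row represents one deduplicated input file, identified by the
# path (storage key) passed to Chunker#process.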
class Document
  include DataMapper::Resource

  property :id,   Serial
  property :path, String, length: 255
end
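
# Each DataChunk row maps one position (seq) within a document to the SHA-1
# hash of the chunk stored at that position, so a document can be reassembled
# from its blobs in order.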
class DataChunk
  include DataMapper::Resource

  property :id,          Serial
  property :seq,         Integer
  property :hash_sum,    String, length: 40
  property :document_id, Integer
end
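
# Content-addressed blob store on the local filesystem. Blobs are fanned out
# into subdirectories named after the first two hex characters of the storage
# key (the chunk's SHA-1), similar to Git's object layout.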
class Storage
  BLOB_DIR = 'blobs'.freeze

  def initialize(path, options = {})
    @path = File.join(path, BLOB_DIR)
    Dir.mkdir(@path, 0700) unless Dir.exist?(@path)
  end

  # Split a storage key (SHA-1 hex digest) into a two-character subdirectory
  # and the remaining characters as the blob's filename.
  def get_path_for(storage_key)
    sub_path = File.join(@path, storage_key[0, 2])
    filename = storage_key[2..-1]
    [sub_path, filename]
  end

  def has_object?(sub_path, filename)
    Dir.mkdir(sub_path) unless Dir.exist?(sub_path)
    File.exist?(File.join(sub_path, filename))
  end

  # Persist the content of io under storage_key, unless an identical blob
  # already exists (this is where deduplication happens).
  def put_object(io, copy_length, storage_key)
    return if copy_length == 0

    sub_path, filename = get_path_for(storage_key)
    if has_object?(sub_path, filename)
      # debug_log "\t#{storage_key} - skipped! :-)"
    else
      # IO.binwrite( File.join(@path, sub_path, filename), io.binread )
      File.open(File.join(sub_path, filename), 'wb') do |output_stream|
        IO.copy_stream(io, output_stream, copy_length)
      end
      debug_log "Saved BLOB to #{storage_key} (#{copy_length} bytes)"
    end
  end
end
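
# Reads an input stream in fixed 8 KiB chunks, hashes each chunk with SHA-1,
# stores the chunk in Storage, and records one DataChunk row per chunk.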
class Chunker
  CHUNK_SIZE = 1 << 13 # 8192

  def initialize(storage_path)
    @storage_path = storage_path
    init_storage
    @storage = Storage.new(storage_path)
    @digest = Digest::SHA1.new
    @chunk_size = CHUNK_SIZE
  end

  def init_storage
    # DataMapper::Logger.new($stdout, :debug)
    db_location = File.join(@storage_path, "index.db")
    # debug_log db_location
    # DataMapper.setup(:default, 'sqlite::memory:')
    DataMapper.setup(:default, "sqlite://#{db_location}")
    DataMapper.finalize
    DataMapper.auto_upgrade!
  end
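
  # Split inp_file into fixed-size chunks; store each chunk's content in the
  # blob store and record its SHA-1 and sequence number for this document.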
  def process(inp_file, storage_key)
    seq = 0
    doc = Document.create(path: storage_key)
    DataChunk.transaction do
      while chunk = inp_file.read(@chunk_size)
        @digest.update(chunk)
        @storage.put_object(StringIO.new(chunk), chunk.size, @digest.hexdigest)
        seq += 1
        DataChunk.create(
          seq: seq,
          hash_sum: @digest.hexdigest,
          document_id: doc.id
        )
        @digest.reset
      end
    end
  end
end
def parse_options
  options = {}
  OptionParser.new do |opts|
    # banner and separator form the usage description shown with '--help' or '-h'
    opts.banner = "Deduplicates and stores blobs in data storage"
    opts.separator "Usage: chunker.rb [-p DIR] [-l FILE | -f FILE]"
    opts.separator "Options:"

    opts.on("-p", "--storage_path DIR", "Directory under which the 'blobs' store is created. Default is the current directory.") do |f|
      options[:storage_path] = f
    end
    opts.on("-l", "--list-from FILE", "Read the names of files to process from FILE ('-' for standard input)") do |f|
      options[:files_list] = f
    end
    opts.on("-f", "--file FILE", "File to process ('-' for standard input)") do |f|
      options[:file] = f # f becomes the filename given after -f or --file
    end
  end.parse!

  # Apply the default and resolve '-' to /dev/stdin after parsing;
  # doing this inside the OptionParser block would run before parse!.
  options[:storage_path] ||= Dir.pwd
  options[:files_list] = FILE_STDIN if options[:files_list] == '-'
  options[:file] = FILE_STDIN if options[:file] == '-'
  options
end
options = parse_options

begin
  if options[:file]
    chunker = Chunker.new options[:storage_path]
    filename = options[:file]
    File.open(filename, "rb") do |file|
      chunker.process(file, filename)
    end
  elsif options[:files_list]
    chunker = Chunker.new options[:storage_path]
    files_list = options[:files_list]
    File.foreach(files_list) do |filename|
      filename.strip!
      File.open(filename, "rb") do |file|
        chunker.process(file, filename)
      end
    end
  else
    abort "Missing arguments! Run with argument -h for help."
  end
rescue Errno::ENOENT => e
  abort e.message
end
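
# --- Optional reassembly sketch (not part of the original gist) ---
# A minimal illustration, assuming DataMapper has already been set up against
# the same index.db (e.g. via Chunker#init_storage), of how a stored document
# could be rebuilt from its chunks. The method name `restore_document` and its
# arguments are hypothetical.
def restore_document(document_path, out_path, storage_path)
  storage = Storage.new(storage_path)
  doc = Document.first(path: document_path)
  abort "Unknown document: #{document_path}" unless doc

  File.open(out_path, 'wb') do |out|
    # Concatenate blobs in chunk order to reproduce the original content.
    DataChunk.all(document_id: doc.id, order: [:seq.asc]).each do |chunk|
      sub_path, filename = storage.get_path_for(chunk.hash_sum)
      out.write(File.binread(File.join(sub_path, filename)))
    end
  end
end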