Last active
March 4, 2025 19:11
-
-
Save ippeiukai/4c34a016c660e91239c5 to your computer and use it in GitHub Desktop.
ActiveRecord's find_each and find_in_batches ported to Sequel. Sequel's paged_each is not practical when converting large data due to its use of transaction and offset. (Special thanks to @nomuson for working out the primary_keys_expr logic together.)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ActiveRecord's find_each and find_in_batches ported to Sequel.
# Sequel's paged_each is not practical when converting large data due to its use of a transaction and OFFSET.
#
# Usage:
#
#   SequelEachInBatches.find_each(dataset, keys) { |record| ... }
#
# It can also monkey patch Sequel::Dataset:
#
#   Sequel::Dataset.send(:include, SequelEachInBatches)
#   dataset.find_each(keys) { |record| ... }
#
# We also have a plugin for Sequel::Model:
#
#   MyModel.plugin :each_in_batches
#   MyModel.find_each { |record| ... }
#
module SequelEachInBatches
  # Batch-iteration logic. The dataset is always the first argument so the
  # methods can be used as module functions (SequelEachInBatches.find_each)
  # or wrapped as dataset instance methods (see DatasetMethods).
  module Implementation
    # Like ActiveRecord's #find_each: yields every record one by one, loading
    # them batch by batch through #find_in_batches (no transaction, no OFFSET).
    #
    # @param dataset [#order] dataset to iterate over
    # @param primary_keys [Object, Array] key column(s) used for ordering and paging
    # @param options [Hash] :batch_size - records per query (default 3000)
    # @yield [record] each record, in primary key order
    # @return [nil, Enumerator] nil with a block, an Enumerator without one
    def find_each(dataset, primary_keys, options = {}, &block)
      return to_enum(:find_each, dataset, primary_keys, options) unless block_given?

      find_in_batches(dataset, primary_keys, options) do |batch|
        batch.each(&block)
      end
      nil
    end

    # Like ActiveRecord's #find_in_batches: yields arrays of at most
    # :batch_size records, ordered by the primary keys. Each following batch
    # is selected with a "greater than the last seen key" condition.
    # Empty batches are never yielded, so the block does not run at all for
    # an empty dataset.
    #
    # @param dataset [#order] dataset to iterate over
    # @param primary_keys [Object, Array] key column(s); every record must
    #   return the key's value from record[key]
    # @param options [Hash] :batch_size - records per query (default 3000)
    # @yield [records] a non-empty Array of records
    # @return [nil, Enumerator] nil with a block, an Enumerator without one
    # @raise [ArgumentError] if no primary key is given
    def find_in_batches(dataset, primary_keys, options = {}, &_block)
      return to_enum(:find_in_batches, dataset, primary_keys, options) unless block_given?

      primary_keys = Array(primary_keys)
      # Without a key there is nothing to page on: the first batch would
      # load, but building the condition for the second one would fail.
      raise ArgumentError, 'primary_keys must not be empty.' if primary_keys.empty?

      batch_size = options.fetch(:batch_size, 3000)
      ordered = dataset.order(*primary_keys)

      records = ordered.limit(batch_size).all
      until records.empty?
        yield records
        # A short batch means the end of the dataset was reached.
        break if records.size < batch_size

        condition = next_batch_condition(primary_keys, records.last)
        records = ordered.where(condition).limit(batch_size).all
      end
      nil
    end

    private

    # Builds the condition selecting the rows that sort after last_record.
    # Depth-first nesting, walking the keys from last to first:
    #   Q1: key3 > val3
    #   Q2: key2 > val2 OR ( key2 = val2 AND ( Q1 ) )
    #   Q3: key1 > val1 OR ( key1 = val1 AND ( Q2 ) )
    def next_batch_condition(primary_keys, last_record)
      innermost = key_larger_than(primary_keys.last, last_record)
      primary_keys[0..-2].reverse_each.reduce(innermost) do |inner, key|
        key_larger_than(key, last_record) | (key_equal_to(key, last_record) & inner)
      end
    end

    # Expression: key > the value last_record holds for key.
    def key_larger_than(key, record)
      Sequel.expr(Sequel.expr(key) > record[key])
    end

    # Expression: key = the value last_record holds for key.
    def key_equal_to(key, record)
      Sequel.expr(key => record[key])
    end
  end

  # Make the methods available as singleton methods of this module.
  extend Implementation

  # Instance-method flavour, so that this module can be included into a
  # dataset class.
  module DatasetMethods
    # Each public Implementation method becomes an instance method that passes
    # the receiver as the dataset argument.
    Implementation.instance_methods.each do |m|
      define_method(m) { |*args, &block| SequelEachInBatches.public_send(m, self, *args, &block) }
    end
  end

  include DatasetMethods
end
# Sequel plugin conventions: http://sequel.jeremyevans.net/rdoc/classes/Sequel/Plugins.html
module Sequel
  module Plugins
    # Model plugin (MyModel.plugin :each_in_batches) that adds find_each and
    # find_in_batches to the model class and its datasets.
    module EachInBatches
      # Re-exported under the constant name used by Sequel's plugin
      # conventions for dataset-level methods.
      DatasetMethods = SequelEachInBatches::DatasetMethods

      module ClassMethods
        DatasetMethods.instance_methods.each do |m|
          # Delegates to the model's dataset; primary_keys are inferred from
          # the model when they are not given. Accepted call shapes:
          #   Model.find_each { ... }
          #   Model.find_each(batch_size: 100) { ... }
          #   Model.find_each(keys, batch_size: 100) { ... }
          #
          # Raises ArgumentError when no keys are given and the model has no
          # primary key.
          define_method(m) do |primary_keys = nil, options = {}, &block|
            # A lone Hash argument is the options hash, not a list of keys.
            if primary_keys.is_a?(Hash) && options == {}
              options = primary_keys
              primary_keys = nil
            end

            # Infer the keys whenever none were given, including an explicit
            # nil passed together with options.
            primary_keys = primary_key if primary_keys.nil?
            raise ArgumentError, 'This model requires explicit primary_keys.' if primary_keys.nil?

            dataset.public_send(m, primary_keys, options, &block)
          end
        end
      end
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@ippeiukai, Time to make this a gem!