ytti · May 19, 2024 07:21
diff --git a/cvm.rb b/cvm.rb
 #!/usr/bin/env ruby

 ## CVM is algorithm to estimate amount of unique items in list of items. It is so simple and cheap that
 ## it could reasonably be implemented in silicon, so that cheap switches and routers could report fps or flows
 ## per second counter in addition to bps and pps. Understanding total flow count allows you to do understand
 ## statistical probability and confidence in your ability to reason from ipfix sampled data, if you don't know
 ## your total flow count, you don't know what share of flows you see, and most things cannot be reasoned from ipfix
 ## My implementation almost certainly is wrong, as I don't understand the paper, and Quanta version is wrong
 ##  - https://arxiv.org/abs/2301.10191
 ##  - https://www.cs.toronto.edu/~meel/Slides/meel-distinct.pdf
 ##  - https://www.quantamagazine.org/computer-scientists-invent-an-efficient-new-way-to-count-20240516/

 class WordCount
  def initialize(file, max_size)
    @io = File.open(file, 'r')
    @cvm = CVM.new(max_size)
  end
  def count
    @io.each_line do |line|
      line.split.each do |word|
        @cvm.add(word.downcase.gsub(/[^a-z]/, ''))
      end
    end
    return @cvm.count
  end
 end


 class CVM
  def initialize(max_size=100)
    @max_size = max_size
    @mem = []
    @round = 0
  end
  def halve
    @mem.delete_if { |e| rand(0..1) == 0 }
    @round += 1
  end
  def add(word)
    @mem.delete(word)
    @mem.append(word) unless delete?
    halve if @mem.size == @max_size
  end
  def delete?
    @round.times { return true if rand(0..1) == 0 }
    return false
  end
  def count
    return @mem.size / 0.5 ** @round
  end
 end

 begin
  if __FILE__ == $0
    puts WordCount.new(ARGV[0], ARGV[1].to_i).count
  end
 end

 ## ╰─ ./cvm.rb ./hamlet.txt 10
 ## 5120.0
 ## ╰─ ./cvm.rb ./hamlet.txt 50
 ## 4480.0
 ## ╰─ ./cvm.rb ./hamlet.txt 100
 ## 3840.0
 ##
 ## Quanta Magazine version, causes huge round count and estimations
 ##  def add(word)
 ##    if @mem.include?(word)
 ##      @mem.delete(word) if delete?
 ##    else
 ##      @mem.append(word)
 ##      halve if @mem.size == @max_size
 ##    end
 ##  end
 ##
 ## This version also works equally well, and may be more correct, but costs more (Set instead of Array would amortize some)
 ##  def add(word)
 ##    @mem.append(word) unless @mem.include?(word)
 ##    @mem.delete(word) if delete?
 ##    halve if @mem.size == @max_size
 ##  end
	#!/usr/bin/env ruby

	## CVM is algorithm to estimate amount of unique items in list of items. It is so simple and cheap that
	## it could reasonably be implemented in silicon, so that cheap switches and routers could report fps or flows
	## per second counter in addition to bps and pps. Understanding total flow count allows you to do understand
	## statistical probability and confidence in your ability to reason from ipfix sampled data, if you don't know
	## your total flow count, you don't know what share of flows you see, and most things cannot be reasoned from ipfix
	## My implementation almost certainly is wrong, as I don't understand the paper, and Quanta version is wrong
	## - https://arxiv.org/abs/2301.10191
	## - https://www.cs.toronto.edu/~meel/Slides/meel-distinct.pdf
	## - https://www.quantamagazine.org/computer-scientists-invent-an-efficient-new-way-to-count-20240516/

	class WordCount
	def initialize(file, max_size)
	@io = File.open(file, 'r')
	@cvm = CVM.new(max_size)
	end
	def count
	@io.each_line do \|line\|
	line.split.each do \|word\|
	@cvm.add(word.downcase.gsub(/[^a-z]/, ''))
	end
	end
	return @cvm.count
	end
	end


	class CVM
	def initialize(max_size=100)
	@max_size = max_size
	@mem = []
	@round = 0
	end
	def halve
	@mem.delete_if { \|e\| rand(0..1) == 0 }
	@round += 1
	end
	def add(word)
	@mem.delete(word)
	@mem.append(word) unless delete?
	halve if @mem.size == @max_size
	end
	def delete?
	@round.times { return true if rand(0..1) == 0 }
	return false
	end
	def count
	return @mem.size / 0.5 ** @round
	end
	end

	begin
	if __FILE__ == $0
	puts WordCount.new(ARGV[0], ARGV[1].to_i).count
	end
	end

	## ╰─ ./cvm.rb ./hamlet.txt 10
	## 5120.0
	## ╰─ ./cvm.rb ./hamlet.txt 50
	## 4480.0
	## ╰─ ./cvm.rb ./hamlet.txt 100
	## 3840.0
	##
	## Quanta Magazine version, causes huge round count and estimations
	## def add(word)
	## if @mem.include?(word)
	## @mem.delete(word) if delete?
	## else
	## @mem.append(word)
	## halve if @mem.size == @max_size
	## end
	## end
	##
	## This version also works equally well, and may be more correct, but costs more (Set instead of Array would amortize some)
	## def add(word)
	## @mem.append(word) unless @mem.include?(word)
	## @mem.delete(word) if delete?
	## halve if @mem.size == @max_size
	## end