@sambacha, forked from mdehoog/main.go, created February 24, 2025
Dump the transaction size + estimated compressed size from a geth database, and analyze using numpy.
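Each record the dumper emits is five little-endian uint32 values: block number, marginal zlib batch size ("best"), FastLZ size estimate, zero-byte count, and non-zero-byte count. A minimal sketch of reading the records back in Go, assuming that layout and the output path used by the dumper below:

// readback.go: a minimal sketch, assuming the five-field little-endian
// record layout written by the dumper below.
package main

import (
	"encoding/binary"
	"fmt"
	"io"
	"log"
	"os"
)

type record struct {
	Block, Best, Fastlz, Zeroes, NonZeroes uint32
}

func main() {
	f, err := os.Open("/data/fastlz.bin")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()
	var r record
	for {
		err := binary.Read(f, binary.LittleEndian, &r)
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatal(err)
		}
		fmt.Println(r.Block, r.Best, r.Fastlz, r.Zeroes, r.NonZeroes)
	}
}

The dumper itself (main.go):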
package main

import (
	"bytes"
	"compress/zlib"
	"context"
	"encoding/binary"
	"fmt"
	"log"
	"math/big"
	"os"
	"strings"
	"time"

	"github.com/ethereum/go-ethereum/cmd/utils"
	"github.com/ethereum/go-ethereum/common"
	"github.com/ethereum/go-ethereum/core/rawdb"
	"github.com/ethereum/go-ethereum/core/types"
	"github.com/ethereum/go-ethereum/ethclient"
	"github.com/ethereum/go-ethereum/ethdb"
	"github.com/ethereum/go-ethereum/node"
)
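// Note: types.DepositTxType used below is defined in the op-geth fork
// (github.com/ethereum-optimism/op-geth), which keeps the upstream
// go-ethereum module path; this presumably builds against that fork
// via a replace directive rather than vanilla go-ethereum.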
func main() {
	trimSignature := false
	bootstrapTxs := 1000
	endBlock := uint64(105235064) // OP bedrock block + 1
	//endBlock := uint64(0)
	startBlock := int64(-1) // -1 for latest

	// remote node URL or local database location:
	// clientLocation := "https://mainnet.optimism.io"
	clientLocation := "/data/op-geth"

	output, err := os.Create("/data/fastlz.bin")
	if err != nil {
		log.Fatal(err)
	}
	defer output.Close()

	var client Client
	if strings.HasPrefix(clientLocation, "http://") || strings.HasPrefix(clientLocation, "https://") {
		client, err = ethclient.Dial(clientLocation)
	} else {
		client, err = NewLocalClient(clientLocation)
	}
	if err != nil {
		log.Fatal(err)
	}
	defer client.Close()

	zlibBestBatchEstimator := newZlibBatchEstimator()
	printInterval := 10 * time.Second
	lastPrint := time.Now().Add(-printInterval)
	count := 0
	var nextBlockHash *common.Hash
	for {
		var block *types.Block
		if nextBlockHash == nil {
			block, err = client.BlockByNumber(context.Background(), big.NewInt(startBlock))
		} else {
			block, err = client.BlockByHash(context.Background(), *nextBlockHash)
		}
		if err != nil {
			log.Fatal(err)
		}
		if block == nil || block.NumberU64() <= endBlock {
			break
		}
		p := block.ParentHash()
		nextBlockHash = &p
		if time.Since(lastPrint) > printInterval {
			lastPrint = time.Now()
			fmt.Println("Block:", block.NumberU64())
		}
		for _, tx := range block.Transactions() {
			if tx.Type() == types.DepositTxType {
				continue
			}
			b, err := tx.MarshalBinary()
			if err != nil {
				log.Fatal(err)
			}
			if trimSignature {
				// for span batch mode we trim the signature, and assume there is no
				// estimation error on this component were we to just treat it as
				// entirely incompressible.
				b = b[:len(b)-68]
			}
			count++
			if count <= bootstrapTxs {
				zlibBestBatchEstimator.write(b)
				continue
			}
			best := zlibBestBatchEstimator.write(b)
			fastlz := FlzCompressLen(b)
			zeroes := uint32(0)
			nonZeroes := uint32(0)
			for _, by := range b {
				if by == 0 {
					zeroes++
				} else {
					nonZeroes++
				}
			}
			if best == 0 {
				// invalid datapoint, ignore
				continue
			}
			// block numbers fit in 32 bits (for now)
			err = binary.Write(output, binary.LittleEndian, uint32(block.NumberU64()))
			if err != nil {
				log.Fatal(err)
			}
			err = binary.Write(output, binary.LittleEndian, best)
			if err != nil {
				log.Fatal(err)
			}
			err = binary.Write(output, binary.LittleEndian, fastlz)
			if err != nil {
				log.Fatal(err)
			}
			err = binary.Write(output, binary.LittleEndian, zeroes)
			if err != nil {
				log.Fatal(err)
			}
			err = binary.Write(output, binary.LittleEndian, nonZeroes)
			if err != nil {
				log.Fatal(err)
			}
		}
	}
}
// zlibBatchEstimator simulates a zlib compressor at max compression that works on
// (large) tx batches. Should be bootstrapped before use by calling it on several
// samples of representative data.
//
// The buffers are held by pointer so that the rotation in write() can swap a
// buffer together with the writer that targets it.
type zlibBatchEstimator struct {
	b [2]*bytes.Buffer
	w [2]*zlib.Writer
}

func newZlibBatchEstimator() *zlibBatchEstimator {
	e := &zlibBatchEstimator{}
	var err error
	for i := range e.w {
		e.b[i] = new(bytes.Buffer)
		e.w[i], err = zlib.NewWriterLevel(e.b[i], zlib.BestCompression)
		if err != nil {
			log.Fatal(err)
		}
	}
	return e
}
func (w *zlibBatchEstimator) write(p []byte) uint32 {
	// targeting:
	// b[0] == 0-64kb
	// b[1] == 64kb-128kb
	before := w.b[1].Len()
	_, err := w.w[1].Write(p)
	if err != nil {
		log.Fatal(err)
	}
	err = w.w[1].Flush()
	if err != nil {
		log.Fatal(err)
	}
	after := w.b[1].Len()
	// if b[1] > 64kb, also write to b[0] so it builds up compression context
	if w.b[1].Len() > 64*1024 {
		_, err = w.w[0].Write(p)
		if err != nil {
			log.Fatal(err)
		}
		err = w.w[0].Flush()
		if err != nil {
			log.Fatal(err)
		}
	}
	// if b[1] > 128kb, rotate: recycle b[1] as the new staggered window and
	// promote b[0] (with its ~64kb of recent context) to be the measuring window
	if w.b[1].Len() > 128*1024 {
		w.b[1].Reset()
		w.w[1].Reset(w.b[1])
		w.b[0], w.b[1] = w.b[1], w.b[0]
		w.w[0], w.w[1] = w.w[1], w.w[0]
	}
	if after-before-2 < 0 {
		return 0
	}
	return uint32(after - before - 2) // flush writes 2 extra "sync" bytes, so don't count those
}
type Client interface {
	BlockByHash(ctx context.Context, hash common.Hash) (*types.Block, error)
	BlockByNumber(ctx context.Context, number *big.Int) (*types.Block, error)
	Close()
}

type LocalClient struct {
	n  *node.Node
	db ethdb.Database
}

func NewLocalClient(dataDir string) (Client, error) {
	nodeCfg := node.DefaultConfig
	nodeCfg.Name = "geth"
	nodeCfg.DataDir = dataDir
	n, err := node.New(&nodeCfg)
	if err != nil {
		return nil, err
	}
	handles := utils.MakeDatabaseHandles(1024)
	db, err := n.OpenDatabaseWithFreezer("chaindata", 512, handles, "", "", true)
	if err != nil {
		return nil, err
	}
	return &LocalClient{
		n:  n,
		db: db,
	}, nil
}

func (c *LocalClient) Close() {
	_ = c.db.Close()
	_ = c.n.Close()
}
func (c *LocalClient) BlockByHash(ctx context.Context, hash common.Hash) (*types.Block, error) {
	number := rawdb.ReadHeaderNumber(c.db, hash)
	if number == nil {
		return nil, nil
	}
	return rawdb.ReadBlock(c.db, hash, *number), nil
}

func (c *LocalClient) BlockByNumber(ctx context.Context, number *big.Int) (*types.Block, error) {
	if number.Int64() < 0 {
		return c.BlockByHash(ctx, rawdb.ReadHeadBlockHash(c.db))
	}
	hash := rawdb.ReadCanonicalHash(c.db, number.Uint64())
	if bytes.Equal(hash.Bytes(), common.Hash{}.Bytes()) {
		return nil, nil
	}
	return rawdb.ReadBlock(c.db, hash, number.Uint64()), nil
}
func FlzCompressLen(ib []byte) uint32 {
	n := uint32(0)
	ht := make([]uint32, 8192)
	u24 := func(i uint32) uint32 {
		return uint32(ib[i]) | (uint32(ib[i+1]) << 8) | (uint32(ib[i+2]) << 16)
	}
	cmp := func(p uint32, q uint32, e uint32) uint32 {
		// length of the common prefix of ib[p:] and ib[q:], bounded by e
		l := uint32(0)
		for e -= q; l < e; l++ {
			if ib[p+l] != ib[q+l] {
				e = 0
			}
		}
		return l
	}
	literals := func(r uint32) {
		// each run of up to 32 literals costs 1 opcode byte plus the literals
		n += 0x21 * (r / 0x20)
		r %= 0x20
		if r != 0 {
			n += r + 1
		}
	}
	match := func(l uint32) {
		// matches cost 2 bytes (short) or 3 bytes (long) per 262-byte chunk
		l--
		n += 3 * (l / 262)
		if l%262 >= 6 {
			n += 3
		} else {
			n += 2
		}
	}
	hash := func(v uint32) uint32 {
		// Knuth multiplicative hash (2654435769 = 2^32 / golden ratio), 13-bit table
		return ((2654435769 * v) >> 19) & 0x1fff
	}
	setNextHash := func(ip uint32) uint32 {
		ht[hash(u24(ip))] = ip
		return ip + 1
	}
	a := uint32(0)
	ipLimit := uint32(len(ib)) - 13
	if len(ib) < 13 {
		ipLimit = 0
	}
	for ip := a + 2; ip < ipLimit; {
		r := uint32(0)
		d := uint32(0)
		for {
			s := u24(ip)
			h := hash(s)
			r = ht[h]
			ht[h] = ip
			d = ip - r
			if ip >= ipLimit {
				break
			}
			ip++
			if d <= 0x1fff && s == u24(r) {
				break
			}
		}
		if ip >= ipLimit {
			break
		}
		ip--
		if ip > a {
			literals(ip - a)
		}
		l := cmp(r+3, ip+3, ipLimit+9)
		match(l)
		ip = setNextHash(setNextHash(ip + l))
		a = ip
	}
	literals(uint32(len(ib)) - a)
	return n
}
import numpy as np
from sklearn.linear_model import LinearRegression
import datetime
dt = np.dtype([('block', '<u4'), ('best', '<u4'), ('fastlz', '<u4'), ('zeroes', '<u4'), ('ones', '<u4')])
op_mainnet = np.fromfile('./op-mainnet.bin', dtype=dt)
input_array = np.array(op_mainnet.tolist())
# base_mainnet = np.fromfile('./base-mainnet.bin', dtype=dt)
# input_array = np.concatenate((input_array, np.array(base_mainnet.tolist())))
op_mainnet_genesis_time = datetime.datetime.fromtimestamp(1686068905)
op_mainnet_genesis_block = 105235064
block_time = 2
signature_omitted = False
training_start = datetime.datetime(2023, 10, 1)
training_end = datetime.datetime(2023, 11, 1)
training_block_start = op_mainnet_genesis_block + (int(training_start.timestamp()) - int(op_mainnet_genesis_time.timestamp())) // block_time
training_block_end = op_mainnet_genesis_block + (int(training_end.timestamp()) - int(op_mainnet_genesis_time.timestamp())) // block_time
training_index_start = next(i for i,v in enumerate(input_array) if v[0] == training_block_start)
training_index_end = next(i for i,v in enumerate(input_array) if v[0] == training_block_end)
first_day = datetime.timedelta(days=1) + datetime.datetime.combine(
    op_mainnet_genesis_time.date(), datetime.datetime.min.time())
blocks_on_first_day = (int(first_day.timestamp()) - int(op_mainnet_genesis_time.timestamp())) // block_time
blocks_per_day = 60*60*24 // block_time
# fit on the October 2023 training window; records are ordered newest-to-oldest
# (the dumper walks backwards), so the slice runs from the November index to
# the October index
x = np.delete(input_array, [0,1], 1)[training_index_end:training_index_start]
y = input_array[:, 1][training_index_end:training_index_start]
fastlz_model = LinearRegression().fit(x, y)
print(f'fastlz_model: {fastlz_model.intercept_} {fastlz_model.coef_}')
x_zeros_ones_combined = np.copy(x)
x_zeros_ones_combined[:, 1] += x_zeros_ones_combined[:, 2]
x_zeros_ones_combined = np.delete(x_zeros_ones_combined, [2], 1)
fastlz_combined_model = LinearRegression().fit(x_zeros_ones_combined, y)
print(f'fastlz_combined_model: {fastlz_combined_model.intercept_} {fastlz_combined_model.coef_}')
x_simple = np.delete(x, [1,2], 1)
fastlz_simple_model = LinearRegression().fit(x_simple, y)
print(f'fastlz_simple_model: {fastlz_simple_model.intercept_} {fastlz_simple_model.coef_}')
x_naive = np.delete(x, [0], 1)
naive_model = LinearRegression().fit(x_naive, y)
print(f'naive_model: {naive_model.intercept_} {naive_model.coef_}')
naive_scalar = np.sum(y) / (np.sum(x[:, 1]*4+x[:, 2]*16)/16)
fastlz_scalar = np.sum(y) / np.sum(x[:, 0])
print(f'naive_scalar: {naive_scalar}')
print(f'fastlz_scalar: {fastlz_scalar}')
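# Sketch: applying the fitted models to a single transaction's features.
# Feature order after np.delete above is [fastlz, zeroes, ones]; the names
# below are illustrative placeholders, not real values.
# fastlz_model.predict([[fastlz_len, zero_bytes, nonzero_bytes]])
# fastlz_simple_model.predict([[fastlz_len]])
# naive_scalar * (zero_bytes*4 + nonzero_bytes*16) / 16  # calldata-gas style estimate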
# shift block numbers so integer division by blocks_per_day buckets rows into
# calendar days (day index counted from the genesis date), then group by day
normalized = input_array + [blocks_per_day - op_mainnet_genesis_block - blocks_on_first_day, 0, 0, 0, 0]
grouped = normalized // [blocks_per_day, 1, 1, 1, 1]
sorted_rows = grouped[grouped[:, 0].argsort()]
split = np.split(sorted_rows, np.unique(sorted_rows[:, 0], return_index=True)[1][1:])
print()
print(f'date,rms_fastlz_zeroes_ones,rms_fastlz_txsize,rms_fastlz_only,rms_fastlz_no_intercept,rms_naive_with_intercept,rms_naive_no_intercept,ma_fastlz_zeroes_ones,ma_fastlz_txsize,ma_fastlz_only,ma_fastlz_no_intercept,ma_naive_with_intercept,ma_naive_no_intercept,total_fastlz_zeroes_ones,total_fastlz_txsize,total_fastlz_only,total_fastlz_no_intercept,total_naive_with_intercept,total_naive_no_intercept,total_best')
for day in split:
    xd = np.delete(day, [0,1], 1)
    yd = day[:, 1]
    rms = np.sqrt(np.sum(np.power(fastlz_model.predict(xd) - yd, 2)) / np.size(yd))
    ma = np.sum(np.absolute(fastlz_model.predict(xd) - yd)) / np.size(yd)
    total = np.sum(fastlz_model.predict(xd)) + (68*np.size(yd) if signature_omitted else 0)
    xd_zeros_ones_combined = np.copy(xd)
    xd_zeros_ones_combined[:, 1] += xd_zeros_ones_combined[:, 2]
    xd_zeros_ones_combined = np.delete(xd_zeros_ones_combined, [2], 1)
    rms_combined = np.sqrt(np.sum(np.power(fastlz_combined_model.predict(xd_zeros_ones_combined) - yd, 2)) / np.size(yd))
    ma_combined = np.sum(np.absolute(fastlz_combined_model.predict(xd_zeros_ones_combined) - yd)) / np.size(yd)
    total_combined = np.sum(fastlz_combined_model.predict(xd_zeros_ones_combined)) + (68*np.size(yd) if signature_omitted else 0)
    xd_simple = np.delete(xd, [1,2], 1)
    rms_simple = np.sqrt(np.sum(np.power(fastlz_simple_model.predict(xd_simple) - yd, 2)) / np.size(yd))
    ma_simple = np.sum(np.absolute(fastlz_simple_model.predict(xd_simple) - yd)) / np.size(yd)
    total_simple = np.sum(fastlz_simple_model.predict(xd_simple)) + (68*np.size(yd) if signature_omitted else 0)
    rms_fastlz_cheap = np.sqrt(np.sum(np.power(yd - xd[:, 0]*fastlz_scalar, 2)) / np.size(yd))
    ma_fastlz_cheap = np.sum(np.absolute(yd - xd[:, 0]*fastlz_scalar)) / np.size(yd)
    total_fastlz_cheap = np.sum(xd[:, 0]*fastlz_scalar) + (68*np.size(yd) if signature_omitted else 0)
    xd_naive = np.delete(xd, [0], 1)
    rms_naive = np.sqrt(np.sum(np.power(naive_model.predict(xd_naive) - yd, 2)) / np.size(yd))
    ma_naive = np.sum(np.absolute(naive_model.predict(xd_naive) - yd)) / np.size(yd)
    total_naive = np.sum(naive_model.predict(xd_naive)) + (68*np.size(yd) if signature_omitted else 0)
    rms_naive_cheap = np.sqrt(np.sum(np.power(yd - (xd[:, 1]*4+xd[:, 2]*16)/16*naive_scalar, 2)) / np.size(yd))
    ma_naive_cheap = np.sum(np.absolute(yd - (xd[:, 1]*4+xd[:, 2]*16)/16*naive_scalar)) / np.size(yd)
    total_naive_cheap = np.sum((xd[:, 1]*4+xd[:, 2]*16)/16*naive_scalar) + (68*np.size(yd) if signature_omitted else 0)
    total_best = np.sum(yd) + (68*np.size(yd) if signature_omitted else 0)
    print(f'{(op_mainnet_genesis_time+datetime.timedelta(days=int(day[0][0]))).date()},{rms:.2f},{rms_combined:.2f},{rms_simple:.2f},{rms_fastlz_cheap:.2f},{rms_naive:.2f},{rms_naive_cheap:.2f},{ma:.2f},{ma_combined:.2f},{ma_simple:.2f},{ma_fastlz_cheap:.2f},{ma_naive:.2f},{ma_naive_cheap:.2f},{total:.2f},{total_combined:.2f},{total_simple:.2f},{total_fastlz_cheap:.2f},{total_naive:.2f},{total_naive_cheap:.2f},{total_best:.2f}')