-
-
Save sambacha/579acb721dafa1c95cb5ea561ec2df2d to your computer and use it in GitHub Desktop.
Dump the transaction size + estimated compressed size from a geth database, and analyze using numpy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bytes" | |
"compress/zlib" | |
"context" | |
"encoding/binary" | |
"fmt" | |
"log" | |
"math/big" | |
"os" | |
"strings" | |
"time" | |
"github.com/ethereum/go-ethereum/cmd/utils" | |
"github.com/ethereum/go-ethereum/common" | |
"github.com/ethereum/go-ethereum/core/rawdb" | |
"github.com/ethereum/go-ethereum/core/types" | |
"github.com/ethereum/go-ethereum/ethclient" | |
"github.com/ethereum/go-ethereum/ethdb" | |
"github.com/ethereum/go-ethereum/node" | |
) | |
func main() { | |
trimSignature := false | |
bootstrapTxs := 1000 | |
endBlock := uint64(105235064) // OP bedrock block + 1 | |
//endBlock := uint64(0) | |
startBlock := int64(-1) // -1 for latest | |
// remote node URL or local database location: | |
// clientLocation := "https://mainnet.optimism.io" | |
clientLocation := "/data/op-geth" | |
output, err := os.Create("/data/fastlz.bin") | |
if err != nil { | |
log.Fatal(err) | |
} | |
defer output.Close() | |
var client Client | |
if strings.HasPrefix(clientLocation, "http://") || strings.HasPrefix(clientLocation, "https://") { | |
client, err = ethclient.Dial(clientLocation) | |
} else { | |
client, err = NewLocalClient(clientLocation) | |
} | |
if err != nil { | |
log.Fatal(err) | |
} | |
defer client.Close() | |
zlibBestBatchEstimator := newZlibBatchEstimator() | |
printInterval := 10 * time.Second | |
lastPrint := time.Now().Add(-printInterval) | |
count := 0 | |
var nextBlockHash *common.Hash | |
for { | |
var block *types.Block | |
if nextBlockHash == nil { | |
block, err = client.BlockByNumber(context.Background(), big.NewInt(startBlock)) | |
} else { | |
block, err = client.BlockByHash(context.Background(), *nextBlockHash) | |
} | |
if err != nil { | |
log.Fatal(err) | |
} | |
if block == nil || block.NumberU64() <= endBlock { | |
break | |
} | |
p := block.ParentHash() | |
nextBlockHash = &p | |
if time.Since(lastPrint) > printInterval { | |
lastPrint = time.Now() | |
fmt.Println("Block:", block.NumberU64()) | |
} | |
for _, tx := range block.Transactions() { | |
if tx.Type() == types.DepositTxType { | |
continue | |
} | |
b, err := tx.MarshalBinary() | |
if err != nil { | |
log.Fatal(err) | |
} | |
if trimSignature { | |
// for span batch mode we trim the signature, and assume there is no estimation | |
// error on this component were we to just treat it as entirely incompressible. | |
b = b[:len(b)-68.] | |
} | |
count++ | |
if count <= bootstrapTxs { | |
zlibBestBatchEstimator.write(b) | |
continue | |
} | |
best := zlibBestBatchEstimator.write(b) | |
fastlz := FlzCompressLen(b) | |
zeroes := uint32(0) | |
nonZeroes := uint32(0) | |
for _, b := range b { | |
if b == 0 { | |
zeroes++ | |
} else { | |
nonZeroes++ | |
} | |
} | |
if best == 0 { | |
// invalid datapoint, ignore | |
continue | |
} | |
// block numbers fit in 32-bits (for now) | |
err = binary.Write(output, binary.LittleEndian, uint32(block.NumberU64())) | |
if err != nil { | |
log.Fatal(err) | |
} | |
err = binary.Write(output, binary.LittleEndian, best) | |
if err != nil { | |
log.Fatal(err) | |
} | |
err = binary.Write(output, binary.LittleEndian, fastlz) | |
if err != nil { | |
log.Fatal(err) | |
} | |
err = binary.Write(output, binary.LittleEndian, zeroes) | |
if err != nil { | |
log.Fatal(err) | |
} | |
err = binary.Write(output, binary.LittleEndian, nonZeroes) | |
if err != nil { | |
log.Fatal(err) | |
} | |
} | |
} | |
} | |
// zlibBatchEstimator simulates a zlib compressor at max compression that works on (large) tx
// batches. Should bootstrap it before use by calling it on several samples of representative
// data.
type zlibBatchEstimator struct {
	b [2]bytes.Buffer // compressed output of the two overlapping batch windows
	w [2]*zlib.Writer // zlib streams at BestCompression; w[i] was created writing into b[i]
}
func newZlibBatchEstimator() *zlibBatchEstimator { | |
b := &zlibBatchEstimator{} | |
var err error | |
for i := range b.w { | |
b.w[i], err = zlib.NewWriterLevel(&b.b[i], zlib.BestCompression) | |
if err != nil { | |
log.Fatal(err) | |
} | |
} | |
return b | |
} | |
// write appends p to the simulated batch and returns the number of compressed
// bytes it contributed: the growth of the active zlib stream's output minus
// the 2 "sync" bytes each Flush appends. Returns 0 for an invalid datapoint.
//
// Two overlapping windows are kept so a measurement is always taken against a
// batch that already holds a realistic amount of compressed context: b[1] is
// the measured window; once it exceeds 64kb, b[0] starts receiving the same
// input so it can take over when b[1] is rotated out past 128kb.
func (w *zlibBatchEstimator) write(p []byte) uint32 {
	// targeting:
	// b[0] == 0-64kb
	// b[1] == 64kb-128kb
	before := w.b[1].Len()
	_, err := w.w[1].Write(p)
	if err != nil {
		log.Fatal(err)
	}
	err = w.w[1].Flush()
	if err != nil {
		log.Fatal(err)
	}
	after := w.b[1].Len()
	// if b[1] > 64kb, write to b[0]
	if w.b[1].Len() > 64*1024 {
		_, err = w.w[0].Write(p)
		if err != nil {
			log.Fatal(err)
		}
		err = w.w[0].Flush()
		if err != nil {
			log.Fatal(err)
		}
	}
	// if b[1] > 128kb, rotate
	if w.b[1].Len() > 128*1024 {
		w.b[1].Reset()
		w.w[1].Reset(&w.b[1])
		// NOTE(review): this swap copies bytes.Buffer VALUES, but each
		// zlib.Writer still holds a pointer to the buffer's original address
		// (&w.b[0] / &w.b[1]); after the swap the writer stored in w.w[1]
		// keeps writing into w.b[0] and vice versa. Consider storing
		// [2]*bytes.Buffer instead — verify the post-rotation measurements
		// behave as intended before trusting them.
		tb := w.b[1]
		tw := w.w[1]
		w.b[1] = w.b[0]
		w.w[1] = w.w[0]
		w.b[0] = tb
		w.w[0] = tw
	}
	if after-before-2 < 0 {
		return 0
	}
	return uint32(after - before - 2) // flush writes 2 extra "sync" bytes so don't count those
}
// Client is the minimal block-fetching interface satisfied by both the remote
// implementation (*ethclient.Client) and the local one (*LocalClient).
type Client interface {
	// BlockByHash returns the block with the given hash; the local
	// implementation returns nil, nil when the hash is unknown.
	BlockByHash(ctx context.Context, hash common.Hash) (*types.Block, error)
	// BlockByNumber returns the block at the given height; a negative
	// number selects the latest block.
	BlockByNumber(ctx context.Context, number *big.Int) (*types.Block, error)
	// Close releases any underlying resources.
	Close()
}
// LocalClient serves blocks directly from a local geth chaindata database,
// bypassing RPC entirely.
type LocalClient struct {
	n  *node.Node     // node stack that owns the data directory
	db ethdb.Database // chaindata database (opened with its freezer)
}
func NewLocalClient(dataDir string) (Client, error) { | |
nodeCfg := node.DefaultConfig | |
nodeCfg.Name = "geth" | |
nodeCfg.DataDir = dataDir | |
n, err := node.New(&nodeCfg) | |
if err != nil { | |
return nil, err | |
} | |
handles := utils.MakeDatabaseHandles(1024) | |
db, err := n.OpenDatabaseWithFreezer("chaindata", 512, handles, "", "", true) | |
if err != nil { | |
return nil, err | |
} | |
return &LocalClient{ | |
n: n, | |
db: db, | |
}, nil | |
} | |
func (c *LocalClient) Close() { | |
_ = c.db.Close() | |
_ = c.n.Close() | |
} | |
func (c *LocalClient) BlockByHash(ctx context.Context, hash common.Hash) (*types.Block, error) { | |
number := rawdb.ReadHeaderNumber(c.db, hash) | |
if number == nil { | |
return nil, nil | |
} | |
return rawdb.ReadBlock(c.db, hash, *number), nil | |
} | |
func (c *LocalClient) BlockByNumber(ctx context.Context, number *big.Int) (*types.Block, error) { | |
if number.Int64() < 0 { | |
return c.BlockByHash(ctx, rawdb.ReadHeadBlockHash(c.db)) | |
} | |
hash := rawdb.ReadCanonicalHash(c.db, number.Uint64()) | |
if bytes.Equal(hash.Bytes(), common.Hash{}.Bytes()) { | |
return nil, nil | |
} | |
return rawdb.ReadBlock(c.db, hash, number.Uint64()), nil | |
} | |
// FlzCompressLen estimates the size in bytes of the FastLZ (level 1)
// compressed form of ib without materializing the compressed output: it runs
// the match-finding loop and accumulates only the encoded sizes of literal
// runs and back-references.
func FlzCompressLen(ib []byte) uint32 {
	n := uint32(0)
	ht := make([]uint32, 8192) // hash table of candidate match positions
	// load3 reads a 3-byte little-endian sequence starting at offset i.
	load3 := func(i uint32) uint32 {
		return uint32(ib[i]) | (uint32(ib[i+1]) << 8) | (uint32(ib[i+2]) << 16)
	}
	// matchLen counts how far the data at p and q agree, scanning while q+l < e.
	matchLen := func(p, q, e uint32) uint32 {
		l := uint32(0)
		for e -= q; l < e; l++ {
			if ib[p+l] != ib[q+l] {
				e = 0 // first mismatch terminates the scan
			}
		}
		return l
	}
	// literals accounts for a run of r literal bytes (one control byte per 32).
	literals := func(r uint32) {
		n += 0x21 * (r / 0x20)
		r %= 0x20
		if r != 0 {
			n += r + 1
		}
	}
	// match accounts for an encoded back-reference of length l.
	match := func(l uint32) {
		l--
		n += 3 * (l / 262)
		if l%262 >= 6 {
			n += 3
		} else {
			n += 2
		}
	}
	// hash maps a 3-byte sequence to a slot in ht.
	hash := func(v uint32) uint32 {
		return ((2654435769 * v) >> 19) & 0x1fff
	}
	// setNextHash records position ip in the hash table and returns ip+1.
	setNextHash := func(ip uint32) uint32 {
		ht[hash(load3(ip))] = ip
		return ip + 1
	}
	anchor := uint32(0) // start of the current literal run
	ipLimit := uint32(len(ib)) - 13
	if len(ib) < 13 {
		ipLimit = 0 // short inputs are counted as pure literals
	}
	for ip := anchor + 2; ip < ipLimit; {
		var ref, dist uint32
		for {
			seq := load3(ip)
			slot := hash(seq)
			ref = ht[slot]
			ht[slot] = ip
			dist = ip - ref
			if ip >= ipLimit {
				break
			}
			ip++
			if dist <= 0x1fff && seq == load3(ref) {
				break // usable back-reference found
			}
		}
		if ip >= ipLimit {
			break
		}
		ip--
		if ip > anchor {
			literals(ip - anchor)
		}
		l := matchLen(ref+3, ip+3, ipLimit+9)
		match(l)
		ip = setNextHash(setNextHash(ip + l))
		anchor = ip
	}
	literals(uint32(len(ib)) - anchor)
	return n
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from sklearn.linear_model import LinearRegression | |
import datetime | |
# Record layout matching the binary file written by the Go dumper: five
# little-endian uint32s per recorded (non-deposit) transaction.
dt = np.dtype([('block', '<u4'), ('best', '<u4'), ('fastlz', '<u4'), ('zeroes', '<u4'), ('ones', '<u4')])
op_mainnet = np.fromfile('./op-mainnet.bin', dtype=dt)
input_array = np.array(op_mainnet.tolist())
# base_mainnet = np.fromfile('./base-mainnet.bin', dtype=dt)
# input_array = np.concatenate((input_array, np.array(base_mainnet.tolist())))
op_mainnet_genesis_time = datetime.datetime.fromtimestamp(1686068905)
op_mainnet_genesis_block = 105235064
block_time = 2  # seconds per OP mainnet block
signature_omitted = False  # set True if the dump was produced with trimSignature
# Training window, converted from wall-clock dates to block heights assuming a
# fixed 2s block time from the bedrock genesis timestamp.
training_start = datetime.datetime(2023, 10, 1)
training_end = datetime.datetime(2023, 11, 1)
training_block_start = op_mainnet_genesis_block + (int(training_start.timestamp()) - int(op_mainnet_genesis_time.timestamp())) // block_time
training_block_end = op_mainnet_genesis_block + (int(training_end.timestamp()) - int(op_mainnet_genesis_time.timestamp())) // block_time
training_index_start = next(i for i, v in enumerate(input_array) if v[0] == training_block_start)
training_index_end = next(i for i, v in enumerate(input_array) if v[0] == training_block_end)
first_day = datetime.timedelta(days=1) + datetime.datetime.combine(
    op_mainnet_genesis_time.date(), datetime.datetime.min.time())
blocks_on_first_day = (int(first_day.timestamp()) - int(op_mainnet_genesis_time.timestamp())) // block_time
blocks_per_day = 60*60*24 // block_time
# The dump was written newest-block-first, so the row index of the LATER
# training boundary is the smaller one — hence the end:start slice order.
# (The original comment claiming "first 10% of data" was stale.)
x = np.delete(input_array, [0, 1], 1)[training_index_end:training_index_start]  # features: fastlz, zeroes, ones
y = input_array[:, 1][training_index_end:training_index_start]                  # target: best zlib batch estimate
fastlz_model = LinearRegression().fit(x, y)
print(f'fastlz_model: {fastlz_model.intercept_} {fastlz_model.coef_}')
# Variant: collapse zeroes+ones into a single total-size feature.
x_zeros_ones_combined = np.copy(x)
x_zeros_ones_combined[:, 1] += x_zeros_ones_combined[:, 2]
x_zeros_ones_combined = np.delete(x_zeros_ones_combined, [2], 1)
fastlz_combined_model = LinearRegression().fit(x_zeros_ones_combined, y)
print(f'fastlz_combined_model: {fastlz_combined_model.intercept_} {fastlz_combined_model.coef_}')
# Variant: fastlz estimate as the only feature.
x_simple = np.delete(x, [1,2], 1)
fastlz_simple_model = LinearRegression().fit(x_simple, y)
print(f'fastlz_simple_model: {fastlz_simple_model.intercept_} {fastlz_simple_model.coef_}')
# Baseline: regress on zeroes/ones only (no fastlz), like the pre-FastLZ L1 cost.
x_naive = np.delete(x, [0], 1)
naive_model = LinearRegression().fit(x_naive, y)
print(f'naive_model: {naive_model.intercept_} {naive_model.coef_}')
# Intercept-free single-scalar baselines (calldata-gas-style weighting 4/16).
naive_scalar = np.sum(y) / (np.sum(x[:, 1]*4+x[:, 2]*16)/16)
fastlz_scalar = np.sum(y) / np.sum(x[:, 0])
print(f'naive_scalar: {naive_scalar}')
print(f'fastlz_scalar: {fastlz_scalar}')
# Shift block numbers so integer division by blocks_per_day buckets rows into
# calendar days (aligning the partial first day), then group rows per day.
normalized = input_array + [blocks_per_day - op_mainnet_genesis_block - blocks_on_first_day, 0, 0, 0, 0]
grouped = normalized // [blocks_per_day, 1, 1, 1, 1]
rows_by_day = grouped[grouped[:, 0].argsort()]  # renamed: `sorted` shadowed the builtin
split = np.split(rows_by_day, np.unique(rows_by_day[:, 0], return_index=True)[1][1:])
print()
print(f'date,rms_fastlz_zeroes_ones,rms_fastlz_txsize,rms_fastlz_only,rms_fastlz_no_intercept,rms_naive_with_intercept,rms_naive_no_intercept,ma_fastlz_zeroes_ones,ma_fastlz_txsize,ma_fastlz_only,ma_fastlz_no_intercept,ma_naive_with_intercept,ma_naive_no_intercept,total_fastlz_zeroes_ones,total_fastlz_txsize,total_fastlz_only,total_fastlz_no_intercept,total_naive_with_intercept,total_naive_no_intercept,total_best')
# For each day bucket, print one CSV row with RMS error, mean-absolute error,
# and total predicted compressed size for every candidate cost model, plus the
# day's true total ("best").
for day in split:
    # day columns: [day-index, best, fastlz, zeroes, ones]
    xd = np.delete(day, [0,1], 1)  # features: fastlz, zeroes, ones
    yd = day[:, 1]                 # target: best zlib batch estimate
    # full fastlz + zeroes + ones regression
    rms = np.sqrt(np.sum(np.power(fastlz_model.predict(xd) - yd, 2)) / np.size(yd))
    ma = np.sum(np.absolute(fastlz_model.predict(xd) - yd)) / np.size(yd)
    # when the dump trimmed signatures, add back 68 bytes per tx as incompressible
    total = np.sum(fastlz_model.predict(xd)) + (68*np.size(yd) if signature_omitted else 0)
    # fastlz + combined tx-size (zeroes+ones) regression
    xd_zeros_ones_combined = np.copy(xd)
    xd_zeros_ones_combined[:, 1] += xd_zeros_ones_combined[:, 2]
    xd_zeros_ones_combined = np.delete(xd_zeros_ones_combined, [2], 1)
    rms_combined = np.sqrt(np.sum(np.power(fastlz_combined_model.predict(xd_zeros_ones_combined) - yd, 2)) / np.size(yd))
    ma_combined = np.sum(np.absolute(fastlz_combined_model.predict(xd_zeros_ones_combined) - yd)) / np.size(yd)
    total_combined = np.sum(fastlz_combined_model.predict(xd_zeros_ones_combined)) + (68*np.size(yd) if signature_omitted else 0)
    # fastlz-only regression
    xd_simple = np.delete(xd, [1,2], 1)
    rms_simple = np.sqrt(np.sum(np.power(fastlz_simple_model.predict(xd_simple) - yd, 2)) / np.size(yd))
    ma_simple = np.sum(np.absolute(fastlz_simple_model.predict(xd_simple) - yd)) / np.size(yd)
    total_simple = np.sum(fastlz_simple_model.predict(xd_simple)) + (68*np.size(yd) if signature_omitted else 0)
    # fastlz scaled by a single no-intercept scalar
    rms_fastlz_cheap = np.sqrt(np.sum(np.power(yd - xd[:, 0]*fastlz_scalar, 2)) / np.size(yd))
    ma_fastlz_cheap = np.sum(np.absolute(yd - xd[:, 0]*fastlz_scalar)) / np.size(yd)
    total_fastlz_cheap = np.sum(xd[:, 0]*fastlz_scalar) + (68*np.size(yd) if signature_omitted else 0)
    # naive zeroes/ones regression (no fastlz feature)
    xd_naive = np.delete(xd, [0], 1)
    rms_naive = np.sqrt(np.sum(np.power(naive_model.predict(xd_naive) - yd, 2)) / np.size(yd))
    ma_naive = np.sum(np.absolute(naive_model.predict(xd_naive) - yd)) / np.size(yd)
    total_naive = np.sum(naive_model.predict(xd_naive)) + (68*np.size(yd) if signature_omitted else 0)
    # naive calldata-gas-weighted scalar (4 per zero byte, 16 per non-zero)
    rms_naive_cheap = np.sqrt(np.sum(np.power(yd - (xd[:, 1]*4+xd[:, 2]*16)/16*naive_scalar, 2)) / np.size(yd))
    ma_naive_cheap = np.sum(np.absolute(yd - (xd[:, 1]*4+xd[:, 2]*16)/16*naive_scalar)) / np.size(yd)
    total_naive_cheap = np.sum((xd[:, 1]*4+xd[:, 2]*16)/16*naive_scalar) + (68*np.size(yd) if signature_omitted else 0)
    # ground truth total for the day
    total_best = np.sum(yd) + (68*np.size(yd) if signature_omitted else 0)
    print(f'{(op_mainnet_genesis_time+datetime.timedelta(days=int(day[0][0]))).date()},{rms:.2f},{rms_combined:.2f},{rms_simple:.2f},{rms_fastlz_cheap:.2f},{rms_naive:.2f},{rms_naive_cheap:.2f},{ma:.2f},{ma_combined:.2f},{ma_simple:.2f},{ma_fastlz_cheap:.2f},{ma_naive:.2f},{ma_naive_cheap:.2f},{total:.2f},{total_combined:.2f},{total_simple:.2f},{total_fastlz_cheap:.2f},{total_naive:.2f},{total_naive_cheap:.2f},{total_best:.2f}')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment