adamo1139 · April 20, 2026 15:38
diff --git a/.txt b/.txt
 #ulimit -n 100000
 #exllamav3==0.0.28
 #flash_attn==2.8.3
 #torch==2.8.0+cu128
 #ran on a rented, modded 4090 48GB GPU from Vast.AI

 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
 	-ma /home/ubuntu/workspace/models/glm47-bf16 \
 	-mb /home/ubuntu/workspace/models/glm47-2bpw_H6
 	
 	
 -- A perplexity:  5.44790776
 -- B perplexity:  7.56554846
 -- A label in top-K:
      K = 1: 0.6365
      K = 2: 0.7498
      K = 3: 0.7997
      K = 4: 0.8296
      K = 5: 0.8496
 -- B label in top-K:
      K = 1: 0.5820
      K = 2: 0.7017
      K = 3: 0.7581
      K = 4: 0.7919
      K = 5: 0.8151
 -- Top-K agreement, A vs B:
      K = 1: 0.7596
      K = 2: 0.4099
      K = 3: 0.1765
      K = 4: 0.0649
      K = 5: 0.0220
 -- KL divergence (A, B):  0.59911569
 -- KL divergence (B, A):  0.51078425



 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
 	-ma /home/ubuntu/workspace/models/glm47-bf16 \
 	-mb /home/ubuntu/workspace/models/glm47-3bpw_H6
 	
 	
 -- A perplexity:  5.44790776
 -- B perplexity:  6.01840956
 -- A label in top-K:
      K = 1: 0.6365
      K = 2: 0.7498
      K = 3: 0.7997
      K = 4: 0.8296
      K = 5: 0.8496
 -- B label in top-K:
      K = 1: 0.6212
      K = 2: 0.7363
      K = 3: 0.7877
      K = 4: 0.8188
      K = 5: 0.8402
 -- Top-K agreement, A vs B:
      K = 1: 0.8629
      K = 2: 0.5994
      K = 3: 0.3556
      K = 4: 0.1875
      K = 5: 0.0901
 -- KL divergence (A, B):  0.21367304
 -- KL divergence (B, A):  0.20173767



 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
 	-ma /home/ubuntu/workspace/models/glm47-bf16 \
 	-mb /home/ubuntu/workspace/models/glm47-4bpw_H6
 	
 -- A perplexity:  5.44790776
 -- B perplexity:  5.70074813
 -- A label in top-K:
      K = 1: 0.6365
      K = 2: 0.7498
      K = 3: 0.7997
      K = 4: 0.8296
      K = 5: 0.8496
 -- B label in top-K:
      K = 1: 0.6303
      K = 2: 0.7445
      K = 3: 0.7948
      K = 4: 0.8247
      K = 5: 0.8453
 -- Top-K agreement, A vs B:
      K = 1: 0.9113
      K = 2: 0.7162
      K = 3: 0.5023
      K = 4: 0.3186
      K = 5: 0.1892
 -- KL divergence (A, B):  0.10793664
 -- KL divergence (B, A):  0.10609606

 	

 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
 	-ma /home/ubuntu/workspace/models/glm47-bf16 \
 	-mb /home/ubuntu/workspace/models/glm47-5bpw_H8
 	
 -- A perplexity:  5.44790776
 -- B perplexity:  5.48369300
 -- A label in top-K:
      K = 1: 0.6365
      K = 2: 0.7498
      K = 3: 0.7997
      K = 4: 0.8296
      K = 5: 0.8496
 -- B label in top-K:
      K = 1: 0.6360
      K = 2: 0.7486
      K = 3: 0.7993
      K = 4: 0.8289
      K = 5: 0.8490
 -- Top-K agreement, A vs B:
      K = 1: 0.9358
      K = 2: 0.7850
      K = 3: 0.6051
      K = 4: 0.4321
      K = 5: 0.2904
 -- KL divergence (A, B):  0.07210313
 -- KL divergence (B, A):  0.07154187

 	

 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
 	-ma /home/ubuntu/workspace/models/glm47-bf16 \
 	-mb /home/ubuntu/workspace/models/glm47-6bpw_H8
 	
 -- A perplexity:  5.44790776
 -- B perplexity:  5.46985636
 -- A label in top-K:
      K = 1: 0.6365
      K = 2: 0.7498
      K = 3: 0.7997
      K = 4: 0.8296
      K = 5: 0.8496
 -- B label in top-K:
      K = 1: 0.6364
      K = 2: 0.7492
      K = 3: 0.7997
      K = 4: 0.8290
      K = 5: 0.8492
 -- Top-K agreement, A vs B:
      K = 1: 0.9472
      K = 2: 0.8163
      K = 3: 0.6543
      K = 4: 0.4921
      K = 5: 0.3507
 -- KL divergence (A, B):  0.05867771
 -- KL divergence (B, A):  0.05889685

 	

 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
 	-ma /home/ubuntu/workspace/models/glm47-bf16 \
 	-mb /home/ubuntu/workspace/models/glm47-8bpw_H8
 	
 -- A perplexity:  5.44790776
 -- B perplexity:  5.47170094
 -- A label in top-K:
      K = 1: 0.6365
      K = 2: 0.7498
      K = 3: 0.7997
      K = 4: 0.8296
      K = 5: 0.8496
 -- B label in top-K:
      K = 1: 0.6362
      K = 2: 0.7492
      K = 3: 0.7995
      K = 4: 0.8295
      K = 5: 0.8498
 -- Top-K agreement, A vs B:
      K = 1: 0.9531
      K = 2: 0.8342
      K = 3: 0.6852
      K = 4: 0.5307
      K = 5: 0.3931
 -- KL divergence (A, B):  0.05094092
 -- KL divergence (B, A):  0.05181698

 	
 	
 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
 	-ma /home/ubuntu/workspace/models/glm47-bf16 \
 	-mb /home/ubuntu/workspace/models/glm47-210bpw-tuned
 	
 -- A perplexity:  5.44790776
 -- B perplexity:  6.65374450
 -- A label in top-K:
      K = 1: 0.6365
      K = 2: 0.7498
      K = 3: 0.7997
      K = 4: 0.8296
      K = 5: 0.8496
 -- B label in top-K:
      K = 1: 0.6007
      K = 2: 0.7192
      K = 3: 0.7736
      K = 4: 0.8062
      K = 5: 0.8284
 -- Top-K agreement, A vs B:
      K = 1: 0.7945
      K = 2: 0.4672
      K = 3: 0.2254
      K = 4: 0.0949
      K = 5: 0.0365
 -- KL divergence (A, B):  0.44456966
 -- KL divergence (B, A):  0.38990254

 	
 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
 	-ma /home/ubuntu/workspace/models/glm47-bf16 \
 	-mb /home/ubuntu/workspace/models/glm47-257bpw-tuned
 	
 -- A perplexity:  5.44790776
 -- B perplexity:  6.04089632
 -- A label in top-K:
      K = 1: 0.6365
      K = 2: 0.7498
      K = 3: 0.7997
      K = 4: 0.8296
      K = 5: 0.8496
 -- B label in top-K:
      K = 1: 0.6160
      K = 2: 0.7328
      K = 3: 0.7860
      K = 4: 0.8173
      K = 5: 0.8393
 -- Top-K agreement, A vs B:
      K = 1: 0.8280
      K = 2: 0.5291
      K = 3: 0.2835
      K = 4: 0.1323
      K = 5: 0.0577
 -- KL divergence (A, B):  0.31167571
 -- KL divergence (B, A):  0.28659851
 
 	
 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
 	-ma /home/ubuntu/workspace/models/glm47-bf16 \
 	-mb /home/ubuntu/workspace/models/glm47-315bpw-tuned
 	
 -- A perplexity:  5.44790776
 -- B perplexity:  5.65554713
 -- A label in top-K:
      K = 1: 0.6365
      K = 2: 0.7498
      K = 3: 0.7997
      K = 4: 0.8296
      K = 5: 0.8496
 -- B label in top-K:
      K = 1: 0.6291
      K = 2: 0.7441
      K = 3: 0.7949
      K = 4: 0.8252
      K = 5: 0.8462
 -- Top-K agreement, A vs B:
      K = 1: 0.8852
      K = 2: 0.6495
      K = 3: 0.4167
      K = 4: 0.2402
      K = 5: 0.1274
 -- KL divergence (A, B):  0.15319949
 -- KL divergence (B, A):  0.15014105

 	
 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
 	-ma /home/ubuntu/workspace/models/glm47-bf16 \
 	-mb /home/ubuntu/workspace/models/glm47-384bpw-tuned
 	
 -- A perplexity:  5.44790776
 -- B perplexity:  5.66231484
 -- A label in top-K:
      K = 1: 0.6365
      K = 2: 0.7498
      K = 3: 0.7997
      K = 4: 0.8296
      K = 5: 0.8496
 -- B label in top-K:
      K = 1: 0.6309
      K = 2: 0.7443
      K = 3: 0.7951
      K = 4: 0.8249
      K = 5: 0.8456
 -- Top-K agreement, A vs B:
      K = 1: 0.9093
      K = 2: 0.7117
      K = 3: 0.4969
      K = 4: 0.3149
      K = 5: 0.1856
 -- KL divergence (A, B):  0.10877120
 -- KL divergence (B, A):  0.10692901
	#ulimit -n 100000
	#exllamav3==0.0.28
	#flash_attn==2.8.3
	#torch==2.8.0+cu128
	#ran on a rented, modded 4090 48GB GPU from Vast.AI

	PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
	-ma /home/ubuntu/workspace/models/glm47-bf16 \
	-mb /home/ubuntu/workspace/models/glm47-2bpw_H6


	-- A perplexity: 5.44790776
	-- B perplexity: 7.56554846
	-- A label in top-K:
	K = 1: 0.6365
	K = 2: 0.7498
	K = 3: 0.7997
	K = 4: 0.8296
	K = 5: 0.8496
	-- B label in top-K:
	K = 1: 0.5820
	K = 2: 0.7017
	K = 3: 0.7581
	K = 4: 0.7919
	K = 5: 0.8151
	-- Top-K agreement, A vs B:
	K = 1: 0.7596
	K = 2: 0.4099
	K = 3: 0.1765
	K = 4: 0.0649
	K = 5: 0.0220
	-- KL divergence (A, B): 0.59911569
	-- KL divergence (B, A): 0.51078425



	PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
	-ma /home/ubuntu/workspace/models/glm47-bf16 \
	-mb /home/ubuntu/workspace/models/glm47-3bpw_H6


	-- A perplexity: 5.44790776
	-- B perplexity: 6.01840956
	-- A label in top-K:
	K = 1: 0.6365
	K = 2: 0.7498
	K = 3: 0.7997
	K = 4: 0.8296
	K = 5: 0.8496
	-- B label in top-K:
	K = 1: 0.6212
	K = 2: 0.7363
	K = 3: 0.7877
	K = 4: 0.8188
	K = 5: 0.8402
	-- Top-K agreement, A vs B:
	K = 1: 0.8629
	K = 2: 0.5994
	K = 3: 0.3556
	K = 4: 0.1875
	K = 5: 0.0901
	-- KL divergence (A, B): 0.21367304
	-- KL divergence (B, A): 0.20173767



	PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
	-ma /home/ubuntu/workspace/models/glm47-bf16 \
	-mb /home/ubuntu/workspace/models/glm47-4bpw_H6

	-- A perplexity: 5.44790776
	-- B perplexity: 5.70074813
	-- A label in top-K:
	K = 1: 0.6365
	K = 2: 0.7498
	K = 3: 0.7997
	K = 4: 0.8296
	K = 5: 0.8496
	-- B label in top-K:
	K = 1: 0.6303
	K = 2: 0.7445
	K = 3: 0.7948
	K = 4: 0.8247
	K = 5: 0.8453
	-- Top-K agreement, A vs B:
	K = 1: 0.9113
	K = 2: 0.7162
	K = 3: 0.5023
	K = 4: 0.3186
	K = 5: 0.1892
	-- KL divergence (A, B): 0.10793664
	-- KL divergence (B, A): 0.10609606



	PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
	-ma /home/ubuntu/workspace/models/glm47-bf16 \
	-mb /home/ubuntu/workspace/models/glm47-5bpw_H8

	-- A perplexity: 5.44790776
	-- B perplexity: 5.48369300
	-- A label in top-K:
	K = 1: 0.6365
	K = 2: 0.7498
	K = 3: 0.7997
	K = 4: 0.8296
	K = 5: 0.8496
	-- B label in top-K:
	K = 1: 0.6360
	K = 2: 0.7486
	K = 3: 0.7993
	K = 4: 0.8289
	K = 5: 0.8490
	-- Top-K agreement, A vs B:
	K = 1: 0.9358
	K = 2: 0.7850
	K = 3: 0.6051
	K = 4: 0.4321
	K = 5: 0.2904
	-- KL divergence (A, B): 0.07210313
	-- KL divergence (B, A): 0.07154187



	PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
	-ma /home/ubuntu/workspace/models/glm47-bf16 \
	-mb /home/ubuntu/workspace/models/glm47-6bpw_H8

	-- A perplexity: 5.44790776
	-- B perplexity: 5.46985636
	-- A label in top-K:
	K = 1: 0.6365
	K = 2: 0.7498
	K = 3: 0.7997
	K = 4: 0.8296
	K = 5: 0.8496
	-- B label in top-K:
	K = 1: 0.6364
	K = 2: 0.7492
	K = 3: 0.7997
	K = 4: 0.8290
	K = 5: 0.8492
	-- Top-K agreement, A vs B:
	K = 1: 0.9472
	K = 2: 0.8163
	K = 3: 0.6543
	K = 4: 0.4921
	K = 5: 0.3507
	-- KL divergence (A, B): 0.05867771
	-- KL divergence (B, A): 0.05889685



	PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
	-ma /home/ubuntu/workspace/models/glm47-bf16 \
	-mb /home/ubuntu/workspace/models/glm47-8bpw_H8

	-- A perplexity: 5.44790776
	-- B perplexity: 5.47170094
	-- A label in top-K:
	K = 1: 0.6365
	K = 2: 0.7498
	K = 3: 0.7997
	K = 4: 0.8296
	K = 5: 0.8496
	-- B label in top-K:
	K = 1: 0.6362
	K = 2: 0.7492
	K = 3: 0.7995
	K = 4: 0.8295
	K = 5: 0.8498
	-- Top-K agreement, A vs B:
	K = 1: 0.9531
	K = 2: 0.8342
	K = 3: 0.6852
	K = 4: 0.5307
	K = 5: 0.3931
	-- KL divergence (A, B): 0.05094092
	-- KL divergence (B, A): 0.05181698



	PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
	-ma /home/ubuntu/workspace/models/glm47-bf16 \
	-mb /home/ubuntu/workspace/models/glm47-210bpw-tuned

	-- A perplexity: 5.44790776
	-- B perplexity: 6.65374450
	-- A label in top-K:
	K = 1: 0.6365
	K = 2: 0.7498
	K = 3: 0.7997
	K = 4: 0.8296
	K = 5: 0.8496
	-- B label in top-K:
	K = 1: 0.6007
	K = 2: 0.7192
	K = 3: 0.7736
	K = 4: 0.8062
	K = 5: 0.8284
	-- Top-K agreement, A vs B:
	K = 1: 0.7945
	K = 2: 0.4672
	K = 3: 0.2254
	K = 4: 0.0949
	K = 5: 0.0365
	-- KL divergence (A, B): 0.44456966
	-- KL divergence (B, A): 0.38990254


	PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
	-ma /home/ubuntu/workspace/models/glm47-bf16 \
	-mb /home/ubuntu/workspace/models/glm47-257bpw-tuned

	-- A perplexity: 5.44790776
	-- B perplexity: 6.04089632
	-- A label in top-K:
	K = 1: 0.6365
	K = 2: 0.7498
	K = 3: 0.7997
	K = 4: 0.8296
	K = 5: 0.8496
	-- B label in top-K:
	K = 1: 0.6160
	K = 2: 0.7328
	K = 3: 0.7860
	K = 4: 0.8173
	K = 5: 0.8393
	-- Top-K agreement, A vs B:
	K = 1: 0.8280
	K = 2: 0.5291
	K = 3: 0.2835
	K = 4: 0.1323
	K = 5: 0.0577
	-- KL divergence (A, B): 0.31167571
	-- KL divergence (B, A): 0.28659851


	PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
	-ma /home/ubuntu/workspace/models/glm47-bf16 \
	-mb /home/ubuntu/workspace/models/glm47-315bpw-tuned

	-- A perplexity: 5.44790776
	-- B perplexity: 5.65554713
	-- A label in top-K:
	K = 1: 0.6365
	K = 2: 0.7498
	K = 3: 0.7997
	K = 4: 0.8296
	K = 5: 0.8496
	-- B label in top-K:
	K = 1: 0.6291
	K = 2: 0.7441
	K = 3: 0.7949
	K = 4: 0.8252
	K = 5: 0.8462
	-- Top-K agreement, A vs B:
	K = 1: 0.8852
	K = 2: 0.6495
	K = 3: 0.4167
	K = 4: 0.2402
	K = 5: 0.1274
	-- KL divergence (A, B): 0.15319949
	-- KL divergence (B, A): 0.15014105


	PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python model_diff.py \
	-ma /home/ubuntu/workspace/models/glm47-bf16 \
	-mb /home/ubuntu/workspace/models/glm47-384bpw-tuned

	-- A perplexity: 5.44790776
	-- B perplexity: 5.66231484
	-- A label in top-K:
	K = 1: 0.6365
	K = 2: 0.7498
	K = 3: 0.7997
	K = 4: 0.8296
	K = 5: 0.8496
	-- B label in top-K:
	K = 1: 0.6309
	K = 2: 0.7443
	K = 3: 0.7951
	K = 4: 0.8249
	K = 5: 0.8456
	-- Top-K agreement, A vs B:
	K = 1: 0.9093
	K = 2: 0.7117
	K = 3: 0.4969
	K = 4: 0.3149
	K = 5: 0.1856
	-- KL divergence (A, B): 0.10877120
	-- KL divergence (B, A): 0.10692901
No results found