Last active: July 10, 2024 22:27
import logging
import argparse
import contextlib
import json
import os
import re
import sys
import numpy as np
import math
import torch
from pathlib import Path

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
# Download the non-MoE base model here:
# https://huggingface.co/ngxson/test_gguf_lora_adapter/blob/main/stories15M.gguf
#
# This works by repeating the weights of the base model to create 4 experts.
#
# Run: ./llama-cli -m ./fake_moe.gguf -n 20 --temp 0
# expected output: Once upon a time, there was a little girl named Lily. She loved to play outside in
#
# With lora: ./llama-cli -m ./fake_moe.gguf -n 20 --temp 0 --lora ./fake_moe_lora.gguf
# expected output: (should be the same as the case below)
#
# With merged model: ./llama-cli -m ./fake_moe_lora_merged.gguf -n 20 --temp 0
# expected output: other othergggggggggggggggggg
### decode_field and get_field_data are copied from gguf_new_metadata.py
def decode_field(field: gguf.ReaderField | None):
    if field and field.types:
        main_type = field.types[0]
        if main_type == gguf.GGUFValueType.ARRAY:
            sub_type = field.types[-1]
            if sub_type == gguf.GGUFValueType.STRING:
                return [str(bytes(field.parts[idx]), encoding='utf-8') for idx in field.data]
            else:
                return [pv for idx in field.data for pv in field.parts[idx].tolist()]
        if main_type == gguf.GGUFValueType.STRING:
            return str(bytes(field.parts[-1]), encoding='utf-8')
        else:
            return field.parts[-1][0]
    return None

def get_field_data(reader: gguf.GGUFReader, key: str):
    field = reader.get_field(key)
    return decode_field(field)
reader = gguf.GGUFReader(path='stories15M.gguf')
orig_tensor_map = {}
for t in reader.tensors:
    orig_tensor_map[t.name] = t.data
    #print(t.name)
#print(orig_tensor_map)
#exit(1)

#### generate base model MOE
N_HEAD = 6
N_LAYERS = 6
N_EXPERTS = 4
N_FF = 768
N_EMBD = 288

gguf_writer = gguf.GGUFWriter(path=None, arch='llama')

def set_hparams():
    gguf_writer.add_name('mixtral_fake')
    gguf_writer.add_block_count(N_LAYERS)
    gguf_writer.add_context_length(128)
    gguf_writer.add_embedding_length(N_EMBD)
    gguf_writer.add_feed_forward_length(N_FF)
    gguf_writer.add_head_count(N_HEAD)
    gguf_writer.add_head_count_kv(N_HEAD)
    gguf_writer.add_expert_count(N_EXPERTS)
    gguf_writer.add_expert_used_count(2)
    gguf_writer.add_rope_dimension_count(48)
    gguf_writer.add_layer_norm_rms_eps(0.000001)
    gguf_writer.add_file_type(gguf.LlamaFileType.MOSTLY_F16)
    gguf_writer.add_bos_token_id(1)
    gguf_writer.add_eos_token_id(2)
    gguf_writer.add_unk_token_id(0)
    gguf_writer.add_tokenizer_model("llama")
    gguf_writer.add_token_list(get_field_data(reader, gguf.Keys.Tokenizer.LIST))
    gguf_writer.add_token_scores(get_field_data(reader, gguf.Keys.Tokenizer.SCORES))
    gguf_writer.add_token_types(get_field_data(reader, gguf.Keys.Tokenizer.TOKEN_TYPE))

set_hparams()

tensors = []
tensors.append(('token_embd.weight', (32000, N_EMBD)))
for il in range(N_LAYERS):
    tensors.append((f'blk.{il}.attn_k.weight', (N_EMBD, N_EMBD)))
    tensors.append((f'blk.{il}.attn_q.weight', (N_EMBD, N_EMBD)))
    tensors.append((f'blk.{il}.attn_v.weight', (N_EMBD, N_EMBD)))
    tensors.append((f'blk.{il}.attn_norm.weight', (N_EMBD,)))
    tensors.append((f'blk.{il}.attn_output.weight', (N_EMBD, N_EMBD)))
    tensors.append((f'blk.{il}.ffn_norm.weight', (N_EMBD,)))
    tensors.append((f'blk.{il}.ffn_gate_inp.weight', (N_EXPERTS, N_EMBD)))
    tensors.append((f'blk.{il}.ffn_down_exps.weight', (N_EXPERTS, N_EMBD, N_FF)))
    tensors.append((f'blk.{il}.ffn_gate_exps.weight', (N_EXPERTS, N_FF, N_EMBD)))
    tensors.append((f'blk.{il}.ffn_up_exps.weight', (N_EXPERTS, N_FF, N_EMBD)))
tensors.append(('output.weight', (32000, N_EMBD)))
tensors.append(('output_norm.weight', (N_EMBD,)))

def get_orig_tensor(name):
    return orig_tensor_map[name.replace('_exps', '')]

np.random.seed(0)

base_tensors = []
for name, shape in tensors:
    if 'ffn_gate_inp' in name:
        base_tensors.append((name, np.random.rand(*shape)))
    elif 'ffn_norm' in name:
        base_tensors.append((name, get_orig_tensor(name)))
    elif 'ffn_' in name:
        # repeat the base 2D weight N_EXPERTS times to form a 3D expert tensor
        exp_3d_tensor = np.stack([get_orig_tensor(name)] * N_EXPERTS, axis=0)
        base_tensors.append((name, exp_3d_tensor))
        print('to 3D', name, exp_3d_tensor.shape)
    elif name in orig_tensor_map:
        base_tensors.append((name, get_orig_tensor(name)))
    else:
        print(f'unhandled tensor {name}')
        exit(1)

print('>> base_tensors')
for name, t in base_tensors:
    dtype = gguf.GGMLQuantizationType.F32
    print(name, t.shape)
    t = t.squeeze().astype(np.float32)
    if t.ndim != 1:
        t = t.squeeze().astype(np.float16)
        dtype = gguf.GGMLQuantizationType.F16
    gguf_writer.add_tensor(name, t, raw_dtype=dtype)

gguf_writer.write_header_to_file('./fake_moe.gguf')
gguf_writer.write_kv_data_to_file()
gguf_writer.write_tensors_to_file(progress=True)
gguf_writer.close()

#### generate LoRA with random weights
print('LoRA')
N_RANK = 32

gguf_writer = gguf.GGUFWriter(path=None, arch='llama')
gguf_writer.add_name('mixtral_fake')
gguf_writer.add_string('training.type', 'finetune_lora')

lora_tensors = []
for name, shape in tensors:
    if any(x in name for x in ['attn_k', 'attn_q', 'attn_v', 'attn_output', 'ffn_gate_inp']):
        lora_a = np.random.rand(N_RANK, shape[1]) * 0.015
        lora_b = np.random.rand(shape[0], N_RANK) * 0.015
        lora_tensors.append((f'{name}.lora_a', lora_a))
        lora_tensors.append((f'{name}.lora_b', lora_b))
    elif any(x in name for x in ['ffn_down', 'ffn_gate', 'ffn_up']):
        lora_a = np.random.rand(N_EXPERTS, N_RANK, shape[2]) * 0.015
        lora_b = np.random.rand(N_EXPERTS, shape[1], N_RANK) * 0.015
        lora_tensors.append((f'{name}.lora_a', lora_a))
        lora_tensors.append((f'{name}.lora_b', lora_b))

for name, t in lora_tensors:
    dtype = gguf.GGMLQuantizationType.F32
    t = t.squeeze().astype(np.float32)
    print(name, t.shape)
    gguf_writer.add_tensor(name, t, raw_dtype=dtype)

gguf_writer.write_header_to_file('./fake_moe_lora.gguf')
gguf_writer.write_kv_data_to_file()
gguf_writer.write_tensors_to_file(progress=True)
gguf_writer.close()

#### generate merged model (LoRA merged into the base weights)
gguf_writer = gguf.GGUFWriter(path=None, arch='llama')
set_hparams()

lora_tensor_map = {}
for name, t in lora_tensors:
    lora_tensor_map[name] = t

for name, t in base_tensors:
    print(name, t.shape)
    # merge lora into the weight: W' = W + B @ A
    if any(x in name for x in ['attn_k', 'attn_q', 'attn_v', 'attn_output', 'ffn_gate_inp', 'ffn_down', 'ffn_gate', 'ffn_up']):
        lora_a = lora_tensor_map[f'{name}.lora_a']
        lora_b = lora_tensor_map[f'{name}.lora_b']
        print(lora_a.shape)
        print(lora_b.shape)
        t = np.add(t, np.matmul(lora_b, lora_a))
    dtype = gguf.GGMLQuantizationType.F32
    t = t.squeeze().astype(np.float32)
    if t.ndim != 1:
        t = t.squeeze().astype(np.float16)
        dtype = gguf.GGMLQuantizationType.F16
    gguf_writer.add_tensor(name, t, raw_dtype=dtype)

gguf_writer.write_header_to_file('./fake_moe_lora_merged.gguf')
gguf_writer.write_kv_data_to_file()
gguf_writer.write_tensors_to_file(progress=True)
gguf_writer.close()
Nice! For testing differences between the merged and the hot-applied-LoRA model, I think llama-perplexity works too.
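For example, something along these lines should work (the exact llama.cpp flags may vary, and wiki.test.raw is just a placeholder eval file):

./llama-perplexity -m ./fake_moe.gguf --lora ./fake_moe_lora.gguf -f wiki.test.raw
./llama-perplexity -m ./fake_moe_lora_merged.gguf -f wiki.test.raw

If the merge is correct, the two runs should report near-identical perplexity.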
OK so that's done. Still very messy, but at least the merged version still outputs the same result as before. Expert tensors are now 3D:
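A minimal sketch to double-check that, assuming ./fake_moe.gguf was produced by the script above and gguf-py is importable the same way: read the file back and print the expert tensors, which should each be 3D with one dimension equal to N_EXPERTS = 4.

import gguf  # same gguf-py as used by the script above

reader = gguf.GGUFReader(path='fake_moe.gguf')
for t in reader.tensors:
    if '_exps' in t.name:
        # each ffn_*_exps tensor should have 3 dimensions, one of them being
        # the expert count (4), since the base 2D weight was stacked 4 times
        print(t.name, list(t.shape))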