文字轉 Unicode 碼位較為簡單,用 ord(x) 即可。
import re
def word2unicode(x):
    """Return the Unicode code point of a single character as uppercase hex.

    Args:
        x: A string containing exactly one character; ``ord`` raises
            ``TypeError`` for strings of any other length.

    Returns:
        The code point written in uppercase hexadecimal without the ``0x``
        prefix and without zero padding, e.g. ``"5B57"`` for ``"字"``.
    """
    # The "X" format spec already produces uppercase hex with no prefix, so
    # no regex clean-up of hex() output is needed.
    return format(ord(x), "X")
word2unicode("字") # 5B57
# GPT2 BPE-Tokenizer token 轉 utf-8 處理 | |
# 轉換僅針對不在詞表內,以bytes形式表達的token(如中文字) | |
from transformers import AutoTokenizer | |
tokenizer = AutoTokenizer.from_pretrained("gpt2") | |
word = "台" | |
tokens = tokenizer.convert_ids_to_tokens(tokenizer(word,add_special_tokens=False)["input_ids"]) | |
print("tokens:",tokens) | |
# 轉 utf-8 |
一 | |
丁 | |
七 | |
三 | |
下 | |
丈 | |
上 | |
丑 | |
丐 | |
不 |
# https://huggingface.co/docs/transformers/perplexity | |
from typing import Any | |
import torch | |
from transformers import AutoModelForCausalLM, AutoTokenizer | |
class PPL(): | |
def __init__(self, model_id="gpt2") -> None: | |
self.model = AutoModelForCausalLM.from_pretrained(model_id) | |
self.tokenizer = AutoTokenizer.from_pretrained(model_id) | |
self.device = 'cpu' |
<s>[INST] <<SYS>>你是一位中文母語使用者,你只能用中文對話<</SYS>>hello [/INST] *你好* (nǐ hǎo) </s> | |
<s>[INST] 你是誰 [/INST] *我是 líng* (wǒ shì líng) - I am Chinese. </s> | |
<s>[INST] 說個笑話來聽聽 [/INST] *笑* (xì) - Sure, here's a Chinese joke for you </s> |
# $ pip install deepspeed>=0.9.3 | |
# $ deepspeed deepspeed_inference.py | |
import os | |
import deepspeed | |
import torch | |
from transformers import AutoTokenizer, AutoModelForCausalLM | |
local_rank = int(os.getenv("LOCAL_RANK", "0")) |