Skip to content

Instantly share code, notes, and snippets.

@ayaka14732
Last active July 24, 2024 16:21
Show Gist options
  • Select an option

  • Save ayaka14732/d4527a028e5e96e54de67305eb2967f1 to your computer and use it in GitHub Desktop.

Select an option

Save ayaka14732/d4527a028e5e96e54de67305eb2967f1 to your computer and use it in GitHub Desktop.
入声字情感分析:入声字的情感与舒声字无明显差别
/zh_en.linux
/char.csv
/词表.txt
/结果.txt

与舒声字相比,入声字表示的情感更负面吗?

wget https://github.com/Heptagon196/Dict/raw/master/dic/zh_en.linux
wget https://github.com/CanCLID/rime-cantonese-upstream/raw/d82d3e3e5fc3d39cc3ec67116385e6be5ec37b17/char.csv
python preprocess.py
python 入声字情感分析.py

结果:

舒声字平均分: 60.90
入声字平均分: 59.42

结果表明入声字的情感与舒声字无明显差别。

from collections import defaultdict
import re
# Maps each single-character headword to the list of glosses collected for it
# by the parsing loop further down in this script.
d = defaultdict(list)
# Character class for Han characters, taken from
# https://ayaka.shn.hk/hanregex/zh-CN/
_HAN_CHAR_RE = re.compile(r'[\u3006\u3007\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df\U0002a700-\U0002ebef\U00030000-\U0003134f]')


def has_han(s):
    """Return True when *s* contains at least one character in the Han class.

    An empty string, or one made only of other characters, yields False.
    """
    return _HAN_CHAR_RE.search(s) is not None
# Workaround for glosses like 'AAAA (BBBB; CCCC); DDDD': a semicolon inside
# parentheses must not be treated as a separator between glosses.
def convert_all_nested_semicolon_to_正(s):
    """Replace every ';' found inside a parenthesised group with '正'.

    One substitution pass converts only one semicolon per group, so passes are
    repeated until a pass makes no replacement. The placeholder is expected to
    be turned back into ';' after the string has been split.
    """
    while True:
        s, replaced = re.subn(r'(\([^()]*);([^()]*\))', r'\1正\2', s)
        if not replaced:
            return s
def convert_正_to_semicolon(s):
    """Return *s* with every '正' placeholder turned back into ';'."""
    pieces = s.split('正')
    return ';'.join(pieces)
# Read the dictionary file, which stores each entry as two consecutive lines:
# the headword, then its explanation.
with open('zh_en.linux', encoding='utf-8') as f:
    # zip(f, f) draws two lines per step from the same file iterator; a
    # trailing headword with no explanation line simply ends the loop.
    for word_line, explanation_line in zip(f, f):
        word = word_line.rstrip('\n')
        explanation = explanation_line.rstrip('\n')
        # Only single-character headwords are of interest.
        if len(word) != 1:
            continue
        # Each explanation starts with a literal backslash followed by 'n'.
        assert explanation.startswith(r'\n')
        explanation = explanation[2:]
        # Protect semicolons nested in parentheses, split on the real
        # separators, then restore the protected semicolons in each gloss.
        explanation = convert_all_nested_semicolon_to_正(explanation)
        for raw_item in explanation.split('; '):
            item = convert_正_to_semicolon(raw_item)
            if has_han(item):
                continue
            # Append on demand so that d never gains a key with an empty list.
            d[word].append(item)
# Write one "character<TAB>gloss" line per collected character to 词表.txt.
with open('词表.txt', 'w', encoding='utf-8') as f:
    for k, vs in d.items():
        # Prefer the first gloss that is non-empty and does not open with a
        # parenthesis; otherwise fall back to the very first gloss.
        # startswith() replaces v[0] so an empty gloss (for example from a
        # trailing '; ' in the explanation) cannot raise IndexError.
        vfs = [v for v in vs if v and not v.startswith('(')]
        v = vfs[0] if vfs else vs[0]
        # A tab inside the gloss would corrupt the tab-separated output.
        assert '\t' not in v
        print(k, v, sep='\t', file=f)
tqdm
ToJyutping==0.2.1
numpy==1.22.3
tensorflow==2.10.0
transformers==4.18.0
import ToJyutping
from tqdm import tqdm
from transformers import pipeline
import numpy as np
import re
# Sentiment classifier from Hugging Face transformers. The checkpoint is named
# explicitly so that results do not silently change if the library's default
# for this task changes; this is the checkpoint the task resolves to by default
# in the pinned transformers 4.18.0. It emits 'POSITIVE' / 'NEGATIVE' labels.
classifier = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)
def handle_one_result(result):
    """Convert one classifier result into a positivity score between 0 and 100.

    *result* is a mapping with a 'label' and a 'score' entry. A 'POSITIVE'
    label yields ``score * 100``; any other label yields ``100 - score * 100``.
    The assertions check that the result lands on the expected side of 50.
    """
    percent = result['score'] * 100
    if result['label'] != 'POSITIVE':
        score = 100. - percent
        assert score < 50.
        return score
    assert percent > 50.
    return percent
def 分块(lst, n):
    """Split *lst* into consecutive slices of at most *n* items each.

    The final slice may be shorter; an empty input gives an empty list.
    """
    chunks = []
    for start in range(0, len(lst), n):
        chunks.append(lst[start:start + n])
    return chunks
def 根据粤拼判断入声(粵拼: str) -> bool:
    """Return True if the Jyutping string ends in -p, -t or -k plus a tone digit.

    That ending (a stop coda followed by one digit) is what this script treats
    as a checked-tone syllable. Only the end of the string is inspected.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    return bool(re.search(r'[ptk]\d$', 粵拼))
# Load the "character<TAB>gloss" pairs written by the preprocessing step.
汉字_英文列表 = []
with open('词表.txt', encoding='utf-8') as f:
    for 行 in f:
        # Each line must hold exactly one tab, otherwise unpacking fails.
        汉字, 英文 = 行.rstrip('\n').split('\t')
        汉字_英文列表.append((汉字, 英文))
# The glosses alone, in file order, for feeding to the classifier.
英文列表 = [英文 for _, 英文 in 汉字_英文列表]
# Classify the glosses in batches of 64 and collect one score per gloss,
# in the same order as 英文列表.
情感分析结果列表 = []
分块英文列表 = 分块(英文列表, 64)
for 英文块 in tqdm(分块英文列表):
    情感分析结果列表.extend(
        handle_one_result(单个结果) for 单个结果 in classifier(英文块)
    )
# Split the scores by whether the character's reading is a checked tone,
# write a per-character report to 结果.txt, and print both averages.
舒声字得分列表 = []
入声字得分列表 = []
with open('结果.txt', 'w', encoding='utf-8') as f:
    for (汉字, 英文), 情感分析结果 in zip(汉字_英文列表, 情感分析结果列表):
        粵拼 = ToJyutping.get_jyutping_text(汉字)
        if not 粵拼:
            # Characters without a reading are left out entirely.
            continue
        是入声字 = 根据粤拼判断入声(粵拼)
        if 是入声字:
            声类, 得分列表 = '入', 入声字得分列表
        else:
            声类, 得分列表 = '舒', 舒声字得分列表
        得分列表.append(情感分析结果)
        print(汉字, 粵拼, 声类, 英文, 情感分析结果, sep='\t', file=f)
舒声字平均分 = np.array(舒声字得分列表).mean()
入声字平均分 = np.array(入声字得分列表).mean()
print(f'舒声字平均分:{舒声字平均分:.2f}')
print(f'入声字平均分:{入声字平均分:.2f}')
@ChouUn
Copy link
Copy Markdown

ChouUn commented Jul 24, 2024

非常好研究,让我 GPU 风扇旋转

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment