Skip to content

Instantly share code, notes, and snippets.

@Taekyoon
Created July 3, 2019 07:23
Show Gist options
  • Save Taekyoon/e7cf95b7895b065332c92f74744dae46 to your computer and use it in GitHub Desktop.
Save Taekyoon/e7cf95b7895b065332c92f74744dae46 to your computer and use it in GitHub Desktop.
sequence tagging utils
def base_to_char_level_bio_tags(sent, tag, empty_tag='O'):
    """Expand word-level tags into a character-level tag sequence.

    ``sent`` and ``tag`` are whitespace-separated strings holding one tag per
    word. Every character of a word receives that word's tag. Each gap
    between two consecutive words contributes exactly one extra tag (runs of
    whitespace are collapsed by ``str.split``): the shared tag when both
    neighbouring words carry the same tag, otherwise ``empty_tag``.

    Args:
        sent: Sentence whose words are separated by whitespace.
        tag: Whitespace-separated tags, one per word of ``sent``.
        empty_tag: Tag assigned to a gap whose neighbouring words have
            different tags.

    Returns:
        The character-level tags joined by single spaces; an empty string
        when ``sent`` contains no words.

    Raises:
        ValueError: If the number of words and the number of tags differ.
    """
    sent_list, tag_list = sent.split(), tag.split()
    if len(sent_list) != len(tag_list):
        raise ValueError(
            'sent has %d words but tag has %d tags'
            % (len(sent_list), len(tag_list))
        )

    char_level_tags = []
    last_index = len(sent_list) - 1
    for i, (word, word_tag) in enumerate(zip(sent_list, tag_list)):
        char_level_tags.extend([word_tag] * len(word))
        if i >= last_index:
            # No separator follows the final word.
            break
        # Compare the word-level tags of the two neighbours. Indexing the raw
        # ``tag`` string here would compare single characters instead.
        if tag_list[i] == tag_list[i + 1]:
            char_level_tags.append(word_tag)
        else:
            char_level_tags.append(empty_tag)
    return ' '.join(char_level_tags)
def transit_bio_tags_level_by_sent(sent, label):
    """Collapse a character-level label sequence into one label per word.

    ``sent`` is split on whitespace and ``label`` is consumed in consecutive
    windows, each as long as the corresponding word. For every window the
    chosen label is:

    * the first entry containing ``'B'``, if there is one before the window
      ends (scanning stops there);
    * otherwise the last entry containing ``'I'``;
    * otherwise ``'O'`` (also used when the window is empty because
      ``label`` is shorter than the sentence).

    The ``'B'`` / ``'I'`` checks are containment tests, not prefix tests.

    NOTE(review): window offsets advance by word length only, so ``label``
    presumably holds no entries for the whitespace between words -- confirm
    against the caller.

    Args:
        sent: Sentence whose words are separated by whitespace.
        label: Sliceable sequence of character-level labels.

    Returns:
        The per-word labels joined by single spaces.
    """
    def pick_label(window):
        # A 'B' entry wins immediately; otherwise remember the latest 'I'.
        chosen = 'O'
        for entry in window:
            if 'B' in entry:
                return entry
            if 'I' in entry:
                chosen = entry
        return chosen

    word_labels = []
    start = 0
    for word in sent.split():
        stop = start + len(word)
        word_labels.append(pick_label(label[start:stop]))
        start = stop
    return ' '.join(word_labels)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment