Created
October 19, 2016 23:03
-
-
Save vzhong/4a4dfea0037b16d584cc3b8b63f4a7e6 to your computer and use it in GitHub Desktop.
Converts from character offsets to word offsets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sample sentences for the demo under ``if __name__ == '__main__'``.
sents = [
    'i like sandwiches',
    'the Fr. madison is a great friar',
]
# One (char_start, char_end) pair per entry of ``sents``, in the same order.
# The pairs are used as half-open slices: sents[0][2:6] == 'like' and
# sents[1][4:15] == 'Fr. madison'.
char_offsets = [
    (2, 6),
    (4, 15),
]
def tokenize(sent):
    """Split *sent* into a list of tokens on runs of whitespace.

    Leading and trailing whitespace is ignored, so an empty or
    all-whitespace string yields an empty list.
    """
    return sent.split()
def tokenize_and_convert_to_word_indices(sent, char_start, char_end):
    """Tokenize *sent* and convert a character span into a word span.

    The sentence is cut at ``char_start`` and ``char_end`` and each piece is
    split on whitespace, so a span boundary that falls inside a word splits
    that word into two tokens.

    Args:
        sent: The sentence to tokenize.
        char_start: Character offset where the span begins (inclusive).
        char_end: Character offset where the span ends (exclusive). May be
            equal to ``len(sent)`` for a span that runs to the end.

    Returns:
        A tuple ``(words, word_start, word_end)`` such that
        ``words[word_start:word_end]`` are the tokens of
        ``sent[char_start:char_end]``.

    Raises:
        ValueError: If the offsets do not satisfy
            ``0 <= char_start <= char_end <= len(sent)``.
    """
    # Validate explicitly rather than with ``assert``, which is stripped
    # when Python runs with -O.
    if not 0 <= char_start <= char_end <= len(sent):
        raise ValueError(
            'invalid character span (%r, %r) for a sentence of length %d'
            % (char_start, char_end, len(sent))
        )

    # Slicing (instead of splicing marker words into the text) handles a span
    # that ends at the very end of the sentence, and cannot be confused by a
    # sentence that happens to contain the marker words themselves.
    before = sent[:char_start].split()
    inside = sent[char_start:char_end].split()
    after = sent[char_end:].split()

    word_start = len(before)
    word_end = word_start + len(inside)
    return before + inside + after, word_start, word_end
if __name__ == '__main__':
    # Demo: for each sample sentence, print the sentence, the raw character
    # span, the token list, and the tokens selected by the converted word
    # span, followed by a blank line.
    for sent, char_offset in zip(sents, char_offsets):
        print(sent)
        char_start, char_end = char_offset
        print(sent[char_start:char_end])
        words, word_start, word_end = tokenize_and_convert_to_word_indices(
            sent, char_start, char_end
        )
        print(words)
        print(words[word_start:word_end])
        print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment