Skip to content

Instantly share code, notes, and snippets.

@kaustumbh7
Last active October 27, 2022 08:05
Show Gist options
  • Save kaustumbh7/4e7a086eccc5ca567a289bfc391f2761 to your computer and use it in GitHub Desktop.
Save kaustumbh7/4e7a086eccc5ca567a289bfc391f2761 to your computer and use it in GitHub Desktop.
# Convert .tsv file to dataturks json format.
import json
import logging
import sys
def tsv_to_json_format(input_path,output_path,unknown_label):
try:
f=open(input_path,'r') # input file
fp=open(output_path, 'w') # output file
data_dict={}
annotations =[]
label_dict={}
s=''
start=0
for line in f:
if line[0:len(line)-1]!='.\tO':
word,entity=line.split('\t')
s+=word+" "
entity=entity[:len(entity)-1]
if entity!=unknown_label:
if len(entity) != 1:
d={}
d['text']=word
d['start']=start
d['end']=start+len(word)-1
try:
label_dict[entity].append(d)
except:
label_dict[entity]=[]
label_dict[entity].append(d)
start+=len(word)+1
else:
data_dict['content']=s
s=''
label_list=[]
for ents in list(label_dict.keys()):
for i in range(len(label_dict[ents])):
if(label_dict[ents][i]['text']!=''):
l=[ents,label_dict[ents][i]]
for j in range(i+1,len(label_dict[ents])):
if(label_dict[ents][i]['text']==label_dict[ents][j]['text']):
di={}
di['start']=label_dict[ents][j]['start']
di['end']=label_dict[ents][j]['end']
di['text']=label_dict[ents][i]['text']
l.append(di)
label_dict[ents][j]['text']=''
label_list.append(l)
for entities in label_list:
label={}
label['label']=[entities[0]]
label['points']=entities[1:]
annotations.append(label)
data_dict['annotation']=annotations
annotations=[]
json.dump(data_dict, fp)
fp.write('\n')
data_dict={}
start=0
label_dict={}
except Exception as e:
logging.exception("Unable to process file" + "\n" + "error = " + str(e))
return None
tsv_to_json_format("Data/ner_corpus_260.tsv",'Data/ner_corpus_260.json','abc')
@harshitrai17152
Copy link

def csv_to_json_format(input_path, output_path):

f=open(input_path,'r') # input file
fp=open(output_path, 'w') # output file

data_dict={}
annotations =[]
label_dict={}
s=''
start=0

for line in f:
    if line!='\n':
        word,entity=line.split('\t')
        entity = entity.strip()
        s+=word
        
        if entity!='':
            d={}
            d['text']=word
            d['start']=start
            d['end']=start+len(word)-1
            try:
                label_dict[entity].append(d)
            except:
                label_dict[entity]=[]
                label_dict[entity].append(d) 
        start+=len(word)

    data_dict['content']=s
    s=''
    label_list=[]
    for ents in list(label_dict.keys()):
        for i in range(len(label_dict[ents])):
            if(label_dict[ents][i]['text']!=''):
                l=[ents,label_dict[ents][i]]
                for j in range(i+1,len(label_dict[ents])): 
                    if(label_dict[ents][i]['text']==label_dict[ents][j]['text']):
                        di={}
                        di['start']=label_dict[ents][j]['start']
                        di['end']=label_dict[ents][j]['end']
                        di['text']=label_dict[ents][i]['text']
                        l.append(di)
                        label_dict[ents][j]['text']=''
                label_list.append(l)         

        for entities in label_list:
            label={}
            label['label']=[entities[0]]
            label['points']=entities[1:]

        annotations.append(label)
        data_dict['annotation']=annotations
        annotations=[]
        
        json.dump(data_dict, fp)
        fp.write('\n')
        
        data_dict={}
        label_dict={}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment