Created
November 11, 2021 22:26
-
-
Save soldni/47fb3d67795eedc8bbc49438436462d0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import json | |
import pymysql.cursors | |
import itertools | |
import tqdm | |
import re | |
# Connect to the database | |
connection = pymysql.connect(host='localhost', | |
user='ubuntu', | |
password='password', | |
database='2020.1', | |
autocommit=True, | |
cursorclass=pymysql.cursors.SSDictCursor) | |
mapping = { | |
"Target_Child": ["Target_Child"], | |
"Caregiver": ["Mother", "Father", "Grandmother", "Caretaker", "Grandfather"], | |
"Other_Child": ["Child", "Sister", "Brother", "Playmate", "Friend", "Student", "Sibling"], | |
"Other_Adult": ["Investigator", "Adult", "Teacher", "Nurse", "Visitor", "Target_Adult", "Doctor"], | |
"Other": ["Relative", "Participant", "Unidentified", "Subject", "Informant", | |
"Media", "Narrator", "Girl", "Group", "Teenager", "Environment", | |
"Boy", "Speaker", "Uncertain", "Male"] | |
} | |
inv_mapping = {} | |
for k, v in mapping.items(): | |
for elem in v: | |
inv_mapping[elem] = k | |
all_data = [] | |
with connection, connection.cursor() as cursor: | |
sql_count = "SELECT COUNT(id) FROM utterance;" | |
cursor.execute(sql_count) | |
count = cursor.fetchall() | |
count = int(count[0]['COUNT(id)']) | |
sql_rows = "SELECT * FROM utterance GROUP BY corpus_id, transcript_id, utterance_order, id" | |
cursor.execute(sql_rows) | |
current_transcript = {} | |
for row in tqdm.tqdm(cursor, desc='Reading utterances', unit=' utt', unit_scale=True, total=count): | |
# row = dict(zip(columns, row)) | |
if row['language'] != 'eng': | |
continue | |
if (current_transcript.get('corpus_id', None) != row['corpus_id'] or | |
current_transcript.get('transcript_id', None) != row['transcript_id']): | |
if current_transcript: | |
all_data.append(current_transcript) | |
current_transcript = {} | |
current_transcript.setdefault('utterances', []) | |
for key in ('collection_name', 'corpus_id', 'transcript_id', 'target_child_name', 'target_child_age', | |
'target_child_sex', 'target_child_id', 'corpus_name', 'language'): | |
current_transcript[key] = row[key] | |
current_transcript['utterances'].append({ | |
'id': row['id'], | |
'gloss': row['gloss'], | |
'type': row['type'], | |
'part_of_speech': row['part_of_speech'], | |
'speaker_name': row['speaker_name'], | |
'speaker_role': inv_mapping.get(row['speaker_role'], 'Other') | |
}) | |
if len(current_transcript): | |
all_data.append(current_transcript) | |
with open('/home/ubuntu/childes/childes.eng.jsonl', 'w', encoding='utf-8') as out_file: | |
for doc in tqdm.tqdm(all_data, desc='Writing to file', unit=' docs', unit_scale=True): | |
out_file.write(json.dumps(doc, sort_keys=True) + '\n') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment