Last active
April 24, 2016 23:34
-
-
Save craffel/d7f772c9becaf3387d95 to your computer and use it in GitHub Desktop.
Scripts used for generating the clean MIDI subset, as used in https://github.com/craffel/midi-dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
os.chdir('..') | |
import sys | |
sys.path.append(os.getcwd()) | |
import normalize_names | |
import pickle | |
# Load the md5 -> candidate (artist, title) pairs and the md5 -> path mapping.
# Pickle data is binary, so the files are opened in 'rb' mode.
with open('data/Clean MIDIs-md5_to_artist_title.pickle', 'rb') as f:
    md5_to_artist_title = pickle.load(f)
with open('data/Clean MIDIs-md5_to_path.pickle', 'rb') as f:
    md5_to_path = pickle.load(f)

# Collects md5 -> [resolved_artist, resolved_title] for every file for which
# both the artist and the title could be resolved
md5_to_freebase_artist_title = {}
for md5 in md5_to_path:
    artists_titles = md5_to_artist_title[md5]
    artists = [artist_title[0] for artist_title in artists_titles]
    titles = [artist_title[1] for artist_title in artists_titles]
    # A distinct index name is used here; the original reused the outer
    # loop's counter for this inner loop.
    for index, title in enumerate(titles):
        # Some titles have " l" appended to the end which trips up freebase
        if title[-2:] == ' l':
            titles[index] = title[:-2]
    print('{} {}'.format(artists, titles))
    resolved_artists = normalize_names.echonest_normalize_artist(artists)
    if resolved_artists is not None:
        resolved_artist, resolved_title = \
            normalize_names.freebase_normalize_title(resolved_artists, titles)
        if resolved_artist is not None and resolved_title is not None:
            md5_to_freebase_artist_title[md5] = [resolved_artist,
                                                 resolved_title]
            print(u'\t{} - {}'.format(resolved_artist, resolved_title))

with open('data/Clean MIDIs-md5_to_freebase_artist_title.pickle', 'wb') as f:
    pickle.dump(md5_to_freebase_artist_title, f)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
os.chdir('..') | |
import sys | |
sys.path.append(os.getcwd()) | |
import pickle | |
import csv | |
import shutil | |
import normalize_names | |
def safe_copy(old_path, new_path):
    '''
    Copy old_path to new_path without overwriting an existing destination.

    If new_path already exists, the copy is written to the first free name
    of the form "<new_path without extension>.<n>.mid" with n = 1, 2, ...

    :parameters:
        - old_path : str
            File to copy
        - new_path : str
            Desired destination

    :returns:
        - new_path : str
            Path the file was actually copied to
    '''
    if os.path.exists(new_path):
        stem = os.path.splitext(new_path)[0]
        counter = 1
        candidate = stem + '.{}.mid'.format(counter)
        # Walk the counter up until an unused name is found
        while os.path.exists(candidate):
            counter += 1
            candidate = stem + '.{}.mid'.format(counter)
        new_path = candidate
    shutil.copy(old_path, new_path)
    return new_path
# Make sure the output directory exists
if not os.path.exists('data/clean_midi/mid'):
    os.makedirs('data/clean_midi/mid')

# Pickle data is binary, so the files are opened in 'rb' mode.
with open('data/Clean MIDIs-md5_to_freebase_artist_title.pickle', 'rb') as f:
    md5_to_artist_title = pickle.load(f)
with open('data/Clean MIDIs-md5_to_path.pickle', 'rb') as f:
    md5_to_path = pickle.load(f)

# The Python 2 csv module expects the output file in binary mode
with open('file_lists/clean_midi.txt', 'wb') as f:
    writer = csv.writer(f, delimiter='\t')
    for n, (md5, artist_title) in enumerate(md5_to_artist_title.items()):
        # '/' would be interpreted as a path separator in the output path
        artist = normalize_names.clean(artist_title[0]).replace('/', ' ')
        title = normalize_names.clean(artist_title[1]).replace('/', ' ')
        original_path = os.path.join('data', md5_to_path[md5])
        if not os.path.exists(original_path):
            print("{} not found".format(original_path))
            continue
        artist_directory = os.path.join('data/clean_midi/mid', artist)
        if not os.path.exists(artist_directory):
            os.makedirs(artist_directory)
        # The title is truncated to 247 characters before '.mid' is appended
        output_path = os.path.join(artist_directory, title[:247] + '.mid')
        # safe_copy returns the path actually written (it may add a number)
        output_path = safe_copy(original_path, output_path)
        writer.writerow([n, artist, title, md5,
                         output_path.replace('data/clean_midi/mid/', '')])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# <nbformat>3.0</nbformat> | |
# <codecell> | |
import os | |
import sys | |
import numpy as np | |
import hashlib | |
import pickle | |
# <codecell> | |
# Root directory passed to os.walk by the cleaning cells below
path = '../data/Clean MIDIs/'
# <codecell> | |
def split_all_extensions( f ):
    '''
    Strips extensions from f one at a time (using os.path.splitext) until none is left,
    and returns what remains.
    '''
    while True:
        stem, extension = os.path.splitext(f)
        if extension == '':
            return f
        f = stem
# <codecell> | |
def safe_rename( old_path, new_path ):
    '''
    Moves a file, but if the destination exists it appends a number to the filename.

    When new_path is already taken, every extension but the last is dropped from it and the
    file is moved to the first free name of the form "<stem>.<n>.mid" (n = 1, 2, ...).

    Returns the path the file was actually moved to, so that callers can find out whether
    a number was appended (the original returned nothing).
    '''
    if os.path.exists( new_path ):
        n = 1
        # Collapse "name.3.mid" to "name.mid" so that numbering does not stack up
        new_path = split_all_extensions(new_path) + os.path.splitext(new_path)[1]
        stem = os.path.splitext(new_path)[0]
        while os.path.exists( stem + '.{}.mid'.format( n ) ):
            n += 1
        new_path = stem + '.{}.mid'.format( n )
    # os.renames also creates any missing directories on the way to new_path
    os.renames( old_path, new_path )
    return new_path
# <codecell> | |
def convert_camelCase( string ):
    '''
    Inserts a space wherever an ASCII lowercase letter is directly followed by an ASCII
    uppercase letter ("camelCase" -> "camel Case").

    Returns None when the string contains no such spot.
    '''
    lowers = set('abcdefghijklmnopqrstuvwxyz')
    uppers = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    pieces = []
    found_spot = False
    previous = None
    for character in string:
        if previous in lowers and character in uppers:
            # Boundary between a lowercase and an uppercase letter
            pieces.append(' ')
            found_spot = True
        pieces.append(character)
        previous = character
    if not found_spot:
        return None
    return ''.join(pieces)
# <codecell> | |
# Remove small and non-midi files, and rename .kar to .mid
for root, subdirectories, files in os.walk( path ):
    for f in files:
        full_path = os.path.join( root, f )
        stem, extension = os.path.splitext( f )
        extension = extension.lower()
        if extension == '.kar':
            # safe_rename picks a numbered name when the .mid already exists; the plain
            # os.rename used before would silently replace that file on POSIX systems.
            safe_rename( full_path, os.path.join( root, stem + '.mid' ) )
        elif extension != '.mid' or os.path.getsize( full_path ) < 2000:
            # Not a MIDI file, or a MIDI file smaller than 2000 bytes
            os.remove( full_path )
# <codecell> | |
# Flatten subdirectories
# Any file whose path has more than five '/'-separated components is moved up so that
# only the first four components are kept in front of the filename.
for root, subdirectories, files in os.walk( path ):
    for f in files:
        source = os.path.join( root, f )
        components = source.split('/')
        if len( components ) > 5:
            safe_rename( source, '/'.join( components[:4] + [f] ) )
# <codecell> | |
# Remove empty subdirectories
# A directory that only holds a '.DS_Store' file counts as empty too.
for root, subdirectories, files in os.walk( path ):
    for subdirectory in subdirectories:
        directory = os.path.join( root, subdirectory )
        contents = os.listdir( directory )
        if contents == ['.DS_Store']:
            # os.rmdir refuses to delete a non-empty directory, so the stray file has to
            # be removed first
            os.remove( os.path.join( directory, '.DS_Store' ) )
            contents = []
        if contents == []:
            os.rmdir( directory )
# <codecell> | |
# Remove duplicates
# md5dict maps an md5 hex digest to every path that was seen with that content; only the
# first file seen for each digest is kept on disk.
md5dict = {}
for root, subdirectories, files in os.walk( path ):
    for f in files:
        full_path = os.path.join( root, f )
        # MIDI data is binary: hash the raw bytes and close the file afterwards
        with open( full_path, 'rb' ) as midi_file:
            md5 = hashlib.md5( midi_file.read() ).hexdigest()
        if md5 in md5dict:
            os.remove( full_path )
            md5dict[md5] += [full_path]
        else:
            md5dict[md5] = [full_path]
# <codecell> | |
# Convert CamelCase to Camel Case in subdirectories
for root, subdirectories, files in os.walk( path ):
    for subdirectory in subdirectories:
        spaced = convert_camelCase( subdirectory )
        # convert_camelCase returns None when there is nothing to split
        if spaced is not None:
            safe_rename( os.path.join(root, subdirectory), os.path.join(root, spaced) )
# <codecell>
# Convert CamelCase to Camel Case in files
for root, subdirectories, files in os.walk( path ):
    for f in files:
        spaced = convert_camelCase( f )
        if spaced is not None:
            safe_rename( os.path.join(root, f), os.path.join(root, spaced) )
# <codecell> | |
# Replace _ and - with space
for root, subdirectories, files in os.walk( path ):
    for f in files:
        if '_' in f or '-' in f:
            spaced = f.replace('_', ' ').replace('-', ' ')
            safe_rename( os.path.join(root, f), os.path.join(root, spaced) )
# <codecell>
# Remove files which were just artist names (oops)
# These are the files whose name starts with '.mid', i.e. nothing precedes the extension
for root, subdirectories, files in os.walk( path ):
    for f in files:
        if f.startswith('.mid'):
            os.remove( os.path.join(root, f) )
# <codecell>
# Replace . with space
# Only dots in front of the final extension are replaced; the new name always ends in '.mid'
for root, subdirectories, files in os.walk( path ):
    for f in files:
        title = os.path.splitext( f )[0]
        if '.' in title:
            spaced = title.replace('.', ' ') + '.mid'
            safe_rename( os.path.join(root, f), os.path.join(root, spaced) )
# <codecell> | |
# Change duplicate numbering with space to period (yesterday 7.mid -> yesterday.7.mid)
# Every trailing " <digit>" is cut off the title; when the shortened name is already taken,
# safe_rename appends its own ".<n>" numbering.
for root, subdirectories, files in os.walk( path ):
    for f in files:
        original_title = os.path.splitext(f)[0]
        title = original_title
        while len(title) > 2 and title[-2] == " " and title[-1] in '0123456789':
            title = title[:-2]
        if title != original_title:
            safe_rename( os.path.join(root, f), os.path.join(root, title + '.mid') )
# <codecell> | |
# Flatten all directories
# A file whose path has more than four '/'-separated components is moved to
# <first two components>/<parent directory>/<filename>.
# NOTE(review): with path = '../data/Clean MIDIs/' the first two components are '..' and
# 'data', so files are moved out of 'Clean MIDIs' - confirm that this is intended.
for root, subdirectories, files in os.walk( path ):
    for f in files:
        source = os.path.join( root, f )
        components = source.split('/')
        if len( components ) > 4:
            start = '/'.join( components[:2] )
            end = '/'.join( components[-2:] )
            safe_rename( source, os.path.join( start, end ) )
# <codecell> | |
# Remove artist name from track title
# The artist is the name of the directory holding the file.  First the whole artist name
# is removed from the filename, then every word of it that is longer than 3 characters.
for root, subdirectories, files in os.walk( path ):
    artist = os.path.split(root)[1]
    for f in files:
        new_f = f
        if artist in new_f:
            new_f = new_f.replace(artist, '').lstrip()
        for word in artist.split(' '):
            if len(word) > 3 and word in new_f:
                new_f = new_f.replace(word, '').lstrip()
        # Rename once, to the final name.  Renaming after every replacement (as was done
        # before) breaks as soon as safe_rename has to append a number, because the next
        # rename then looks for a file that does not exist under the assumed name.
        if new_f != f:
            safe_rename( os.path.join(root, f), os.path.join(root, new_f) )
# <codecell> | |
# Strip spaces at the beginning and end of filenames
# Whitespace is stripped from the part in front of the extension only.
for root, subdirectories, files in os.walk( path ):
    for f in files:
        stem, extension = os.path.splitext( f )
        new_f = stem.strip() + extension
        if new_f != f:
            safe_rename( os.path.join(root, f), os.path.join(root, new_f) )
# <codecell>
# Remove multiple spaces
for root, subdirectories, files in os.walk( path ):
    for f in files:
        new_f = f
        # Collapse runs of spaces: search for TWO spaces and replace them with one.
        # Searching for a single space here would never terminate.
        while '  ' in new_f:
            new_f = new_f.replace('  ', ' ')
        if new_f != f:
            safe_rename( os.path.join(root, f), os.path.join(root, new_f) )
# <codecell> | |
def normalize_string(string):
    '''
    Make it lowercase and unicode

    Byte strings are decoded as UTF-8.  Strings which are unicode already are only
    lowercased; passing them to unicode(..., encoding='utf-8') raises a TypeError.
    '''
    lowered = string.lower()
    # bytes is the same type as str on Python 2
    if isinstance(lowered, bytes):
        lowered = lowered.decode('utf-8')
    return lowered
# <codecell> | |
# Make the md5->[[artist, title]] dict
# The first entry for every md5 comes from the current location of the file; further
# entries are derived from the paths stored for that md5 in the md5_to_paths pickle,
# cleaned with the same steps the cells above apply to the files on disk.
md5_to_artist_title = {}
# Pickle data is binary: open in 'rb' mode and close the file afterwards
with open('../data/Clean MIDIs-md5_to_paths.pickle', 'rb') as pickle_file:
    md5_to_paths = pickle.load( pickle_file )
for root, subdirectories, files in os.walk(path):
    for f in files:
        if '.mid' not in f:
            continue
        # MIDI data is binary: hash the raw bytes
        with open( os.path.join(root, f), 'rb' ) as midi_file:
            md5 = hashlib.md5( midi_file.read() ).hexdigest()
        title = normalize_string(split_all_extensions(f))
        artist = normalize_string(os.path.split(root)[1])
        md5_to_artist_title[md5] = [[artist, title]]
        for some_path in md5_to_paths[md5]:
            rem, title = os.path.split(some_path)
            title = os.path.splitext(title)[0]
            artist = os.path.split(rem)[1]
            # convert_camelCase returns None when there is nothing to split
            spaced_title = convert_camelCase(title)
            if spaced_title is not None:
                title = spaced_title
            spaced_artist = convert_camelCase(artist)
            if spaced_artist is not None:
                artist = spaced_artist
            title = title.replace("_", " ").replace("-", " ")
            artist = artist.replace("_", " ").replace("-", " ")
            # Drop one trailing " <digit>" (duplicate numbering)
            if len(title) > 2 and title[-2] == " " and title[-1] in '0123456789':
                title = title[:-2]
            if artist in title:
                title = title.replace(artist, "")
            for word in artist.split(' '):
                if len(word) > 3 and word in title:
                    title = title.replace(word, "")
            # Collapse runs of spaces: search for TWO spaces and replace them with one.
            # Searching for a single space here would never terminate.
            while '  ' in title:
                title = title.replace('  ', ' ')
            while '  ' in artist:
                artist = artist.replace('  ', ' ')
            artist = normalize_string(artist.strip())
            title = normalize_string(title.strip())
            if [artist, title] not in md5_to_artist_title[md5]:
                md5_to_artist_title[md5] += [[artist, title]]
# <codecell> | |
if __name__ == '__main__':
    # Search the cal500 whoosh index for every MIDI file and store the
    # [midi path, mp3 filename] pairs of all results.
    import whoosh_search
    index = whoosh_search.get_whoosh_index('../data/cal500/index/')
    searcher = index.searcher()
    match_list = []
    try:
        for root, subdirectories, files in os.walk(path):
            for f in files:
                # Skip non-MIDI files.  This used to be a `break`, which also dropped all
                # remaining files of the directory as soon as one non-MIDI file showed up.
                if '.mid' not in f.lower():
                    continue
                title = split_all_extensions(f)
                artist = os.path.split(root)[1]
                results = whoosh_search.search(searcher, index.schema, artist, title)
                for result in results:
                    mp3_name = "{}-{}.mp3".format( result[1].replace(' ', '_'),
                                                   result[2].replace(' ', '_') )
                    match_list += [[os.path.join(artist, f), mp3_name]]
    finally:
        # Release the searcher even when a search fails
        searcher.close()
    # Pickle data is binary: write in 'wb' mode and close the file afterwards
    with open('../data/Clean MIDIs-path_to_cal500_path.pickle', 'wb') as output_file:
        pickle.dump( match_list, output_file )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import urllib | |
import unicodedata | |
import pyen | |
import collections | |
# API keys are read from files in the current working directory when the
# module is imported.  Surrounding whitespace (e.g. the trailing newline most
# editors add) is stripped so that it does not end up in the request.
with open('.freebase_key') as key_file:
    FREEBASE_KEY = key_file.read().strip()
with open('.echonest_key') as key_file:
    ECHONEST_KEY = key_file.read().strip()
# Endpoint that the urlencoded query parameters are appended to
FREEBASE_URL = 'https://www.googleapis.com/freebase/v1/search?'
def clean(string):
    '''
    Removes non-ascii characters from a string in a semi-smart way

    :parameters:
        - string : str or unicode
            String to clean

    :returns:
        - clean_string : str
            ASCII string
    '''
    # unicodedata requires unicode type as input.  isinstance (rather than an
    # exact type comparison) also covers subclasses; bytes is the same type
    # as str on Python 2.  Undecodable bytes are dropped.
    if isinstance(string, bytes):
        string = string.decode('utf-8', 'ignore')
    # NFKD decomposition splits accented characters into base character plus
    # combining mark; encoding to ascii with 'ignore' then drops everything
    # that is not ascii, leaving the base characters
    return unicodedata.normalize('NFKD', string).encode('ascii', 'ignore')
def echonest_normalize_artist(artists):
    '''
    Normalize artist names using echonest

    :parameters:
        - artists : str or list of str
            Query artist name or list of potential artist names

    :returns:
        - artists : list of str or NoneType
            Unique list of matching artists, in the order echonest returned
            them, or None if there was no match at all
    '''
    import time
    # Allow strings/unicode to be passed instead of list
    # (type(u'') is unicode on Python 2)
    if isinstance(artists, (str, type(u''))):
        artists = [artists]
    # Keep track of artists that echonest reports as matching
    matched_artists = []
    # pyen makes querying echonest easy
    en = pyen.Pyen(api_key=ECHONEST_KEY)
    for query_artist in artists:
        # Allow for http query failures: retry until the query succeeds
        while True:
            try:
                response = en.get('artist/search',
                                  name=clean(query_artist),
                                  results=5,
                                  fuzzy_match='true')
            except pyen.PyenException as e:
                print('{} {}'.format(getattr(e, 'message', ''), e.args))
                # Pause before retrying instead of hammering the API
                time.sleep(1)
                continue
            break
        # If any artists were found, add them to the list
        for matched_artist in response['artists']:
            matched_artists.append(matched_artist['name'])
    # No matches = return None
    if len(matched_artists) == 0:
        return None
    # Get unique items from the list, keeping the order of first occurrence
    return list(collections.OrderedDict.fromkeys(matched_artists))
def freebase_normalize_title(artists, titles):
    '''
    Normalize a song title using freebase

    :parameters:
        - artists : str or list of str
            Query artist name or list of potential artist names
        - titles : str or list of str
            Query title or list of potential song titles

    :returns:
        - artist : str or NoneType
            Freebase's chosen artist from the supplied `artists` list
            or None if no match
        - title: str or NoneType
            Freebase's purported title or None if no match
    '''
    import time

    def title_match(artist, title, old_correction=False):
        ''' Match a song title with some artist using freebase '''
        # Ask freebase for music recordings with the supplied artist
        filter_str = '(all type:/music/recording /music/recording/artist:"{}")'
        params = {'query': clean(title),
                  # Remove quotes, they mess up the query
                  'filter': filter_str.format(clean(artist).replace('"', '')),
                  # Only return one match
                  'limit': 1,
                  'key': FREEBASE_KEY,
                  # Allow for spelling mistakes
                  'spell': 'always'}
        url = FREEBASE_URL + urllib.urlencode(params)
        # Continually try http queries until a successful one
        while True:
            try:
                response = json.loads(urllib.urlopen(url).read())
            except Exception as e:
                print('{} {}'.format(getattr(e, 'message', ''), e.args))
                # Pause before retrying instead of hammering the API
                time.sleep(1)
                continue
            # A successful query should always have a 'result' key
            if 'result' in response:
                break
            print('result not in response: {}'.format(response))
            time.sleep(1)
        # Given a result, get the name
        if len(response['result']) > 0:
            return response['result'][0]['name']
        # For spelling corrections, re-try the query with the correction,
        # but only do it once
        if 'correction' in response and not old_correction:
            return title_match(artist, response['correction'][0], True)
        return None

    # Allow for string args (type(u'') is unicode on Python 2).  The check for
    # `titles` used to test the type of `artists`, so a unicode title was not
    # wrapped in a list and got queried character by character.
    string_types = (str, type(u''))
    if isinstance(artists, string_types):
        artists = [artists]
    if isinstance(titles, string_types):
        titles = [titles]
    # Try all combinations of supplied artists and titles
    for query_artist in artists:
        for query_title in titles:
            title = title_match(query_artist, query_title)
            if title is not None:
                return query_artist, title
    return None, None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment