Last active
April 1, 2025 23:18
-
-
Save bonelifer/4059100046084923660b23848b3c38f3 to your computer and use it in GitHub Desktop.
Script to Fetch MusicBrainz IDs (MBIDs) for Artists from a CSV File
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
Script to Fetch MusicBrainz IDs (MBIDs) for Artists from a CSV File

Purpose:
    This script reads a CSV file containing artist names, extracts unique artist names
    from the columns 'Artist Name 1' to 'Artist Name 6', and queries the MusicBrainz API
    for the MusicBrainz ID (MBID) of each artist. It then saves the results to a JSON file.

Steps:
    1. Load a CSV file containing song details, which includes up to six columns for artist names.
    2. Extract artist names from these columns, handling cases where multiple artists are listed
       in a single cell.
    3. Query the MusicBrainz API to retrieve the MBID for each unique artist.
    4. If an MBID is found, it is stored as a JSON object holding just the MBID value.
    5. The script respects the MusicBrainz API rate limit by introducing a delay between requests.
    6. The results are saved in a formatted JSON file named 'artists_mbids.json'.
"""
import os # Import os to handle file path checks | |
import pandas as pd # Import pandas to read CSV and manipulate data | |
import requests # Import requests for making HTTP requests to the MusicBrainz API | |
import time # Import time for adding delays to prevent hitting rate limits | |
import json # Import json for formatting the output as JSON | |
# Contact details sent in the User-Agent header of every MusicBrainz request.
email = "[email protected]"  # Replace with your actual email address
app_name = "GoogleTakeoutMusicToMBid/1.0"  # A unique name that describes the purpose of this script

# File path for the CSV file
file_path = r"./musiclibrarysongs.csv"  # Change this path to where your CSV file is located


def extract_artist_names(frame, max_columns=6):
    """
    Collects the unique artist names found in the 'Artist Name N' columns.

    Parameters:
        frame (pandas.DataFrame): Song table whose column names are already stripped.
        max_columns (int): Highest 'Artist Name N' column number to inspect (default 6).

    Returns:
        set: Distinct, whitespace-stripped, non-empty artist names as strings.
    """
    names = set()
    for index in range(1, max_columns + 1):
        column = f'Artist Name {index}'
        # Columns are optional: a library export may use fewer than six of them.
        if column not in frame.columns:
            continue
        for cell in frame[column].dropna():
            # str() protects against cells that were parsed as numbers
            # (e.g. the band "311"), which have no .split() method.
            for name in str(cell).split(','):
                name = name.strip()
                # "A,,B" or a trailing comma would otherwise add an empty name.
                if name:
                    names.add(name)
    return names


# Check if the file exists before attempting to read it
if not os.path.isfile(file_path):
    print(f"Error: The file at '{file_path}' could not be found.")
    print("Please ensure the file exists at the specified location and try again.")
else:
    try:
        # on_bad_lines='skip' ignores malformed rows; dtype=str keeps numeric-looking
        # artist names as text instead of letting pandas convert them to int/float.
        df = pd.read_csv(file_path, on_bad_lines='skip', dtype=str)
        print("File successfully loaded.")
    except Exception as e:
        # Best-effort: report any read/parse failure and leave `df` undefined.
        print(f"An error occurred while trying to load the file: {e}")
        print("Please check the file format or permissions and try again.")

# `df` only exists when the CSV was read successfully, so this guard skips
# the processing below after a missing file or a failed load.
if 'df' in locals():
    # Strip stray spaces from the header so the 'Artist Name N' lookups match.
    df.columns = df.columns.str.strip()

    # Unique artist names gathered from 'Artist Name 1' .. 'Artist Name 6'.
    artist_names = extract_artist_names(df)

    # List that will hold the MusicBrainz IDs (MBIDs) found for the artists.
    mbids = []

    # MusicBrainz API endpoint used for searching artists by name.
    api_endpoint = "https://musicbrainz.org/ws/2/artist/"

    # Custom User-Agent identifying this script and its maintainer to MusicBrainz.
    headers = {
        'User-Agent': f'{app_name} ({email})'  # Replace with your own details
    }
# Characters that act as operators in the Lucene query syntax used by the
# MusicBrainz search endpoint; each one is escaped with a backslash.
_LUCENE_SPECIAL_CHARS = frozenset('\\+-!():^[]"{}~*?|&/')


def _escape_lucene(text):
    """Backslash-escapes Lucene operators so the text is searched literally."""
    return ''.join('\\' + ch if ch in _LUCENE_SPECIAL_CHARS else ch for ch in text)


def get_artist_mbid(artist_name, timeout=10):
    """
    Queries the MusicBrainz API to retrieve the MusicBrainz ID (MBID) for a given artist.

    Parameters:
        artist_name (str): The name of the artist to search for.
        timeout (float): Seconds to wait for the server before giving up (default 10).

    Returns:
        str: The MusicBrainz ID (MBID) of the first matching artist, or None if the name
        is blank, no match is found, or the request fails (failures are printed).
    """
    # A blank name cannot match anything; skip the network round-trip entirely.
    name = '' if artist_name is None else str(artist_name).strip()
    if not name:
        return None

    query_params = {
        # Escape operators so names like "AC/DC" or "!!!" are not parsed as query syntax.
        'query': _escape_lucene(name),
        'fmt': 'json'  # Ask for a JSON response
    }

    try:
        # Without a timeout a stalled connection would block the script forever.
        response = requests.get(
            api_endpoint, params=query_params, headers=headers, timeout=timeout
        )
        response.raise_for_status()  # Raise an error if the HTTP request failed
        data = response.json()
        artists = data.get('artists')
        if artists:
            # The first search hit is taken as the match.
            return artists[0]['id']
    except requests.exceptions.RequestException as e:
        # Network/HTTP failure: report it and fall through to return None.
        print(f"Error while fetching data for artist '{artist_name}': {e}")
    except Exception as e:
        # Best-effort: an unexpected response shape must not abort the whole run.
        print(f"Unexpected error while processing artist '{artist_name}': {e}")

    # No artist was found or an error occurred
    return None
# `artist_names` (and `mbids`) are only defined when the CSV was loaded and parsed
# earlier in the script; without them there is nothing to fetch or save.
if 'artist_names' in locals():
    # MBIDs already collected, so several spellings of one artist produce one entry.
    seen_mbids = {entry["MusicBrainzId"] for entry in mbids}

    # sorted() makes the request order, and therefore the output file, reproducible;
    # plain set iteration order changes from run to run.
    for artist in sorted(artist_names):
        # Blank names cannot match anything; skip them without spending a request.
        if not artist:
            continue

        mbid = get_artist_mbid(artist)

        if mbid:
            if mbid not in seen_mbids:
                seen_mbids.add(mbid)
                mbids.append({"MusicBrainzId": mbid})
        else:
            # Log artists for which no MBID could be retrieved.
            print(f"MBID not found for artist: {artist}")

        # Pause 1 second between requests to stay within the MusicBrainz rate limit.
        time.sleep(1)

    # Pretty-print the collected MBIDs as a JSON array with 4-space indentation.
    json_output = json.dumps(mbids, indent=4)

    # Name of the file the JSON data is written to.
    output_file = "artists_mbids.json"

    try:
        # UTF-8 keeps the output encoding independent of the platform default.
        with open(output_file, 'w', encoding='utf-8') as json_file:
            json_file.write(json_output)
        print(f"JSON array of MusicBrainz IDs has been saved to '{output_file}'.")
    except OSError as e:
        # Report file-system problems (permissions, missing directory, full disk).
        print(f"An error occurred while writing the output file: {e}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment