###### Imports ######
import argparse
import os
from pyspark.sql.functions import udf
from pyspark.sql.functions import lit
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *
import pyspark
from pyspark import SparkFiles
###### Constants ######
CONTI_FEATURES = ['year', 'bpm', 'nrgy', 'dnce', 'dB', 'live', 'val', 'dur', 'acous', 'spch', 'pop']
RESULTS_FILENAME_DEFAULT = 'results_top50_spotify.csv'
###### Arguments ######
parser = argparse.ArgumentParser(description='Average popularity of the Spotify Top 50 songs by genre and country')
parser.add_argument('input_file', type=str, help='Input CSV file path')
parser.add_argument('output_path', type=str, help='Output path to store results')
args = parser.parse_args()
###### Functions ######
# Custom helper to convert the data type of the given DataFrame columns
def convertColumn(df, names, newType):
    for name in names:
        df = df.withColumn(name, df[name].cast(newType))
    return df
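# Note (a sketch, not part of the original gist): an equivalent, loop-free version of the same
# cast could be written with select(), assuming the same column names:
#   df.select([df[c].cast(newType).alias(c) if c in names else df[c] for c in df.columns])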
if __name__ == '__main__':
    # 0. Arguments
    filename = args.input_file
    results_path = args.output_path
    results_path_file = os.path.join(results_path, RESULTS_FILENAME_DEFAULT)
    # 1. Spark Context
    sc = pyspark.SparkContext('local[*]')
    sqlContext = pyspark.SQLContext(sc)
    # 2. Read the CSV file from the input path and cast the continuous features to floats
    df_spotify = sqlContext.read.csv(filename, header=True, inferSchema=True)
    df_spotify = convertColumn(df_spotify, CONTI_FEATURES, FloatType())
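    # Optional sanity check (a suggestion, not in the original script): confirm the casts
    # took effect before aggregating, e.g.:
    #   df_spotify.printSchema()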
    # 3. ETL / Computations: average popularity per (genre, country) pair, sorted in descending order
    df_spotify_results = df_spotify.groupBy("top genre", "country").agg({'pop': 'mean'}).sort("avg(pop)", ascending=False)
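    # A slightly more explicit equivalent (a sketch, not part of the original) that names the
    # aggregated column up front instead of relying on the auto-generated "avg(pop)" label:
    #   from pyspark.sql.functions import avg
    #   df_spotify_results = (df_spotify.groupBy("top genre", "country")
    #                                   .agg(avg("pop").alias("avg_pop"))
    #                                   .sort("avg_pop", ascending=False))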
    # 4. Save results
    df_spotify_results.write.csv(results_path_file)
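    # Note: write.csv() produces a directory of part-* files at results_path_file rather than a
    # single CSV. If one file with a header is wanted (an optional variant, not in the original),
    # something like the following could be used instead:
    #   df_spotify_results.coalesce(1).write.csv(results_path_file, header=True, mode='overwrite')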
# Improvements: http://blog.appliedinformaticsinc.com/how-to-write-spark-applications-in-python/