@jovianlin
Created April 12, 2017 05:31
PySpark Quick Codes
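# Hypothetical setup so the snippets below can run end-to-end (a sketch assuming
# Spark 2.x+, where SQLContext is a legacy wrapper around the SparkSession; the
# toy DataFrame and its column names stand in for the real data):
from pyspark.sql import SparkSession, SQLContext
spark = SparkSession.builder.appName('quick_codes').getOrCreate()
sqlContext = SQLContext(spark.sparkContext)  # legacy handle used in the snippets below
spark_df = spark.createDataFrame(
    [(1, 'r1'), (1, 'r1'), (2, 'r2')],
    ['restaurant_id', 'name'],
)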
# Write DataFrame to disk
# coalesce(1) merges all partitions so the output folder contains a single CSV part file
spark_df.coalesce(1).write.csv('<saved_output/YOUR_FOLDER_NAME>', header=True, mode='overwrite')
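# Note: the write above produces a directory, not a single file; the CSV itself is a
# 'part-00000-*.csv' inside it (plus a _SUCCESS marker). A minimal sketch for locating
# it on a local filesystem (the folder name 'saved_output/my_folder' is an assumption):
import glob
part_files = glob.glob('saved_output/my_folder/part-*.csv')
print(part_files)  # e.g. ['saved_output/my_folder/part-00000-....csv']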
# Read from disk into a DataFrame
new_spark_df = sqlContext.read.csv(s3_path, header=True, inferSchema=False)       # from S3 (s3_path is an S3 URI string)
new_spark_df = sqlContext.read.csv('<LOCATION>', header=True, inferSchema=False)  # optional: mode='FAILFAST' to raise on malformed records
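# With inferSchema=False every column is read as a string. A minimal sketch of passing
# an explicit schema instead (the column names and types here are assumptions):
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
schema = StructType([
    StructField('restaurant_id', IntegerType(), True),
    StructField('name', StringType(), True),
])
typed_df = sqlContext.read.csv('<LOCATION>', header=True, schema=schema)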
# SORTING
# Count rows per restaurant_id, keep groups with at least 99 rows, and sort by count descending
from pyspark.sql.functions import col
col_name = 'restaurant_id'
spark_df.groupBy(col_name).count().filter("count >= 99").sort(col("count").desc())  # optional: append ".toPandas()"
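# The chain above is lazy and returns a DataFrame; nothing is computed until an
# action runs. A minimal usage sketch (variable names are illustrative):
top_counts = spark_df.groupBy(col_name).count().filter("count >= 99").sort(col("count").desc())
top_counts.show(10)       # trigger the computation and print the first 10 groups
# top_counts.toPandas()   # or collect the result into a pandas DataFrame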