Last active: May 18, 2021
Accompanying gist for the datalake article.
-- ALTER TABLE schema.table DROP IF EXISTS PARTITION (year='2021', month='01', day='11', hour='01')
ALTER TABLE pricing.demand_tbl ADD
PARTITION (year='2021', month='01', day='11', hour='01')
LOCATION 's3://datascience-bucket/wesley.goi/data/pricing/demand_tbl/year=2021/month=01/day=11/hour=01'
MSCK REPAIR TABLE schema.table
CACHE TABLE <tablename>
SELECT /*+ REPARTITION(100), COALESCE(500), REPARTITION_BY_RANGE(3, c) */ * FROM t
# uses the private JVM handle; PySpark 3.3+ also exposes spark.catalog.tableExists
spark._jsparkSession.catalog().tableExists(schema, table)
-- DROP TABLE IF EXISTS schema.table
CREATE EXTERNAL TABLE IF NOT EXISTS pricing.demand_tbl
(
    country_id bigint,
    city_id bigint,
    utcDate string,
    user_id bigint
)
PARTITIONED BY (year string, month string, day string, hour string)
STORED AS PARQUET
LOCATION 's3a://datascience-bucket/wesley.goi/data/pricing/demand_tbl/' -- base path
import pyspark
from pyspark.sql import SparkSession

num_executors = 4  # example value

conf = (
    pyspark.SparkConf()
    .set("spark.executor.instances", num_executors)
)
spark = (
    SparkSession.builder
    .appName("my_app_name")
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)
library(sparklyr)

conf <- spark_config()
conf$spark.executor.memory <- "16G"
conf$spark.executor.instances <- num_executors

spark <- spark_connect(
  master = "local",
  version = "2.3",
  app_name = "my_app_name",
  config = conf
)
CREATE TEMPORARY VIEW IF NOT EXISTS my_table_name AS
<QUERY>
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

def some_func(a: int, b: int) -> int:
    return a + b

# pyspark
udf_some_func = F.udf(some_func, IntegerType())

# in SQL
# SELECT *, udf_some_func(col_a, col_b) FROM table
spark.udf.register("udf_some_func", some_func, IntegerType())
[
  {
    "description": "[DESCRIPTION]",
    "name": "[NAME]",
    "type": "[TYPE]",
    "mode": "[MODE]"
  },
  ...
]
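A filled-in version of this schema template for the demand table columns used earlier might look like the following sketch (the descriptions are illustrative, not from the article; Python is used only to validate the JSON structure):

```python
import json

# Illustrative schema entries for pricing.demand_tbl; descriptions are made up.
schema = [
    {"description": "Country identifier", "name": "country_id", "type": "INTEGER", "mode": "NULLABLE"},
    {"description": "City identifier", "name": "city_id", "type": "INTEGER", "mode": "NULLABLE"},
    {"description": "Event timestamp (UTC)", "name": "utcDate", "type": "STRING", "mode": "NULLABLE"},
    {"description": "User identifier", "name": "user_id", "type": "INTEGER", "mode": "NULLABLE"},
]

# Round-trip through JSON to confirm the structure is valid.
assert json.loads(json.dumps(schema)) == schema
```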
-- Spark 2 only supports the numeric form:
-- SELECT /*+ REPARTITION(3000) */ *
SELECT /*+ REPARTITION(2000, col1, col2, col3) */ *
FROM table
# Infer column data types from the dataframe

## partition columns
partition_cols_dict = {
    "year": "string",
    "month": "string",
    "day": "string",
    "hour": "string"
}
partitions = ", ".join(f"{col} {dtype}" for col, dtype in partition_cols_dict.items())
# print(partitions)
# year string, month string, day string, hour string

## non-partition columns
partition_cols = list(partition_cols_dict)
columns = ", \n".join(
    spark.sql("describe temp")
    .rdd.filter(lambda row: row.col_name not in partition_cols)
    .map(lambda row: row[0] + " " + row[1])
    .collect()
)
# print(columns)
# country_id bigint, city_id bigint, utcDate string, user_id bigint
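The `partitions` and `columns` strings built above can then be dropped into the `CREATE EXTERNAL TABLE` template. A self-contained sketch (the column lists are hard-coded stand-ins here instead of being read via `spark.sql("describe ...")`; the table name and S3 path are the illustrative ones used throughout this gist):

```python
# Hard-coded stand-ins for the strings inferred from the dataframe above.
columns = "country_id bigint, \ncity_id bigint, \nutcDate string, \nuser_id bigint"
partitions = "year string, month string, day string, hour string"

ddl = f"""CREATE EXTERNAL TABLE IF NOT EXISTS pricing.demand_tbl
(
{columns}
)
PARTITIONED BY ({partitions})
STORED AS PARQUET
LOCATION 's3a://datascience-bucket/wesley.goi/data/pricing/demand_tbl/'"""

# In a live session the statement would be run with spark.sql(ddl).
print(ddl)
```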
SELECT *
FROM table
WHERE year||month||day BETWEEN '20210301' AND '20210415'
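The yyyymmdd bounds in a filter like this can be generated rather than hand-typed; a small sketch using only the standard library (the dates match the ones in the query above):

```python
from datetime import date

def partition_key(d: date) -> str:
    # Matches the zero-padded year||month||day concatenation used in the query.
    return f"{d:%Y%m%d}"

start_key = partition_key(date(2021, 3, 1))
end_key = partition_key(date(2021, 4, 15))
print(start_key, end_key)  # 20210301 20210415
```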
df = spark.read.parquet("s3://<bucket>/<suffix>/year=*/month=*/day=*/hour=*")
REFRESH TABLE pricing.demand_tbl
PARTITION (year='2021', month='01', day='11', hour='01')
from pyspark.sql import DataFrame

def save_to_tfrecord(df: DataFrame, path: str):
    """
    Saves a Spark dataframe as gzipped tfrecord files in S3.

    Parameters
    ----------
    df: DataFrame
        spark dataframe
    path: str
        file path, if it's S3 eg.
        s3://<bucket>/some/path/tfrecord.
        The part files are saved under this folder, gzipped.
    """
    (
        df.write
        .format("tfrecords")
        .option("codec", "org.apache.hadoop.io.compress.GzipCodec")
        .mode("overwrite")
        .save(path)
    )
SELECT *
FROM table
WHERE date BETWEEN 20210301 AND 20210415
# enable AQE
spark.conf.set("spark.sql.adaptive.enabled", "true")
# enable shuffle partitions optimisation
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")