apeletz512 · March 12, 2017 21:41 · ericbellet · Jan 5, 2023 · sunmoon4ever · Feb 8, 2023
diff --git a/build_hive_ddl.py b/build_hive_ddl.py
 def build_hive_ddl(
        table_name, object_schema, location, file_format, partition_schema=None, verbose=False):
    """
    Build a Hive ``CREATE EXTERNAL TABLE`` statement from a Spark schema.

    :param table_name: the name of the table you want to register in the Hive metastore
    :param object_schema: an instance of pyspark.sql.Dataframe.schema
    :param location: the storage location for this data (an S3 or HDFS filepath)
    :param file_format: a string compatible with the 'STORED AS <format>' Hive DDL syntax
    :param partition_schema: an optional instance of pyspark.sql.Dataframe.schema that stores the
    columns that are used for partitioning on disk
    :param verbose: when true, print the generated DDL before returning it
    :return: the generated DDL statement as a string
    """
    def render_columns(schema):
        # Join name and type explicitly instead of rewriting every ':' in
        # simpleString(): nested types such as struct<a:int> carry their own
        # colons, which must be left untouched.
        return ','.join(
            field.name + ' ' + field.dataType.simpleString() for field in schema)

    ddl = 'CREATE EXTERNAL TABLE ' + table_name + ' (' + render_columns(object_schema) + ')'
    # A missing or empty partition schema emits no PARTITIONED BY clause.
    if partition_schema:
        ddl += ' PARTITIONED BY (' + render_columns(partition_schema) + ')'
    ddl += ' STORED AS ' + file_format + ' LOCATION "' + location + '"'
    if verbose:
        print('Generated Hive DDL:\n' + ddl)
    return ddl
	def build_hive_ddl(
	        table_name, object_schema, location, file_format, partition_schema=None, verbose=False):
	    """
	    Build a Hive ``CREATE EXTERNAL TABLE`` statement from a Spark schema.

	    :param table_name: the name of the table you want to register in the Hive metastore
	    :param object_schema: an instance of pyspark.sql.Dataframe.schema
	    :param location: the storage location for this data (an S3 or HDFS filepath)
	    :param file_format: a string compatible with the 'STORED AS <format>' Hive DDL syntax
	    :param partition_schema: an optional instance of pyspark.sql.Dataframe.schema that stores the
	    columns that are used for partitioning on disk
	    :param verbose: when true, print the generated DDL before returning it
	    :return: the generated DDL statement as a string
	    """
	    def column_list(schema):
	        # Pair each field name with its type directly; replacing ':' across
	        # the whole simpleString() would also corrupt the colons that nested
	        # types such as struct<a:int> contain.
	        rendered = []
	        for field in schema:
	            rendered.append(field.name + ' ' + field.dataType.simpleString())
	        return ','.join(rendered)

	    pieces = ['CREATE EXTERNAL TABLE ', table_name, ' (', column_list(object_schema), ')']
	    # Only emit PARTITIONED BY when a non-empty partition schema is given.
	    if partition_schema:
	        pieces.extend([' PARTITIONED BY (', column_list(partition_schema), ')'])
	    pieces.extend([' STORED AS ', file_format, ' LOCATION "', location, '"'])
	    ddl = ''.join(pieces)
	    if verbose:
	        print('Generated Hive DDL:\n' + ddl)
	    return ddl