@marhan · Created June 2, 2019 12:54
Spark Scala HASPA CSV Import
import org.apache.spark.sql.functions.{col, regexp_replace, to_date, unix_timestamp}
import org.apache.spark.sql.types.{DecimalType, StringType, StructField, StructType}

// Path to the HASPA account CSV exports; adjust to your environment.
val filesPath = "/Volumes/Volume/bank-account-files/*.csv"

// Read every column as a string first; amounts and dates are converted below.
val customSchema = StructType(Array(
  StructField("Buchung", StringType, true),
  StructField("Wert", StringType, true),
  StructField("Verwendungszweck", StringType, true),
  StructField("Betrag", StringType, true)))
// HASPA exports are semicolon-delimited and Windows-1252 encoded.
// Spark 2+ ships a built-in CSV source that replaces the external
// com.databricks.spark.csv package used here originally.
// treatEmptyValuesAsNulls is a legacy spark-csv option; the built-in
// reader already reads empty fields as null.
val df = spark.read.format("csv").
  option("header", "true").
  option("delimiter", ";").
  option("charset", "Windows-1252").
  option("treatEmptyValuesAsNulls", "true").
  option("mode", "DROPMALFORMED").
  schema(customSchema).
  load(filesPath)
// Betrag uses the German number format ("1.234,56"): strip the thousands
// separator, replace the decimal comma with a dot, then cast to decimal.
val account = df.
  withColumn("Betrag", regexp_replace(regexp_replace(df("Betrag"), "\\.", ""), ",", ".").cast(DecimalType(10, 2))).
  // Buchung and Wert are German-style dates ("dd.MM.yyyy").
  withColumn("Buchung", to_date(unix_timestamp(col("Buchung"), "dd.MM.yyyy").cast("timestamp"))).
  withColumn("Wert", to_date(unix_timestamp(col("Wert"), "dd.MM.yyyy").cast("timestamp")))

// Register the result for Spark SQL queries; createOrReplaceTempView
// replaces the registerTempTable call deprecated since Spark 2.0.
account.createOrReplaceTempView("account")
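
// A quick sanity check on the import (illustrative, not part of the original
// gist): query the registered "account" view for monthly totals. Column names
// match the schema defined above.
spark.sql("""
  SELECT date_format(Buchung, 'yyyy-MM') AS month,
         SUM(Betrag) AS total
  FROM account
  GROUP BY date_format(Buchung, 'yyyy-MM')
  ORDER BY month
""").show()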