Scala/Spark
// Read a PostgreSQL table into a typed Dataset[City] over JDBC
import java.text.SimpleDateFormat

import org.apache.spark.sql
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, udf}

case class City(
  id: Int,
  name: String,
  iata: Option[String],
  longitude: Option[Double],
  latitude: Option[Double],
  updatedTimestamp: Long)

object udfHelpers {
  // Parses a "yyyy-MM-dd HH:mm" string into epoch milliseconds
  val toTimestamp = udf((date: String) => new SimpleDateFormat("yyyy-MM-dd HH:mm").parse(date).getTime)
}

val spark = SparkSession
  .builder
  .appName("Spark Test")
  .master("local[*]")
  .getOrCreate

import spark.implicits._

def getSqlDf(host: String, port: Int, name: String, user: String, password: String, tableName: String): sql.DataFrame = {
  spark.read
    .format("jdbc")
    .option("driver", "org.postgresql.Driver")
    .option("url", s"jdbc:postgresql://${host}:${port}/${name}?zeroDateTimeBehavior=convertToNull&read_buffer_size=100M")
    .option("dbtable", tableName)
    .option("user", user)
    .option("password", password)
    .load
}

val citiesDF = getSqlDf(
    host = Settings.geoDb.host,
    port = Settings.geoDb.port,
    name = Settings.geoDb.name,
    user = Settings.geoDb.user,
    password = Settings.geoDb.password,
    tableName = "geo.\"Cities\""
  )
  .withColumn("updatedTimestamp", udfHelpers.toTimestamp(col("updatedAt")))
  .filter("iata IS NULL")
  .as[City].cache
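// Hypothetical usage (not part of the original gist): once the JDBC read succeeds,
// the typed Dataset can be inspected with ordinary Dataset operations.
citiesDF.show(10, truncate = false)
println(s"Cities without an IATA code: ${citiesDF.count}")
citiesDF
  .filter(c => c.latitude.isDefined && c.longitude.isDefined)
  .map(_.name)
  .show(5)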
// Parsing JSON to a Map using a json4s CustomSerializer
// == JSON format ==
//
// {
//   "AAA": {
//     "latitude": 1.234567,
//     "longitude": -1.234567
//   },
//   ...
// }

import java.nio.file.{Files, Paths}

import org.json4s.JsonAST.JObject
import org.json4s.JsonDSL._
import org.json4s.native.JsonMethods.parse
import org.json4s.{CustomSerializer, DefaultFormats}

case class Location(longitude: Double, latitude: Double)

// Converts a JObject of the shape above into a Location, and back again
class LocationSerializer extends CustomSerializer[Location](format => ({
  case obj: JObject =>
    val longitude = (obj \ "longitude").values.asInstanceOf[Double]
    val latitude = (obj \ "latitude").values.asInstanceOf[Double]
    Location(longitude, latitude)
}, {
  case obj: Location =>
    ("longitude" -> obj.longitude) ~
    ("latitude" -> obj.latitude)
}))

implicit val formats = DefaultFormats + new LocationSerializer()

val jsonString = new String(Files.readAllBytes(Paths.get(s"${Settings.trends.rootDataFolder}/locations_cache.json")))
parse(jsonString).extract[Map[String, Location]]
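// Hypothetical usage (not part of the original gist): the snippet above discards the
// extracted Map; a caller would typically bind it and look entries up by IATA code.
val locations: Map[String, Location] = parse(jsonString).extract[Map[String, Location]]
locations.get("AAA").foreach { loc =>
  println(s"AAA -> lon=${loc.longitude}, lat=${loc.latitude}")
}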
// Parse JSON using Spark DataFrames
// == JSON format (one object per line) ==
//
// { "iata": "AAA", "location": { "lon": 123.456, "lat": -123.456 }}
// { "iata": "BBB", "location": { "lon": 123.456, "lat": -123.456 }}
// ...

import org.apache.spark.sql.{Row, SparkSession}

case class Location(longitude: Double, latitude: Double)
case class IataLocation(iata: String, location: Location)

val spark = SparkSession
  .builder
  .appName("Spark Test")
  .master("local[*]")
  .getOrCreate

import spark.implicits._

val locationCache = spark
  .read
  .json(s"file://${Settings.trends.rootDataFolder}/locations-cache.json")
  .map { item =>
    // Nested JSON objects come back as Rows; pull out the coordinates by field name
    val location = item.getAs[Row]("location")
    IataLocation(
      iata = item.getAs[String]("iata"),
      location = Location(
        longitude = location.getAs[Double]("lon"),
        latitude = location.getAs[Double]("lat")
      )
    )
  }
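// Hypothetical usage (not part of the original gist): materialise the Dataset as a local
// lookup table keyed by IATA code. Reasonable for a small cache file; collect() pulls
// every row onto the driver.
val byIata: Map[String, Location] = locationCache
  .collect()
  .map(entry => entry.iata -> entry.location)
  .toMap
println(byIata.get("AAA"))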
// A typical standard machine learning workflow is as follows:
// * Loading data (aka data ingestion)
// * Extracting features (aka feature extraction)
// * Training the model (aka model training)
// * Evaluating (making predictions)

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.{StandardScaler, VectorAssembler, VectorIndexer}
import org.apache.spark.ml.regression.RandomForestRegressor

// ETL
val loadedDataframe = ... // get data from source(s)
val labeledPoints = ... // pack the data into a Dataset (typed) case class with label and features fields

// Split the data into training and test sets (30% held out for testing).
val Array(train, test) = labeledPoints.randomSplit(Array(.7, .3), 10204L)
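// The pipeline below references a `vectorAssembler` stage that is not defined in the
// original snippet. A minimal sketch of what it might look like, assuming hypothetical
// raw numeric columns "f1", "f2" and "f3" that get packed into the "features" vector.
// (If labeledPoints already carries a prebuilt "features" column, as the comment above
// suggests, this stage would be redundant and could be dropped from the stages list.)
val vectorAssembler = new VectorAssembler()
  .setInputCols(Array("f1", "f2", "f3"))
  .setOutputCol("features")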
// Scale and normalize features based on their individual variances
val featureScaler = new StandardScaler()
  .setInputCol("features")
  .setOutputCol("scaledFeatures")
  .setWithStd(true)
  .setWithMean(false)

// Automatically identify categorical features, and index them.
val featureIndexer = new VectorIndexer()
  .setInputCol("scaledFeatures")
  .setOutputCol("indexedFeatures")

// Train a RandomForest model.
val rf = new RandomForestRegressor()
  .setLabelCol("label")
  .setFeaturesCol("indexedFeatures")

val pipeline = new Pipeline("MyModel")
  .setStages(Array(vectorAssembler, featureScaler, featureIndexer, rf))

// Train the model. This runs the input data through the pipeline, ending with the RandomForestRegressor.
val randomForestModel = pipeline.fit(train)

// Make predictions.
val predictions = randomForestModel.transform(test)

// Print the first 10 labels and predictions
predictions.select("prediction", "label").show(10)

// Evaluate the pipeline's prediction accuracy
val evaluator = new RegressionEvaluator
val r2 = evaluator.setMetricName("r2").evaluate(predictions)
val rmse = evaluator.setMetricName("rmse").evaluate(predictions)
val mse = evaluator.setMetricName("mse").evaluate(predictions)
val mae = evaluator.setMetricName("mae").evaluate(predictions)

println(s"Mean Squared Error: ${mse}")
println(s"Root Mean Squared Error: ${rmse}")
println(s"Coefficient of Determination (R-squared): ${r2}")
println(s"Mean Absolute Error: ${mae}")
println(s"Explained params: ${evaluator.explainParams}")

// Serialize the fitted pipeline model to disk (the fitted PipelineModel, not the unfitted
// Pipeline, so that PipelineModel.load below can read it back).
randomForestModel.write.overwrite.save(s"${somePath}/MyPipeline")

// Deserialize the pipeline model
val deserializedPipeline = PipelineModel.load(s"${somePath}/MyPipeline")

// As with the predictions above, the loaded pipeline can now be used to score new data.
val reloadedPredictions = deserializedPipeline.transform(test)
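// Hypothetical follow-up (not in the original gist): re-scoring the held-out set with the
// reloaded pipeline should reproduce the metrics computed above, a cheap check that
// serialization round-tripped correctly.
val reloadedRmse = evaluator.setMetricName("rmse").evaluate(reloadedPredictions)
println(s"RMSE from the reloaded pipeline: ${reloadedRmse} (should match ${rmse})")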