SparkML Linear Regression Script with Cross-Validation and Parameter Sweep
########################################
## Title: Spark MLlib Linear Regression Script, with Cross-Validation and Parameter Sweep
## Language: PySpark
## Author: Colby T. Ford, Ph.D.
########################################

from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# Create initial LinearRegression model
lr = LinearRegression(labelCol="label", featuresCol="features")

# Create ParamGrid for cross-validation (commented-out grids are smaller, faster alternatives)
lrparamGrid = (ParamGridBuilder()
               .addGrid(lr.regParam, [0.001, 0.01, 0.1, 0.5, 1.0, 2.0])
               # .addGrid(lr.regParam, [0.01, 0.1, 0.5])
               .addGrid(lr.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])
               # .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
               .addGrid(lr.maxIter, [1, 5, 10, 20, 50])
               # .addGrid(lr.maxIter, [1, 5, 10])
               .build())

# Evaluate models by RMSE
lrevaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="rmse")

# Create 5-fold CrossValidator
lrcv = CrossValidator(estimator=lr,
                      estimatorParamMaps=lrparamGrid,
                      evaluator=lrevaluator,
                      numFolds=5)

# Run cross-validation on the training DataFrame (expects "features" and "label" columns)
lrcvModel = lrcv.fit(train)
print(lrcvModel)

# Get summary statistics from the best model found by cross-validation
lrcvSummary = lrcvModel.bestModel.summary
print("Coefficient Standard Errors: " + str(lrcvSummary.coefficientStandardErrors))
print("P Values: " + str(lrcvSummary.pValues))  # Last element is the intercept

# Use the test set so we can measure the accuracy of the model on new data;
# transform() applies the best model found during cross-validation
lrpredictions = lrcvModel.transform(test)

# Evaluate the best model
print('RMSE:', lrevaluator.evaluate(lrpredictions))
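
The script assumes that train and test already exist as Spark DataFrames with a "features" vector column and a "label" column. A minimal preparation sketch, assuming a hypothetical source DataFrame df with numeric predictor columns x1 and x2 and a target column y (all names illustrative, not from the original script):

from pyspark.ml.feature import VectorAssembler

# Hypothetical raw DataFrame `df`: assemble predictors into a single vector column
assembler = VectorAssembler(inputCols=["x1", "x2"], outputCol="features")
data = (assembler.transform(df)
        .withColumnRenamed("y", "label")
        .select("features", "label"))

# 80/20 train/test split with a fixed seed for reproducibility
train, test = data.randomSplit([0.8, 0.2], seed=42)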
Note from the comments: lrcvModel.bestModel.summary works as written above because LinearRegression is passed to CrossValidator directly. If the estimator is instead a Pipeline, bestModel is a PipelineModel, and the summary must be read from its final stage:
lrcvSummary = lrcvModel.bestModel.stages[-1].summary
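
For context, a sketch of that Pipeline variant under the same hypothetical column names (raw_train is an assumed DataFrame with columns x1, x2, and label; the assembler stage is illustrative):

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler

# Wrap feature assembly and the regression into a single Pipeline estimator
assembler = VectorAssembler(inputCols=["x1", "x2"], outputCol="features")
pipeline = Pipeline(stages=[assembler, lr])

lrcv = CrossValidator(estimator=pipeline,
                      estimatorParamMaps=lrparamGrid,
                      evaluator=lrevaluator,
                      numFolds=5)
lrcvModel = lrcv.fit(raw_train)

# bestModel is now a PipelineModel, so the fitted LinearRegressionModel
# (and its training summary) lives in the last stage
lrcvSummary = lrcvModel.bestModel.stages[-1].summary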