This demonstrates Spark Job, Stage and Task listeners
1) Start spark-shell
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\
      /_/
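2) Register a listener and run an action (a minimal sketch; DemoListener and the printed fields are illustrative):

import org.apache.spark.scheduler._

class DemoListener extends SparkListener {
  override def onJobStart(jobStart: SparkListenerJobStart): Unit =
    println(s"Job ${jobStart.jobId} started with ${jobStart.stageInfos.size} stages")
  override def onStageCompleted(stage: SparkListenerStageCompleted): Unit =
    println(s"Stage ${stage.stageInfo.stageId} completed (${stage.stageInfo.numTasks} tasks)")
  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit =
    println(s"Task ${taskEnd.taskInfo.taskId} ended: ${taskEnd.reason}")
  override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit =
    println(s"Job ${jobEnd.jobId} ended: ${jobEnd.jobResult}")
}

sc.addSparkListener(new DemoListener)
sc.parallelize(1 to 100).count()   // any action fires the events above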
// Chess: reading a PGN and driving Stockfish with python-chess
$ sudo apt install stockfish
$ sudo apt upgrade stockfish
$ pip install chess

import chess.pgn
import chess.engine

# Load the PGN file
pgn = open("./file.pgn")
game = chess.pgn.read_game(pgn)

# Start the UCI engine (the stockfish binary is on PATH after the apt install)
engine = chess.engine.SimpleEngine.popen_uci("stockfish")
// Spark window frames (rowsBetween / rangeBetween)
Both framing functions, rowsBetween and rangeBetween, take two arguments, the start and the end of the frame, which can be specified as follows:
- Window.unboundedPreceding, Window.unboundedFollowing: the entire window, from its first row to its last
- Window.unboundedPreceding, Window.currentRow: from the beginning of the window to the current row; this is the frame a cumulative sum uses
- numeric values: 0 means currentRow, but the meaning of other values differs between the framing functions, rowsBetween counts row offsets while rangeBetween offsets the ordering value.
With such a frame in a window spec w (defined in the sketch below), a cumulative sum is:
df.withColumn('activity_sum', sum('activity').over(w))
Frame diagram: https://miro.medium.com/max/1400/1*WYO-zRP1SlrzGqT4S_5Jvw.webp
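Example frame definition for the cumulative sum above (Scala API; user_id, date and activity are made-up column names):

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.sum

// cumulative sum per user, ordered by date
val w = Window
  .partitionBy("user_id")
  .orderBy("date")
  .rowsBetween(Window.unboundedPreceding, Window.currentRow)

val withSum = df.withColumn("activity_sum", sum("activity").over(w))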
// Hadoop (setup for Spark on Windows)
Download winutils.exe and hadoop.dll: https://github.com/kontext-tech/winutils
Put them inside the folder C:\hadoop\bin.
Set the environment variable HADOOP_HOME (and the JVM system property hadoop.home.dir) to C:\hadoop.
Add %HADOOP_HOME%\bin to the PATH.
Copy hadoop.dll to C:\Windows\System32 as well.
Make sure your JVM is 64-bit.
// Spark
Download Spark from https://spark.apache.org/downloads.html
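Quick smoke test in spark-shell (the output path is arbitrary); the write fails with a native IO error when winutils.exe is not picked up:

println(System.getenv("HADOOP_HOME"))   // expect C:\hadoop
spark.range(10).write.mode("overwrite").csv("C:/tmp/winutils-check")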
// In case of FetchFailedException or MetadataFetchFailedException (often seen alongside a BroadcastNestedLoopJoin), increase the memory overhead:
- spark.executor.memoryOverhead=1g
- spark.kubernetes.memoryOverheadFactor=0.2
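Both settings must be in place before executors launch, e.g. via spark-submit --conf or the session builder; a sketch assuming you build the session yourself:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("overhead-demo")   // illustrative name
  .config("spark.executor.memoryOverhead", "1g")
  .config("spark.kubernetes.memoryOverheadFactor", "0.2")
  .getOrCreate()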
// avoid skew in joins on Spark >= 3.0: adaptive query execution can split skewed partitions
spark.sql.adaptive.enabled=true
spark.sql.adaptive.skewJoin.enabled=true (named spark.sql.adaptive.optimizeSkewedJoin.enabled in early 3.0 previews)
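With a running session, the same flags can also be set at runtime:

spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")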
// When the join key is skewed, this helper fixes it manually by salting both sides:
import org.apache.spark.sql.{Column, DataFrame}

def saltedJoin(df: DataFrame, buildDf: DataFrame, joinExpression: Column, joinType: String, salt: Int): DataFrame = {
  import org.apache.spark.sql.functions._
  // replicate each build-side row `salt` times, tagged with 0..salt-1
  val tmpDf = buildDf.withColumn("slt_range", array(Range(0, salt).toList.map(lit): _*))
  val tableDf = tmpDf.withColumn("slt_ratio_s", explode(tmpDf("slt_range"))).drop("slt_range")
  // give each stream-side row a pseudo-random salt in 0..salt-1
  val streamDf = df.withColumn("slt_ratio", monotonically_increasing_id() % salt)
  val saltedExpr = streamDf("slt_ratio") === tableDf("slt_ratio_s") && joinExpression
  streamDf.join(tableDf, saltedExpr, joinType).drop("slt_ratio_s").drop("slt_ratio")
}
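Hypothetical usage, salting a users dimension 100 ways (eventsDf, usersDf and the key columns are made-up names):

val joined = saltedJoin(
  eventsDf,                               // large, skewed stream side
  usersDf,                                // smaller build side, replicated `salt` times
  eventsDf("user_id") === usersDf("id"),  // the original join condition
  "inner",
  100
)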
// Cucumber DataTable to Spark DataFrame
import scala.collection.convert.ImplicitConversions.`collection AsScalaIterable`
import io.cucumber.datatable.DataTable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col

// assumes a sparkSession is in scope
def dataTableToDataframe(table: DataTable): DataFrame = {
  import sparkSession.implicits._
  // the first row of the table holds the column names, the rest is data
  val columns: Seq[String] = table.cells().head.toSeq
  val data = table.cells().drop(1).toSeq.map(r => r.toList)
  data.toDF().select(columns.indices.map(i => col("value")(i).alias(columns(i))): _*)
}
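Hypothetical step definition using it (cucumber-scala DSL, in a class extending ScalaDsl with EN; the step text is made up):

Given("the following customers exist:") { (table: DataTable) =>
  val customers = dataTableToDataframe(table)
  customers.createOrReplaceTempView("customers")
}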
// styles.scss
@import '~@angular/material/prebuilt-themes/indigo-pink.css';
// HTML
<mat-table class="lessons-table mat-elevation-z8" [dataSource]="dataSource">
  <ng-container matColumnDef="id">
    <mat-header-cell *matHeaderCellDef>#</mat-header-cell>
    <mat-cell *matCellDef="let customer">{{customer.id}}</mat-cell>
  </ng-container>
  <!-- displayedColumns (e.g. ['id']) comes from the component -->
  <mat-header-row *matHeaderRowDef="displayedColumns"></mat-header-row>
  <mat-row *matRowDef="let row; columns: displayedColumns"></mat-row>
</mat-table>
// OpenCV HOG people detector on a screen grab
import numpy as np
from PIL import ImageGrab
import cv2

def draw_detections(img, rects, thickness=1):
    for x, y, w, h in rects:
        pad_w, pad_h = int(0.15 * w), int(0.05 * h)  # HOG boxes are padded, shrink them a bit
        cv2.rectangle(img, (x + pad_w, y + pad_h), (x + w - pad_w, y + h - pad_h), (0, 255, 0), thickness)

hog = cv2.HOGDescriptor()
hog.setSVMDetector(cv2.HOGDescriptor_getDefaultPeopleDetector())

# grab the screen and run the people detector on it
img = cv2.cvtColor(np.array(ImageGrab.grab()), cv2.COLOR_RGB2BGR)
found, _weights = hog.detectMultiScale(img, winStride=(8, 8))
draw_detections(img, found)
cv2.imshow('detections', img)
cv2.waitKey(0)
// sklearn preprocessing imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer  # Imputer was removed from sklearn.preprocessing in 0.22
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression

# Importing the dataset
dataset = pd.read_csv('train.csv')
df = dataset  # read_csv already returns a DataFrame; no pd.DataFrame() wrapper needed