Skip to content

Instantly share code, notes, and snippets.

@c0mpiler
Created March 22, 2023 19:34
import sys
from operator import add
from pyspark.sql import SparkSession
if __name__ == "__main__":
    """Word-count example: count word frequencies in a fixed sentence with Spark RDDs."""
    # Build (or reuse) a SparkSession; the RDD API is reached via its SparkContext.
    spark = SparkSession \
        .builder \
        .appName("PythonWordCount") \
        .getOrCreate()
    sc = spark.sparkContext

    # NOTE(review): the original used a backslash continuation *inside* the string
    # literal, which spliced "the" + "lazy" into the single token "thelazy".
    # Adjacent string literals with an explicit trailing space preserve the
    # intended word boundary.
    words = ('the quick brown fox jumps over the '
             'lazy dog the quick brown fox jumps over the lazy dog')
    seq = words.split()

    # Distribute the words, emit (word, 1) pairs, and sum counts per word.
    data = sc.parallelize(seq)
    counts = data.map(lambda word: (word, 1)) \
                 .reduceByKey(lambda a, b: a + b) \
                 .collect()

    # Fix: `dict(counts)` was a bare expression whose value was discarded when
    # run as a script (it only echoes in a REPL/notebook) — print the result.
    print(dict(counts))

    sc.stop()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment