@TRBaldim
Created January 20, 2017 19:51
Split an RDD into two or more RDDs based on keys or another column of the RDD
from pyspark import SparkContext

# Close in spirit to the randomSplit implementation:
# https://github.com/apache/spark/blob/master/python/pyspark/rdd.py#L429

# `sc` is provided automatically in a pyspark shell; create one otherwise.
sc = SparkContext(appName="split-rdd-by-key")
class SplitByKey:
    """Yields only the records of a partition whose key matches `key`."""
    def __init__(self, key):
        self.key = key
        self.split = None

    def func(self, split, iterator):
        # Remember which partition we are in, mirroring randomSplit's sampler.
        self.split = split
        for k in iterator:
            if k[0] == self.key:
                yield k
# Example data: pairs of (index, value ** 2), re-keyed by (value ** 2) % 10.
rdd = sc.parallelize(range(10000000), 4).zipWithIndex()\
        .map(lambda x: (x[1], x[0] ** 2))\
        .map(lambda x: (x[len(x) - 1] % 10, x[:len(x) - 1][0]))
# Collect the distinct keys present in the RDD.
keys = sorted(rdd.keys().distinct().collect())

# Build one lazily filtered RDD per key; each filter scans the full parent RDD.
result = [rdd.mapPartitionsWithIndex(SplitByKey(i).func, preservesPartitioning=True) for i in keys]
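
# A minimal usage sketch, assuming the code above has run in a live Spark
# session: every element of `result` holds exactly one key, so sampling a
# few records from each RDD confirms the split.
for key, split_rdd in zip(keys, result):
    print(key, split_rdd.take(3))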