@TRBaldim
Created January 20, 2017 19:51
Split an RDD into two or more RDDs based on keys or another column of the RDD
from pyspark import SparkContext

# Close in spirit to the randomSplit implementation:
# https://github.com/apache/spark/blob/master/python/pyspark/rdd.py#L429

# `sc` is provided automatically in a pyspark shell; create one otherwise.
sc = SparkContext(appName="split-rdd-by-key")
class SplitByKey:
    """Yields only the records of a partition whose key matches `key`."""
    def __init__(self, key):
        self.key = key
        self.split = None

    def func(self, split, iterator):
        # Remember which partition we are in, mirroring randomSplit's sampler.
        self.split = split
        for k in iterator:
            if k[0] == self.key:
                yield k
# Example data: pairs of (index, value ** 2), re-keyed by (value ** 2) % 10.
rdd = sc.parallelize(range(10000000), 4).zipWithIndex()\
        .map(lambda x: (x[1], x[0] ** 2))\
        .map(lambda x: (x[len(x) - 1] % 10, x[:len(x) - 1][0]))
# Collect the distinct keys present in the RDD.
keys = sorted(rdd.keys().distinct().collect())

# Build one lazily filtered RDD per key; each filter scans the full parent RDD.
result = [rdd.mapPartitionsWithIndex(SplitByKey(i).func, preservesPartitioning=True) for i in keys]
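
# A minimal usage sketch, assuming the code above has run in a live Spark
# session: every element of `result` holds exactly one key, so sampling a
# few records from each RDD confirms the split.
for key, split_rdd in zip(keys, result):
    print(key, split_rdd.take(3))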