Spark Streaming Kafka at-least-once with manual offset commits in ZooKeeper (i.e. not using Spark Streaming checkpoints, which may not be recoverable after code changes)
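At its core this is the standard direct-stream recipe: read the last committed offsets from ZooKeeper at startup, capture each batch's OffsetRanges on the driver inside transform, process the batch, and only then write the batch's end offsets back to ZooKeeper. The following is a minimal sketch of just that commit loop; stream stands for the direct Kafka DStream created as in the full code below, and commitToZk is a hypothetical stand-in for the gist's KafkaOffsetsManager.commitKafkaOffsets.

import org.apache.spark.streaming.kafka.{HasOffsetRanges, OffsetRange}

// Sketch only: stream is the direct Kafka DStream, commitToZk is a hypothetical helper.
var offsetRanges = Array[OffsetRange]()

stream.transform { rdd =>
  // transform runs on the driver, so the captured ranges are visible in foreachRDD below
  offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  rdd
}.foreachRDD { rdd =>
  // ... process the batch here ...
  // Commit only after processing succeeded: a crash before this point means the
  // batch is re-read on restart (duplicates are possible, but nothing is lost).
  commitToZk(offsetRanges.map(r => r.partition -> r.untilOffset).toMap)
}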
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import com.samsung.sami.common.Curator
import org.apache.curator.framework.CuratorFramework
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.{Logging, SparkConf}
import org.apache.zookeeper.KeeperException.NodeExistsException
import org.scalactic.{Bad, Good, One, Or}
import scala.util.control.NonFatal
object App extends Logging {

  def main(args: Array[String]): Unit = {
    try {
      log.info("Init Iris job")
      Curator.initialize()
      val conf = new Config(Curator.getInstance())
      val sparkConf = makeSparkConf(conf.getMaxRatePerPartition())
      val ssc = new StreamingContext(sparkConf, Seconds(conf.getSparkBatchIntervalInSecond()))

      val kafkaBrokerList = conf.getKafkaBrokerList()
        .getOrElse(throw new RuntimeException("Cannot get KafkaBrokerList from ZK"))
      val kafkaParams = Map(
        "metadata.broker.list" -> kafkaBrokerList
      )
      val topic = conf.getKinesisSubcriptionTopic()
        .getOrElse(throw new RuntimeException("Cannot get KinesisSubcriptionTopic from ZK"))

      // Make sure the offsets znode exists before the job starts consuming
      val kafkaOffsetManager = new KafkaOffsetsManager(Curator.getInstance())
      kafkaOffsetManager.init().badMap { error =>
        throw new RuntimeException(s"Error while initiating KafkaOffsetManager: $error")
      }

      // Resume from the offsets previously committed to ZooKeeper, or fall back
      // to the latest offsets if they cannot be read
      val directKafkaStream = kafkaOffsetManager.getKafkaOffsets() match {
        case Good(offsets) =>
          val offsetMap = offsets.map { case (partition, offset) =>
            TopicAndPartition(topic, partition) -> offset
          }
          log.info(s"Starting from following partition -> offset map: $offsetMap")
          KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
            ssc,
            kafkaParams,
            offsetMap,
            (msg: MessageAndMetadata[String, String]) => msg.key() -> msg.message()
          )
        case Bad(One(error)) =>
          log.warn(s"Error while trying to get initial Kafka offsets: $error. Falling back to consuming from the latest Kafka offsets.")
          KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
            ssc,
            kafkaParams,
            Set(topic)
          )
      }
      // Capture each batch's offset ranges on the driver, then commit them to
      // ZooKeeper only after the batch has been processed (at-least-once semantics)
      var offsetRanges = Array[OffsetRange]()

      directKafkaStream.transform { rdd =>
        offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        rdd
      }
      // do your job here...
      .foreachRDD { rdd =>
        // committing offsets
        val offsetMap = offsetRanges.map(offsetRange => offsetRange.partition -> offsetRange.untilOffset).toMap
        log.debug(s"Committing Kafka offsets $offsetMap")
        kafkaOffsetManager.commitKafkaOffsets(offsetMap) match {
          case Good(_) => log.debug(s"Successful commit of Kafka offsets $offsetMap")
          case Bad(error) => log.error(s"Error while committing Kafka offsets: $error")
        }
      }
      sys.addShutdownHook(shutdown(ssc))

      ssc.start()
      ssc.awaitTermination()
    } catch {
      case failure: Throwable =>
        log.error("Fatal unexpected error/failure in driver main", failure)
        throw failure
    }
  }
  def makeSparkConf(maybeMaxRatePerPartition: Option[Int]): SparkConf = {
    val conf = new SparkConf()
      .setAppName("Iris")
      .set("spark.streaming.backpressure.enabled", "true")
    maybeMaxRatePerPartition.map { maxRate =>
      conf.set("spark.streaming.kafka.maxRatePerPartition", maxRate.toString)
    }.getOrElse(conf)
  }

  def shutdown(ssc: StreamingContext): Unit = {
    log.info("Going to gracefully shutdown job...")
    ssc.stop(stopSparkContext = true, stopGracefully = true)
  }
}

class KafkaOffsetsManager(zkClient: CuratorFramework) {
  import KafkaOffset._

  val zkPath = "/SAMI/iris/kafka/topic-subscription-offsets"

  // NB: retries are handled at the Curator level

  def init(): Unit Or One[ZKError] = {
    try {
      zkClient.create()
        .creatingParentsIfNeeded()
        .forPath(zkPath)
      Good(unit)
    } catch {
      case _: NodeExistsException => Good(unit)
      case NonFatal(err) => Bad(One(ZKError(err)))
    }
  }

  def commitKafkaOffsets(offsets: PartitionsOffset): Unit Or One[ZKError] = {
    try {
      zkClient.setData().forPath(zkPath, serialize(offsets))
      Good(unit)
    } catch {
      case NonFatal(err) => Bad(One(ZKError(err)))
    }
  }

  def getKafkaOffsets(): PartitionsOffset Or One[IrisError] = {
    try {
      val bytes = zkClient.getData().forPath(zkPath)
      deserialize(bytes)
    } catch {
      case NonFatal(err) => Bad(One(ZKError(err)))
    }
  }
}

object KafkaOffset {
  import play.api.libs.json._

  type PartitionsOffset = Map[Int, Long]

  implicit val reads: Reads[PartitionsOffset] =
    JsPath.read[Map[String, Long]].map(offsets => offsets.map(kv => kv._1.toInt -> kv._2))

  implicit val writes: Writes[PartitionsOffset] =
    Writes(offsets => Json.toJson(offsets.map(kv => kv._1.toString -> kv._2)))

  def serialize(offsets: PartitionsOffset): Array[Byte] = {
    Json.toJson(offsets).toString.getBytes("UTF-8")
  }

  def deserialize(bytes: Array[Byte]): PartitionsOffset Or One[DeserializationError] = {
    ErrorHelper.parseJsonBytes[PartitionsOffset](bytes)
  }
}
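
For reference, the znode written by commitKafkaOffsets holds a small JSON object mapping partition numbers (as string keys) to the untilOffset of the last committed batch, e.g. {"0":42,"1":17}, which is exactly the offset the job resumes from. Below is a hedged sketch of inspecting that znode outside the job with a plain Curator client and play-json; the object name, connection string and retry policy are placeholders, not part of the original gist.

import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import play.api.libs.json.Json

object InspectOffsets {
  def main(args: Array[String]): Unit = {
    // Placeholder connection string: point it at the ZK ensemble the job uses
    val client = CuratorFrameworkFactory.newClient("localhost:2181", new ExponentialBackoffRetry(1000, 3))
    client.start()
    try {
      val bytes = client.getData().forPath("/SAMI/iris/kafka/topic-subscription-offsets")
      // Same wire format as KafkaOffset.serialize: partition -> next offset to consume
      val offsets = Json.parse(new String(bytes, "UTF-8")).as[Map[String, Long]]
      println(offsets)
    } finally {
      client.close()
    }
  }
}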