Spark job to read gzip files, ignoring corrupted files
import java.io._
import scala.io._
import java.util.zip._

// Spark
import org.slf4j.Logger
import org.apache.spark.{ SparkConf, SparkContext, Logging }

// Hadoop
import org.apache.hadoop.io.compress.GzipCodec

object FilterBadGzipFiles extends Logging {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
    val sc = new SparkContext(sparkConf)

    // Read each input file whole as binary, so a corrupt gzip stream only
    // affects that one file instead of failing the job.
    val files = sc.binaryFiles(args(0))

    val lines =
      files.flatMap {
        case (path, stream) =>
          try {
            // Decompress only files that look gzipped; read everything else as-is.
            val is =
              if (path.toLowerCase.endsWith(".gz"))
                new GZIPInputStream(stream.open)
              else
                stream.open
            try {
              Source.fromInputStream(is).getLines.toList
            } finally {
              try { is.close } catch { case _: Throwable => }
            }
          } catch {
            case e: Throwable =>
              // Log and skip unreadable files instead of aborting the whole job.
              log.warn(s"error reading from ${path}: ${e.getMessage}", e)
              List.empty[String]
          }
      }

    lines.saveAsTextFile(args(1), classOf[GzipCodec])
  }
}
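The job silently drops any file it cannot read. If all you need is a count of how many files were skipped, an accumulator is one lightweight way to surface that on the driver. The sketch below is not part of the original gist; it uses the Spark 1.x accumulator API to match the Logging import above, and shortens the stream handling (it assumes gzipped inputs).

    // Sketch only: count skipped files so the driver can report them at the end.
    val corruptCount = sc.accumulator(0L, "corrupt gzip files")

    val countedLines = files.flatMap {
      case (path, stream) =>
        try {
          // Same gzip handling as in the gist above, shortened for the sketch.
          Source.fromInputStream(new GZIPInputStream(stream.open)).getLines.toList
        } catch {
          case e: Throwable =>
            corruptCount += 1L                 // remember that a file was skipped
            log.warn(s"skipping unreadable file ${path}", e)
            List.empty[String]
        }
    }

    countedLines.saveAsTextFile(args(1), classOf[GzipCodec])  // action materializes the count
    log.warn(s"skipped ${corruptCount.value} unreadable input files")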
Hi,
That option works, but is there any way to know which files are corrupted, rather than just ignoring them?
Thanks,
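Not part of the gist, but one way to record which files were corrupted, rather than only skipping them, is to have the flatMap emit an Either and split the result afterwards. A rough sketch building on the same code; the "-corrupt-files" output suffix is made up for illustration:

    // Sketch only: Left(path) marks an unreadable file, Right(line) a good record.
    val results = files.flatMap {
      case (path, stream) =>
        try {
          val is =
            if (path.toLowerCase.endsWith(".gz")) new GZIPInputStream(stream.open)
            else stream.open
          try {
            Source.fromInputStream(is).getLines.toList
              .map(line => Right(line): Either[String, String])
          } finally {
            try { is.close } catch { case _: Throwable => }
          }
        } catch {
          case _: Throwable =>
            List(Left(path): Either[String, String])
        }
    }
    results.cache()  // reused by the two collects below

    val corruptFiles = results.collect { case Left(p) => p }   // paths of bad files
    val goodLines    = results.collect { case Right(l) => l }

    corruptFiles.saveAsTextFile(args(1) + "-corrupt-files")    // hypothetical path
    goodLines.saveAsTextFile(args(1), classOf[GzipCodec])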