How to deserialize a Hadoop result sequence file outside Hadoop (or a Spark saveAsObjectFile outside Spark) without having the class declaration
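For context: saveAsObjectFile writes a Hadoop SequenceFile with NullWritable keys and BytesWritable values, where each value holds a Java-serialized array of up to 10 records; this is why the reader below casts each value to Array[Person]. A sketch paraphrased from the Spark source (the batch size of 10 is Spark's internal choice, and serialize stands in for Spark's private Utils.serialize):

// What rdd.saveAsObjectFile(path) does, roughly: group records into batches
// of 10, Java-serialize each batch as an Array, and store it as the
// BytesWritable value of a SequenceFile record.
rdd.mapPartitions(iter => iter.grouped(10).map(_.toArray))
  .map(x => (NullWritable.get(), new BytesWritable(serialize(x))))
  .saveAsSequenceFile(path)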
// resolvers += "dportabella-3rd-party-mvn-repo-releases" at "https://github.com/dportabella/3rd-party-mvn-repo/raw/master/releases/"
// libraryDependencies += "org.apache.hadoop" % "hadoop-common" % "2.7.3"
// libraryDependencies += "com.github.dportabella.3rd-party-mvn-repo" % "jdeserialize" % "1.0.0"

import java.io._

import org.apache.hadoop.conf._
import org.apache.hadoop.fs._
import org.apache.hadoop.io._
import org.unsynchronized.jdeserialize

// Deserializes a value when the class (here, Person) is on the classpath.
def deserializeWithClassDeclaration(data: Array[Byte]): Unit = {
  val values = new ObjectInputStream(new ByteArrayInputStream(data)).readObject().asInstanceOf[Array[Person]] // put your class here, or _
  values.foreach(println)
}

// Tries to dump the serialized object graph with jdeserialize, without the class declaration.
def deserializeWithoutClassDeclaration(data: Array[Byte]): Unit = {
  val fis = new ByteArrayInputStream(data)
  val jd = new jdeserialize(null)
  // jd.debugEnabled = true
  jd.run(fis, true)
  jd.dump(null)
}

val f = "/tmp/persons.rdd/part-00001"
val reader = new SequenceFile.Reader(new Configuration(), SequenceFile.Reader.file(new Path(f)))
try {
  val key = NullWritable.get
  val value = new BytesWritable
  while (reader.next(key, value)) {
    // getBytes returns the full backing buffer, which may be longer than the
    // record; trim to getLength so no stale trailing bytes are passed along.
    val data = java.util.Arrays.copyOfRange(value.getBytes, 0, value.getLength)
    deserializeWithClassDeclaration(data)    // this works
    deserializeWithoutClassDeclaration(data) // this fails
  }
} finally reader.close()
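Before reaching for jdeserialize, it can help to confirm that each value really is a plain Java serialization stream. A minimal sketch (the helper name is mine) that checks for the standard 0xACED/0x0005 header:

import java.io.ObjectStreamConstants

// Returns true if data starts with the Java serialization header:
// STREAM_MAGIC (0xACED) followed by STREAM_VERSION (5).
def looksLikeJavaSerialization(data: Array[Byte]): Boolean =
  data.length >= 4 &&
    (((data(0) & 0xFF) << 8) | (data(1) & 0xFF)) == (ObjectStreamConstants.STREAM_MAGIC & 0xFFFF) &&
    (((data(2) & 0xFF) << 8) | (data(3) & 0xFF)) == (ObjectStreamConstants.STREAM_VERSION & 0xFFFF)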
/*
Unfortunately, jdeserialize fails to deserialize the object (my /path/to/part-00000 file holds a list of Person instances) with this output:
read: [array 0x7e0001 classdesc [cd 0x7e0000: name [LPerson; uid 8257594952091008868]: [arraycoll sz 2 Person _h0x7e0007 = r_0x7e0002; , Person _h0x7e00e4 = r_0x7e0002; ]
I tried many forks of jdeserialize, including:
https://github.com/unsynchronized/jdeserialize
https://gist.github.com/dportabella/3dbd22333012682210b6d6ee2e50118d
Any idea?
Example to create a test input file: Test.scala

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

case class Person(name: String, age: Integer)

object Test extends App {
  val sparkConf = new SparkConf().setAppName("test").setMaster("local[1]")
  val sc = new SparkContext(sparkConf)

  val personList = List(Person("John", 30), Person("Maria", 20))
  val rdd: RDD[Person] = sc.parallelize(personList)
  rdd.saveAsObjectFile("/tmp/persons.rdd")

  sc.stop()
}
*/
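For comparison, reading the file back inside Spark is a one-liner, since objectFile reverses saveAsObjectFile. A minimal sketch, assuming the Person case class from Test.scala above:

import org.apache.spark.{SparkConf, SparkContext}

object ReadBack extends App {
  val sc = new SparkContext(new SparkConf().setAppName("read-back").setMaster("local[1]"))

  // objectFile reads the SequenceFile and Java-deserializes each
  // BytesWritable batch back into Person instances.
  sc.objectFile[Person]("/tmp/persons.rdd").collect().foreach(println)

  sc.stop()
}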