Apache Gobblin job to pull CSVs from S3 storage and write them as AVRO files
# ====================================================================
# PullCsvFromS3
# Pull CSV data from an S3 directory to the local filesystem
# ====================================================================
job.name=PullCsvFromS3
job.description=Pull CSV data from an S3 directory to the local filesystem and write it as AVRO files
fs.uri=file:///
# Set working directory
work.dir=/Users/tilak/gobblin/mopar-demo
writer.staging.dir=${work.dir}/taskStaging
writer.output.dir=${work.dir}/taskOutput
mr.job.root.dir=${work.dir}/working
# Set state store
state.store.enabled=true
state.store.type=mysql
state.store.db.jdbc.driver=com.mysql.jdbc.Driver
state.store.db.url=jdbc:mysql://localhost/mopar_demo
state.store.db.user=gobblin
state.store.db.password=gobblin
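# The MySQL state store assumes the mopar_demo database and the gobblin user already
# exist. They could be created beforehand with something like (hypothetical values,
# adjust to your environment):
#   CREATE DATABASE mopar_demo;
#   CREATE USER 'gobblin'@'localhost' IDENTIFIED BY 'gobblin';
#   GRANT ALL PRIVILEGES ON mopar_demo.* TO 'gobblin'@'localhost';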
# Set writer and publisher
writer.fs.uri=file:///
data.publisher.final.dir=${work.dir}/output
writer.destination.type=HDFS
writer.output.format=AVRO
writer.builder.class=org.apache.gobblin.writer.AvroDataWriterBuilder
data.publisher.fs.uri=${fs.uri}
data.publisher.type=org.apache.gobblin.publisher.BaseDataPublisher
data.publisher.metadata.output.dir=${work.dir}/metadata_out
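# Task output is written under ${work.dir}/taskStaging and ${work.dir}/taskOutput while
# the job runs; on success the publisher moves the finished AVRO files to ${work.dir}/output.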
# Source Configuration
source.class=org.apache.gobblin.data.management.copy.CopySource
gobblin.dataset.profile.class=org.apache.gobblin.data.management.copy.CopyableGlobDatasetFinder
gobblin.dataset.pattern=pricing.products_*.csv
# To copy from a particular directory, set e.g. gobblin.dataset.pattern=some_folder/*.csv
gobblin.copy.recursive.update=true
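# With the glob above, keys such as pricing.products_20180324.csv or
# pricing.products_full.csv (hypothetical names) in the bucket root would be picked up;
# keys with other prefixes would not match.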
# Source S3 Configuration
source.filebased.fs.uri=s3a://<bucket-name>
source.filebased.preserve.file.name=true
source.filebased.encrypted.fs.s3a.access.key=<s3-access-key>
source.filebased.encrypted.fs.s3a.secret.key=<s3-secret-key>
fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
fs.s3a.buffer.dir=${work.dir}/buffer-dir
fs.s3a.connection.ssl.enabled=false
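# <bucket-name>, <s3-access-key> and <s3-secret-key> above are placeholders; replace
# them with your own bucket name and credentials before running the job.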
# Converters
source.schema={"namespace":"pricing", "type":"record", "name":"Products", "fields":[ { "name": "id", "type":"string" }, { "name": "description", "type":"string" }, { "name": "additional_description", "type":"string" }]}
csv.has.headers=true
converter.classes=org.apache.gobblin.data.management.copy.converter.ReadBatchedCSVConverter
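# A CSV matching the schema above would look roughly like this (hypothetical rows):
#   id,description,additional_description
#   101,Front brake pad,Fits 2015-2018 models
#   102,Oil filter,Standard thread
# Because csv.has.headers=true the header row is skipped, and each remaining row is
# converted into a Products AVRO record with id, description and additional_description.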
# ====================================================================
# Distcp configurations (do not change)
# ====================================================================
job.class=org.apache.gobblin.azkaban.AzkabanJobLauncher
extract.namespace=org.apache.gobblin.copy
distcp.persist.dir=/tmp/distcp-persist-dir
task.maxretries=0
workunit.retry.enabled=false
# Job History server
job.history.store.enabled=true
job.history.store.url=jdbc:mysql://localhost/mopar_demo
job.history.store.jdbc.driver=com.mysql.jdbc.Driver
job.history.store.user=gobblin
job.history.store.password=gobblin
# Other s3a settings
# Must be greater than 5 MB, otherwise distcp will not work; 67108864 bytes = 64 MB
fs.s3a.multipart.size=67108864