Created
March 25, 2018 02:30
-
-
Save tilakpatidar/2b6b73190dc8fdfbba51b26ead96f647 to your computer and use it in GitHub Desktop.
Apache Gobblin job to ingest CSV files from S3 buckets into a MySQL table.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ====================================================================
# PullCsvFromS3
# Pull CSV data from an S3 directory into MySQL
# ====================================================================
job.name=PullCsvFromS3
job.description=Pull CSV data from a directory S3 to MySQL
# Default file system URI for the job: the local file system.
fs.uri=file:///
# Set working directory
# NOTE: absolute path on the original author's machine; change it for your
# environment. The three directories below are all derived from it.
work.dir=/Users/tilak/gobblin/mopar-demo
writer.staging.dir=${work.dir}/taskStaging
writer.output.dir=${work.dir}/taskOutput
mr.job.root.dir=${work.dir}/working
# Set state store
# Job state is kept in the MySQL database "mopar_demo" on localhost.
state.store.enabled=true
state.store.type=mysql
state.store.db.jdbc.driver=com.mysql.jdbc.Driver
state.store.db.url=jdbc:mysql://localhost/mopar_demo
# NOTE(review): credentials are stored in plain text; acceptable for a local
# demo only. Replace them before using this file anywhere shared.
state.store.db.user=gobblin
state.store.db.password=gobblin
# Set writer and publisher
# Records are written through JDBC and published to the MySQL table
# "products" in the database "mopar_demo".
writer.fs.uri=file:///
data.publisher.final.dir=${work.dir}/output
writer.destination.type=MYSQL
writer.jdbc.batch_size=1000
data.publisher.fs.uri=${fs.uri}
writer.builder.class=org.apache.gobblin.writer.JdbcWriterBuilder
data.publisher.metadata.output.dir=${work.dir}/metadata_out
data.publisher.type=org.apache.gobblin.publisher.JdbcPublisher
jdbc.publisher.driver=com.mysql.jdbc.Driver
jdbc.publisher.database_name=mopar_demo
# NOTE(review): credentials are stored in plain text; acceptable for a local
# demo only. Replace them before using this file anywhere shared.
jdbc.publisher.username=gobblin
jdbc.publisher.password=gobblin
jdbc.publisher.table_name=products
jdbc.publisher.url=jdbc:mysql://localhost:3306/mopar_demo
# Source Configuration
source.class=org.apache.gobblin.data.management.copy.CopySource
gobblin.dataset.profile.class=org.apache.gobblin.data.management.copy.CopyableGlobDatasetFinder
# Glob selecting the files to ingest.
gobblin.dataset.pattern=pricing.products_*.csv
# To copy from a particular directory instead, use e.g.:
#   gobblin.dataset.pattern=some_folder/*.csv
gobblin.copy.recursive.update=true
# Source S3 Configuration
# Replace <bucket-name>, <access-key> and <secret-key> with real values.
source.filebased.fs.uri=s3a://<bucket-name>
source.filebased.preserve.file.name=true
# NOTE(review): the "encrypted" key prefix suggests these values are expected
# in encrypted form rather than as raw keys - confirm against the Gobblin
# docs. Never commit real keys to this file.
source.filebased.encrypted.fs.s3a.access.key=<access-key>
source.filebased.encrypted.fs.s3a.secret.key=<secret-key>
fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
fs.s3a.buffer.dir=${work.dir}/buffer-dir
# NOTE(review): SSL to S3 is disabled here; confirm this is intentional
# before running outside a demo environment.
fs.s3a.connection.ssl.enabled=false
# Converters
# Avro schema of the incoming records: four string fields
# (id, description, additional_description, sha).
source.schema={"namespace":"pricing", "type":"record", "name":"Products", "fields":[ { "name": "id", "type":"string" }, { "name": "description", "type":"string" }, { "name": "additional_description", "type":"string" }, { "name": "sha", "type":"string" }]}
# Field-name pairs for the Avro-to-JDBC conversion; every field maps to
# itself.
converter.avro.jdbc.entry_fields_pairs={"id":"id", "description":"description", "additional_description":"additional_description", "sha": "sha"}
csv.has.headers=true
# Input files are semicolon-separated.
# NOTE(review): the key is spelled "delimeter"; it is left unchanged because
# the converter that reads it is not visible here - confirm the expected name.
csv.column.delimeter=;
csv.split.size=1000
avro.primary.key=id
# Converter chain, applied in the order listed.
converter.classes=org.apache.gobblin.data.management.copy.converter.ReadBatchedCSVConverter,org.apache.gobblin.data.management.copy.converter.BatchToRecordIterableConverter,org.apache.gobblin.converter.jdbc.AvroToJdbcEntryConverter
# ====================================================================
# Distcp configurations (do not change)
# ====================================================================
job.class=org.apache.gobblin.azkaban.AzkabanJobLauncher
extract.namespace=org.apache.gobblin.copy
distcp.persist.dir=/tmp/distcp-persist-dir
# Retries are turned off for both tasks and work units.
task.maxretries=0
workunit.retry.enabled=false
# Job History server
# Job history is stored in the same MySQL database as the state store
# ("mopar_demo" on localhost).
job.history.store.enabled=true
job.history.store.url=jdbc:mysql://localhost/mopar_demo
job.history.store.jdbc.driver=com.mysql.jdbc.Driver
# NOTE(review): credentials are stored in plain text; acceptable for a local
# demo only. Replace them before using this file anywhere shared.
job.history.store.user=gobblin
job.history.store.password=gobblin
# Other s3a settings
# Must be greater than 5 MB, otherwise distcp will not work.
# 67108864 bytes = 64 MiB.
fs.s3a.multipart.size=67108864
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment