# Create a saved Sqoop job that imports via a free-form query; single quotes keep $CONDITIONS from being expanded by the shell
sqoop job --create test_job1 -- import --connect "jdbc:sqlserver://192.168.1.206;database=wells-fargo" --username sa --password Hadoop123 --driver com.microsoft.sqlserver.jdbc.SQLServerDriver --query 'select * from dbo.tVulnMgmt_DeltaRiskAGGHist where $CONDITIONS' --target-dir "/tmp/dbo-delta2" -m 1
# List saved jobs and execute the one created above
sqoop job --list
sqoop job --exec test_job1
# One-off import of the same query; $CONDITIONS is escaped because the query is in double quotes
sqoop import --connect "jdbc:sqlserver://192.168.1.206;database=wells-fargo" --username sa --password Hadoop123 --driver com.microsoft.sqlserver.jdbc.SQLServerDriver --query "select * from dbo.tVulnMgmt_DeltaRiskAGGHist WHERE \$CONDITIONS" --target-dir "/tmp/dbo-delta" --split-by "Date"
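A quick way to sanity-check either import is to list the target directory and sample a part file; the part file name below assumes the default single-mapper output layout and is not taken from the gist.
# Hypothetical check of the imported data (part file name assumes the -m 1 layout)
hdfs dfs -ls /tmp/dbo-delta2
hdfs dfs -cat /tmp/dbo-delta2/part-m-00000 | head -n 5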
# Smoke-test Spark on YARN with the bundled SparkPi example (HDP 2.6.1 spark-examples jar)
spark-submit --class org.apache.spark.examples.SparkPi --deploy-mode client --master yarn spark-examples_2.11-2.1.1.2.6.1.0-129.jar
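In client mode the Pi estimate is printed to the submitting console; if the job were run in cluster mode instead, the driver output would have to be pulled from the YARN logs. The application id is a placeholder in the style of the gist's other placeholders.
# Pull logs for a finished YARN application
yarn logs -applicationId <application_id>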
# Recommended Hive tuning properties (set in hive-site.xml or per session)
hive.optimize.reducededuplication.min.reducer=4
hive.optimize.reducededuplication=true
hive.merge.mapfiles=true
hive.merge.mapredfiles=false
hive.merge.smallfiles.avgsize=16000000
hive.merge.size.per.task=256000000
hive.merge.sparkfiles=true
hive.auto.convert.join=true
hive.auto.convert.join.noconditionaltask=true
hive.auto.convert.join.noconditionaltask.size=20971520 (20 MB, in bytes; might need to increase for Spark, e.g. 200 MB)
hive.optimize.bucketmapjoin.sortedmerge=false
hive.map.aggr.hash.percentmemory=0.5
hive.map.aggr=true
hive.optimize.sort.dynamic.partition=false
hive.stats.autogather=true
hive.stats.fetch.column.stats=true
hive.compute.query.using.stats=true
hive.limit.pushdown.memory.usage=0.4 (MR and Spark)
hive.optimize.index.filter=true
hive.exec.reducers.bytes.per.reducer=67108864
hive.smbjoin.cache.rows=10000
hive.fetch.task.conversion=more
hive.fetch.task.conversion.threshold=1073741824
hive.optimize.ppd=true
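A minimal sketch of overriding a couple of these values for a single Hive CLI session, assuming the wells_poc.oozie_test2 table created later in this gist already exists:
# Per-session overrides via --hiveconf; the query is just an example
hive --hiveconf hive.auto.convert.join=true \
     --hiveconf hive.exec.reducers.bytes.per.reducer=67108864 \
     -e "select count(*) from wells_poc.oozie_test2"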
# test.properties for the Oozie Hive workflow (passed to oozie job -config below)
nameNode=hdfs://compute-77.cloudwickdc.local:8020
jobTracker=compute-77.cloudwickdc.local:8050
queueName=default
examplesRoot=examples
oozie.use.system.libpath=true
oozie.wf.application.path=${nameNode}/user/prithvi
-- tt.hql: rebuild wells_poc.oozie_test2 from the Sqoop-imported table
drop table if exists wells_poc.oozie_test2;
create table wells_poc.oozie_test2 as select * from atlas_test_table_sqoop1;
<workflow-app xmlns="uri:oozie:workflow:0.4" name="hive-wf">
  <credentials>
    <credential name="hive_credentials" type="hcat">
      <property>
        <name>hcat.metastore.uri</name>
        <value>thrift://compute-79.cloudwickdc.local:9083</value>
      </property>
      <property>
        <name>hcat.metastore.principal</name>
        <value>hive/[email protected]</value>
      </property>
    </credential>
  </credentials>
  <start to="hive-node"/>
  <action name="hive-node" cred="hive_credentials">
    <hive xmlns="uri:oozie:hive-action:0.2">
      <job-tracker>${jobTracker}</job-tracker>
      <name-node>${nameNode}</name-node>
      <configuration>
        <property>
          <name>mapred.job.queue.name</name>
          <value>${queueName}</value>
        </property>
      </configuration>
      <script>tt.hql</script>
    </hive>
    <ok to="end"/>
    <error to="fail"/>
  </action>
  <kill name="fail">
    <message>Hive failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
  </kill>
  <end name="end"/>
</workflow-app>
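Oozie looks for workflow.xml under oozie.wf.application.path, and the Hive action resolves tt.hql relative to that same directory, so both files have to be in HDFS before submitting; the local file names below are assumptions.
# Stage the workflow definition and the Hive script into the application path
hdfs dfs -mkdir -p /user/prithvi
hdfs dfs -put -f workflow.xml tt.hql /user/prithvi/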
# Point the Oozie client at the server, then submit the workflow and check its status
export OOZIE_URL=http://compute-80.cloudwickdc.local:11000/oozie
oozie job -config test.properties -run
oozie job -info <oozie_id>
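Two more standard Oozie client calls that are useful while iterating on the workflow:
# Fetch the job log, or kill a stuck run
oozie job -log <oozie_id>
oozie job -kill <oozie_id>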
%livy2.spark
// Verify the table created by the Oozie workflow from Zeppelin's livy2 interpreter
val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
hiveContext.sql("select count(*) from wells_poc.oozie_test2").collect()
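The same count can be checked outside Zeppelin with beeline; the HiveServer2 host and port below are assumptions (based on the metastore host used earlier), not values from the gist.
# Hypothetical beeline check of the workflow's output table
beeline -u "jdbc:hive2://compute-79.cloudwickdc.local:10000/default" -e "select count(*) from wells_poc.oozie_test2"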