# Create a saved Sqoop job that imports via a free-form query; single quotes keep $CONDITIONS from being expanded by the shell
sqoop job --create test_job1 -- import --connect "jdbc:sqlserver://192.168.1.206;database=wells-fargo" --username sa --password Hadoop123 --driver com.microsoft.sqlserver.jdbc.SQLServerDriver --query 'select * from dbo.tVulnMgmt_DeltaRiskAGGHist where $CONDITIONS' --target-dir "/tmp/dbo-delta2" -m 1
# List saved jobs and execute the one created above
sqoop job --list
sqoop job --exec test_job1
# One-off import of the same query; $CONDITIONS is escaped because the query is in double quotes
sqoop import --connect "jdbc:sqlserver://192.168.1.206;database=wells-fargo" --username sa --password Hadoop123 --driver com.microsoft.sqlserver.jdbc.SQLServerDriver --query "select * from dbo.tVulnMgmt_DeltaRiskAGGHist WHERE \$CONDITIONS" --target-dir "/tmp/dbo-delta" --split-by "Date"
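A quick way to sanity-check either import is to list the target directory and sample a part file; the part file name below assumes the default single-mapper output layout and is not taken from the gist.
# Hypothetical check of the imported data (part file name assumes the -m 1 layout)
hdfs dfs -ls /tmp/dbo-delta2
hdfs dfs -cat /tmp/dbo-delta2/part-m-00000 | head -n 5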
# Smoke-test Spark on YARN with the bundled SparkPi example (HDP 2.6.1 spark-examples jar)
spark-submit --class org.apache.spark.examples.SparkPi --deploy-mode client --master yarn spark-examples_2.11-2.1.1.2.6.1.0-129.jar
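In client mode the Pi estimate is printed to the submitting console; if the job were run in cluster mode instead, the driver output would have to be pulled from the YARN logs. The application id is a placeholder in the style of the gist's other placeholders.
# Pull logs for a finished YARN application
yarn logs -applicationId <application_id>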
# Recommended Hive tuning properties (set in hive-site.xml or per session)
hive.optimize.reducededuplication.min.reducer=4
hive.optimize.reducededuplication=true
hive.merge.mapfiles=true
hive.merge.mapredfiles=false
hive.merge.smallfiles.avgsize=16000000
hive.merge.size.per.task=256000000
hive.merge.sparkfiles=true
hive.auto.convert.join=true
hive.auto.convert.join.noconditionaltask=true
hive.auto.convert.join.noconditionaltask.size=20971520 (20 MB, in bytes; might need to increase for Spark, e.g. 200 MB)
hive.optimize.bucketmapjoin.sortedmerge=false
hive.map.aggr.hash.percentmemory=0.5
hive.map.aggr=true
hive.optimize.sort.dynamic.partition=false
hive.stats.autogather=true
hive.stats.fetch.column.stats=true
hive.compute.query.using.stats=true
hive.limit.pushdown.memory.usage=0.4 (MR and Spark)
hive.optimize.index.filter=true
hive.exec.reducers.bytes.per.reducer=67108864
hive.smbjoin.cache.rows=10000
hive.fetch.task.conversion=more
hive.fetch.task.conversion.threshold=1073741824
hive.optimize.ppd=true
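A minimal sketch of overriding a couple of these values for a single Hive CLI session, assuming the wells_poc.oozie_test2 table created later in this gist already exists:
# Per-session overrides via --hiveconf; the query is just an example
hive --hiveconf hive.auto.convert.join=true \
     --hiveconf hive.exec.reducers.bytes.per.reducer=67108864 \
     -e "select count(*) from wells_poc.oozie_test2"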
# test.properties for the Oozie Hive workflow (passed to oozie job -config below)
nameNode=hdfs://compute-77.cloudwickdc.local:8020
jobTracker=compute-77.cloudwickdc.local:8050
queueName=default
examplesRoot=examples
oozie.use.system.libpath=true
oozie.wf.application.path=${nameNode}/user/prithvi
-- tt.hql: rebuild wells_poc.oozie_test2 from the Sqoop-imported table
drop table if exists wells_poc.oozie_test2;
create table wells_poc.oozie_test2 as select * from atlas_test_table_sqoop1;
<workflow-app xmlns="uri:oozie:workflow:0.4" name="hive-wf">
  <credentials>
    <credential name="hive_credentials" type="hcat">
      <property>
        <name>hcat.metastore.uri</name>
        <value>thrift://compute-79.cloudwickdc.local:9083</value>
      </property>
      <property>
        <name>hcat.metastore.principal</name>
        <value>hive/[email protected]</value>
      </property>
    </credential>
  </credentials>
  <start to="hive-node"/>
  <action name="hive-node" cred="hive_credentials">
    <hive xmlns="uri:oozie:hive-action:0.2">
      <job-tracker>${jobTracker}</job-tracker>
      <name-node>${nameNode}</name-node>
      <configuration>
        <property>
          <name>mapred.job.queue.name</name>
          <value>${queueName}</value>
        </property>
      </configuration>
      <script>tt.hql</script>
    </hive>
    <ok to="end"/>
    <error to="fail"/>
  </action>
  <kill name="fail">
    <message>Hive failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
  </kill>
  <end name="end"/>
</workflow-app>
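Oozie looks for workflow.xml under oozie.wf.application.path, and the Hive action resolves tt.hql relative to that same directory, so both files have to be in HDFS before submitting; the local file names below are assumptions.
# Stage the workflow definition and the Hive script into the application path
hdfs dfs -mkdir -p /user/prithvi
hdfs dfs -put -f workflow.xml tt.hql /user/prithvi/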
# Point the Oozie client at the server, then submit the workflow and check its status
export OOZIE_URL=http://compute-80.cloudwickdc.local:11000/oozie
oozie job -config test.properties -run
oozie job -info <oozie_id>
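Two more standard Oozie client calls that are useful while iterating on the workflow:
# Fetch the job log, or kill a stuck run
oozie job -log <oozie_id>
oozie job -kill <oozie_id>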
%livy2.spark
// Verify the table created by the Oozie workflow from Zeppelin's livy2 interpreter
val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
hiveContext.sql("select count(*) from wells_poc.oozie_test2").collect()
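The same count can be checked outside Zeppelin with beeline; the HiveServer2 host and port below are assumptions (based on the metastore host used earlier), not values from the gist.
# Hypothetical beeline check of the workflow's output table
beeline -u "jdbc:hive2://compute-79.cloudwickdc.local:10000/default" -e "select count(*) from wells_poc.oozie_test2"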