Commit

change: 1. Reduce log output
add: 1. Add a usage example combining Spark 1.0 with Hive
Kyofin committed Jun 22, 2019
1 parent 9a36e2e commit 58024d9
Showing 11 changed files with 149 additions and 57 deletions.
10 changes: 5 additions & 5 deletions pom.xml
@@ -36,11 +36,11 @@
<version>1.18.8</version>
</dependency>

<!--<dependency>-->
<!--<groupId>org.apache.spark</groupId>-->
<!--<artifactId>spark-hive_2.11</artifactId>-->
<!--<version>2.4.0</version>-->
<!--</dependency>-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>2.4.0</version>
</dependency>

<dependency>
<groupId>org.apache.spark</groupId>
27 changes: 0 additions & 27 deletions src/main/java/com/wugui/sparkstarter/SimpleApp.java

This file was deleted.

4 changes: 2 additions & 2 deletions src/main/java/com/wugui/sparkstarter/SparkFlatMapJava.java
@@ -14,8 +14,8 @@ public class SparkFlatMapJava {
public static void main(String[] args){
SparkConf conf = new SparkConf().setMaster("local").setAppName("SparkFlatMapJava");
JavaSparkContext sc = new JavaSparkContext(conf);
// Reduce log output
sc.setLogLevel("ERROR");



//java实现
flatMapJava(sc);
66 changes: 66 additions & 0 deletions src/main/java/com/wugui/sparkstarter/SparkHiveOldVersion.java
@@ -0,0 +1,66 @@
package com.wugui.sparkstarter;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.hive.HiveContext;

/**
* How to use Hive from older versions of Spark.
* Since Spark 2.0, HiveContext is no longer recommended; the unified SparkSession replaces it.
**/
public class SparkHiveOldVersion {


public static void main(String[] args) {
SparkConf conf = new SparkConf();
conf.setAppName("hive");
conf.setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);

// HiveContext is a subclass of SQLContext.
HiveContext hiveContext = new HiveContext(sc);
hiveContext.sql("USE default");
hiveContext.sql("show tables").show();
hiveContext.sql("DROP TABLE IF EXISTS student_infos");
// Create the student_infos table in Hive
hiveContext.sql("CREATE TABLE IF NOT EXISTS student_infos (name STRING,age INT) row format delimited fields terminated by ',' ");
hiveContext.sql("load data local inpath '/Users/huzekang/study/spark-starter/src/main/resources/student_infos.txt' into table student_infos");

hiveContext.sql("DROP TABLE IF EXISTS student_scores");
hiveContext.sql("CREATE TABLE IF NOT EXISTS student_scores (name STRING, score INT) row format delimited fields terminated by ','");
hiveContext.sql("LOAD DATA "
+ "LOCAL INPATH '/Users/huzekang/study/spark-starter/src/main/resources/student_scores.txt' "
+ "INTO TABLE student_scores");

hiveContext.sql("select * from student_infos").show();
hiveContext.sql("select * from student_scores").show();
/**
* Query the tables and join them into a DataFrame
*/
Dataset goodStudentsDF = hiveContext.sql("SELECT si.name, si.age, ss.score "
+ "FROM student_infos si "
+ "JOIN student_scores ss "
+ "ON si.name=ss.name "
+ "WHERE ss.score>=80");

hiveContext.sql("DROP TABLE IF EXISTS good_student_infos");

goodStudentsDF.registerTempTable("goodstudent");
Dataset result = hiveContext.sql("select * from goodstudent");
result.show();

/**
* Save the result to the Hive table good_student_infos
*/
goodStudentsDF.write().mode(SaveMode.Overwrite).saveAsTable("good_student_infos");

// Row[] goodStudentRows = hiveContext.table("good_student_infos").collect();
// for(Row goodStudentRow : goodStudentRows) {
// System.out.println(goodStudentRow);
// }
sc.stop();
}

}
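
For comparison, here is a minimal sketch of the Spark 2.x replacement that the Javadoc above points to: a SparkSession built with enableHiveSupport() covers everything HiveContext does in this file. This companion example is not part of the commit, and the class name SparkHiveNewVersion is hypothetical.

package com.wugui.sparkstarter;

import org.apache.spark.sql.SparkSession;

public class SparkHiveNewVersion {

    public static void main(String[] args) {
        // Spark 2.x: one unified entry point; enableHiveSupport() replaces HiveContext
        SparkSession spark = SparkSession
                .builder()
                .master("local")
                .appName("hive")
                .enableHiveSupport()
                .getOrCreate();

        // The same SQL statements work unchanged through spark.sql(...)
        spark.sql("USE default");
        spark.sql("show tables").show();

        spark.stop();
    }
}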
4 changes: 0 additions & 4 deletions src/main/java/com/wugui/sparkstarter/SparkMapJava.java
@@ -12,16 +12,12 @@ public class SparkMapJava {
public static void main(String[] args){
SparkConf conf = new SparkConf().setMaster("local").setAppName("SparkFlatMapJava");
JavaSparkContext sc = new JavaSparkContext(conf);
// Reduce log output
sc.setLogLevel("ERROR");

// Java implementation
mapJava(sc);

// Java 8 implementation
mapJava8(sc);


}

public static void mapJava(JavaSparkContext sc){
src/main/java/com/wugui/sparkstarter/SparkSqlJava.java → src/main/java/com/wugui/sparkstarter/SparkSessionStarter.java
@@ -15,24 +15,31 @@

/**
*
* @program: spark-starter
* @author: huzekang
* @create: 2019-06-20 21:46
**/
public class SparkSqlJava {
* Use Spark 2.0's SparkSession for unified API access.
* Spark 2.0 has a single entry point (the Spark session), from which the other entry points (Spark context, streaming context, etc.) can be obtained.
*
* @author huzekang
*/
public class SparkSessionStarter {
public static void main(String[] args) {
// In early versions of Spark, SparkContext was the entry point. RDDs are Spark's core API, and creating and operating on them requires the APIs that SparkContext provides;
// for anything beyond RDDs you needed a different context:
// StreamingContext for stream processing, SQLContext for SQL, and HiveContext for Hive.
// As the Dataset and DataFrame APIs gradually became the new standard, a unified way to build them was needed, so Spark 2.0 introduced a new entry point: SparkSession.
// SparkConf, SparkContext and SQLContext are all encapsulated within SparkSession.
// SparkSession is essentially the combination of SQLContext and HiveContext (and perhaps StreamingContext in the future), so every API available on SQLContext and HiveContext is also available on SparkSession.
// SparkSession wraps a SparkContext internally, so the actual computation is still carried out by the SparkContext.
SparkSession spark = SparkSession
.builder()
.master("local")
.appName("Java Spark SQL basic example")
.config("spark.some.config.option", "some-value")
.getOrCreate();

// quickStart(spark);
quickStart(spark);

// code(spark);
customCode(spark);

reflection(spark);
reflectionCode(spark);

}

@@ -48,10 +55,10 @@ public static void quickStart(SparkSession spark) {
/**
*
* File => JavaRDD => DataFrame
* Infer the RDD schema via reflection:
* Infer the RDD schema via reflection (not recommended):
*   Use this approach when the Spark application can infer the RDD schema; the reflection-based method keeps the code concise.
*/
public static void reflection(SparkSession spark) {
public static void reflectionCode(SparkSession spark) {
// Create an RDD of Person objects from a text file
JavaRDD<Person> peopleRDD = spark.read()
.textFile("/Users/huzekang/study/spark-starter/src/main/resources/people.txt")
@@ -109,9 +116,9 @@ public static void reflection(SparkSession spark) {

/**
* Construct a schema through the programmatic interface, then map it onto the RDD
*   Use this approach when the Spark application cannot infer the RDD schema.
*   Use this approach when the Spark application cannot infer the RDD schema. (recommended)
*/
public static void code(SparkSession spark) {
public static void customCode(SparkSession spark) {
// Create an RDD
JavaRDD<String> peopleRDD = spark.sparkContext()
.textFile("/Users/huzekang/study/spark-starter/src/main/resources/people.txt", 1)
@@ -173,4 +180,6 @@ public static class Person {
private Integer age;

}


}
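
The bodies of reflectionCode and customCode are collapsed in this diff view. As a minimal sketch of the programmatic-schema path the second Javadoc describes — the class and method names are hypothetical, and the "name,age" layout of people.txt is assumed from the rest of this file:

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class ProgrammaticSchemaSketch {

    public static void run(SparkSession spark) {
        JavaRDD<String> peopleRDD = spark.sparkContext()
                .textFile("src/main/resources/people.txt", 1)
                .toJavaRDD();

        // Build the schema by hand instead of relying on reflection over a bean class
        StructType schema = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("name", DataTypes.StringType, true),
                DataTypes.createStructField("age", DataTypes.IntegerType, true)));

        // Map each "name,age" line onto a Row that matches the schema
        JavaRDD<Row> rowRDD = peopleRDD.map(line -> {
            String[] parts = line.split(",");
            return RowFactory.create(parts[0], Integer.parseInt(parts[1].trim()));
        });

        Dataset<Row> peopleDF = spark.createDataFrame(rowRDD, schema);
        peopleDF.createOrReplaceTempView("people");
        spark.sql("SELECT name, age FROM people WHERE age > 20").show();
    }
}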
19 changes: 13 additions & 6 deletions src/main/java/com/wugui/sparkstarter/SparkStarter.java
@@ -5,9 +5,9 @@
import org.apache.spark.api.java.JavaSparkContext;

/**
* @program: spark-starter
* @author: huzekang
* @create: 2019-06-20 21:28
* Two simple usage examples:
* 1. Read a file and print each line
* 2. Word count
**/
public class SparkStarter {

@@ -17,12 +17,19 @@ public static void main(String[] args) {
.setAppName("SparkStarter");
// From here on you work with RDDs
JavaSparkContext sc = new JavaSparkContext(sparkConf);
sc.setLogLevel("ERROR");

JavaRDD<String> stringJavaRDD = sc.textFile("/Users/huzekang/study/spark-starter/src/main/resources/students.txt");

stringJavaRDD.foreach(o -> System.out.println(o));

// Should be some file on your system
String logFile = "file:///Users/huzekang/study/spark-starter/src/main/resources/kv1.txt";

JavaRDD<String> logData = sc.textFile(logFile).cache();
long numAs = logData.filter(s -> s.contains("a")).count();
long numBs = logData.filter( s -> s.contains("b")).count();

System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);

sc.stop();

}
}
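
The Javadoc above lists word count as the second example, but that part of the file sits outside the visible hunk. A minimal sketch of a classic word count against the same JavaSparkContext, reusing logFile purely as an assumed input:

// Illustrative word-count sketch (not part of the commit); requires
// java.util.Arrays, org.apache.spark.api.java.JavaPairRDD and scala.Tuple2.
JavaPairRDD<String, Integer> counts = sc.textFile(logFile)
        .flatMap(line -> Arrays.asList(line.split("\\s+")).iterator())
        .mapToPair(word -> new Tuple2<>(word, 1))
        .reduceByKey(Integer::sum);
counts.foreach(t -> System.out.println(t._1 + ": " + t._2));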
@@ -18,7 +18,7 @@ public static void main(String[] args) throws SQLException {
// 1. Create the Spark configuration and context objects
SparkConf conf = new SparkConf().setAppName("sparkTest").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
sc.setLogLevel("ERROR");


// 2. Read the log file into an RDD using SparkContext's textFile() method
JavaRDD<String> javaRDD = sc.textFile("./app_log.txt");
35 changes: 35 additions & 0 deletions src/main/resources/log4j.properties
@@ -0,0 +1,35 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Set everything to be logged to the console
log4j.rootCategory=WARN, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Settings to quiet third party logs that are too verbose
log4j.logger.org.spark-project.jetty=WARN
log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
log4j.logger.org.apache.parquet=ERROR
log4j.logger.parquet=ERROR

# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
3 changes: 3 additions & 0 deletions src/main/resources/student_infos.txt
@@ -0,0 +1,3 @@
李小龙,18
周大侠,19
肖鸿飞,17
3 changes: 3 additions & 0 deletions src/main/resources/student_scores.txt
@@ -0,0 +1,3 @@
李小龙,67
周大侠,88
肖鸿飞,59
