diff --git a/README.md b/README.md index 0315be2..c7bebe7 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,53 @@ +## 基本原理 +RDD,弹性分布式数据集,即分布式的元素集合。 +在spark中,对所有数据的操作不外乎是创建RDD、转化已有的RDD以及调用RDD操作进行求值。在这一切的背后,Spark会自动将RDD中的数据分发到集群中,并将操作并行化。 +Spark中的RDD就是一个不可变的分布式对象集合。每个RDD都被分为多个分区,这些分区运行在集群中的不同节点上。RDD可以包含Python,Java,Scala中任意类型的对象,甚至可以包含用户自定义的对象。 +用户可以使用两种方法创建RDD:读取一个外部数据集,或在驱动器程序中分发驱动器程序中的对象集合,比如list或者set。 +RDD的转化操作都是惰性求值的,这意味着我们对RDD调用转化操作,操作不会立即执行。相反,Spark会在内部记录下所要求执行的操作的相关信息。我们不应该把RDD看做存放着特定数据的数据集,而最好把每个RDD当做我们通过转化操作构建出来的、记录如何计算数据的指令列表。数据读取到RDD中的操作也是惰性的,数据只会在必要时读取。转化操作和读取操作都有可能多次执行。 -### 代码样例 + +## 代码样例 +Spark入门(四)--Spark的map、flatMap、mapToPair: https://juejin.im/post/5c77e383f265da2d8f474e29#heading-9 -### 本地测试和提交作业 +官方例子: +https://github.com/apache/spark/tree/master/examples/src/main/java/org/apache/spark/examples + +## 常用api +1. map(func):对每行数据使用func,然后返回一个新的RDD,数据处理-每行。 +2. filter(func):对每行数据使用func,然后返回func后为true的数据,用于过滤。 +3. flatMap(func):和map差不多,但是flatMap生成的是多个结果,用于行转列。 +4. groupByKey(numTasks):返回(K,Seq[V]),也就是Hadoop中reduce函数接受的 +key-valuelist +5. reduceByKey(func,[numTasks]):就是用一个给定的reduce func再作用在groupByKey产 +生的(K,Seq[V]),比如求和,求平均数 +6. sortByKey([ascending],[numTasks]):按照key来进行排序,是升序还是降序,ascending + + +## 本地测试和提交作业 参考:https://blog.csdn.net/dream_an/article/details/54915894 - idea上测试spark作业 -- 提交作业到本地spark +引入依赖 +``` + + + org.apache.spark + spark-core_2.11 + 2.4.0 + + + + + org.glassfish.jersey.core + jersey-server + 2.0-m03 + +``` + +- 提交作业到本机的spark环境 + 将使用`mvn clean package`打包好的作业提交到本地安装好的spark上跑 ``` ~/opt/spark-2.4.0-bin-hadoop2.7 » bin/spark-submit --class "com.wugui.sparkstarter.SimpleApp" /Users/huzekang/study/spark-starter/target/spark-starter-1.0-SNAPSHOT.jar diff --git a/pom.xml b/pom.xml index b91f7a7..6cda958 100644 --- a/pom.xml +++ b/pom.xml @@ -16,7 +16,8 @@ 1.8 - + + org.apache.spark spark-core_2.11 2.4.0 @@ -29,6 +30,53 @@ 2.0-m03 + + org.projectlombok + lombok + 1.18.8 + + + + + + + + + + org.apache.spark + spark-sql_2.11 + 2.4.0 + + + + + + + + + + + + + + + + + + mysql + mysql-connector-java + 5.1.47 + + + + + + + + + + + \ No newline at end of file diff --git a/src/main/java/com/wugui/sparkstarter/SparkSqlJava.java b/src/main/java/com/wugui/sparkstarter/SparkSqlJava.java new file mode 100644 index 0000000..bda539a --- /dev/null +++ b/src/main/java/com/wugui/sparkstarter/SparkSqlJava.java @@ -0,0 +1,153 @@ +package com.wugui.sparkstarter; + + +import lombok.Data; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.*; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +import java.util.ArrayList; +import java.util.List; + +/** + * @program: spark-starter + * @author: huzekang + * @create: 2019-06-20 21:46 + **/ +public class SparkSqlJava { + public static void main(String[] args) { + SparkSession spark = SparkSession + .builder() + .master("local") + .appName("Java Spark SQL basic example") + .config("spark.some.config.option", "some-value") + .getOrCreate(); + + + code(spark); + + + } + + /** + * 快速测试例子 + */ + public static void quickStart(SparkSession spark) { + Dataset df = spark.read().text("/Users/huzekang/study/spark-starter/src/main/resources/students.txt"); + df.show(); + } + + + /** + * 使用反射机制推断RDD的数据结构 : + *   当spark应用可以推断RDD数据结构时,可使用这种方式。这种基于反射的方法可以使代码更简洁有效。 + */ + public static void reflection(SparkSession spark) { + // Create an RDD of Person objects from a text file + JavaRDD peopleRDD = spark.read() + .textFile("/Users/huzekang/study/spark-starter/src/main/resources/people.txt") + .javaRDD() + .map(line -> { + String[] parts = line.split(","); + Person person = new Person(); + person.setName(parts[0]); + person.setAge(Integer.parseInt(parts[1].trim())); + return person; + }); + + // Apply a schema to an RDD of JavaBeans to get a DataFrame + Dataset peopleDF = spark.createDataFrame(peopleRDD, Person.class); + // Register the DataFrame as a temporary view + peopleDF.createOrReplaceTempView("people"); + + // SQL statements can be run by using the sql methods provided by spark + Dataset teenagersDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19"); + + // The columns of a row in the result can be accessed by field index + Encoder stringEncoder = Encoders.STRING(); + Dataset teenagerNamesByIndexDF = teenagersDF.map( + (MapFunction) row -> "Name: " + row.getString(0), + stringEncoder); + teenagerNamesByIndexDF.show(); + // +------------+ + // | value| + // +------------+ + // |Name: Justin| + // +------------+ + + // or by field name + Dataset teenagerNamesByFieldDF = teenagersDF.map( + (MapFunction) row -> "Name: " + row.getAs("name"), + stringEncoder); + teenagerNamesByFieldDF.show(); + // +------------+ + // | value| + // +------------+ + // |Name: Justin| + // +------------+ + } + + + /** + * 通过编程接口构造一个数据结构,然后映射到RDD上 + *   当spark应用无法推断RDD数据结构时,可使用这种方式。 + */ + public static void code(SparkSession spark) { +// Create an RDD + JavaRDD peopleRDD = spark.sparkContext() + .textFile("/Users/huzekang/study/spark-starter/src/main/resources/people.txt", 1) + .toJavaRDD(); + +// The schema is encoded in a string + String schemaString = "name age"; + +// Generate the schema based on the string of schema + List fields = new ArrayList<>(); + for (String fieldName : schemaString.split(" ")) { + StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true); + fields.add(field); + } + StructType schema = DataTypes.createStructType(fields); + +// Convert records of the RDD (people) to Rows + JavaRDD rowRDD = peopleRDD.map((Function) record -> { + String[] attributes = record.split(","); + return RowFactory.create(attributes[0], attributes[1].trim()); + }); + +// Apply the schema to the RDD + Dataset peopleDataFrame = spark.createDataFrame(rowRDD, schema); + +// Creates a temporary view using the DataFrame + peopleDataFrame.createOrReplaceTempView("people"); + +// SQL can be run over a temporary view created using DataFrames + Dataset results = spark.sql("SELECT name FROM people"); + +// The results of SQL queries are DataFrames and support all the normal RDD operations +// The columns of a row in the result can be accessed by field index or by field name + Dataset namesDS = results.map( + (MapFunction) row -> "Name: " + row.getString(0), + Encoders.STRING()); + namesDS.show(); +// +-------------+ +// | value| +// +-------------+ +// |Name: Michael| +// | Name: Andy| +// | Name: Justin| +// +-------------+ + } + + @Data + public static class Person { + + private String name; + private Integer age; + + } +} diff --git a/src/main/java/com/wugui/sparkstarter/SparkStarter.java b/src/main/java/com/wugui/sparkstarter/SparkStarter.java new file mode 100644 index 0000000..4836d56 --- /dev/null +++ b/src/main/java/com/wugui/sparkstarter/SparkStarter.java @@ -0,0 +1,28 @@ +package com.wugui.sparkstarter; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; + +/** + * @program: spark-starter + * @author: huzekang + * @create: 2019-06-20 21:28 + **/ +public class SparkStarter { + + public static void main(String[] args) { + SparkConf sparkConf = new SparkConf() + .setMaster("local[5]") + .setAppName("SparkStarter"); + //之后你用的是Rdd + JavaSparkContext sc = new JavaSparkContext(sparkConf); + sc.setLogLevel("ERROR"); + + JavaRDD stringJavaRDD = sc.textFile("/Users/huzekang/study/spark-starter/src/main/resources/students.txt"); + + stringJavaRDD.foreach(o -> System.out.println(o)); + + + } +} diff --git a/src/main/java/com/wugui/sparkstarter/demo1/AccessLogInfo.java b/src/main/java/com/wugui/sparkstarter/demo1/AccessLogInfo.java new file mode 100644 index 0000000..a154cf2 --- /dev/null +++ b/src/main/java/com/wugui/sparkstarter/demo1/AccessLogInfo.java @@ -0,0 +1,21 @@ +package com.wugui.sparkstarter.demo1; + +import lombok.AllArgsConstructor; +import lombok.Data; + +import java.io.Serializable; + + +//log日志的javaBean +@Data +@AllArgsConstructor +public class AccessLogInfo implements Serializable { + + private static final long serivaVersionUID = 1L; + private long Timestamp; + private long upTraffic; + private long downTraffic; + + + +} \ No newline at end of file diff --git a/src/main/java/com/wugui/sparkstarter/demo1/AccessLogSortKey.java b/src/main/java/com/wugui/sparkstarter/demo1/AccessLogSortKey.java new file mode 100644 index 0000000..f3425e5 --- /dev/null +++ b/src/main/java/com/wugui/sparkstarter/demo1/AccessLogSortKey.java @@ -0,0 +1,141 @@ +package com.wugui.sparkstarter.demo1; + +import lombok.Data; +import scala.math.Ordered; + +import java.io.Serializable; + +/** + * 根据key进行二次的排序,先按上行流量,在按下行流量,都一样的话安装时间戳排序 + */ +@Data +public class AccessLogSortKey implements Ordered,Serializable{ + //定义排序的三个对象 + private long timestamp; + private long upTraffic; + private long downTraffic; + + + + public AccessLogSortKey(long timestamp, long upTraffic, long downTraffic) { + this.timestamp = timestamp; + this.upTraffic = upTraffic; + this.downTraffic = downTraffic; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + AccessLogSortKey that = (AccessLogSortKey) o; + + if (timestamp != that.timestamp) return false; + if (upTraffic != that.upTraffic) return false; + return downTraffic == that.downTraffic; + } + + @Override + public int hashCode() { + int result = (int) (timestamp ^ (timestamp >>> 32)); + result = 31 * result + (int) (upTraffic ^ (upTraffic >>> 32)); + result = 31 * result + (int) (downTraffic ^ (downTraffic >>> 32)); + return result; + } + + + + private static final long serivaVersionUID = 1L; + @Override + public int compare(AccessLogSortKey other) { + if(upTraffic - other.upTraffic !=0){ + return (int)(upTraffic - other.upTraffic); + }else if (downTraffic - other.downTraffic !=0){ + return (int)(downTraffic - other.downTraffic ); + }else if(timestamp -other.timestamp!=0){ + return (int)(timestamp -other.timestamp); + } + return 0; + } + + @Override + public int compareTo(AccessLogSortKey other) { + if(upTraffic - other.upTraffic !=0){ + return (int)(upTraffic - other.upTraffic); + }else if (downTraffic - other.downTraffic !=0){ + return (int)(downTraffic - other.downTraffic ); + }else if(timestamp -other.timestamp!=0){ + return (int)(timestamp -other.timestamp); + } + return 0; + } + + @Override + public boolean $less(AccessLogSortKey other) { + if(upTrafficother.upTraffic){ + return true; + }else if(upTraffic==other.upTraffic&&downTraffic>other.downTraffic){ + return true; + }else if(upTraffic==other.upTraffic&&downTraffic==other.downTraffic&×tamp>other.timestamp){ + return true; + } + return false; + } + + @Override + public boolean $less$eq(AccessLogSortKey other) { + if($less(other)){ + return true; + }else if (upTraffic==other.upTraffic&&downTraffic==other.downTraffic&×tamp==other.timestamp){ + return true; + } + return false; + } + + @Override + public boolean $greater$eq(AccessLogSortKey other) { + if($greater(other)){ + return true; + }else if (upTraffic==other.upTraffic&&downTraffic==other.downTraffic&×tamp==other.timestamp){ + return true; + } + return false; + } + + +} + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/main/java/com/wugui/sparkstarter/demo1/AppLogSparkApplication.java b/src/main/java/com/wugui/sparkstarter/demo1/AppLogSparkApplication.java new file mode 100644 index 0000000..e3dcce6 --- /dev/null +++ b/src/main/java/com/wugui/sparkstarter/demo1/AppLogSparkApplication.java @@ -0,0 +1,167 @@ +package com.wugui.sparkstarter.demo1; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function2; +import org.apache.spark.api.java.function.PairFunction; +import scala.Tuple2; + +import java.sql.PreparedStatement; +import java.sql.SQLException; +import java.util.List; + +public class AppLogSparkApplication { + static DBHelper db1 = null; + public static void main(String[] args) throws SQLException { + //1.创建spark配置文件和上下文对象 + SparkConf conf = new SparkConf().setAppName("sparkTest").setMaster("local"); + JavaSparkContext sc = new JavaSparkContext(conf); + sc.setLogLevel("ERROR"); + + //2.读取日志文件并创建一个RDD,使用SparkContext的textFile()方法 + JavaRDD javaRDD = sc.textFile("./app_log.txt"); + + //将RDD映射成为key-value格式,为后面的reducebykey聚合做准备。 + JavaPairRDD accessLogPairRdd = mapAccessLogRDD2Pair(javaRDD); + //根据deviceID返回聚合后的结果 + JavaPairRDD aggregateLogPairRDD = aggregateByDeviceId(accessLogPairRdd); + + //将按照deviceID聚合的key映射为二次排序的key,value映射为deviceID + JavaPairRDD accessLogSortRDD = mapRDDkey2SortKey(aggregateLogPairRDD); + ///实现降序排序 + JavaPairRDD sortedAccessLogRDD= accessLogSortRDD.sortByKey(false); + //获取前 top 10 + List> top10DataList = sortedAccessLogRDD.take(10); + //创建dbhelp对象 + db1 = new DBHelper(); + + String sql = "insert into spark(deviceId,upTraffic,downTraffic,timeStamp) values(?,?,?,?)"; + + //打印前top 10 + for(Tuple2 data : top10DataList){ +// System.out.println(data._2 +" "+data._1.getUpTraffic()); + PreparedStatement pt = db1.conn.prepareStatement(sql); + pt.setString(1,data._2); + pt.setString(2,data._1.getUpTraffic()+""); + pt.setString(3,data._1.getDownTraffic()+""); + pt.setString(4,data._1.getTimestamp()+""); + //注意让pt执行 + pt.executeUpdate(); + } + + //关闭上下文 + sc.close(); + + } + //将日志的RDD映射为key-value的格式 + private static JavaPairRDD mapAccessLogRDD2Pair(JavaRDD javaRDD){ + //PairFunction中第一个string表示的是传入的参数,后面两个代表返回值javaRDD + return javaRDD.mapToPair(new PairFunction() { + + private static final long serivaVersionUID = 1L; + @Override + //进行一行一行的读取 + public Tuple2 call(String javaRDD) throws Exception { + //根据\t进行切分 + String[] accessLogSplited = javaRDD.split("\t"); + //获取四个字段 + long timestamp = Long.valueOf(accessLogSplited[0]); + String deviceID = accessLogSplited[1]; + long upTraffic = Long.valueOf(accessLogSplited[2]); + long downTraffic = Long.valueOf(accessLogSplited[3]); + // 将时间戳,上行流量和下行流量封装为自定义的可序列化对象 + AccessLogInfo accessLogInfo = new AccessLogInfo(timestamp,upTraffic,downTraffic); + return new Tuple2(deviceID,accessLogInfo); + } + }); + + } + /** + * 根据deviceID进行聚合求出上行和下行的流量,及其最早访问的时间 + */ + private static JavaPairRDD aggregateByDeviceId( JavaPairRDD accessLogPairRdd){ + //Function2的前两个accessLogInfo对应call的前两个,第三个是返回的 + return accessLogPairRdd.reduceByKey(new Function2() { + private static final long serivaVersionUID = 1L; + @Override + public AccessLogInfo call(AccessLogInfo accessLogInfo1, AccessLogInfo accessLogInfo2) throws Exception { + long timestamp = accessLogInfo1.getTimestamp() < accessLogInfo2.getTimestamp()?accessLogInfo1.getTimestamp():accessLogInfo2.getTimestamp(); + long upTraffic = accessLogInfo1.getUpTraffic()+accessLogInfo2.getUpTraffic(); + long downTraffic=accessLogInfo1.getDownTraffic()+accessLogInfo2.getDownTraffic(); + //进行聚合之后产生一个AccessLogInfo + AccessLogInfo accessLogInfo = new AccessLogInfo(timestamp,upTraffic,downTraffic); + return accessLogInfo; + } + }); + } + + /** + * 将RDD的key映射为二次排序的key + */ + private static JavaPairRDD mapRDDkey2SortKey(JavaPairRDD aggregateLogPairRDD){ + //后两个为返回的参数 + return aggregateLogPairRDD.mapToPair(new PairFunction, AccessLogSortKey,String>() { + private static final long serivaVersionUID = 1L; + @Override + //tuple的key是deviceID,value是AccessLogInfo + public Tuple2 call(Tuple2 tuple ) throws Exception { + String deviceID= tuple._1; + AccessLogInfo accessLogInfo = tuple._2; + AccessLogSortKey accessLogSortKey = new AccessLogSortKey(accessLogInfo.getTimestamp(),accessLogInfo.getUpTraffic(),accessLogInfo.getDownTraffic()); + //new 出去一个新的Tuple,这时候key变成了二次排序的key + return new Tuple2(accessLogSortKey,deviceID); + } + }); + } +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/main/java/com/wugui/sparkstarter/demo1/DBHelper.java b/src/main/java/com/wugui/sparkstarter/demo1/DBHelper.java new file mode 100644 index 0000000..7c1a556 --- /dev/null +++ b/src/main/java/com/wugui/sparkstarter/demo1/DBHelper.java @@ -0,0 +1,68 @@ +package com.wugui.sparkstarter.demo1; + +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; + +public class DBHelper { + public static final String url = "jdbc:mysql://192.168.5.148:3306/dataset"; + public static final String driver = "com.mysql.jdbc.Driver"; + public static final String user = "root"; + public static final String password ="eWJmP7yvpccHCtmVb61Gxl2XLzIrRgmT"; + + //获取数据库链接 + public Connection conn = null; + public DBHelper(){ + try{ + Class.forName(driver ); + conn = DriverManager.getConnection(url,user,password); + }catch (Exception e){ + e.printStackTrace(); + } + } + + public void close(){ + try { + this.conn.close(); + } catch (SQLException e) { + e.printStackTrace(); + } + } +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/main/java/com/wugui/sparkstarter/demo1/MockData.java b/src/main/java/com/wugui/sparkstarter/demo1/MockData.java new file mode 100644 index 0000000..0dffd98 --- /dev/null +++ b/src/main/java/com/wugui/sparkstarter/demo1/MockData.java @@ -0,0 +1,44 @@ +package com.wugui.sparkstarter.demo1; + +import org.apache.spark.sql.catalyst.expressions.Rand; + +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.util.*; + +public class MockData { + public static void main(String[] args) { + StringBuffer sb = new StringBuffer(); + Random rand = new Random(); + List device = new ArrayList(); + for(int i=0;i<100;i++){ + device.add(getUUID()); + } + for(int j=0;j<1000;j++){ + Calendar cal = Calendar.getInstance(); + cal.setTime(new Date()); + cal.add(Calendar.MINUTE,-rand.nextInt(6000)); + long timeStamp =cal.getTimeInMillis(); + String deviceId = device.get(rand.nextInt(100)); + + long upTraffic = rand.nextInt(100000); + long downTraffic= rand.nextInt(10000); + sb.append(timeStamp).append("\t").append(deviceId).append("\t").append(upTraffic).append("\t").append(downTraffic).append("\n"); + } + PrintWriter pw = null; + try { + pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream("./app_log.txt"))); + pw.write(sb.toString()); + } catch (FileNotFoundException e) { + + }finally { + pw.close(); + } + + } + public static String getUUID(){ + return UUID.randomUUID().toString().replace("-",""); + } +} \ No newline at end of file diff --git a/src/main/resources/employees.json b/src/main/resources/employees.json new file mode 100644 index 0000000..b548d15 --- /dev/null +++ b/src/main/resources/employees.json @@ -0,0 +1,1189 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + spark/employees.json at 5264164a67df498b73facae207eda12ee133be7d · apache/spark + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Skip to content +
+ + + + + + + + + + +
+ +
+ + +
+ +
+ + + +
+
+
+ + + + + + + + + + + + +
+
+ +
    + + + +
  • + + + + +
  • + +
  • + +
    + +
    + + + Watch + + +
    + Notifications +
    +
    + + + + + + + +
    +
    +
    + +
    +
  • + +
  • +
    +
    + + +
    +
    + + +
    + +
  • + +
  • +
    +
    + +
  • +
+ +

+ + /spark + + +

+ +
+ + + + + + +
+
+
+ + + + + + + Permalink + + + + +
+ + +
+ + Tree: + 5264164a67 + + + + + + + +
+ +
+ + Find file + + + Copy path + +
+
+ + +
+ + Find file + + + Copy path + +
+
+ + + + +
+ + +
+
+ + 1 contributor + + +
+ +

+ Users who have contributed to this file +

+
+ +
+
+
+
+ + + + + +
+ +
+ +
+ 5 lines (4 sloc) + + 130 Bytes +
+ +
+ +
+ Raw + Blame + History +
+ + +
+ + + + +
+
+
+ + + + + + +
+ + + + + + + + + + + + + + + + + + +
{"name":"Michael", "salary":3000}
{"name":"Andy", "salary":4500}
{"name":"Justin", "salary":3500}
{"name":"Berta", "salary":4000}
+ + + +
+ +
+ + + +
+ + +
+ + +
+
+ + + +
+ +
+ +
+
+ + +
+ + + + + + +
+ + + You can’t perform that action at this time. +
+ + + + + + + + + + + + + + +
+ + + + diff --git a/src/main/resources/full_user.avsc b/src/main/resources/full_user.avsc new file mode 100644 index 0000000..ad38234 --- /dev/null +++ b/src/main/resources/full_user.avsc @@ -0,0 +1,1128 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + spark/full_user.avsc at 5264164a67df498b73facae207eda12ee133be7d · apache/spark + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Skip to content +
+ + + + + + + + + + +
+ +
+ + +
+ +
+ + + +
+
+
+ + + + + + + + + + + + +
+
+ +
    + + + +
  • + + + + +
  • + +
  • + +
    + +
    + + + Watch + + +
    + Notifications +
    +
    + + + + + + + +
    +
    +
    + +
    +
  • + +
  • +
    +
    + + +
    +
    + + +
    + +
  • + +
  • +
    +
    + +
  • +
+ +

+ + /spark + + +

+ +
+ + + + + + +
+
+
+ + + + + + + Permalink + + + + +
+ + +
+ + Tree: + 5264164a67 + + + + + + + +
+ +
+ + Find file + + + Copy path + +
+
+ + +
+ + Find file + + + Copy path + +
+
+ + + + +
+ Fetching contributors… +
+ +
+ + Cannot retrieve contributors at this time +
+
+ + + + +
+ +
+ +
+ 1 lines (1 sloc) + + 240 Bytes +
+ +
+ +
+ Raw + Blame + History +
+ + +
+ + + + +
+
+
+ + + + + + +
+ + + + + + +
{"type": "record", "namespace": "example.avro", "name": "User", "fields": [{"type": "string", "name": "name"}, {"type": ["string", "null"], "name": "favorite_color"}, {"type": {"items": "int", "type": "array"}, "name": "favorite_numbers"}]}
+ + + +
+ +
+ + + +
+ + +
+ + +
+
+ + + +
+ +
+ +
+
+ + +
+ + + + + + +
+ + + You can’t perform that action at this time. +
+ + + + + + + + + + + + + + +
+ + + + diff --git a/src/main/resources/kv1.txt b/src/main/resources/kv1.txt new file mode 100644 index 0000000..5b82fe5 --- /dev/null +++ b/src/main/resources/kv1.txt @@ -0,0 +1,500 @@ +238 val_238 +86 val_86 +311 val_311 +27 val_27 +165 val_165 +409 val_409 +255 val_255 +278 val_278 +98 val_98 +484 val_484 +265 val_265 +193 val_193 +401 val_401 +150 val_150 +273 val_273 +224 val_224 +369 val_369 +66 val_66 +128 val_128 +213 val_213 +146 val_146 +406 val_406 +429 val_429 +374 val_374 +152 val_152 +469 val_469 +145 val_145 +495 val_495 +37 val_37 +327 val_327 +281 val_281 +277 val_277 +209 val_209 +15 val_15 +82 val_82 +403 val_403 +166 val_166 +417 val_417 +430 val_430 +252 val_252 +292 val_292 +219 val_219 +287 val_287 +153 val_153 +193 val_193 +338 val_338 +446 val_446 +459 val_459 +394 val_394 +237 val_237 +482 val_482 +174 val_174 +413 val_413 +494 val_494 +207 val_207 +199 val_199 +466 val_466 +208 val_208 +174 val_174 +399 val_399 +396 val_396 +247 val_247 +417 val_417 +489 val_489 +162 val_162 +377 val_377 +397 val_397 +309 val_309 +365 val_365 +266 val_266 +439 val_439 +342 val_342 +367 val_367 +325 val_325 +167 val_167 +195 val_195 +475 val_475 +17 val_17 +113 val_113 +155 val_155 +203 val_203 +339 val_339 +0 val_0 +455 val_455 +128 val_128 +311 val_311 +316 val_316 +57 val_57 +302 val_302 +205 val_205 +149 val_149 +438 val_438 +345 val_345 +129 val_129 +170 val_170 +20 val_20 +489 val_489 +157 val_157 +378 val_378 +221 val_221 +92 val_92 +111 val_111 +47 val_47 +72 val_72 +4 val_4 +280 val_280 +35 val_35 +427 val_427 +277 val_277 +208 val_208 +356 val_356 +399 val_399 +169 val_169 +382 val_382 +498 val_498 +125 val_125 +386 val_386 +437 val_437 +469 val_469 +192 val_192 +286 val_286 +187 val_187 +176 val_176 +54 val_54 +459 val_459 +51 val_51 +138 val_138 +103 val_103 +239 val_239 +213 val_213 +216 val_216 +430 val_430 +278 val_278 +176 val_176 +289 val_289 +221 val_221 +65 val_65 +318 val_318 +332 val_332 +311 val_311 +275 val_275 +137 val_137 +241 val_241 +83 val_83 +333 val_333 +180 val_180 +284 val_284 +12 val_12 +230 val_230 +181 val_181 +67 val_67 +260 val_260 +404 val_404 +384 val_384 +489 val_489 +353 val_353 +373 val_373 +272 val_272 +138 val_138 +217 val_217 +84 val_84 +348 val_348 +466 val_466 +58 val_58 +8 val_8 +411 val_411 +230 val_230 +208 val_208 +348 val_348 +24 val_24 +463 val_463 +431 val_431 +179 val_179 +172 val_172 +42 val_42 +129 val_129 +158 val_158 +119 val_119 +496 val_496 +0 val_0 +322 val_322 +197 val_197 +468 val_468 +393 val_393 +454 val_454 +100 val_100 +298 val_298 +199 val_199 +191 val_191 +418 val_418 +96 val_96 +26 val_26 +165 val_165 +327 val_327 +230 val_230 +205 val_205 +120 val_120 +131 val_131 +51 val_51 +404 val_404 +43 val_43 +436 val_436 +156 val_156 +469 val_469 +468 val_468 +308 val_308 +95 val_95 +196 val_196 +288 val_288 +481 val_481 +457 val_457 +98 val_98 +282 val_282 +197 val_197 +187 val_187 +318 val_318 +318 val_318 +409 val_409 +470 val_470 +137 val_137 +369 val_369 +316 val_316 +169 val_169 +413 val_413 +85 val_85 +77 val_77 +0 val_0 +490 val_490 +87 val_87 +364 val_364 +179 val_179 +118 val_118 +134 val_134 +395 val_395 +282 val_282 +138 val_138 +238 val_238 +419 val_419 +15 val_15 +118 val_118 +72 val_72 +90 val_90 +307 val_307 +19 val_19 +435 val_435 +10 val_10 +277 val_277 +273 val_273 +306 val_306 +224 val_224 +309 val_309 +389 val_389 +327 val_327 +242 val_242 +369 val_369 +392 val_392 +272 val_272 +331 val_331 +401 val_401 +242 val_242 +452 val_452 +177 val_177 +226 val_226 +5 val_5 +497 val_497 +402 val_402 +396 val_396 +317 val_317 +395 val_395 +58 val_58 +35 val_35 +336 val_336 +95 val_95 +11 val_11 +168 val_168 +34 val_34 +229 val_229 +233 val_233 +143 val_143 +472 val_472 +322 val_322 +498 val_498 +160 val_160 +195 val_195 +42 val_42 +321 val_321 +430 val_430 +119 val_119 +489 val_489 +458 val_458 +78 val_78 +76 val_76 +41 val_41 +223 val_223 +492 val_492 +149 val_149 +449 val_449 +218 val_218 +228 val_228 +138 val_138 +453 val_453 +30 val_30 +209 val_209 +64 val_64 +468 val_468 +76 val_76 +74 val_74 +342 val_342 +69 val_69 +230 val_230 +33 val_33 +368 val_368 +103 val_103 +296 val_296 +113 val_113 +216 val_216 +367 val_367 +344 val_344 +167 val_167 +274 val_274 +219 val_219 +239 val_239 +485 val_485 +116 val_116 +223 val_223 +256 val_256 +263 val_263 +70 val_70 +487 val_487 +480 val_480 +401 val_401 +288 val_288 +191 val_191 +5 val_5 +244 val_244 +438 val_438 +128 val_128 +467 val_467 +432 val_432 +202 val_202 +316 val_316 +229 val_229 +469 val_469 +463 val_463 +280 val_280 +2 val_2 +35 val_35 +283 val_283 +331 val_331 +235 val_235 +80 val_80 +44 val_44 +193 val_193 +321 val_321 +335 val_335 +104 val_104 +466 val_466 +366 val_366 +175 val_175 +403 val_403 +483 val_483 +53 val_53 +105 val_105 +257 val_257 +406 val_406 +409 val_409 +190 val_190 +406 val_406 +401 val_401 +114 val_114 +258 val_258 +90 val_90 +203 val_203 +262 val_262 +348 val_348 +424 val_424 +12 val_12 +396 val_396 +201 val_201 +217 val_217 +164 val_164 +431 val_431 +454 val_454 +478 val_478 +298 val_298 +125 val_125 +431 val_431 +164 val_164 +424 val_424 +187 val_187 +382 val_382 +5 val_5 +70 val_70 +397 val_397 +480 val_480 +291 val_291 +24 val_24 +351 val_351 +255 val_255 +104 val_104 +70 val_70 +163 val_163 +438 val_438 +119 val_119 +414 val_414 +200 val_200 +491 val_491 +237 val_237 +439 val_439 +360 val_360 +248 val_248 +479 val_479 +305 val_305 +417 val_417 +199 val_199 +444 val_444 +120 val_120 +429 val_429 +169 val_169 +443 val_443 +323 val_323 +325 val_325 +277 val_277 +230 val_230 +478 val_478 +178 val_178 +468 val_468 +310 val_310 +317 val_317 +333 val_333 +493 val_493 +460 val_460 +207 val_207 +249 val_249 +265 val_265 +480 val_480 +83 val_83 +136 val_136 +353 val_353 +172 val_172 +214 val_214 +462 val_462 +233 val_233 +406 val_406 +133 val_133 +175 val_175 +189 val_189 +454 val_454 +375 val_375 +401 val_401 +421 val_421 +407 val_407 +384 val_384 +256 val_256 +26 val_26 +134 val_134 +67 val_67 +384 val_384 +379 val_379 +18 val_18 +462 val_462 +492 val_492 +100 val_100 +298 val_298 +9 val_9 +341 val_341 +498 val_498 +146 val_146 +458 val_458 +362 val_362 +186 val_186 +285 val_285 +348 val_348 +167 val_167 +18 val_18 +273 val_273 +183 val_183 +281 val_281 +344 val_344 +97 val_97 +469 val_469 +315 val_315 +84 val_84 +28 val_28 +37 val_37 +448 val_448 +152 val_152 +348 val_348 +307 val_307 +194 val_194 +414 val_414 +477 val_477 +222 val_222 +126 val_126 +90 val_90 +169 val_169 +403 val_403 +400 val_400 +200 val_200 +97 val_97 \ No newline at end of file diff --git a/src/main/resources/people.json b/src/main/resources/people.json new file mode 100644 index 0000000..375a900 --- /dev/null +++ b/src/main/resources/people.json @@ -0,0 +1,1214 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + spark/people.json at 5264164a67df498b73facae207eda12ee133be7d · apache/spark + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Skip to content +
+ + + + + + + + + + +
+ +
+ + +
+ +
+ + + +
+
+
+ + + + + + + + + + + + +
+
+ +
    + + + +
  • + + + + +
  • + +
  • + +
    + +
    + + + Watch + + +
    + Notifications +
    +
    + + + + + + + +
    +
    +
    + +
    +
  • + +
  • +
    +
    + + +
    +
    + + +
    + +
  • + +
  • +
    +
    + +
  • +
+ +

+ + /spark + + +

+ +
+ + + + + + +
+
+
+ + + + + + + Permalink + + + + +
+ + +
+ + Tree: + 5264164a67 + + + + + + + +
+ +
+ + Find file + + + Copy path + +
+
+ + +
+ + Find file + + + Copy path + +
+
+ + + + +
+ + +
+
+ + 1 contributor + + +
+ +

+ Users who have contributed to this file +

+
+ +
+
+
+
+ + + + + +
+ +
+ +
+ 4 lines (3 sloc) + + 73 Bytes +
+ +
+ +
+ Raw + Blame + History +
+ + +
+ + + + +
+
+
+ + + + + + +
+ + + + + + + + + + + + + + +
{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}
+ + + +
+ +
+ + + +
+ + +
+ + +
+
+ + + +
+ +
+ +
+
+ + +
+ + + + + + +
+ + + You can’t perform that action at this time. +
+ + + + + + + + + + + + + + +
+ + + + diff --git a/src/main/resources/people.txt b/src/main/resources/people.txt new file mode 100644 index 0000000..30f7501 --- /dev/null +++ b/src/main/resources/people.txt @@ -0,0 +1,3 @@ +Michael, 29 +Andy, 30 +Justin, 19 \ No newline at end of file diff --git a/src/main/resources/user.avsc b/src/main/resources/user.avsc new file mode 100644 index 0000000..48b7ebc --- /dev/null +++ b/src/main/resources/user.avsc @@ -0,0 +1,1206 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + spark/user.avsc at 5264164a67df498b73facae207eda12ee133be7d · apache/spark + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Skip to content +
+ + + + + + + + + + +
+ +
+ + +
+ +
+ + + +
+
+
+ + + + + + + + + + + + +
+
+ +
    + + + +
  • + + + + +
  • + +
  • + +
    + +
    + + + Watch + + +
    + Notifications +
    +
    + + + + + + + +
    +
    +
    + +
    +
  • + +
  • +
    +
    + + +
    +
    + + +
    + +
  • + +
  • +
    +
    + +
  • +
+ +

+ + /spark + + +

+ +
+ + + + + + +
+
+
+ + + + + + + Permalink + + + + +
+ + +
+ + Tree: + 5264164a67 + + + + + + + +
+ +
+ + Find file + + + Copy path + +
+
+ + +
+ + Find file + + + Copy path + +
+
+ + + + +
+ + +
+
+ + 1 contributor + + +
+ +

+ Users who have contributed to this file +

+
+ +
+
+
+
+ + + + + +
+ +
+ +
+ 9 lines (8 sloc) + + 185 Bytes +
+ +
+ +
+ Raw + Blame + History +
+ + +
+ + + + +
+
+
+ + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
{"namespace": "example.avro",
"type": "record",
"name": "User",
"fields": [
{"name": "name", "type": "string"},
{"name": "favorite_color", "type": ["string", "null"]}
]
}
+ + + +
+ +
+ + + +
+ + +
+ + +
+
+ + + +
+ +
+ +
+
+ + +
+ + + + + + +
+ + + You can’t perform that action at this time. +
+ + + + + + + + + + + + + + +
+ + + + diff --git a/src/main/resources/users.avro b/src/main/resources/users.avro new file mode 100644 index 0000000..b4b8778 --- /dev/null +++ b/src/main/resources/users.avro @@ -0,0 +1,1103 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + spark/users.avro at 5264164a67df498b73facae207eda12ee133be7d · apache/spark + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Skip to content +
+ + + + + + + + + + +
+ +
+ + +
+ +
+ + + +
+
+
+ + + + + + + + + + + + +
+
+ +
    + + + +
  • + + + + +
  • + +
  • + +
    + +
    + + + Watch + + +
    + Notifications +
    +
    + + + + + + + +
    +
    +
    + +
    +
  • + +
  • +
    +
    + + +
    +
    + + +
    + +
  • + +
  • +
    +
    + +
  • +
+ +

+ + /spark + + +

+ +
+ + + + + + +
+
+
+ + + + + + + Permalink + + + + +
+ + +
+ + Tree: + 5264164a67 + + + + + + + +
+ +
+ + Find file + + + Copy path + +
+
+ + +
+ + Find file + + + Copy path + +
+
+ + + + +
+ Fetching contributors… +
+ +
+ + Cannot retrieve contributors at this time +
+
+ + + + +
+ +
+ +
+ 334 Bytes +
+ +
+ + + + +
+ + + +
+
+
+ + + + + + +
+
+ View raw +
+
+ +
+ + + +
+ + +
+ + +
+
+ + + +
+ +
+ +
+
+ + +
+ + + + + + +
+ + + You can’t perform that action at this time. +
+ + + + + + + + + + + + + + +
+ + + + diff --git a/src/main/resources/users.parquet b/src/main/resources/users.parquet new file mode 100644 index 0000000..aa52733 Binary files /dev/null and b/src/main/resources/users.parquet differ