Skip to content

Commit

Permalink
add: 1. Scala syntax study examples 2. Spark ML examples
Browse files Browse the repository at this point in the history
  • Loading branch information
Kyofin committed Sep 6, 2019
1 parent f049cdd commit be638cf
Show file tree
Hide file tree
Showing 18 changed files with 1,009,935 additions and 5 deletions.
16 changes: 11 additions & 5 deletions spark-starter/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,11 @@
<!--<version>2.1.3</version>-->
<!--</dependency>-->

<!--<dependency>-->
<!--<groupId>org.apache.spark</groupId>-->
<!--<artifactId>spark-mllib_2.11</artifactId>-->
<!--<version>2.1.3</version>-->
<!--</dependency>-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.11</artifactId>
<version>2.4.0</version>
</dependency>

<!--es-->
<dependency>
Expand Down Expand Up @@ -151,6 +151,12 @@
<!--<artifactId>hadoop-client</artifactId>-->
<!--<version>2.8.4</version>-->
<!--</dependency>-->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.11.12</version>
<!--<scope>provided</scope>-->
</dependency>


</dependencies>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package com.wugui.sparkstarter.ml;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

/**
* @program: bigdata-starter
* @author: huzekang
* @create: 2019-09-05 20:38
**/
/**
 * Loads the Douban hot-movies sample data ({@code hot_movies.csv}) into Spark.
 *
 * <p>The data path defaults to a local development location but may be
 * overridden by passing it as the first program argument, which keeps the
 * original zero-argument invocation working unchanged.
 */
public class SparkDouban {

    /** Default development location of the hot_movies.csv sample file. */
    private static final String DEFAULT_DATA_PATH =
            "/Users/huzekang/study/bigdata-starter/spark-starter/src/main/resources/data/hot_movies.csv";

    public static void main(String[] args) {
        // Allow the data file to be supplied on the command line; fall back to
        // the hard-coded path for backward compatibility with existing runs.
        String dataPath = args.length > 0 ? args[0] : DEFAULT_DATA_PATH;

        SparkSession sparkSession = SparkSession.builder().master("local").getOrCreate();
        try {
            RDD<Row> rdd = sparkSession.read().text(dataPath).rdd();
            // The read is lazy; count() materializes it so a bad path fails
            // loudly here instead of silently producing an unused RDD.
            System.out.println("Loaded " + rdd.count() + " lines from " + dataPath);
        } finally {
            // Always release the local Spark context (was leaked before).
            sparkSession.stop();
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package com.wugui.sparkstarter.ml;

import org.apache.spark.SparkContext;
import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.IDF;
import org.apache.spark.ml.feature.IDFModel;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.Arrays;
import java.util.List;

/**
* @program: bigdata-starter
* @author: huzekang
* @create: 2019-09-05 18:49
**/
/**
 * TF-IDF feature-extraction demo using Spark ML.
 *
 * <p>Pipeline: tokenize raw sentences into words, hash the words into term
 * frequency vectors with {@link HashingTF}, then rescale them with an
 * {@link IDF} model and display both the raw and rescaled features.
 */
public class SparkTFIDF {
    public static void main(String[] args) {
        SparkSession spark = SparkSession
                .builder()
                .appName("TFIDF")
                .master("local")
                .getOrCreate();

        try {
            // Small in-memory corpus: (label, sentence) pairs.
            List<Row> data = Arrays.asList(
                    RowFactory.create(0.0, "Hi I heard about Spark"),
                    RowFactory.create(0.0, "I wish Java could use case classes"),
                    RowFactory.create(1.0, "Logistic regression models are neat")
            );

            StructType schema = new StructType(new StructField[]{
                    new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
                    new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
            });
            Dataset<Row> sentenceData = spark.createDataFrame(data, schema);

            // Split each sentence into a list of lowercase words.
            Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
            Dataset<Row> wordsData = tokenizer.transform(sentenceData);

            // Hash words into sparse term-frequency vectors. The default
            // feature dimension (2^18) is kept to preserve prior behavior.
            HashingTF hashingTF = new HashingTF();
            hashingTF.setInputCol("words")
                    .setOutputCol("rawFeatures");

            Dataset<Row> featurizedData = hashingTF.transform(wordsData);
            featurizedData.show();

            // Fit an IDF model over the corpus and rescale the TF vectors.
            IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
            IDFModel idfModel = idf.fit(featurizedData);
            Dataset<Row> rescaledData = idfModel.transform(featurizedData);

            // Was an empty println that discarded the result; show it instead.
            rescaledData.select("label", "features").show();
        } finally {
            // Release the local Spark context.
            spark.stop();
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package com.wugui.sparkstarter.ml;

import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.List;

/**
 * Tokenizer demo using Spark ML.
 *
 * <p>Splits each input sentence into whitespace-delimited words with
 * {@link Tokenizer} and prints every resulting word. Note the third row uses
 * commas, so {@code Tokenizer} (whitespace-based) treats it as one token.
 */
public class TokenizerExample
{
    public static void main(String[] args)
    {
        SparkSession spark = SparkSession
                .builder()
                .appName("Tokenizer")
                .master("local")
                .getOrCreate();

        try {
            // (label, sentence) rows; row 2 is deliberately comma-separated.
            List<Row> data = java.util.Arrays.asList(
                    RowFactory.create(0, "Hi I heard about spark"),
                    RowFactory.create(1, "I wish Java could use case classes"),
                    RowFactory.create(2, "Logistic,regression,models,are,neat")
            );

            StructType schema = new StructType(new StructField[]{
                    new StructField("label", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
            });

            Dataset<Row> sentenceDataFrame = spark.createDataFrame(data, schema);

            Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
            Dataset<Row> wordsData = tokenizer.transform(sentenceDataFrame);

            // Print each tokenized word for all three rows.
            for (Row r : wordsData.select("words", "label").takeAsList(3))
            {
                List<String> words = r.getList(0);
                for (String word : words) {
                    System.out.println(word);
                }
            }
        } finally {
            // Release the local Spark context (was leaked before).
            spark.stop();
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package com.wugui.sparkstarter.ml;

import org.apache.spark.ml.feature.Word2Vec;
import org.apache.spark.ml.feature.Word2VecModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.*;

import java.util.Arrays;
import java.util.List;


/**
 * Word2Vec demo using Spark ML.
 *
 * <p>Fits a {@link Word2Vec} model on three short documents (each a list of
 * words) and prints the resulting 7-dimensional document vectors.
 */
public class Word2VecExample
{
    public static void main(String[] args)
    {
        SparkSession spark = SparkSession
                .builder()
                .appName("TFIDF")
                .master("local")
                .getOrCreate();

        try {
            // Each row is one document represented as a list of words.
            List<Row> data = Arrays.asList(
                    RowFactory.create(Arrays.asList("Hi I herd about spark".split(" "))),
                    RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
                    RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))
            );

            StructType schema = new StructType(new StructField[]{
                    new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
            });

            Dataset<Row> documentDF = spark.createDataFrame(data, schema);

            // minCount(0) keeps every word even though each appears only once
            // in this tiny corpus; vectorSize(7) keeps the demo output small.
            Word2Vec word2Vec = new Word2Vec()
                    .setInputCol("text")
                    .setOutputCol("result")
                    .setVectorSize(7)
                    .setMinCount(0);

            Word2VecModel model = word2Vec.fit(documentDF);
            Dataset<Row> result = model.transform(documentDF);
            result.show();

            // Also print each row in full (show() truncates wide vectors).
            for (Row r : result.takeAsList(10))
            {
                System.out.println(r);
            }
        } finally {
            // Release the local Spark context (was leaked before).
            spark.stop();
        }
    }
}
3 changes: 3 additions & 0 deletions spark-starter/src/main/resources/data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
基于Spark ML实现的豆瓣电影推荐系统

https://colobu.com/2015/11/30/movie-recommendation-for-douban-users-by-spark-mllib/
166 changes: 166 additions & 0 deletions spark-starter/src/main/resources/data/hot_movies.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
20645098,8.2,小王子
26259677,8.3,垫底辣妹
11808948,7.2,海绵宝宝
26253733,6.4,突然变异
25856265,6.7,烈日迷踪
26274810,6.6,侦探:为了原点
25889465,6.3,抢劫
1972724,7.3,斯坦福监狱实验
6845667,8.0,秘密特工
1866473,7.8,蚁人
25859495,8.2,思悼
25823132,8.1,暗杀
10533913,8.8,头脑特工队
25766754,8.2,年轻气盛
26393561,8.8,小萝莉的猴神大叔
26326395,5.1,真实魔鬼游戏
25955491,8.6,罪恶之家
25774051,6.8,寄生兽:完结篇
24325923,8.0,我和厄尔以及将死的女孩
2303845,7.2,刺客聂隐娘
24719063,7.9,烈日灼心
25911595,5.3,第三种爱情
25933898,6.4,爱恋
25763555,6.1,美式极端
25761178,8.3,百元之恋
25727048,7.8,福尔摩斯先生
25855951,8.3,贝利叶一家
26303865,7.7,维多利亚
26304268,6.7,致命礼物
25728010,7.5,老手
21937450,8.2,国际市场
25838463,6.2,像素大战
25821461,7.7,旅程终点
21350962,5.8,代号47
3445457,7.8,无境之兽
10773239,8.1,小男孩
24397586,8.5,小羊肖恩
26275494,7.3,橘色
26297388,7.5,这时对那时错
25955372,6.4,1980年代的爱情
25823840,6.4,奸臣
11624706,7.3,小黄人大眼萌
10741643,8.3,我的个神啊
25907004,4.6,坏姐姐之拆婚联盟
26235839,7.5,内在美
25774050,7.4,寄生兽
23769147,7.7,爱情限时恋未尽
26270517,7.8,愚人节
25958787,7.8,深夜食堂 电影版
26289144,7.6,滚蛋吧!肿瘤君
25752261,7.6,女间谍
25881628,6.7,幸存的女孩
25853129,6.3,瑞奇和闪电
25746375,7.3,我是路人甲
25753326,7.1,巴霍巴利王(上)
4075568,7.1,假期历险记
6039412,6.3,时光尽头的恋人
25870236,7.8,可爱的你
24751764,6.1,三城记
24405378,8.5,王牌特工:特工学院
3592854,8.5,疯狂的麦克斯4:狂暴之路
25830802,6.5,对风说爱你
24879839,5.4,道士下山
25774126,7.6,爷们些
26304167,8.1,出租车
25718082,7.0,念念
23761370,8.4,速度与激情7
10727641,7.8,碟中谍5:神秘国度
25745752,5.5,左耳
11540651,7.2,许三观
11776289,6.2,华丽上班族
4160540,7.5,机械姬
25956520,6.6,太平轮(下)·彼岸
26021055,4.1,栀子花开
25962735,4.4,既然青春留不住
26252157,7.5,龙三和他的七人党
25723907,7.0,捉妖记
11520649,8.2,麦克法兰
19957083,6.5,泰迪熊2
26252196,7.4,卫生间的圣母像
26147706,7.9,花与爱丽丝杀人事件
2973079,8.2,霍比特人3:五军之战
26366634,5.3,嘘!禁止想象!
3338862,6.9,终结者:创世纪
25895276,6.4,煎饼侠
3432861,6.5,黑色弥撒
6873042,6.2,明日世界
26384515,7.6,这里的黎明静悄悄
26279166,4.6,鸭王
4014396,4.1,神奇四侠2015
25823833,6.0,天将雄师
19897541,9.0,机动战士高达 THE ORIGIN I 青瞳的卡斯巴尔
24325815,6.4,非我
21345845,7.6,涉足荒野
25821585,6.5,生活残骸
24847343,4.6,小时代4:灵魂尽头
25858759,5.0,有一个地方只有我们知道
26582787,4.7,斗地主
25858785,5.8,澳门风云2
21349734,7.0,博物馆奇妙夜3
23788440,7.4,杀破狼2
25887846,6.7,传奇
25794212,5.8,分歧者2:绝地反击
6126442,6.2,一步之遥
5446197,7.2,铁拳
25862355,7.0,二十
25945356,4.3,新步步惊心
25786077,7.1,末日崩塌
10741834,7.1,复仇者联盟2:奥创纪元
25922902,7.5,唇上之歌
10827341,7.3,疯狂外星人
25881780,5.8,命中注定
10604554,6.8,躲藏
10792633,7.8,金衣女人
25856480,5.7,巴黎假期
26219652,5.7,少年班
10440138,7.8,侏罗纪世界
26328118,4.5,咒怨:完结篇
2325873,5.0,第七子:降魔之战
25944282,6.2,纸镇
25746414,5.3,暴走神探
25986688,4.5,流浪者年代记
25767747,7.4,故事的故事
21442760,7.2,最长的旅程
25872931,6.0,万物生长
26263443,5.7,恋爱中的城市
3078390,5.7,太平轮(上)
6875263,6.8,灰姑娘
24716045,6.8,远离尘嚣
6866928,5.1,进击的巨人真人版:前篇
26276359,7.1,酷毙了
25898213,7.1,军犬麦克斯
26356488,7.9,1944
26285777,5.2,有客到
24307637,6.3,江南1970
6846893,7.2,超能查派
25853727,7.3,破风
24753810,7.0,战狼
3608742,7.6,冲出康普顿
26599083,5.0,妈妈的朋友
25843352,7.2,如此美好
25908042,4.6,横冲直撞好莱坞
25912924,6.3,暗杀教室
25907088,5.0,魔镜
25809260,6.5,工作女郎
5154799,5.4,木星上行
25805054,6.4,十万个冷笑话
22522269,6.7,战斧骨
24872023,5.3,贵族大盗
24743709,4.1,北京纽约
25717176,6.2,新宿天鹅
24751757,4.8,微爱之渐入佳境
26265099,6.8,白河夜船
25835293,6.4,失孤
25868191,5.1,极道大战争
25779218,5.4,匆匆那年
25861695,7.1,海月姬
25731554,7.4,西部慢调
3006769,6.9,大眼睛
10440076,5.6,最后的女巫猎人
22556810,4.5,猛龙特囧
7003416,4.4,冲上云霄
25919385,7.8,长寿商会
11541282,5.8,魔力麦克2
10793610,6.4,法老与众神
25778488,4.1,宅女侦探桂香
Loading

0 comments on commit be638cf

Please sign in to comment.