快速开始
开发Flink程序有固定的流程。
- 获得一个执行环境。
- 加载/创建初始化数据。
- 指定操作数据的Transaction算子。
- 指定计算好的数据的存放位置。
- 调用execute()触发执行程序。
# WordCount
# Flink基本maven项目构建(java版)
- IDEA中新建Maven项目
- 添加pom.xml依赖
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<flink.version>1.13.2</flink.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
</dependencies>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
- 准备WordCout源数据 在IDEA的resource文件夹下新建一个wordcount.txt文件
hello word
hello java
word count
hello python
python you
1
2
3
4
5
2
3
4
5
# 批处理WordCount
package tech.fz.batch;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
/**
* 批处理的Wordcount
*/
public class WordCount {
public static void main(String[] args) throws Exception {
// 1、创建执行环境
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
// 2、创建数据源
String wc_txt_file = "/Users/fangzheng/IdeaProjects/flink-study/src/main/resources/wordcount.txt";
DataSet<String> inputDataSource = env.readTextFile(wc_txt_file);
// 3、对数据集进行WordCount处理
AggregateOperator<Tuple2<String, Integer>> resultSet = inputDataSource.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
@Override
public void flatMap(String value, Collector<Tuple2<String, Integer>> result) throws Exception {
String[] words = value.split(" ");
for (String word : words) {
result.collect(new Tuple2<>(word, 1));
}
}
})
.groupBy(0) // 0代表位置(目前是word),代表按照当前的word分组
.sum(1); // 1代表位置(目前是每个分割后的数字),代表按照当前的数量求和
resultSet.print(); // 打印结果
}
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# 执行结果
# 流处理WordCount
package tech.fz.stream;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
/**
* 流的Wordcount
*/
public class WordCount {
public static void main(String[] args) throws Exception {
// 1、创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// 2、创建数据源
String wc_txt_file = "/Users/fangzheng/IdeaProjects/flink-study/src/main/resources/wordcount.txt";
DataStream<String> inputDataSource = env.readTextFile(wc_txt_file);
// 3、对数据集进行WordCount处理
DataStream<Tuple2<String, Integer>> inputWord = inputDataSource.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
@Override
public void flatMap(String value, Collector<Tuple2<String, Integer>> result) throws Exception {
String[] words = value.split(" ");
for (String word : words) {
result.collect(new Tuple2<>(word, 1));
}
}
});
DataStream<Tuple2<String, Integer>> result = inputWord.keyBy(0).sum(1);
result.print();
// 4、启动流事件触发:注意这里比批处理多一个执行的过程
env.execute();
}
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# 批处理和流处理的区别
最后更新时间: 2022/7/23 10:17:11