Scala version

package wordcount

import org.apache.spark.{SparkConf, SparkContext}

object wordCountScala extends App {
  val conf = new SparkConf().setAppName("Wordcount").setMaster("local")
  val sc = new SparkContext(conf)
  // Read the input file, split lines into words, pair each word with 1,
  // then merge the counts per word
  val line = sc.textFile("F:\\大数据\\wordcount.txt")
  val result = line.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
  result.foreach(println)
}
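For reference, with a hypothetical input file (the contents below are an assumption, not from the original post):

hello spark
hello world

this job prints one (word, count) tuple per line, in no guaranteed order, e.g.:

(hello,2)
(spark,1)
(world,1)

The Java versions below produce the same output.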
Java version
package wordcount;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Iterator;

public class wordCountJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("javaWordCount").setMaster("local");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        JavaRDD<String> javardd = jsc.textFile("F:\\大数据\\wordcount.txt");
        // Split each line into words; note that in Spark 2.x call() must
        // return an Iterator (Spark 1.x used Iterable)
        JavaRDD<String> word = javardd.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String s) throws Exception {
                String[] s1 = s.split(" ");
                return Arrays.asList(s1).iterator();
            }
        });
        // Pair each word with an initial count of 1
        JavaPairRDD<String, Integer> javaPairRDD = word.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<>(s, 1);
            }
        });
        // Merge the counts per word with reduceByKey
        JavaPairRDD<String, Integer> result = javaPairRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer integer, Integer integer2) throws Exception {
                return integer + integer2;
            }
        });
        // Print each (word, count) pair
        result.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
                System.out.println(stringIntegerTuple2);
            }
        });
    }
}
Java concise version (lambda syntax)
package wordcount;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.Arrays;

public class javaWordcountSimple {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("javaWordCount").setMaster("local");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        JavaRDD<String> javardd = jsc.textFile("F:\\大数据\\wordcount.txt");
        // Split each line into words
        JavaRDD<String> word = javardd.flatMap((FlatMapFunction<String, String>) s -> {
            String[] s1 = s.split(" ");
            return Arrays.asList(s1).iterator();
        });
        // Pair each word with an initial count of 1
        JavaPairRDD<String, Integer> javaPairRDD = word.mapToPair((PairFunction<String, String, Integer>) m -> new Tuple2<>(m, 1));
        // Merge the counts per word
        JavaPairRDD<String, Integer> result = javaPairRDD.reduceByKey((Function2<Integer, Integer, Integer>) Integer::sum);
        // Print each (word, count) pair
        result.foreach(s -> System.out.println(s));
    }
}
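Since flatMap, mapToPair, reduceByKey, and foreach each take a single functional interface, the explicit casts in the version above are optional; the compiler can infer the types on its own. Below is a minimal sketch of the same pipeline with the casts dropped and the stages chained; the class name javaWordcountChained is mine, and the input path is just the one the post already uses:

package wordcount;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;

public class javaWordcountChained {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("javaWordCount").setMaster("local");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        jsc.textFile("F:\\大数据\\wordcount.txt")
           .flatMap(s -> Arrays.asList(s.split(" ")).iterator()) // split lines into words
           .mapToPair(w -> new Tuple2<>(w, 1))                   // pair each word with 1
           .reduceByKey(Integer::sum)                            // merge counts per word
           .foreach(t -> System.out.println(t));                 // print each (word, count)
    }
}

Behavior is identical to both longer Java versions; chaining simply avoids naming the intermediate RDDs.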