Following the principle that a beginner's mindset is essential when learning to program, I took the same approach with Scala as I did with Python: typing every line of code out myself. What follows is the result, with the trial-and-error content preserved.
scala> var rdd1 = sc.parallelizera(List(12,3,45,5,6,7,7))
<console>:24: error: value parallelizera is not a member of org.apache.spark.SparkContext
       var rdd1 = sc.parallelizera(List(12,3,45,5,6,7,7))
                     ^

scala> var rdd1 = sc.parallelize(List(12,3,45,5,6,7,7))
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:24

scala> rdd1.collect
collect   collectAsync

scala> rdd1.collect
res0: Array[Int] = Array(12, 3, 45, 5, 6, 7, 7)

scala> var rdd2 = rdd1.map(_*2)
rdd2: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[1] at map at <console>:26

scala> rdd2.co
coalesce   collect   collectAsync   compute   context   copy   count   countApprox
countApproxDistinct   countAsync   countByValue   countByValueApprox

scala> rdd2.collect
res1: Array[Int] = Array(24, 6, 90, 10, 12, 14, 14)

scala> var rdd3 = rdd2.sortBy(x=>x,true)
rdd3: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[6] at sortBy at <console>:28

scala> rdd3.collect
res2: Array[Int] = Array(6, 10, 12, 14, 14, 24, 90)

scala> var rdd3 = rdd2.sortBy(x=>x,false)
rdd3: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[11] at sortBy at <console>:28

scala> rdd3.collect
res3: Array[Int] = Array(90, 24, 14, 14, 12, 10, 6)

scala> var rdd3 = rdd2.sortBy(x,x,false)
<console>:28: error: not found: value x
       var rdd3 = rdd2.sortBy(x,x,false)
                              ^
<console>:28: error: not found: value x
       var rdd3 = rdd2.sortBy(x,x,false)
                                ^
<console>:28: error: type mismatch;
 found   : Boolean(false)
 required: Int
       var rdd3 = rdd2.sortBy(x,x,false)
                                  ^

scala> var rdd3 = rdd2.sortBy((_,)_,false)
<console>:1: error: illegal start of simple expression
       var rdd3 = rdd2.sortBy((_,)_,false)
                                  ^

scala> var rdd3 = rdd2.sortBy((_,_),false)
<console>:28: error: missing parameter type for expanded function ((x$1, x$2) => scala.Tuple2(x$1, x$2))
Error occurred in an application involving default arguments.
       var rdd3 = rdd2.sortBy((_,_),false)
                               ^
<console>:28: error: missing parameter type for expanded function ((x$1: <error>, x$2) => scala.Tuple2(x$1, x$2))
Error occurred in an application involving default arguments.
       var rdd3 = rdd2.sortBy((_,_),false)
                                 ^

scala> var rdd3 = rdd2.sortBy((__),false)
<console>:28: error: not found: value __
Error occurred in an application involving default arguments.
       var rdd3 = rdd2.sortBy((__),false)
                               ^

scala> var rdd3 = rdd2.sortBy((x)=>x,false)
rdd3: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[16] at sortBy at <console>:28

scala> rdd3.collect
res4: Array[Int] = Array(90, 24, 14, 14, 12, 10, 6)
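All of the dead ends above come down to sortBy's signature: the first argument is a key function and the second is the ascending flag. As the compiler message shows, (_,_) expands to a tuple-building function, and a bare underscore can't get its parameter type inferred here (the default arguments make that worse), so an explicit lambda is the reliable form. A minimal sketch of the variants that do work, assuming a fresh spark-shell session:

// sortBy(f, ascending): f maps each element to its sort key
val nums    = sc.parallelize(List(12, 3, 45, 5, 6, 7, 7))
val doubled = nums.map(_ * 2)

val asc   = doubled.sortBy(x => x, true)    // explicit lambda, ascending
val desc  = doubled.sortBy(x => x, false)   // same key, descending
val desc2 = doubled.sortBy(x => -x)         // or flip the key instead of the flag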
(spark-shell restarted: Spark version 2.1.0, Scala version 2.11.8, Java 1.8.0_144)

scala> var rdd1 = sc.parallelize(List(12,3,45,5,6,7,7))
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:24

scala> var rdd3 = rdd1.sortBy(x=>x+3,false)
rdd3: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[5] at sortBy at <console>:26

scala> rdd3.collect
res0: Array[Int] = Array(45, 12, 7, 7, 6, 5, 3)

scala> var rdd4 = rdd3.filter(_>6)
rdd4: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[6] at filter at <console>:28

scala> rdd4.collect
res1: Array[Int] = Array(45, 12, 7, 7)

scala> var rdd5 = rdd3.distinct
rdd5: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[9] at distinct at <console>:28

scala> rdd5.collect
res2: Array[Int] = Array(45, 6, 3, 12, 7, 5)

scala> var rdd1 = sc.parallelize(List(("Tom",1000),("Mary",999),("TTT",0900)))
<console>:1: error: Decimal integer literals may not have a leading zero. (Octal syntax is obsolete.)
       var rdd1 = sc.parallelize(List(("Tom",1000),("Mary",999),("TTT",0900)))
                                                                       ^

scala> var rdd1 = sc.parallelize(List(("Tom",1000),("Mary",999),("TTT",900)))
rdd1: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[10] at parallelize at <console>:24

scala> var rdd2 = sc.parallelize(List(("Tom",1000),("Mary",999),("666",900)))
rdd2: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[11] at parallelize at <console>:24

scala> var rdd2 = sc.parallelize(List(("Tom",1050),("Mary",888),("666",900)))
rdd2: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[12] at parallelize at <console>:24

scala> rdd3 = rdd1.union(rdd2)
<console>:30: error: type mismatch;
 found   : org.apache.spark.rdd.RDD[(String, Int)]
 required: org.apache.spark.rdd.RDD[Int]
       rdd3 = rdd1.union(rdd2)
                         ^

scala> var rdd3 = rdd1.union(rdd2)
rdd3: org.apache.spark.rdd.RDD[(String, Int)] = UnionRDD[13] at union at <console>:28

scala> rdd3.collect
res3: Array[(String, Int)] = Array((Tom,1000), (Mary,999), (TTT,900), (Tom,1050), (Mary,888), (666,900))

scala> var rdd4 = rdd3.groupByKey()
rdd4: org.apache.spark.rdd.RDD[(String, Iterable[Int])] = ShuffledRDD[14] at groupByKey at <console>:30

scala> rdd4.collect
res4: Array[(String, Iterable[Int])] = Array((TTT,CompactBuffer(900)), (666,CompactBuffer(900)), (Mary,CompactBuffer(999, 888)), (Tom,CompactBuffer(1000, 1050)))

scala> var rdd5 = rdd3.reduceByKey(_+_)
rdd5: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[15] at reduceByKey at <console>:30

scala> rdd5.collect
res5: Array[(String, Int)] = Array((TTT,900), (666,900), (Mary,1887), (Tom,2050))

scala> reducebykey底层调用的groupbykey
<console>:24: error: not found: value reducebykey底层调用的groupbykey
       reducebykey底层调用的groupbykey
       ^

(That last input was a note to self typed into the REPL by mistake; it says "reduceByKey calls groupByKey under the hood.")
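The note actually has it backwards: in the Spark source, reduceByKey and groupByKey are both built on combineByKeyWithClassTag, and reduceByKey does not call groupByKey. The real difference is that reduceByKey merges values within each partition before the shuffle (map-side combine), so less data crosses the network. A sketch of the two routes to the same per-key totals, reusing the data from this session:

// same per-key sums two ways
val pairs = sc.parallelize(List(("Tom",1000), ("Mary",999), ("TTT",900),
                                ("Tom",1050), ("Mary",888), ("666",900)))

// groupByKey ships every value through the shuffle, then we sum each buffer
val viaGroup  = pairs.groupByKey().mapValues(_.sum)

// reduceByKey pre-aggregates inside each partition before shuffling
val viaReduce = pairs.reduceByKey(_ + _)

// both collect to Array((TTT,900), (666,900), (Mary,1887), (Tom,2050)), ordering aside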
scala> var rdd1 = sc.parallelize(List(("Tom",1000),("Mary",999),("TTT",900))) rdd1: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[0] at parallelize at <console>:24 scala> var rdd2 = sc.parallelize(List(("Tom",1000),("Mary",999),("666",900))) rdd2: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[1] at parallelize at <console>:24 scala> var rdd3 = rdd1.cogroup(rdd2) rdd3: org.apache.spark.rdd.RDD[(String, (Iterable[Int], Iterable[Int]))] = MapPartitionsRDD[3] at cogroup at <console>:28 scala> rdd3.collect res0: Array[(String, (Iterable[Int], Iterable[Int]))] = Array((TTT,(CompactBuffer(900),CompactBuffer())), (666,(CompactBuffer(),CompactBuffer(900))), (Tom,(CompactBuffer(1000),CompactBuffer(1000))), (Mary,(CompactBuffer(999),CompactBuffer(999)))) scala> var rdd2 = sc.parallelize(List(("Tom",1000),("Mary",999),("Tom",888),("666",900))) rdd2: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[4] at parallelize at <console>:24 scala> var rdd3 = rdd1.cogroup(rdd2) rdd3: org.apache.spark.rdd.RDD[(String, (Iterable[Int], Iterable[Int]))] = MapPartitionsRDD[6] at cogroup at <console>:28 scala> rdd3.collect res1: Array[(String, (Iterable[Int], Iterable[Int]))] = Array((TTT,(CompactBuffer(900),CompactBuffer())), (666,(CompactBuffer(),CompactBuffer(900))), (Tom,(CompactBuffer(1000),CompactBuffer(888, 1000))), (Mary,(CompactBuffer(999),CompactBuffer(999)))) scala> var rdd4 = rdd1.sortBykey(true) <console>:26: error: value sortBykey is not a member of org.apache.spark.rdd.RDD[(String, Int)] var rdd4 = rdd1.sortBykey(true) ^ scala> var rdd4 = rdd1.sortByKey(true) rdd4: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[9] at sortByKey at <console>:26 scala> rdd4.collect res2: Array[(String, Int)] = Array((Mary,999), (TTT,900), (Tom,1000)) scala> rdd5 = rdd1.union(rdd2) <console>:30: error: not found: value rdd5 val $ires6 = rdd5 ^ <console>:28: error: not found: value rdd5 rdd5 = rdd1.union(rdd2) ^ scala> var rdd5 = rdd1.union(rdd2) rdd5: org.apache.spark.rdd.RDD[(String, Int)] = UnionRDD[10] at union at <console>:28 scala> rdd5.collect res3: Array[(String, Int)] = Array((Tom,1000), (Mary,999), (TTT,900), (Tom,1000), (Mary,999), (Tom,888), (666,900)) scala> var rdd6 = rdd5.map(x=>(x._2,x._1)).sortByKey(true).map(x=>(x._2,x._1)) rdd6: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[15] at map at <console>:30 scala> rdd6.collect res4: Array[(String, Int)] = Array((Tom,888), (666,900), (TTT,900), (Mary,999), (Mary,999), (Tom,1000), (Tom,1000)) scala> var rdd7 = rdd5.sortBy(x=>x._2,true) rdd7: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[20] at sortBy at <console>:30 scala> rdd7.collect res5: Array[(String, Int)] = Array((Tom,888), (TTT,900), (666,900), (Mary,999), (Mary,999), (Tom,1000), (Tom,1000)) 明天再补一部分