06 December 2015

Actions on Pair RDDs : countByKey(), collectAsMap() & lookup()

Example
scala> val rdd = sc.parallelize(Seq(  // pair RDD of (String key, Int value) tuples; each of the three keys appears twice
     |                ("math",    55),
     |                ("math",    56),
     |                ("english", 57),
     |                ("english", 58),
     |                ("science", 59),
     |                ("science", 54)))
rdd: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[31] at parallelize at <console>:21

scala> //Example : countByKey() - counts how many pairs exist for each key and returns the counts as a Map[key, Long]; the values themselves are ignored
scala> val result1 = rdd.countByKey()
result1: scala.collection.Map[String,Long] = Map(math -> 2, english -> 2, science -> 2)

scala> //Example : collectAsMap() - returns the pairs as a Map[key, value]
scala> //Note: the result is not a multimap. Each key keeps a single value, so the six input pairs collapse to three entries below.
scala> val result2 = rdd.collectAsMap()
result2: scala.collection.Map[String,Int] = Map(math -> 56, science -> 54, english -> 58)

scala> //Example : lookup() - returns every value stored under the given key as a Seq (here both "math" values)
scala> val result3 = rdd.lookup("math")
result3: Seq[Int] = WrappedArray(55, 56)