Example
scala> val inputrdd = sc.parallelize(Seq(
     |   ("key1", 1),
     |   ("key2", 2),
     |   ("key1", 3)))
inputrdd: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[80] at parallelize at <console>:21

scala> val noPartition = inputrdd.reduceByKey((x, y) => x + y)
noPartition: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[81] at reduceByKey at <console>:23

scala> noPartition.partitions.length
res50: Int = 8

scala> // Here the number of partitions is given as the second argument
scala> val withPartition = inputrdd.reduceByKey((x, y) => x + y, 11)
withPartition: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[82] at reduceByKey at <console>:23

scala> withPartition.partitions.length
res51: Int = 11

scala> val repartitioned = withPartition.repartition(16)
repartitioned: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[86] at repartition at <console>:25

scala> repartitioned.partitions.length
res52: Int = 16

scala> val coalesced = if (4 < repartitioned.partitions.length) {
     |   // Note: use coalesce() only when the new partition count is
     |   // less than the current partition count of the RDD
     |   repartitioned.coalesce(4)
     | } else {
     |   repartitioned
     | }
coalesced: org.apache.spark.rdd.RDD[(String, Int)] = CoalescedRDD[87] at coalesce at <console>:30

scala> coalesced.partitions.length
res53: Int = 4
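Two points the transcript implies but does not show. First, the 8 partitions reported in res50 are Spark's default parallelism (typically the core count in local mode), which reduceByKey falls back to when no partition count is passed. Second, repartition(n) is implemented as coalesce(n, shuffle = true), which is why a plain coalesce() can only shrink the partition count while repartition() can also grow it. The sketch below assumes the same spark-shell session as above; sc.defaultParallelism, glom(), and coalesce(n, shuffle) are standard RDD API.

// Sketch only; assumes the same spark-shell session as the example above.

// The fallback partition count reduceByKey used for noPartition
// (8 in the session above):
println(sc.defaultParallelism)

// glom() turns each partition into an array, so the coalesce(4) layout
// becomes visible: two aggregated records spread over four partitions.
coalesced.glom().collect().foreach(p => println(p.mkString("[", ", ", "]")))

// repartition(n) is coalesce(n, shuffle = true) under the hood, so a
// shuffled coalesce can also increase the partition count:
val grown = coalesced.coalesce(16, shuffle = true)
println(grown.partitions.length)   // prints 16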
Reference
Learning Spark : Page 57