scala> // Sending a value from the driver to the worker nodes without
scala> // using a broadcast variable: localVal is referenced directly
scala> // inside the function passed to map.
scala> val input = sc.parallelize(List(1, 2, 3))
input: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[17] at parallelize at <console>:27

scala> val localVal = 2
localVal: Int = 2

scala> val added = input.map(x => x + localVal)
added: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[18] at map at <console>:31

scala> // Elements are printed in no guaranteed order.
scala> added.foreach(println)
4
3
5

scala> // The local variable is once again transferred to the worker nodes
scala> // for the next operation, because this function references it too.
scala> val multiplied = input.map(x => x * localVal)
multiplied: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[19] at map at <console>:31

scala> multiplied.foreach(println)
4
6
2
scala> // Same two operations as before, but the shared value is wrapped in a
scala> // broadcast variable and read through .value inside each function.
scala> // Broadcast variables are read-only and are meant for sending large
scala> // values to all worker nodes efficiently.
scala> val broadcastVar = sc.broadcast(2)
broadcastVar: org.apache.spark.broadcast.Broadcast[Int] = Broadcast(14)

scala> val added = input.map(_ + broadcastVar.value)
added: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[20] at map at <console>:31

scala> added.foreach(println)
5
3
4

scala> val multiplied = input.map(_ * broadcastVar.value)
multiplied: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[21] at map at <console>:31

scala> multiplied.foreach(println)
6
4
2

scala>