- scala> // Sending a value from Driver to Worker Nodes without
-
- scala> // using Broadcast variable
-
- scala> val input = sc.parallelize(List(1, 2, 3))
- input: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[17] at parallelize at <console>:27
-
- scala> val localVal = 2
- localVal: Int = 2
-
- scala> val added = input.map( x => x + localVal)
- added: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[18] at map at <console>:31
-
- scala> added.foreach(println)
- 4
- 3
- 5
-
- scala> // Local variable is once again transferred to worker nodes
-
- scala> // for the next operation
-
- scala> val multiplied = input.map( x => x * 2)
- multiplied: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[19] at map at <console>:29
-
- scala> multiplied.foreach(println)
- 4
- 6
- 2
-
- scala> // Sending a read-only value using Broadcast variable
-
- scala> // Can be used to send large read-only values to all worker
-
- scala> // nodes efficiently
-
- scala> val broadcastVar = sc.broadcast(2)
- broadcastVar: org.apache.spark.broadcast.Broadcast[Int] = Broadcast(14)
-
- scala> val added = input.map(x => broadcastVar.value + x)
- added: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[20] at map at <console>:31
-
- scala> added.foreach(println)
- 5
- 3
- 4
-
- scala> val multiplied = input.map(x => broadcastVar.value * x)
- multiplied: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[21] at map at <console>:31
-
- scala> multiplied.foreach(println)
- 6
- 4
- 2
-
- scala>
-