18 June 2016

Broadcast Variable Example

scala> // Sending a value from Driver to Worker Nodes without

scala> // using a Broadcast variable

scala> val input = sc.parallelize(List(1, 2, 3))
input: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[17] at parallelize at <console>:27

scala> val localVal = 2
localVal: Int = 2

scala> val added = input.map( x => x + localVal)
added: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[18] at map at <console>:31

scala> added.foreach(println)
4
3
5

scala> //** Local variable is once again transferred to worker nodes

scala> //   for the next operation

scala> val multiplied = input.map( x => x * 2)
multiplied: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[19] at map at <console>:29

scala> multiplied.foreach(println)
4
6
2

scala> // Sending a read-only value using a Broadcast variable

scala> // It can be used to send large read-only values to all worker

scala> // nodes efficiently

scala> val broadcastVar = sc.broadcast(2)
broadcastVar: org.apache.spark.broadcast.Broadcast[Int] = Broadcast(14)

scala> val added = input.map(x => broadcastVar.value + x)
added: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[20] at map at <console>:31

scala> added.foreach(println)
5
3
4

scala> val multiplied = input.map(x => broadcastVar.value * x)
multiplied: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[21] at map at <console>:31

scala> multiplied.foreach(println)
6
4
2

scala>