- [raj@Rajkumars-MacBook-Pro ~]$spark-shell --master local[*]
- 2016-05-28 15:37:24.325 java[3907:6309927] Unable to load realm info from SCDynamicStore
- Welcome to
- ____ __
- / __/__ ___ _____/ /__
- _\ \/ _ \/ _ `/ __/ '_/
- /___/ .__/\_,_/_/ /_/\_\ version 1.6.1
- /_/
-
- Using Scala version 2.10.5 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_45)
- Type in expressions to have them evaluated.
- Type :help for more information.
- Spark context available as sc.
- SQL context available as sqlContext.
-
- scala> val lines = sc.parallelize(List("This is a word", "This is another word"), 7)
- lines: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[0] at parallelize at <console>:27
- scala> // Number of partitions
- scala> lines.partitions.size
- res0: Int = 7
- scala> // flatMap() : One of many transformations
- scala> val words = lines.flatMap(line => line.split(" "))
- words: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[1] at flatMap at <console>:29
- scala> val units = words.map ( word => (word, 1) )
- units: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[2] at map at <console>:31
- scala> val counts = units.reduceByKey ( (x, y) => x + y )
- counts: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[3] at reduceByKey at <console>:33
- scala> counts.toDebugString
- res1: String =
- (7) ShuffledRDD[3] at reduceByKey at <console>:33 []
- +-(7) MapPartitionsRDD[2] at map at <console>:31 []
- | MapPartitionsRDD[1] at flatMap at <console>:29 []
- | ParallelCollectionRDD[0] at parallelize at <console>:27 []
- scala> // collect() : One of many actions
- scala> counts.collect()
- res2: Array[(String, Int)] = Array((This,2), (is,2), (another,1), (a,1), (word,2))