package org.apache.spark.examples

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

/**
 * Counts the words in a text file whose occurrence count is at least a
 * threshold, then counts how often each character appears among those
 * surviving words, and prints the character counts.
 *
 * Usage: SparkWordCount [inputPath] [threshold]
 * Both arguments are optional; omitting them reproduces the original
 * hard-coded example (local file, threshold = 1).
 */
object SparkWordCount {

  // Defaults preserved from the original hard-coded example values.
  private val DefaultInput =
    "file:/Users/xxx/Documents/hadoopTools/scala/eclipse/Eclipse.app/Contents/MacOS/workspace/spark1.3.1/src/main/resources/people.txt"
  private val DefaultThreshold = 1

  def main(args: Array[String]): Unit = {
    // Allow overriding the input path and threshold from the command line;
    // with no args the behavior is identical to the original example.
    val inputPath = args.headOption.getOrElse(DefaultInput)
    val threshold = if (args.length > 1) args(1).toInt else DefaultThreshold

    val conf = new SparkConf().setAppName("Book example: Scala").setMaster("local[2]")
    val sc   = new SparkContext(conf)
    try {
      // Split each line of the document into words on single spaces.
      val tokenized = sc.textFile(inputPath).flatMap(_.split(" "))

      // Count the occurrences of each word.
      val wordCounts = tokenized.map((_, 1)).reduceByKey(_ + _)

      // Keep only words occurring at least `threshold` times.
      val filtered = wordCounts.filter(_._2 >= threshold)

      // Count each character's occurrences across the surviving words
      // (one count per distinct word, not weighted by word frequency —
      // this matches the original example's behavior).
      val charCounts = filtered.flatMap(_._1.toCharArray).map((_, 1)).reduceByKey(_ + _)

      println(charCounts.collect().mkString(", "))
    } finally {
      // Ensure the SparkContext is released even if a stage fails.
      sc.stop()
    }
  }
}