
Using Hadoop to count the occurrences of each word across multiple text files

This post walks through three versions of a Hadoop MapReduce WordCount program that counts how often each word appears across multiple text files: the classic example against the new mapreduce API, the same job written with the old mapred API, and a rewrite using Tool/ToolRunner.

Version 1: program source code

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCount {

    public static class WordCountMap extends
            Mapper<LongWritable, Text, Text, IntWritable> {

        private final IntWritable one = new IntWritable(1); // output value: always 1
        private Text word = new Text();                     // output key: a word

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Consumes the <offset, line> pairs produced by TextInputFormat
            // and emits <word, 1> pairs.
            String line = value.toString();                     // one line of text
            StringTokenizer token = new StringTokenizer(line);  // split the line on whitespace
            while (token.hasMoreTokens()) {
                word.set(token.nextToken());  // the word becomes the output key
                context.write(word, one);     // hand the intermediate <word, 1> pair to the reducer
            }
        }
    }

    public static class WordCountReduce extends
            Reducer<Text, IntWritable, Text, IntWritable> {

        public void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf);
        job.setJarByClass(WordCount.class);
        job.setJobName("wordcount");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(WordCountMap.class);
        job.setReducerClass(WordCountReduce.class);

        job.setInputFormatClass(TextInputFormat.class);   // produces the key/value pairs the mapper consumes
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
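One refinement worth knowing, not part of the listing above: because this reduce is just an associative sum, the same class can also be registered as a combiner, so partial counts get merged on the map side before the shuffle. A minimal sketch, added to main():

// Optional: merge <word, 1> pairs map-side before the shuffle.
// Safe here because addition is associative and commutative.
job.setCombinerClass(WordCountReduce.class);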

1 Compile the source

javac -classpath /opt/hadoop-1.2.1/hadoop-core-1.2.1.jar:/opt/hadoop-1.2.1/lib/commons-cli-1.2.jar -d ./word_count_class/ WordCount.java

This compiles the source into class files placed in the word_count_class directory under the current folder; the directory has to exist before you compile.
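For example:

mkdir word_count_class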

2 Package the classes into a jar

Change into the word_count_class directory and pack everything in it:

jar -cvf wordcount.jar *
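To sanity-check the archive, list its contents; you should see WordCount.class plus the two inner classes (WordCount$WordCountMap.class and WordCount$WordCountReduce.class):

jar -tf wordcount.jar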

3 Upload the input files

First create a directory in HDFS to hold this job's input:

hadoop fs -mkdir input_wordcount

Upload all the text files in the local input directory to input_wordcount in HDFS:

hadoop fs -put input/* input_wordcount/
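You can confirm the upload with:

hadoop fs -ls input_wordcount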

Note: do not create the output directory before running the job; Hadoop creates it itself and aborts if it already exists.
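If a previous run left the directory behind, remove it before resubmitting (Hadoop 1.x syntax):

hadoop fs -rmr output_wordcount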

4 Upload the jar and run it

hadoop jar word_count_class/wordcount.jar input_wordcount output_wordcount

5 View the results

List the job's output directory:

hadoop fs -ls output_wordcount

Print the output:

hadoop fs -cat output_wordcount/part-r-00000
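Each output line is one word and its count, separated by a tab, for example (illustrative; the actual words depend on your input files):

Hadoop	1
Hello	4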

Version 2: the program from my own hands-on run (old mapred API)

The Map class

package com.zln.chapter03;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

import java.io.IOException;
import java.util.StringTokenizer;

/**
 * Created by sherry on 15-7-12.
 */
public class WordCountMap extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1); // emit 1 for every word
    private Text word = new Text();

    @Override
    public void map(LongWritable longWritable, Text text,
                    OutputCollector<Text, IntWritable> outputCollector,
                    Reporter reporter) throws IOException {
        String line = text.toString();
        StringTokenizer tokenizer = new StringTokenizer(line); // split the line into words
        while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            outputCollector.collect(word, one);
        }
    }
}

The Reduce class

package com.zln.chapter03;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

import java.io.IOException;
import java.util.Iterator;

/**
 * Created by sherry on 15-7-12.
 */
public class WordCountReduce extends MapReduceBase
        implements Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    public void reduce(Text text, Iterator<IntWritable> iterator,
                       OutputCollector<Text, IntWritable> outputCollector,
                       Reporter reporter) throws IOException {
        int sum = 0;
        while (iterator.hasNext()) {
            sum += iterator.next().get();
        }
        outputCollector.collect(text, new IntWritable(sum));
    }
}

The main class

package com.zln.chapter03;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

import java.io.IOException;

/**
 * Created by sherry on 15-7-12.
 */
public class WordCount {
    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("wordCount");

        // output key/value types
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        // map and reduce classes
        conf.setMapperClass(WordCountMap.class);
        conf.setReducerClass(WordCountReduce.class);

        // input format
        conf.setInputFormat(TextInputFormat.class);
        // output format
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}

Prepare the input files

file1

Hello Word By Word

Hello Word By zln

file2

Hello Hadoop

Hello GoodBye

Put both files in the same local directory: /home/sherry/IdeaProjects/Hadoop/WordCount/輸入文件準備

Compile the classes and package them into a jar

I build the jar with IDEA; don't forget to specify the main class when creating the jar artifact.
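If you'd rather package from the command line, something like the following also works; the class output path here is an assumption based on IDEA's default layout:

jar -cvfe WordCount.jar com.zln.chapter03.WordCount -C out/production/WordCount .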

Upload the input files

root@sherry:/opt/hadoop-1.2.1# hadoop fs -mkdir /user/root/zln/WordCount/InputFiles

root@sherry:/opt/hadoop-1.2.1# hadoop fs -put /home/sherry/IdeaProjects/Hadoop/WordCount/輸入文件準備/* /user/root/zln/WordCount/InputFiles

Upload the jar and run it

root@sherry:/opt/hadoop-1.2.1# hadoop jar /home/sherry/IdeaProjects/Hadoop/out/artifacts/WordCount_jar/WordCount.jar /user/root/zln/WordCount/InputFiles /user/root/zln/WordCount/OutputFiles

Check the results

root@sherry:/opt/hadoop-1.2.1# hadoop fs -ls /user/root/zln/WordCount/OutputFiles

root@sherry:/opt/hadoop-1.2.1# hadoop fs -text /user/root/zln/WordCount/OutputFiles/part-00000
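Given file1 and file2 above, the output should look like this (keys in Text's byte order, tab-separated):

By	2
GoodBye	1
Hadoop	1
Hello	4
Word	3
zln	1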

Version 3: Map, Reduce, and main rewritten against the new API

Map

package com.zln.chapter03;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.StringTokenizer;

/**
 * Created by sherry on 15-7-12.
 */
public class WordCountMap extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1); // emit 1 for every word
    private Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        StringTokenizer tokenizer = new StringTokenizer(line); // split the line into words
        while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            context.write(word, one);
        }
    }
}

Reduce

package com.zln.chapter03;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Created by sherry on 15-7-12.
 */
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable intWritable : values) {
            sum += intWritable.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

Main

package com.zln.chapter03;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Created by sherry on 15-7-12.
 */
public class WordCount extends Configured implements Tool {

    public int run(String[] args) throws Exception {
        Job job = new Job(getConf()); // getConf() carries options parsed by ToolRunner
        job.setJarByClass(WordCount.class);
        job.setJobName("WordCount");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(WordCountMap.class);
        job.setReducerClass(WordCountReduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new WordCount(), args);
        System.exit(ret);
    }
}
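A practical payoff of the ToolRunner version: GenericOptionsParser consumes standard Hadoop options before run() sees the remaining arguments, so you can tune the job from the command line without recompiling, e.g. (the reducer count here is just an example):

hadoop jar WordCount.jar -D mapred.reduce.tasks=2 /user/root/zln/WordCount/InputFiles /user/root/zln/WordCount/OutputFiles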
