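前提:要读取的文件已经 put 到 HDFS 上了,但运行时仍然报错。仔细想想,为什么同样是读取两个文件,却只有后面那个报“文件不存在”呢?对照代码后发现,两处的读取方式不同:前一个是通过 SparkContext(sc.textFile)读取,后一个是通过 java.io.File 读取。java.io.File 只能访问本地文件系统,无法识别 HDFS 上的路径,所以 HDFS 上的文件应当通过 Spark 创建的 sc 对象(或 Hadoop 的 FileSystem API)来读取。按照这个思路修改代码后,程序才运行成功。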
JavaRDD<String> linesRDD2 = sc.textFile("src/main/resources/santi/bad_words.txt");
//JavaRDD<String> linesRDD2 = sc.textFile("/tmp/bad_words.txt");
// Path path = Paths.get("src/main/resources/santi/santiquanji_liucixin.txt");
// byte[] bytes = Files.readAllBytes(path);
// String text = new String(bytes, Charset.defaultCharset());
// System.out.println(text);
// ArrayList<String> bad_words = new ArrayList<>();
List<String> bad_words = linesRDD2.collect();
sc.parallelize(bad_words);
//File file = new File("src/main/resources/santi/bad_words.txt");
/*File file = new File("hdfs://hadoop:9000/user/hadoop/bad_words.txt");
// 将字节流向字符流转换
InputStreamReader inputStreamReader = new InputStreamReader(new FileInputStream(file),
"utf-8");
// 创建字符流缓冲区
BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
String str = null;
while ((str = bufferedReader.readLine()) != null) {
bad_words.add(str);
}*/