Driver.java
package driver;
import java.io.IOException;
import mapper.NormalMapper;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import data.Conf;
import data.Record;
import reducer.NormalReducer;
/**
 * Driver of NormalJoin (or BasicJoin).
 *
 * Implements the JoinDriver interface.
 */
public class NormalJoin implements JoinDriver {

    public static void main(String[] args) throws ClassNotFoundException,
            IOException, InterruptedException {
        if (args.length != 3) {
            System.err.println("Usage: Join <left-side table path> <right-side table path> <output path>");
            System.exit(-1);
        }
        String userDir = System.getProperty("user.dir");
        Conf conf = new Conf();
        if (!conf.loadConf(userDir + "/conf.properties")) { // TODO
            System.err.println("Failed in loading configuration file, exit");
            System.exit(-2);
        }
        new NormalJoin().join(args, conf);
    }

    public void join(String[] args, Conf conf) throws IOException {
        JobConf job = new JobConf(NormalJoin.class);
        job.setJobName("Equal Join");
        Path inLeft = new Path(args[0]);
        Path inRight = new Path(args[1]);
        Path out = new Path(args[2]);
        FileInputFormat.addInputPath(job, inLeft);
        FileInputFormat.addInputPath(job, inRight);
        FileOutputFormat.setOutputPath(job, out);
        job.setMapperClass(NormalMapper.class);
        job.setReducerClass(NormalReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapOutputValueClass(Record.class);
        // configuration
        job.set("inputNameLeft", inLeft.toString());
        job.set("mapred.textoutputformat.separator", conf.separator);
        job.setBoolean("mapred.conf.eliminateDuplicate", conf.eliminateDuplicate);
        JobClient.runJob(job);
    }
}
This is the console output I get; the job completes but produces an empty output file:
15/08/03 04:29:47 INFO Configuration.deprecation: mapred.textoutputformat.separator is deprecated. Instead, use mapreduce.output.textoutputformat.separator
15/08/03 04:29:47 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
15/08/03 04:29:47 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
15/08/03 04:29:48 WARN mapreduce.JobSubmitter: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
15/08/03 04:29:50 INFO mapred.FileInputFormat: Total input paths to process : 2
15/08/03 04:29:50 INFO mapreduce.JobSubmitter: number of splits:3
15/08/03 04:29:50 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1437465092759_0002
15/08/03 04:29:51 INFO impl.YarnClientImpl: Submitted application application_1437465092759_0002
15/08/03 04:29:51 INFO mapreduce.Job: The url to track the job: http://quickstart.cloudera:8088/proxy/application_1437465092759_0002/
15/08/03 04:29:51 INFO mapreduce.Job: Running job: job_1437465092759_0002
15/08/03 04:30:04 INFO mapreduce.Job: Job job_1437465092759_0002 running in uber mode : false
15/08/03 04:30:04 INFO mapreduce.Job: map 0% reduce 0%
15/08/03 04:30:20 INFO mapreduce.Job: map 33% reduce 0%
15/08/03 04:30:22 INFO mapreduce.Job: map 67% reduce 0%
15/08/03 04:30:23 INFO mapreduce.Job: map 100% reduce 0%
15/08/03 04:30:28 INFO mapreduce.Job: map 100% reduce 100%
15/08/03 04:30:28 INFO mapreduce.Job: Job job_1437465092759_0002 completed successfully
15/08/03 04:30:28 INFO mapreduce.Job: Counters: 49
File System Counters
FILE: Number of bytes read=5768091
FILE: Number of bytes written=11979199
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=5283057
HDFS: Number of bytes written=0
HDFS: Number of read operations=12
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
Job Counters
Launched map tasks=3
Launched reduce tasks=1
Data-local map tasks=3
Total time spent by all maps in occupied slots (ms)=44449
Total time spent by all reduces in occupied slots (ms)=5532
Total time spent by all map tasks (ms)=44449
Total time spent by all reduce tasks (ms)=5532
Total vcore-seconds taken by all map tasks=44449
Total vcore-seconds taken by all reduce tasks=5532
Total megabyte-seconds taken by all map tasks=45515776
Total megabyte-seconds taken by all reduce tasks=5664768
Map-Reduce Framework
Map input records=69495
Map output records=69495
Map output bytes=5629095
Map output materialized bytes=5768103
Input split bytes=327
Combine input records=0
Combine output records=0
Reduce input groups=55273
Reduce shuffle bytes=5768103
Reduce input records=69495
Reduce output records=0
Spilled Records=138990
Shuffled Maps =3
Failed Shuffles=0
Merged Map outputs=3
GC time elapsed (ms)=672
CPU time spent (ms)=4400
Physical memory (bytes) snapshot=805781504
Virtual memory (bytes) snapshot=6027804672
Total committed heap usage (bytes)=557592576
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=5282730
File Output Format Counters
Bytes Written=0
Best answer

You are using the old API property name in your program:

mapred.textoutputformat.separator

Try using:

mapreduce.output.textoutputformat.separator

See also: hadoop.mapred vs hadoop.mapreduce?

Also, please post your code.
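For reference, a minimal sketch that applies both suggestions at once, assuming NormalMapper and NormalReducer stay on the old mapred API as posted: it sets the new-style separator key and implements the Tool interface via ToolRunner, which the JobSubmitter warning in the log also asks for. NormalJoinTool is a hypothetical name; the asker's Conf loading is omitted here, and a literal tab stands in for conf.separator.

package driver;

import java.io.IOException;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import data.Record;
import mapper.NormalMapper;
import reducer.NormalReducer;

public class NormalJoinTool extends Configured implements Tool {

    @Override
    public int run(String[] args) throws IOException {
        JobConf job = new JobConf(getConf(), NormalJoinTool.class);
        job.setJobName("Equal Join");

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileInputFormat.addInputPath(job, new Path(args[1]));
        FileOutputFormat.setOutputPath(job, new Path(args[2]));

        job.setMapperClass(NormalMapper.class);
        job.setReducerClass(NormalReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapOutputValueClass(Record.class);

        // New-style property name; the deprecated mapred.* key is what
        // the warning at the top of the log is about.
        job.set("mapreduce.output.textoutputformat.separator", "\t");

        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner parses the generic options (-D, -conf, ...) before
        // calling run(), which silences the "Hadoop command-line option
        // parsing not performed" warning.
        System.exit(ToolRunner.run(new NormalJoinTool(), args));
    }
}

Note that renaming the property alone may not explain the empty file: the counters show Reduce output records=0 against 69495 reduce input records, so the reducer itself is emitting nothing — which is presumably why the answer also asks for the rest of the code.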
Regarding "java - Hadoop produces an empty output file when run from the command line", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/31786380/