
apache-spark - Exception while performing an HBase scan

coder 2024-01-08

I am trying the hbase spark distributed scan example.

My simple code looks like this:

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.spark.JavaHBaseContext;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import scala.Tuple2;

public class DistributedHBaseScanToRddDemo {

    public static void main(String[] args) {
        JavaSparkContext jsc = getJavaSparkContext("hbasetable1");
        Configuration hbaseConf = getHbaseConf(0, "", "");
        JavaHBaseContext javaHbaseContext = new JavaHBaseContext(jsc, hbaseConf);

        Scan scan = new Scan();
        scan.setCaching(100);

        // Distributed scan: the connector builds one Spark partition per HBase region.
        JavaRDD<Tuple2<ImmutableBytesWritable, Result>> javaRdd =
                javaHbaseContext.hbaseRDD(TableName.valueOf("hbasetable1"), scan);

        List<String> results = javaRdd.map(new ScanConvertFunction()).collect();
        System.out.println("Result Size: " + results.size());
    }

    public static Configuration getHbaseConf(int pTimeout, String pQuorumIP, String pClientPort)
    {
        // The parameters are currently ignored; the values below are hardcoded.
        Configuration hbaseConf = HBaseConfiguration.create();
        hbaseConf.setInt("timeout", 120000);
        hbaseConf.set("hbase.zookeeper.quorum", "10.56.36.14");
        hbaseConf.set("hbase.zookeeper.property.clientPort", "2181");
        return hbaseConf;
    }

    public static JavaSparkContext getJavaSparkContext(String pTableName)
    {
        SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkPut" + pTableName);
        sparkConf.setMaster("local");
        sparkConf.set("spark.testing.memory", "471859200");
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);

        return jsc;
    }

    // Maps each (row key, Result) pair to the row key as a String.
    private static class ScanConvertFunction implements Function<Tuple2<ImmutableBytesWritable, Result>, String> {
        public String call(Tuple2<ImmutableBytesWritable, Result> v1) throws Exception {
            return Bytes.toString(v1._1().copyBytes());
        }
    }
}

I get the following exception:

Exception in thread "main" org.apache.hadoop.hbase.DoNotRetryIOException: /10.56.48.219:16020 is unable to read call parameter from client 10.56.49.148; java.lang.UnsupportedOperationException: GetRegionLoad
    at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
    at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
    at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
    at java.lang.reflect.Constructor.newInstance(Constructor.java:422)
    at org.apache.hadoop.hbase.ipc.RemoteWithExtrasException.instantiateException(RemoteWithExtrasException.java:93)
    at org.apache.hadoop.hbase.ipc.RemoteWithExtrasException.unwrapRemoteException(RemoteWithExtrasException.java:83)
    at org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.makeIOExceptionOfException(ProtobufUtil.java:368)
    at org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.getRemoteException(ProtobufUtil.java:345)
    at org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil.getRegionLoad(ProtobufUtil.java:1746)
    at org.apache.hadoop.hbase.client.HBaseAdmin.getRegionLoad(HBaseAdmin.java:2089)
    at org.apache.hadoop.hbase.mapreduce.RegionSizeCalculator.init(RegionSizeCalculator.java:82)
    at org.apache.hadoop.hbase.mapreduce.RegionSizeCalculator.<init>(RegionSizeCalculator.java:60)
    at org.apache.hadoop.hbase.mapreduce.TableInputFormatBase.oneInputSplitPerRegion(TableInputFormatBase.java:293)
    at org.apache.hadoop.hbase.mapreduce.TableInputFormatBase.getSplits(TableInputFormatBase.java:257)
    at org.apache.hadoop.hbase.mapreduce.TableInputFormat.getSplits(TableInputFormat.java:254)
    at org.apache.spark.rdd.NewHadoopRDD.getPartitions(NewHadoopRDD.scala:121)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:246)
    at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:246)
    at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:246)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1911)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:893)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:892)
    at org.apache.spark.api.java.JavaRDDLike$class.collect(JavaRDDLike.scala:360)
    at org.apache.spark.api.java.AbstractJavaRDDLike.collect(JavaRDDLike.scala:45)
    at com.myproj.poc.sparkhbaseneo4j.DistributedHBaseScanToRddDemo.main(DistributedHBaseScanToRddDemo.java:32)
Caused by: org.apache.hadoop.hbase.ipc.RemoteWithExtrasException(org.apache.hadoop.hbase.DoNotRetryIOException): /10.56.48.219:16020 is unable to read call parameter from client 10.56.49.148; java.lang.UnsupportedOperationException: GetRegionLoad
    at org.apache.hadoop.hbase.ipc.AbstractRpcClient.onCallFinished(AbstractRpcClient.java:387)
    at org.apache.hadoop.hbase.ipc.AbstractRpcClient.access$100(AbstractRpcClient.java:95)
    at org.apache.hadoop.hbase.ipc.AbstractRpcClient$3.run(AbstractRpcClient.java:410)
    at org.apache.hadoop.hbase.ipc.AbstractRpcClient$3.run(AbstractRpcClient.java:406)
    at org.apache.hadoop.hbase.ipc.Call.callComplete(Call.java:103)
    at org.apache.hadoop.hbase.ipc.Call.setException(Call.java:118)
    at org.apache.hadoop.hbase.ipc.NettyRpcDuplexHandler.readResponse(NettyRpcDuplexHandler.java:161)
    at org.apache.hadoop.hbase.ipc.NettyRpcDuplexHandler.channelRead(NettyRpcDuplexHandler.java:191)
    at org.apache.hadoop.hbase.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362)
    at org.apache.hadoop.hbase.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348)
    at org.apache.hadoop.hbase.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:340)
    at org.apache.hadoop.hbase.shaded.io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:310)
    at org.apache.hadoop.hbase.shaded.io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:284)
    at org.apache.hadoop.hbase.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362)
    at org.apache.hadoop.hbase.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348)
    at org.apache.hadoop.hbase.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:340)
    at org.apache.hadoop.hbase.shaded.io.netty.handler.timeout.IdleStateHandler.channelRead(IdleStateHandler.java:287)
    at org.apache.hadoop.hbase.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362)
    at org.apache.hadoop.hbase.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348)
    at org.apache.hadoop.hbase.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:340)
    at org.apache.hadoop.hbase.shaded.io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1334)
    at org.apache.hadoop.hbase.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362)
    at org.apache.hadoop.hbase.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348)
    at org.apache.hadoop.hbase.shaded.io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:926)
    at org.apache.hadoop.hbase.shaded.io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:134)
    at org.apache.hadoop.hbase.shaded.io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:644)
    at org.apache.hadoop.hbase.shaded.io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:579)
    at org.apache.hadoop.hbase.shaded.io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:496)
    at org.apache.hadoop.hbase.shaded.io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:458)
    at org.apache.hadoop.hbase.shaded.io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:858)
    at org.apache.hadoop.hbase.shaded.io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:138)
    at java.lang.Thread.run(Thread.java:745)

I have also tried the bulk get and bulk put examples, and they work fine, so I am wondering what is going wrong with the bulk scan example.

Best Answer

This Cloudera hbase-spark connector seems to work:

https://mvnrepository.com/artifact/org.apache.hbase/hbase-spark?repo=cloudera

So add something like the following to your pom.xml:

<repositories>
  <repository>
    <id>cloudera</id>
    <name>cloudera</name>
    <url>https://repository.cloudera.com/content/repositories/releases/</url>
  </repository>
</repositories>

And for the dependency:

<dependency>
  <groupId>org.apache.hbase</groupId>
  <artifactId>hbase-spark</artifactId>
  <version>${hbase-spark.version}</version>
</dependency>
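
Note that ${hbase-spark.version} is a Maven property you still have to define yourself. A minimal sketch of the corresponding properties entry follows; the version string here is a hypothetical example, so check the Cloudera repository above for the releases that actually exist and pick the one matching your cluster:

<properties>
  <!-- hypothetical value; must match a release actually published in the Cloudera repository -->
  <hbase-spark.version>1.2.0-cdh5.13.0</hbase-spark.version>
</properties>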

One thing I noticed is that this functionality does not seem to reuse the HBase connection well; instead it tries to re-establish it for every partition. See my question and the related discussion here:

HBase-Spark Connector: connection to HBase established for every scan?

For that reason I have actually been avoiding this functionality, but I would be curious to hear about your experience with it.
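
To make the cost of per-partition connection setup concrete, here is a minimal sketch, entirely my own illustration rather than anything from the connector or the linked discussion, of the usual pattern for reusing an HBase connection from Spark tasks: keep one lazily created Connection per executor JVM instead of opening a new one for every partition or scan.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

// Hypothetical helper (not part of hbase-spark): holds a single HBase Connection
// per executor JVM so that tasks running in the same JVM reuse it.
public final class HBaseConnectionHolder {

    private static Connection connection;

    private HBaseConnectionHolder() { }

    public static synchronized Connection get() throws IOException {
        if (connection == null || connection.isClosed()) {
            // Build the Configuration on the executor side rather than shipping it
            // from the driver; the values below mirror the ones in the question.
            Configuration conf = HBaseConfiguration.create();
            conf.set("hbase.zookeeper.quorum", "10.56.36.14");
            conf.set("hbase.zookeeper.property.clientPort", "2181");
            // createConnection is expensive, which is exactly why re-creating it
            // for every partition or scan hurts throughput.
            connection = ConnectionFactory.createConnection(conf);
        }
        return connection;
    }
}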

A similar question about this apache-spark exception while performing an HBase scan can be found on Stack Overflow: https://stackoverflow.com/questions/50271222/
