非常感谢您阅读我的帖子。
我正在做 CUDA 工作,但一直收到 cudaDeviceSynchronize() 错误代码 77:cudaErrorIllegalAddress,不知道为什么。我搜索了代码和函数,令人惊讶的是,只有几条记录出现了。很奇怪。
我基本上总结了图像的所有像素。为了让我的问题有尽可能多的引用,我在这里展示了我所有的 CUDA 代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "thorcalgpu.h"
#include <stdio.h>
#include "math.h"
#include <vector>
#include <algorithm>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <numeric>
#include <iostream>
using namespace std;
float random_float(void)
{
return static_cast<float>(rand()) / RAND_MAX;
}
__global__ void reduceSum(unsigned short *input,
unsigned long long *per_block_results,
const int n)
{
extern __shared__ unsigned long long sdata[];
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
// load input into __shared__ memory
unsigned short x = 0;
if(i < n)
{
x = input[i];
}
sdata[threadIdx.x] = x;
__syncthreads();
// contiguous range pattern
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
{
if(threadIdx.x < offset)
{
// add a partial sum upstream to our own
sdata[threadIdx.x] += sdata[threadIdx.x + offset];
}
// wait until all threads in the block have
// updated their partial sums
__syncthreads();
}
// thread 0 writes the final result
if(threadIdx.x == 0)
{
per_block_results[blockIdx.x] = sdata[0];
}
}
// Helper function for using CUDA to add vectors in parallel.
//template <class T>
cudaError_t gpuWrapper(float *mean, int N, vector<string> filelist)
{
int size = N*N;
unsigned long long* dev_sum = 0;
unsigned short* dev_img = 0;
cudaError_t cudaStatus;
const int block_size = 512;
const int num_blocks = (size/block_size) + ((size%block_size) ? 1 : 0);
int L = filelist.size();
// Choose which GPU to run on, change this on a multi-GPU system.
double totalgpuinittime = 0;
StartCounter(7);
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_img, size * sizeof(unsigned short));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_sum, num_blocks*sizeof(unsigned long long));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
totalgpuinittime = GetCounter(7);
unsigned short* img;
unsigned short* pimg;
unsigned long long* sum = new unsigned long long[num_blocks];
unsigned long long* psum = sum;
cout<<endl;
cout << "gpu looping starts, and in progress ..." << endl;
StartCounter(6);
double totalfileiotime = 0;
double totalh2dcpytime = 0;
double totalkerneltime = 0;
double totald2hcpytime = 0;
double totalcpusumtime = 0;
double totalloopingtime = 0;
for (int k = 0; k < L; k++)
{
StartCounter(1);
img = (unsigned short*)LoadTIFF(filelist[k].c_str());
totalfileiotime += GetCounter(1);
psum = sum;
pimg = img;
float gpumean = 0;
memset(psum, 0, sizeof(unsigned long long)*num_blocks);
StartCounter(2);
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_img, pimg, size * sizeof(unsigned short), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_sum, psum, num_blocks*sizeof(unsigned long long), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
totalh2dcpytime += GetCounter(2);
StartCounter(3);
//reduceSum<<<num_blocks,block_size,num_blocks * sizeof(unsigned long long)>>>(dev_img, dev_sum, size);
//reduceSum<<<num_blocks,block_size,block_size * sizeof(unsigned short)>>>(dev_img, dev_sum, size);
reduceSum<<<num_blocks,block_size>>>(dev_img, dev_sum, size);
totalkerneltime += GetCounter(3);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "reduction Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
// !!!!!! following is where the code 77 error occurs!!!!!!!
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
StartCounter(4);
cudaStatus = cudaMemcpy(psum, dev_sum, num_blocks * sizeof(unsigned long long ), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
totald2hcpytime += GetCounter(4);
StartCounter(5);
for (int i = 0; i < num_blocks; i++)
{
gpumean += *psum;
psum++;
}
gpumean /= N*N;
totalcpusumtime += GetCounter(5);
delete img;
img = NULL;
cout<<gpumean<<endl;
}
int S = 1e+6;
int F = filelist.size();
float R = S/F;
totalloopingtime = GetCounter(6);
cout<<"gpu looping ends."<<endl<<endl;
cout<< "analysis:"<<endl;
cout<<"gpu initialization time: "<<totalgpuinittime<<" sec"<<endl<<endl;
cout<<"file I/O time: "<<endl;
cout<<" total "<<totalfileiotime<<" sec | average "<<totalfileiotime*R<<" usec/frame"<<endl<<endl;
cout<<"host-to-device copy time: "<<endl;
cout<<" total "<<totalh2dcpytime<<" sec | average "<<totalh2dcpytime*R<<" usec/frame"<<endl<<endl;
cout<<"pure gpu kerneling time: "<<endl;
cout<<" total "<<totalkerneltime<<" sec | average "<<totalkerneltime*R<<" usec/frame"<<endl<<endl;
cout<<"device-to-host copy time: "<<endl;
cout<<" total "<<totald2hcpytime<<" sec | average "<<totald2hcpytime*R<<" usec/frame"<<endl<<endl;
/*cout<<"cpu summing time: "<<endl;
cout<<" total: "<<totalcpusumtime<<" sec | average: "<<totalcpusumtime*R<<" usec/frame"<<endl<<endl;;*/
/*cout <<"gpu looping time: " << endl;
cout<<" total: "<<totalloopingtime<<" sec | average: "<<totalloopingtime*R<<" usec/frame"<<endl;*/
Error:
cudaFree(dev_sum);
cudaFree(dev_img);
delete sum;
sum = NULL;
return cudaStatus;
}
void kernel(float* &mean, int N, vector<string> filelist)
{
// wrapper and kernel
cudaError_t cudaStatus = gpuWrapper(mean, N, filelist);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "gpuWapper failed!");
}
// printf("mean is: %f\n", mean);
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
StartCounter(8);
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceReset failed!");
}
cout<<"gpu reset time: "<<GetCounter(8)<<" sec"<<endl<<endl;
//return *mean;
}
我已经为主机和设备内存分配了足够且等效的内存空间。任何意见表示赞赏。
最佳答案
虽然这可能不是代码中唯一的错误来源,但您没有为缩减内核分配任何动态共享内存,从而导致您看到的非法寻址错误。正确的内核启动应该是这样的
size_t shm_size = block_size * sizeof(unsigned long long);
reduceSum<<<num_blocks,block_size,shm_size>>>(dev_img, dev_sum, size);
这为运行在缩减内核中的每个线程分配了一个 unsigned long long 的等价物,这(通过我非常粗略地阅读你的代码)应该使共享内存数组 sdata 的正确大小内核在没有越界访问该数组的情况下运行。
关于c++ - cudaDeviceSynchronize() 错误代码 77 : cudaErrorIllegalAddress,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/23069468/
大约一年前,我决定确保每个包含非唯一文本的Flash通知都将从模块中的方法中获取文本。我这样做的最初原因是为了避免一遍又一遍地输入相同的字符串。如果我想更改措辞,我可以在一个地方轻松完成,而且一遍又一遍地重复同一件事而出现拼写错误的可能性也会降低。我最终得到的是这样的:moduleMessagesdefformat_error_messages(errors)errors.map{|attribute,message|"Error:#{attribute.to_s.titleize}#{message}."}enddeferror_message_could_not_find(obje
如何在buildr项目中使用Ruby?我在很多不同的项目中使用过Ruby、JRuby、Java和Clojure。我目前正在使用我的标准Ruby开发一个模拟应用程序,我想尝试使用Clojure后端(我确实喜欢功能代码)以及JRubygui和测试套件。我还可以看到在未来的不同项目中使用Scala作为后端。我想我要为我的项目尝试一下buildr(http://buildr.apache.org/),但我注意到buildr似乎没有设置为在项目中使用JRuby代码本身!这看起来有点傻,因为该工具旨在统一通用的JVM语言并且是在ruby中构建的。除了将输出的jar包含在一个独特的、仅限ruby
在rails源中:https://github.com/rails/rails/blob/master/activesupport/lib/active_support/lazy_load_hooks.rb可以看到以下内容@load_hooks=Hash.new{|h,k|h[k]=[]}在IRB中,它只是初始化一个空哈希。和做有什么区别@load_hooks=Hash.new 最佳答案 查看rubydocumentationforHashnew→new_hashclicktotogglesourcenew(obj)→new_has
我的瘦服务器配置了nginx,我的ROR应用程序正在它们上运行。在我发布代码更新时运行thinrestart会给我的应用程序带来一些停机时间。我试图弄清楚如何优雅地重启正在运行的Thin实例,但找不到好的解决方案。有没有人能做到这一点? 最佳答案 #Restartjustthethinserverdescribedbythatconfigsudothin-C/etc/thin/mysite.ymlrestartNginx将继续运行并代理请求。如果您将Nginx设置为使用多个上游服务器,例如server{listen80;server
我遵循MichaelHartl的“RubyonRails教程:学习Web开发”,并创建了检查用户名和电子邮件长度有效性的测试(名称最多50个字符,电子邮件最多255个字符)。test/helpers/application_helper_test.rb的内容是:require'test_helper'classApplicationHelperTest在运行bundleexecraketest时,所有测试都通过了,但我看到以下消息在最后被标记为错误:ERROR["test_full_title_helper",ApplicationHelperTest,1.820016791]test
我是rails的新手,想在form字段上应用验证。myviewsnew.html.erb.....模拟.rbclassSimulation{:in=>1..25,:message=>'Therowmustbebetween1and25'}end模拟Controller.rbclassSimulationsController我想检查模型类中row字段的整数范围,如果不在范围内则返回错误信息。我可以检查上面代码的范围,但无法返回错误消息提前致谢 最佳答案 关键是您使用的是模型表单,一种显示ActiveRecord模型实例属性的表单。c
我正在尝试编写一个将文件上传到AWS并公开该文件的Ruby脚本。我做了以下事情:s3=Aws::S3::Resource.new(credentials:Aws::Credentials.new(KEY,SECRET),region:'us-west-2')obj=s3.bucket('stg-db').object('key')obj.upload_file(filename)这似乎工作正常,除了该文件不是公开可用的,而且我无法获得它的公共(public)URL。但是当我登录到S3时,我可以正常查看我的文件。为了使其公开可用,我将最后一行更改为obj.upload_file(file
我克隆了一个rails仓库,我现在正尝试捆绑安装背景:OSXElCapitanruby2.2.3p173(2015-08-18修订版51636)[x86_64-darwin15]rails-v在您的Gemfile中列出的或native可用的任何gem源中找不到gem'pg(>=0)ruby'。运行bundleinstall以安装缺少的gem。bundleinstallFetchinggemmetadatafromhttps://rubygems.org/............Fetchingversionmetadatafromhttps://rubygems.org/...Fe
在Cooper的书BeginningRuby中,第166页有一个我无法重现的示例。classSongincludeComparableattr_accessor:lengthdef(other)@lengthother.lengthenddefinitialize(song_name,length)@song_name=song_name@length=lengthendenda=Song.new('Rockaroundtheclock',143)b=Song.new('BohemianRhapsody',544)c=Song.new('MinuteWaltz',60)a.betwee
我是Google云的新手,我正在尝试对其进行首次部署。我的第一个部署是RubyonRails项目。我基本上是在关注thisguideinthegoogleclouddocumentation.唯一的区别是我使用的是我自己的项目,而不是他们提供的“helloworld”项目。这是我的app.yaml文件runtime:customvm:trueentrypoint:bundleexecrackup-p8080-Eproductionconfig.ruresources:cpu:0.5memory_gb:1.3disk_size_gb:10当我转到我的项目目录并运行gcloudprevie