c++ - 如何在 C/C++ 中执行 _mm256_maskstore_epi8()？

coder 2024-02-05 原文

问题

我想做的是，如果我有一个 27(不是 32!)的 vector int8_t:

x = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20 ,21,22,23,24,25,26}

我想首先将它向右循环移位 n(不是常数)，例如如果 n=1:

x2 = {26,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19 ,20,21,22,23,24,25}

然后这个vector被用来做一些非常复杂的计算，但是为了简单起见，我们假设下一步只是将它循环左移n，然后存入内存。所以我应该有一个新的 vector 27 int8_t:

y = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20 ,21,22,23,24,25,26}

所以有成千上万个这样的 vector ，性能在这里非常关键。我们使用的 CPU 支持 AVX2，所以我们想用它来加快速度。

我目前的解决方案

为了获得 x2，我使用了两个 _mm256_loadu_si256() 和一个 _mm256_blendv_epi8():

int8_t x[31+27+31];
for(int i=0; i<27; i++){
    x[31+i] = i;
}
__m256i mask = _mm256_set_epi32 (0x0, 0x00800000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
__m256i x_second_part = _mm256_loadu_si256((__m256i*)(x+31+1));  //{1,2,...,26}
__m256i x_first_part  = _mm256_loadu_si256((__m256i*)(x+31-26)); //{0}
__m256i x2            = _mm256_blendv_epi8(x_second_part, x_first_part, mask); //{1,2,...,26, 0}
int8_t y[31+27+31];
_mm256_storeu_si256((__m256i*)(y+31-26), x2);
_mm256_storeu_si256((__m256i*)(y+31+1), x2);

x 和 y 的大小被声明为 [31+27+31] 的原因是在这种情况下 _mm256_loadu_si256() 和 _mm256_storeu_si256() 不会导致段错误。

我可以通过以下方式获取 y 的值:

for(int i=0; i<27; i++){
    cout << (int)y[31+i] << ' ';
}

新问题

不幸的是，所有的 vector 在内存中必须是连续的，例如，如果总共有两个 vector 需要处理:

x = {[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26];
     [27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53]};

然后我不能只使用 _mm256_storeu_si256() 将 y 的值放回内存，因为当第二个 vector 的值写入内存时，它会覆盖一些第一个 vector 的值:

int8_t x[31+27+27+31];
int8_t y[31+27+27+31];
for(int i=0; i<27*2; i++){
    x[31+i] = i;
}
for(int i=0; i<2; i++){
    __m256i mask = _mm256_set_epi32 (0x0, 0x00800000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
    __m256i x_second_part = _mm256_loadu_si256((__m256i*)(x+31+27*i+1));  //{1,2,...,26}
    __m256i x_first_part  = _mm256_loadu_si256((__m256i*)(x+31+27*i-26)); //{0}
    __m256i x2            = _mm256_blendv_epi8(x_second_part, x_first_part, mask); //{1,2,...,26, 0}
    _mm256_storeu_si256((__m256i*)(y+31+27*i-26), x2);
    _mm256_storeu_si256((__m256i*)(y+31+27*i+1), x2);
}
for(int i=0; i<27; i++){
    cout << (int)y[31+i] << ' ';
}cout << endl;
for(int i=0; i<27; i++){
    cout << (int)y[31+27+i] << ' ';
}cout << endl;

会输出

0 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53

代替

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53

所以我在考虑使用maskstore。但是在Intel Intrinsic Guide我找不到 _mm256_maskstore_epi8。这让我回到主题:

如何在 C/C++ 中执行 _mm256_maskstore_epi8()？

最佳答案

还有另一种使用 AVX2 在 27 字节 vector 内实现循环移位的方法:

#include <iostream>
#include <immintrin.h>

const __m256i K0 = _mm256_setr_epi8(
    0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
    0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0);

const __m256i K1 = _mm256_setr_epi8(
    0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
    0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70);

inline const __m256i Shuffle(const __m256i & value, const __m256i & shuffle)
{
    return _mm256_or_si256(_mm256_shuffle_epi8(value, _mm256_add_epi8(shuffle, K0)),
        _mm256_shuffle_epi8(_mm256_permute4x64_epi64(value, 0x4E), _mm256_add_epi8(shuffle, K1)));
}

__m256i shuffles[27];

void Init()
{
    uint8_t * p = (uint8_t *)shuffles;
    for (int s = 0; s < 27; ++s)
        for (int i = 0; i < 32; ++i)
            p[s*32 + i] = i < 27 ? (27 + i - s)%27 : i;
}

void CyclicShift27(const uint8_t * src, size_t shift, uint8_t * dst)
{
    _mm256_storeu_si256((__m256i*)dst,  Shuffle(_mm256_loadu_si256((__m256i*)src), shuffles[shift]));
}

int main()
{
    Init();
    uint8_t src[32] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 }, dst[32];
    for (int j = 0; j < 27; ++j)
    {
        CyclicShift27(src, j, dst);
        std::cout << "\t";
        for (int i = 0; i < 32; i++)
            std::cout << (int)dst[i] << ' ';
        std::cout << std::endl;
    }
    return 0;
}

输出:

    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
    26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 27 28 29 30 31
    25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 27 28 29 30 31
    24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 27 28 29 30 31
    23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 27 28 29 30 31
    22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 27 28 29 30 31
    21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 27 28 29 30 31
    20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 27 28 29 30 31
    19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 27 28 29 30 31
    18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 27 28 29 30 31
    17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 27 28 29 30 31
    16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 27 28 29 30 31
    15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 27 28 29 30 31
    14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 27 28 29 30 31
    13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 27 28 29 30 31
    12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 27 28 29 30 31
    11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 27 28 29 30 31
    10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 27 28 29 30 31
    9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 27 28 29 30 31
    8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 27 28 29 30 31
    7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 27 28 29 30 31
    6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 27 28 29 30 31
    5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 27 28 29 30 31
    4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 27 28 29 30 31
    3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 27 28 29 30 31
    2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 27 28 29 30 31
    1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 27 28 29 30 31

看起来比我之前的回答简单多了。

关于c++ - 如何在 C/C++ 中执行 _mm256_maskstore_epi8()？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/40284282/

amp 43 256 code 27 c++simd intrinsics avx avx2

有关c++ - 如何在 C/C++ 中执行 _mm256_maskstore_epi8()？的更多相关文章

ruby - 如何在 Ruby 中顺序创建 PI - 2
出于纯粹的兴趣，我很好奇如何按顺序创建PI，而不是在过程结果之后生成数字，而是让数字在过程本身生成时显示。如果是这种情况，那么数字可以自行产生，我可以对以前看到的数字实现垃圾收集，从而创建一个无限系列。结果只是在Pi系列之后每秒生成一个数字。这是我通过互联网筛选的结果:这是流行的计算机友好算法，类机器算法:defarccot(x,unity)xpow=unity/xn=1sign=1sum=0loopdoterm=xpow/nbreakifterm==0sum+=sign*(xpow/n)xpow/=x*xn+=2sign=-signendsumenddefcalc_pi(digits
ruby-openid:执行发现时未设置@socket - 2
我在使用omniauth/openid时遇到了一些麻烦。在尝试进行身份验证时，我在日志中发现了这一点:OpenID::FetchingError:Errorfetchinghttps://www.google.com/accounts/o8/.well-known/host-meta?hd=profiles.google.com%2Fmy_username:undefinedmethod`io'fornil:NilClass重要的是undefinedmethodio'fornil:NilClass来自openid/fetchers.rb，在下面的代码片段中:moduleNetclass
ruby - 如何在 buildr 项目中使用 Ruby 代码？ - 2
如何在buildr项目中使用Ruby？我在很多不同的项目中使用过Ruby、JRuby、Java和Clojure。我目前正在使用我的标准Ruby开发一个模拟应用程序，我想尝试使用Clojure后端(我确实喜欢功能代码)以及JRubygui和测试套件。我还可以看到在未来的不同项目中使用Scala作为后端。我想我要为我的项目尝试一下buildr(http://buildr.apache.org/)，但我注意到buildr似乎没有设置为在项目中使用JRuby代码本身!这看起来有点傻，因为该工具旨在统一通用的JVM语言并且是在ruby中构建的。除了将输出的jar包含在一个独特的、仅限ruby
ruby - 什么是填充的 Base64 编码字符串以及如何在 ruby 中生成它们？ - 2
我正在使用的第三方API的文档状态:"[O]urAPIonlyacceptspaddedBase64encodedstrings."什么是“填充的Base64编码字符串”以及如何在Ruby中生成它们。下面的代码是我第一次尝试创建转换为Base64的JSON格式数据。xa=Base64.encode64(a.to_json) 最佳答案他们说的padding其实就是Base64本身的一部分。它是末尾的“=”和“==”。Base64将3个字节的数据包编码为4个编码字符。所以如果你的输入数据有长度n和n%3=1=>"=="末尾用于填充n%
ruby-on-rails - 如何在 ruby 中使用两个参数异步运行 exe？ - 2
exe应该在我打开页面时运行。异步进程需要运行。有什么方法可以在ruby中使用两个参数异步运行exe吗？我已经尝试过ruby命令-system()、exec()但它正在等待过程完成。我需要用参数启动exe，无需等待进程完成是否有任何rubygems会支持我的问题？最佳答案您可以使用Process.spawn和Process.wait2:pid=Process.spawn'your.exe','--option'#Later...pid,status=Process.wait2pid您的程序将作为解释器的子进程执行。除
ruby-on-rails - 如何优雅地重启 thin + nginx？ - 2
我的瘦服务器配置了nginx，我的ROR应用程序正在它们上运行。在我发布代码更新时运行thinrestart会给我的应用程序带来一些停机时间。我试图弄清楚如何优雅地重启正在运行的Thin实例，但找不到好的解决方案。有没有人能做到这一点？最佳答案 #Restartjustthethinserverdescribedbythatconfigsudothin-C/etc/thin/mysite.ymlrestartNginx将继续运行并代理请求。如果您将Nginx设置为使用多个上游服务器，例如server{listen80;server
ruby - 如何在续集中重新加载表模式？ - 2
鉴于我有以下迁移:Sequel.migrationdoupdoalter_table:usersdoadd_column:is_admin,:default=>falseend#SequelrunsaDESCRIBEtablestatement,whenthemodelisloaded.#Atthispoint,itdoesnotknowthatusershaveais_adminflag.#Soitfails.@user=User.find(:email=>"admin@fancy-startup.example")@user.is_admin=true@user.save!ende
ruby - 如何在 Ruby 中拆分参数字符串 Bash 样式？ - 2
我正在为一个项目制作一个简单的shell，我希望像在Bash中一样解析参数字符串。foobar"helloworld"fooz应该变成:["foo","bar","helloworld","fooz"]等等。到目前为止，我一直在使用CSV::parse_line，将列分隔符设置为""和.compact输出。问题是我现在必须选择是要支持单引号还是双引号。CSV不支持超过一个分隔符。Python有一个名为shlex的模块:>>>shlex.split("Test'helloworld'foo")['Test','helloworld','foo']>>>shlex.split('Test"
ruby - 如何在 Lion 上安装 Xcode 4.6，需要用 RVM 升级 ruby - 2
我实际上是在尝试使用RVM在我的OSX10.7.5上更新ruby，并在输入以下命令后:rvminstallruby我得到了以下回复:Searchingforbinaryrubies,thismighttakesometime.Checkingrequirementsforosx.Installingrequirementsforosx.Updatingsystem.......Errorrunning'requirements_osx_brew_update_systemruby-2.0.0-p247',pleaseread/Users/username/.rvm/log/138121
ruby - Chef 执行非顺序配方 - 2
我遵循了教程http://gettingstartedwithchef.com/,第1章。我的运行list是"run_list":["recipe[apt]","recipe[phpap]"]我的phpapRecipe默认Recipeinclude_recipe"apache2"include_recipe"build-essential"include_recipe"openssl"include_recipe"mysql::client"include_recipe"mysql::server"include_recipe"php"include_recipe"php::modul

c++ - 如何在 C/C++ 中执行 _mm256_maskstore_epi8()？

问题

我目前的解决方案

新问题

如何在 C/C++ 中执行 _mm256_maskstore_epi8()？

有关c++ - 如何在 C/C++ 中执行 _mm256_maskstore_epi8()？的更多相关文章

随机推荐