c++ - 加快一些 SSE2 Intrinsics 的颜色转换

coder 2024-02-20 原文

我正在尝试执行从 YCbCr 到 BGRA 的图像颜色转换(不要问 A 位，好头疼)。

无论如何，这需要尽可能快地执行，所以我使用编译器内部函数编写它以利用 SSE2。这是我第一次涉足 SIMD 领域，我基本上是一个初学者，所以我确信我做的很多事情都是低效的。

事实证明，我执行实际颜色转换的算术代码特别慢，Intel 的 VTune 显示它是一个重大瓶颈。

那么，有什么方法可以加快以下代码的速度吗？它以 32 位、一次 4 个像素完成。我最初尝试以 8 位、一次 16 个像素(如上循环)进行计算，但计算导致整数溢出和转换中断。整个过程，包括 Intel jpeg 解码，对于全高清的单场大约需要 14 毫秒。如果我能将它降低到至少 12 毫秒，最好是 10 毫秒，那就太好了。

非常感谢任何帮助或提示。谢谢!

const __m128i s128_8    = _mm_set1_epi8((char)128);


const int nNumPixels = roi.width * roi.height;

for (int i=0; i<nNumPixels; i+=32)
{
    // Go ahead and prefetch our packed UV Data.
    // As long as the load remains directly next, this saves us time.
    _mm_prefetch((const char*)&pSrc8u[2][i],_MM_HINT_T0); 

    // We need to fetch and blit out our k before we write over it with UV data.
    __m128i sK1 = _mm_load_si128((__m128i*)&pSrc8u[2][i]);
    __m128i sK2 = _mm_load_si128((__m128i*)&pSrc8u[2][i+16]);

    // Using the destination buffer temporarily here so we don't need to waste time doing a memory allocation.
    _mm_store_si128 ((__m128i*)&m_pKBuffer[i],      sK1);
    _mm_store_si128 ((__m128i*)&m_pKBuffer[i+16],   sK2);

    // In theory, this prefetch needs to be some cycles ahead of the first read. It isn't, yet it does appear to save us time. Worth investigating.
    _mm_prefetch((const char*)&pSrc8u[1][i],_MM_HINT_T0); 

    __m128i sUVI1 = _mm_load_si128((__m128i*)&pSrc8u[1][i]);
    __m128i sUVI2 = _mm_load_si128((__m128i*)&pSrc8u[1][i+16]);  

    // Subtract the 128 here ahead of our YCbCr -> BGRA conversion so we can go 16 pixels at a time rather than 4.
    sUVI1 = _mm_sub_epi8(sUVI1, s128_8);
    sUVI2 = _mm_sub_epi8(sUVI2, s128_8);

    // Swizzle and double up UV data from interleaved 8x1 byte blocks into planar
    __m128i sU1 = _mm_unpacklo_epi8(sUVI1, sUVI1);
    __m128i sV1 = _mm_unpackhi_epi8(sUVI1, sUVI1);

    __m128i sU2 = _mm_unpacklo_epi8(sUVI2, sUVI2);  
    __m128i sV2 = _mm_unpackhi_epi8(sUVI2, sUVI2);  

    _mm_store_si128((__m128i*)&pSrc8u[1][i],        sU1);
    _mm_store_si128((__m128i*)&pSrc8u[1][i+16],     sU2); 

    _mm_store_si128((__m128i*)&pSrc8u[2][i],        sV1);
    _mm_store_si128((__m128i*)&pSrc8u[2][i+16],     sV2); 
}

const __m128i s16   = _mm_set1_epi32(16);
const __m128i s299  = _mm_set1_epi32(299);
const __m128i s410  = _mm_set1_epi32(410);
const __m128i s518  = _mm_set1_epi32(518);
const __m128i s101  = _mm_set1_epi32(101);
const __m128i s209  = _mm_set1_epi32(209);

Ipp8u* pDstP = pDst8u;
for (int i=0; i<nNumPixels; i+=4, pDstP+=16)
{
    __m128i sK = _mm_set_epi32(m_pKBuffer[i],           m_pKBuffer[i+1],            m_pKBuffer[i+2],            m_pKBuffer[i+3]);

    __m128i sY = _mm_set_epi32(pSrc8u[0][i],            pSrc8u[0][i+1],             pSrc8u[0][i+2],             pSrc8u[0][i+3]);
    __m128i sU = _mm_set_epi32((char)pSrc8u[1][i],      (char)pSrc8u[1][i+1],       (char)pSrc8u[1][i+2],       (char)pSrc8u[1][i+3]);
    __m128i sV = _mm_set_epi32((char)pSrc8u[2][i],      (char)pSrc8u[2][i+1],       (char)pSrc8u[2][i+2],       (char)pSrc8u[2][i+3]);

    // N.b. - Attempted to do the sub 16 in 8 bits similar to the sub 128 for U and V - however doing it here is quicker
    // as the time saved on the arithmetic is less than the time taken by the additional loads/stores needed in the swizzle loop
    sY = _mm_mullo_epi32(_mm_sub_epi32(sY, s16), s299);

    __m128i sR  = _mm_srli_epi32(_mm_add_epi32(sY,_mm_mullo_epi32(s410, sV)), 8);
    __m128i sG  = _mm_srli_epi32(_mm_sub_epi32(_mm_sub_epi32(sY, _mm_mullo_epi32(s101, sU)),_mm_mullo_epi32(s209, sV)), 8);
    __m128i sB  = _mm_srli_epi32(_mm_add_epi32(sY, _mm_mullo_epi32(s518, sU)), 8);

    //Microsoft's YUV Conversion
    //__m128i sC = _mm_sub_epi32(sY, s16);
    //__m128i sD = _mm_sub_epi32(sU, s128);
    //__m128i sE = _mm_sub_epi32(sV, s128);
    //
    //__m128i sR =  _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(s298, sC), _mm_mullo_epi32(s409, sE)), s128), 8);
    //__m128i sG    = _mm_srli_epi32(_mm_add_epi32(_mm_sub_epi32(_mm_mullo_epi32(s298, sC), _mm_sub_epi32(_mm_mullo_epi32(s100, sD), _mm_mullo_epi32(s208, sE))), s128), 8);
    //__m128i sB    = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(s298, sC), _mm_mullo_epi32(s516, sD)), s128), 8);

    __m128i sKGl = _mm_unpacklo_epi32(sK, sG);
    __m128i sKGh = _mm_unpackhi_epi32(sK, sG);

    __m128i sRBl = _mm_unpacklo_epi32(sR, sB);
    __m128i sRBh = _mm_unpackhi_epi32(sR, sB);

    __m128i sKRGB1 = _mm_unpackhi_epi32(sKGh,sRBh);
    __m128i sKRGB2 = _mm_unpacklo_epi32(sKGh,sRBh);
    __m128i sKRGB3 = _mm_unpackhi_epi32(sKGl,sRBl);
    __m128i sKRGB4 = _mm_unpacklo_epi32(sKGl,sRBl);

    __m128i p1 = _mm_packus_epi16(sKRGB1, sKRGB2);
    __m128i p2 = _mm_packus_epi16(sKRGB3, sKRGB4);

    __m128i po = _mm_packus_epi16(p1, p2);

    _mm_store_si128((__m128i*)pDstP, po);
}

最佳答案

此处您的带宽可能受到限制，因为相对于加载和存储的数量，计算量非常少。

一个建议:摆脱 _mm_prefetch 内在函数 - 它们几乎肯定没有帮助，甚至可能会阻碍更新的 CPU 上的操作(它们已经在自动预取方面做得很好)。

另一个要看的地方:

__m128i sK = _mm_set_epi32(m_pKBuffer[i],           m_pKBuffer[i+1],            m_pKBuffer[i+2],            m_pKBuffer[i+3]);
__m128i sY = _mm_set_epi32(pSrc8u[0][i],            pSrc8u[0][i+1],             pSrc8u[0][i+2],             pSrc8u[0][i+3]);
__m128i sU = _mm_set_epi32((char)pSrc8u[1][i],      (char)pSrc8u[1][i+1],       (char)pSrc8u[1][i+2],       (char)pSrc8u[1][i+3]);
__m128i sV = _mm_set_epi32((char)pSrc8u[2][i],      (char)pSrc8u[2][i+1],       (char)pSrc8u[2][i+2],       (char)pSrc8u[2][i+3]);

这会生成很多不必要的指令 - 你应该在这里使用 _mm_load_xxx 和 _mm_unpackxx_xxx。它将看起来像更多的代码，但它会更有效率。而且您可能应该在循环的每次迭代中处理 16 个像素，而不是 4 个 - 这样您就可以加载一次 8 位值的 vector ，然后根据需要解压缩以将每组 4 个值作为 32 位整数的 vector 。

关于c++ - 加快一些 SSE2 Intrinsics 的颜色转换，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/5031409/

有关c++ - 加快一些 SSE2 Intrinsics 的颜色转换的更多相关文章

ruby-on-rails - 在 Rails 中将文件大小字符串转换为等效千字节 - 2
我的目标是转换表单输入，例如“100兆字节”或“1GB”，并将其转换为我可以存储在数据库中的文件大小(以千字节为单位)。目前，我有这个:defquota_convert@regex=/([0-9]+)(.*)s/@sizes=%w{kilobytemegabytegigabyte}m=self.quota.match(@regex)if@sizes.include?m[2]eval("self.quota=#{m[1]}.#{m[2]}")endend这有效，但前提是输入是倍数(“gigabytes”，而不是“gigabyte”)并且由于使用了eval看起来疯狂不安全。所以，功能正常，
ruby - 使用 ruby 将 HTML 转换为纯文本并维护结构/格式 - 2
我想将html转换为纯文本。不过，我不想只删除标签，我想智能地保留尽可能多的格式。为插入换行符标签，检测段落并格式化它们等。输入非常简单，通常是格式良好的html(不是整个文档，只是一堆内容，通常没有anchor或图像)。我可以将几个正则表达式放在一起，让我达到80%，但我认为可能有一些现有的解决方案更智能。最佳答案首先，不要尝试为此使用正则表达式。很有可能你会想出一个脆弱/脆弱的解决方案，它会随着HTML的变化而崩溃，或者很难管理和维护。您可以使用Nokogiri快速解析HTML并提取文本:require'nokogiri'h
ruby - 将数组的内容转换为 int - 2
我需要读入一个包含数字列表的文件。此代码读取文件并将其放入二维数组中。现在我需要获取数组中所有数字的平均值，但我需要将数组的内容更改为int。有什么想法可以将to_i方法放在哪里吗？ClassTerraindefinitializefile_name@input=IO.readlines(file_name)#readinfile@size=@input[0].to_i@land=[@size]x=1whilex 最佳答案只需将数组映射为整数:@land边注如果你想得到一条线的平均值，你可以这样做:values=@input[x]
ruby - 将散列转换为嵌套散列 - 2
这道题是thisquestion的逆题.给定一个散列，每个键都有一个数组，例如{[:a,:b,:c]=>1,[:a,:b,:d]=>2,[:a,:e]=>3,[:f]=>4,}将其转换为嵌套哈希的最佳方法是什么{:a=>{:b=>{:c=>1,:d=>2},:e=>3,},:f=>4,} 最佳答案这是一个迭代的解决方案，递归的解决方案留给读者作为练习:defconvert(h={})ret={}h.eachdo|k,v|node=retk[0..-2].each{|x|node[x]||={};node=node[x]}node[
ruby-on-rails - 如何优雅地重启 thin + nginx？ - 2
我的瘦服务器配置了nginx，我的ROR应用程序正在它们上运行。在我发布代码更新时运行thinrestart会给我的应用程序带来一些停机时间。我试图弄清楚如何优雅地重启正在运行的Thin实例，但找不到好的解决方案。有没有人能做到这一点？最佳答案 #Restartjustthethinserverdescribedbythatconfigsudothin-C/etc/thin/mysite.ymlrestartNginx将继续运行并代理请求。如果您将Nginx设置为使用多个上游服务器，例如server{listen80;server
ruby - 在没有 sass 引擎的情况下使用 sass 颜色函数 - 2
我想在一个没有Sass引擎的类中使用Sass颜色函数。我已经在项目中使用了sassgem，所以我认为搭载会像以下一样简单:classRectangleincludeSass::Script::FunctionsdefcolorSass::Script::Color.new([0x82,0x39,0x06])enddefrender#hamlengineexecutedwithcontextofself#sothatwithintemlateicouldcall#%stop{offset:'0%',stop:{color:lighten(color)}}endend更新:参见上面的#re
ruby-on-rails - Ruby url 到 html 链接转换 - 2
我正在使用Rails构建一个简单的聊天应用程序。当用户输入url时，我希望将其输出为html链接(即“url”)。我想知道在Ruby中是否有任何库或众所周知的方法可以做到这一点。如果没有，我有一些不错的正则表达式示例代码可以使用... 最佳答案查看auto_linkRails提供的辅助方法。这会将所有URL和电子邮件地址变成可点击的链接(htmlanchor标记)。这是文档中的代码示例。auto_link("Gotohttp://www.rubyonrails.organdsayhellotodavid@loudthinking.
ruby-on-rails - 如何生成传递一些自定义参数的 `link_to` URL？ - 2
我正在使用RubyonRails3.0.9，我想生成一个传递一些自定义参数的link_toURL。也就是说，有一个articles_path(www.my_web_site_name.com/articles)我想生成如下内容:link_to'Samplelinktitle',...#HereIshouldimplementthecode#=>'http://www.my_web_site_name.com/articles?param1=value1¶m2=value2&...我如何编写link_to语句“alàRubyonRailsWay”以实现该目的？如果我想通过传递一些
ruby 诅咒颜色 - 2
如何使用Ruby的默认Curses库获取颜色？所以像这样:puts"\e[0m\e[30;47mtest\e[0m"效果很好。在浅灰色背景上呈现漂亮的黑色。但是这个:#!/usr/bin/envrubyrequire'curses'Curses.noecho#donotshowtypedkeysCurses.init_screenCurses.stdscr.keypad(true)#enablearrowkeys(forpageup/down)Curses.stdscr.nodelay=1Curses.clearCurses.setpos(0,0)Curses.addstr"Hello
ruby-on-rails - 使用 ruby 将多个实例变量转换为散列的更好方法？ - 2
我收到格式为的回复#我需要将其转换为哈希值(针对活跃商家)。目前我正在遍历变量并执行此操作:response.instance_variables.eachdo|r|my_hash.merge!(r.to_s.delete("@").intern=>response.instance_eval(r.to_s.delete("@")))end这有效，它将生成{:first="charlie",:last=>"kelly"},但它似乎有点hacky和不稳定。有更好的方法吗？编辑:我刚刚意识到我可以使用instance_variable_get作为该等式的第二部分，但这仍然是主要问题。

c++ - 加快一些 SSE2 Intrinsics 的颜色转换

有关c++ - 加快一些 SSE2 Intrinsics 的颜色转换的更多相关文章

随机推荐